#!/usr/bin/python
"""Classes for scraping Pitchfork for connections between artists and albums.

Run as a script with an artist ID (or the full URL of an artist page) to print
the IDs of every artist who is mentioned by name, or whose albums are
mentioned, in any review of an album by that artist.

NOTE(review): in the copy of this file that was reviewed, the HTML marker
literals passed to ``re.search`` had been stripped out. Markers that the
neighbouring comments name explicitly (main div, side div, editorial-class
div) are reconstructed below and must be confirmed. Markers that could not be
recovered are set to ``None``, which means "do not clip"; restore them from
the original file before relying on the results.
"""

# Get libraries for basic system stuff, web-scraping, and regular expressions.
import contextlib
import re
import sys
import urllib

try:
    # Python 3 keeps urlopen in urllib.request...
    from urllib.request import urlopen as _urlopen
except ImportError:
    # ...while Python 2 exposes it directly on urllib.
    from urllib import urlopen as _urlopen


def _fetch_page(url):
    '''Download `url` and return the page body as text.'''
    # closing() guarantees the connection is released even if read() raises.
    with contextlib.closing(_urlopen(url)) as conn:
        body = conn.read()
    if not isinstance(body, str):
        # Python 3 returns bytes, which the str patterns below cannot search.
        # On Python 2 the body is already a str and is left untouched.
        body = body.decode('utf-8', 'replace')
    return body


def _search(marker, text):
    '''Search `text` for `marker`; a marker of None never matches.'''
    if marker is None:
        return None
    return re.search(marker, text)


def _clip_from(text, marker):
    '''Drop everything before the first `marker`, keeping the marker.'''
    m = _search(marker, text)
    return text if m is None else text[m.start():]


def _clip_after(text, marker):
    '''Drop everything up to and including the first `marker`.'''
    m = _search(marker, text)
    return text if m is None else text[m.end():]


def _clip_before(text, marker):
    '''Drop the first `marker` and everything that follows it.'''
    m = _search(marker, text)
    return text if m is None else text[:m.start()]


def _main_section(page, start_marker, end_marker):
    '''Return the part of `page` from `start_marker` up to `end_marker`.

    A marker that does not occur in the page is ignored, so a change in the
    site's layout yields more text than needed rather than an AttributeError
    on a failed match.
    '''
    return _clip_before(_clip_from(page, start_marker), end_marker)


def main(argv=None):
    '''Scrape Pitchfork for artists related to the artist indicated by the ID
    passed as argv[1]. See the usage message for more details.

    Returns the process exit status: 1 when no artist ID was supplied,
    0 otherwise.
    '''
    if argv is None:
        argv = sys.argv

    # If we didn't get an artist ID as a command-line argument, print a usage
    # message and exit.
    if len(argv) < 2:
        print('Usage: pitchfork_scraper.py <artist_id>')
        print(' Scrapes Pitchfork for all artists who are mentioned by name,')
        print('or whose albums are mentioned, in any review of an album by')
        print('the artist whose ID you provide.')
        print('')
        print('Example:')
        print(' pitchfork_scraper.py 1742-godspeed-you-black-emperor')
        return 1

    # Argument 1 is the string passed in on the command line.
    # It might be a plain artist ID, or it might be a full URL to the
    # artist's page on Pitchfork.
    art_id = argv[1]

    # If the string was a full URL, we should be able to extract the ID.
    # The link pattern expects the URL to be wrapped in double-quotes.
    m = Artist.link_patt.search('"%s"' % art_id)
    if m is not None:
        art_id = m.group(1)

    # Construct an Artist object for this artist.
    # (See below for the Artist class definition.)
    art = Artist(art_id)

    # Find the related artists, according to Pitchfork, and print them out.
    # Sorting makes the output reproducible; a set has no defined order.
    print('\n'.join(sorted(art.relatedArtists())))

    # Now we're done!
    return 0


class Artist(object):
    '''An artist in the Pitchfork database.'''

    # The regular expression (or "pattern") for HTML links to artists.
    link_patt = re.compile('"[^"]*/artists/([^"/]+)/?"')
    # Let's break this pattern down:
    # - The pattern begins and ends with literal double-quotes. That's because
    #   the URL for a link is always contained in double-quotes.
    # - The literal string "/artists/" is in the middle. We know the URL will
    #   always contain this string.
    # - There may or may not be other stuff (namely, "http://pitchfork.com")
    #   before "/artists/". We don't really care what it is, but we know it
    #   must not contain a double-quote character, because that would be the
    #   end of the link. So '[^"]*' represents zero or more of any
    #   characters except the double-quote.
    # - The sub-pattern '[^"/]+' represents the artist ID. Everything after
    #   "/artists/" until the optional closing "/" is the artist ID. The ID
    #   can't contain a double-quote character, because that would be the end
    #   of the link. It also can't contain a slash, because that would
    #   indicate the beginning of a new part of the URL. So we construct a
    #   character class representing anything *except* quotes and slashes, and
    #   accept any number of these as an artist ID.
    # - The artist ID sub-pattern is surrounded by parentheses to create a
    #   match group. This means that re.findall will return just the IDs
    #   when it finds matches.

    # The string-formatting pattern to construct the URL for an artist, given
    # the ID. The "%s" gets replaced with the ID.
    url_patt = 'http://pitchfork.com/artists/%s/'

    # Markers delimiting the main div of the page.
    # NOTE(review): reconstructed from the comments ("main div"); the original
    # literals were not recoverable -- confirm.
    main_start_patt = '<div id="main">'
    main_end_patt = '<div id="side">'

    # Tag that both opens the album list and starts whatever follows it
    # (e.g. news entries).
    # TODO(review): the original literal was not recoverable and the comments
    # do not name the tag. None disables the clipping; restore the real tag.
    album_list_patt = None

    def __init__(self, id):
        '''Construct an artist entry from an ID.'''
        self.id = id
        # When we first create an Artist object, the text is empty. We have to
        # grab it from the web using getText() before we do anything else.
        self.text = None

    def getText(self):
        '''Grab the text of the artist page and store it in self.text.

        Only the main div is kept, which includes the album list and any news
        entries below the album list. Nothing is returned.
        '''
        # Create the URL for the artist's page on Pitchfork and download it.
        page = _fetch_page(Artist.url_patt % self.id)
        self.text = _main_section(page, self.main_start_patt,
                                  self.main_end_patt)

    def albums(self):
        '''Returns a set of IDs of all reviewed albums by this artist.'''
        if self.text is None:
            self.getText()

        # There's some extra stuff we don't need at the beginning of the main
        # div. The album list begins after the first marker tag, so let's just
        # grab that text.
        clipped = _clip_after(self.text, self.album_list_patt)

        # We also don't want anything after the album list, like news entries.
        # The first thing after the album list begins with the same tag, so
        # let's clip the text at that point.
        clipped = _clip_before(clipped, self.album_list_patt)

        # Now our task is easy: find all strings that match the album link
        # regular expression, and make a set of the IDs.
        return set(Album.link_patt.findall(clipped))

    def relatedArtists(self):
        '''Scans reviews of all albums by this artist and returns a set of
        artist IDs for artists mentioned in reviews or whose albums are
        mentioned.'''
        if self.text is None:
            self.getText()

        # Each Album downloads its page at most once, so reusing the objects
        # avoids fetching a page again when several reviews mention the same
        # album (very often another album by this same artist).
        albums_by_id = {}

        def lookup(alb_id):
            if alb_id not in albums_by_id:
                albums_by_id[alb_id] = Album(alb_id)
            return albums_by_id[alb_id]

        # We start with an empty set of related artists.
        related_artists = set()

        # For every album listed on this artist's page...
        for alb_id in self.albums():
            # Look up that album.
            alb = lookup(alb_id)

            # The artists who worked on this album are related to the original
            # artist... in fact, they're probably the same. The |= operator
            # adds all the artists to our set, while ignoring duplicates.
            related_artists |= alb.artists()

            # All the artists linked in the review of this album are considered
            # to be related, too.
            related_artists |= alb.relatedArtists()

            # Now we need to look at all the albums linked in this review, and
            # add the artists who worked on each of them.
            for rel_alb_id in alb.relatedAlbums():
                related_artists |= lookup(rel_alb_id).artists()

        # Now, before we return the set, let's remove the artist we're
        # investigating; it doesn't really say much to claim that an artist is
        # related to themselves.
        related_artists.discard(self.id)
        return related_artists

    # The __str__ member function just creates a representation of the object
    # as a string. Let's tell the caller what the ID of this artist is.
    def __str__(self):
        return "Artist('%s')" % self.id

    # The __repr__ member function is pretty similar to __str__; we can just
    # make them the same.
    __repr__ = __str__


class Album(object):
    '''An album in the Pitchfork database.'''

    # The regular expression (or "pattern") for HTML links to album reviews.
    # This pattern is constructed very similarly to the Artist.link_patt
    # pattern. See if you can figure out what all the different parts are for.
    link_patt = re.compile('"[^"]*/reviews/albums/([^"/]+)/?"')

    # The string-formatting pattern to construct the URL for an album, given
    # the ID. The "%s" gets replaced with the ID.
    url_patt = 'http://pitchfork.com/reviews/albums/%s/'

    # Markers delimiting the main div of the page (it ends before the side
    # div).
    # NOTE(review): reconstructed from the comments; the original literals
    # were not recoverable -- confirm.
    main_start_patt = '<div id="main">'
    main_end_patt = '<div id="side">'

    # Opening tag of the editorial-class div that contains the review body.
    # NOTE(review): reconstructed from the comments -- confirm.
    review_start_patt = '<div class="editorial">'

    # Fixed HTML that precedes the "Artists:" list, and the tag that ends it.
    # TODO(review): neither literal was recoverable. None disables the
    # clipping, so artists() scans the whole main div until they are restored.
    artist_list_start_patt = None
    artist_list_end_patt = None

    def __init__(self, id):
        '''Construct an album entry from an ID.'''
        self.id = id
        # When we first create an Album object, the text is empty. We have to
        # grab it from the web using getText() before we do anything else.
        self.text = None

    def getText(self):
        '''Grab the text of the album page and store it in self.text.

        Only the part from the main div up to the side div is kept. Nothing
        is returned.
        '''
        # Create the URL for the album's review on Pitchfork and download it.
        page = _fetch_page(Album.url_patt % self.id)
        self.text = _main_section(page, self.main_start_patt,
                                  self.main_end_patt)

    def artists(self):
        '''Return the set of IDs of the artists featured on this album.'''
        if self.text is None:
            self.getText()

        # The list of artists for this album is preceded by fixed HTML code,
        # and ends before the next instance of the closing marker.
        clipped = _clip_after(self.text, self.artist_list_start_patt)
        clipped = _clip_before(clipped, self.artist_list_end_patt)

        # Now that we've clipped away most of the extraneous code, we know that
        # anything that matches the artist link pattern is a link to an artist
        # who worked on this album. Find all of them and return a set of their
        # ID's.
        return set(Artist.link_patt.findall(clipped))

    def _review_body(self):
        '''Return the page text that follows the review div's opening tag.'''
        if self.text is None:
            self.getText()
        # The review is contained in an editorial-class div, so we consider
        # only the text that occurs after this div's opening tag.
        return _clip_after(self.text, self.review_start_patt)

    def relatedArtists(self):
        '''Return the set of IDs of artists mentioned in the review.'''
        # Any artist links we find in the body are those artists mentioned in
        # the review.
        return set(Artist.link_patt.findall(self._review_body()))

    def relatedAlbums(self):
        '''Return the set of IDs of albums mentioned in the review.'''
        # Any album links we find in the body are those albums mentioned in
        # the review.
        return set(Album.link_patt.findall(self._review_body()))

    def __str__(self):
        return "Album('%s')" % self.id

    __repr__ = __str__


# This calls the main() function if this script is invoked directly from the
# command line.
if __name__ == "__main__":
    sys.exit(main())