# Artists:

#!/usr/bin/python
# Classes for scraping Pitchfork for connections between artists and albums.

# Get libraries for basic system stuff, web-scraping, and regular expressions.
import sys, urllib, re


def main(argv=None):
    '''Command-line entry point: list the artists related to one artist.

    argv is the argument list to use; when it is None, sys.argv is used.
    argv[1] may be a bare Pitchfork artist ID or a full URL to the artist's
    page.  The related artist IDs are printed one per line.

    Returns 0 on success, or 1 (after printing a usage message) when no
    artist was given.
    '''
    args = sys.argv if argv is None else argv

    # Without an artist argument there is nothing to do except explain how
    # the script is meant to be called.
    if len(args) < 2:
        usage_lines = (
            'Usage: pitchfork_scraper.py <artist-ID>',
            '  Scrapes Pitchfork for all artists who are mentioned by name,',
            'or whose albums are mentioned, in any review of an album by',
            'the artist whose ID you provide.',
            '',
            'Example:',
            '  pitchfork_scraper.py 1742-godspeed-you-black-emperor',
        )
        for usage_line in usage_lines:
            print(usage_line)
        return 1

    requested = args[1]

    # Artist.link_patt expects the URL to be wrapped in double-quotes (as it
    # would be inside an HTML link), so wrap the argument before matching.
    # If it matches, the argument was a URL and group 1 is the artist ID;
    # otherwise the argument is taken to be the ID itself.
    url_match = re.search(Artist.link_patt, '"%s"' % requested)
    artist_id = requested if url_match is None else url_match.group(1)

    # Look the artist up and print every related artist ID on its own line.
    related_ids = Artist(artist_id).relatedArtists()
    print('\n'.join(related_ids))
    return 0


class Artist:
    '''An artist in the Pitchfork database.

    Attributes:
        id:   the artist's Pitchfork ID, e.g.
              "1742-godspeed-you-black-emperor".
        text: the HTML of the main div of the artist's page, or None until
              getText() has been called.  The query methods below call
              getText() on demand, so callers rarely need to.
    '''

    # The regular expression (or "pattern") for HTML links to artists.
    link_patt = re.compile('"[^"]*/artists/([^"/]+)/?"')
    # Let's break this pattern down:
    # - The pattern begins and ends with literal double-quotes.  That's because
    #     the URL for a link is always contained in double-quotes.
    # - The literal string "/artists/" is in the middle.  We know the URL will
    #     always contain this string.
    # - There may or may not be other stuff (namely, "http://pitchfork.com")
    #     before "/artists/".  We don't really care what it is, but we know it
    #     must not contain a double-quote character, because that would be the
    #     end of the link.  So '[^"]*' represents zero or more of any
    #     characters except the double-quote.
    # - The sub-pattern '[^"/]+' represents the artist ID.  Everything after
    #     "/artists/" until the optional closing "/" is the artist ID.  The ID
    #     can't contain a double-quote character, because that would be the end
    #     of the link.  It also can't contain a slash, because that would
    #     indicate the beginning of a new part of the URL.  So we construct a
    #     character class representing anything *except* quotes and slashes, and
    #     accept one or more of these as an artist ID.
    # - The artist ID sub-pattern is surrounded by parentheses to create a
    #     match group.  This means that findall will return just the IDs when
    #     it finds matches.

    # The string-formatting pattern to construct the URL for an artist, given
    # the ID.  The "%s" gets replaced with the ID.
    url_patt = 'http://pitchfork.com/artists/%s/'

    def __init__(self, id):
        '''Construct an artist entry from an ID.  Nothing is downloaded yet.'''
        self.id = id
        # When we first create an Artist object, the text is empty.  It gets
        # filled in from the web by getText().
        self.text = None

    @staticmethod
    def _fetch(url):
        '''Download url and return the page contents as a string.'''
        # urlopen lives in different places in Python 2 and Python 3; import
        # it locally so this works on either without touching module imports.
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        conn = urlopen(url)
        try:
            raw = conn.read()
        finally:
            # Close the connection even if reading fails part-way through.
            conn.close()

        # Python 3 hands back bytes, which can't be searched with the string
        # patterns used in this class.  Undecodable bytes are replaced rather
        # than raising, since we only ever look for plain-ASCII markup.
        if not isinstance(raw, str):
            raw = raw.decode('utf-8', 'replace')
        return raw

    @staticmethod
    def _clip_to_main(page):
        '''Return the part of page from <div id="main"> up to (but not
        including) the following <div id="side">.

        A marker that is missing is simply not clipped at, which matches how
        the other methods of this class treat missing markers.
        '''
        start = page.find('<div id="main">')
        if start != -1:
            page = page[start:]
        end = page.find('<div id="side">')
        if end != -1:
            page = page[:end]
        return page

    def getText(self):
        '''Grab the text of the artist page and store it in self.text.

        Only the main div is kept, which includes the album list and any news
        entries below the album list.  Returns nothing; any error raised
        while downloading the page propagates to the caller.
        '''
        artist_url = Artist.url_patt % self.id
        self.text = self._clip_to_main(self._fetch(artist_url))

    def albums(self):
        '''Return a set of IDs of all reviewed albums by this artist.'''
        if self.text is None:
            self.getText()
        clipped = self.text
        marker = '<div class="search-group">'

        # There's some extra stuff we don't need at the beginning of the main
        # div.  The album list begins after the first search-group div, so
        # keep only what follows that tag.
        start = clipped.find(marker)
        if start != -1:
            clipped = clipped[start + len(marker):]

        # We also don't want anything after the album list, like news entries.
        # Whatever follows the album list begins with another search-group
        # div, so clip the text at that point.
        end = clipped.find(marker)
        if end != -1:
            clipped = clipped[:end]

        # What is left is the album list: every album link in it is an album
        # by this artist.
        return set(Album.link_patt.findall(clipped))

    def relatedArtists(self):
        '''Scan reviews of all albums by this artist and return a set of
        artist IDs for artists mentioned in reviews or whose albums are
        mentioned.  The artist itself is never included in the result.

        This downloads one page per album by this artist, plus one page per
        distinct album linked from those reviews.
        '''
        own_album_ids = self.albums()

        # We start with an empty set of related artists, and an empty set of
        # albums that the reviews link to.
        related_artists = set()
        linked_album_ids = set()

        # For every album listed on this artist's page...
        for alb_id in own_album_ids:
            alb = Album(alb_id)

            # The artists who worked on this album are related to the original
            # artist... in fact, they're probably the same.  The |= operator
            # adds all the artists to our set, while ignoring duplicates.
            related_artists |= alb.artists()

            # All the artists linked in the review of this album are considered
            # to be related, too.
            related_artists |= alb.relatedArtists()

            # Remember the albums linked in this review; we look them up
            # below, once each, however many reviews mention them.
            linked_album_ids |= alb.relatedAlbums()

        # The artists of this artist's own albums were already added above,
        # so those albums don't need to be downloaded a second time.
        for rel_alb_id in linked_album_ids - own_album_ids:
            related_artists |= Album(rel_alb_id).artists()

        # Before we return the set, remove the artist we're investigating; it
        # doesn't really say much to claim that an artist is related to
        # themselves.
        return related_artists - set([self.id])

    # The __str__ member function just creates a representation of the object
    # as a string.  Let's tell the caller what the ID of this artist is.
    def __str__(self):
        return "Artist('%s')" % self.id

    # The __repr__ member function is pretty similar to __str__; we can just
    # make them the same.
    __repr__ = __str__


class Album:
    '''An album in the Pitchfork database.

    Attributes:
        id:   the album's Pitchfork ID, as it appears in the review URL.
        text: the HTML of the main div of the album's review page, or None
              until getText() has been called.  The query methods below call
              getText() on demand, so callers rarely need to.
    '''

    # The regular expression (or "pattern") for HTML links to album reviews.
    # This pattern is constructed very similarly to the Artist.link_patt
    # pattern.  See if you can figure out what all the different parts are for.
    link_patt = re.compile('"[^"]*/reviews/albums/([^"/]+)/?"')

    # The string-formatting pattern to construct the URL for an album, given
    # the ID.  The "%s" gets replaced with the ID.
    url_patt = 'http://pitchfork.com/reviews/albums/%s/'

    def __init__(self, id):
        '''Construct an album entry from an ID.  Nothing is downloaded yet.'''
        self.id = id
        # When we first create an Album object, the text is empty.  It gets
        # filled in from the web by getText().
        self.text = None

    @staticmethod
    def _fetch(url):
        '''Download url and return the page contents as a string.'''
        # urlopen lives in different places in Python 2 and Python 3; import
        # it locally so this works on either without touching module imports.
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        conn = urlopen(url)
        try:
            raw = conn.read()
        finally:
            # Close the connection even if reading fails part-way through.
            conn.close()

        # Python 3 hands back bytes, which can't be searched with the string
        # patterns used in this class.  Undecodable bytes are replaced rather
        # than raising, since we only ever look for plain-ASCII markup.
        if not isinstance(raw, str):
            raw = raw.decode('utf-8', 'replace')
        return raw

    @staticmethod
    def _clip_to_main(page):
        '''Return the part of page from <div id="main"> up to (but not
        including) the following <div id="side">.

        A marker that is missing is simply not clipped at, which matches how
        the other methods of this class treat missing markers.
        '''
        start = page.find('<div id="main">')
        if start != -1:
            page = page[start:]
        end = page.find('<div id="side">')
        if end != -1:
            page = page[:end]
        return page

    def getText(self):
        '''Grab the text of the album's review page and store it in
        self.text.

        Only the main div is kept.  Returns nothing; any error raised while
        downloading the page propagates to the caller.
        '''
        album_url = Album.url_patt % self.id
        self.text = self._clip_to_main(self._fetch(album_url))

    def _review_body(self):
        '''Return the page text that follows the opening tag of the
        editorial div (which holds the review), downloading the page first if
        necessary.  If there is no such tag, the whole text is returned.'''
        if self.text is None:
            self.getText()
        marker = '<div class="editorial">'
        start = self.text.find(marker)
        if start == -1:
            return self.text
        return self.text[start + len(marker):]

    def artists(self):
        '''Return the set of IDs of the artists featured on this album.'''
        if self.text is None:
            self.getText()
        clipped = self.text

        # The list of artists for this album is preceded by the fixed HTML code
        # "<h1>Artists:</h1>".
        heading = '<h1>Artists:</h1>'
        start = clipped.find(heading)
        if start != -1:
            clipped = clipped[start + len(heading):]

        # The list of artists ends before the next instance of a </li> tag.
        end = clipped.find('</li>')
        if end != -1:
            clipped = clipped[:end]

        # Now that we've clipped away most of the extraneous code, anything
        # that matches the artist link pattern is a link to an artist who
        # worked on this album.
        return set(Artist.link_patt.findall(clipped))

    def relatedArtists(self):
        '''Return the set of IDs of the artists linked in the review.'''
        # Any artist links after the start of the review are the artists
        # mentioned in the body of the review.
        return set(Artist.link_patt.findall(self._review_body()))

    def relatedAlbums(self):
        '''Return the set of IDs of the albums linked in the review.'''
        # Any album links after the start of the review are the albums
        # mentioned in the body of the review.
        return set(Album.link_patt.findall(self._review_body()))

    # As with Artist, the string form just reports the album's ID.
    def __str__(self):
        return "Album('%s')" % self.id

    __repr__ = __str__


# When this file is run as a script (rather than imported as a module), run
# main() and hand its return value to the shell as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())