# Adapted from code found at
# http://stackoverflow.com/questions/4460921/extract-the-first-paragraph-from-a-wikipedia-article-python

import contextlib
import urllib

try:
    # Python 2 module layout.
    import urllib2
    from urllib import quote
    from urllib2 import HTTPError, URLError, build_opener
except ImportError:
    # Python 3 moved these names into urllib.parse / urllib.request /
    # urllib.error; fall back to them so the script runs on either version.
    from urllib.error import HTTPError, URLError
    from urllib.parse import quote
    from urllib.request import build_opener

# Base URL that a percent-encoded article title is appended to.
WIKIPEDIA_URL = "http://en.wikipedia.org/wiki/"

# List of Wikipedia movie articles: update as you like
DEFAULT_ARTICLES = (
    'Dark Star (film)',
    'Madame Butterfly (1932 film)',
    'Mean Girls',
    'The Angry Red Planet',
)


def _page_size(opener, article):
    """Return the size in bytes of the Wikipedia page for *article*.

    The title is percent-encoded first, so characters that cannot appear
    verbatim in a URL (spaces, parentheses, ...) are transformed
    appropriately. The connection is closed even if reading fails.

    Raises HTTPError / URLError, as raised by ``opener.open``.
    """
    url = WIKIPEDIA_URL + quote(article)
    # closing() is used because the Python 2 response object is not a
    # context manager by itself.
    with contextlib.closing(opener.open(url)) as resource:
        return len(resource.read())


def main(articles=None):
    """Print the size in bytes of each Wikipedia article's web page.

    articles: iterable of article titles; defaults to DEFAULT_ARTICLES.

    The size includes all kinds of other cruft that Wikipedia pages have,
    but it is at least an easy way to count and compare. A title whose page
    cannot be fetched is reported on its own line and the remaining titles
    are still processed.
    """
    if articles is None:
        articles = DEFAULT_ARTICLES

    # Build the opener once and reuse it for every article. The browser-like
    # User-agent is there to convince Wikipedia that a real browser is asking
    # for the data (otherwise, Wikipedia refuses to hand it over).
    opener = build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]  # wikipedia needs this

    for article in articles:
        try:
            size = _page_size(opener, article)
        except HTTPError:
            # If something fails when reading the Wikipedia page, the title
            # is likely bad.
            print("Could not find title %s" % (article,))
        except URLError as err:
            # HTTPError is a subclass of URLError, so this branch only sees
            # failures to reach the server at all (DNS, refused connection).
            print("Could not reach Wikipedia for %s: %s" % (article, err.reason))
        else:
            # Print the name of the article, and size in bytes.
            print("%s: %d" % (article, size))


if __name__ == "__main__":
    main()