'''
    web-precrawler.py
    Jeff Ondich, 15 September 2013

    This program grabs a web page specified on the command line,
    extracts all the href links from the page, and prints them out.
    It also, optionally, prints the HTTP headers received from
    the remote server.

    The point of this script is to give tiny illustrations of
    a few important Python libraries that you'll use in your
    webcrawler assignment: argparse, urllib2, and re.
'''

import argparse
import re
import sys
import urllib2

def get_all_href_values(text):
    # A deliberately simple pattern: it catches only double-quoted href
    # attributes inside <a> tags, which is plenty for this illustration.
    href_pattern = re.compile(r'<a[^>]*href="([^"]*)"')
    return href_pattern.findall(text)

def main(arguments):
    # Grab the requested page
    try:
        response = urllib2.urlopen(arguments.url)
        text_of_requested_page = response.read()
    except Exception as e:
        print >>sys.stderr, 'Trouble reading the url:', e
        exit()

    # Show the links, if they were requested on the command line
    if arguments.showlinks:
        links = get_all_href_values(text_of_requested_page)
        for link in links:
            print link

    # Show the in-coming headers, if they were requested on the command line
    if arguments.showheaders:
        print response.headers
        # (But what type of object is response.headers? How can you find out?)

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Produce a report on the web page specified on the command line.')
    arg_parser.add_argument('url', help='URL of the web page to grab and report on.')
    arg_parser.add_argument('--showheaders', action='store_true', help='Print the HTTP headers received from the server.')
    arg_parser.add_argument('--showlinks', action='store_true', help='Print the href values found in the page.')
    arguments = arg_parser.parse_args()
    main(arguments)
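
# A quick way to exercise the script, as a sketch: the URL below is only a
# placeholder, not one the assignment requires. Run it under Python 2, since
# urllib2 does not exist in Python 3:
#
#     python web-precrawler.py --showlinks --showheaders http://www.example.com/
#
# --showlinks prints every href value found in the page, one per line, and
# --showheaders prints the HTTP headers the server sent back.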