'''
    web-precrawler.py
    Jeff Ondich, 15 September 2013

    This program grabs a web page specified on the command line,
    extracts all the href links from the page, and prints them out.
    It also, optionally, prints the HTTP headers received from
    the remote server.

    The point of this script is to give tiny illustrations of
    a few important Python libraries that you'll use in your
    webcrawler assignment: argparse, urllib2, and re.
'''

import argparse
import re
import sys
import urllib2

def get_all_href_values(text):
    # A deliberately simple pattern: it catches only double-quoted href
    # attributes inside <a> tags, which is plenty for this illustration.
    href_pattern = re.compile(r'<a[^>]*href="([^"]*)"')
    return href_pattern.findall(text)

def main(arguments):
    # Grab the requested page
    try:
        response = urllib2.urlopen(arguments.url)
        text_of_requested_page = response.read()
    except Exception as e:
        print >>sys.stderr, 'Trouble reading the url:', e
        exit()

    # Show the links, if they were requested on the command line
    if arguments.showlinks:
        links = get_all_href_values(text_of_requested_page)
        for link in links:
            print link

    # Show the in-coming headers, if they were requested on the command line
    if arguments.showheaders:
        print response.headers
        # (But what type of object is response.headers? How can you find out?)

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Produce a report on the web page specified on the command line.')
    arg_parser.add_argument('url', help='URL of the web page to grab and report on.')
    arg_parser.add_argument('--showheaders', action='store_true', help='Print the HTTP headers received from the server.')
    arg_parser.add_argument('--showlinks', action='store_true', help='Print the href values found in the page.')
    arguments = arg_parser.parse_args()
    main(arguments)
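
# A quick way to exercise the script, as a sketch: the URL below is only a
# placeholder, not one the assignment requires. Run it under Python 2, since
# urllib2 does not exist in Python 3:
#
#     python web-precrawler.py --showlinks --showheaders http://www.example.com/
#
# --showlinks prints every href value found in the page, one per line, and
# --showheaders prints the HTTP headers the server sent back.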