#!/usr/bin/python
"""Tiny breadth-first crawler that stays below the URL it was started from.

Usage: pass the start URL as the first command-line argument.  Every URL
discovered within RECURSION_LEVEL link-hops is printed, one per line.
"""
import re
import sys
from contextlib import closing

try:  # Python 3 module layout
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2 fallback
    from urllib import urlopen
    from urlparse import urljoin

# Number of link-hops to follow away from the start page.
RECURSION_LEVEL = 3

# Matches an anchor tag whose first attribute is ``href`` and captures the
# target.  The capture only accepts letters, digits and ``: / \ . _ -``, so a
# target is cut off at the first other character (e.g. ``?`` or ``#``).
# Raw string: the ``\s`` escapes are meant for the regex engine, not Python.
ANCHOR_HREF_REGEX = re.compile(
    r'<\s*a\s*href\s*=\s*[\x27\x22]?([a-zA-Z0-9:/\\._-]*)[\x27\x22]?\s*')


def getLinks(start_page, page_data):
    """Return the absolute URLs of the anchors found in *page_data*.

    start_page -- URL the markup was fetched from; relative hrefs are
                  resolved against it.
    page_data  -- page markup as text.

    Links are returned in document order and may contain duplicates.
    """
    return [urljoin(start_page, href)
            for href in ANCHOR_HREF_REGEX.findall(page_data)]


def getPage(url):
    """Download *url* and return its body as text.

    Raises whatever ``urlopen`` raises when the URL cannot be retrieved.
    """
    # closing() guarantees the connection is released on both Python 2
    # (where the response object is not a context manager) and Python 3.
    with closing(urlopen(url)) as response:
        page_data = response.read()
    if not isinstance(page_data, str):
        # Python 3 hands back bytes; the link regex only captures ASCII, so
        # a lossy decode is sufficient and never raises.
        page_data = page_data.decode('utf-8', 'replace')
    return page_data


def crawl(start_url, max_depth=RECURSION_LEVEL, fetch=None):
    """Return the set of URLs reachable from *start_url*.

    start_url -- page the crawl starts from (always part of the result).
    max_depth -- number of link-hops to follow.
    fetch     -- callable mapping a URL to its markup; defaults to getPage.

    A link found on a page is kept only when it contains that page's URL as
    a substring, which keeps the crawl underneath the start page.
    """
    if fetch is None:
        fetch = getPage

    found = set([start_url])
    visited = set()
    frontier = [start_url]

    for _ in range(max_depth):
        discovered = set()
        for page in frontier:
            visited.add(page)
            try:
                page_data = fetch(page)
            except (IOError, ValueError) as err:
                # One unreachable or malformed link must not abort the crawl.
                sys.stderr.write('Skipping %s: %s\n' % (page, err))
                continue
            for link in getLinks(page, page_data):
                if page in link:
                    discovered.add(link)
        found |= discovered
        # Pages fetched earlier already contributed all of their links, so
        # fetching them again would only repeat work.
        frontier = sorted(discovered - visited)
        if not frontier:
            break

    return found


def main(argv=None):
    """Command-line entry point: crawl argv[1] and print every URL found."""
    if argv is None:
        argv = sys.argv
    try:
        start_url = argv[1]
    except IndexError:
        print('Please provide a valid url.')
        sys.exit(1)  # non-zero status: the run did not do any work

    # Sorted so that the output is stable from run to run.
    for item in sorted(crawl(start_url)):
        print(item)


if __name__ == '__main__':
    main()