Put the following in a file named url_scrape.py.
#!/usr/bin/env python
"""Print the URLs found in standard input, one per line.

Only URLs that are enclosed in double or single quotes and that start
with http:// are reported. Duplicates are not removed and input order is
preserved; pipe the output through ``sort | uniq`` if a unique list is
wanted.

The script runs unchanged on Python 2 and Python 3.
"""

import re
import sys

# A quoted, fully-qualified URL. The capture group holds the URL without its
# surrounding quotes, so no stripping is needed afterwards. The URL body
# stops at the first quote or whitespace character: a URL never contains
# either, and this keeps a match from running across lines or swallowing
# unrelated text. '+' is allowed, so query strings such as ?q=a+b are kept.
url_pattern = re.compile(r'''["'](http://[^'"\s]*)['"]''')


def find_urls(text):
    """Return every quoted http:// URL in *text*, in order of appearance.

    Args:
        text: The string to scan (for example, the source of an HTML page).

    Returns:
        A list of URL strings with the enclosing quotes removed. The list
        is empty when nothing matches and may contain duplicates.
    """
    # With exactly one capture group, findall() returns the group contents
    # rather than the whole match, i.e. the bare URLs.
    return url_pattern.findall(text)


def main():
    """Read all of standard input and print each URL found on its own line."""
    for url in find_urls(sys.stdin.read()):
        # A single parenthesised argument prints identically on Python 2
        # (print statement) and Python 3 (print function).
        print(url)


if __name__ == "__main__":
    main()
Example Usage:
wget -O - http://madphilosopher.ca/ | ./url_scrape.py | sort | uniq