Ruby web crawler
This Ruby script reads a list of links from links.dat, picks out the ones it can easily spider, and collects the URLs found on each of those pages. Every new URL it finds is added to newlinks.dat for later spidering by another bot running alongside this one.
require 'socket'

# Simple web crawler.
#
# Reads URLs from links.dat (one per line), fetches every plain-HTTP URL that
# has a non-empty path, extracts the absolute links from the anchors of each
# page, prints them to stdout and appends the ones not yet known to
# newlinks.dat.

LINKS_FILE = 'links.dat'
NEW_LINKS_FILE = 'newlinks.dat'
HTTP_PORT = 80

# A spiderable link: http://host[:port]/path where the path is non-empty.
# Neither host nor path may contain whitespace, so a line from the input file
# can never inject extra request lines or headers.
LINK_PATTERN = %r{http://(?<host>[^/:\s]+)(?::(?<port>\d+))?/(?<path>[^/\s]\S*)}i

# An anchor whose href is an absolute URL: captures scheme, remainder, text.
ANCHOR_PATTERN = %r{<a\s+href="(\w+)://([^"]+)"[^>]*>([^<]*)</a>}i

# Splits one line of the links file into the parts needed for a request.
#
# @param line [String, nil] a raw line, possibly with a trailing newline
# @return [Array(String, Integer, String), nil] host, port and absolute path
#   (with leading slash), or nil when the line holds no spiderable link
def parse_link(line)
  match = LINK_PATTERN.match(line.to_s)
  return nil unless match

  [match[:host], (match[:port] || HTTP_PORT).to_i, "/#{match[:path]}"]
end

# Fetches a page with a minimal HTTP/1.0 request over a raw TCP socket.
#
# @param host [String] server name
# @param port [Integer] TCP port
# @param path [String] absolute request path
# @return [String, nil] the raw response (status line, headers and body), or
#   nil when the connection or transfer failed
def fetch_page(host, port, path)
  socket = TCPSocket.new(host, port)
  begin
    # The Host header lets name-based virtual hosts answer correctly.
    host_header = port == HTTP_PORT ? host : "#{host}:#{port}"
    socket.print "GET #{path} HTTP/1.0\r\nHost: #{host_header}\r\n\r\n"
    # HTTP/1.0 servers close the connection after the response, so reading
    # until EOF returns the whole reply.
    socket.read
  ensure
    socket.close
  end
rescue SocketError, SystemCallError, IOError => e
  # Errors go to stderr so stdout carries only discovered URLs.
  warn "error: #{e.message}"
  nil
end

# Collects the absolute URLs referenced by the anchors of a page.
#
# @param html [String, nil] raw page text
# @return [Array<String>] unique URLs in order of first appearance
def extract_links(html)
  text = html.to_s
  # Pages are not guaranteed to be valid in the default encoding; matching a
  # regexp against a broken string would raise, so repair it first.
  text = text.scrub unless text.valid_encoding?
  text.scan(ANCHOR_PATTERN).map { |scheme, rest, _label| "#{scheme}://#{rest}" }.uniq
end

# Appends the URLs not yet listed in the given file, one per line.
#
# @param urls [Array<String>] candidate URLs
# @param path [String] file holding the already known URLs; created if missing
# @return [Array<String>] the URLs that were actually added
def record_links(urls, path = NEW_LINKS_FILE)
  existing = File.exist?(path) ? File.read(path) : ''
  fresh = urls.uniq - existing.lines.map(&:chomp)
  return fresh if fresh.empty?

  File.open(path, 'a') do |out|
    # Keep entries on separate lines even if the file lacks a final newline.
    out.puts unless existing.empty? || existing.end_with?("\n")
    fresh.each { |url| out.puts url }
  end
  fresh
end

# Spiders every usable link of the links file and records what it finds.
#
# @param links_path [String] file with the URLs to crawl
# @param new_links_path [String] file that receives newly discovered URLs
# @return [void]
def crawl(links_path = LINKS_FILE, new_links_path = NEW_LINKS_FILE)
  File.foreach(links_path) do |line|
    host, port, path = parse_link(line)
    next unless host

    page = fetch_page(host, port, path)
    next unless page

    found = extract_links(page)
    found.each { |url| puts url }
    record_links(found, new_links_path)
  end
end

if __FILE__ == $PROGRAM_NAME
  if File.exist?(LINKS_FILE)
    crawl
  else
    warn "error: #{LINKS_FILE} not found, nothing to crawl"
  end
end