<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DZone Snippets: Lordrich's Code Snippets</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Thu, 24 Jul 2008 03:36:41 GMT</pubDate>
    <description>DZone Snippets: Lordrich's Code Snippets</description>
    <item>
      <title>securing the /home directory</title>
      <link>http://snippets.dzone.com/posts/show/2045</link>
      <description>I'm still working on getting this one perfect.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;chmod 701 /home/*&lt;br /&gt;chmod 705 /home/*/public_html&lt;br /&gt;chmod 604 /home/*/public_html/*.*&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Tue, 16 May 2006 14:46:58 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/2045</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>slashdot protection</title>
      <link>http://snippets.dzone.com/posts/show/1957</link>
      <description>Replace aaa.bbb.ccc.ddd with your own IP (so you always get to your site).&lt;br /&gt;Replace www.yourdomain.com with your actual domain.&lt;br /&gt;&lt;br /&gt;Just don&#8217;t mess with the user_agent and query_string lines. Those ensure that the Coral servers themselves can retrieve your page when they need to.&lt;br /&gt;&lt;br /&gt;(Shamelessly stolen from http://ottodestruct.com/diggprotectionrules.txt)&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;RewriteEngine on&lt;br /&gt;RewriteCond %{REMOTE_ADDR} !^aaa.bbb.ccc.ddd$&lt;br /&gt;RewriteCond %{HTTP_USER_AGENT} !^CoralWebPrx&lt;br /&gt;RewriteCond %{QUERY_STRING} !(^|&amp;)coral-no-serve$&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?digg\.com [OR]&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?slashdot\.org [OR]&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?slashdot\.com [OR]&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?fark\.com [OR]&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?somethingawful\.com [OR]&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?kuro5hin\.org [OR]&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?engadget\.com [OR]&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?boingboing\.net [OR]&lt;br /&gt;RewriteCond %{HTTP_REFERER} ^http://(www\.)?del\.icio\.us&lt;br /&gt;RewriteRule ^(.*)$ http://www.yourdomain.com.nyud.net:8080$1 [R,L]&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Sun, 23 Apr 2006 14:43:02 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1957</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>Banning bad bots</title>
      <link>http://snippets.dzone.com/posts/show/1935</link>
      <description>The following code is the contents of /banme/index.php.  This file is linked to from my main website but invisible to web browsers and disallowed in robots.txt.  Therefore, only bad bots will ever follow this link and when they do so they will get banned in .htaccess and their ip address will be emailed to webmaster@example.com.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;$i = getenv('REMOTE_ADDR');&lt;br /&gt;$handle = fopen("../.htaccess", "a");&lt;br /&gt;fwrite($handle, "Deny from $i\n");&lt;br /&gt;fclose($handle);&lt;br /&gt;echo "You've just got $i banned from this domain.  You are a very bad person.";&lt;br /&gt;mail("webmaster@example.com", "Banned IP", "Deny from $i");&lt;br /&gt;?&gt;&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Tue, 18 Apr 2006 15:49:13 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1935</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>Googleit WordPress Plugin</title>
      <link>http://snippets.dzone.com/posts/show/1911</link>
      <description>WordPress plugin that prints a "Google It" link which searches Google for the current post's title. Usage: google_it();&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;/*&lt;br /&gt;Plugin Name: Googleit&lt;br /&gt;Plugin URI: http://lordrich.com/archives/2005/04/02/just-google-it/&lt;br /&gt;Description: Link to google for the current title.  Usage: google_it();&lt;br /&gt;Version: 0.1&lt;br /&gt;Author: Richard Kirkcaldy&lt;br /&gt;Author URI: http://lordrich.com&lt;br /&gt;*/&lt;br /&gt;&lt;br /&gt;function google_it(){&lt;br /&gt;	$google = '&lt;a href="http://www.google.com/search?q='.get_the_title().'"&gt;Google It&lt;/a&gt;';&lt;br /&gt;	echo $google;&lt;br /&gt;	}&lt;br /&gt;?&gt;&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Thu, 13 Apr 2006 16:20:36 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1911</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>Cerberus recent requesters</title>
      <link>http://snippets.dzone.com/posts/show/1898</link>
      <description>Display a list of email addresses for people who have filed tickets in the past 30 days.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;$database = 'cerberus';&lt;br /&gt;$username = 'cerberus';&lt;br /&gt;$password = 'cerberus';&lt;br /&gt;$hostname = 'localhost';&lt;br /&gt;&lt;br /&gt;$sql = "&lt;br /&gt;SELECT distinct address.address_address&lt;br /&gt;FROM ticket, requestor, address&lt;br /&gt;WHERE  DATE_SUB(CURDATE(),INTERVAL 30 DAY) &lt;= last_update_date&lt;br /&gt;AND ticket_status='resolved'&lt;br /&gt;AND ticket.ticket_id=requestor.ticket_id&lt;br /&gt;AND requestor.suppress='0'&lt;br /&gt;AND requestor.address_id=address.address_id";&lt;br /&gt;&lt;br /&gt;mysql_connect($hostname,$username,$password) or die('error connecting to the database');&lt;br /&gt;@mysql_select_db($database) or die('unable to select database');&lt;br /&gt;$result = mysql_query($sql) or die(mysql_error());&lt;br /&gt;$num=mysql_numrows($result);&lt;br /&gt;mysql_close();&lt;br /&gt;&lt;br /&gt;$i = 0;&lt;br /&gt;while($i &lt; $num) {&lt;br /&gt;	$address_address = mysql_result($result,$i,"address.address_address");&lt;br /&gt;	echo "$address_address&lt;br/&gt;";&lt;br /&gt;	$i++;&lt;br /&gt;}&lt;br /&gt;?&gt;&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Tue, 11 Apr 2006 20:52:48 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1898</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>Random Flickr Photo</title>
      <link>http://snippets.dzone.com/posts/show/1897</link>
      <description>// description of your code here&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;&lt;br /&gt;    // Displays a single random photo from recent flickr photos with a given tag.&lt;br /&gt;    // Original code stolen from many sources including http://www.thebishop.net/geodog/archives/2004/09/29/fun_hacking_with_flickr_making_a_homemade_flickr_tag_badge_with_magpierss.html and http://prwdot.org/archives/002468.html&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;    // USER CONFIGURATION SECTION&lt;br /&gt;&lt;br /&gt;    // MagpieRSS Configuration&lt;br /&gt;    // This is an example based on my system;&lt;br /&gt;    // you will need to customize it for your system&lt;br /&gt;    // and your preferences. You can remove it entirely&lt;br /&gt;    // if you have done it elsewhere&lt;br /&gt;    // refer to http://magpierss.sourceforge.net/&lt;br /&gt;    require_once('/var/www/bradford/magpierss-0.61/rss_fetch.inc');&lt;br /&gt;    error_reporting(E_ERROR);&lt;br /&gt;    define(MAGPIE_CACHE_ON, true);&lt;br /&gt;    define(MAGPIE_CACHE_DIR, '/var/www/bradford/magpie_cache');&lt;br /&gt;    define(MAGPIE_CACHE_AGE, 300);&lt;br /&gt;    define(MAGPIE_CACHE_FRESH_ONLY, false);&lt;br /&gt;    define(MAGPIE_DETECT_ENCODING, true);&lt;br /&gt;    define(MAGPIE_DEBUG, 0);&lt;br /&gt;    define(MAGPIE_FETCH_TIME_OUT, 15);&lt;br /&gt;    define(MAGPIE_USE_GZIP, true);    &lt;br /&gt;&lt;br /&gt;    // flickr configuration&lt;br /&gt;    // How many photos you want to display&lt;br /&gt;    $num_photos = 2; // for some reason it doesn't like 1&lt;br /&gt;    $tag = 'Bradford';&lt;br /&gt;    // URL for the flickr feed you want to use&lt;br /&gt;    $flickr_feed_url ='http://www.flickr.com/services/feeds/photos_public.gne?tags='.$tag.'&amp;format=rss_200';&lt;br /&gt;?&gt;&lt;br /&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;    // Fetch the feed&lt;br /&gt;    $flickr = fetch_rss( $flickr_feed_url );&lt;br /&gt;    if 
($flickr) {&lt;br /&gt;        $flickr_title = $flickr-&gt;channel["title"];&lt;br /&gt;        $flickr_link = $flickr-&gt;channel["link"];&lt;br /&gt;?&gt;&lt;br /&gt;&lt;br /&gt;&lt;!-- Display the title and link to the feed --&gt;&lt;br /&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;    // Pick some random photos&lt;br /&gt;        $random_photos = array_rand($flickr-&gt;items,$num_photos);&lt;br /&gt;        foreach ( $random_photos as $random_photo ) {&lt;br /&gt;            $description = explode("\n\n",$flickr-&gt;items[$random_photo]["description"]);&lt;br /&gt;?&gt;&lt;br /&gt;&lt;br /&gt;    &lt;!-- Display the given photo --&gt;&lt;br /&gt;&lt;br /&gt;	&lt;?php echo ereg_replace('&lt;img src=(.*) width=(.*)&gt;', '&lt;img src=\\1 width="150px"/&gt;', $description[1]);&lt;br /&gt;	  die();?&gt; // ok we've got our first photo - lets exit&lt;br /&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;        }&lt;br /&gt;      } else {&lt;br /&gt;?&gt;&lt;br /&gt;&lt;br /&gt;&lt;!-- Display an error message if things didn't work --&gt;&lt;br /&gt;&lt;p&gt;An error occurred in the MagpieRSS parser:&lt;/p&gt;&lt;br /&gt;&lt;br /&gt;&lt;p&gt;&lt;?php echo magpie_error(); ?&gt;&lt;/p&gt;&lt;br /&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;    } &lt;br /&gt;?&gt;&lt;br /&gt;&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Tue, 11 Apr 2006 20:50:20 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1897</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>CPP SMTP Client</title>
      <link>http://snippets.dzone.com/posts/show/1896</link>
      <description>An SMTP client in CPP, apparently for a Linux machine.  It was found in my old source code folder and so may not be fully working.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;br /&gt;#include &lt;sys/types.h&gt;&lt;br /&gt;#include &lt;sys/socket.h&gt;&lt;br /&gt;#include &lt;netinet/in.h&gt;&lt;br /&gt;#include &lt;netdb.h&gt;&lt;br /&gt;#include &lt;stdio.h&gt;&lt;br /&gt;&lt;br /&gt;#define HELO "HELO\n"&lt;br /&gt;#define DATA "DATA\n"&lt;br /&gt;#define QUIT "QUIT\n"&lt;br /&gt;&lt;br /&gt;FILE *fin;&lt;br /&gt;int sock;&lt;br /&gt;struct sockaddr_in server;&lt;br /&gt;struct hostent *hp, *gethostbyname();&lt;br /&gt;char buf[BUFSIZ+1];&lt;br /&gt;int len;&lt;br /&gt;char *host_id;&lt;br /&gt;char *from_id;&lt;br /&gt;char *to_id;&lt;br /&gt;char *file_id;&lt;br /&gt;char wkstr[100];&lt;br /&gt;&lt;br /&gt;/*=====Send a string to the socket=====*/&lt;br /&gt;&lt;br /&gt;send_socket(char *s)&lt;br /&gt;{&lt;br /&gt;	write(sock,s,strlen(s));&lt;br /&gt;	write(1,s,strlen(s));&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;/*=====Read a string from the socket=====*/&lt;br /&gt;&lt;br /&gt;read_socket()&lt;br /&gt;{&lt;br /&gt;	len = read(sock,buf,BUFSIZ);&lt;br /&gt;	write(1,buf,len);&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;/*=====MAIN=====*/&lt;br /&gt;int main(int argc, char* argv[])&lt;br /&gt;{&lt;br /&gt;&lt;br /&gt;if(argc != 5)&lt;br /&gt;{&lt;br /&gt; printf("USAGE: %s &lt;host&gt; &lt;from&gt; &lt;to&gt; &lt;filename&gt;\n\n", argv[0]);&lt;br /&gt; exit(1);&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;host_id=argv[1];&lt;br /&gt;from_id=argv[2];&lt;br /&gt;to_id=argv[3];&lt;br /&gt;file_id=argv[4];&lt;br /&gt;&lt;br /&gt;/*=====Create Socket=====*/&lt;br /&gt;sock = socket(AF_INET, SOCK_STREAM, 0);&lt;br /&gt;if (sock==-1)&lt;br /&gt;{&lt;br /&gt; perror("opening stream socket");&lt;br /&gt; exit(1);&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;/*=====Verify host=====*/&lt;br 
/&gt;server.sin_family = AF_INET;&lt;br /&gt;hp = gethostbyname(host_id);&lt;br /&gt;if (hp==(struct hostent *) 0)&lt;br /&gt;{&lt;br /&gt; fprintf(stderr, "%s: unknown host\n", host_id);&lt;br /&gt; exit(2);&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;/*=====Connect to port 25 on remote host=====*/&lt;br /&gt;printf ("hostent %s\n", hp-&gt;h_addr_list[0]);&lt;br /&gt;memcpy((char *) &amp;server.sin_addr, (char *) hp-&gt;h_addr, hp-&gt;h_length);&lt;br /&gt;&lt;br /&gt;server.sin_port=htons(25); /* SMTP PORT */&lt;br /&gt;&lt;br /&gt;if (connect(sock, (struct sockaddr *) &amp;server, sizeof server)==-1)&lt;br /&gt;{&lt;br /&gt; perror("connecting stream socket");&lt;br /&gt; exit(1);&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;/*=====Write some data then read some =====*/&lt;br /&gt;&lt;br /&gt;read_socket(); /* SMTP Server logon string */&lt;br /&gt;&lt;br /&gt;send_socket(HELO); /* introduce ourselves */&lt;br /&gt;read_socket(); /*Read reply */&lt;br /&gt;&lt;br /&gt;send_socket("MAIL from: "); /* Mail from us */&lt;br /&gt;send_socket(from_id);&lt;br /&gt;send_socket("\n");&lt;br /&gt;read_socket(); /* Sender OK */&lt;br /&gt;&lt;br /&gt;send_socket("RCPT To: "); /*Mail to*/&lt;br /&gt;send_socket(to_id);&lt;br /&gt;send_socket("\n");&lt;br /&gt;read_socket(); /*Recipient OK*/&lt;br /&gt;&lt;br /&gt;send_socket(DATA);/*body to follow*/&lt;br /&gt;read_socket(); /*ok to send */&lt;br /&gt;&lt;br /&gt;fin=fopen(file_id, "r"); /* open file */&lt;br /&gt;while(1)&lt;br /&gt;{&lt;br /&gt; if(fgets(wkstr, 100, fin)==NULL) break; /* exit on EOF */&lt;br /&gt; send_socket(wkstr);&lt;br /&gt;}&lt;br /&gt;fclose(fin); /* close file */&lt;br /&gt;&lt;br /&gt;send_socket(fin); /*send file*/&lt;br /&gt;send_socket(".\n");&lt;br /&gt;&lt;br /&gt;read_socket(); /* OK*/&lt;br /&gt;send_socket(QUIT); /* quit */&lt;br /&gt;read_socket(); /* log off */&lt;br /&gt;&lt;br /&gt;/*=====Close socket and finish=====*/&lt;br /&gt;close(sock);
&lt;br /&gt;exit(0);&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Tue, 11 Apr 2006 20:43:40 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1896</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>Another perl crawler</title>
      <link>http://snippets.dzone.com/posts/show/1895</link>
      <description>Again found in my old source folder, it may not fully work.&lt;br /&gt;&lt;br /&gt;This Perl script reads in the existing links from links.dat into the array @bigarray.  It then loops through the array reading in each link and appending the new links it finds to links.dat.  If the script were run in a loop it would add every single web address it can find to links.dat.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;#!/usr/bin/perl &lt;br /&gt;use IO::Socket; &lt;br /&gt;use URI; &lt;br /&gt; &lt;br /&gt;open(LINKS, "&lt;&lt; links.dat"); &lt;br /&gt;@bigarray = (); &lt;br /&gt;while (&lt;LINKS&gt;) { &lt;br /&gt;        chomp; &lt;br /&gt;        push(@bigarray, $_); &lt;br /&gt;} &lt;br /&gt;close(LINKS); &lt;br /&gt; &lt;br /&gt;foreach $uri (@bigarray) { &lt;br /&gt;        ($domain = URI-&gt;new($uri)-&gt;authority) =~ s/^www\.//i; &lt;br /&gt;        $socket = IO::Socket::INET-&gt;new(PeerAddr &lt;br /&gt;                                =&gt; $domain, &lt;br /&gt;                                PeerPort =&gt; 80, &lt;br /&gt;                                Proto =&gt; 'tcp', &lt;br /&gt;                                Type =&gt; SOCK_STREAM) &lt;br /&gt;        or die "Couldn't connect"; &lt;br /&gt;        print $socket "GET / HTTP/1.0\n\n"; &lt;br /&gt;        #$page = &lt;$socket&gt;; &lt;br /&gt;        open(LINKS, "&gt;&gt; links.dat"); &lt;br /&gt;        while (defined($line = &lt;$socket&gt;)) { &lt;br /&gt;                $line =~ m{href="(.*?)"}ig; &lt;br /&gt;                print LINKS "$1"; &lt;br /&gt;            } &lt;br /&gt;        close(LINKS); &lt;br /&gt;        close($socket); &lt;br /&gt;}&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Tue, 11 Apr 2006 20:41:05 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1895</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>PHP Web Crawler</title>
      <link>http://snippets.dzone.com/posts/show/1894</link>
      <description>Example output:&lt;br /&gt;&lt;br /&gt;-bash-2.05b$ php asp.php&lt;br /&gt;http://www.example.com&lt;br /&gt;http://www.rfc-editor.org/rfc/rfc2606.txt&lt;br /&gt;No links.&lt;br /&gt;&lt;br /&gt;-bash-2.05b$ cat links.dat&lt;br /&gt;http://www.example.com&lt;br /&gt;http://www.rfc-editor.org/rfc/rfc2606.txt&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;?php&lt;br /&gt;$datafile = "links.dat"; // file to keep the list of links in&lt;br /&gt;$regex = "/&lt;\s*a\s+[^&gt;]*href\s*=\s*[\"']?([^\"' &gt;]+)[\"' &gt;]/isU";  // regex to search for hrefs&lt;br /&gt;&lt;br /&gt;$handle = fopen($datafile, "r"); // open the data file&lt;br /&gt;$buffer = fgets($handle, 4096);&lt;br /&gt;$oldlinks[] = $buffer; // read the first link into an array&lt;br /&gt;while (!feof($handle)) {&lt;br /&gt;	$buffer = fgets($handle, 4096);&lt;br /&gt;	array_push($oldlinks,$buffer); // read the rest of the links into an array&lt;br /&gt;}&lt;br /&gt;fclose($handle); // close the data file&lt;br /&gt;&lt;br /&gt;foreach($oldlinks as $value) { // for every link in the array&lt;br /&gt;	print $value; // print it out&lt;br /&gt;	$remote = fopen(trim($value), "r") or die(); //open it or fail nicely&lt;br /&gt;	while (!feof($remote)) {&lt;br /&gt;		$html = fread($remote, 8192); // read in the remote page&lt;br /&gt;	}&lt;br /&gt;	fclose($remote); // close it&lt;br /&gt;	if (preg_match_all($regex, $html, $links)) { // if we find new links&lt;br /&gt;		$local = fopen($datafile, "a+"); // open the data file&lt;br /&gt;		foreach($links[1] as $value) { // for every new link&lt;br /&gt;			$value.="\n"; // append a new line&lt;br /&gt;			if(!in_array($value,$oldlinks)) { // if we haven't seen it before (nb - case sensitive)&lt;br /&gt;				print($value); // print it out&lt;br /&gt;				fwrite($local, $value); // and write it to file&lt;br /&gt;			}&lt;br /&gt;		}&lt;br /&gt;		fclose($local); // close the data file&lt;br /&gt;	}&lt;br /&gt;	
else {&lt;br /&gt;		print("No links."); // we didn't find any links in the new file&lt;br /&gt;	}&lt;br /&gt;}&lt;br /&gt;?&gt;&lt;br /&gt;&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Tue, 11 Apr 2006 20:39:36 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1894</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
    <item>
      <title>Ruby web crawler</title>
      <link>http://snippets.dzone.com/posts/show/1893</link>
      <description>NB. Again, this script was found in my old source code folder, it may not be fully working.&lt;br /&gt;&lt;br /&gt;This Ruby script reads in a list of links from links.dat, it then picks out the ones it can easily spider and gets a list of URLs from each page listed in links.dat.  Every new URL it finds will be added to newlinks.dat for later spidering by another bot running along side this one. &lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;require 'socket' &lt;br /&gt;links = File.open("links.dat") &lt;br /&gt;while links.gets do &lt;br /&gt;        #domain = ($_ =~ /http:\/\/.*\.([0-9a-zA-Z\-]+\.com|net|org)/); &lt;br /&gt;        if %r{http://([^/]+)/([^/]+)}i =~ $_ &lt;br /&gt;                domain,path = $1, $2 &lt;br /&gt;        end &lt;br /&gt;        if proto="http" &lt;br /&gt;                begin &lt;br /&gt;                        t = TCPSocket.new(domain, 'www') &lt;br /&gt;                rescue &lt;br /&gt;                        puts "error: #{$!}" &lt;br /&gt;                else &lt;br /&gt;                        t.print "GET /"+path+" HTTP/1.0\n\n" &lt;br /&gt;                        answer = t.gets(nil) &lt;br /&gt;                        t.close &lt;br /&gt;                end &lt;br /&gt; &lt;br /&gt;                if %r{&lt;a\s+href="(\w+)://([^"]+)"[^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;}i =~ answer &lt;br /&gt;                        proto, url, text = $1, $2, $3 &lt;br /&gt;                end &lt;br /&gt; &lt;br /&gt;                print proto+"://"+url+"\n" &lt;br /&gt;                old = File.open("newlinks.dat") &lt;br /&gt;                new = File.open("links.dat.tmp", File::WRONLY|File::TRUNC|File::CREAT) &lt;br /&gt;                while old.gets do &lt;br /&gt;                        if $_ != proto+"://"+url &lt;br /&gt;                                new.print $_ &lt;br /&gt;                        end &lt;br /&gt;                end &lt;br /&gt;                new.print 
proto+"://"+url &lt;br /&gt;                old.close &lt;br /&gt;                new.close &lt;br /&gt;                File.rename("newlinks.dat", "links.dat.orig") &lt;br /&gt;                File.rename("links.dat.tmp", "newlinks.dat") &lt;br /&gt;        end &lt;br /&gt;end &lt;br /&gt;links.close&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Tue, 11 Apr 2006 20:36:29 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1893</guid>
      <dc:creator>lordrich</dc:creator>
    </item>
  </channel>
</rss>
