<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DZone Snippets: show code</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Thu, 21 Aug 2008 01:35:18 GMT</pubDate>
    <description>DZone Snippets: show code</description>
    <item>
      <title>I can't be bothered going downstairs to watch the daily show</title>
      <link>http://snippets.dzone.com/posts/show/4794</link>
      <description>scraper.rb - scrapes metadata from thedailyshow.com.&lt;br /&gt;Just run it - creates videos/index.yml&lt;br /&gt;&lt;code&gt;&lt;br /&gt;#!/usr/bin/ruby&lt;br /&gt;&lt;br /&gt;require 'rubygems'&lt;br /&gt;require 'leecher'&lt;br /&gt;require 'open-uri'&lt;br /&gt;require 'hpricot'&lt;br /&gt;&lt;br /&gt;class Scraper&lt;br /&gt;	SEARCH_PAGE="http://www.thedailyshow.com/tds_files/includes/search/search_results.jhtml"&lt;br /&gt;	def scrape_day(site, date, force=false)&lt;br /&gt;		return if site.skip.include?(date) and not force&lt;br /&gt;		url = sprintf("#{SEARCH_PAGE}?searchterm=%02d-%02d-%04d",date.month,date.day,date.year)&lt;br /&gt;		puts "Fetching #{url}" if $DEBUG&lt;br /&gt;&lt;br /&gt;		data = open(url) {|f| f.read }&lt;br /&gt;		h = Hpricot(data)&lt;br /&gt;&lt;br /&gt;		# can't use a real xpath - they all use an _id_ instead of a class&lt;br /&gt;		results = (h/"div").find_all {|x| x['id'] == "videoListItem_1" }&lt;br /&gt;		results.each {|result|&lt;br /&gt;			url = result.at("a[1]")['href']&lt;br /&gt;			vid = if url =~ /videoId=(\d+)/&lt;br /&gt;				$1.to_i&lt;br /&gt;			else&lt;br /&gt;				raise "Failed to parse link #{url}"&lt;br /&gt;			end&lt;br /&gt;&lt;br /&gt;			title = ((result/"a")[1]/"text()").to_s&lt;br /&gt;			date = Date.parse((result/"a[@onclick][1]/text()").to_s)&lt;br /&gt;&lt;br /&gt;			descr = result/"div.video_description"&lt;br /&gt;			description = (descr/"div[1]/text()").to_s&lt;br /&gt;			tags = (descr/".tags/a/text()").map {|x| unescape(x.to_s) }&lt;br /&gt;			&lt;br /&gt;			vid = Video.new(site, vid, date, unescape(title), unescape(description), tags)&lt;br /&gt;			puts vid&lt;br /&gt;		}&lt;br /&gt;	&lt;br /&gt;		results.length&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def unescape(t)&lt;br /&gt;		t.gsub(/&amp;([^;]{1,5});/) {|ent|&lt;br /&gt;			case $1&lt;br /&gt;				when /^#(\d+)$/&lt;br /&gt;					[$1.to_i].pack('C')&lt;br /&gt;				when /^#x([0-9a-zA-Z]+)$/
&lt;br /&gt;					[$1].pack('H2')&lt;br /&gt;				when 'amp'&lt;br /&gt;					'&amp;'&lt;br /&gt;				when 'gt'&lt;br /&gt;					'&gt;'&lt;br /&gt;				when 'quot'&lt;br /&gt;					'"'&lt;br /&gt;				when 'apos'&lt;br /&gt;					"'"&lt;br /&gt;				when 'lt'&lt;br /&gt;					'&lt;'&lt;br /&gt;				else&lt;br /&gt;					$stderr.puts "Unknown entity #{$1.inspect}"&lt;br /&gt;					'?'&lt;br /&gt;			end&lt;br /&gt;		}&lt;br /&gt;	end&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;if __FILE__ == $0&lt;br /&gt;	Site.load&lt;br /&gt;	tds = Site.sites.find {|s| s.short_name == "tds" }&lt;br /&gt;	tds ||= Site.new("The Daily Show","tds")&lt;br /&gt;&lt;br /&gt;	day = Date.today&lt;br /&gt;	start = Date.new(1999)&lt;br /&gt;&lt;br /&gt;	counter=0&lt;br /&gt;&lt;br /&gt;	s = Scraper.new&lt;br /&gt;	while day &gt;= start&lt;br /&gt;		results = s.scrape_day(tds, day)&lt;br /&gt;		puts "#{results} results for #{day}" if results # else skipped&lt;br /&gt;&lt;br /&gt;		# Mark days as done once we've scraped them a month after air&lt;br /&gt;		if (Date.today - day) &gt; 30 and not results.nil?&lt;br /&gt;			tds.skip!(day)&lt;br /&gt;		end&lt;br /&gt;&lt;br /&gt;		day -= 1&lt;br /&gt;		unless results.nil? or results.zero?&lt;br /&gt;			Site.save if (counter += 1)%10 == 0&lt;br /&gt;		end&lt;br /&gt;	end&lt;br /&gt;	Site.save&lt;br /&gt;end&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;leecher.rb - downloads/searches/plays videos&lt;br /&gt;./leecher.rb [download/list/play] [searchterm ... 
]&lt;br /&gt;Search terms can be &lt;br /&gt;  date: 2007 or 2007-10 or 2007-10-01&lt;br /&gt;  tag: interview&lt;br /&gt;  id: 31723&lt;br /&gt;  already downloaded?: downloaded or !downloaded&lt;br /&gt;&lt;code&gt;&lt;br /&gt;#!/usr/bin/ruby&lt;br /&gt;&lt;br /&gt;MEDIA_PLAYER = %w{mplayer -fs}&lt;br /&gt;&lt;br /&gt;require 'rubygems'&lt;br /&gt;require 'open-uri'&lt;br /&gt;require 'rexml/document'&lt;br /&gt;require 'rexml/xpath'&lt;br /&gt;require 'fileutils'&lt;br /&gt;require 'yaml'&lt;br /&gt;require 'rio'&lt;br /&gt;require 'set'&lt;br /&gt;&lt;br /&gt;class Site&lt;br /&gt;	class &lt;&lt; self&lt;br /&gt;		attr_reader :base&lt;br /&gt;		attr_reader :alternates&lt;br /&gt;		attr_reader :sites&lt;br /&gt;		&lt;br /&gt;		def init&lt;br /&gt;			unless self.base or self.sites&lt;br /&gt;				@base = "./videos"&lt;br /&gt;				@alternates = []&lt;br /&gt;				@sites = []&lt;br /&gt;			end&lt;br /&gt;		end&lt;br /&gt;&lt;br /&gt;		def load(stream=nil)&lt;br /&gt;			if stream&lt;br /&gt;				stuff = YAML::load(stream)&lt;br /&gt;				@base = stuff['base'] || "./videos"&lt;br /&gt;				@sites = stuff['sites'] || []&lt;br /&gt;				@alternates = stuff['alternates'] || []&lt;br /&gt;			else&lt;br /&gt;				init&lt;br /&gt;				begin&lt;br /&gt;					File.open(File.join(base,'index.yml')) {|f|&lt;br /&gt;						load(f)&lt;br /&gt;					}&lt;br /&gt;				rescue Errno::ENOENT&lt;br /&gt;					$stderr.puts "Warning, no database found, starting a new one"&lt;br /&gt;				end&lt;br /&gt;			end&lt;br /&gt;		end&lt;br /&gt;		&lt;br /&gt;		def save(stream=nil)&lt;br /&gt;			if stream&lt;br /&gt;				YAML::dump({'base' =&gt; base, 'sites' =&gt; sites, 'alternates' =&gt; 'alternates'},stream)&lt;br /&gt;			else&lt;br /&gt;				File.open(File.join(base,'index.yml_'),'w') {|f|&lt;br /&gt;					save(f)&lt;br /&gt;				}&lt;br /&gt;				FileUtils.mv(File.join(base,'index.yml_'),File.join(base,'index.yml'))&lt;br /&gt;			end&lt;br /&gt;		
end&lt;br /&gt;&lt;br /&gt;		def each(&amp;block)&lt;br /&gt;			sites.each(&amp;block)&lt;br /&gt;		end&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def initialize(name, short_name=name)&lt;br /&gt;		Site.init&lt;br /&gt;		@videos = {}&lt;br /&gt;		@name, @short_name = name, short_name&lt;br /&gt;		@skip = Set.new&lt;br /&gt;		Site.sites &lt;&lt; self&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	attr_reader :videos&lt;br /&gt;	attr_reader :name&lt;br /&gt;	attr_reader :short_name&lt;br /&gt;	attr_reader :skip&lt;br /&gt;&lt;br /&gt;	def directory&lt;br /&gt;		File.join(Site.base, short_name)&lt;br /&gt;	end&lt;br /&gt;	def directory_alternates&lt;br /&gt;		Site.alternates.map {|d| File.join(d, short_name) }&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def ensure_dir_exists!&lt;br /&gt;		FileUtils.mkpath(directory)&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def &lt;&lt;(vid)&lt;br /&gt;		self.videos[vid.id] = vid&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def skip!(date)&lt;br /&gt;		self.skip &lt;&lt; date&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def [](id)&lt;br /&gt;		self.videos[id]&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def to_s&lt;br /&gt;		name&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def each &lt;br /&gt;		self.videos.each {|k,v| yield v }	&lt;br /&gt;	end&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;class Video&lt;br /&gt;	attr_reader :tags&lt;br /&gt;	attr_reader :site&lt;br /&gt;	attr_reader :id&lt;br /&gt;	attr_reader :date&lt;br /&gt;	attr_reader :title&lt;br /&gt;	attr_reader :description&lt;br /&gt;&lt;br /&gt;	def initialize(site, id, date=nil, title = nil, description=nil, tags=[]) &lt;br /&gt; 		@site = site&lt;br /&gt;		@id = id&lt;br /&gt;		@title = title&lt;br /&gt;		@tags = tags&lt;br /&gt;		@date = date&lt;br /&gt;		@description = description&lt;br /&gt;&lt;br /&gt;		site &lt;&lt; self&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def filename&lt;br /&gt;		site.directory_alternates.map{|x| &lt;br /&gt;			
File.join(x,"#{id}.flv")&lt;br /&gt;		}.find {|f| &lt;br /&gt;			File.exists?(f) &lt;br /&gt;		} || File.join(site.directory, "#{id}.flv")&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def downloaded?&lt;br /&gt;		File.exists?(filename)&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def download&lt;br /&gt;		download! unless downloaded?&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	SHARED_DATA = "http://www.comedycentral.com/sitewide/video_player/shared/data"&lt;br /&gt;	def download!&lt;br /&gt;		site.ensure_dir_exists!&lt;br /&gt;		url = download_url()&lt;br /&gt;		begin&lt;br /&gt;			rio(url) &gt; rio(filename)&lt;br /&gt;			File.size(filename)&lt;br /&gt;		rescue Exception =&gt; x&lt;br /&gt;			begin&lt;br /&gt;				File.delete(filename)&lt;br /&gt;			rescue Exception&lt;br /&gt;			end&lt;br /&gt;			raise x&lt;br /&gt;		end&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def to_s &lt;br /&gt;		sprintf("[%1s %7d - %s - %s - %20s]",(downloaded?? 'D' : ' '), id, date, site, title) &lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def download_url&lt;br /&gt;		manifest = open("#{SHARED_DATA}/flv_xml_gen.jhtml?ml_video=#{id}&amp;hiLoPref=hi") {|f| f.read }&lt;br /&gt;		doc = REXML::Document.new(manifest)&lt;br /&gt;		REXML::XPath.first(doc, "/package/video/item/src/text()").to_s&lt;br /&gt;	end&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;class Filter&lt;br /&gt;	class &lt;&lt; self&lt;br /&gt;		def method_missing(sym,*args,&amp;block)&lt;br /&gt;			if sym.to_s =~ /^by/&lt;br /&gt;				new.send(sym,*args,&amp;block)&lt;br /&gt;			else&lt;br /&gt;				super&lt;br /&gt;			end&lt;br /&gt;		end&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def initialize(parent=nil,&amp;block)&lt;br /&gt;		@parent = parent&lt;br /&gt;		@test = block&lt;br /&gt;	end&lt;br /&gt;	&lt;br /&gt;	def [](video)&lt;br /&gt;		case video&lt;br /&gt;			when Video&lt;br /&gt;				video if (@test.nil? or @test[video]) and (@parent.nil? 
or @parent[video])&lt;br /&gt;			when Site&lt;br /&gt;				if block_given?&lt;br /&gt;					video.each {|v| yield v if self[v] }&lt;br /&gt;				else&lt;br /&gt;					video.find_all {|v| self[v] }&lt;br /&gt;				end&lt;br /&gt;			else&lt;br /&gt;				raise ArgumentError&lt;br /&gt;		end&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def each(&amp;block)&lt;br /&gt;		Site.each {|show| self.send(:[], show, &amp;block) }&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def filter(&amp;block)&lt;br /&gt;		Filter.new(self,&amp;block)&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def by_downloaded(dl=true)&lt;br /&gt;		filter {|v| v.downloaded? == !!dl }&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def by_date(y, m=nil, d=nil)&lt;br /&gt;		if m.nil? and d.nil?&lt;br /&gt;			filter {|v| v.date and v.date.year == y }&lt;br /&gt;		elsif d.nil?&lt;br /&gt;			filter {|v| v.date and v.date.year == y and v.date.month == m }&lt;br /&gt;		else&lt;br /&gt;			filter {|v| v.date and v.date.year == y and v.date.month == m and v.date.day == d }&lt;br /&gt;		end&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def by_id(vid)&lt;br /&gt;		filter {|v| v.id == vid }&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def by_tag(tag)&lt;br /&gt;		filter {|v| v.tags.include? tag }&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def by_text(text) &lt;br /&gt;		filter {|v| &lt;br /&gt;			v.title &amp;&amp; v.title.downcase.include?(text.downcase) or &lt;br /&gt;			v.description &amp;&amp; v.description.downcase.include?(text.downcase) or&lt;br /&gt;			v.tags.include? 
text&lt;br /&gt;		}&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	def by(arg)&lt;br /&gt;		arg = arg.to_s&lt;br /&gt;		case arg&lt;br /&gt;			when 'downloaded'&lt;br /&gt;				by_downloaded(true)&lt;br /&gt;			when '!downloaded'&lt;br /&gt;				by_downloaded(false)&lt;br /&gt;			when /^(\d{4})$/&lt;br /&gt;				by_date($1.to_i)&lt;br /&gt;			when /^(\d{4})-(\d{1,2})$/&lt;br /&gt;				by_date($1.to_i, $2.to_i)&lt;br /&gt;			when /^(\d{4})-(\d{1,2})-(\d{1,2})$/&lt;br /&gt;				by_date($1.to_i, $2.to_i, $3.to_i)&lt;br /&gt;			when /^\d{5,}$/&lt;br /&gt;				by_id(arg.to_i)&lt;br /&gt;			else&lt;br /&gt;				by_text(arg)&lt;br /&gt;		end&lt;br /&gt;	end&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;if __FILE__ == $0&lt;br /&gt;	command = ARGV.shift or raise "Usage: leecher &lt;download/list&gt; [filters...]"&lt;br /&gt;	action = case command.downcase&lt;br /&gt;		when 'download'&lt;br /&gt;			proc {|v| &lt;br /&gt;				puts "Fetching #{v}" unless v.downloaded?&lt;br /&gt;				begin&lt;br /&gt;					v.download&lt;br /&gt;				rescue OpenURI::HTTPError =&gt; x&lt;br /&gt;					$stderr.puts "Download failed, skipping: #{x}"&lt;br /&gt;				rescue Errno::ENOENT =&gt; xm&lt;br /&gt;					$stderr.puts "Download failed, skipping: #{x}"&lt;br /&gt;				end&lt;br /&gt;			}&lt;br /&gt;		when 'list'&lt;br /&gt;			proc {|v|&lt;br /&gt;				puts "#{v} #{v.tags.join(', ')}"&lt;br /&gt;			}&lt;br /&gt;		when 'play'&lt;br /&gt;			proc {|v|&lt;br /&gt;				puts v&lt;br /&gt;				system(*MEDIA_PLAYER, v.filename)&lt;br /&gt;			}&lt;br /&gt;		else&lt;br /&gt;			raise "Unknown command #{command}"&lt;br /&gt;	end&lt;br /&gt;&lt;br /&gt;	filter = ARGV.inject(Filter.new) {|f, arg| f.by(arg)}&lt;br /&gt;&lt;br /&gt;	Site.load&lt;br /&gt;	filter.each(&amp;action)&lt;br /&gt;end&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;</description>
      <pubDate>Sun, 18 Nov 2007 06:44:14 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/4794</guid>
      <author>tunah (Sam McCall)</author>
    </item>
  </channel>
</rss>
