I can't be bothered going downstairs to watch the daily show
Just run the scraper script below - it creates (or updates) videos/index.yml
#!/usr/bin/ruby
require 'rubygems'
require 'date'
require 'leecher'
require 'open-uri'
require 'hpricot'

# Scrapes the Daily Show search page one air date at a time and registers
# every video it finds with a Site (Video.new adds itself to the site).
class Scraper
  SEARCH_PAGE="http://www.thedailyshow.com/tds_files/includes/search/search_results.jhtml"

  # Matches decimal (&#8217;), hexadecimal (&#x2019;) and short named entities.
  # Numeric forms get their own, longer alternatives so that code points above
  # four digits are still recognised; named entities keep the 5-character cap.
  ENTITY = /&(#\d{1,7}|#x[0-9a-fA-F]{1,6}|[^;]{1,5});/

  # The only named entities this scraper translates.
  NAMED_ENTITIES = {
    'amp'  => '&',
    'gt'   => '>',
    'quot' => '"',
    'apos' => "'",
    'lt'   => '<'
  }.freeze

  # Fetches the search results for +date+ and creates a Video for each hit.
  #
  # site  - the Site the videos belong to; its skip set is consulted first.
  # date  - the Date to search for.
  # force - when true, scrape even if +date+ is in the site's skip set.
  #
  # Returns the number of results found, or nil when the day was skipped.
  # Raises RuntimeError when a result link carries no videoId parameter.
  def scrape_day(site, date, force=false)
    return if site.skip.include?(date) && !force

    search_url = sprintf("#{SEARCH_PAGE}?searchterm=%02d-%02d-%04d", date.month, date.day, date.year)
    puts "Fetching #{search_url}" if $DEBUG
    page = Hpricot(fetch(search_url))

    # can't use a real xpath - they all use an _id_ instead of a class
    results = (page/"div").find_all {|x| x['id'] == "videoListItem_1" }
    results.each do |result|
      link = result.at("a[1]")['href']
      raise "Failed to parse link #{link}" unless link =~ /videoId=(\d+)/
      video_id = $1.to_i

      title = ((result/"a")[1]/"text()").to_s
      # Use the air date shown in the result, not the date we searched for.
      aired = Date.parse((result/"a[@onclick][1]/text()").to_s)
      descr = result/"div.video_description"
      description = (descr/"div[1]/text()").to_s
      tags = (descr/".tags/a/text()").map {|x| unescape(x.to_s) }

      puts Video.new(site, video_id, aired, unescape(title), unescape(description), tags)
    end
    results.length
  end

  # Replaces HTML entities in +t+ with the characters they stand for.
  #
  # Numeric entities are decoded as Unicode code points (UTF-8). Unknown
  # entities are reported on stderr and replaced with '?'.
  def unescape(t)
    t.gsub(ENTITY) do
      # Capture now: the regex `when` clauses below overwrite (or clear) $1.
      entity = $1
      case entity
      when /\A#(\d+)\z/
        codepoint_to_s($1.to_i)
      when /\A#x([0-9a-fA-F]+)\z/
        codepoint_to_s($1.hex)
      else
        NAMED_ENTITIES.fetch(entity) do
          $stderr.puts "Unknown entity #{entity.inspect}"
          '?'
        end
      end
    end
  end

  private

  # Reads the body of +url+. Kernel#open stopped handling URLs in Ruby 3.0,
  # so prefer URI.open where open-uri provides it and fall back otherwise.
  def fetch(url)
    opener = URI.respond_to?(:open) ? URI : Kernel
    opener.open(url) {|f| f.read }
  end

  # Encodes a code point as UTF-8; values outside the Unicode range become '?'.
  def codepoint_to_s(codepoint)
    [codepoint].pack('U')
  rescue RangeError
    '?'
  end
end

if __FILE__ == $0
  Site.load
  tds = Site.sites.find {|s| s.short_name == "tds" }
  tds ||= Site.new("The Daily Show","tds")

  # Walk backwards from today to the start of 1999.
  day = Date.today
  start = Date.new(1999)
  counter = 0
  scraper = Scraper.new
  while day >= start
    results = scraper.scrape_day(tds, day)
    puts "#{results} results for #{day}" if results # nil means the day was skipped

    # Mark days as done once we've scraped them a month after air
    if (Date.today - day) > 30 && !results.nil?
      tds.skip!(day)
    end
    day -= 1

    # Checkpoint the index after every tenth day that produced videos.
    unless results.nil? || results.zero?
      Site.save if (counter += 1) % 10 == 0
    end
  end
  Site.save
end
leecher.rb - downloads/searches/plays videos
./leecher.rb [download/list/play] [searchterm ... ]
Search terms can be
date: 2007 or 2007-10 or 2007-10-01
text (matched against title, description and tags): interview
id: 31723
already downloaded?: downloaded or !downloaded
#!/usr/bin/ruby
# Command and leading arguments used by the 'play' action; the video's
# filename is appended as the final argument.
MEDIA_PLAYER = %w{mplayer -fs}

require 'rubygems'
require 'open-uri'
require 'rexml/document'
require 'rexml/xpath'
require 'fileutils'
require 'yaml'
require 'rio'
require 'set'

# A show whose videos are tracked. The class itself holds the registry of all
# sites plus the storage locations, persisted as <base>/index.yml.
class Site
  class << self
    attr_reader :base        # directory holding index.yml and one sub-directory per site
    attr_reader :alternates  # extra directories searched for already-downloaded files
    attr_reader :sites       # every Site created or loaded so far

    # Sets the default registry state unless something is already loaded.
    def init
      return if base || sites
      @base = "./videos"
      @alternates = []
      @sites = []
    end

    # Loads the registry from +stream+, or from <base>/index.yml when no
    # stream is given. A missing index file only produces a warning.
    def load(stream=nil)
      if stream
        # The index holds Site/Video/Set/Date objects, which the safe loader
        # (the default YAML.load since Psych 4) refuses to build.
        # NOTE(review): this deserialises arbitrary objects - only ever point
        # it at an index file written by Site.save.
        stuff = (YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(stream) : YAML.load(stream)) || {}
        @base = stuff['base'] || "./videos"
        @sites = stuff['sites'] || []
        # Older versions of save wrote the string 'alternates' here by
        # mistake; treat anything that is not a list as "no alternates".
        saved_alternates = stuff['alternates']
        @alternates = saved_alternates.is_a?(Array) ? saved_alternates : []
      else
        init
        begin
          File.open(File.join(base,'index.yml')) {|f| load(f) }
        rescue Errno::ENOENT
          $stderr.puts "Warning, no database found, starting a new one"
        end
      end
    end

    # Dumps the registry to +stream+, or to <base>/index.yml when no stream
    # is given. The file is written under a temporary name and then moved
    # into place so an interrupted save cannot truncate the existing index.
    def save(stream=nil)
      if stream
        YAML.dump({'base' => base, 'sites' => sites, 'alternates' => alternates}, stream)
      else
        FileUtils.mkpath(base) # a fresh start has no base directory yet
        temp_path = File.join(base,'index.yml_')
        File.open(temp_path,'w') {|f| save(f) }
        FileUtils.mv(temp_path, File.join(base,'index.yml'))
      end
    end

    # Yields every registered site.
    def each(&block)
      sites.each(&block)
    end
  end

  # Creates a site and adds it to the registry.
  #
  # name       - display name.
  # short_name - used as the site's directory name; defaults to +name+.
  def initialize(name, short_name=name)
    Site.init
    @videos = {}
    @name, @short_name = name, short_name
    @skip = Set.new
    Site.sites << self
  end

  attr_reader :videos      # Hash of video id => Video
  attr_reader :name
  attr_reader :short_name
  attr_reader :skip        # Set of dates recorded through skip!

  # Directory new downloads for this site are written to.
  def directory
    File.join(Site.base, short_name)
  end

  # This site's sub-directory under each alternate location.
  def directory_alternates
    Site.alternates.map {|d| File.join(d, short_name) }
  end

  def ensure_dir_exists!
    FileUtils.mkpath(directory)
  end

  # Registers +vid+, replacing any video with the same id.
  def <<(vid)
    videos[vid.id] = vid
  end

  # Records +date+ in the skip set.
  def skip!(date)
    skip << date
  end

  # Looks a video up by id; nil when unknown.
  def [](id)
    videos[id]
  end

  def to_s
    name
  end

  # Yields every video of this site.
  def each
    videos.each {|_id, video| yield video }
  end
end

# One video of a Site. Creating a Video registers it with its site.
class Video
  attr_reader :tags
  attr_reader :site
  attr_reader :id
  attr_reader :date
  attr_reader :title
  attr_reader :description

  def initialize(site, id, date=nil, title = nil, description=nil, tags=[])
    @site = site
    @id = id
    @title = title
    @tags = tags
    @date = date
    @description = description
    site << self
  end

  # Path of the video file: the first existing copy found in the alternate
  # directories, otherwise the path inside the site's own directory.
  def filename
    basename = "#{id}.flv"
    existing = site.directory_alternates.map {|dir| File.join(dir, basename) }.find {|path| File.exist?(path) }
    existing || File.join(site.directory, basename)
  end

  def downloaded?
    File.exist?(filename)
  end

  # Downloads the video unless a copy already exists.
  def download
    download! unless downloaded?
  end

  SHARED_DATA = "http://www.comedycentral.com/sitewide/video_player/shared/data"

  # Downloads the video unconditionally and returns the size of the written
  # file. A partially written file is removed before the error is re-raised.
  def download!
    site.ensure_dir_exists!
    # Resolved outside the begin block: a failed manifest lookup must not
    # delete a file that is already on disk.
    url = download_url
    target = filename
    begin
      rio(url) > rio(target)
      File.size(target)
    rescue Exception
      # Exception rather than StandardError on purpose, so that an interrupt
      # (Ctrl-C) also cleans up the partial file. It is always re-raised.
      FileUtils.rm_f(target)
      raise
    end
  end

  def to_s
    sprintf("[%1s %7d - %s - %s - %20s]",(downloaded?? 'D' : ' '), id, date, site, title)
  end

  # Fetches the video's XML manifest and returns the text of the first
  # /package/video/item/src node (an empty string when there is none).
  def download_url
    manifest_url = "#{SHARED_DATA}/flv_xml_gen.jhtml?ml_video=#{id}&hiLoPref=hi"
    # Kernel#open stopped handling URLs in Ruby 3.0; use URI.open when present.
    opener = URI.respond_to?(:open) ? URI : Kernel
    manifest = opener.open(manifest_url) {|f| f.read }
    doc = REXML::Document.new(manifest)
    REXML::XPath.first(doc, "/package/video/item/src/text()").to_s
  end
end

# A chainable predicate over videos. Each by_* call returns a new Filter that
# also has to satisfy its parent, so filters can be combined:
#   Filter.by_date(2007).by_downloaded(false)
class Filter
  class << self
    # Lets Filter.by_xxx(...) start a chain from a fresh, match-all filter.
    def method_missing(sym,*args,&block)
      if sym.to_s =~ /^by/
        new.send(sym,*args,&block)
      else
        super
      end
    end

    # Keeps respond_to? in line with method_missing above.
    def respond_to_missing?(sym, include_private=false)
      sym.to_s =~ /^by/ ? true : super
    end
  end

  # parent - filter that must also match (nil for none).
  # block  - the test for this link of the chain (none means match all).
  def initialize(parent=nil,&block)
    @parent = parent
    @test = block
  end

  # Applies the filter.
  #
  # Given a Video, returns the video when it matches and nil otherwise.
  # Given a Site, yields each matching video when a block is passed, or
  # returns the matching videos as an Array without one.
  # Raises ArgumentError for anything else.
  def [](video)
    case video
    when Video
      video if (@test.nil? || @test[video]) && (@parent.nil? || @parent[video])
    when Site
      if block_given?
        video.each {|v| yield v if self[v] }
      else
        # Site defines each but is not Enumerable, so select on its videos.
        video.videos.values.select {|v| self[v] }
      end
    else
      raise ArgumentError
    end
  end

  # Yields every matching video across all registered sites.
  def each(&block)
    Site.each {|show| self.send(:[], show, &block) }
  end

  # Returns a new filter that requires both this filter and +block+.
  def filter(&block)
    Filter.new(self,&block)
  end

  def by_downloaded(dl=true)
    wanted = !!dl
    filter {|v| v.downloaded? == wanted }
  end

  # Matches on year, year and month, or a full date. Videos without a date
  # never match.
  def by_date(y, m=nil, d=nil)
    if m.nil? && d.nil?
      filter {|v| v.date && v.date.year == y }
    elsif d.nil?
      filter {|v| v.date && v.date.year == y && v.date.month == m }
    else
      filter {|v| v.date && v.date.year == y && v.date.month == m && v.date.day == d }
    end
  end

  def by_id(vid)
    filter {|v| v.id == vid }
  end

  def by_tag(tag)
    filter {|v| v.tags.include? tag }
  end

  # Case-insensitive substring match on title or description, or an exact
  # match on one of the tags.
  def by_text(text)
    needle = text.downcase
    filter {|v|
      (v.title && v.title.downcase.include?(needle)) ||
        (v.description && v.description.downcase.include?(needle)) ||
        v.tags.include?(text)
    }
  end

  # Turns one command-line search term into the matching by_* filter.
  def by(arg)
    arg = arg.to_s
    case arg
    when 'downloaded'
      by_downloaded(true)
    when '!downloaded'
      by_downloaded(false)
    when /^(\d{4})$/
      by_date($1.to_i)
    when /^(\d{4})-(\d{1,2})$/
      by_date($1.to_i, $2.to_i)
    when /^(\d{4})-(\d{1,2})-(\d{1,2})$/
      by_date($1.to_i, $2.to_i, $3.to_i)
    when /^\d{5,}$/
      # Five or more digits is taken to be a video id rather than a year.
      by_id(arg.to_i)
    else
      by_text(arg)
    end
  end
end

if __FILE__ == $0
  command = ARGV.shift or raise "Usage: leecher <download/list/play> [filters...]"

  action = case command.downcase
  when 'download'
    proc {|v|
      puts "Fetching #{v}" unless v.downloaded?
      begin
        v.download
      rescue OpenURI::HTTPError, Errno::ENOENT => x
        # One bad video should not abort the whole batch.
        $stderr.puts "Download failed, skipping: #{x}"
      end
    }
  when 'list'
    proc {|v| puts "#{v} #{v.tags.join(', ')}" }
  when 'play'
    proc {|v|
      puts v
      system(*MEDIA_PLAYER, v.filename)
    }
  else
    raise "Unknown command #{command}"
  end

  # Every remaining argument narrows the selection further.
  filter = ARGV.inject(Filter.new) {|f, arg| f.by(arg)}
  Site.load
  filter.each(&action)
end