<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DZone Snippets: scrape code</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Sat, 17 May 2008 16:20:17 GMT</pubDate>
    <description>DZone Snippets: scrape code</description>
    <item>
      <title>Converting XHTML to XML</title>
      <link>http://snippets.dzone.com/posts/show/5127</link>
      <description>Based on the code from &lt;a href="http://www.ibm.com/developerworks/library/x-tiptidy.html"&gt;'Convert from HTML to XML with HTML Tidy'&lt;/a&gt;, this code will read an xhtml file and extract text to gallery.xml as instructed by xhtml2xml.xml&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;#!/usr/bin/ruby&lt;br /&gt;  &lt;br /&gt;  require 'tidy'&lt;br /&gt;  require 'projxslt'&lt;br /&gt;  &lt;br /&gt;  FILE_PATH = "../"&lt;br /&gt;  &lt;br /&gt;  class Xhtml2Xml&lt;br /&gt;    def convert()&lt;br /&gt;      project = 'xhtml2xml'&lt;br /&gt;      filein = 'xhtml2xml.xml'&lt;br /&gt;      filehtml = 'gallery.html'&lt;br /&gt;      filexml = 'gallery_xhtml.xml'&lt;br /&gt;      xslfile_temp = 'gallery.xsl'&lt;br /&gt;      xslfile = 'xhtml2xml.xsl'&lt;br /&gt;      fileout = 'gallery.xml'&lt;br /&gt;      tidy_config = 'tidy.txt'&lt;br /&gt;      &lt;br /&gt;      project_path = FILE_PATH + project + '/'&lt;br /&gt;      tidy_config_path = project_path + tidy_config&lt;br /&gt;      filein_path = project_path + filein&lt;br /&gt;      filehtml_path = project_path + filehtml&lt;br /&gt;      filexml_path = project_path + filexml&lt;br /&gt;      xslfile_temp_path = project_path + xslfile_temp&lt;br /&gt;      xslfile_path = project_path + xslfile&lt;br /&gt;      fileout_path = project_path + fileout&lt;br /&gt;      &lt;br /&gt;      Tidy.path = '/usr/lib/libtidy.so'&lt;br /&gt;&lt;br /&gt;      file = File.new(filehtml_path,'r')&lt;br /&gt;      buffer = file.read&lt;br /&gt;      xml = Tidy.open(:show_warnings=&gt;true) do |tidy|&lt;br /&gt;        tidy.options.output_xml = true&lt;br /&gt;        tidy.load_config(tidy_config_path)&lt;br /&gt;        puts tidy.options.show_warnings&lt;br /&gt;        xml = tidy.clean(buffer)&lt;br /&gt;        puts tidy.errors&lt;br /&gt;        puts tidy.diagnostics&lt;br /&gt;        xml&lt;br /&gt;      end&lt;br /&gt;      &lt;br /&gt;      #strip out the html 
document type declaration and save the file&lt;br /&gt;      html_declaration = xml[/&lt;!([^&gt;]*&gt;){2}/]&lt;br /&gt;      save_file(filexml_path, xml.gsub(html_declaration,'&lt;html&gt;'))    &lt;br /&gt;      transform(filein_path, xslfile_path, xslfile_temp_path)&lt;br /&gt;      transform(filexml_path, xslfile_temp_path, fileout_path)&lt;br /&gt;      &lt;br /&gt;    end&lt;br /&gt;    &lt;br /&gt;    def transform(xml_filepath, xsl_filepath, save_filepath)&lt;br /&gt;      pxsl = Projxslt.new(xml_filepath, xsl_filepath)&lt;br /&gt;      outfile = pxsl.transform&lt;br /&gt;      save_file(save_filepath, outfile)&lt;br /&gt;    end&lt;br /&gt;    &lt;br /&gt;    def save_file(filepath, buffer)&lt;br /&gt;      file = File.new(filepath,'w') &lt;br /&gt;      file.puts buffer&lt;br /&gt;      file.close&lt;br /&gt;    end    &lt;br /&gt;  end&lt;br /&gt;  &lt;br /&gt;  if __FILE__ == $0&lt;br /&gt;    h2x = Xhtml2Xml.new()&lt;br /&gt;    h2x.convert()&lt;br /&gt;  end&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;file: xhtml2xml.xml&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;root element="gallery"&gt;&lt;br /&gt;  &lt;summary&gt;&lt;br /&gt;    &lt;field element="title" xpath="head/title"/&gt;&lt;br /&gt;  &lt;/summary&gt;&lt;br /&gt;  &lt;record xpath="body/center/table/tr/td" element="photo"&gt;&lt;br /&gt;    &lt;field xpath="font/br[3]/preceding-sibling::text()[1]" element="title"&gt;&lt;/field&gt;&lt;br /&gt;    &lt;field xpath="/html/body/table/tr/td[2]/font/br[3]/preceding-sibling::text()[1]" element="date"&gt;&lt;/field&gt;&lt;br /&gt;    &lt;field xpath="font/br[1]/preceding-sibling::text()[1]" element="image"&gt;&lt;/field&gt;&lt;br /&gt;    &lt;field xpath="font/br[2]/preceding-sibling::text()[1]" element="description"&gt;&lt;/field&gt;&lt;br /&gt;  &lt;/record&gt;&lt;br /&gt;&lt;/root&gt;&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;file:xhtml2xml.xsl (transforms the file xhtml2xml.xml to file gallery.xsl)&lt;br /&gt;&lt;code&gt;
&lt;br /&gt;&lt;?xml version="1.0"?&gt;&lt;br /&gt;&lt;xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"&gt;&lt;br /&gt;  &lt;xsl:template match="root"&gt;&lt;br /&gt;    &lt;xsl:variable name="colon"&gt;&lt;xsl:text&gt;:&lt;/xsl:text&gt;&lt;/xsl:variable&gt;&lt;br /&gt;    &lt;br /&gt;    &lt;xsl:element name="xsl:stylesheet"&gt;&lt;br /&gt;      &lt;xsl:attribute name="xmlns{$colon}xsl"&gt;&lt;br /&gt;        &lt;xsl:text&gt;http://www.w3.org/1999/XSL/Transform&lt;/xsl:text&gt;&lt;br /&gt;      &lt;/xsl:attribute&gt;&lt;br /&gt;      &lt;xsl:attribute name="version"&gt;&lt;br /&gt;        &lt;xsl:text&gt;1.0&lt;/xsl:text&gt;&lt;br /&gt;      &lt;/xsl:attribute&gt;&lt;xsl:text&gt;&lt;br /&gt;      &lt;/xsl:text&gt;&lt;br /&gt;&lt;br /&gt;&lt;xsl:element name="xsl:output"&gt;&lt;br /&gt;  &lt;xsl:attribute name="method"&gt;&lt;br /&gt;    &lt;xsl:text&gt;xml&lt;/xsl:text&gt;&lt;br /&gt;  &lt;/xsl:attribute&gt;&lt;br /&gt;  &lt;xsl:attribute name="indent"&gt;&lt;br /&gt;    &lt;xsl:text&gt;yes&lt;/xsl:text&gt;&lt;br /&gt;  &lt;/xsl:attribute&gt;&lt;br /&gt;&lt;/xsl:element&gt;&lt;xsl:text&gt;&lt;br /&gt;&lt;br /&gt;&lt;/xsl:text&gt;&lt;br /&gt;&lt;br /&gt;&lt;xsl:element name="xsl:template"&gt;&lt;br /&gt;      &lt;xsl:attribute name="match"&gt;&lt;br /&gt;        &lt;xsl:text&gt;html&lt;/xsl:text&gt;&lt;br /&gt;      &lt;/xsl:attribute&gt;&lt;xsl:text&gt;&lt;br /&gt;&lt;/xsl:text&gt;&lt;br /&gt;      &lt;xsl:element name="{@element}"&gt;&lt;br /&gt;      &lt;xsl:apply-templates select="summary"/&gt;&lt;br /&gt;&lt;br /&gt;      &lt;xsl:element name="xsl{$colon}for-each"&gt;&lt;br /&gt;        &lt;xsl:attribute name="select"&gt;&lt;br /&gt;          &lt;xsl:value-of select="record/@xpath"/&gt;&lt;br /&gt;        &lt;/xsl:attribute&gt;&lt;xsl:text&gt;&lt;br /&gt;    &lt;/xsl:text&gt;              &lt;br /&gt;&lt;br /&gt;  &lt;xsl:for-each select="record/field"&gt;&lt;br /&gt;    &lt;xsl:element 
name="xsl:variable"&gt;&lt;br /&gt;      &lt;xsl:attribute name="name"&gt;&lt;br /&gt;        &lt;xsl:value-of select="@element"/&gt;&lt;br /&gt;      &lt;/xsl:attribute&gt;&lt;br /&gt;      &lt;xsl:attribute name="select"&gt;&lt;br /&gt;        &lt;xsl:value-of select="@xpath"/&gt;&lt;br /&gt;      &lt;/xsl:attribute&gt;&lt;br /&gt;    &lt;/xsl:element&gt;&lt;xsl:text&gt;&lt;br /&gt;    &lt;/xsl:text&gt;&lt;br /&gt;  &lt;/xsl:for-each&gt;&lt;br /&gt;&lt;xsl:text&gt;&lt;br /&gt;    &lt;/xsl:text&gt;&lt;br /&gt;&lt;br /&gt;        &lt;xsl:element name="{record/@element}"&gt;&lt;br /&gt;       &lt;xsl:for-each select="record/field"&gt;&lt;br /&gt;              &lt;xsl:element name="{@element}"&gt;&lt;xsl:text&gt;&lt;br /&gt;        &lt;/xsl:text&gt;&lt;br /&gt;            &lt;xsl:element name="xsl:value-of"&gt;&lt;br /&gt;              &lt;xsl:attribute name="select"&gt;&lt;xsl:text&gt;normalize-space($&lt;/xsl:text&gt;&lt;br /&gt;                &lt;xsl:value-of select="@element"/&gt;&lt;br /&gt;                &lt;xsl:text&gt;)&lt;/xsl:text&gt;                &lt;br /&gt;              &lt;/xsl:attribute&gt;&lt;br /&gt;          &lt;/xsl:element&gt;  &lt;xsl:text&gt;&lt;br /&gt;      &lt;/xsl:text&gt;&lt;br /&gt;          &lt;/xsl:element&gt;&lt;br /&gt;&lt;br /&gt;    &lt;/xsl:for-each&gt;&lt;br /&gt;&lt;/xsl:element&gt;&lt;xsl:text&gt;&lt;br /&gt;  &lt;/xsl:text&gt;&lt;br /&gt;&lt;/xsl:element&gt;&lt;xsl:text&gt;&lt;br /&gt;&lt;/xsl:text&gt;&lt;br /&gt; &lt;br /&gt;  &lt;/xsl:element&gt;&lt;br /&gt;&lt;/xsl:element&gt; &lt;!-- template match --&gt;&lt;br /&gt;&lt;/xsl:element&gt; &lt;!-- gallery --&gt;&lt;br /&gt;  &lt;/xsl:template&gt; &lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;xsl:template match="summary/field"&gt;&lt;xsl:text&gt;&lt;br /&gt;&lt;/xsl:text&gt;&lt;br /&gt;      &lt;xsl:element name="xsl:element"&gt;&lt;br /&gt;        &lt;xsl:attribute name="name"&gt;&lt;br /&gt;          &lt;xsl:value-of 
select="@element"/&gt;&lt;br /&gt;        &lt;/xsl:attribute&gt;&lt;xsl:text&gt;&lt;br /&gt;&lt;/xsl:text&gt;&lt;br /&gt;        &lt;xsl:element name="xsl:value-of"&gt;&lt;br /&gt;          &lt;xsl:attribute name="select"&gt;&lt;br /&gt;            &lt;xsl:value-of select="@xpath"/&gt;&lt;br /&gt;          &lt;/xsl:attribute&gt;&lt;xsl:text&gt;&lt;br /&gt;&lt;/xsl:text&gt;&lt;br /&gt;        &lt;/xsl:element&gt;&lt;xsl:text&gt;&lt;br /&gt;&lt;/xsl:text&gt;&lt;br /&gt;      &lt;/xsl:element&gt;&lt;xsl:text&gt;&lt;br /&gt;&lt;/xsl:text&gt;&lt;br /&gt;&lt;/xsl:template&gt;&lt;br /&gt;&lt;/xsl:stylesheet&gt;&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;output: gallery.xml (this file is the product of gallery_xhtml.xml and gallery.xsl)&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;?xml version="1.0"?&gt;&lt;br /&gt;&lt;gallery&gt;&lt;br /&gt;  &lt;title&gt;Journey to Windsor&lt;/title&gt;&lt;br /&gt;  &lt;photo&gt;&lt;br /&gt;    &lt;title&gt;Windsor Castle&lt;/title&gt;&lt;br /&gt;    &lt;date&gt;July 2003&lt;/date&gt;&lt;br /&gt;    &lt;image&gt;dscn0824.jpg&lt;/image&gt;&lt;br /&gt;    &lt;description&gt;&lt;br /&gt;      A bright, red mailbox inside the castle. It seems oddly familiar in an historic setting.&lt;br /&gt;    &lt;/description&gt;&lt;br /&gt;  &lt;/photo&gt;&lt;br /&gt;&lt;/gallery&gt;&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Sun, 10 Feb 2008 15:35:40 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/5127</guid>
      <author>jrobertson (James Robertson)</author>
    </item>
    <item>
      <title>Scrape an XHTML document using Ruby</title>
      <link>http://snippets.dzone.com/posts/show/5039</link>
      <description>A simple Ruby script to scrape an XHTML file with the selected content being saved to an xml file ready for transformation into an RSS feed.  This example uses the XHTML file from http://newsgang.net/audio/ which is then saved locally as 'thegang.xml'.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;#!/usr/bin/ruby&lt;br /&gt;# file: thegang.rb&lt;br /&gt;&lt;br /&gt;require 'rexml/document'&lt;br /&gt;include REXML&lt;br /&gt;&lt;br /&gt;class TheGang&lt;br /&gt;  def initialize()&lt;br /&gt;  end&lt;br /&gt;  &lt;br /&gt;  def rssify()&lt;br /&gt;    file = File.new('thegang.xml','r')&lt;br /&gt;    doc = Document.new(file)&lt;br /&gt;    rss_doc = Document.new&lt;br /&gt;    root = Element.new('rss')&lt;br /&gt;    rss_doc.add_element(root)&lt;br /&gt;    &lt;br /&gt;    doc.root.elements.each("body/div/ul/li/h2/a") do |node|    &lt;br /&gt;      o_rssitem = Element.new('item')&lt;br /&gt;      o_li = node.parent.parent&lt;br /&gt;      &lt;br /&gt;      o_rsstitle = Element.new('title')&lt;br /&gt;      o_rsstitle.text = node.text.gsub(/[\n,' ']/,'')&lt;br /&gt;      o_rssitem.add_element(o_rsstitle)&lt;br /&gt;      &lt;br /&gt;      o_rsshref_audio = Element.new('href_audio')&lt;br /&gt;      o_rsshref_audio.text = node.attributes.get_attribute('href').to_s.gsub('amp;&amp;','')      &lt;br /&gt;      o_rssitem.add_element(o_rsshref_audio)&lt;br /&gt;      &lt;br /&gt;      o_rsshref = Element.new('href')&lt;br /&gt;      o_rsshref.text = o_rsshref_audio.text.gsub('&amp;amp;from=audio','')      &lt;br /&gt;      o_rssitem.add_element(o_rsshref)&lt;br /&gt;      &lt;br /&gt;      o_rssdate = Element.new('date')&lt;br /&gt;      o_rssdate.text = "#{o_li.elements["p/span[1]"].text} #{o_li.elements["p/span[2]"].text}"&lt;br /&gt;      o_rssitem.add_element(o_rssdate)&lt;br /&gt;      rss_doc.root.add_element(o_rssitem)&lt;br /&gt;      &lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;    file = 
File.new('thegang_rss.xml','w')&lt;br /&gt;    file.puts rss_doc&lt;br /&gt;    file.close&lt;br /&gt;  end&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;if __FILE__ == $0&lt;br /&gt;  gang = TheGang.new&lt;br /&gt;  gang.rssify&lt;br /&gt;end&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;see also: www.dapper.net&lt;br /&gt;&lt;br /&gt;output (extract)&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;rss&gt;&lt;br /&gt;  &lt;item&gt;&lt;title&gt;TheGangXII-II&lt;/title&gt;&lt;href_audio&gt;/gangitem/id=6501&amp;amp;from=audio&lt;/href_audio&gt;&lt;href&gt;/gangitem/id=6501&lt;/href&gt;&lt;date&gt;Jan 25&lt;/date&gt;&lt;/item&gt;&lt;br /&gt;  &lt;item&gt;&lt;title&gt;TheGangXII-I&lt;/title&gt;&lt;href_audio&gt;/gangitem/id=6499&amp;amp;from=audio&lt;/href_audio&gt;&lt;href&gt;/gangitem/id=6499&lt;/href&gt;&lt;date&gt;Jan 25&lt;/date&gt;&lt;/item&gt;&lt;br /&gt;  &lt;item&gt;&lt;title&gt;NewsGangLive01.24.08&lt;/title&gt;&lt;href_audio&gt;/gangitem/id=6445&amp;amp;from=audio&lt;/href_audio&gt;&lt;href&gt;/gangitem/id=6445&lt;/href&gt;&lt;date&gt;Jan 24&lt;/date&gt;&lt;/item&gt;&lt;br /&gt;  &lt;item&gt;&lt;title&gt;NewsGangLiveII&lt;/title&gt;&lt;href_audio&gt;/gangitem/id=6377&amp;amp;from=audio&lt;/href_audio&gt;&lt;href&gt;/gangitem/id=6377&lt;/href&gt;&lt;date&gt;Jan 23&lt;/date&gt;&lt;/item&gt;&lt;br /&gt;  ...&lt;br /&gt;&lt;/rss&gt;&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Sun, 27 Jan 2008 14:09:24 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/5039</guid>
      <author>jrobertson (James Robertson)</author>
    </item>
    <item>
      <title>Screen scrape heise.de newsticker (German)</title>
      <link>http://snippets.dzone.com/posts/show/886</link>
      <description>&lt;code&gt;&lt;br /&gt;#!/usr/bin/env python&lt;br /&gt;# -*- encoding: latin1 -*-&lt;br /&gt;&lt;br /&gt;import BeautifulSoup&lt;br /&gt;from PyRSS2Gen import RSSItem, Guid&lt;br /&gt;import ScrapeNFeed&lt;br /&gt;import urllib2&lt;br /&gt;import re&lt;br /&gt;&lt;br /&gt;debug = 0&lt;br /&gt;&lt;br /&gt;def fetch(url):&lt;br /&gt;    response = urllib2.urlopen(urllib2.Request(url))&lt;br /&gt;    return response.read(),response.info()&lt;br /&gt;&lt;br /&gt;class HeiFeed(ScrapeNFeed.ScrapedFeed):    &lt;br /&gt;    def HTML2RSS(self, headers, body):&lt;br /&gt;        items = []&lt;br /&gt;        soup = BeautifulSoup.BeautifulSoup(body)&lt;br /&gt;        for item in soup('a', {'href' : re.compile('^meldung.*')}):&lt;br /&gt;            link = 'http://www.heise.de/newsticker/' + item['href']&lt;br /&gt;            if not self.hasSeen(link):&lt;br /&gt;                title = item.contents[0].strip()&lt;br /&gt;                if debug:&lt;br /&gt;                    print "title: " + title&lt;br /&gt;                    print "link : " + link&lt;br /&gt;                response, headers = fetch(link)&lt;br /&gt;                s = BeautifulSoup.BeautifulSoup(response)&lt;br /&gt;                desc = s.fetch('div',{'class':'meldung_wrapper'})[0].prettify()&lt;br /&gt;                items.append(RSSItem(title=title, description=desc, link=link))&lt;br /&gt;            self.addRSSItems(items)&lt;br /&gt;&lt;br /&gt;HeiFeed.load("heise.de newsticker", 'http://www.heise.de/newsticker/',&lt;br /&gt;             "heise.de newsticker", 'heise_rss.xml', 'heise_rss.pickle',&lt;br /&gt;             managingEditor = 'tsch')&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Sun, 13 Nov 2005 03:04:25 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/886</guid>
      <author>tsch ()</author>
    </item>
  </channel>
</rss>
