Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags/keywords, and share them with the world.

« Newer Snippets
Older Snippets »
Showing 1-3 of 3 total  RSS 

Converting XHTML to XML

Based on the code from 'Convert from HTML to XML with HTML Tidy', this code will read an xhtml file and extract text to gallery.xml as instructed by xhtml2xml.xml

#!/usr/bin/ruby
  
  require 'tidy'
  require 'projxslt'
  
  # Base directory that contains the project folder. Xhtml2Xml#convert builds
  # paths by plain string concatenation (FILE_PATH + project + '/'), so the
  # trailing slash here is required.
  FILE_PATH = "../"
  
  # Converts a project's HTML gallery page into gallery.xml.
  #
  # All files live in FILE_PATH + 'xhtml2xml/'. The steps are:
  # 1. gallery.html is cleaned by HTML Tidy (configured by tidy.txt) and
  #    saved as gallery_xhtml.xml.
  # 2. xhtml2xml.xml is transformed with xhtml2xml.xsl into gallery.xsl.
  # 3. gallery_xhtml.xml is transformed with gallery.xsl into gallery.xml.
  class Xhtml2Xml
    # Matches "<!" followed by the next two ">"-terminated runs. For Tidy
    # output that starts with a DOCTYPE this covers the declaration plus the
    # tag that follows it (the <html ...> start tag).
    DOCTYPE_AND_ROOT_TAG = /<!([^>]*>){2}/

    # Runs the whole conversion. Prints Tidy's show_warnings option, errors
    # and diagnostics to stdout. Returns nil.
    def convert()
      project = 'xhtml2xml'
      project_path = FILE_PATH + project + '/'

      tidy_config_path  = project_path + 'tidy.txt'
      filein_path       = project_path + 'xhtml2xml.xml'
      filehtml_path     = project_path + 'gallery.html'
      filexml_path      = project_path + 'gallery_xhtml.xml'
      xslfile_temp_path = project_path + 'gallery.xsl'
      xslfile_path      = project_path + 'xhtml2xml.xsl'
      fileout_path      = project_path + 'gallery.xml'

      Tidy.path = '/usr/lib/libtidy.so'

      # File.read closes the handle; the previous File.new was never closed.
      buffer = File.read(filehtml_path)
      xml = Tidy.open(:show_warnings=>true) do |tidy|
        tidy.options.output_xml = true
        tidy.load_config(tidy_config_path)
        puts tidy.options.show_warnings
        cleaned = tidy.clean(buffer)
        puts tidy.errors
        puts tidy.diagnostics
        cleaned
      end

      # Strip out the html document type declaration and save the file.
      save_file(filexml_path, strip_doctype(xml))
      transform(filein_path, xslfile_path, xslfile_temp_path)
      transform(filexml_path, xslfile_temp_path, fileout_path)
    end

    # Applies the stylesheet at xsl_filepath to the document at xml_filepath
    # through Projxslt and writes the result to save_filepath.
    def transform(xml_filepath, xsl_filepath, save_filepath)
      pxsl = Projxslt.new(xml_filepath, xsl_filepath)
      save_file(save_filepath, pxsl.transform)
    end

    # Writes buffer to filepath, replacing any existing content. Uses puts,
    # so a trailing newline is added when buffer does not end with one.
    # The block form closes the file even if the write raises.
    def save_file(filepath, buffer)
      File.open(filepath, 'w') { |file| file.puts buffer }
    end

    private

    # Replaces the doctype declaration and the tag after it with a bare
    # '<html>'. Returns xml unchanged when no declaration is found; the
    # earlier inline code passed nil to gsub in that case and raised.
    def strip_doctype(xml)
      declaration = xml[DOCTYPE_AND_ROOT_TAG]
      return xml unless declaration

      xml.gsub(declaration, '<html>')
    end
  end
  
  # Run the conversion only when this file is executed directly.
  Xhtml2Xml.new.convert if __FILE__ == $0

file: xhtml2xml.xml
<!--
  Scraping specification consumed by xhtml2xml.xsl, which turns it into gallery.xsl.
    root/@element   : name of the document element written to the output.
    summary/field   : one output element named @element; its @xpath is evaluated
                      with the html element as context.
    record/@xpath   : selects the repeating nodes; one element named
                      record/@element is written per selected node.
    record/field    : one child element named @element per field; its @xpath is
                      evaluated with the selected record node as context.
-->
<root element="gallery">
  <summary>
    <field element="title" xpath="head/title"/>
  </summary>
  <record xpath="body/center/table/tr/td" element="photo">
    <field xpath="font/br[3]/preceding-sibling::text()[1]" element="title"></field>
    <!-- NOTE(review): unlike its siblings, the next XPath is absolute and does not pass through body/center; confirm this is intentional. -->
    <field xpath="/html/body/table/tr/td[2]/font/br[3]/preceding-sibling::text()[1]" element="date"></field>
    <field xpath="font/br[1]/preceding-sibling::text()[1]" element="image"></field>
    <field xpath="font/br[2]/preceding-sibling::text()[1]" element="description"></field>
  </record>
</root>

file:xhtml2xml.xsl (transforms the file xhtml2xml.xml to file gallery.xsl)
<?xml version="1.0"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
  <!--
    Meta-template: matches the <root> element of the scraping specification and
    writes a new stylesheet. The generated stylesheet contains:
      * xsl:output with method="xml" and indent="yes";
      * one xsl:template with match="html" that creates an element named after
        root/@element, holding the summary fields followed by an xsl:for-each
        over record/@xpath;
      * inside that xsl:for-each, one xsl:variable per record/field
        (name = @element, select = @xpath) and one element named record/@element
        whose children output normalize-space($variable) for each field.
    The xsl:text instructions that contain only line breaks exist to lay out the
    generated stylesheet; they are part of the output and must not be reformatted.
  -->
  <xsl:template match="root">
    <!-- Holds ":" so that names containing a colon can be assembled through attribute value templates. -->
    <xsl:variable name="colon"><xsl:text>:</xsl:text></xsl:variable>
    
    <xsl:element name="xsl:stylesheet">
      <!-- NOTE(review): this looks like a workaround to emit an xmlns:xsl declaration on the generated element; XSLT 1.0 offers xsl:namespace-alias for generating stylesheets. Confirm the target processor accepts an attribute with this name. -->
      <xsl:attribute name="xmlns{$colon}xsl">
        <xsl:text>http://www.w3.org/1999/XSL/Transform</xsl:text>
      </xsl:attribute>
      <xsl:attribute name="version">
        <xsl:text>1.0</xsl:text>
      </xsl:attribute><xsl:text>
      </xsl:text>

<!-- Generated: xsl:output method="xml" indent="yes" -->
<xsl:element name="xsl:output">
  <xsl:attribute name="method">
    <xsl:text>xml</xsl:text>
  </xsl:attribute>
  <xsl:attribute name="indent">
    <xsl:text>yes</xsl:text>
  </xsl:attribute>
</xsl:element><xsl:text>

</xsl:text>

<!-- Generated: xsl:template match="html", the single template of the generated stylesheet. -->
<xsl:element name="xsl:template">
      <xsl:attribute name="match">
        <xsl:text>html</xsl:text>
      </xsl:attribute><xsl:text>
</xsl:text>
      <!-- Output document element, named by root/@element; summary fields are handled by the summary/field template. -->
      <xsl:element name="{@element}">
      <xsl:apply-templates select="summary"/>

      <!-- Generated: xsl:for-each over record/@xpath. xsl:value-of takes the string value of the first selected node only, so only the first record's @xpath is used. -->
      <xsl:element name="xsl{$colon}for-each">
        <xsl:attribute name="select">
          <xsl:value-of select="record/@xpath"/>
        </xsl:attribute><xsl:text>
    </xsl:text>              

  <!-- Generated: one xsl:variable per record field, name = @element, select = @xpath. -->
  <xsl:for-each select="record/field">
    <xsl:element name="xsl:variable">
      <xsl:attribute name="name">
        <xsl:value-of select="@element"/>
      </xsl:attribute>
      <xsl:attribute name="select">
        <xsl:value-of select="@xpath"/>
      </xsl:attribute>
    </xsl:element><xsl:text>
    </xsl:text>
  </xsl:for-each>
<xsl:text>
    </xsl:text>

        <!-- Per-record output element, named by record/@element; each field becomes a child element whose content is normalize-space() of the variable declared above. -->
        <xsl:element name="{record/@element}">
       <xsl:for-each select="record/field">
              <xsl:element name="{@element}"><xsl:text>
        </xsl:text>
            <xsl:element name="xsl:value-of">
              <xsl:attribute name="select"><xsl:text>normalize-space($</xsl:text>
                <xsl:value-of select="@element"/>
                <xsl:text>)</xsl:text>                
              </xsl:attribute>
          </xsl:element>  <xsl:text>
      </xsl:text>
          </xsl:element>

    </xsl:for-each>
</xsl:element><xsl:text>
  </xsl:text>
</xsl:element><xsl:text>
</xsl:text>
 
  </xsl:element> <!-- output document element named by root/@element -->
</xsl:element> <!-- template match -->
</xsl:element> <!-- xsl:stylesheet -->
  </xsl:template> 


<!--
  For each field under summary, writes into the generated stylesheet an
  xsl:element whose name attribute is the field's @element and whose content is
  an xsl:value-of selecting the field's @xpath. The xsl:text instructions only
  add line breaks to the generated stylesheet.
-->
<xsl:template match="summary/field"><xsl:text>
</xsl:text>
      <xsl:element name="xsl:element">
        <xsl:attribute name="name">
          <xsl:value-of select="@element"/>
        </xsl:attribute><xsl:text>
</xsl:text>
        <xsl:element name="xsl:value-of">
          <xsl:attribute name="select">
            <xsl:value-of select="@xpath"/>
          </xsl:attribute><xsl:text>
</xsl:text>
        </xsl:element><xsl:text>
</xsl:text>
      </xsl:element><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>

output: gallery.xml (this file is the product of gallery_xhtml.xml and gallery.xsl)
<?xml version="1.0"?>
<gallery>
  <title>Journey to Windsor</title>
  <photo>
    <title>Windsor Castle</title>
    <date>July 2003</date>
    <image>dscn0824.jpg</image>
    <description>
      A bright, red mailbox inside the castle. It seems oddly familiar in an historic setting.
    </description>
  </photo>
</gallery>

Scrape an XHTML document using Ruby

A simple Ruby script to scrape an XHTML file with the selected content being saved to an xml file ready for transformation into an RSS feed. This example uses the XHTML file from http://newsgang.net/audio/ which is then saved locally as 'thegang.xml'.

#!/usr/bin/ruby
# file: thegang.rb

require 'rexml/document'
include REXML

# Scrapes the locally saved XHTML page 'thegang.xml' and writes the selected
# content to 'thegang_rss.xml' as <item> elements under an <rss> root.
#
# REXML names are fully qualified so the class does not rely on a top-level
# `include REXML`.
class TheGang
  # Reads 'thegang.xml' from the current directory, builds one <item> for
  # every link matched by body/div/ul/li/h2/a and writes the resulting
  # document to 'thegang_rss.xml'. Returns nil.
  def rssify()
    # The block form closes the input file; the document is built inside
    # the block, before the handle is released.
    doc = File.open('thegang.xml', 'r') { |file| REXML::Document.new(file) }

    rss_doc = REXML::Document.new
    rss_doc.add_element(REXML::Element.new('rss'))

    doc.root.elements.each("body/div/ul/li/h2/a") do |node|
      rss_doc.root.add_element(build_item(node))
    end

    # The block form closes the output file even if the write raises.
    File.open('thegang_rss.xml', 'w') { |file| file.puts rss_doc }
  end

  private

  # Builds the <item> element for one <a> node. The enclosing <li> (two
  # levels up) supplies the date from its first two p/span elements.
  def build_item(node)
    list_item = node.parent.parent

    # The character class removes newlines, commas, apostrophes and spaces.
    title = text_element('title', node.text.gsub(/[\n,' ']/,''))

    # NOTE(review): whether these two gsub patterns match depends on how the
    # installed REXML escapes '&' in attribute and text values; verify the
    # href output against real input.
    href_audio = text_element('href_audio', node.attributes.get_attribute('href').to_s.gsub('amp;&',''))
    href = text_element('href', href_audio.text.gsub('&amp;from=audio',''))

    date = text_element('date', "#{list_item.elements["p/span[1]"].text} #{list_item.elements["p/span[2]"].text}")

    item = REXML::Element.new('item')
    [title, href_audio, href, date].each { |child| item.add_element(child) }
    item
  end

  # Creates a detached element called name whose text is content.
  def text_element(name, content)
    element = REXML::Element.new(name)
    element.text = content
    element
  end
end


# Entry point: build the feed only when this script is run directly.
TheGang.new.rssify if __FILE__ == $0


see also: www.dapper.net

output (extract)
<rss>
  <item><title>TheGangXII-II</title><href_audio>/gangitem/id=6501&amp;from=audio</href_audio><href>/gangitem/id=6501</href><date>Jan 25</date></item>
  <item><title>TheGangXII-I</title><href_audio>/gangitem/id=6499&amp;from=audio</href_audio><href>/gangitem/id=6499</href><date>Jan 25</date></item>
  <item><title>NewsGangLive01.24.08</title><href_audio>/gangitem/id=6445&amp;from=audio</href_audio><href>/gangitem/id=6445</href><date>Jan 24</date></item>
  <item><title>NewsGangLiveII</title><href_audio>/gangitem/id=6377&amp;from=audio</href_audio><href>/gangitem/id=6377</href><date>Jan 23</date></item>
  ...
</rss>

Screen-scrape the heise.de newsticker (German)

#!/usr/bin/env python
# -*- encoding: latin1 -*-

import BeautifulSoup
from PyRSS2Gen import RSSItem, Guid
import ScrapeNFeed
import urllib2
import re

# Set to a true value to make HeiFeed.HTML2RSS print the title and link of
# each newly seen item.
debug = 0

def fetch(url):
    """Download url and return a (body, headers) tuple.

    body is the result of response.read() and headers is the object returned
    by response.info(). The response is closed before returning, including
    when reading raises.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read(), response.info()
    finally:
        response.close()

class HeiFeed(ScrapeNFeed.ScrapedFeed):
    """Feed that turns links on the heise.de newsticker page into RSS items."""

    def HTML2RSS(self, headers, body):
        """Collect an RSS item for every unseen 'meldung...' link in body.

        For each link whose href starts with 'meldung' and that
        self.hasSeen() does not report as seen, the linked page is fetched
        and the prettified first div with class 'meldung_wrapper' becomes
        the item description. All new items are handed to
        self.addRSSItems() in a single call after the loop.
        """
        items = []
        soup = BeautifulSoup.BeautifulSoup(body)
        for anchor in soup('a', {'href' : re.compile('^meldung.*')}):
            link = 'http://www.heise.de/newsticker/' + anchor['href']
            if self.hasSeen(link):
                continue
            title = anchor.contents[0].strip()
            if debug:
                # Single-argument print(...) behaves the same as the print
                # statement under Python 2.
                print("title: " + title)
                print("link : " + link)
            # The article's own headers are not needed; a separate name keeps
            # the `headers` parameter from being overwritten.
            article_html, _article_headers = fetch(link)
            article = BeautifulSoup.BeautifulSoup(article_html)
            wrappers = article.fetch('div',{'class':'meldung_wrapper'})
            if not wrappers:
                # No article body found: skip this link instead of aborting
                # the whole feed with an IndexError.
                if debug:
                    print("skip : " + link)
                continue
            items.append(RSSItem(title=title, description=wrappers[0].prettify(), link=link))
        # Called once, after the loop. Inside the loop the growing list was
        # re-submitted on every iteration.
        self.addRSSItems(items)

# Build or refresh the feed when the script runs.
# NOTE(review): the positional arguments are presumably feed title, page URL,
# feed description, RSS output file and pickle state file - verify against the
# ScrapeNFeed documentation.
HeiFeed.load("heise.de newsticker", 'http://www.heise.de/newsticker/',
             "heise.de newsticker", 'heise_rss.xml', 'heise_rss.pickle',
             managingEditor = 'tsch')
« Newer Snippets
Older Snippets »
Showing 1-3 of 3 total  RSS