Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

Converting XHTML to XML (See related posts)

Based on the code from 'Convert from HTML to XML with HTML Tidy', this code reads an XHTML file (gallery.html), cleans it up with HTML Tidy, and extracts text into gallery.xml as instructed by the mapping file xhtml2xml.xml.

#!/usr/bin/ruby
  
  require 'tidy'
  require 'projxslt'
  
  # Base directory (a relative path) that is prefixed to the project folder
  # name when Xhtml2Xml#convert builds its input and output paths.
  FILE_PATH = "../"
  
  # Converts an HTML page into a custom XML document in three steps:
  # clean the page with Tidy, generate a stylesheet from a mapping file,
  # then apply the generated stylesheet to the cleaned page.
  class Xhtml2Xml
    # Matches from the first "<!" up to and including the second ">" that
    # follows it, i.e. the declaration plus the tag right after it.
    HTML_DECLARATION = /<!([^>]*>){2}/

    # Runs the whole pipeline on the fixed file names below, all located in
    # FILE_PATH + 'xhtml2xml/'.
    #
    # Reads gallery.html and writes, in order:
    # * gallery_xhtml.xml - Tidy output with the leading declaration
    #   replaced by a bare <html> tag
    # * gallery.xsl       - xhtml2xml.xml transformed by xhtml2xml.xsl
    # * gallery.xml       - gallery_xhtml.xml transformed by gallery.xsl
    def convert()
      project = 'xhtml2xml'
      filein = 'xhtml2xml.xml'
      filehtml = 'gallery.html'
      filexml = 'gallery_xhtml.xml'
      xslfile_temp = 'gallery.xsl'
      xslfile = 'xhtml2xml.xsl'
      fileout = 'gallery.xml'
      tidy_config = 'tidy.txt'

      project_path = FILE_PATH + project + '/'
      tidy_config_path = project_path + tidy_config
      filein_path = project_path + filein
      filehtml_path = project_path + filehtml
      filexml_path = project_path + filexml
      xslfile_temp_path = project_path + xslfile_temp
      xslfile_path = project_path + xslfile
      fileout_path = project_path + fileout

      Tidy.path = '/usr/lib/libtidy.so'

      # File.read opens, reads and closes the file in one call, so no
      # handle is left open.
      buffer = File.read(filehtml_path)
      xml = Tidy.open(:show_warnings=>true) do |tidy|
        tidy.options.output_xml = true
        tidy.load_config(tidy_config_path)
        puts tidy.options.show_warnings
        cleaned = tidy.clean(buffer)
        puts tidy.errors
        puts tidy.diagnostics
        cleaned
      end

      # Strip out the html document type declaration and save the file.
      save_file(filexml_path, strip_html_declaration(xml))
      transform(filein_path, xslfile_path, xslfile_temp_path)
      transform(filexml_path, xslfile_temp_path, fileout_path)
    end

    # Applies the stylesheet at xsl_filepath to the document at xml_filepath
    # using Projxslt and writes the result to save_filepath.
    def transform(xml_filepath, xsl_filepath, save_filepath)
      pxsl = Projxslt.new(xml_filepath, xsl_filepath)
      save_file(save_filepath, pxsl.transform)
    end

    # Writes buffer to filepath (truncating any existing file) using puts.
    # The block form closes the file even when the write raises.
    def save_file(filepath, buffer)
      File.open(filepath, 'w') { |file| file.puts buffer }
    end

    private

    # Returns xml with every occurrence of its leading declaration (as
    # matched by HTML_DECLARATION) replaced by '<html>'. When the text has
    # no such declaration it is returned unchanged instead of raising.
    def strip_html_declaration(xml)
      html_declaration = xml[HTML_DECLARATION]
      return xml unless html_declaration

      xml.gsub(html_declaration, '<html>')
    end
  end
  
  # Run the conversion only when this file is executed as a script,
  # not when it is loaded by another file.
  Xhtml2Xml.new.convert if __FILE__ == $0

file: xhtml2xml.xml
<!--
  Mapping consumed by xhtml2xml.xsl to generate the extraction stylesheet.
    root/@element    name of the element that wraps the whole output
    summary/field    one output element per field, filled from @xpath
    record/@xpath    node set iterated with xsl:for-each
    record/@element  name of the element emitted for each iterated node
    record/field     one child element per field; its @xpath is bound to a
                     variable and written through normalize-space()
-->
<root element="gallery">
  <summary>
    <field element="title" xpath="head/title"/>
  </summary>
  <record xpath="body/center/table/tr/td" element="photo">
    <field xpath="font/br[3]/preceding-sibling::text()[1]" element="title"></field>
    <field xpath="/html/body/table/tr/td[2]/font/br[3]/preceding-sibling::text()[1]" element="date"></field>
    <field xpath="font/br[1]/preceding-sibling::text()[1]" element="image"></field>
    <field xpath="font/br[2]/preceding-sibling::text()[1]" element="description"></field>
  </record>
</root>

file: xhtml2xml.xsl (transforms xhtml2xml.xml into gallery.xsl)
<?xml version="1.0"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
  <!--
    Turns the mapping document into a new XSLT stylesheet. The generated
    stylesheet has a single template matching "html" that emits the element
    named by root/@element, the summary fields, and one xsl:for-each over
    record/@xpath.
  -->
  <xsl:template match="root">
    <!-- Holds ":" so prefixed names can be assembled in attribute value templates. -->
    <xsl:variable name="colon"><xsl:text>:</xsl:text></xsl:variable>
    
    <!-- Generated <xsl:stylesheet version="1.0"> wrapper. -->
    <!-- NOTE(review): writing the xmlns:xsl declaration through xsl:attribute is
         not the mechanism XSLT 1.0 provides for this (xsl:namespace-alias is);
         confirm the target processor accepts it. -->
    <xsl:element name="xsl:stylesheet">
      <xsl:attribute name="xmlns{$colon}xsl">
        <xsl:text>http://www.w3.org/1999/XSL/Transform</xsl:text>
      </xsl:attribute>
      <xsl:attribute name="version">
        <xsl:text>1.0</xsl:text>
      </xsl:attribute><xsl:text>
      </xsl:text>

<!-- Generated <xsl:output method="xml" indent="yes"/>. -->
<xsl:element name="xsl:output">
  <xsl:attribute name="method">
    <xsl:text>xml</xsl:text>
  </xsl:attribute>
  <xsl:attribute name="indent">
    <xsl:text>yes</xsl:text>
  </xsl:attribute>
</xsl:element><xsl:text>

</xsl:text>

<!-- Generated <xsl:template match="html">. -->
<xsl:element name="xsl:template">
      <xsl:attribute name="match">
        <xsl:text>html</xsl:text>
      </xsl:attribute><xsl:text>
</xsl:text>
      <!-- Output wrapper element named by root/@element; summary fields go first. -->
      <xsl:element name="{@element}">
      <xsl:apply-templates select="summary"/>

      <!-- Generated xsl:for-each selecting record/@xpath. -->
      <xsl:element name="xsl{$colon}for-each">
        <xsl:attribute name="select">
          <xsl:value-of select="record/@xpath"/>
        </xsl:attribute><xsl:text>
    </xsl:text>              

  <!-- One generated xsl:variable per record field: name = @element, select = @xpath. -->
  <xsl:for-each select="record/field">
    <xsl:element name="xsl:variable">
      <xsl:attribute name="name">
        <xsl:value-of select="@element"/>
      </xsl:attribute>
      <xsl:attribute name="select">
        <xsl:value-of select="@xpath"/>
      </xsl:attribute>
    </xsl:element><xsl:text>
    </xsl:text>
  </xsl:for-each>
<xsl:text>
    </xsl:text>

        <!-- Per-record element named by record/@element; each field becomes a
             child element holding xsl:value-of select="normalize-space($field)". -->
        <xsl:element name="{record/@element}">
       <xsl:for-each select="record/field">
              <xsl:element name="{@element}"><xsl:text>
        </xsl:text>
            <xsl:element name="xsl:value-of">
              <xsl:attribute name="select"><xsl:text>normalize-space($</xsl:text>
                <xsl:value-of select="@element"/>
                <xsl:text>)</xsl:text>                
              </xsl:attribute>
          </xsl:element>  <xsl:text>
      </xsl:text>
          </xsl:element>

    </xsl:for-each>
</xsl:element><xsl:text>
  </xsl:text>
</xsl:element><xsl:text>
</xsl:text>
 
  </xsl:element>
</xsl:element> <!-- template match -->
</xsl:element> <!-- xsl:stylesheet -->
  </xsl:template> 


<!--
  For each summary field, writes into the generated stylesheet an xsl:element
  named by @element that contains an xsl:value-of selecting @xpath.
-->
<xsl:template match="summary/field"><xsl:text>
</xsl:text>
      <xsl:element name="xsl:element">
        <xsl:attribute name="name">
          <xsl:value-of select="@element"/>
        </xsl:attribute><xsl:text>
</xsl:text>
        <xsl:element name="xsl:value-of">
          <xsl:attribute name="select">
            <xsl:value-of select="@xpath"/>
          </xsl:attribute><xsl:text>
</xsl:text>
        </xsl:element><xsl:text>
</xsl:text>
      </xsl:element><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>

output: gallery.xml (this file is the product of gallery_xhtml.xml and gallery.xsl)
<?xml version="1.0"?>
<gallery>
  <title>Journey to Windsor</title>
  <photo>
    <title>Windsor Castle</title>
    <date>July 2003</date>
    <image>dscn0824.jpg</image>
    <description>
      A bright, red mailbox inside the castle. It seems oddly familiar in an historic setting.
    </description>
  </photo>
</gallery>

You need to create an account or log in to post comments to this site.


Click here to browse all 4858 code snippets

Related Posts