#!/usr/bin/ruby
# Scrapes gallery.html into gallery.xml in three steps:
#   1. HTML Tidy rewrites gallery.html as XML (gallery_xhtml.xml).
#   2. xhtml2xml.xsl turns the mapping file xhtml2xml.xml into gallery.xsl.
#   3. gallery.xsl is applied to gallery_xhtml.xml, producing gallery.xml.

require 'tidy'
require 'projxslt'

# Prefix for the project directory. Paths are built by plain string
# concatenation, so this value must end with '/'.
FILE_PATH = "../"

# Drives the HTML -> XML conversion for the 'xhtml2xml' project directory.
class Xhtml2Xml
  # Matches from '<!' through the second '>' that follows it, i.e. the
  # DOCTYPE declaration together with the tag right after it.
  # NOTE(review): presumably that second tag is Tidy's '<html ...>' opening
  # tag, since the match is replaced with a bare '<html>' - confirm.
  DOCTYPE_AND_NEXT_TAG = /<!([^>]*>){2}/

  # Runs the whole pipeline using the fixed file names inside
  # FILE_PATH + 'xhtml2xml/'. Prints Tidy's show_warnings option, errors
  # and diagnostics to stdout and writes three files:
  # gallery_xhtml.xml, gallery.xsl and gallery.xml.
  #
  # @return [void]
  def convert
    project_path = FILE_PATH + 'xhtml2xml' + '/'

    tidy_config_path  = project_path + 'tidy.txt'
    filein_path       = project_path + 'xhtml2xml.xml'
    filehtml_path     = project_path + 'gallery.html'
    filexml_path      = project_path + 'gallery_xhtml.xml'
    xslfile_temp_path = project_path + 'gallery.xsl'
    xslfile_path      = project_path + 'xhtml2xml.xsl'
    fileout_path      = project_path + 'gallery.xml'

    Tidy.path = '/usr/lib/libtidy.so'

    # File.read opens and closes the handle itself; the previous
    # File.new(...).read left the input file open.
    html = File.read(filehtml_path)
    xml = tidy_to_xml(html, tidy_config_path)

    # Strip out the html document type declaration and save the file.
    save_file(filexml_path, strip_doctype(xml))

    transform(filein_path, xslfile_path, xslfile_temp_path)
    transform(filexml_path, xslfile_temp_path, fileout_path)
  end

  # Applies a stylesheet to an XML file through Projxslt and saves whatever
  # Projxslt#transform returns.
  #
  # @param xml_filepath [String] path of the XML input
  # @param xsl_filepath [String] path of the stylesheet
  # @param save_filepath [String] path the result is written to
  # @return [void]
  def transform(xml_filepath, xsl_filepath, save_filepath)
    pxsl = Projxslt.new(xml_filepath, xsl_filepath)
    save_file(save_filepath, pxsl.transform)
  end

  # Writes buffer to filepath, truncating any existing file. IO#puts adds a
  # trailing newline when buffer does not already end with one.
  #
  # @param filepath [String] destination path
  # @param buffer [#to_s] content to write
  # @return [void]
  def save_file(filepath, buffer)
    # Block form guarantees the handle is closed even if the write raises.
    File.open(filepath, 'w') { |file| file.puts buffer }
  end

  private

  # Runs HTML Tidy over html with XML output enabled and the given config
  # file loaded, echoing Tidy's warnings setting, errors and diagnostics.
  #
  # @param html [String] raw HTML markup
  # @param tidy_config_path [String] path of the Tidy configuration file
  # @return [String] the value returned by tidy.clean
  def tidy_to_xml(html, tidy_config_path)
    Tidy.open(:show_warnings => true) do |tidy|
      tidy.options.output_xml = true
      tidy.load_config(tidy_config_path)
      puts tidy.options.show_warnings
      cleaned = tidy.clean(html)
      puts tidy.errors
      puts tidy.diagnostics
      cleaned
    end
  end

  # Replaces the DOCTYPE-plus-following-tag prefix with a bare '<html>'.
  # When the markup has no such prefix it is returned unchanged; the
  # previous code passed nil to String#gsub in that case and raised
  # TypeError.
  #
  # @param xml [String] markup produced by Tidy
  # @return [String] markup without the document type declaration
  def strip_doctype(xml)
    declaration = xml[DOCTYPE_AND_NEXT_TAG]
    return xml unless declaration

    xml.gsub(declaration, '<html>')
  end
end

if __FILE__ == $0
  Xhtml2Xml.new.convert
end
file: xhtml2xml.xml
<!-- Mapping file read by xhtml2xml.xsl, which turns it into the extraction
     stylesheet gallery.xsl. root/@element names the element that the
     generated template (match="html") writes around everything else. -->
<root element="gallery">
  <!-- Each summary field is emitted once: an element named @element whose
       content is the value of @xpath, evaluated from the matched html node. -->
  <summary>
    <field element="title" xpath="head/title"/>
  </summary>
  <!-- record/@xpath becomes the select of the generated xsl:for-each;
       one record/@element element is written for every node it selects. -->
  <record xpath="body/center/table/tr/td" element="photo">
    <!-- Each field becomes a variable (name = @element, select = @xpath) and
         then a child element holding normalize-space() of that variable. -->
    <field xpath="font/br[3]/preceding-sibling::text()[1]" element="title"></field>
    <!-- NOTE(review): unlike its siblings, this path is absolute (it starts
         at /html and has no center step), so it is not evaluated relative
         to the current record node. Confirm this is intended. -->
    <field xpath="/html/body/table/tr/td[2]/font/br[3]/preceding-sibling::text()[1]" element="date"></field>
    <field xpath="font/br[1]/preceding-sibling::text()[1]" element="image"></field>
    <field xpath="font/br[2]/preceding-sibling::text()[1]" element="description"></field>
  </record>
</root>
file: xhtml2xml.xsl (transforms xhtml2xml.xml into gallery.xsl)
<?xml version="1.0"?>
<!-- Stylesheet generator: applied to the mapping file (root / summary /
     record / field), it writes out another XSLT stylesheet whose single
     template matches "html" and extracts the mapped values.
     The <xsl:text> </xsl:text> fragments emit a single space between the
     generated elements. -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">

  <!-- Builds the complete generated stylesheet from the mapping's root element. -->
  <xsl:template match="root">
    <!-- $colon holds ":" and is spliced into names through attribute value
         templates, so "xmlns{$colon}xsl" evaluates to "xmlns:xsl" and
         "xsl{$colon}for-each" to "xsl:for-each".
         NOTE(review): presumably a workaround for processor checks on these
         names; confirm it is still needed. -->
    <xsl:variable name="colon"><xsl:text>:</xsl:text></xsl:variable>

    <!-- Generated <xsl:stylesheet version="1.0"> wrapper. -->
    <xsl:element name="xsl:stylesheet">
      <xsl:attribute name="xmlns{$colon}xsl">
        <xsl:text>http://www.w3.org/1999/XSL/Transform</xsl:text>
      </xsl:attribute>
      <xsl:attribute name="version">
        <xsl:text>1.0</xsl:text>
      </xsl:attribute><xsl:text> </xsl:text>

      <!-- Generated <xsl:output method="xml" indent="yes"/>. -->
      <xsl:element name="xsl:output">
        <xsl:attribute name="method">
          <xsl:text>xml</xsl:text>
        </xsl:attribute>
        <xsl:attribute name="indent">
          <xsl:text>yes</xsl:text>
        </xsl:attribute>
      </xsl:element><xsl:text> </xsl:text>

      <!-- Generated <xsl:template match="html">. -->
      <xsl:element name="xsl:template">
        <xsl:attribute name="match">
          <xsl:text>html</xsl:text>
        </xsl:attribute><xsl:text> </xsl:text>

        <!-- Root element of the final document, named by root/@element. -->
        <xsl:element name="{@element}">
          <!-- No template matches summary itself, so the built-in rule
               descends to the summary/field template below. -->
          <xsl:apply-templates select="summary"/>

          <!-- Generated xsl:for-each over the nodes selected by record/@xpath. -->
          <xsl:element name="xsl{$colon}for-each">
            <xsl:attribute name="select">
              <xsl:value-of select="record/@xpath"/>
            </xsl:attribute><xsl:text> </xsl:text>

            <!-- First pass over the fields: one generated xsl:variable per
                 field, named @element and selecting @xpath. -->
            <xsl:for-each select="record/field">
              <xsl:element name="xsl:variable">
                <xsl:attribute name="name">
                  <xsl:value-of select="@element"/>
                </xsl:attribute>
                <xsl:attribute name="select">
                  <xsl:value-of select="@xpath"/>
                </xsl:attribute>
              </xsl:element><xsl:text> </xsl:text>
            </xsl:for-each>
            <xsl:text> </xsl:text>

            <!-- Record element (record/@element) written once per iteration. -->
            <xsl:element name="{record/@element}">
              <!-- Second pass over the fields: one child element per field,
                   containing a generated
                   xsl:value-of select="normalize-space($name)". -->
              <xsl:for-each select="record/field">
                <xsl:element name="{@element}"><xsl:text> </xsl:text>
                  <xsl:element name="xsl:value-of">
                    <xsl:attribute name="select"><xsl:text>normalize-space($</xsl:text>
                      <xsl:value-of select="@element"/>
                      <xsl:text>)</xsl:text>
                    </xsl:attribute>
                  </xsl:element> <xsl:text> </xsl:text>
                </xsl:element>
              </xsl:for-each>
            </xsl:element><xsl:text> </xsl:text>
          </xsl:element><xsl:text> </xsl:text>
        </xsl:element> <!-- end of root output element ({@element}) -->
      </xsl:element> <!-- end of generated xsl:template -->
    </xsl:element> <!-- end of generated xsl:stylesheet -->
  </xsl:template>

  <!-- For each summary field, generates
       xsl:element name="@element" wrapping xsl:value-of select="@xpath". -->
  <xsl:template match="summary/field"><xsl:text> </xsl:text>
    <xsl:element name="xsl:element">
      <xsl:attribute name="name">
        <xsl:value-of select="@element"/>
      </xsl:attribute><xsl:text> </xsl:text>
      <xsl:element name="xsl:value-of">
        <xsl:attribute name="select">
          <xsl:value-of select="@xpath"/>
        </xsl:attribute><xsl:text> </xsl:text>
      </xsl:element><xsl:text> </xsl:text>
    </xsl:element><xsl:text> </xsl:text>
  </xsl:template>

</xsl:stylesheet>
output: gallery.xml (this file is the product of gallery_xhtml.xml and gallery.xsl)
<?xml version="1.0"?> <gallery> <title>Journey to Windsor</title> <photo> <title>Windsor Castle</title> <date>July 2003</date> <image>dscn0824.jpg</image> <description> A bright, red mailbox inside the castle. It seems oddly familiar in an historic setting. </description> </photo> </gallery>