Based on the code from
'Convert from HTML to XML with HTML Tidy', this code reads an HTML file (gallery.html), cleans it into XML with HTML Tidy, and extracts text into gallery.xml as directed by the mapping file xhtml2xml.xml.
require 'tidy'
require 'projxslt'
FILE_PATH = "../"
# Converts an HTML gallery page into a small XML document.
#
# The page is cleaned into XML by HTML Tidy, the mapping file xhtml2xml.xml is
# turned into a stylesheet (gallery.xsl) by xhtml2xml.xsl, and that generated
# stylesheet is then applied to the cleaned page to produce gallery.xml.
class Xhtml2Xml
  # Matches "<!" followed by two ">"-terminated runs, i.e. a leading
  # declaration such as a DOCTYPE together with the tag that follows it.
  HTML_DECLARATION = /<!([^>]*>){2}/

  # Runs the whole pipeline for the 'xhtml2xml' project directory located
  # under FILE_PATH. All file names are fixed; nothing is returned.
  def convert
    project_path = File.join(FILE_PATH, 'xhtml2xml')

    tidy_config_path  = File.join(project_path, 'tidy.txt')
    filein_path       = File.join(project_path, 'xhtml2xml.xml')
    filehtml_path     = File.join(project_path, 'gallery.html')
    filexml_path      = File.join(project_path, 'gallery_xhtml.xml')
    xslfile_temp_path = File.join(project_path, 'gallery.xsl')
    xslfile_path      = File.join(project_path, 'xhtml2xml.xsl')
    fileout_path      = File.join(project_path, 'gallery.xml')

    Tidy.path = '/usr/lib/libtidy.so'

    # File.read opens, reads and closes the file, so no handle is leaked.
    xml = tidy_to_xml(File.read(filehtml_path), tidy_config_path)
    save_file(filexml_path, replace_html_declaration(xml))

    # First pass generates the stylesheet, second pass applies it.
    transform(filein_path, xslfile_path, xslfile_temp_path)
    transform(filexml_path, xslfile_temp_path, fileout_path)
  end

  # Applies the stylesheet at +xsl_filepath+ to the document at
  # +xml_filepath+ via Projxslt and writes the result to +save_filepath+.
  def transform(xml_filepath, xsl_filepath, save_filepath)
    result = Projxslt.new(xml_filepath, xsl_filepath).transform
    save_file(save_filepath, result)
  end

  # Writes +buffer+ to +filepath+ with IO#puts (so a trailing newline is
  # added when the buffer does not already end with one). The block form
  # guarantees the handle is closed even if the write raises.
  def save_file(filepath, buffer)
    File.open(filepath, 'w') { |file| file.puts buffer }
  end

  private

  # Cleans +html+ with Tidy configured for XML output plus the options in
  # +tidy_config_path+, printing the show_warnings option, Tidy's errors and
  # its diagnostics to stdout. Returns the cleaned markup.
  def tidy_to_xml(html, tidy_config_path)
    Tidy.open(:show_warnings => true) do |tidy|
      tidy.options.output_xml = true
      tidy.load_config(tidy_config_path)
      puts tidy.options.show_warnings
      cleaned = tidy.clean(html)
      puts tidy.errors
      puts tidy.diagnostics
      cleaned
    end
  end

  # Replaces the text matched by HTML_DECLARATION with a bare '<html>' tag.
  # When nothing matches, +xml+ is returned unchanged instead of passing nil
  # to String#gsub, which would raise TypeError.
  def replace_html_declaration(xml)
    declaration = xml[HTML_DECLARATION]
    return xml unless declaration

    xml.gsub(declaration, '<html>')
  end
end
# Run the conversion only when this file is executed directly rather than
# loaded by another script.
Xhtml2Xml.new.convert if __FILE__ == $PROGRAM_NAME
file: xhtml2xml.xml
<!-- Mapping file read by xhtml2xml.xsl to generate gallery.xsl.
     root/@element names the output document element. Each summary field
     becomes an output element named by @element whose value is selected by
     @xpath. record/@xpath selects the nodes to iterate over, record/@element
     names the element emitted per node, and each record field becomes a child
     element named by @element whose value is selected by @xpath.
     NOTE(review): the "date" field uses an absolute path starting at /html,
     while the other record fields are relative to the record node; confirm
     this is intended. -->
<root element="gallery">
<summary>
<field element="title" xpath="head/title"/>
</summary>
<record xpath="body/center/table/tr/td" element="photo">
<field xpath="font/br[3]/preceding-sibling::text()[1]" element="title"></field>
<field xpath="/html/body/table/tr/td[2]/font/br[3]/preceding-sibling::text()[1]" element="date"></field>
<field xpath="font/br[1]/preceding-sibling::text()[1]" element="image"></field>
<field xpath="font/br[2]/preceding-sibling::text()[1]" element="description"></field>
</record>
</root>
file: xhtml2xml.xsl (transforms the mapping file xhtml2xml.xml into the stylesheet gallery.xsl)
<?xml version="1.0"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<!-- Builds the generated stylesheet from the mapping file's <root> element:
     an xsl:stylesheet holding an xsl:output (method xml, indent yes) and a
     single xsl:template matching "html". -->
<xsl:template match="root">
<!-- The colon is kept in a variable so that attribute value templates below
     can assemble the names "xmlns:xsl" and "xsl:for-each" at run time. -->
<xsl:variable name="colon"><xsl:text>:</xsl:text></xsl:variable>
<xsl:element name="xsl:stylesheet">
<!-- NOTE(review): this tries to write the xmlns:xsl declaration as an
     ordinary attribute; XSLT 1.0 (section 7.1.3) says xsl:attribute does not
     produce namespace declarations, so confirm the processor in use accepts
     this. -->
<xsl:attribute name="xmlns{$colon}xsl">
<xsl:text>http://www.w3.org/1999/XSL/Transform</xsl:text>
</xsl:attribute>
<xsl:attribute name="version">
<xsl:text>1.0</xsl:text>
</xsl:attribute><xsl:text>
</xsl:text>
<xsl:element name="xsl:output">
<xsl:attribute name="method">
<xsl:text>xml</xsl:text>
</xsl:attribute>
<xsl:attribute name="indent">
<xsl:text>yes</xsl:text>
</xsl:attribute>
</xsl:element><xsl:text>
</xsl:text>
<xsl:element name="xsl:template">
<xsl:attribute name="match">
<xsl:text>html</xsl:text>
</xsl:attribute><xsl:text>
</xsl:text>
<!-- Output document element, named by root/@element. -->
<xsl:element name="{@element}">
<xsl:apply-templates select="summary"/>
<!-- Emit an xsl:for-each whose select is record/@xpath. -->
<xsl:element name="xsl{$colon}for-each">
<xsl:attribute name="select">
<xsl:value-of select="record/@xpath"/>
</xsl:attribute><xsl:text>
</xsl:text>
<!-- One xsl:variable per record field, named by @element and selecting @xpath. -->
<xsl:for-each select="record/field">
<xsl:element name="xsl:variable">
<xsl:attribute name="name">
<xsl:value-of select="@element"/>
</xsl:attribute>
<xsl:attribute name="select">
<xsl:value-of select="@xpath"/>
</xsl:attribute>
</xsl:element><xsl:text>
</xsl:text>
</xsl:for-each>
<xsl:text>
</xsl:text>
<!-- The per-record element (record/@element) with one child per field; each
     child holds an xsl:value-of of normalize-space() applied to the variable
     declared above for that field. -->
<xsl:element name="{record/@element}">
<xsl:for-each select="record/field">
<xsl:element name="{@element}"><xsl:text>
</xsl:text>
<xsl:element name="xsl:value-of">
<xsl:attribute name="select"><xsl:text>normalize-space($</xsl:text>
<xsl:value-of select="@element"/>
<xsl:text>)</xsl:text>
</xsl:attribute>
</xsl:element> <xsl:text>
</xsl:text>
</xsl:element>
</xsl:for-each>
</xsl:element><xsl:text>
</xsl:text>
</xsl:element><xsl:text>
</xsl:text>
</xsl:element> <!-- output document element (root/@element) -->
</xsl:element> <!-- template match -->
</xsl:element> <!-- xsl:stylesheet -->
</xsl:template>
<!-- For each summary field, emits an xsl:element named by @element that
     contains an xsl:value-of selecting @xpath; the xsl:text nodes only insert
     line breaks into the generated stylesheet. -->
<xsl:template match="summary/field"><xsl:text>
</xsl:text>
<xsl:element name="xsl:element">
<xsl:attribute name="name">
<xsl:value-of select="@element"/>
</xsl:attribute><xsl:text>
</xsl:text>
<xsl:element name="xsl:value-of">
<xsl:attribute name="select">
<xsl:value-of select="@xpath"/>
</xsl:attribute><xsl:text>
</xsl:text>
</xsl:element><xsl:text>
</xsl:text>
</xsl:element><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
output: gallery.xml (produced by applying gallery.xsl to gallery_xhtml.xml)
<?xml version="1.0"?>
<gallery>
<title>Journey to Windsor</title>
<photo>
<title>Windsor Castle</title>
<date>July 2003</date>
<image>dscn0824.jpg</image>
<description>
A bright, red mailbox inside the castle. It seems oddly familiar in an historic setting.
</description>
</photo>
</gallery>