Based on the code from 'Convert from HTML to XML with HTML Tidy', this script reads an HTML file (gallery.html), cleans it into XML with HTML Tidy, and extracts text from it into gallery.xml as instructed by the mapping file xhtml2xml.xml.
1
2
3
4 require 'tidy'
5 require 'projxslt'
6
# Relative base directory that is prefixed to the project folder name
# when the converter builds its file paths.
FILE_PATH = '../'
8
# Converts an HTML gallery page into a compact XML document.
#
# Pipeline (every file lives in FILE_PATH/xhtml2xml/):
#   1. gallery.html is cleaned into XML by HTML Tidy (configured by tidy.txt).
#   2. The leading declaration is replaced by a bare <html> tag and the
#      result is saved as gallery_xhtml.xml.
#   3. xhtml2xml.xml is transformed with xhtml2xml.xsl into gallery.xsl.
#   4. gallery_xhtml.xml is transformed with gallery.xsl into gallery.xml.
class Xhtml2Xml
  # Matches "<!" followed by two consecutive "...>" runs. In Tidy's output
  # this is presumably the DOCTYPE plus the opening <html ...> tag
  # (NOTE(review): confirm against real Tidy output).
  HTML_DECLARATION = /<!([^>]*>){2}/

  # Runs the whole conversion pipeline described on the class.
  # Prints Tidy's show_warnings option, errors and diagnostics to stdout.
  # Returns nil.
  def convert()
    project = 'xhtml2xml'
    filein = 'xhtml2xml.xml'
    filehtml = 'gallery.html'
    filexml = 'gallery_xhtml.xml'
    xslfile_temp = 'gallery.xsl'
    xslfile = 'xhtml2xml.xsl'
    fileout = 'gallery.xml'
    tidy_config = 'tidy.txt'

    # File.join yields the same "../xhtml2xml/<name>" paths as the former
    # string concatenation, without depending on a trailing slash.
    project_path = File.join(FILE_PATH, project)
    tidy_config_path = File.join(project_path, tidy_config)
    filein_path = File.join(project_path, filein)
    filehtml_path = File.join(project_path, filehtml)
    filexml_path = File.join(project_path, filexml)
    xslfile_temp_path = File.join(project_path, xslfile_temp)
    xslfile_path = File.join(project_path, xslfile)
    fileout_path = File.join(project_path, fileout)

    Tidy.path = '/usr/lib/libtidy.so'

    # File.read opens, reads and closes the file; the previous File.new
    # handle was never closed.
    html = File.read(filehtml_path)
    xml = Tidy.open(:show_warnings=>true) do |tidy|
      tidy.options.output_xml = true
      tidy.load_config(tidy_config_path)
      puts tidy.options.show_warnings
      cleaned = tidy.clean(html)
      puts tidy.errors
      puts tidy.diagnostics
      cleaned
    end

    save_file(filexml_path, strip_html_declaration(xml))
    # First pass builds the extraction stylesheet, second pass applies it.
    transform(filein_path, xslfile_path, xslfile_temp_path)
    transform(filexml_path, xslfile_temp_path, fileout_path)
  end

  # Applies the stylesheet at xsl_filepath to the document at xml_filepath
  # via Projxslt and writes the result to save_filepath.
  def transform(xml_filepath, xsl_filepath, save_filepath)
    pxsl = Projxslt.new(xml_filepath, xsl_filepath)
    outfile = pxsl.transform
    save_file(save_filepath, outfile)
  end

  # Writes buffer to filepath (truncating any existing file). IO#puts adds
  # a trailing newline when buffer does not already end with one.
  def save_file(filepath, buffer)
    # Block form guarantees the handle is closed even if the write raises.
    File.open(filepath, 'w') do |file|
      file.puts buffer
    end
  end

  private

  # Replaces every occurrence of the leading declaration (see
  # HTML_DECLARATION) with a bare "<html>" tag. Returns xml unchanged when
  # no declaration is present; previously that case raised a TypeError
  # because gsub was called with nil.
  def strip_html_declaration(xml)
    declaration = xml[HTML_DECLARATION]
    return xml if declaration.nil?
    xml.gsub(declaration, '<html>')
  end
end
63
# Run the conversion only when this file is executed directly,
# not when it is loaded by another script.
Xhtml2Xml.new.convert if __FILE__ == $PROGRAM_NAME
file: xhtml2xml.xml
1
<!--
  Mapping document consumed by xhtml2xml.xsl to generate gallery.xsl.
  root/@element     : name of the root element of the final output.
  summary/field     : one output element per field (@element), filled with
                      the value selected by @xpath inside the generated
                      template that matches "html".
  record/@xpath     : node set the generated stylesheet iterates over.
  record/@element   : name of the element written for each iterated node.
  record/field      : one child element per field (@element) holding the
                      normalize-space() of the value selected by @xpath.
-->
<root element="gallery">
<summary>
<field element="title" xpath="head/title"/>
</summary>
<record xpath="body/center/table/tr/td" element="photo">
<field xpath="font/br[3]/preceding-sibling::text()[1]" element="title"></field>
<!-- NOTE(review): this XPath is absolute (/html/body/table/...), unlike the
     sibling fields, so it selects the same node for every record and does
     not go through body/center/table. Confirm this is intended. -->
<field xpath="/html/body/table/tr/td[2]/font/br[3]/preceding-sibling::text()[1]" element="date"></field>
<field xpath="font/br[1]/preceding-sibling::text()[1]" element="image"></field>
<field xpath="font/br[2]/preceding-sibling::text()[1]" element="description"></field>
</record>
</root>
file: xhtml2xml.xsl (transforms xhtml2xml.xml into gallery.xsl)
1
2 <?xml version="1.0"?>
3 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<!--
  Generates the extraction stylesheet from the mapping document.

  Output: an xsl:stylesheet (version 1.0) with xml/indent output settings
  and a single template matching "html". That template writes the result
  root element (root/@element), the summary fields, and an xsl:for-each
  over record/@xpath. Inside the for-each, every record/field becomes an
  xsl:variable (name = @element, select = @xpath) followed by one
  record/@element element whose children hold normalize-space($variable).

  No xmlns:xsl attribute is created by hand: the generated elements are
  already in the XSLT namespace, so the serializer emits the declaration
  itself, and xsl:attribute must not be used to create namespace
  declarations. Line breaks are written as &#10; so that the indentation
  of this stylesheet never leaks into the generated one.
-->
<xsl:template match="root">
  <xsl:element name="xsl:stylesheet">
    <xsl:attribute name="version">1.0</xsl:attribute>
    <xsl:text>&#10;</xsl:text>

    <xsl:element name="xsl:output">
      <xsl:attribute name="method">xml</xsl:attribute>
      <xsl:attribute name="indent">yes</xsl:attribute>
    </xsl:element>
    <xsl:text>&#10;&#10;</xsl:text>

    <xsl:element name="xsl:template">
      <xsl:attribute name="match">html</xsl:attribute>
      <xsl:text>&#10;</xsl:text>
      <xsl:element name="{@element}">
        <!-- summary fields are handled by the summary/field template -->
        <xsl:apply-templates select="summary"/>

        <xsl:element name="xsl:for-each">
          <xsl:attribute name="select">
            <xsl:value-of select="record/@xpath"/>
          </xsl:attribute>
          <xsl:text>&#10;</xsl:text>

          <!-- one variable per field, evaluated relative to the record node -->
          <xsl:for-each select="record/field">
            <xsl:element name="xsl:variable">
              <xsl:attribute name="name">
                <xsl:value-of select="@element"/>
              </xsl:attribute>
              <xsl:attribute name="select">
                <xsl:value-of select="@xpath"/>
              </xsl:attribute>
            </xsl:element>
            <xsl:text>&#10;</xsl:text>
          </xsl:for-each>
          <xsl:text>&#10;</xsl:text>

          <xsl:element name="{record/@element}">
            <xsl:for-each select="record/field">
              <xsl:element name="{@element}">
                <xsl:text>&#10;</xsl:text>
                <xsl:element name="xsl:value-of">
                  <xsl:attribute name="select">
                    <xsl:text>normalize-space($</xsl:text>
                    <xsl:value-of select="@element"/>
                    <xsl:text>)</xsl:text>
                  </xsl:attribute>
                </xsl:element>
                <xsl:text>&#10;</xsl:text>
              </xsl:element>
            </xsl:for-each>
          </xsl:element>
          <xsl:text>&#10;</xsl:text>
        </xsl:element> <!-- generated xsl:for-each -->
        <xsl:text>&#10;</xsl:text>
      </xsl:element> <!-- result root element (root/@element) -->
    </xsl:element> <!-- generated xsl:template match="html" -->
  </xsl:element> <!-- generated xsl:stylesheet -->
</xsl:template>
78
79
<!--
  For each field inside summary, writes into the generated stylesheet an
  xsl:element whose name attribute is the field's @element and whose only
  instruction is an xsl:value-of selecting the field's @xpath.
  The xsl:text nodes only add line breaks to the generated stylesheet.
-->
<xsl:template match="summary/field"><xsl:text>
</xsl:text>
<xsl:element name="xsl:element">
<xsl:attribute name="name">
<xsl:value-of select="@element"/>
</xsl:attribute><xsl:text>
</xsl:text>
<xsl:element name="xsl:value-of">
<xsl:attribute name="select">
<xsl:value-of select="@xpath"/>
</xsl:attribute><xsl:text>
</xsl:text>
</xsl:element><xsl:text>
</xsl:text>
</xsl:element><xsl:text>
</xsl:text>
</xsl:template>
97 </xsl:stylesheet>
output: gallery.xml (this file is the product of gallery_xhtml.xml and gallery.xsl)
1
2 <?xml version="1.0"?>
3 <gallery>
4 <title>Journey to Windsor</title>
5 <photo>
6 <title>Windsor Castle</title>
7 <date>July 2003</date>
8 <image>dscn0824.jpg</image>
9 <description>
10 A bright, red mailbox inside the castle. It seems oddly familiar in an historic setting.
11 </description>
12 </photo>
13 </gallery>