"""Convert the tables of an HTML file into wiki table markup.

Usage: run the script with a single argument, the path of the HTML file.
The generated wiki text is printed to standard output.
"""

import re
import sys

try:  # Python 2 module layout
    import HTMLParser
    _BaseHTMLParser = HTMLParser.HTMLParser
except ImportError:  # Python 3 moved the class into html.parser
    from html.parser import HTMLParser as _BaseHTMLParser


class html2wiki(_BaseHTMLParser):
    """Parser that turns ``<table>``/``<tr>``/``<td>`` tags into wiki table text.

    Feed HTML through ``feed()``; the generated text accumulates in
    ``self.wiki``.  Every other tag is ignored, and text outside a ``<td>``
    cell is discarded.  Missing end tags are tolerated: a new ``<tr>`` closes
    the previous row, a new ``<td>`` closes the previous cell, ``</tr>``
    closes an open cell and ``</table>`` closes an open row.
    """

    def __init__(self):
        # Explicit base-class call (not super()) so the class also works with
        # the Python 2 HTMLParser base class.
        _BaseHTMLParser.__init__(self)
        self.wiki = ''      # The wiki text produced so far
        self.wikirow = ''   # The wiki row currently being built from HTML
        self.inTD = 0       # 1 while inside a <TD>...</TD> pair, else 0
        self.inTR = 0       # 1 while inside a <TR>...</TR> pair, else 0
        # Used to collapse runs of whitespace inside a cell to one space.
        self.re_multiplespaces = re.compile(r'\s+')
        self.rowCount = 0   # Number of rows closed so far
        self.rowspan = ''   # rowspan attribute of the current cell ('' if none)
        self.colspan = ''   # colspan attribute of the current cell ('' if none)
        self.linebreak = '<br>'  # Separator appended after every output line
        self.data = ''      # Raw text collected for the current cell
        self.prop = ''      # Attribute prefix emitted before the cell text

    def handle_starttag(self, tag, attrs):
        """Dispatch the opening tags this converter understands."""
        if tag == 'table':
            self.start_table()
        elif tag == 'tr':
            self.start_tr()
        elif tag == 'td':
            self.start_td(attrs)

    def handle_endtag(self, tag):
        """Dispatch the closing tags this converter understands."""
        if tag == 'table':
            self.end_table()
        elif tag == 'tr':
            self.end_tr()
        elif tag == 'td':
            self.end_td()

    def start_table(self):
        """Emit the table opener followed by a first row separator."""
        self.wiki += '{| border=1' + self.linebreak
        self.wiki += '|-' + self.linebreak

    def end_table(self):
        """Emit the table terminator, flushing a row left open by the HTML."""
        if self.inTR:
            self.end_tr()  # </TABLE> implies </TR>
        self.wiki += '|}' + self.linebreak

    def start_tr(self):
        """Begin a new row, closing the previous one if it is still open."""
        if self.inTR:
            self.end_tr()  # <TR> implies </TR>
        self.inTR = 1

    def end_tr(self):
        """Close the current row and append it to the output.

        A row that produced no cells adds nothing to the output, but is still
        counted in ``rowCount``.
        """
        if self.inTD:
            self.end_td()  # </TR> implies </TD>
        self.inTR = 0
        if self.wikirow:
            self.wiki += self.wikirow
            self.wiki += '|-' + self.linebreak
            self.wikirow = ''
        self.rowCount += 1

    def start_td(self, attrs):
        """Begin a new cell and remember its rowspan/colspan attributes.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        parser; ``value`` is ``None`` for an attribute written without a
        value, which is treated here as if the attribute were absent.
        """
        if self.inTD:
            self.end_td()  # <TD> implies </TD>, otherwise the open cell is lost
        if not self.inTR:
            self.start_tr()  # <TD> implies <TR>
        self.data = ''
        self.prop = ''
        self.rowspan = ''
        self.colspan = ''
        for key, value in attrs:
            if key == 'rowspan':
                self.rowspan = value or ''
            elif key == 'colspan':
                self.colspan = value or ''
        self.inTD = 1

    def end_td(self):
        """Close the current cell and append its wiki line to the row."""
        if not self.inTD:
            return
        # NOTE(review): double quotes are doubled in the output; confirm this
        # escaping is really wanted in wiki text.
        text = (self.data.replace('\t', ' ')
                         .replace(self.linebreak, '')
                         .replace('\r', '')
                         .replace('"', '""'))
        text = self.re_multiplespaces.sub(' ', text)
        self.wikirow += '| ' + self.prop + text + self.linebreak
        self.data = ''
        self.inTD = 0

    def handle_data(self, data):
        """Collect the text found inside the current cell.

        NOTE(review): the rowspan/colspan prefix is only built once the cell
        has received non-blank text, so an empty cell is emitted without its
        span attributes.  Kept as in the original; confirm it is intended.
        """
        if not self.inTD:
            return
        if data.strip() != '':
            self.prop = ''
            if self.rowspan != '':
                self.prop = ' rowspan = ' + self.rowspan
            if self.colspan != '':
                self.prop += ' colspan = ' + self.colspan
            if self.prop:
                self.prop += ' | '
        self.data += data


def main(argv=None):
    """Convert the HTML file named by ``argv[1]`` and print the wiki text.

    ``argv`` defaults to ``sys.argv``.  When it does not hold exactly one
    file name, a usage message is printed instead.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print('Argument - filename required')
        return
    parser = html2wiki()
    # The context manager closes the file even if reading or parsing fails.
    with open(argv[1], "r") as in_file:
        parser.feed(in_file.read())
    print(parser.wiki)


if __name__ == '__main__':
    main()
You need to create an account or log in to post comments to this site.