"""Rewrite an HTML file so every tag and every text run sits on its own line.

Python 2 only: relies on ``sgmllib``, which is not available in Python 3.
"""

from sgmllib import SGMLParser

import urllib


class ParserHTML(SGMLParser):
    """Parser that echoes each start tag, end tag and data run to ``self.f``.

    Every item is written followed by a newline. ``scrivi`` must be called
    before ``feed`` so that ``self.f`` exists.
    """

    def scrivi(self, path='/tmp/fileOUT.html'):
        """Open the output file for writing and store it in ``self.f``.

        path -- file to create or truncate; defaults to the location the
                script has always used.

        The caller owns the handle and is responsible for closing it.
        """
        self.f = open(path, 'w')

    def unknown_starttag(self, tag, attrs):
        """Write the start tag ``tag`` with its attributes, then a newline.

        attrs -- sequence of ``(name, value)`` string pairs.

        Heuristic kept from the original: a pair whose name and value differ
        only by letter case is taken to be the next word of the previous
        attribute's unquoted, space-separated value, and is merged into it
        (``title=Hello World`` -> ``title="Hello World"``).
        NOTE(review): this presumes the parser reports a value-less word as
        (lower-cased name, original spelling); an upper-case boolean
        attribute that follows another attribute is merged the same way.
        """
        pieces = ['<' + tag]

        for name, value in attrs:
            continuation = name.lower() == value.lower() and name != value
            # len(pieces) > 1 means an attribute has already been emitted.
            # Without that check the merge would eat the last character of
            # the tag name itself.
            if continuation and len(pieces) > 1:
                # Drop the previous closing quote, append the word and close
                # the quote again right away, so the value stays terminated
                # whatever attribute comes next.
                pieces[-1] = pieces[-1][:-1] + ' ' + value + '"'
            else:
                pieces.append(' ' + name + '="' + value + '"')

        pieces.append('>')
        self.f.write(''.join(pieces) + "\n")

    def handle_data(self, data):
        """Write a run of character data followed by a newline."""
        self.f.write(data + "\n")

    def unknown_endtag(self, tag):
        """Write the end tag ``</tag>`` followed by a newline."""
        self.f.write('</' + tag + '>' + "\n")


if __name__ == '__main__':

    p = ParserHTML()
    p.scrivi()
    try:
        with open('/tmp/fileIN.html', 'r') as source:
            p.feed(source.read())
        # Force the parser to process whatever input it is still buffering.
        p.close()
    finally:
        # Always release the output handle, even if parsing failed.
        p.f.close()
You need to create an account or log in to post comments to this site.