1
2
3
# Module metadata: authorship, licensing, version, and where the code came from.
__author__ = "Andrew Pennebaker (andrew.pennebaker@gmail.com)"
__date__ = "10 Dec 2006"
__copyright__ = "Copyright 2006 Andrew Pennebaker"
__license__ = "GPL"
__version__ = "0.0.1"
__credits__ = "Based on http://mail.python.org/pipermail/python-list/2004-November/291562.html"
__URL__ = "http://snippets.dzone.com/posts/show/3127"
11
12 import htmllib
13 from sgmllib import SGMLParser
14
15 import sys
16
17 class html2txt(SGMLParser):
18 """html2txt()"""
19
20 def reset(self):
21 SGMLParser.reset(self)
22 self.pieces=[]
23
24 def handle_data(self, text):
25 self.pieces.append(text)
26
27 def unknown_starttag(self, tag, attributes):
28 pass
29
30 def unknown_endtag(self, tag):
31 pass
32
33 def handle_entityref(self, ref):
34 try:
35 self.pieces.append(htmllib.HTMLParser.entitydefs[ref])
36 except KeyError, e:
37 self.pieces.append("&"+ref)
38
39 def output(self):
40 return "".join(self.pieces)
41
if __name__ == "__main__":
    # Command-line filter: read an HTML document from standard input and
    # write its text content to standard output.
    html = sys.stdin.read()

    parser = html2txt()
    parser.feed(html)
    parser.close()  # signal end of input to the parser

    # With a single argument the parenthesised form prints the same text as
    # the Python 2 print statement and is also valid Python 3 syntax.
    print(parser.output())