Mercurial > hg > config
annotate python/html2text.py @ 928:84543f2cda0d
restore my real email that companies keep making me change
| author | Jeff Hammel <k0scist@gmail.com> | 
|---|---|
| date | Tue, 14 Oct 2025 14:20:55 -0700 | 
| parents | cb1b91c6bceb | 
| children | 
| rev | line source | 
|---|---|
| 
770
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
1 #!/usr/bin/env python | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
2 # -*- coding: utf-8 -*- | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
3 | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
4 """ | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
5 convert HTML to text using only HTMLParser | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
6 """ | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
7 | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
8 # imports | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
9 import argparse | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
10 import sys | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
11 from HTMLParser import HTMLParser | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
12 | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
class HTML2Text(HTMLParser):
    """Collect the readable text found inside the <body> of an HTML document.

    Feed markup with `feed()`, then convert the instance with `str()` to get
    the collected text: each non-blank run of character data, stripped of
    surrounding whitespace, on its own line.

    Data seen before the <body> start tag is ignored, and so is data inside
    the elements named in `ignored_tags`, since their contents are code or
    styling rather than text meant for the reader.
    """

    # elements whose character data must not appear in the output
    ignored_tags = ('script', 'style')

    def __init__(self):
        # explicit base-class call, kept as in the original code
        HTMLParser.__init__(self)
        self.in_body = False  # set once the <body> start tag has been seen
        self.text = []  # stripped, non-empty text chunks in document order
        self._ignored = 0  # number of currently open `ignored_tags` elements

    def handle_starttag(self, tag, attrs):
        """Note entry into <body> and into any element in `ignored_tags`."""
        if tag == 'body':
            self.in_body = True
        elif tag in self.ignored_tags:
            self._ignored += 1

    def handle_endtag(self, tag):
        """Note exit from an element in `ignored_tags`."""
        # guard against a stray end tag driving the counter negative
        if tag in self.ignored_tags and self._ignored:
            self._ignored -= 1

    def handle_data(self, data):
        """Record `data` if it is non-blank body text outside ignored elements."""
        if not self.in_body or self._ignored:
            return
        data = data.strip()
        if data:
            self.text.append(data)

    def __str__(self):
        """Return the collected text chunks joined by newlines."""
        return '\n'.join(self.text)
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
32 | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
def main(args=sys.argv[1:]):
    """Command-line entry point: read HTML and print the text of its body.

    args -- command-line arguments without the program name; the default is
            `sys.argv[1:]` as evaluated when this function is defined

    The HTML is read from the file named by the optional positional
    argument, or from stdin when it is omitted, and the extracted text is
    printed to stdout.
    """

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help='input file, or read from stdin if omitted')
    options = parser.parse_args(args)

    # read the input, closing any file that argparse opened for us
    try:
        html = options.input.read()
    finally:
        # stdin belongs to the process, not to this call: leave it open
        if options.input is not sys.stdin:
            options.input.close()

    # parse HTML
    html_parser = HTML2Text()
    html_parser.feed(html)
    html_parser.close()

    # output it
    print(html_parser)
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
50 | 
| 
 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
# run the command-line interface only when this file is executed as a script
if __name__ == '__main__':
    main()
