Mercurial > hg > config
annotate python/html2text.py @ 929:7c4be71a560b default tip
remove old aliases
| author | Jeff Hammel <k0scist@gmail.com> | 
|---|---|
| date | Mon, 20 Oct 2025 15:22:19 -0700 | 
| parents | cb1b91c6bceb | 
| children | 
| rev | line source | 
|---|---|
| 770 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 1 #!/usr/bin/env python | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 2 # -*- coding: utf-8 -*- | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 3 | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 4 """ | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 5 convert HTML to text using only HTMLParser | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 6 """ | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 7 | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 8 # imports | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
import argparse
import sys

try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 12 | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
class HTML2Text(HTMLParser):
    """Collect the readable text found inside an HTML document's <body>.

    Feed markup with ``feed()`` and finish with ``close()``;
    ``str(parser)`` then returns the collected text chunks joined by
    newlines.  Character data that appears before the ``<body>`` start
    tag, or inside ``<script>`` / ``<style>`` elements, is not collected.
    """

    # elements whose character data is code or styling, not readable text
    ignored_tags = frozenset(['script', 'style'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.in_body = False    # set once the <body> start tag is seen
        self.ignored_depth = 0  # > 0 while inside a <script>/<style>
        self.text = []          # stripped, non-empty text chunks

    def handle_starttag(self, tag, attrs):
        """Note entry into <body> and into elements whose data is skipped."""
        if tag == 'body':
            self.in_body = True
        elif tag in self.ignored_tags:
            self.ignored_depth += 1

    def handle_endtag(self, tag):
        """Note exit from an element whose data is skipped."""
        # guard against a stray end tag driving the counter negative
        if tag in self.ignored_tags and self.ignored_depth:
            self.ignored_depth -= 1

    def handle_data(self, data):
        """Record `data`, stripped, if it is non-empty readable body text."""
        if not self.in_body or self.ignored_depth:
            return
        data = data.strip()
        if data:
            self.text.append(data)

    def __str__(self):
        """Return the collected text chunks, one per line."""
        return '\n'.join(self.text)
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 32 | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
def main(args=None):
    """Command-line entry point: read HTML and print its body text.

    `args` is the list of command-line arguments without the program
    name.  When it is None, argparse reads ``sys.argv[1:]`` at call
    time rather than using a copy frozen when the module was imported.
    """

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help='input file, or read from stdin if omitted')
    options = parser.parse_args(args)

    # read HTML; close the file argparse opened, but leave stdin alone
    try:
        html = options.input.read()
    finally:
        if options.input is not sys.stdin:
            options.input.close()

    # parse HTML
    html_parser = HTML2Text()
    html_parser.feed(html)
    html_parser.close()

    # output it
    print(html_parser)
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 50 | 
| 
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
 Jeff Hammel <k0scist@gmail.com> parents: diff
if __name__ == '__main__':
    # executed as a script rather than imported: run the command line
    main()
