Mercurial > hg > config
comparison python/html2text.py @ 770:cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Mon, 07 Mar 2016 12:22:04 -0800 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 769:489204193cd7 | 770:cb1b91c6bceb |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 | |
| 4 """ | |
| 5 convert HTML to text using only HTMLParser | |
| 6 """ | |
| 7 | |
| 8 # imports | |
| 9 import argparse | |
| 10 import sys | |
| 11 from HTMLParser import HTMLParser | |
| 12 | |
class HTML2Text(HTMLParser):
    """
    Collect the readable text found inside the <body> of an HTML document.

    Feed markup with ``feed()``; ``str(instance)`` returns the collected
    text fragments joined by newlines. Content of <script> and <style>
    elements is ignored, since it is code or styling and not document text.
    """

    # elements whose character data must not be reported as text
    _non_text_tags = ('script', 'style')

    def __init__(self):
        HTMLParser.__init__(self)
        self.in_body = False  # set once a <body> start tag has been seen
        self.text = []  # stripped, non-empty text fragments in document order
        self._skipping = None  # name of the currently open non-text element

    def handle_starttag(self, tag, attrs):
        """Record entry into <body> and into non-text elements."""
        if tag == 'body':
            self.in_body = True
        elif tag in self._non_text_tags and self._skipping is None:
            self._skipping = tag

    def handle_endtag(self, tag):
        """Resume text collection once the open non-text element closes."""
        # a stray end tag never matches None, so it is simply ignored
        if tag == self._skipping:
            self._skipping = None

    def handle_data(self, data):
        """Store stripped, non-empty character data found in the body."""
        if not self.in_body or self._skipping is not None:
            return
        data = data.strip()
        if data:
            self.text.append(data)

    def __str__(self):
        """Return the collected text fragments, one per line."""
        return '\n'.join(self.text)
| 32 | |
def main(args=None):
    """
    Command line entry point: print the body text of an HTML document.

    args -- list of command line arguments; when None, argparse reads
            sys.argv[1:] at call time instead of a copy made at import
    """

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help='input file, or read from stdin if omitted')
    options = parser.parse_args(args)

    # read the document, releasing the file handle argparse opened for us
    try:
        html = options.input.read()
    finally:
        # never close stdin: it is not ours (also covers the '-' argument)
        if options.input is not sys.stdin:
            options.input.close()

    # parse HTML
    html_parser = HTML2Text()
    html_parser.feed(html)
    html_parser.close()  # force processing of any data still buffered

    # output it
    print (html_parser)


if __name__ == '__main__':
    main()
