#!/usr/bin/env python

# License:
# 2006 Lenny Domnitser, all rights waived.

'''
Convert named character references (entities) in HTML to Unicode.
'''

# NOTE: the module docstring above is shown verbatim as the --help
# description by main(), so treat it as user-facing text.

# The entity table module was renamed in Python 3; keep the historical
# name bound so the rest of the module works on either version.
try:
    import htmlentitydefs  # Python 2
except ImportError:
    import html.entities as htmlentitydefs  # Python 3
import re

# Python 3 folded unichr() into chr().
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

__all__ = 'convert_entities',

# Matches a named character reference such as "&eacute;"; group 1 is the
# bare entity name.
RE_ent = re.compile(r'&(\w+);')

# Entities that stand for markup-significant characters. Expanding them
# could change how the surrounding HTML parses, so they are left escaped.
_KEEP_ESCAPED = frozenset(('lt', 'gt', 'amp', 'quot', 'apos'))


def _ent_match_to_unicode(match):
    '''Return the replacement text for one RE_ent match.

    A known entity name that is not markup-significant is replaced by the
    character it names. Anything else (unknown names and the entities in
    _KEEP_ESCAPED) is returned exactly as it was matched.
    '''
    name = match.group(1)
    if name not in _KEEP_ESCAPED:
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is not None:
            return _unichr(codepoint)
    # The pattern is '&' + name + ';', so the whole match is the original
    # reference text.
    return match.group(0)


def convert_entities(html):
    '''Replace named character references in *html* with their characters.

    *html* is expected to be a text (Unicode) string. The references
    &lt; &gt; &amp; &quot; and &apos; as well as unrecognised names are
    left untouched. Returns the converted string.
    '''
    return RE_ent.sub(_ent_match_to_unicode, html)


def main():
    '''Command-line entry point.

    With no file arguments, converts standard input to standard output.
    Otherwise converts each named file, writing the result to standard
    output or, with -m/--modify, back into the file itself.
    '''
    import optparse
    import sys

    parser = optparse.OptionParser()
    parser.set_description(__doc__.strip())
    parser.add_option('-m', '--modify', dest='modify', action='store_true',
                      help='modify files in place')
    parser.add_option('-e', '--output-encoding', dest='output_encoding',
                      metavar='CHARSET', default='utf-8',
                      help='write out in CHARSET. Default is UTF-8.')
    parser.add_option('--input-encoding', dest='input_encoding',
                      metavar='CHARSET', default='utf-8',
                      help='input is in CHARSET. Default is UTF-8.')
    options, filenames = parser.parse_args()

    # All I/O here is done in bytes and decoded/encoded explicitly.
    # Python 2 streams and default file modes already deal in byte
    # strings; on Python 3 we need the underlying binary buffer and
    # explicit binary file modes.
    binary = '' if str is bytes else 'b'
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    stdout = getattr(sys.stdout, 'buffer', sys.stdout)

    def do_it(infile):
        # Decode the raw input, convert entities, and re-encode.
        text = infile.read().decode(options.input_encoding)
        return convert_entities(text).encode(options.output_encoding)

    if not filenames:
        stdout.write(do_it(stdin))
        return

    for filename in filenames:
        # Read and convert completely before (possibly) reopening the
        # same file for writing, so a decode/encode error cannot
        # truncate the file.
        with open(filename, 'r' + binary) as infile:
            html = do_it(infile)
        if options.modify:
            with open(filename, 'w' + binary) as outfile:
                outfile.write(html)
        else:
            stdout.write(html)


if __name__ == '__main__':
    main()