#!/usr/bin/python

# This file is licensed under CC Zero

"""Convert an HTML file to XHTML.

Usage: html2xhtml <filename to read> <filename to write>

The input is parsed with html5lib and serialized with
``format='xhtml'``; the serialized text is then patched up with a few
regular-expression substitutions (see ``cleanup_xhtml``) and written to
the output file as UTF-8.

Exit status is 0 on success and 1 when the wrong number of arguments is
given.

NOTE(review): the copy of this script that was reviewed had the markup
inside its string literals stripped out (e.g. ``re.sub(']+)?>', ...)``,
which is not even a valid regular expression).  The literals in
``cleanup_xhtml`` were reconstructed from the surviving fragments and
the step comments; confirm them against a pristine copy.
"""

import io
import re
import sys

USAGE = ("! html2xhtml requires two arguments: "
         "the filename to read, and the filename to write")

# NOTE(review): the replacement DOCTYPE text was lost from the reviewed
# copy.  XHTML 1.0 Strict is assumed here -- TODO confirm.
XHTML_DOCTYPE = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
                 '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">')
XHTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'


def firstMatch(m):  # Python makes s/x(y+)?/z$1/ very difficult
    """Return group 1 of match *m*, or '' if it is empty or did not match."""
    return m.group(1) or ''


def _break_after_open_tag(tag, text):
    """Insert a newline after ``<tag ...>`` when another tag follows it."""
    return re.sub('<' + tag + '( [^>]+)?><',
                  lambda m: '<' + tag + firstMatch(m) + '>\n<',
                  text)


def cleanup_xhtml(o):
    """Clean up the mess left by html5lib.

    *o* is the serialized document as text.  Returns the text with the
    XHTML DOCTYPE and namespace added, ``<link>`` attributes reordered,
    line breaks inserted around the html/head/body tags, and
    non-breaking spaces written as ``&nbsp;``.
    """
    # Missing XHTML artifacts
    # (a callable replacement keeps re.sub from interpreting backslashes)
    o = re.sub('<!DOCTYPE [^>]+>', lambda m: XHTML_DOCTYPE, o)
    o = re.sub('<html( [^>]+)?>',
               lambda m: ('<html' + firstMatch(m)
                          + ' xmlns="' + XHTML_NAMESPACE + '">'),
               o)

    # Fix weird reordering: move href behind the link's other attributes.
    # NOTE(review): this rule is reconstructed (only the comment survived
    # in the reviewed copy) -- confirm the intended attribute order.
    o = re.sub('<link href="([^"]*)" ([^>]*?) ?/>',
               lambda m: ('<link ' + m.group(2)
                          + ' href="' + m.group(1) + '"/>'),
               o)

    # Indentation
    o = re.sub('<!DOCTYPE ([^>]+)><html',
               lambda m: '<!DOCTYPE ' + firstMatch(m) + '>\n<html',
               o)
    o = _break_after_open_tag('html', o)
    o = _break_after_open_tag('head', o)
    o = o.replace('</head><', '</head>\n<')
    o = _break_after_open_tag('body', o)
    o = o.replace('</body><', '</body>\n<')
    o = re.sub('</html>$', '</html>\n', o)

    # make nbsp visible to people viewing source
    o = o.replace(u'\xa0', u'&nbsp;')
    return o


def html_to_xhtml(source_path):
    """Parse the HTML file at *source_path* and return it serialized as XHTML."""
    # Imported here rather than at module level so that cleanup_xhtml()
    # stays importable (and testable) where html5lib is not installed.
    import html5lib

    # Binary mode hands html5lib the raw bytes instead of text decoded
    # with whatever the platform default encoding happens to be.
    with open(source_path, 'rb') as f:
        t = html5lib.HTMLParser().parse(f)
    return html5lib.serializer.serialize(t, format='xhtml')


def main(argv):
    """Run the conversion for command line *argv*; return the exit status."""
    if len(argv) != 3:
        sys.stderr.write(USAGE + '\n')
        return 1

    o = cleanup_xhtml(html_to_xhtml(argv[1]))

    # Write to file
    with io.open(argv[2], 'w', encoding='utf-8') as f:
        f.write(o)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))