Update web-platform-tests to revision 132d12daea699ce266324e79eecbe59b10e56502

This commit is contained in:
WPT Sync Bot 2018-06-08 21:05:21 -04:00
parent 527d874bc1
commit fe00a63040
1004 changed files with 18598 additions and 92770 deletions


@@ -0,0 +1,31 @@
# To activate, change the Appveyor settings to use `.appveyor.yml`.
environment:
global:
PATH: "C:\\Python27\\Scripts\\;%PATH%"
PYTEST_COMMAND: "coverage run -m pytest"
matrix:
- TOXENV: py27-base
- TOXENV: py27-optional
- TOXENV: py33-base
- TOXENV: py33-optional
- TOXENV: py34-base
- TOXENV: py34-optional
- TOXENV: py35-base
- TOXENV: py35-optional
- TOXENV: py36-base
- TOXENV: py36-optional
install:
- git submodule update --init --recursive
- python -m pip install tox codecov
build: off
test_script:
- tox
after_test:
- python debug-info.py
on_success:
- codecov


@@ -0,0 +1,8 @@
[run]
branch = True
source = html5lib
[paths]
source =
html5lib
.tox/*/lib/python*/site-packages/html5lib


@@ -0,0 +1,85 @@
# Copyright (c) 2014 GitHub, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
doc/_build/
# PyBuilder
target/
# Generated by parse.py -p
stats.prof
# IDE
.idea


@@ -0,0 +1,3 @@
[submodule "testdata"]
path = html5lib/tests/testdata
url = https://github.com/html5lib/html5lib-tests.git


@@ -0,0 +1,21 @@
strictness: veryhigh
doc-warnings: false
test-warnings: false
max-line-length: 139
requirements:
- requirements.txt
- requirements-test.txt
- requirements-optional.txt
ignore-paths:
- parse.py
- utils/
python-targets:
- 2
- 3
mccabe:
run: false


@@ -0,0 +1,10 @@
[MASTER]
ignore=tests
[MESSAGES CONTROL]
# messages up to fixme should probably be fixed somehow
disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda
[FORMAT]
max-line-length=139
single-line-if-stmt=no

File diff suppressed because it is too large


@@ -0,0 +1,32 @@
language: python
python:
- "pypy"
- "3.6"
- "3.5"
- "3.4"
- "3.3"
- "2.7"
sudo: false
cache: pip
env:
global:
- PYTEST_COMMAND="coverage run -m pytest"
matrix:
- TOXENV=optional
- TOXENV=base
- TOXENV=six19-optional
install:
- pip install tox codecov
script:
- tox
after_script:
- python debug-info.py
after_success:
- codecov


@@ -0,0 +1,66 @@
Credits
=======
``html5lib`` is written and maintained by:
- James Graham
- Geoffrey Sneddon
- Łukasz Langa
- Will Kahn-Greene
Patches and suggestions
-----------------------
(In chronological order, by first commit:)
- Anne van Kesteren
- Lachlan Hunt
- lantis63
- Sam Ruby
- Thomas Broyer
- Tim Fletcher
- Mark Pilgrim
- Ryan King
- Philip Taylor
- Edward Z. Yang
- fantasai
- Philip Jägenstedt
- Ms2ger
- Mohammad Taha Jahangir
- Andy Wingo
- Andreas Madsack
- Karim Valiev
- Juan Carlos Garcia Segovia
- Mike West
- Marc DM
- Simon Sapin
- Michael[tm] Smith
- Ritwik Gupta
- Marc Abramowitz
- Tony Lopes
- lilbludevil
- Kevin
- Drew Hubl
- Austin Kumbera
- Jim Baker
- Jon Dufresne
- Donald Stufft
- Alex Gaynor
- Nik Nyby
- Jakub Wilk
- Sigmund Cherem
- Gabi Davar
- Florian Mounier
- neumond
- Vitalik Verhovodov
- Kovid Goyal
- Adam Chainz
- John Vandenberg
- Eric Amorde
- Benedikt Morbach
- Jonathan Vanasco
- Tom Most
- Ville Skyttä
- Hugo van Kemenade
- Mark Vasilkov


@@ -0,0 +1,335 @@
Change Log
----------
1.0.1
~~~~~
Released on December 7, 2017
Breaking changes:
* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)
Features:
* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
Will Kahn-Greene!)
* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon
Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!)
* Semver-compliant version number.
Bug fixes:
* Add support for setuptools < 18.5 to support environment markers. (Thank you,
John Vandenberg!)
* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
Kahn-Greene!)
* Include license file in generated wheel package. (#350) (Thank you, Jon
Dufresne!)
* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
* Allow uppercase hex characters in CSS colour check. (#377) (Thank you,
Komal Dembla, Hugo!)
1.0
~~~
Released and unreleased on December 7, 2017. Badly packaged release.
0.999999999/1.0b10
~~~~~~~~~~~~~~~~~~
Released on July 15, 2016
* Fix attribute order going to the tree builder to be document order
instead of reverse document order(!).
0.99999999/1.0b9
~~~~~~~~~~~~~~~~
Released on July 14, 2016
* **Added ordereddict as a mandatory dependency on Python 2.6.**
* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
extras that will do the right thing based on the specific
interpreter implementation.
* Now requires the ``mock`` package for the testsuite.
* Cease supporting DATrie under PyPy.
* **Remove PullDOM support, as this hasn't ever been properly
tested, doesn't entirely work, and as far as I can tell is
completely unused by anyone.**
* Move testsuite to ``py.test``.
* **Fix #124: move to webencodings for decoding the input byte stream;
this makes html5lib compliant with the Encoding Standard, and
introduces a required dependency on webencodings.**
* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**
* **Fix comments containing double-dash with lxml 3.5 and above.**
* **Use scripting disabled by default (as we don't implement
scripting).**
* **Fix #11, avoiding the XSS bug potentially caused by serializer
allowing attribute values to be escaped out of in old browser versions,
changing the quote_attr_values option on serializer to take one of
three values, "always" (the old True value), "legacy" (the new option,
and the new default), and "spec" (the old False value, and the old
default).**
* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
(instead of the tokenizer); as such, this will require amending all
callers of it to use it via the treewalker API.**
* **Drop support of charade, now that chardet is supported once more.**
* **Replace the charset keyword argument on parse and related methods
with a set of keyword arguments: override_encoding, transport_encoding,
same_origin_parent_encoding, likely_encoding, and default_encoding.**
* **Move filters._base, treebuilder._base, and treewalkers._base to .base
to clarify their status as public.**
* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
sanitizer.htmlsanitizer module and move that to sanitizer. This means
anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
code changes.**
* **Rename treewalkers.lxmletree to .etree_lxml and
treewalkers.genshistream to .genshi to have a consistent API.**
* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
utils) to be underscore prefixed to clarify their status as private.
0.9999999/1.0b8
~~~~~~~~~~~~~~~
Released on September 10, 2015
* Fix #195: fix the sanitizer to drop broken URLs (it threw an
exception between 0.9999 and 0.999999).
0.999999/1.0b7
~~~~~~~~~~~~~~
Released on July 7, 2015
* Fix #189: fix the sanitizer to allow relative URLs again (as it did
prior to 0.9999/1.0b5).
0.99999/1.0b6
~~~~~~~~~~~~~
Released on April 30, 2015
* Fix #188: fix the sanitizer to not throw an exception when sanitizing
bogus data URLs.
0.9999/1.0b5
~~~~~~~~~~~~
Released on April 29, 2015
* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
this sounds, this has no known security implications. No known version
of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
Chrome (1 to current), or Opera (12 to current) will run any script
provided in these attributes.
* Pass error message to the ParseError exception in strict parsing mode.
* Allow data URIs in the sanitizer, with a whitelist of content-types.
* Add support for Python implementations that don't support lone
surrogates (read: Jython). Fixes #2.
* Remove localization of error messages. This functionality was totally
unused (and untested that everything was localizable), so we may as
well follow numerous browsers in not supporting translating technical
strings.
* Expose treewalkers.pprint as a public API.
* Add a documentEncoding property to HTML5Parser, fix #121.
0.999
~~~~~
Released on December 23, 2013
* Fix #127: add work-around for CPython issue #20007: .read(0) on
http.client.HTTPResponse drops the rest of the content.
* Fix #115: lxml treewalker can now deal with fragments containing, at
their root level, text nodes with non-ASCII characters on Python 2.
0.99
~~~~
Released on September 10, 2013
* No library changes from 1.0b3; released as 0.99 as pip has changed
behaviour from 1.4 to avoid installing pre-release versions per
PEP 440.
1.0b3
~~~~~
Released on July 24, 2013
* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
implementation using it should be moved to
``NonRecursiveTreeWalker``, as everything bundled with html5lib has
for years.
* Fix #67 so that ``BufferedStream`` correctly returns a bytes
object, thereby fixing any case where html5lib is passed a
non-seekable RawIOBase-like object.
1.0b2
~~~~~
Released on June 27, 2013
* Removed reordering of attributes within the serializer. There is now
an ``alphabetical_attributes`` option which preserves the previous
behaviour through a new filter. This allows attribute order to be
preserved through html5lib if the tree builder preserves order.
* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
``treeadapters.sax.to_sax`` which is generic and supports any
treewalker; it also resolves all known bugs with ``dom2sax``.
* Fix treewalker assertions on hitting bytes strings on
Python 2. Previous to 1.0b1, treewalkers coped with mixed
bytes/unicode data on Python 2; this reintroduces this prior
behaviour on Python 2. Behaviour is unchanged on Python 3.
1.0b1
~~~~~
Released on May 17, 2013
* Implementation updated to implement the `HTML specification
<http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).
* Python 3.2+ supported in a single codebase using the ``six`` library.
* Removed support for Python 2.5 and older.
* Removed the deprecated Beautiful Soup 3 treebuilder.
``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
since it doesn't support namespaces, foreign content like SVG and
MathML is parsed incorrectly.
* Removed ``simpletree`` from the package. The default tree builder is
now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
available, and ``xml.etree.ElementTree`` otherwise).
* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
output was well-formed XML, and hence provided little of use.
* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
return the default DOM treebuilder, which uses ``xml.dom.minidom``.
* Optional heuristic character encoding detection now based on
``charade`` for Python 2.6 - 3.3 compatibility.
* Optional ``Genshi`` treewalker support fixed.
* Many bugfixes, including:
* #33: null in attribute value breaks XML AttValue;
* #4: nested, indirect descendant, <button> causes infinite loop;
* `Google Code 215
<http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
detect seekable streams;
* `Google Code 206
<http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
support for <video preload=...>, <audio preload=...>;
* `Google Code 205
<http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
support for <video poster=...>;
* `Google Code 202
<http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
file breaks InputStream.
* Source code is now mostly PEP 8 compliant.
* Test harness has been improved and now depends on ``nose``.
* Documentation updated and moved to https://html5lib.readthedocs.io/.
0.95
~~~~
Released on February 11, 2012
0.90
~~~~
Released on January 17, 2010
0.11.1
~~~~~~
Released on June 12, 2008
0.11
~~~~
Released on June 10, 2008
0.10
~~~~
Released on October 7, 2007
0.9
~~~
Released on March 11, 2007
0.2
~~~
Released on January 8, 2007


@@ -0,0 +1,60 @@
Contributing
============
Pull requests are more than welcome — both to the library and to the
documentation. Some useful information:
- We aim to follow PEP 8 in the library, but ignoring the
79-character-per-line limit, instead following a soft limit of 99,
but allowing lines over this where it is the readable thing to do.
- We aim to follow PEP 257 for all docstrings, and make them properly
parseable by Sphinx while generating API documentation.
- We keep ``pyflakes`` reporting no errors or warnings at all times.
- We keep the master branch passing all tests at all times on all
supported versions.
`Travis CI <https://travis-ci.org/html5lib/html5lib-python/>`_ is run
against all pull requests and should enforce all of the above.
We use `Opera Critic <https://critic.hoppipolla.co.uk/>`_ as an external
code-review tool, which uses your GitHub login to authenticate. You'll
get email notifications for issues raised in the review.
Patch submission guidelines
---------------------------
- **Create a new Git branch specific to your change.** Do not put
multiple fixes/features in the same pull request. If you find an
unrelated bug, create a distinct branch and submit a separate pull
request for the bugfix. This makes life much easier for maintainers
and will speed up merging your patches.
- **Write a test** whenever possible. Following existing tests is often
easiest, and a good way to tell whether the feature you're modifying
is easily testable.
- **Make sure documentation is updated.** Keep docstrings current, and
if necessary, update the Sphinx documentation in ``doc/``.
- **Add a changelog entry** at the top of ``CHANGES.rst`` following
existing entries' styles.
- **Run tests with tox** if possible, to make sure your changes are
compatible with all supported Python versions.
- **Squash commits** before submitting the pull request so that a single
commit contains the entire change, and only that change (see the first
bullet).
- **Don't rebase after creating the pull request.** Merge with upstream,
if necessary, and use ``git commit --fixup`` for fixing issues raised
in a Critic review or by a failing Travis build. The reviewer will
squash and rebase your pull request while accepting it. Even though
GitHub won't recognize the pull request as accepted, the squashed
commits will properly specify you as the author.
- **Attribute yourself** in ``AUTHORS.rst``.


@@ -0,0 +1,20 @@
Copyright (c) 2006-2013 James Graham and other contributors
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


@@ -0,0 +1,10 @@
include LICENSE
include AUTHORS.rst
include CHANGES.rst
include README.rst
include requirements*.txt
include .pytest.expect
include tox.ini
include pytest.ini
graft html5lib/tests/testdata
recursive-include html5lib/tests *.py


@@ -0,0 +1,151 @@
html5lib
========
.. image:: https://travis-ci.org/html5lib/html5lib-python.png?branch=master
:target: https://travis-ci.org/html5lib/html5lib-python
html5lib is a pure-python library for parsing HTML. It is designed to
conform to the WHATWG HTML specification, as is implemented by all major
web browsers.
Usage
-----
Simple usage follows this pattern:
.. code-block:: python
import html5lib
with open("mydocument.html", "rb") as f:
document = html5lib.parse(f)
or:
.. code-block:: python
import html5lib
document = html5lib.parse("<p>Hello World!")
By default, the ``document`` will be an ``xml.etree`` element instance.
Whenever possible, html5lib chooses the accelerated ``ElementTree``
implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
Two other tree types are supported: ``xml.dom.minidom`` and
``lxml.etree``. To use an alternative format, specify the name of
a treebuilder:
.. code-block:: python
import html5lib
with open("mydocument.html", "rb") as f:
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
When used with ``urllib2`` (Python 2), the charset from HTTP should be
passed into html5lib as follows:
.. code-block:: python
from contextlib import closing
from urllib2 import urlopen
import html5lib
with closing(urlopen("http://example.com/")) as f:
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
When used with ``urllib.request`` (Python 3), the charset from HTTP
should be passed into html5lib as follows:
.. code-block:: python
from urllib.request import urlopen
import html5lib
with urlopen("http://example.com/") as f:
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:
.. code-block:: python
import html5lib
with open("mydocument.html", "rb") as f:
parser = html5lib.HTMLParser(strict=True)
document = parser.parse(f)
When you're instantiating parser objects explicitly, pass a treebuilder
class as the ``tree`` keyword argument to use an alternative document
format:
.. code-block:: python
import html5lib
parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
minidom_document = parser.parse("<p>Hello World!")
More documentation is available at https://html5lib.readthedocs.io/.
Installation
------------
html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it,
use:
.. code-block:: bash
$ pip install html5lib
Optional Dependencies
---------------------
The following third-party libraries may be used for additional
functionality:
- ``datrie`` can be used under CPython to improve parsing performance
(though in almost all cases the improvement is marginal);
- ``lxml`` is supported as a tree format (for both building and
walking) under CPython (but *not* PyPy where it is known to cause
segfaults);
- ``genshi`` has a treewalker (but not builder); and
- ``chardet`` can be used as a fallback when character encoding cannot
be determined.
Bugs
----
Please report any bugs on the `issue tracker
<https://github.com/html5lib/html5lib-python/issues>`_.
Tests
-----
Unit tests require the ``pytest`` and ``mock`` libraries and can be
run using the ``py.test`` command in the root directory.
Test data are contained in a separate `html5lib-tests
<https://github.com/html5lib/html5lib-tests>`_ repository and included
as a submodule; for git checkouts it must therefore be initialized::
$ git submodule init
$ git submodule update
If you have all compatible Python implementations available on your
system, you can run tests on all of them using the ``tox`` utility,
which can be found on PyPI.
Questions?
----------
There's a mailing list available for support on Google Groups,
`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
though you may get a quicker response asking on IRC in `#whatwg on
irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.


@@ -0,0 +1,37 @@
from __future__ import print_function, unicode_literals
import platform
import sys
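# Basic facts about the running interpreter, included in the report below.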
info = {
"impl": platform.python_implementation(),
"version": platform.python_version(),
"revision": platform.python_revision(),
"maxunicode": sys.maxunicode,
"maxsize": sys.maxsize
}
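# Probe which relevant third-party modules are importable in this environment.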
search_modules = ["chardet", "datrie", "genshi", "html5lib", "lxml", "six"]
found_modules = []
for m in search_modules:
try:
__import__(m)
except ImportError:
pass
else:
found_modules.append(m)
info["modules"] = ", ".join(found_modules)
print("""html5lib debug info:
Python %(version)s (revision: %(revision)s)
Implementation: %(impl)s
sys.maxunicode: %(maxunicode)X
sys.maxsize: %(maxsize)X
Installed modules: %(modules)s""" % info)


@@ -0,0 +1,177 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/html5lib.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/html5lib.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/html5lib"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/html5lib"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."


@@ -0,0 +1,3 @@
.. :changelog:
.. include:: ../CHANGES.rst


@@ -0,0 +1,280 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# html5lib documentation build configuration file, created by
# sphinx-quickstart on Wed May 8 00:04:49 2013.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
# -- General configuration -----------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.viewcode']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'html5lib'
copyright = '2006 - 2013, James Graham, Geoffrey Sneddon, and contributors'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1.0'
# The full version, including alpha/beta/rc tags.
sys.path.append(os.path.abspath('..'))
from html5lib import __version__
release = __version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = 'en'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build', 'theme']
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
#html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'html5libdoc'
# -- Options for LaTeX output --------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'html5lib.tex', 'html5lib Documentation',
'James Graham, Geoffrey Sneddon, and contributors', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output --------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'html5lib', 'html5lib Documentation',
['James Graham, Geoffrey Sneddon, and contributors'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output ------------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'html5lib', 'html5lib Documentation',
'James Graham, Geoffrey Sneddon, and contributors', 'html5lib', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
class CExtMock(object):
"""Required for autodoc on readthedocs.org where you cannot build C extensions."""
def __init__(self, *args, **kwargs):
pass
def __call__(self, *args, **kwargs):
return CExtMock()
@classmethod
def __getattr__(cls, name):
if name in ('__file__', '__path__'):
return '/dev/null'
else:
return CExtMock()
try:
import lxml # flake8: noqa
except ImportError:
sys.modules['lxml'] = CExtMock()
sys.modules['lxml.etree'] = CExtMock()
print("warning: lxml modules mocked.")
try:
import genshi # flake8: noqa
except ImportError:
sys.modules['genshi'] = CExtMock()
sys.modules['genshi.core'] = CExtMock()
print("warning: genshi modules mocked.")


@@ -0,0 +1,58 @@
filters Package
===============
:mod:`base` Module
-------------------
.. automodule:: html5lib.filters.base
:members:
:show-inheritance:
:special-members: __init__
:mod:`alphabeticalattributes` Module
------------------------------------
.. automodule:: html5lib.filters.alphabeticalattributes
:members:
:show-inheritance:
:special-members: __init__
:mod:`inject_meta_charset` Module
---------------------------------
.. automodule:: html5lib.filters.inject_meta_charset
:members:
:show-inheritance:
:special-members: __init__
:mod:`lint` Module
------------------
.. automodule:: html5lib.filters.lint
:members:
:show-inheritance:
:special-members: __init__
:mod:`optionaltags` Module
--------------------------
.. automodule:: html5lib.filters.optionaltags
:members:
:show-inheritance:
:special-members: __init__
:mod:`sanitizer` Module
-----------------------
.. automodule:: html5lib.filters.sanitizer
:members:
:show-inheritance:
:special-members: __init__
:mod:`whitespace` Module
------------------------
.. automodule:: html5lib.filters.whitespace
:members:
:show-inheritance:
:special-members: __init__


@@ -0,0 +1,38 @@
html5lib Package
================
.. automodule:: html5lib
:members: __version__
:mod:`constants` Module
-----------------------
.. automodule:: html5lib.constants
:members:
:show-inheritance:
:mod:`html5parser` Module
-------------------------
.. automodule:: html5lib.html5parser
:members:
:show-inheritance:
:special-members: __init__
:mod:`serializer` Module
------------------------
.. automodule:: html5lib.serializer
:members:
:show-inheritance:
:special-members: __init__
Subpackages
-----------
.. toctree::
html5lib.filters
html5lib.treebuilders
html5lib.treewalkers
html5lib.treeadapters


@@ -0,0 +1,20 @@
treeadapters Package
====================
:mod:`~html5lib.treeadapters` Package
-------------------------------------
.. automodule:: html5lib.treeadapters
:members:
:show-inheritance:
:special-members: __init__
.. automodule:: html5lib.treeadapters.genshi
:members:
:show-inheritance:
:special-members: __init__
.. automodule:: html5lib.treeadapters.sax
:members:
:show-inheritance:
:special-members: __init__


@@ -0,0 +1,42 @@
treebuilders Package
====================
:mod:`treebuilders` Package
---------------------------
.. automodule:: html5lib.treebuilders
:members:
:show-inheritance:
:special-members: __init__
:mod:`base` Module
-------------------
.. automodule:: html5lib.treebuilders.base
:members:
:show-inheritance:
:special-members: __init__
:mod:`dom` Module
-----------------
.. automodule:: html5lib.treebuilders.dom
:members:
:show-inheritance:
:special-members: __init__
:mod:`etree` Module
-------------------
.. automodule:: html5lib.treebuilders.etree
:members:
:show-inheritance:
:special-members: __init__
:mod:`etree_lxml` Module
------------------------
.. automodule:: html5lib.treebuilders.etree_lxml
:members:
:show-inheritance:
:special-members: __init__


@@ -0,0 +1,50 @@
treewalkers Package
===================
:mod:`treewalkers` Package
--------------------------
.. automodule:: html5lib.treewalkers
:members:
:show-inheritance:
:special-members: __init__
:mod:`base` Module
------------------
.. automodule:: html5lib.treewalkers.base
:members:
:show-inheritance:
:special-members: __init__
:mod:`dom` Module
-----------------
.. automodule:: html5lib.treewalkers.dom
:members:
:show-inheritance:
:special-members: __init__
:mod:`etree` Module
-------------------
.. automodule:: html5lib.treewalkers.etree
:members:
:show-inheritance:
:special-members: __init__
:mod:`etree_lxml` Module
------------------------
.. automodule:: html5lib.treewalkers.etree_lxml
:members:
:show-inheritance:
:special-members: __init__
:mod:`genshi` Module
--------------------
.. automodule:: html5lib.treewalkers.genshi
:members:
:show-inheritance:
:special-members: __init__


@@ -0,0 +1,22 @@
Overview
========
.. include:: ../README.rst
:start-line: 6
.. toctree::
:maxdepth: 2
movingparts
modules
changes
License <license>
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`


@@ -0,0 +1,4 @@
License
=======
.. include:: ../LICENSE


@@ -0,0 +1,242 @@
@ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. xml to make Docutils-native XML files
echo. pseudoxml to make pseudoxml-XML files for display purposes
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\html5lib.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\html5lib.qhc
goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdf" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdfja" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf-ja
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
if "%1" == "xml" (
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The XML files are in %BUILDDIR%/xml.
goto end
)
if "%1" == "pseudoxml" (
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
goto end
)
:end


@@ -0,0 +1,7 @@
html5lib
========
.. toctree::
:maxdepth: 4
html5lib


@@ -0,0 +1,165 @@
The moving parts
================
html5lib consists of a number of components, which are responsible for
handling its features.
Parsing uses a *tree builder* to generate a *tree*, the in-memory representation of the document.
Several tree representations are supported, as are translations to other formats via *tree adapters*.
The tree may be translated to a token stream with a *tree walker*, from which :class:`~html5lib.serializer.HTMLSerializer` produces a stream of bytes.
The token stream may also be transformed by use of *filters* to accomplish tasks like sanitization.
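A minimal sketch of that whole pipeline, assembled from the pieces
described in the sections below (the ``whitespace`` filter is just one
example of a stream transformation):

.. code-block:: python

    import html5lib
    from html5lib.filters import whitespace
    from html5lib.serializer import HTMLSerializer

    tree = html5lib.parse("<p>Hello     World!")   # tree builder
    walker = html5lib.getTreeWalker("etree")       # tree walker
    tokens = whitespace.Filter(walker(tree))       # filter: collapse whitespace runs
    print("".join(HTMLSerializer().serialize(tokens)))  # serializer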
Tree builders
-------------
The parser reads HTML by tokenizing the content and building a tree that
the user can later access. html5lib can build three types of trees:
* ``etree`` - this is the default; builds a tree based on :mod:`xml.etree`,
which can be found in the standard library. Whenever possible, the
accelerated ``ElementTree`` implementation (i.e.
``xml.etree.cElementTree`` on Python 2.x) is used.
* ``dom`` - builds a tree based on :mod:`xml.dom.minidom`.
* ``lxml`` - uses the :mod:`lxml.etree` implementation of the ``ElementTree``
API. The performance gains are relatively small compared to using the
accelerated ``ElementTree`` module.
You can specify the builder by name when using the shorthand API:
.. code-block:: python
import html5lib
with open("mydocument.html", "rb") as f:
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
To get a builder class by name, use the :func:`~html5lib.treebuilders.getTreeBuilder` function.
When instantiating a :class:`~html5lib.html5parser.HTMLParser` object, you must pass a tree builder class via the ``tree`` keyword argument:
.. code-block:: python
import html5lib
TreeBuilder = html5lib.getTreeBuilder("dom")
parser = html5lib.HTMLParser(tree=TreeBuilder)
minidom_document = parser.parse("<p>Hello World!")
The implementation of builders can be found in `html5lib/treebuilders/
<https://github.com/html5lib/html5lib-python/tree/master/html5lib/treebuilders>`_.
Tree walkers
------------
In addition to manipulating a tree directly, you can use a tree walker to generate a streaming view of it.
html5lib provides walkers for ``etree``, ``dom``, and ``lxml`` trees, as well as ``genshi`` `markup streams <https://genshi.edgewall.org/wiki/Documentation/streams.html>`_.
The implementation of walkers can be found in `html5lib/treewalkers/
<https://github.com/html5lib/html5lib-python/tree/master/html5lib/treewalkers>`_.
html5lib provides :class:`~html5lib.serializer.HTMLSerializer` for generating a stream of bytes from a token stream, and several filters which manipulate the stream.
HTMLSerializer
~~~~~~~~~~~~~~
The serializer lets you write HTML back as a stream of bytes.
.. code-block:: pycon
>>> import html5lib
>>> element = html5lib.parse('<p xml:lang="pl">Witam wszystkich')
>>> walker = html5lib.getTreeWalker("etree")
>>> stream = walker(element)
>>> s = html5lib.serializer.HTMLSerializer()
>>> output = s.serialize(stream)
>>> for item in output:
... print("%r" % item)
'<p'
' '
'xml:lang'
'='
'pl'
'>'
'Witam wszystkich'
You can customize the serializer behaviour in a variety of ways. Consult
the :class:`~html5lib.serializer.HTMLSerializer` documentation.
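For example, a minimal sketch combining two such options
(``omit_optional_tags`` keeps implied tags in the output, and
``quote_attr_values`` takes ``"always"``, ``"legacy"``, or ``"spec"``, as
noted in the changelog):

.. code-block:: python

    import html5lib

    element = html5lib.parse("<p class=greeting>Hi")
    walker = html5lib.getTreeWalker("etree")
    # Keep optional tags such as <html> and <body>, and always quote
    # attribute values.
    s = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                           quote_attr_values="always")
    print("".join(s.serialize(walker(element))))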
Filters
~~~~~~~
html5lib provides several filters:
* :class:`alphabeticalattributes.Filter
<html5lib.filters.alphabeticalattributes.Filter>` sorts attributes on
tags to be in alphabetical order
* :class:`inject_meta_charset.Filter
<html5lib.filters.inject_meta_charset.Filter>` sets a user-specified
encoding in the correct ``<meta>`` tag in the ``<head>`` section of
the document
* :class:`lint.Filter <html5lib.filters.lint.Filter>` raises
:exc:`AssertionError` exceptions on invalid tag and attribute names, invalid
PCDATA, etc.
* :class:`optionaltags.Filter <html5lib.filters.optionaltags.Filter>`
removes tags from the token stream which are not necessary to produce valid
HTML
* :class:`sanitizer.Filter <html5lib.filters.sanitizer.Filter>` removes
unsafe markup and CSS. Elements that are known to be safe are passed
through and the rest is converted to visible text. The default
configuration of the sanitizer follows the `WHATWG Sanitization Rules
<http://wiki.whatwg.org/wiki/Sanitization_rules>`_.
* :class:`whitespace.Filter <html5lib.filters.whitespace.Filter>`
collapses all whitespace characters to single spaces unless they're in
``<pre/>`` or ``<textarea/>`` tags.
To use a filter, simply wrap it around a token stream:
.. code-block:: python
>>> import html5lib
>>> from html5lib.filters import sanitizer
>>> dom = html5lib.parse("<p><script>alert('Boo!')", treebuilder="dom")
>>> walker = html5lib.getTreeWalker("dom")
>>> stream = walker(dom)
>>> clean_stream = sanitizer.Filter(stream)
Tree adapters
-------------
Tree adapters can be used to translate between tree formats.
Two adapters are provided by html5lib:
* :func:`html5lib.treeadapters.genshi.to_genshi()` generates a `Genshi markup stream <https://genshi.edgewall.org/wiki/Documentation/streams.html>`_.
* :func:`html5lib.treeadapters.sax.to_sax()` calls a SAX handler based on the tree.
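A minimal sketch of the SAX adapter (the ``Counter`` handler is a
placeholder consumer, and this assumes ``to_sax`` emits namespaced SAX
events such as ``startElementNS``):

.. code-block:: python

    import html5lib
    from html5lib.treeadapters import sax
    from xml.sax.handler import ContentHandler

    class Counter(ContentHandler):
        """Stand-in SAX consumer that counts element-start events."""
        def __init__(self):
            ContentHandler.__init__(self)
            self.count = 0

        def startElementNS(self, name, qname, attrs):
            self.count += 1

    doc = html5lib.parse("<p>Hello World!")
    walker = html5lib.getTreeWalker("etree")
    handler = Counter()
    sax.to_sax(walker(doc), handler)
    print(handler.count)  # one startElementNS per element: html, head, body, p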
Encoding discovery
------------------
Parsed trees are always Unicode. However, a large variety of input
encodings are supported. The encoding of the document is determined in
the following way (a short sketch follows this list):
* The encoding may be explicitly specified by passing the name of the
  encoding (via keyword arguments such as ``transport_encoding`` or
  ``override_encoding``) to the
  :meth:`~html5lib.html5parser.HTMLParser.parse` method on
  :class:`~html5lib.html5parser.HTMLParser` objects.
* If no encoding is specified, the parser will attempt to detect the
encoding from a ``<meta>`` element in the first 512 bytes of the
document (this is only a partial implementation of the current HTML
specification).
* If no encoding can be found and the :mod:`chardet` library is available, an
attempt will be made to sniff the encoding from the byte pattern.
* If all else fails, the default encoding will be used. This is usually
`Windows-1252 <http://en.wikipedia.org/wiki/Windows-1252>`_, which is
a common fallback used by Web browsers.
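As a short sketch of the first case (the byte string and charset here
are illustrative; ``transport_encoding`` is the same parameter the
README demonstrates for use with ``urllib``):

.. code-block:: python

    import html5lib

    # Bytes whose encoding is known out of band, e.g. from an HTTP header.
    raw = "<p>café".encode("windows-1252")
    doc = html5lib.parse(raw, transport_encoding="windows-1252")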


@@ -0,0 +1,9 @@
#!/bin/bash -e
if [[ ! -x $(which flake8) ]]; then
echo "fatal: flake8 not found on $PATH. Exiting."
exit 1
fi
flake8 `dirname $0`
exit $?


@@ -0,0 +1,35 @@
"""
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.
Example usage::
import html5lib
with open("my_document.html", "rb") as f:
tree = html5lib.parse(f)
For convenience, this module re-exports the following names:
* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""
from __future__ import absolute_import, division, unicode_literals
from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.0.1"


@@ -0,0 +1,288 @@
from __future__ import absolute_import, division, unicode_literals
import re
import warnings
from .constants import DataLossWarning
baseChar = """
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""
digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic])
# Without the ":" that XML 1.0 NameChar would otherwise allow
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\dA-F]{4})")
reCharRange = re.compile(r"\[#x([\dA-F]{4})-#x([\dA-F]{4})\]")
def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange):
match = regexp.match(item)
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1] * 2
foundMatch = True
break
if not foundMatch:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv
def normaliseCharList(charList):
charList = sorted(charList)
for item in charList:
assert item[1] >= item[0]
rv = []
i = 0
while i < len(charList):
j = 1
rv.append(charList[i])
while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i + j][1]
j += 1
i += j
return rv
# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
def missingRanges(charList):
rv = []
if charList[0][0] != 0:
rv.append([0, charList[0][0] - 1])
for i, item in enumerate(charList[:-1]):
rv.append([item[1] + 1, charList[i + 1][0] - 1])
if charList[-1][1] != max_unicode:
rv.append([charList[-1][1] + 1, max_unicode])
return rv
def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(escapeRegexp(chr(item[0])))
else:
rv.append(escapeRegexp(chr(item[0])) + "-" +
escapeRegexp(chr(item[1])))
return "[%s]" % "".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
string = string.replace(char, "\\" + char)
return string
# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
class InfosetFilter(object):
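"""Coerces an HTML infoset, which permits names and text that XML 1.0
forbids, into an XML-compatible one, issuing DataLossWarning whenever a
lossy substitution has to be made."""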
replacementRegexp = re.compile(r"U[\dA-F]{5}")
def __init__(self,
dropXmlnsLocalName=False,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
preventDashAtCommentEnd=False,
replaceFormFeedCharacters=True,
preventSingleQuotePubid=False):
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
self.preventDoubleDashComments = preventDoubleDashComments
self.preventDashAtCommentEnd = preventDashAtCommentEnd
self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.preventSingleQuotePubid = preventSingleQuotePubid
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
return None
elif (self.dropXmlnsAttrNs and
namespace == "http://www.w3.org/2000/xmlns/"):
warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
return None
else:
return self.toXmlName(name)
def coerceElement(self, name):
return self.toXmlName(name)
def coerceComment(self, data):
if self.preventDoubleDashComments:
while "--" in data:
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
data = data.replace("--", "- -")
if data.endswith("-"):
warnings.warn("Comments cannot end in a dash", DataLossWarning)
data += " "
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ")
# Other non-xml characters
return data
def coercePubid(self, data):
dataOutput = data
for char in nonPubidCharRegexp.findall(data):
warnings.warn("Coercing non-XML pubid", DataLossWarning)
replacement = self.getReplacementCharacter(char)
dataOutput = dataOutput.replace(char, replacement)
if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
warnings.warn("Pubid cannot contain single quote", DataLossWarning)
dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
return dataOutput
def toXmlName(self, name):
nameFirst = name[0]
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
warnings.warn("Coercing non-XML name", DataLossWarning)
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
warnings.warn("Coercing non-XML name", DataLossWarning)
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput
def getReplacementCharacter(self, char):
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
return replacement
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
name = name.replace(item, self.unescapeChar(item))
return name
def escapeChar(self, char):
replacement = "U%05X" % ord(char)
self.replaceCache[char] = replacement
return replacement
def unescapeChar(self, charcode):
return chr(int(charcode[1:], 16))

View file

@ -0,0 +1,923 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, binary_type
from six.moves import http_client, urllib
import codecs
import re
import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
from . import _utils
from io import StringIO
try:
from io import BytesIO
except ImportError:
BytesIO = StringIO
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
"]")
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
class BufferedStream(object):
"""Buffering for streams that do not have buffering of their own
The buffer is implemented as a list of chunks on the assumption that
joining many strings will be slow since it is O(n**2)
"""
def __init__(self, stream):
self.stream = stream
self.buffer = []
self.position = [-1, 0] # chunk number, offset
def tell(self):
pos = 0
for chunk in self.buffer[:self.position[0]]:
pos += len(chunk)
pos += self.position[1]
return pos
def seek(self, pos):
assert pos <= self._bufferedBytes()
offset = pos
i = 0
while len(self.buffer[i]) < offset:
offset -= len(self.buffer[i])
i += 1
self.position = [i, offset]
def read(self, bytes):
if not self.buffer:
return self._readStream(bytes)
elif (self.position[0] == len(self.buffer) and
self.position[1] == len(self.buffer[-1])):
return self._readStream(bytes)
else:
return self._readFromBuffer(bytes)
def _bufferedBytes(self):
return sum([len(item) for item in self.buffer])
def _readStream(self, bytes):
data = self.stream.read(bytes)
self.buffer.append(data)
self.position[0] += 1
self.position[1] = len(data)
return data
def _readFromBuffer(self, bytes):
remainingBytes = bytes
rv = []
bufferIndex = self.position[0]
bufferOffset = self.position[1]
while bufferIndex < len(self.buffer) and remainingBytes != 0:
assert remainingBytes > 0
bufferedData = self.buffer[bufferIndex]
if remainingBytes <= len(bufferedData) - bufferOffset:
bytesToRead = remainingBytes
self.position = [bufferIndex, bufferOffset + bytesToRead]
else:
bytesToRead = len(bufferedData) - bufferOffset
self.position = [bufferIndex, len(bufferedData)]
bufferIndex += 1
rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
remainingBytes -= bytesToRead
bufferOffset = 0
if remainingBytes:
rv.append(self._readStream(remainingBytes))
return b"".join(rv)
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
# Also check for addinfourl wrapping HTTPResponse
(isinstance(source, urllib.response.addbase) and
isinstance(source.fp, http_client.HTTPResponse))):
isUnicode = False
elif hasattr(source, "read"):
isUnicode = isinstance(source.read(0), text_type)
else:
isUnicode = isinstance(source, text_type)
if isUnicode:
encodings = [x for x in kwargs if x.endswith("_encoding")]
if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
return HTMLUnicodeInputStream(source, **kwargs)
else:
return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
incorrect byte-sequences and also provides column and line tracking.
"""
_defaultChunkSize = 10240
def __init__(self, source):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by html5lib.
source can be either a file-object, local filename or a string.
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
else:
self.reportCharacterErrors = self.characterErrorsUCS2
# List of where new lines occur
self.newLines = [0]
self.charEncoding = (lookupEncoding("utf-8"), "certain")
self.dataStream = self.openStream(source)
self.reset()
def reset(self):
self.chunk = ""
self.chunkSize = 0
self.chunkOffset = 0
self.errors = []
# number of (complete) lines in previous chunks
self.prevNumLines = 0
# number of columns in the last line of the previous chunk
self.prevNumCols = 0
# Deal with CR LF and surrogates split over chunk boundaries
self._bufferedCharacter = None
def openStream(self, source):
"""Produces a file object from source.
source can be either a file object, local filename or a string.
"""
# Already a file object
if hasattr(source, 'read'):
stream = source
else:
stream = StringIO(source)
return stream
def _position(self, offset):
chunk = self.chunk
nLines = chunk.count('\n', 0, offset)
positionLine = self.prevNumLines + nLines
lastLinePos = chunk.rfind('\n', 0, offset)
if lastLinePos == -1:
positionColumn = self.prevNumCols + offset
else:
positionColumn = offset - (lastLinePos + 1)
return (positionLine, positionColumn)
def position(self):
"""Returns (line, col) of the current position in the stream."""
line, col = self._position(self.chunkOffset)
return (line + 1, col)
def char(self):
""" Read one character from the stream or queue if available. Return
EOF when EOF is reached.
"""
# Read a new chunk from the input stream if necessary
if self.chunkOffset >= self.chunkSize:
if not self.readChunk():
return EOF
chunkOffset = self.chunkOffset
char = self.chunk[chunkOffset]
self.chunkOffset = chunkOffset + 1
return char
def readChunk(self, chunkSize=None):
if chunkSize is None:
chunkSize = self._defaultChunkSize
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
self.chunk = ""
self.chunkSize = 0
self.chunkOffset = 0
data = self.dataStream.read(chunkSize)
# Deal with CR LF and surrogates broken across chunks
if self._bufferedCharacter:
data = self._bufferedCharacter + data
self._bufferedCharacter = None
elif not data:
# We have no more data, bye-bye stream
return False
if len(data) > 1:
lastv = ord(data[-1])
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
self._bufferedCharacter = data[-1]
data = data[:-1]
if self.reportCharacterErrors:
self.reportCharacterErrors(data)
# Replace invalid characters
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
self.chunk = data
self.chunkSize = len(data)
return True
def characterErrorsUCS4(self, data):
for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
# Someone picked the wrong compile option
# You lose
skip = False
for match in invalid_unicode_re.finditer(data):
if skip:
# only skip the low surrogate of the pair just handled
skip = False
continue
codepoint = ord(match.group())
pos = match.start()
# Pretty sure there should be endianness issues here
if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair!
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
pos == len(data) - 1):
self.errors.append("invalid-codepoint")
else:
skip = False
self.errors.append("invalid-codepoint")
def charsUntil(self, characters, opposite=False):
""" Returns a string of characters from the stream up to but not
including any character in 'characters' or EOF. 'characters' must be
a container that supports the 'in' method and iteration over its
characters.
"""
# Use a cache of regexps to find the required characters
try:
chars = charsUntilRegEx[(characters, opposite)]
except KeyError:
if __debug__:
for c in characters:
assert(ord(c) < 128)
regex = "".join(["\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = "^%s" % regex
chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
rv = []
while True:
# Find the longest matching prefix
m = chars.match(self.chunk, self.chunkOffset)
if m is None:
# If nothing matched, and it wasn't because we ran out of chunk,
# then stop
if self.chunkOffset != self.chunkSize:
break
else:
end = m.end()
# If not the whole chunk matched, return everything
# up to the part that didn't match
if end != self.chunkSize:
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole remainder of the chunk matched,
# use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
r = "".join(rv)
return r
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
if char is not None:
if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently
# called char and charsUntil.
# So, just prepend the ungotten character onto the current
# chunk:
self.chunk = char + self.chunk
self.chunkSize += 1
else:
self.chunkOffset -= 1
assert self.chunk[self.chunkOffset] == char
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
incorrect byte-sequences and also provides column and line tracking.
"""
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by html5lib.
source can be either a file-object, local filename or a string.
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)
HTMLUnicodeInputStream.__init__(self, self.rawStream)
# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 1024
# Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None
# Call superclass
self.reset()
def reset(self):
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
HTMLUnicodeInputStream.reset(self)
def openStream(self, source):
"""Produces a file object from source.
source can be either a file object, local filename or a string.
"""
# Already a file object
if hasattr(source, 'read'):
stream = source
else:
stream = BytesIO(source)
try:
stream.seek(stream.tell())
except: # pylint:disable=bare-except
stream = BufferedStream(stream)
return stream
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding
# If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding
# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Guess with chardet, if available
if chardet:
try:
from chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
buffer = self.rawStream.read(self.numBytesChardet)
assert isinstance(buffer, bytes)
if not buffer:
break
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
if encoding is not None:
return encoding, "tentative"
# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"
def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
newEncoding = lookupEncoding(newEncoding)
if newEncoding is None:
return
if newEncoding.name in ("utf-16be", "utf-16le"):
newEncoding = lookupEncoding("utf-8")
assert newEncoding is not None
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.charEncoding = (newEncoding, "certain")
self.reset()
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
string = self.rawStream.read(4)
assert isinstance(string, bytes)
# Try detecting the BOM using bytes from the string
encoding = bomDict.get(string[:3]) # UTF-8
seek = 3
if not encoding:
# Need to detect UTF-32 before UTF-16
encoding = bomDict.get(string) # UTF-32
seek = 4
if not encoding:
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
if encoding:
self.rawStream.seek(seek)
return lookupEncoding(encoding)
else:
self.rawStream.seek(0)
return None
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
buffer = self.rawStream.read(self.numBytesMeta)
assert isinstance(buffer, bytes)
parser = EncodingParser(buffer)
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
encoding = lookupEncoding("utf-8")
return encoding
class EncodingBytes(bytes):
"""String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
def __new__(cls, value):
assert isinstance(value, bytes)
return bytes.__new__(cls, value.lower())
def __init__(self, value):
# pylint:disable=unused-argument
self._position = -1
def __iter__(self):
return self
def __next__(self):
p = self._position = self._position + 1
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
return self[p:p + 1]
def next(self):
# Py2 compat
return self.__next__()
def previous(self):
p = self._position
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
self._position = p = p - 1
return self[p:p + 1]
def setPosition(self, position):
if self._position >= len(self):
raise StopIteration
self._position = position
def getPosition(self):
if self._position >= len(self):
raise StopIteration
if self._position >= 0:
return self._position
else:
return None
position = property(getPosition, setPosition)
def getCurrentByte(self):
return self[self.position:self.position + 1]
currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharactersBytes):
"""Skip past a list of characters"""
p = self.position # use property for the error-checking
while p < len(self):
c = self[p:p + 1]
if c not in chars:
self._position = p
return c
p += 1
self._position = p
return None
def skipUntil(self, chars):
p = self.position
while p < len(self):
c = self[p:p + 1]
if c in chars:
self._position = p
return c
p += 1
self._position = p
return None
def matchBytes(self, bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
p = self.position
data = self[p:p + len(bytes)]
rv = data.startswith(bytes)
if rv:
self.position += len(bytes)
return rv
def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
# XXX: This is ugly, but I can't see a nicer way to fix this.
if self._position == -1:
self._position = 0
self._position += (newPosition + len(bytes) - 1)
return True
else:
raise StopIteration
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
def __init__(self, data):
"""string - the data to work on for encoding detection"""
self.data = EncodingBytes(data)
self.encoding = None
def getEncoding(self):
methodDispatch = (
(b"<!--", self.handleComment),
(b"<meta", self.handleMeta),
(b"</", self.handlePossibleEndTag),
(b"<!", self.handleOther),
(b"<?", self.handleOther),
(b"<", self.handlePossibleStartTag))
for _ in self.data:
keepParsing = True
for key, method in methodDispatch:
if self.data.matchBytes(key):
try:
keepParsing = method()
break
except StopIteration:
keepParsing = False
break
if not keepParsing:
break
return self.encoding
def handleComment(self):
"""Skip over comments"""
return self.data.jumpTo(b"-->")
def handleMeta(self):
if self.data.currentByte not in spaceCharactersBytes:
# if <meta is not followed by a space it does not count; just keep going
return True
# We have a valid meta element we want to search for attributes
hasPragma = False
pendingEncoding = None
while True:
# Try to find the next attribute after the current position
attr = self.getAttribute()
if attr is None:
return True
else:
if attr[0] == b"http-equiv":
hasPragma = attr[1] == b"content-type"
if hasPragma and pendingEncoding is not None:
self.encoding = pendingEncoding
return False
elif attr[0] == b"charset":
tentativeEncoding = attr[1]
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
elif attr[0] == b"content":
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if tentativeEncoding is not None:
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
if hasPragma:
self.encoding = codec
return False
else:
pendingEncoding = codec
def handlePossibleStartTag(self):
return self.handlePossibleTag(False)
def handlePossibleEndTag(self):
next(self.data)
return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag):
data = self.data
if data.currentByte not in asciiLettersBytes:
# If the next byte is not an ascii letter either ignore this
# fragment (possible start tag case) or treat it according to
# handleOther
if endTag:
data.previous()
self.handleOther()
return True
c = data.skipUntil(spacesAngleBrackets)
if c == b"<":
# return to the first step in the overall "two step" algorithm
# reprocessing the < byte
data.previous()
else:
# Read all attributes
attr = self.getAttribute()
while attr is not None:
attr = self.getAttribute()
return True
def handleOther(self):
return self.data.jumpTo(b">")
def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
data = self.data
# Step 1 (skip chars)
c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
assert c is None or len(c) == 1
# Step 2
if c in (b">", None):
return None
# Step 3
attrName = []
attrValue = []
# Step 4 attribute name
while True:
if c == b"=" and attrName:
break
elif c in spaceCharactersBytes:
# Step 6!
c = data.skip()
break
elif c in (b"/", b">"):
return b"".join(attrName), b""
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
elif c is None:
return None
else:
attrName.append(c)
# Step 5
c = next(data)
# Step 7
if c != b"=":
data.previous()
return b"".join(attrName), b""
# Step 8
next(data)
# Step 9
c = data.skip()
# Step 10
if c in (b"'", b'"'):
# 10.1
quoteChar = c
while True:
# 10.2
c = next(data)
# 10.3
if c == quoteChar:
next(data)
return b"".join(attrName), b"".join(attrValue)
# 10.4
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
# 10.5
else:
attrValue.append(c)
elif c == b">":
return b"".join(attrName), b""
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
# Step 11
while True:
c = next(data)
if c in spacesAngleBrackets:
return b"".join(attrName), b"".join(attrValue)
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
class ContentAttrParser(object):
def __init__(self, data):
assert isinstance(data, bytes)
self.data = data
def parse(self):
try:
# Check if the attr name is charset
# otherwise return
self.data.jumpTo(b"charset")
self.data.position += 1
self.data.skip()
if not self.data.currentByte == b"=":
# If there is no = sign keep looking for attrs
return None
self.data.position += 1
self.data.skip()
# Look for an encoding between matching quote marks
if self.data.currentByte in (b'"', b"'"):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
if self.data.jumpTo(quoteMark):
return self.data[oldPosition:self.data.position]
else:
return None
else:
# Unquoted value
oldPosition = self.data.position
try:
self.data.skipUntil(spaceCharactersBytes)
return self.data[oldPosition:self.data.position]
except StopIteration:
# Return the whole remaining value
return self.data[oldPosition:]
except StopIteration:
return None
def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if isinstance(encoding, binary_type):
try:
encoding = encoding.decode("ascii")
except UnicodeDecodeError:
return None
if encoding is not None:
try:
return webencodings.lookup(encoding)
except AttributeError:
return None
else:
return None
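# For example, lookupEncoding("UTF8").name == "utf-8", while
# lookupEncoding("bogus") returns None.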

File diff suppressed because it is too large

View file

@ -0,0 +1,14 @@
from __future__ import absolute_import, division, unicode_literals
from .py import Trie as PyTrie
Trie = PyTrie
# pylint:disable=wrong-import-position
try:
from .datrie import Trie as DATrie
except ImportError:
pass
else:
Trie = DATrie
# pylint:enable=wrong-import-position

View file

@ -0,0 +1,37 @@
from __future__ import absolute_import, division, unicode_literals
try:
from collections.abc import Mapping
except ImportError:  # Python 2
from collections import Mapping
class Trie(Mapping):
"""Abstract base class for tries"""
def keys(self, prefix=None):
# pylint:disable=arguments-differ
keys = super(Trie, self).keys()
if prefix is None:
return set(keys)
return {x for x in keys if x.startswith(prefix)}
def has_keys_with_prefix(self, prefix):
for key in self.keys():
if key.startswith(prefix):
return True
return False
def longest_prefix(self, prefix):
if prefix in self:
return prefix
for i in range(1, len(prefix) + 1):
if prefix[:-i] in self:
return prefix[:-i]
raise KeyError(prefix)
def longest_prefix_item(self, prefix):
lprefix = self.longest_prefix(prefix)
return (lprefix, self[lprefix])

View file

@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals
from datrie import Trie as DATrie
from six import text_type
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
def __init__(self, data):
chars = set()
for key in data.keys():
if not isinstance(key, text_type):
raise TypeError("All keys must be strings")
for char in key:
chars.add(char)
self._data = DATrie("".join(chars))
for key, value in data.items():
self._data[key] = value
def __contains__(self, key):
return key in self._data
def __len__(self):
return len(self._data)
def __iter__(self):
raise NotImplementedError()
def __getitem__(self, key):
return self._data[key]
def keys(self, prefix=None):
return self._data.keys(prefix)
def has_keys_with_prefix(self, prefix):
return self._data.has_keys_with_prefix(prefix)
def longest_prefix(self, prefix):
return self._data.longest_prefix(prefix)
def longest_prefix_item(self, prefix):
return self._data.longest_prefix_item(prefix)

View file

@ -0,0 +1,67 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from bisect import bisect_left
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
def __init__(self, data):
if not all(isinstance(x, text_type) for x in data.keys()):
raise TypeError("All keys must be strings")
self._data = data
self._keys = sorted(data.keys())
self._cachestr = ""
self._cachepoints = (0, len(data))
def __contains__(self, key):
return key in self._data
def __len__(self):
return len(self._data)
def __iter__(self):
return iter(self._data)
def __getitem__(self, key):
return self._data[key]
def keys(self, prefix=None):
if prefix is None or prefix == "" or not self._keys:
return set(self._keys)
if prefix.startswith(self._cachestr):
lo, hi = self._cachepoints
start = i = bisect_left(self._keys, prefix, lo, hi)
else:
start = i = bisect_left(self._keys, prefix)
keys = set()
if start == len(self._keys):
return keys
while i < len(self._keys) and self._keys[i].startswith(prefix):
keys.add(self._keys[i])
i += 1
self._cachestr = prefix
self._cachepoints = (start, i)
return keys
def has_keys_with_prefix(self, prefix):
if prefix in self._data:
return True
if prefix.startswith(self._cachestr):
lo, hi = self._cachepoints
i = bisect_left(self._keys, prefix, lo, hi)
else:
i = bisect_left(self._keys, prefix)
if i == len(self._keys):
return False
return self._keys[i].startswith(prefix)
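# Example of the shared ABC behaviour with this implementation:
#   Trie({"foo": 1, "foobar": 2}).longest_prefix("foobarbaz")  # -> "foobar"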

View file

@ -0,0 +1,124 @@
from __future__ import absolute_import, division, unicode_literals
from types import ModuleType
from six import text_type
try:
import xml.etree.cElementTree as default_etree
except ImportError:
import xml.etree.ElementTree as default_etree
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
_x = eval('"\\uD800"') # pylint:disable=eval-used
if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"') # pylint:disable=eval-used
assert isinstance(_x, text_type)
except: # pylint:disable=bare-except
supports_lone_surrogates = False
else:
supports_lone_surrogates = True
class MethodDispatcher(dict):
"""Dict with 2 special properties:
On initialization, keys that are lists, sets or tuples are converted to
multiple keys so accessing any one of the items in the original
list-like object returns the matching value
md = MethodDispatcher({("foo", "bar"):"baz"})
md["foo"] == "baz"
A default value which can be set through the default attribute.
"""
def __init__(self, items=()):
# Using _dictEntries instead of directly assigning to self is about
# twice as fast. Please do careful performance testing before changing
# anything here.
_dictEntries = []
for name, value in items:
if isinstance(name, (list, tuple, frozenset, set)):
for item in name:
_dictEntries.append((item, value))
else:
_dictEntries.append((name, value))
dict.__init__(self, _dictEntries)
assert len(self) == len(_dictEntries)
self.default = None
def __getitem__(self, key):
return dict.get(self, key, self.default)
# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds
def isSurrogatePair(data):
return (len(data) == 2 and
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
def surrogatePairToCodepoint(data):
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
(ord(data[1]) - 0xDC00))
return char_val
# Module Factory Factory (no, this isn't Java, I know)
# Here to stop this being duplicated all over the place.
def moduleFactoryFactory(factory):
moduleCache = {}
def moduleFactory(baseModule, *args, **kwargs):
if isinstance(ModuleType.__name__, type("")):
name = "_%s_factory" % baseModule.__name__
else:
name = b"_%s_factory" % baseModule.__name__
kwargs_tuple = tuple(kwargs.items())
try:
return moduleCache[name][args][kwargs_tuple]
except KeyError:
mod = ModuleType(name)
objs = factory(baseModule, *args, **kwargs)
mod.__dict__.update(objs)
if "name" not in moduleCache:
moduleCache[name] = {}
if "args" not in moduleCache[name]:
moduleCache[name][args] = {}
if "kwargs" not in moduleCache[name][args]:
moduleCache[name][args][kwargs_tuple] = {}
moduleCache[name][args][kwargs_tuple] = mod
return mod
return moduleFactory
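# Unbounded memoisation decorator; positional and keyword argument values
# must be hashable, since they form the cache key.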
def memoize(func):
cache = {}
def wrapped(*args, **kwargs):
key = (tuple(args), tuple(kwargs.items()))
if key not in cache:
cache[key] = func(*args, **kwargs)
return cache[key]
return wrapped

File diff suppressed because it is too large

View file

@ -0,0 +1,29 @@
from __future__ import absolute_import, division, unicode_literals
from . import base
from collections import OrderedDict
def _attr_key(attr):
"""Return an appropriate key for an attribute for sorting
Attributes have a namespace that can be either ``None`` or a string. We
can't compare the two because they're different types, so we convert
``None`` to an empty string first.
"""
return (attr[0][0] or ''), attr[0][1]
class Filter(base.Filter):
"""Alphabetizes attributes for elements"""
def __iter__(self):
for token in base.Filter.__iter__(self):
if token["type"] in ("StartTag", "EmptyTag"):
attrs = OrderedDict()
for name, value in sorted(token["data"].items(),
key=_attr_key):
attrs[name] = value
token["data"] = attrs
yield token

View file

@ -0,0 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
class Filter(object):
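"""Base class for html5lib filters: wraps a token stream and delegates
any attribute it does not define to the wrapped source."""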
def __init__(self, source):
self.source = source
def __iter__(self):
return iter(self.source)
def __getattr__(self, name):
return getattr(self.source, name)

View file

@ -0,0 +1,73 @@
from __future__ import absolute_import, division, unicode_literals
from . import base
class Filter(base.Filter):
"""Injects ``<meta charset=ENCODING>`` tag into head of document"""
def __init__(self, source, encoding):
"""Creates a Filter
:arg source: the source token stream
:arg encoding: the encoding to set
"""
base.Filter.__init__(self, source)
self.encoding = encoding
def __iter__(self):
state = "pre_head"
meta_found = (self.encoding is None)
pending = []
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == "head":
state = "in_head"
elif type == "EmptyTag":
if token["name"].lower() == "meta":
# replace charset with actual encoding
has_http_equiv_content_type = False
for (namespace, name), value in token["data"].items():
if namespace is not None:
continue
elif name.lower() == 'charset':
token["data"][(namespace, name)] = self.encoding
meta_found = True
break
elif name == 'http-equiv' and value.lower() == 'content-type':
has_http_equiv_content_type = True
else:
if has_http_equiv_content_type and (None, "content") in token["data"]:
token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
meta_found = True
elif token["name"].lower() == "head" and not meta_found:
# insert meta into empty head
yield {"type": "StartTag", "name": "head",
"data": token["data"]}
yield {"type": "EmptyTag", "name": "meta",
"data": {(None, "charset"): self.encoding}}
yield {"type": "EndTag", "name": "head"}
meta_found = True
continue
elif type == "EndTag":
if token["name"].lower() == "head" and pending:
# insert meta into head (if necessary) and flush pending queue
yield pending.pop(0)
if not meta_found:
yield {"type": "EmptyTag", "name": "meta",
"data": {(None, "charset"): self.encoding}}
while pending:
yield pending.pop(0)
meta_found = True
state = "post_head"
if state == "in_head":
pending.append(token)
else:
yield token
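# Note: the serializer applies this filter automatically when it is given an
# output encoding and its inject_meta_charset option is left enabled.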

View file

@ -0,0 +1,93 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from . import base
from ..constants import namespaces, voidElements
from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
class Filter(base.Filter):
"""Lints the token stream for errors
If it finds any errors, it'll raise an ``AssertionError``.
"""
def __init__(self, source, require_matching_tags=True):
"""Creates a Filter
:arg source: the source token stream
:arg require_matching_tags: whether or not to require matching tags
"""
super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags
def __iter__(self):
open_elements = []
for token in base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
namespace = token["namespace"]
name = token["name"]
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
assert isinstance(token["data"], dict)
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert type == "EmptyTag"
else:
assert type == "StartTag"
if type == "StartTag" and self.require_matching_tags:
open_elements.append((namespace, name))
for (namespace, name), value in token["data"].items():
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
assert isinstance(value, text_type)
elif type == "EndTag":
namespace = token["namespace"]
name = token["name"]
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
elif self.require_matching_tags:
start = open_elements.pop()
assert start == (namespace, name)
elif type == "Comment":
data = token["data"]
assert isinstance(data, text_type)
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
assert isinstance(data, text_type)
assert data != ""
if type == "SpaceCharacters":
assert data.strip(spaceCharacters) == ""
elif type == "Doctype":
name = token["name"]
assert name is None or isinstance(name, text_type)
assert token["publicId"] is None or isinstance(name, text_type)
assert token["systemId"] is None or isinstance(name, text_type)
elif type == "Entity":
assert isinstance(token["name"], text_type)
elif type == "SerializerError":
assert isinstance(token["data"], text_type)
else:
assert False, "Unknown token type: %(type)s" % {"type": type}
yield token

View file

@ -0,0 +1,207 @@
from __future__ import absolute_import, division, unicode_literals
from . import base
class Filter(base.Filter):
"""Removes optional tags from the token stream"""
def slider(self):
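"""Yield (previous, token, next) triples; previous and next are None
at the stream boundaries."""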
previous1 = previous2 = None
for token in self.source:
if previous1 is not None:
yield previous2, previous1, token
previous2 = previous1
previous1 = token
if previous1 is not None:
yield previous2, previous1, None
def __iter__(self):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if (token["data"] or
not self.is_optional_start(token["name"], previous, next)):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
yield token
else:
yield token
def is_optional_start(self, tagname, previous, next):
type = next and next["type"] or None
if tagname == 'html':
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
# XXX: we also omit the start tag if the head element is empty
if type in ("StartTag", "EmptyTag"):
return True
elif type == "EndTag":
return next["name"] == "head"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return next["name"] not in ('script', 'style')
else:
return True
elif tagname == 'colgroup':
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return next["name"] == "col"
else:
return False
elif tagname == 'tbody':
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == "StartTag":
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous['type'] == 'EndTag' and \
previous['name'] in ('tbody', 'thead', 'tfoot'):
return False
return next["name"] == 'tr'
else:
return False
return False
def is_optional_end(self, tagname, next):
type = next and next["type"] or None
if tagname in ('html', 'head', 'body'):
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname in ('li', 'optgroup', 'tr'):
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] == tagname
else:
return type == "EndTag" or type is None
elif tagname in ('dt', 'dd'):
# A dt element's end tag may be omitted if the dt element is
# immediately followed by another dt element or a dd element.
# A dd element's end tag may be omitted if the dd element is
# immediately followed by another dd element or a dt element,
# or if there is no more content in the parent element.
if type == "StartTag":
return next["name"] in ('dt', 'dd')
elif tagname == 'dd':
return type == "EndTag" or type is None
else:
return False
elif tagname == 'p':
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, article, aside,
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
# nav, ol, p, pre, section, table, or ul, element, or if
# there is no more content in the parent element.
if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside',
'blockquote', 'datagrid', 'dialog',
'dir', 'div', 'dl', 'fieldset', 'footer',
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hr', 'menu', 'nav', 'ol',
'p', 'pre', 'section', 'table', 'ul')
else:
return type == "EndTag" or type is None
elif tagname == 'option':
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if it is immediately followed by an <code>optgroup</code>
# element, or if there is no more content in the parent
# element.
if type == "StartTag":
return next["name"] in ('option', 'optgroup')
else:
return type == "EndTag" or type is None
elif tagname in ('rt', 'rp'):
# An rt element's end tag may be omitted if the rt element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
# An rp element's end tag may be omitted if the rp element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('rt', 'rp')
else:
return type == "EndTag" or type is None
elif tagname == 'colgroup':
# A colgroup element's end tag may be omitted if the colgroup
# element is not immediately followed by a space character or
# a comment.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we also look for an immediately following colgroup
# element. See is_optional_start.
return next["name"] != 'colgroup'
else:
return True
elif tagname in ('thead', 'tbody'):
# A thead element's end tag may be omitted if the thead element
# is immediately followed by a tbody or tfoot element.
# A tbody element's end tag may be omitted if the tbody element
# is immediately followed by a tbody or tfoot element, or if
# there is no more content in the parent element.
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == "StartTag":
return next["name"] in ['tbody', 'tfoot']
elif tagname == 'tbody':
return type == "EndTag" or type is None
else:
return False
elif tagname == 'tfoot':
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == "StartTag":
return next["name"] == 'tbody'
else:
return type == "EndTag" or type is None
elif tagname in ('td', 'th'):
# A td element's end tag may be omitted if the td element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
# A th element's end tag may be omitted if the th element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('td', 'th')
else:
return type == "EndTag" or type is None
return False
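
The two predicates above are what drive the serializer's omit_optional_tags option: each token is compared with its neighbours and skipped when its start or end tag is optional under the rules spelled out in the comments. A minimal sketch of the effect (assuming the html5lib API added elsewhere in this commit; the output shown is indicative, not captured from a run):

from html5lib import parse, getTreeWalker
from html5lib.serializer import HTMLSerializer

walker = getTreeWalker("etree")
tree = parse("<html><head></head><body><p>a<p>b</body></html>")
# With omission on (the default), the optional tags disappear:
print(HTMLSerializer(omit_optional_tags=True).render(walker(tree)))
# roughly: <p>a<p>b
# With omission off, every tag is written out:
print(HTMLSerializer(omit_optional_tags=False).render(walker(tree)))
# roughly: <html><head></head><body><p>a</p><p>b</p></body></html>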

View file

@ -0,0 +1,896 @@
from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from . import base
from ..constants import namespaces, prefixes
__all__ = ["Filter"]
allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
(namespaces['html'], 'acronym'),
(namespaces['html'], 'address'),
(namespaces['html'], 'area'),
(namespaces['html'], 'article'),
(namespaces['html'], 'aside'),
(namespaces['html'], 'audio'),
(namespaces['html'], 'b'),
(namespaces['html'], 'big'),
(namespaces['html'], 'blockquote'),
(namespaces['html'], 'br'),
(namespaces['html'], 'button'),
(namespaces['html'], 'canvas'),
(namespaces['html'], 'caption'),
(namespaces['html'], 'center'),
(namespaces['html'], 'cite'),
(namespaces['html'], 'code'),
(namespaces['html'], 'col'),
(namespaces['html'], 'colgroup'),
(namespaces['html'], 'command'),
(namespaces['html'], 'datagrid'),
(namespaces['html'], 'datalist'),
(namespaces['html'], 'dd'),
(namespaces['html'], 'del'),
(namespaces['html'], 'details'),
(namespaces['html'], 'dfn'),
(namespaces['html'], 'dialog'),
(namespaces['html'], 'dir'),
(namespaces['html'], 'div'),
(namespaces['html'], 'dl'),
(namespaces['html'], 'dt'),
(namespaces['html'], 'em'),
(namespaces['html'], 'event-source'),
(namespaces['html'], 'fieldset'),
(namespaces['html'], 'figcaption'),
(namespaces['html'], 'figure'),
(namespaces['html'], 'footer'),
(namespaces['html'], 'font'),
(namespaces['html'], 'form'),
(namespaces['html'], 'header'),
(namespaces['html'], 'h1'),
(namespaces['html'], 'h2'),
(namespaces['html'], 'h3'),
(namespaces['html'], 'h4'),
(namespaces['html'], 'h5'),
(namespaces['html'], 'h6'),
(namespaces['html'], 'hr'),
(namespaces['html'], 'i'),
(namespaces['html'], 'img'),
(namespaces['html'], 'input'),
(namespaces['html'], 'ins'),
(namespaces['html'], 'keygen'),
(namespaces['html'], 'kbd'),
(namespaces['html'], 'label'),
(namespaces['html'], 'legend'),
(namespaces['html'], 'li'),
(namespaces['html'], 'm'),
(namespaces['html'], 'map'),
(namespaces['html'], 'menu'),
(namespaces['html'], 'meter'),
(namespaces['html'], 'multicol'),
(namespaces['html'], 'nav'),
(namespaces['html'], 'nextid'),
(namespaces['html'], 'ol'),
(namespaces['html'], 'output'),
(namespaces['html'], 'optgroup'),
(namespaces['html'], 'option'),
(namespaces['html'], 'p'),
(namespaces['html'], 'pre'),
(namespaces['html'], 'progress'),
(namespaces['html'], 'q'),
(namespaces['html'], 's'),
(namespaces['html'], 'samp'),
(namespaces['html'], 'section'),
(namespaces['html'], 'select'),
(namespaces['html'], 'small'),
(namespaces['html'], 'sound'),
(namespaces['html'], 'source'),
(namespaces['html'], 'spacer'),
(namespaces['html'], 'span'),
(namespaces['html'], 'strike'),
(namespaces['html'], 'strong'),
(namespaces['html'], 'sub'),
(namespaces['html'], 'sup'),
(namespaces['html'], 'table'),
(namespaces['html'], 'tbody'),
(namespaces['html'], 'td'),
(namespaces['html'], 'textarea'),
(namespaces['html'], 'time'),
(namespaces['html'], 'tfoot'),
(namespaces['html'], 'th'),
(namespaces['html'], 'thead'),
(namespaces['html'], 'tr'),
(namespaces['html'], 'tt'),
(namespaces['html'], 'u'),
(namespaces['html'], 'ul'),
(namespaces['html'], 'var'),
(namespaces['html'], 'video'),
(namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'),
(namespaces['mathml'], 'mfrac'),
(namespaces['mathml'], 'mi'),
(namespaces['mathml'], 'mmultiscripts'),
(namespaces['mathml'], 'mn'),
(namespaces['mathml'], 'mo'),
(namespaces['mathml'], 'mover'),
(namespaces['mathml'], 'mpadded'),
(namespaces['mathml'], 'mphantom'),
(namespaces['mathml'], 'mprescripts'),
(namespaces['mathml'], 'mroot'),
(namespaces['mathml'], 'mrow'),
(namespaces['mathml'], 'mspace'),
(namespaces['mathml'], 'msqrt'),
(namespaces['mathml'], 'mstyle'),
(namespaces['mathml'], 'msub'),
(namespaces['mathml'], 'msubsup'),
(namespaces['mathml'], 'msup'),
(namespaces['mathml'], 'mtable'),
(namespaces['mathml'], 'mtd'),
(namespaces['mathml'], 'mtext'),
(namespaces['mathml'], 'mtr'),
(namespaces['mathml'], 'munder'),
(namespaces['mathml'], 'munderover'),
(namespaces['mathml'], 'none'),
(namespaces['svg'], 'a'),
(namespaces['svg'], 'animate'),
(namespaces['svg'], 'animateColor'),
(namespaces['svg'], 'animateMotion'),
(namespaces['svg'], 'animateTransform'),
(namespaces['svg'], 'clipPath'),
(namespaces['svg'], 'circle'),
(namespaces['svg'], 'defs'),
(namespaces['svg'], 'desc'),
(namespaces['svg'], 'ellipse'),
(namespaces['svg'], 'font-face'),
(namespaces['svg'], 'font-face-name'),
(namespaces['svg'], 'font-face-src'),
(namespaces['svg'], 'g'),
(namespaces['svg'], 'glyph'),
(namespaces['svg'], 'hkern'),
(namespaces['svg'], 'linearGradient'),
(namespaces['svg'], 'line'),
(namespaces['svg'], 'marker'),
(namespaces['svg'], 'metadata'),
(namespaces['svg'], 'missing-glyph'),
(namespaces['svg'], 'mpath'),
(namespaces['svg'], 'path'),
(namespaces['svg'], 'polygon'),
(namespaces['svg'], 'polyline'),
(namespaces['svg'], 'radialGradient'),
(namespaces['svg'], 'rect'),
(namespaces['svg'], 'set'),
(namespaces['svg'], 'stop'),
(namespaces['svg'], 'svg'),
(namespaces['svg'], 'switch'),
(namespaces['svg'], 'text'),
(namespaces['svg'], 'title'),
(namespaces['svg'], 'tspan'),
(namespaces['svg'], 'use'),
))
allowed_attributes = frozenset((
# HTML attributes
(None, 'abbr'),
(None, 'accept'),
(None, 'accept-charset'),
(None, 'accesskey'),
(None, 'action'),
(None, 'align'),
(None, 'alt'),
(None, 'autocomplete'),
(None, 'autofocus'),
(None, 'axis'),
(None, 'background'),
(None, 'balance'),
(None, 'bgcolor'),
(None, 'bgproperties'),
(None, 'border'),
(None, 'bordercolor'),
(None, 'bordercolordark'),
(None, 'bordercolorlight'),
(None, 'bottompadding'),
(None, 'cellpadding'),
(None, 'cellspacing'),
(None, 'ch'),
(None, 'challenge'),
(None, 'char'),
(None, 'charoff'),
(None, 'choff'),
(None, 'charset'),
(None, 'checked'),
(None, 'cite'),
(None, 'class'),
(None, 'clear'),
(None, 'color'),
(None, 'cols'),
(None, 'colspan'),
(None, 'compact'),
(None, 'contenteditable'),
(None, 'controls'),
(None, 'coords'),
(None, 'data'),
(None, 'datafld'),
(None, 'datapagesize'),
(None, 'datasrc'),
(None, 'datetime'),
(None, 'default'),
(None, 'delay'),
(None, 'dir'),
(None, 'disabled'),
(None, 'draggable'),
(None, 'dynsrc'),
(None, 'enctype'),
(None, 'end'),
(None, 'face'),
(None, 'for'),
(None, 'form'),
(None, 'frame'),
(None, 'galleryimg'),
(None, 'gutter'),
(None, 'headers'),
(None, 'height'),
(None, 'hidefocus'),
(None, 'hidden'),
(None, 'high'),
(None, 'href'),
(None, 'hreflang'),
(None, 'hspace'),
(None, 'icon'),
(None, 'id'),
(None, 'inputmode'),
(None, 'ismap'),
(None, 'keytype'),
(None, 'label'),
(None, 'leftspacing'),
(None, 'lang'),
(None, 'list'),
(None, 'longdesc'),
(None, 'loop'),
(None, 'loopcount'),
(None, 'loopend'),
(None, 'loopstart'),
(None, 'low'),
(None, 'lowsrc'),
(None, 'max'),
(None, 'maxlength'),
(None, 'media'),
(None, 'method'),
(None, 'min'),
(None, 'multiple'),
(None, 'name'),
(None, 'nohref'),
(None, 'noshade'),
(None, 'nowrap'),
(None, 'open'),
(None, 'optimum'),
(None, 'pattern'),
(None, 'ping'),
(None, 'point-size'),
(None, 'poster'),
(None, 'pqg'),
(None, 'preload'),
(None, 'prompt'),
(None, 'radiogroup'),
(None, 'readonly'),
(None, 'rel'),
(None, 'repeat-max'),
(None, 'repeat-min'),
(None, 'replace'),
(None, 'required'),
(None, 'rev'),
(None, 'rightspacing'),
(None, 'rows'),
(None, 'rowspan'),
(None, 'rules'),
(None, 'scope'),
(None, 'selected'),
(None, 'shape'),
(None, 'size'),
(None, 'span'),
(None, 'src'),
(None, 'start'),
(None, 'step'),
(None, 'style'),
(None, 'summary'),
(None, 'suppress'),
(None, 'tabindex'),
(None, 'target'),
(None, 'template'),
(None, 'title'),
(None, 'toppadding'),
(None, 'type'),
(None, 'unselectable'),
(None, 'usemap'),
(None, 'urn'),
(None, 'valign'),
(None, 'value'),
(None, 'variable'),
(None, 'volume'),
(None, 'vspace'),
(None, 'vrml'),
(None, 'width'),
(None, 'wrap'),
(namespaces['xml'], 'lang'),
# MathML attributes
(None, 'actiontype'),
(None, 'align'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnlines'),
(None, 'columnspacing'),
(None, 'columnspan'),
(None, 'depth'),
(None, 'display'),
(None, 'displaystyle'),
(None, 'equalcolumns'),
(None, 'equalrows'),
(None, 'fence'),
(None, 'fontstyle'),
(None, 'fontweight'),
(None, 'frame'),
(None, 'height'),
(None, 'linethickness'),
(None, 'lspace'),
(None, 'mathbackground'),
(None, 'mathcolor'),
(None, 'mathvariant'),
(None, 'mathvariant'),
(None, 'maxsize'),
(None, 'minsize'),
(None, 'other'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowlines'),
(None, 'rowspacing'),
(None, 'rowspan'),
(None, 'rspace'),
(None, 'scriptlevel'),
(None, 'selection'),
(None, 'separator'),
(None, 'stretchy'),
(None, 'width'),
(None, 'width'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'type'),
# SVG attributes
(None, 'accent-height'),
(None, 'accumulate'),
(None, 'additive'),
(None, 'alphabetic'),
(None, 'arabic-form'),
(None, 'ascent'),
(None, 'attributeName'),
(None, 'attributeType'),
(None, 'baseProfile'),
(None, 'bbox'),
(None, 'begin'),
(None, 'by'),
(None, 'calcMode'),
(None, 'cap-height'),
(None, 'class'),
(None, 'clip-path'),
(None, 'color'),
(None, 'color-rendering'),
(None, 'content'),
(None, 'cx'),
(None, 'cy'),
(None, 'd'),
(None, 'dx'),
(None, 'dy'),
(None, 'descent'),
(None, 'display'),
(None, 'dur'),
(None, 'end'),
(None, 'fill'),
(None, 'fill-opacity'),
(None, 'fill-rule'),
(None, 'font-family'),
(None, 'font-size'),
(None, 'font-stretch'),
(None, 'font-style'),
(None, 'font-variant'),
(None, 'font-weight'),
(None, 'from'),
(None, 'fx'),
(None, 'fy'),
(None, 'g1'),
(None, 'g2'),
(None, 'glyph-name'),
(None, 'gradientUnits'),
(None, 'hanging'),
(None, 'height'),
(None, 'horiz-adv-x'),
(None, 'horiz-origin-x'),
(None, 'id'),
(None, 'ideographic'),
(None, 'k'),
(None, 'keyPoints'),
(None, 'keySplines'),
(None, 'keyTimes'),
(None, 'lang'),
(None, 'marker-end'),
(None, 'marker-mid'),
(None, 'marker-start'),
(None, 'markerHeight'),
(None, 'markerUnits'),
(None, 'markerWidth'),
(None, 'mathematical'),
(None, 'max'),
(None, 'min'),
(None, 'name'),
(None, 'offset'),
(None, 'opacity'),
(None, 'orient'),
(None, 'origin'),
(None, 'overline-position'),
(None, 'overline-thickness'),
(None, 'panose-1'),
(None, 'path'),
(None, 'pathLength'),
(None, 'points'),
(None, 'preserveAspectRatio'),
(None, 'r'),
(None, 'refX'),
(None, 'refY'),
(None, 'repeatCount'),
(None, 'repeatDur'),
(None, 'requiredExtensions'),
(None, 'requiredFeatures'),
(None, 'restart'),
(None, 'rotate'),
(None, 'rx'),
(None, 'ry'),
(None, 'slope'),
(None, 'stemh'),
(None, 'stemv'),
(None, 'stop-color'),
(None, 'stop-opacity'),
(None, 'strikethrough-position'),
(None, 'strikethrough-thickness'),
(None, 'stroke'),
(None, 'stroke-dasharray'),
(None, 'stroke-dashoffset'),
(None, 'stroke-linecap'),
(None, 'stroke-linejoin'),
(None, 'stroke-miterlimit'),
(None, 'stroke-opacity'),
(None, 'stroke-width'),
(None, 'systemLanguage'),
(None, 'target'),
(None, 'text-anchor'),
(None, 'to'),
(None, 'transform'),
(None, 'type'),
(None, 'u1'),
(None, 'u2'),
(None, 'underline-position'),
(None, 'underline-thickness'),
(None, 'unicode'),
(None, 'unicode-range'),
(None, 'units-per-em'),
(None, 'values'),
(None, 'version'),
(None, 'viewBox'),
(None, 'visibility'),
(None, 'width'),
(None, 'widths'),
(None, 'x'),
(None, 'x-height'),
(None, 'x1'),
(None, 'x2'),
(namespaces['xlink'], 'actuate'),
(namespaces['xlink'], 'arcrole'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'role'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'title'),
(namespaces['xlink'], 'type'),
(namespaces['xml'], 'base'),
(namespaces['xml'], 'lang'),
(namespaces['xml'], 'space'),
(None, 'y'),
(None, 'y1'),
(None, 'y2'),
(None, 'zoomAndPan'),
))
attr_val_is_uri = frozenset((
(None, 'href'),
(None, 'src'),
(None, 'cite'),
(None, 'action'),
(None, 'longdesc'),
(None, 'poster'),
(None, 'background'),
(None, 'datasrc'),
(None, 'dynsrc'),
(None, 'lowsrc'),
(None, 'ping'),
(namespaces['xlink'], 'href'),
(namespaces['xml'], 'base'),
))
svg_attr_val_allows_ref = frozenset((
(None, 'clip-path'),
(None, 'color-profile'),
(None, 'cursor'),
(None, 'fill'),
(None, 'filter'),
(None, 'marker'),
(None, 'marker-start'),
(None, 'marker-mid'),
(None, 'marker-end'),
(None, 'mask'),
(None, 'stroke'),
))
svg_allow_local_href = frozenset((
(None, 'altGlyph'),
(None, 'animate'),
(None, 'animateColor'),
(None, 'animateMotion'),
(None, 'animateTransform'),
(None, 'cursor'),
(None, 'feImage'),
(None, 'filter'),
(None, 'linearGradient'),
(None, 'pattern'),
(None, 'radialGradient'),
(None, 'textpath'),
(None, 'tref'),
(None, 'set'),
(None, 'use')
))
allowed_css_properties = frozenset((
'azimuth',
'background-color',
'border-bottom-color',
'border-collapse',
'border-color',
'border-left-color',
'border-right-color',
'border-top-color',
'clear',
'color',
'cursor',
'direction',
'display',
'elevation',
'float',
'font',
'font-family',
'font-size',
'font-style',
'font-variant',
'font-weight',
'height',
'letter-spacing',
'line-height',
'overflow',
'pause',
'pause-after',
'pause-before',
'pitch',
'pitch-range',
'richness',
'speak',
'speak-header',
'speak-numeral',
'speak-punctuation',
'speech-rate',
'stress',
'text-align',
'text-decoration',
'text-indent',
'unicode-bidi',
'vertical-align',
'voice-family',
'volume',
'white-space',
'width',
))
allowed_css_keywords = frozenset((
'auto',
'aqua',
'black',
'block',
'blue',
'bold',
'both',
'bottom',
'brown',
'center',
'collapse',
'dashed',
'dotted',
'fuchsia',
'gray',
'green',
'!important',
'italic',
'left',
'lime',
'maroon',
'medium',
'none',
'navy',
'normal',
'nowrap',
'olive',
'pointer',
'purple',
'red',
'right',
'solid',
'silver',
'teal',
'top',
'transparent',
'underline',
'white',
'yellow',
))
allowed_svg_properties = frozenset((
'fill',
'fill-opacity',
'fill-rule',
'stroke',
'stroke-width',
'stroke-linecap',
'stroke-linejoin',
'stroke-opacity',
))
allowed_protocols = frozenset((
'ed2k',
'ftp',
'http',
'https',
'irc',
'mailto',
'news',
'gopher',
'nntp',
'telnet',
'webcal',
'xmpp',
'callto',
'feed',
'urn',
'aim',
'rsync',
'tag',
'ssh',
'sftp',
'rtsp',
'afs',
'data',
))
allowed_content_types = frozenset((
'image/png',
'image/jpeg',
'image/gif',
'image/webp',
'image/bmp',
'text/plain',
))
data_content_type = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
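# Illustrative note (not part of the original source): for a data URI such
# as data:image/png;base64,iVBORw0K... the parsed path is
# "image/png;base64,iVBORw0K...", which this pattern matches with
# content_type == "image/png". A type outside allowed_content_types, or a
# path with no comma before the data, fails the match, and the attribute is
# removed in allowed_token below.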
class Filter(base.Filter):
"""Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
def __init__(self,
source,
allowed_elements=allowed_elements,
allowed_attributes=allowed_attributes,
allowed_css_properties=allowed_css_properties,
allowed_css_keywords=allowed_css_keywords,
allowed_svg_properties=allowed_svg_properties,
allowed_protocols=allowed_protocols,
allowed_content_types=allowed_content_types,
attr_val_is_uri=attr_val_is_uri,
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
svg_allow_local_href=svg_allow_local_href):
"""Creates a Filter
:arg allowed_elements: set of elements to allow--everything else will
be escaped
:arg allowed_attributes: set of attributes to allow in
elements--everything else will be stripped
:arg allowed_css_properties: set of CSS properties to allow--everything
else will be stripped
:arg allowed_css_keywords: set of CSS keywords to allow--everything
else will be stripped
:arg allowed_svg_properties: set of SVG properties to allow--everything
else will be removed
:arg allowed_protocols: set of allowed protocols for URIs
:arg allowed_content_types: set of allowed content types for ``data`` URIs.
:arg attr_val_is_uri: set of attributes that have URI values--values
that have a scheme not listed in ``allowed_protocols`` are removed
:arg svg_attr_val_allows_ref: set of SVG attributes that can have
references
:arg svg_allow_local_href: set of SVG elements that can have local
hrefs--these are removed
"""
super(Filter, self).__init__(source)
self.allowed_elements = allowed_elements
self.allowed_attributes = allowed_attributes
self.allowed_css_properties = allowed_css_properties
self.allowed_css_keywords = allowed_css_keywords
self.allowed_svg_properties = allowed_svg_properties
self.allowed_protocols = allowed_protocols
self.allowed_content_types = allowed_content_types
self.attr_val_is_uri = attr_val_is_uri
self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
self.svg_allow_local_href = svg_allow_local_href
def __iter__(self):
for token in base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
# Sanitize the HTML, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
# are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
# ALLOWED_CSS_KEYWORDS, are allowed through. Attributes in ATTR_VAL_IS_URI
# are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
# allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in ("StartTag", "EndTag", "EmptyTag"):
name = token["name"]
namespace = token["namespace"]
if ((namespace, name) in self.allowed_elements or
(namespace is None and
(namespaces["html"], name) in self.allowed_elements)):
return self.allowed_token(token)
else:
return self.disallowed_token(token)
elif token_type == "Comment":
pass
else:
return token
def allowed_token(self, token):
if "data" in token:
attrs = token["data"]
attr_names = set(attrs.keys())
# Remove forbidden attributes
for to_remove in (attr_names - self.allowed_attributes):
del token["data"][to_remove]
attr_names.remove(to_remove)
# Remove attributes with disallowed URL values
for attr in (attr_names & self.attr_val_is_uri):
assert attr in attrs
# I don't have a clue where this regexp comes from or why it matches those
# characters, nor why we call unescape. I just know it's always been here.
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
# this will do is remove *more* than it otherwise would.
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
uri = urlparse.urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = data_content_type.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
(namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
attrs[(namespaces['xlink'], 'href')])):
del attrs[(namespaces['xlink'], 'href')]
if (None, 'style') in attrs:
attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
token["data"] = attrs
return token
def disallowed_token(self, token):
token_type = token["type"]
if token_type == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
assert token_type in ("StartTag", "EmptyTag")
attrs = []
for (ns, name), v in token["data"].items():
attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"] = token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
return token
def sanitize_css(self, style):
# disallow urls
style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet
if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = []
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if keyword not in self.allowed_css_keywords and \
not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
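
In practice the sanitizer is reached through the serializer rather than instantiated directly. A minimal sketch (the input and the expected output are illustrative):

from html5lib import parseFragment
from html5lib.serializer import serialize

fragment = parseFragment('<a href="javascript:alert(1)" title="ok">hi</a>')
# sanitize=True splices the Filter above into the token pipeline: the
# javascript: href has a disallowed scheme and is dropped, while the
# allowed title attribute survives.
print(serialize(fragment, sanitize=True))
# roughly: <a title=ok>hi</a>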

View file

@ -0,0 +1,38 @@
from __future__ import absolute_import, division, unicode_literals
import re
from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(base.Filter):
"""Collapses whitespace except in pre, textarea, and script elements"""
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self):
preserve = 0
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):
preserve += 1
elif type == "EndTag" and preserve:
preserve -= 1
elif not preserve and type == "SpaceCharacters" and token["data"]:
# Test token["data"] above so we don't introduce spaces where there were none
token["data"] = " "
elif not preserve and type == "Characters":
token["data"] = collapse_spaces(token["data"])
yield token
def collapse_spaces(text):
return SPACES_REGEX.sub(' ', text)
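
A short sketch of the effect, driven through the serializer's strip_whitespace option (the input is illustrative):

from html5lib import parse, getTreeWalker
from html5lib.serializer import HTMLSerializer

walker = getTreeWalker("etree")
tree = parse("<div>a   b\n c</div><pre>  keep   me  </pre>")
# Runs of whitespace collapse to a single space, except inside the
# space-preserving elements (pre, textarea, and the rcdata elements).
print(HTMLSerializer(strip_whitespace=True).render(walker(tree)))
# roughly: <div>a b c</div><pre>  keep   me  </pre>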

File diff suppressed because it is too large

View file

@ -0,0 +1,409 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
import re
from codecs import register_error, xmlcharrefreplace_errors
from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")
_encode_entity_map = {}
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
# skip multi-character entities
if ((_is_ucs4 and len(v) > 1) or
(not _is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = _utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if v not in _encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
_encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = _encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error("htmlentityreplace", htmlentityreplace_errors)
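# Illustrative example (assumed behavior, not part of the original module):
#     "\u2192 \xa0".encode("ascii", "htmlentityreplace")
# should yield b"&rarr; &nbsp;": each unencodable character becomes a named
# entity when one exists in _encode_entity_map, and otherwise a hexadecimal
# character reference such as &#x2603;.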
def serialize(input, tree="etree", encoding=None, **serializer_opts):
"""Serializes the input token stream using the specified treewalker
:arg input: the token stream to serialize
:arg tree: the treewalker to use
:arg encoding: the encoding to use
:arg serializer_opts: any options to pass to the
:py:class:`html5lib.serializer.HTMLSerializer` that gets created
:returns: the tree serialized as a string
Example:
>>> from html5lib.html5parser import parse
>>> from html5lib.serializer import serialize
>>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
>>> serialize(token_stream, omit_optional_tags=False)
'<html><head></head><body><p>Hi!</p></body></html>'
"""
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
s = HTMLSerializer(**serializer_opts)
return s.render(walker(input), encoding)
class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = "legacy" # be secure by default
quote_char = '"'
use_best_quote_char = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True
# miscellaneous options
alphabetical_attributes = False
inject_meta_charset = True
strip_whitespace = False
sanitize = False
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"omit_optional_tags", "minimize_boolean_attributes",
"use_trailing_solidus", "space_before_trailing_solidus",
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
"alphabetical_attributes", "inject_meta_charset",
"strip_whitespace", "sanitize")
def __init__(self, **kwargs):
"""Initialize HTMLSerializer
:arg inject_meta_charset: Whether or not to inject the meta charset.
Defaults to ``True``.
:arg quote_attr_values: Whether to quote attribute values that don't
require quoting per legacy browser behavior (``"legacy"``), when
required by the standard (``"spec"``), or always (``"always"``).
Defaults to ``"legacy"``.
:arg quote_char: Use given quote character for attribute quoting.
Defaults to ``"`` which will use double quotes unless attribute
value contains a double quote, in which case single quotes are
used.
:arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
values.
Defaults to ``False``.
:arg escape_rcdata: Whether to escape characters that need to be
escaped within normal elements within rcdata elements such as
style.
Defaults to ``False``.
:arg resolve_entities: Whether to resolve named character entities that
appear in the source tree. The XML predefined entities &lt; &gt;
&amp; &quot; &apos; are unaffected by this setting.
Defaults to ``True``.
:arg strip_whitespace: Whether to remove semantically meaningless
whitespace. (This compresses all whitespace to a single space
except within ``pre``.)
Defaults to ``False``.
:arg minimize_boolean_attributes: Shortens boolean attributes to give
just the attribute value, for example::
<input disabled="disabled">
becomes::
<input disabled>
Defaults to ``True``.
:arg use_trailing_solidus: Includes a close-tag slash at the end of the
start tag of void elements (empty elements whose end tag is
forbidden). E.g. ``<hr/>``.
Defaults to ``False``.
:arg space_before_trailing_solidus: Places a space immediately before
the closing slash in a tag using a trailing solidus. E.g.
``<hr />``. Requires ``use_trailing_solidus=True``.
Defaults to ``True``.
:arg sanitize: Strip all unsafe or unknown constructs from output.
See :py:class:`html5lib.filters.sanitizer.Filter`.
Defaults to ``False``.
:arg omit_optional_tags: Omit start/end tags that are optional.
Defaults to ``True``.
:arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
Defaults to ``False``.
"""
unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0:
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = []
self.strict = False
def encode(self, string):
assert(isinstance(string, text_type))
if self.encoding:
return string.encode(self.encoding, "htmlentityreplace")
else:
return string
def encodeStrict(self, string):
assert(isinstance(string, text_type))
if self.encoding:
return string.encode(self.encoding, "strict")
else:
return string
def serialize(self, treewalker, encoding=None):
# pylint:disable=too-many-nested-blocks
self.encoding = encoding
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
from .filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# The alphabetical-attributes filter runs here under the assumption that
# none of the later filters add attributes or change their order; it needs
# to run before the sanitizer so escaped elements come out correctly
if self.alphabetical_attributes:
from .filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
# WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiency of the latter filter
if self.strip_whitespace:
from .filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from .filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
from .filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = "<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += ' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += " SYSTEM"
if token["systemId"]:
if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0:
self.serializeError("System identifer contains both single and double quote characters")
quote_char = "'"
else:
quote_char = '"'
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += ">"
yield self.encodeStrict(doctype)
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0:
self.serializeError("Unexpected </ in CDATA")
yield self.encode(token["data"])
else:
yield self.encode(escape(token["data"]))
elif type in ("StartTag", "EmptyTag"):
name = token["name"]
yield self.encodeStrict("<%s" % name)
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
v = attr_value
yield self.encodeStrict(' ')
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) and
k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = _quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = _quoteAttributeLegacy.search(v) is not None
else:
raise ValueError("quote_attr_values must be one of: "
"'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
if quote_attr:
quote_char = self.quote_char
if self.use_best_quote_char:
if "'" in v and '"' not in v:
quote_char = '"'
elif '"' in v and "'" not in v:
quote_char = "'"
if quote_char == "'":
v = v.replace("'", "&#39;")
else:
v = v.replace('"', "&quot;")
yield self.encodeStrict(quote_char)
yield self.encode(v)
yield self.encodeStrict(quote_char)
else:
yield self.encode(v)
if name in voidElements and self.use_trailing_solidus:
if self.space_before_trailing_solidus:
yield self.encodeStrict(" /")
else:
yield self.encodeStrict("/")
yield self.encode(">")
elif type == "EndTag":
name = token["name"]
if name in rcdataElements:
in_cdata = False
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
yield self.encodeStrict("</%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
self.serializeError("Comment contains --")
yield self.encodeStrict("<!--%s-->" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if key not in entities:
self.serializeError("Entity %s not recognized" % name)
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = "&%s;" % name
yield self.encodeStrict(data)
else:
self.serializeError(token["data"])
def render(self, treewalker, encoding=None):
"""Serializes the stream from the treewalker into a string
:arg treewalker: the treewalker to serialize
:arg encoding: the string encoding to use
:returns: the serialized tree
Example:
>>> from html5lib import parse, getTreeWalker
>>> from html5lib.serializer import HTMLSerializer
>>> token_stream = parse('<html><body>Hi!</body></html>')
>>> walker = getTreeWalker('etree')
>>> serializer = HTMLSerializer(omit_optional_tags=False)
>>> serializer.render(walker(token_stream))
'<html><head></head><body>Hi!</body></html>'
"""
if encoding:
return b"".join(list(self.serialize(treewalker, encoding)))
else:
return "".join(list(self.serialize(treewalker)))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
self.errors.append(data)
if self.strict:
raise SerializeError
class SerializeError(Exception):
"""Error in serialized tree"""
pass
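
The three quote_attr_values modes described in the docstring differ only in when quoting is forced; a sketch of the contrast (expected output, not captured from a run):

from html5lib import parseFragment, getTreeWalker
from html5lib.serializer import HTMLSerializer

walker = getTreeWalker("etree")
frag = parseFragment('<span title="a=b">x</span><span title="plain">y</span>')
for mode in ("legacy", "spec", "always"):
    s = HTMLSerializer(quote_attr_values=mode, omit_optional_tags=False)
    print(mode, s.render(walker(frag)))
# roughly:
#   legacy <span title="a=b">x</span><span title=plain>y</span>
#   spec   <span title="a=b">x</span><span title=plain>y</span>
#   always <span title="a=b">x</span><span title="plain">y</span>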

View file

@ -0,0 +1 @@
from __future__ import absolute_import, division, unicode_literals

View file

@ -0,0 +1,108 @@
from __future__ import print_function
import os.path
import sys
import pkg_resources
import pytest
from .tree_construction import TreeConstructionFile
from .tokenizer import TokenizerFile
from .sanitizer import SanitizerFile
_dir = os.path.abspath(os.path.dirname(__file__))
_root = os.path.join(_dir, "..", "..")
_testdata = os.path.join(_dir, "testdata")
_tree_construction = os.path.join(_testdata, "tree-construction")
_tokenizer = os.path.join(_testdata, "tokenizer")
_sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata")
def fail_if_missing_pytest_expect():
"""Throws an exception halting pytest if pytest-expect isn't working"""
try:
from pytest_expect import expect # noqa
except ImportError:
header = '*' * 78
print(
'\n' +
header + '\n' +
'ERROR: Either pytest-expect or its dependency u-msgpack-python is not\n' +
'installed. Please install them both before running pytest.\n' +
header + '\n',
file=sys.stderr
)
raise
fail_if_missing_pytest_expect()
def pytest_configure(config):
msgs = []
if not os.path.exists(_testdata):
msg = "testdata not available! "
if os.path.exists(os.path.join(_root, ".git")):
msg += ("Please run git submodule update --init --recursive " +
"and then run tests again.")
else:
msg += ("The testdata doesn't appear to be included with this package, " +
"so finding the right version will be hard. :(")
msgs.append(msg)
if config.option.update_xfail:
# Check for optional requirements
req_file = os.path.join(_root, "requirements-optional.txt")
if os.path.exists(req_file):
with open(req_file, "r") as fp:
for line in fp:
if (line.strip() and
not (line.startswith("-r") or
line.startswith("#"))):
if ";" in line:
spec, marker = line.strip().split(";", 1)
else:
spec, marker = line.strip(), None
req = pkg_resources.Requirement.parse(spec)
if marker and not pkg_resources.evaluate_marker(marker):
msgs.append("%s not available in this environment" % spec)
else:
try:
installed = pkg_resources.working_set.find(req)
except pkg_resources.VersionConflict:
msgs.append("Outdated version of %s installed, need %s" % (req.name, spec))
else:
if not installed:
msgs.append("Need %s" % spec)
# Check cElementTree
import xml.etree.ElementTree as ElementTree
try:
import xml.etree.cElementTree as cElementTree
except ImportError:
msgs.append("cElementTree unable to be imported")
else:
if cElementTree.Element is ElementTree.Element:
msgs.append("cElementTree is just an alias for ElementTree")
if msgs:
pytest.exit("\n".join(msgs))
def pytest_collect_file(path, parent):
dir = os.path.abspath(path.dirname)
dir_and_parents = set()
while dir not in dir_and_parents:
dir_and_parents.add(dir)
dir = os.path.dirname(dir)
if _tree_construction in dir_and_parents:
if path.ext == ".dat":
return TreeConstructionFile(path, parent)
elif _tokenizer in dir_and_parents:
if path.ext == ".test":
return TokenizerFile(path, parent)
elif _sanitizer_testdata in dir_and_parents:
if path.ext == ".dat":
return SanitizerFile(path, parent)

View file

@ -0,0 +1,433 @@
[
{
"name": "IE_Comments",
"input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
"output": ""
},
{
"name": "IE_Comments_2",
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
"output": "&lt;script&gt;alert('XSS');&lt;/script&gt;"
},
{
"name": "allow_colons_in_path_component",
"input": "<a href=\"./this:that\">foo</a>",
"output": "<a href='./this:that'>foo</a>"
},
{
"name": "background_attribute",
"input": "<div background=\"javascript:alert('XSS')\"></div>",
"output": "<div></div>"
},
{
"name": "bgsound",
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
"output": "&lt;bgsound src=\"javascript:alert('XSS');\"&gt;&lt;/bgsound&gt;"
},
{
"name": "div_background_image_unicode_encoded",
"input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
"output": "<div style=''>foo</div>"
},
{
"name": "div_expression",
"input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
"output": "<div style=''>foo</div>"
},
{
"name": "double_open_angle_brackets",
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
"output": ""
},
{
"name": "double_open_angle_brackets_2",
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
"output": ""
},
{
"name": "grave_accents",
"input": "<img src=`javascript:alert('XSS')` />",
"output": "<img/>"
},
{
"name": "img_dynsrc_lowsrc",
"input": "<img dynsrc=\"javascript:alert('XSS')\" />",
"output": "<img/>"
},
{
"name": "img_vbscript",
"input": "<img src='vbscript:msgbox(\"XSS\")' />",
"output": "<img/>"
},
{
"name": "input_image",
"input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
"output": "<input type='image'/>"
},
{
"name": "link_stylesheets",
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
"output": "&lt;link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"&gt;"
},
{
"name": "link_stylesheets_2",
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
"output": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"&gt;"
},
{
"name": "list_style_image",
"input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
"output": "<li style=''>foo</li>"
},
{
"name": "no_closing_script_tags",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
},
{
"name": "non_alpha_non_digit",
"input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
},
{
"name": "non_alpha_non_digit_2",
"input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
"output": "<a>foo</a>"
},
{
"name": "non_alpha_non_digit_3",
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
"output": "<img src='http://ha.ckers.org/xss.js'/>"
},
{
"name": "non_alpha_non_digit_II",
"input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
"output": "<a>foo</a>"
},
{
"name": "non_alpha_non_digit_III",
"input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
"output": "<a>foo</a>"
},
{
"name": "platypus",
"input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
"output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
},
{
"name": "protocol_resolution_in_script_tag",
"input": "<script src=//ha.ckers.org/.j></script>",
"output": "&lt;script src=\"//ha.ckers.org/.j\"&gt;&lt;/script&gt;"
},
{
"name": "should_allow_anchors",
"input": "<a href='foo' onclick='bar'><script>baz</script></a>",
"output": "<a href='foo'>&lt;script&gt;baz&lt;/script&gt;</a>"
},
{
"name": "should_allow_image_alt_attribute",
"input": "<img alt='foo' onclick='bar' />",
"output": "<img alt='foo'/>"
},
{
"name": "should_allow_image_height_attribute",
"input": "<img height='foo' onclick='bar' />",
"output": "<img height='foo'/>"
},
{
"name": "should_allow_image_src_attribute",
"input": "<img src='foo' onclick='bar' />",
"output": "<img src='foo'/>"
},
{
"name": "should_allow_image_width_attribute",
"input": "<img width='foo' onclick='bar' />",
"output": "<img width='foo'/>"
},
{
"name": "should_handle_blank_text",
"input": "",
"output": ""
},
{
"name": "should_handle_malformed_image_tags",
"input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
"output": "<img/>&lt;script&gt;alert(\"XSS\")&lt;/script&gt;\"&gt;"
},
{
"name": "should_handle_non_html",
"input": "abc",
"output": "abc"
},
{
"name": "should_not_fall_for_ridiculous_hack",
"input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_0",
"input": "<img src=\"javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_1",
"input": "<img src=javascript:alert('XSS') />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_10",
"input": "<img src=\"jav&#x0A;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_11",
"input": "<img src=\"jav&#x0D;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_12",
"input": "<img src=\" &#14; javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_13",
"input": "<img src=\"&#x20;javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_14",
"input": "<img src=\"&#xA0;javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_2",
"input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_3",
"input": "<img src='javascript:alert(&quot;XSS&quot;)' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_4",
"input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_5",
"input": "<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_6",
"input": "<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_7",
"input": "<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_8",
"input": "<img src=\"jav\tascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_9",
"input": "<img src=\"jav&#x09;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_sanitize_half_open_scripts",
"input": "<img src=\"javascript:alert('XSS')\"",
"output": ""
},
{
"name": "should_sanitize_invalid_script_tag",
"input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
},
{
"name": "should_sanitize_script_tag_with_multiple_open_brackets",
"input": "<<script>alert(\"XSS\");//<</script>",
"output": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;"
},
{
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
"output": ""
},
{
"name": "should_sanitize_tag_broken_up_by_null",
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
"output": "&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;"
},
{
"name": "should_sanitize_unclosed_script",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
},
{
"name": "should_strip_href_attribute_in_a_with_bad_protocols",
"input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
"output": "<a title='1'>boo</a>"
},
{
"name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
"input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
"output": "<a title='1'>boo</a>"
},
{
"name": "should_strip_src_attribute_in_img_with_bad_protocols",
"input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
"output": "<img title='1'/>boo"
},
{
"name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
"input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
"output": "<img title='1'/>boo"
},
{
"name": "xml_base",
"input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
"output": "<div>foo</div>"
},
{
"name": "xul",
"input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
"output": "<p style=''>fubar</p>"
},
{
"name": "quotes_in_attributes",
"input": "<img src='foo' title='\"foo\" bar' />",
"output": "<img src='foo' title='\"foo\" bar'/>"
},
{
"name": "uri_refs_in_svg_attributes",
"input": "<svg><rect fill='url(#foo)' />",
"output": "<svg><rect fill='url(#foo)'></rect></svg>"
},
{
"name": "absolute_uri_refs_in_svg_attributes",
"input": "<svg><rect fill='url(http://bad.com/) #fff' />",
"output": "<svg><rect fill=' #fff'></rect></svg>"
},
{
"name": "uri_ref_with_space_in svg_attribute",
"input": "<svg><rect fill='url(\n#foo)' />",
"output": "<svg><rect fill='url(\n#foo)'></rect></svg>"
},
{
"name": "absolute_uri_ref_with_space_in svg_attribute",
"input": "<svg><rect fill=\"url(\nhttp://bad.com/)\" />",
"output": "<svg><rect fill=' '></rect></svg>"
},
{
"name": "allow_html5_image_tag",
"input": "<image src='foo' />",
"output": "<img src='foo'/>"
},
{
"name": "style_attr_end_with_nothing",
"input": "<div style=\"color: blue\" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "style_attr_end_with_space",
"input": "<div style=\"color: blue \" />",
"output": "<div style='color: blue ;'></div>"
},
{
"name": "style_attr_end_with_semicolon",
"input": "<div style=\"color: blue;\" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "style_attr_end_with_semicolon_space",
"input": "<div style=\"color: blue; \" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "attributes_with_embedded_quotes",
"input": "<img src=doesntexist.jpg\"'onerror=\"alert(1) />",
"output": "<img src='doesntexist.jpg\"&#39;onerror=\"alert(1)'/>"
},
{
"name": "attributes_with_embedded_quotes_II",
"input": "<img src=notthere.jpg\"\"onerror=\"alert(2) />",
"output": "<img src='notthere.jpg\"\"onerror=\"alert(2)'/>"
}
]

View file

@ -0,0 +1,50 @@
from __future__ import absolute_import, division, unicode_literals
import codecs
import json
import pytest
from html5lib import parseFragment, serialize
class SanitizerFile(pytest.File):
def collect(self):
with codecs.open(str(self.fspath), "r", encoding="utf-8") as fp:
tests = json.load(fp)
for i, test in enumerate(tests):
yield SanitizerTest(str(i), self, test=test)
class SanitizerTest(pytest.Item):
def __init__(self, name, parent, test):
super(SanitizerTest, self).__init__(name, parent)
self.obj = lambda: 1 # this is to hack around skipif needing a function!
self.test = test
def runtest(self):
input = self.test["input"]
expected = self.test["output"]
parsed = parseFragment(input)
serialized = serialize(parsed,
sanitize=True,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char="'",
alphabetical_attributes=True)
errorMsg = "\n".join(["\n\nInput:", input,
"\nExpected:", expected,
"\nReceived:", serialized])
assert expected == serialized, errorMsg
def repr_failure(self, excinfo):
traceback = excinfo.traceback
ntraceback = traceback.cut(path=__file__)
excinfo.traceback = ntraceback.filter()
return excinfo.getrepr(funcargs=True,
showlocals=False,
style="short", tbfilter=False)

View file

@ -0,0 +1,395 @@
{
"tests": [
{
"expected": [
"<span title='test \"with\" &amp;quot;'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "test \"with\" &quot;"
}
]
]
],
"description": "proper attribute value escaping"
},
{
"expected": [
"<span title=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo"
}
]
]
],
"description": "proper attribute value non-quoting"
},
{
"expected": [
"<span title=\"foo<bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo<bar"
}
]
]
],
"description": "proper attribute value non-quoting (with <)"
},
{
"expected": [
"<span title=\"foo=bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo=bar"
}
]
]
],
"description": "proper attribute value quoting (with =)"
},
{
"expected": [
"<span title=\"foo>bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo>bar"
}
]
]
],
"description": "proper attribute value quoting (with >)"
},
{
"expected": [
"<span title='foo\"bar'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\"bar"
}
]
]
],
"description": "proper attribute value quoting (with \")"
},
{
"expected": [
"<span title=\"foo'bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo'bar"
}
]
]
],
"description": "proper attribute value quoting (with ')"
},
{
"expected": [
"<span title=\"foo'bar&quot;baz\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo'bar\"baz"
}
]
]
],
"description": "proper attribute value quoting (with both \" and ')"
},
{
"expected": [
"<span title=\"foo bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo bar"
}
]
]
],
"description": "proper attribute value quoting (with space)"
},
{
"expected": [
"<span title=\"foo\tbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\tbar"
}
]
]
],
"description": "proper attribute value quoting (with tab)"
},
{
"expected": [
"<span title=\"foo\nbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\nbar"
}
]
]
],
"description": "proper attribute value quoting (with LF)"
},
{
"expected": [
"<span title=\"foo\rbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\rbar"
}
]
]
],
"description": "proper attribute value quoting (with CR)"
},
{
"expected": [
"<span title=\"foo\u000bbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\u000bbar"
}
]
]
],
"description": "proper attribute value non-quoting (with linetab)"
},
{
"expected": [
"<span title=\"foo\fbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\fbar"
}
]
]
],
"description": "proper attribute value quoting (with form feed)"
},
{
"expected": [
"<img>"
],
"input": [
[
"EmptyTag",
"img",
{}
]
],
"description": "void element (as EmptyTag token)"
},
{
"expected": [
"<!DOCTYPE foo>"
],
"input": [
[
"Doctype",
"foo"
]
],
"description": "doctype in error"
},
{
"expected": [
"a&lt;b&gt;c&amp;d"
],
"input": [
[
"Characters",
"a<b>c&d"
]
],
"description": "character data",
"options": {
"encoding": "utf-8"
}
},
{
"expected": [
"<script>a<b>c&d"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"a<b>c&d"
]
],
"description": "rcdata"
},
{
"expected": [
"<!DOCTYPE HTML>"
],
"input": [
[
"Doctype",
"HTML"
]
],
"description": "doctype"
},
{
"expected": [
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"
],
"input": [
[
"Doctype",
"HTML",
"-//W3C//DTD HTML 4.01//EN",
"http://www.w3.org/TR/html4/strict.dtd"
]
],
"description": "HTML 4.01 DOCTYPE"
},
{
"expected": [
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"
],
"input": [
[
"Doctype",
"HTML",
"-//W3C//DTD HTML 4.01//EN"
]
],
"description": "HTML 4.01 DOCTYPE without system identifer"
},
{
"expected": [
"<!DOCTYPE html SYSTEM \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">"
],
"input": [
[
"Doctype",
"html",
"",
"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
]
],
"description": "IBM DOCTYPE without public identifer"
}
]
}

@ -0,0 +1,350 @@
{
"tests": [
{
"expected": [
""
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "no encoding",
"options": {
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "empytag head",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><title>foo</title>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"title",
{}
],
[
"Characters",
"foo"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"title"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/title",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/meta-charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><meta charset=utf-8>",
"<head><meta charset=utf-8><meta charset=ascii>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/ two meta-charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><meta content=noindex name=robots>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=noindex name=robots><meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots & charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "http-equiv",
"value": "content-type"
},
{
"namespace": null,
"name": "content",
"value": "text/html; charset=ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/ charset in http-equiv content-type",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "http-equiv",
"value": "content-type"
},
{
"namespace": null,
"name": "content",
"value": "text/html; charset=ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots & charset in http-equiv content-type",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
}
]
}

@ -0,0 +1,334 @@
{
"tests": [
{
"expected": [
"<span title='test &#39;with&#39; quote_char'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "test 'with' quote_char"
}
]
]
],
"description": "quote_char=\"'\"",
"options": {
"quote_char": "'"
}
},
{
"expected": [
"<button disabled>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"button",
[
{
"namespace": null,
"name": "disabled",
"value": "disabled"
}
]
]
],
"description": "quote_attr_values='always'",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div itemscope>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": "itemscope"
}
]
]
],
"description": "quote_attr_values='always' with itemscope",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div irrelevant>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": "irrelevant"
}
]
]
],
"description": "quote_attr_values='always' with irrelevant",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=\"foo\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='always'",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='legacy'",
"options": {
"quote_attr_values": "legacy"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='spec'",
"options": {
"quote_attr_values": "spec"
}
},
{
"expected": [
"<img />"
],
"input": [
[
"EmptyTag",
"img",
{}
]
],
"description": "use_trailing_solidus=true with void element",
"options": {
"use_trailing_solidus": true
}
},
{
"expected": [
"<div>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
{}
]
],
"description": "use_trailing_solidus=true with non-void element",
"options": {
"use_trailing_solidus": true
}
},
{
"expected": [
"<div itemscope=itemscope>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": "itemscope"
}
]
]
],
"description": "minimize_boolean_attributes=false",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div irrelevant=irrelevant>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": "irrelevant"
}
]
]
],
"description": "minimize_boolean_attributes=false",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div itemscope=\"\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": ""
}
]
]
],
"description": "minimize_boolean_attributes=false with empty value",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div irrelevant=\"\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": ""
}
]
]
],
"description": "minimize_boolean_attributes=false with empty value",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<a title=\"a&lt;b>c&amp;d\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"a",
[
{
"namespace": null,
"name": "title",
"value": "a<b>c&d"
}
]
]
],
"description": "escape less than signs in attribute values",
"options": {
"escape_lt_in_attrs": true
}
},
{
"expected": [
"<script>a&lt;b&gt;c&amp;d"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"a<b>c&d"
]
],
"description": "rcdata",
"options": {
"escape_rcdata": true
}
}
]
}

@ -0,0 +1,198 @@
{
"tests": [
{
"expected": [
" foo"
],
"input": [
[
"Characters",
"\t\r\n\f foo"
]
],
"description": "bare text with leading spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"foo "
],
"input": [
[
"Characters",
"foo \t\r\n\f"
]
],
"description": "bare text with trailing spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"foo bar"
],
"input": [
[
"Characters",
"foo \t\r\n\f bar"
]
],
"description": "bare text with inner spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<pre>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</pre>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"pre",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"pre"
]
],
"description": "text within <pre>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<pre>\t\r\n\f fo<span>o \t\r\n\f b</span>ar \t\r\n\f</pre>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"pre",
{}
],
[
"Characters",
"\t\r\n\f fo"
],
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
{}
],
[
"Characters",
"o \t\r\n\f b"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"span"
],
[
"Characters",
"ar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"pre"
]
],
"description": "text within <pre>, with inner markup",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<textarea>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</textarea>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"textarea",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"textarea"
]
],
"description": "text within <textarea>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<script>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</script>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"script"
]
],
"description": "text within <script>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<style>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</style>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"style",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"style"
]
],
"description": "text within <style>",
"options": {
"strip_whitespace": true
}
}
]
}

@ -0,0 +1,198 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=wrong-import-position
import os
import sys
import codecs
import glob
import xml.sax.handler
base_path = os.path.split(__file__)[0]
test_dir = os.path.join(base_path, 'testdata')
sys.path.insert(0, os.path.abspath(os.path.join(base_path,
os.path.pardir,
os.path.pardir)))
from html5lib import treebuilders, treewalkers, treeadapters # noqa
del base_path
# Build a dict of available trees
treeTypes = {}
# DOM impls
treeTypes["DOM"] = {
"builder": treebuilders.getTreeBuilder("dom"),
"walker": treewalkers.getTreeWalker("dom")
}
# ElementTree impls
import xml.etree.ElementTree as ElementTree # noqa
treeTypes['ElementTree'] = {
"builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True),
"walker": treewalkers.getTreeWalker("etree", ElementTree)
}
try:
import xml.etree.cElementTree as cElementTree # noqa
except ImportError:
treeTypes['cElementTree'] = None
else:
# On Python 3.3 and above cElementTree is an alias, don't run them twice.
if cElementTree.Element is ElementTree.Element:
treeTypes['cElementTree'] = None
else:
treeTypes['cElementTree'] = {
"builder": treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True),
"walker": treewalkers.getTreeWalker("etree", cElementTree)
}
try:
import lxml.etree as lxml # noqa
except ImportError:
treeTypes['lxml'] = None
else:
treeTypes['lxml'] = {
"builder": treebuilders.getTreeBuilder("lxml"),
"walker": treewalkers.getTreeWalker("lxml")
}
# Genshi impls
try:
import genshi # noqa
except ImportError:
treeTypes["genshi"] = None
else:
treeTypes["genshi"] = {
"builder": treebuilders.getTreeBuilder("dom"),
"adapter": lambda tree: treeadapters.genshi.to_genshi(treewalkers.getTreeWalker("dom")(tree)),
"walker": treewalkers.getTreeWalker("genshi")
}
# pylint:enable=wrong-import-position
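# Each treeTypes entry pairs a tree builder with a compatible tree walker.
# Tests consume a pair roughly like this (illustrative sketch only):
#     parser = html5lib.HTMLParser(tree=treeTypes["DOM"]["builder"])
#     document = parser.parse("<p>hi</p>")
#     tokens = list(treeTypes["DOM"]["walker"](document))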
def get_data_files(subdirectory, files='*.dat', search_dir=test_dir):
return sorted(glob.glob(os.path.join(search_dir, subdirectory, files)))
class DefaultDict(dict):
def __init__(self, default, *args, **kwargs):
self.default = default
dict.__init__(self, *args, **kwargs)
def __getitem__(self, key):
return dict.get(self, key, self.default)
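# DefaultDict returns its default for missing keys instead of raising
# KeyError, e.g. (illustrative): DefaultDict(None, data="x")["errors"] is None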
class TestData(object):
def __init__(self, filename, newTestHeading="data", encoding="utf8"):
if encoding is None:
self.f = open(filename, mode="rb")
else:
self.f = codecs.open(filename, encoding=encoding)
self.encoding = encoding
self.newTestHeading = newTestHeading
def __iter__(self):
data = DefaultDict(None)
key = None
for line in self.f:
heading = self.isSectionHeading(line)
if heading:
if data and heading == self.newTestHeading:
# Remove trailing newline
data[key] = data[key][:-1]
yield self.normaliseOutput(data)
data = DefaultDict(None)
key = heading
data[key] = "" if self.encoding else b""
elif key is not None:
data[key] += line
if data:
yield self.normaliseOutput(data)
def isSectionHeading(self, line):
"""If the current heading is a test section heading return the heading,
otherwise return False"""
if line.startswith("#" if self.encoding else b"#"):
return line[1:].strip()
else:
return False
def normaliseOutput(self, data):
# Remove trailing newlines
for key, value in data.items():
if value.endswith("\n" if self.encoding else b"\n"):
data[key] = value[:-1]
return data
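# TestData iterates the html5lib-tests ".dat" format: "#"-prefixed headings
# open sections, and each new "#data" heading starts the next test. An
# illustrative (made-up) fragment:
#     #data
#     <p>One
#     #errors
#     #document
#     | <html>
# yields one dict per test keyed by heading, e.g. {"data": "<p>One", ...}.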
def convert(stripChars):
def convertData(data):
"""convert the output of str(document) to the format used in the testcases"""
data = data.split("\n")
rv = []
for line in data:
if line.startswith("|"):
rv.append(line[stripChars:])
else:
rv.append(line)
return "\n".join(rv)
return convertData
convertExpected = convert(2)
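# convert(2) strips the two-character "| " pprint prefix from tree dumps,
# e.g. (illustrative): convertExpected('| <html>\n|   <head>') returns
# '<html>\n  <head>'.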
def errorMessage(input, expected, actual):
msg = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n" %
(repr(input), repr(expected), repr(actual)))
if sys.version_info[0] == 2:
msg = msg.encode("ascii", "backslashreplace")
return msg
class TracingSaxHandler(xml.sax.handler.ContentHandler):
def __init__(self):
xml.sax.handler.ContentHandler.__init__(self)
self.visited = []
def startDocument(self):
self.visited.append('startDocument')
def endDocument(self):
self.visited.append('endDocument')
def startPrefixMapping(self, prefix, uri):
# These are ignored as their order is not guaranteed
pass
def endPrefixMapping(self, prefix):
# These are ignored as their order is not guaranteed
pass
def startElement(self, name, attrs):
self.visited.append(('startElement', name, attrs))
def endElement(self, name):
self.visited.append(('endElement', name))
def startElementNS(self, name, qname, attrs):
self.visited.append(('startElementNS', name, qname, dict(attrs)))
def endElementNS(self, name, qname):
self.visited.append(('endElementNS', name, qname))
def characters(self, content):
self.visited.append(('characters', content))
def ignorableWhitespace(self, whitespace):
self.visited.append(('ignorableWhitespace', whitespace))
def processingInstruction(self, target, data):
self.visited.append(('processingInstruction', target, data))
def skippedEntity(self, name):
self.visited.append(('skippedEntity', name))

@ -0,0 +1,78 @@
from __future__ import absolute_import, division, unicode_literals
from collections import OrderedDict
import pytest
import html5lib
from html5lib.filters.alphabeticalattributes import Filter
from html5lib.serializer import HTMLSerializer
@pytest.mark.parametrize('msg, attrs, expected_attrs', [
(
'no attrs',
{},
{}
),
(
'one attr',
{(None, 'alt'): 'image'},
OrderedDict([((None, 'alt'), 'image')])
),
(
'multiple attrs',
{
(None, 'src'): 'foo',
(None, 'alt'): 'image',
(None, 'style'): 'border: 1px solid black;'
},
OrderedDict([
((None, 'alt'), 'image'),
((None, 'src'), 'foo'),
((None, 'style'), 'border: 1px solid black;')
])
),
])
def test_alphabetizing(msg, attrs, expected_attrs):
tokens = [{'type': 'StartTag', 'name': 'img', 'data': attrs}]
output_tokens = list(Filter(tokens))
attrs = output_tokens[0]['data']
assert attrs == expected_attrs
def test_with_different_namespaces():
tokens = [{
'type': 'StartTag',
'name': 'pattern',
'data': {
(None, 'id'): 'patt1',
('http://www.w3.org/1999/xlink', 'href'): '#patt2'
}
}]
output_tokens = list(Filter(tokens))
attrs = output_tokens[0]['data']
assert attrs == OrderedDict([
((None, 'id'), 'patt1'),
(('http://www.w3.org/1999/xlink', 'href'), '#patt2')
])
def test_with_serializer():
"""Verify filter works in the context of everything else"""
parser = html5lib.HTMLParser()
dom = parser.parseFragment('<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
walker = html5lib.getTreeWalker('etree')
ser = HTMLSerializer(
alphabetical_attributes=True,
quote_attr_values='always'
)
# FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
# that gets fixed, we can fix this expected result.
assert (
ser.render(walker(dom)) ==
'<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
)

@ -0,0 +1,116 @@
from __future__ import absolute_import, division, unicode_literals
import os
import pytest
from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
from html5lib import HTMLParser, _inputstream
def test_basic_prescan_length():
data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
pad = 1024 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 1024 # Sanity
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'utf-8' == stream.charEncoding[0].name
def test_parser_reparse():
data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
pad = 10240 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 10240 # Sanity
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'windows-1252' == stream.charEncoding[0].name
p = HTMLParser(namespaceHTMLElements=False)
doc = p.parse(data, useChardet=False)
assert 'utf-8' == p.documentEncoding
assert doc.find(".//title").text == "Caf\u00E9"
@pytest.mark.parametrize("expected,data,kwargs", [
("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
("iso-8859-2", b"<meta charset=iso-8859-3>", {"transport_encoding": "iso-8859-2"}),
("iso-8859-2", b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
("windows-1252", b"", {}),
])
def test_parser_args(expected, data, kwargs):
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
assert expected == stream.charEncoding[0].name
p = HTMLParser()
p.parse(data, useChardet=False, **kwargs)
assert expected == p.documentEncoding
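# The cases above exercise the documented precedence: a BOM wins over
# everything, then override_encoding, transport_encoding, a <meta>
# declaration, same_origin_parent_encoding (ignored when it is a UTF-16
# variant), likely_encoding and default_encoding, with windows-1252 as the
# last-resort fallback.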
@pytest.mark.parametrize("kwargs", [
{"override_encoding": "iso-8859-2"},
{"override_encoding": None},
{"transport_encoding": "iso-8859-2"},
{"transport_encoding": None},
{"same_origin_parent_encoding": "iso-8859-2"},
{"same_origin_parent_encoding": None},
{"likely_encoding": "iso-8859-2"},
{"likely_encoding": None},
{"default_encoding": "iso-8859-2"},
{"default_encoding": None},
{"foo_encoding": "iso-8859-2"},
{"foo_encoding": None},
])
def test_parser_args_raises(kwargs):
with pytest.raises(TypeError) as exc_info:
p = HTMLParser()
p.parse("", useChardet=False, **kwargs)
assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
def runParserEncodingTest(data, encoding):
p = HTMLParser()
assert p.documentEncoding is None
p.parse(data, useChardet=False)
encoding = encoding.lower().decode("ascii")
assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def runPreScanEncodingTest(data, encoding):
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
encoding = encoding.lower().decode("ascii")
# Very crude way to ignore irrelevant tests
if len(data) > stream.numBytesMeta:
return
assert encoding == stream.charEncoding[0].name, errorMessage(data, encoding, stream.charEncoding[0].name)
def test_encoding():
for filename in get_data_files("encoding"):
tests = _TestData(filename, b"data", encoding=None)
for test in tests:
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
# pylint:disable=wrong-import-position
try:
import chardet # noqa
except ImportError:
print("chardet not found, skipping chardet tests")
else:
def test_chardet():
with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp:
encoding = _inputstream.HTMLInputStream(fp.read()).charEncoding
assert encoding[0].name == "big5"
# pylint:enable=wrong-import-position

@ -0,0 +1,41 @@
from __future__ import absolute_import, division, unicode_literals
import six
from mock import Mock
from . import support
def _createReprMock(r):
"""Creates a mock with a __repr__ returning r
Also provides __str__ mock with default mock behaviour"""
mock = Mock()
mock.__repr__ = Mock()
mock.__repr__.return_value = r
mock.__str__ = Mock(wraps=mock.__str__)
return mock
def test_errorMessage():
# Create mock objects to take repr of
input = _createReprMock("1")
expected = _createReprMock("2")
actual = _createReprMock("3")
# Run the actual test
r = support.errorMessage(input, expected, actual)
# Assertions!
if six.PY2:
assert b"Input:\n1\nExpected:\n2\nRecieved\n3\n" == r
else:
assert six.PY3
assert "Input:\n1\nExpected:\n2\nRecieved\n3\n" == r
assert input.__repr__.call_count == 1
assert expected.__repr__.call_count == 1
assert actual.__repr__.call_count == 1
assert not input.__str__.called
assert not expected.__str__.called
assert not actual.__str__.called

@ -0,0 +1,7 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib.filters.optionaltags import Filter
def test_empty():
assert list(Filter([])) == []

@ -0,0 +1,130 @@
from __future__ import absolute_import, division, unicode_literals
from six import PY2, text_type, unichr
import io
from . import support # noqa
from html5lib.constants import namespaces, tokenTypes
from html5lib import parse, parseFragment, HTMLParser
# tests that aren't autogenerated from text files
def test_assertDoctypeCloneable():
doc = parse('<!DOCTYPE HTML>', treebuilder="dom")
assert doc.cloneNode(True) is not None
def test_line_counter():
# http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
assert parse("<pre>\nx\n&gt;\n</pre>") is not None
def test_namespace_html_elements_0_dom():
doc = parse("<html></html>",
treebuilder="dom",
namespaceHTMLElements=True)
assert doc.childNodes[0].namespaceURI == namespaces["html"]
def test_namespace_html_elements_1_dom():
doc = parse("<html></html>",
treebuilder="dom",
namespaceHTMLElements=False)
assert doc.childNodes[0].namespaceURI is None
def test_namespace_html_elements_0_etree():
doc = parse("<html></html>",
treebuilder="etree",
namespaceHTMLElements=True)
assert doc.tag == "{%s}html" % (namespaces["html"],)
def test_namespace_html_elements_1_etree():
doc = parse("<html></html>",
treebuilder="etree",
namespaceHTMLElements=False)
assert doc.tag == "html"
def test_unicode_file():
assert parse(io.StringIO("a")) is not None
def test_maintain_attribute_order():
# This is here because we impl it in parser and not tokenizer
p = HTMLParser()
# generate loads to maximize the chance a hash-based mutation will occur
attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
token = {'name': 'html',
'selfClosing': False,
'selfClosingAcknowledged': False,
'type': tokenTypes["StartTag"],
'data': attrs}
out = p.normalizeToken(token)
attr_order = list(out["data"].keys())
assert attr_order == [x for x, i in attrs]
def test_duplicate_attribute():
# This is here because we impl it in parser and not tokenizer
doc = parse('<p class=a class=b>')
el = doc[1][0]
assert el.get("class") == "a"
def test_maintain_duplicate_attribute_order():
# This is here because we impl it in parser and not tokenizer
p = HTMLParser()
attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
token = {'name': 'html',
'selfClosing': False,
'selfClosingAcknowledged': False,
'type': tokenTypes["StartTag"],
'data': attrs + [('a', len(attrs))]}
out = p.normalizeToken(token)
attr_order = list(out["data"].keys())
assert attr_order == [x for x, i in attrs]
def test_debug_log():
parser = HTMLParser(debug=True)
parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")
expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]
if PY2:
for i, log in enumerate(expected):
log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
expected[i] = tuple(log)
assert parser.log == expected
def test_no_duplicate_clone():
frag = parseFragment("<b><em><foo><foob><fooc><aside></b></em>")
assert len(frag) == 2
def test_self_closing_col():
parser = HTMLParser()
parser.parseFragment('<table><colgroup><col /></colgroup></table>')
assert not parser.errors

@ -0,0 +1,127 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib import constants, parseFragment, serialize
from html5lib.filters import sanitizer
def runSanitizerTest(_, expected, input):
parsed = parseFragment(expected)
expected = serialize(parsed,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char='"',
alphabetical_attributes=True)
assert expected == sanitize_html(input)
def sanitize_html(stream):
parsed = parseFragment(stream)
serialized = serialize(parsed,
sanitize=True,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char='"',
alphabetical_attributes=True)
return serialized
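# Illustrative example: sanitize_html('<script>x</script>') escapes the
# disallowed element, returning '&lt;script&gt;x&lt;/script&gt;'.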
def test_should_handle_astral_plane_characters():
sanitized = sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
expected = '<p>\U0001d4b5 \U0001d538</p>'
assert expected == sanitized
def test_should_allow_relative_uris():
sanitized = sanitize_html('<p><a href="/example.com"></a></p>')
expected = '<p><a href="/example.com"></a></p>'
assert expected == sanitized
def test_invalid_data_uri():
sanitized = sanitize_html('<audio controls="" src="data:foobar"></audio>')
expected = '<audio controls></audio>'
assert expected == sanitized
def test_invalid_ipv6_url():
sanitized = sanitize_html('<a href="h://]">')
expected = "<a></a>"
assert expected == sanitized
def test_data_uri_disallowed_type():
sanitized = sanitize_html('<audio controls="" src="data:text/html,<html>"></audio>')
expected = "<audio controls></audio>"
assert expected == sanitized
def test_sanitizer():
for ns, tag_name in sanitizer.allowed_elements:
if ns != constants.namespaces["html"]:
continue
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td',
'tfoot', 'th', 'thead', 'tr', 'select']:
continue # TODO
if tag_name == 'image':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
elif tag_name == 'br':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
elif tag_name in constants.voidElements:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
else:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
for ns, attribute_name in sanitizer.allowed_attributes:
if ns is not None:
continue
if attribute_name != attribute_name.lower():
continue # TODO
if attribute_name == 'style':
continue
attribute_value = 'foo'
if attribute_name in sanitizer.attr_val_is_uri:
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
"<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value))
for protocol in sanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
for protocol in sanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
protocol = protocol.upper()
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
def test_lowercase_color_codes_in_style():
sanitized = sanitize_html("<p style=\"border: 1px solid #a2a2a2;\"></p>")
expected = '<p style=\"border: 1px solid #a2a2a2;\"></p>'
assert expected == sanitized
def test_uppercase_color_codes_in_style():
sanitized = sanitize_html("<p style=\"border: 1px solid #A2A2A2;\"></p>")
expected = '<p style=\"border: 1px solid #A2A2A2;\"></p>'
assert expected == sanitized

@ -0,0 +1,225 @@
from __future__ import absolute_import, division, unicode_literals
import os
import json
import pytest
from .support import get_data_files
from html5lib import constants
from html5lib.filters.lint import Filter as Lint
from html5lib.serializer import HTMLSerializer, serialize
from html5lib.treewalkers.base import TreeWalker
# pylint:disable=wrong-import-position
optionals_loaded = []
try:
from lxml import etree
optionals_loaded.append("lxml")
except ImportError:
pass
# pylint:enable=wrong-import-position
default_namespace = constants.namespaces["html"]
class JsonWalker(TreeWalker):
def __iter__(self):
for token in self.tree:
type = token[0]
if type == "StartTag":
if len(token) == 4:
namespace, name, attrib = token[1:4]
else:
namespace = default_namespace
name, attrib = token[1:3]
yield self.startTag(namespace, name, self._convertAttrib(attrib))
elif type == "EndTag":
if len(token) == 3:
namespace, name = token[1:3]
else:
namespace = default_namespace
name = token[1]
yield self.endTag(namespace, name)
elif type == "EmptyTag":
if len(token) == 4:
namespace, name, attrib = token[1:]
else:
namespace = default_namespace
name, attrib = token[1:]
for token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
yield token
elif type == "Comment":
yield self.comment(token[1])
elif type in ("Characters", "SpaceCharacters"):
for token in self.text(token[1]):
yield token
elif type == "Doctype":
if len(token) == 4:
yield self.doctype(token[1], token[2], token[3])
elif len(token) == 3:
yield self.doctype(token[1], token[2])
else:
yield self.doctype(token[1])
else:
raise ValueError("Unknown token type: " + type)
def _convertAttrib(self, attribs):
"""html5lib tree-walkers use a dict of (namespace, name): value for
attributes, but JSON cannot represent this. Convert from the format
in the serializer tests (a list of dicts with "namespace", "name",
and "value" as keys) to html5lib's tree-walker format."""
attrs = {}
for attrib in attribs:
name = (attrib["namespace"], attrib["name"])
assert(name not in attrs)
attrs[name] = attrib["value"]
return attrs
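# For example (illustrative), the serializer-test attribute list
#     [{"namespace": None, "name": "title", "value": "foo"}]
# becomes the tree-walker mapping {(None, "title"): "foo"}.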
def serialize_html(input, options):
options = dict([(str(k), v) for k, v in options.items()])
encoding = options.get("encoding", None)
if "encoding" in options:
del options["encoding"]
stream = Lint(JsonWalker(input), False)
serializer = HTMLSerializer(alphabetical_attributes=True, **options)
return serializer.render(stream, encoding)
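# Illustrative use with a hypothetical token stream:
#     serialize_html([["StartTag", "http://www.w3.org/1999/xhtml", "b", {}]], {})
# returns "<b>".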
def runSerializerTest(input, expected, options):
encoding = options.get("encoding", None)
if encoding:
expected = list(map(lambda x: x.encode(encoding), expected))
result = serialize_html(input, options)
if len(expected) == 1:
assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
elif result not in expected:
assert False, "Expected: %s, Received: %s" % (expected, result)
def throwsWithLatin1(input):
with pytest.raises(UnicodeEncodeError):
serialize_html(input, {"encoding": "iso-8859-1"})
def testDoctypeName():
throwsWithLatin1([["Doctype", "\u0101"]])
def testDoctypePublicId():
throwsWithLatin1([["Doctype", "potato", "\u0101"]])
def testDoctypeSystemId():
throwsWithLatin1([["Doctype", "potato", "potato", "\u0101"]])
def testCdataCharacters():
runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
["<style>&amacr;"], {"encoding": "iso-8859-1"})
def testCharacters():
runSerializerTest([["Characters", "\u0101"]],
["&amacr;"], {"encoding": "iso-8859-1"})
def testStartTagName():
throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]])
def testAttributeName():
throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": None, "name": "\u0101", "value": "potato"}]]])
def testAttributeValue():
runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "span",
[{"namespace": None, "name": "potato", "value": "\u0101"}]]],
["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})
def testEndTagName():
throwsWithLatin1([["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]])
def testComment():
throwsWithLatin1([["Comment", "\u0101"]])
def testThrowsUnknownOption():
with pytest.raises(TypeError):
HTMLSerializer(foobar=None)
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
def testSpecQuoteAttribute(c):
input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
[{"namespace": None, "name": "foo", "value": c}]]]
if c == '"':
output_ = ["<span foo='%s'>" % c]
else:
output_ = ['<span foo="%s">' % c]
options_ = {"quote_attr_values": "spec"}
runSerializerTest(input_, output_, options_)
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000"))
def testLegacyQuoteAttribute(c):
input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
[{"namespace": None, "name": "foo", "value": c}]]]
if c == '"':
output_ = ["<span foo='%s'>" % c]
else:
output_ = ['<span foo="%s">' % c]
options_ = {"quote_attr_values": "legacy"}
runSerializerTest(input_, output_, options_)
@pytest.fixture
def lxml_parser():
return etree.XMLParser(resolve_entities=False)
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityReplacement(lxml_parser):
doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
result = serialize(tree, tree="lxml", omit_optional_tags=False)
assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>'
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityXML(lxml_parser):
doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
result = serialize(tree, tree="lxml", omit_optional_tags=False)
assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityNoResolve(lxml_parser):
doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
result = serialize(tree, tree="lxml", omit_optional_tags=False,
resolve_entities=False)
assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
def test_serializer():
for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
with open(filename) as fp:
tests = json.load(fp)
for test in tests['tests']:
yield runSerializerTest, test["input"], test["expected"], test.get("options", {})

@ -0,0 +1,323 @@
from __future__ import absolute_import, division, unicode_literals
from . import support # noqa
import codecs
import sys
from io import BytesIO, StringIO
import pytest
import six
from six.moves import http_client, urllib
from html5lib._inputstream import (BufferedStream, HTMLInputStream,
HTMLUnicodeInputStream, HTMLBinaryInputStream)
from html5lib._utils import supports_lone_surrogates
def test_basic():
s = b"abc"
fp = BufferedStream(BytesIO(s))
read = fp.read(10)
assert read == s
def test_read_length():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
assert read1 == b"a"
read2 = fp.read(2)
assert read2 == b"bc"
read3 = fp.read(3)
assert read3 == b"def"
read4 = fp.read(4)
assert read4 == b""
def test_tell():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
assert read1 == b"a"
assert fp.tell() == 1
read2 = fp.read(2)
assert read2 == b"bc"
assert fp.tell() == 3
read3 = fp.read(3)
assert read3 == b"def"
assert fp.tell() == 6
read4 = fp.read(4)
assert read4 == b""
assert fp.tell() == 6
def test_seek():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
assert read1 == b"a"
fp.seek(0)
read2 = fp.read(1)
assert read2 == b"a"
read3 = fp.read(2)
assert read3 == b"bc"
fp.seek(2)
read4 = fp.read(2)
assert read4 == b"cd"
fp.seek(4)
read5 = fp.read(2)
assert read5 == b"ef"
def test_seek_tell():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
assert read1 == b"a"
assert fp.tell() == 1
fp.seek(0)
read2 = fp.read(1)
assert read2 == b"a"
assert fp.tell() == 1
read3 = fp.read(2)
assert read3 == b"bc"
assert fp.tell() == 3
fp.seek(2)
read4 = fp.read(2)
assert read4 == b"cd"
assert fp.tell() == 4
fp.seek(4)
read5 = fp.read(2)
assert read5 == b"ef"
assert fp.tell() == 6
class HTMLUnicodeInputStreamShortChunk(HTMLUnicodeInputStream):
_defaultChunkSize = 2
class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
_defaultChunkSize = 2
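# A two-character chunk size forces newlines and multi-character sequences
# to straddle chunk boundaries, so the tests below also exercise the
# streams' cross-chunk position bookkeeping.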
def test_char_ascii():
stream = HTMLInputStream(b"'", override_encoding='ascii')
assert stream.charEncoding[0].name == 'windows-1252'
assert stream.char() == "'"
def test_char_utf8():
stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8')
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == '\u2018'
def test_char_win1252():
stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252'))
assert stream.charEncoding[0].name == 'windows-1252'
assert stream.char() == "\xa9"
assert stream.char() == "\xf1"
assert stream.char() == "\u2019"
def test_bom():
stream = HTMLInputStream(codecs.BOM_UTF8 + b"'")
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == "'"
def test_utf_16():
stream = HTMLInputStream((' ' * 1025).encode('utf-16'))
assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be']
assert len(stream.charsUntil(' ', True)) == 1025
def test_newlines():
stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe")
assert stream.position() == (1, 0)
assert stream.charsUntil('c') == "a\nbb\n"
assert stream.position() == (3, 0)
assert stream.charsUntil('x') == "ccc\ndddd"
assert stream.position() == (4, 4)
assert stream.charsUntil('e') == "x"
assert stream.position() == (4, 5)
def test_newlines2():
size = HTMLUnicodeInputStream._defaultChunkSize
stream = HTMLInputStream("\r" * size + "\n")
assert stream.charsUntil('x') == "\n" * size
def test_position():
stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh")
assert stream.position() == (1, 0)
assert stream.charsUntil('c') == "a\nbb\n"
assert stream.position() == (3, 0)
stream.unget("\n")
assert stream.position() == (2, 2)
assert stream.charsUntil('c') == "\n"
assert stream.position() == (3, 0)
stream.unget("\n")
assert stream.position() == (2, 2)
assert stream.char() == "\n"
assert stream.position() == (3, 0)
assert stream.charsUntil('e') == "ccc\nddd"
assert stream.position() == (4, 3)
assert stream.charsUntil('h') == "e\nf\ng"
assert stream.position() == (6, 1)
def test_position2():
stream = HTMLUnicodeInputStreamShortChunk("abc\nd")
assert stream.position() == (1, 0)
assert stream.char() == "a"
assert stream.position() == (1, 1)
assert stream.char() == "b"
assert stream.position() == (1, 2)
assert stream.char() == "c"
assert stream.position() == (1, 3)
assert stream.char() == "\n"
assert stream.position() == (2, 0)
assert stream.char() == "d"
assert stream.position() == (2, 1)
def test_python_issue_20007():
"""
Make sure we have a work-around for Python bug #20007
http://bugs.python.org/issue20007
"""
class FakeSocket(object):
def makefile(self, _mode, _bufsize=None):
# pylint:disable=unused-argument
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
source = http_client.HTTPResponse(FakeSocket())
source.begin()
stream = HTMLInputStream(source)
assert stream.charsUntil(" ") == "Text"
def test_python_issue_20007_b():
"""
Make sure we have a work-around for Python bug #20007
http://bugs.python.org/issue20007
"""
if six.PY2:
return
class FakeSocket(object):
def makefile(self, _mode, _bufsize=None):
# pylint:disable=unused-argument
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
source = http_client.HTTPResponse(FakeSocket())
source.begin()
wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
stream = HTMLInputStream(wrapped)
assert stream.charsUntil(" ") == "Text"
@pytest.mark.parametrize("inp,num",
[("\u0000", 0),
("\u0001", 1),
("\u0008", 1),
("\u0009", 0),
("\u000A", 0),
("\u000B", 1),
("\u000C", 0),
("\u000D", 0),
("\u000E", 1),
("\u001F", 1),
("\u0020", 0),
("\u007E", 0),
("\u007F", 1),
("\u009F", 1),
("\u00A0", 0),
("\uFDCF", 0),
("\uFDD0", 1),
("\uFDEF", 1),
("\uFDF0", 0),
("\uFFFD", 0),
("\uFFFE", 1),
("\uFFFF", 1),
("\U0001FFFD", 0),
("\U0001FFFE", 1),
("\U0001FFFF", 1),
("\U0002FFFD", 0),
("\U0002FFFE", 1),
("\U0002FFFF", 1),
("\U0003FFFD", 0),
("\U0003FFFE", 1),
("\U0003FFFF", 1),
("\U0004FFFD", 0),
("\U0004FFFE", 1),
("\U0004FFFF", 1),
("\U0005FFFD", 0),
("\U0005FFFE", 1),
("\U0005FFFF", 1),
("\U0006FFFD", 0),
("\U0006FFFE", 1),
("\U0006FFFF", 1),
("\U0007FFFD", 0),
("\U0007FFFE", 1),
("\U0007FFFF", 1),
("\U0008FFFD", 0),
("\U0008FFFE", 1),
("\U0008FFFF", 1),
("\U0009FFFD", 0),
("\U0009FFFE", 1),
("\U0009FFFF", 1),
("\U000AFFFD", 0),
("\U000AFFFE", 1),
("\U000AFFFF", 1),
("\U000BFFFD", 0),
("\U000BFFFE", 1),
("\U000BFFFF", 1),
("\U000CFFFD", 0),
("\U000CFFFE", 1),
("\U000CFFFF", 1),
("\U000DFFFD", 0),
("\U000DFFFE", 1),
("\U000DFFFF", 1),
("\U000EFFFD", 0),
("\U000EFFFE", 1),
("\U000EFFFF", 1),
("\U000FFFFD", 0),
("\U000FFFFE", 1),
("\U000FFFFF", 1),
("\U0010FFFD", 0),
("\U0010FFFE", 1),
("\U0010FFFF", 1),
("\x01\x01\x01", 3),
("a\x01a\x01a\x01a", 3)])
def test_invalid_codepoints(inp, num):
stream = HTMLUnicodeInputStream(StringIO(inp))
for _i in range(len(inp)):
stream.char()
assert len(stream.errors) == num
@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
@pytest.mark.parametrize("inp,num",
[("'\\uD7FF'", 0),
("'\\uD800'", 1),
("'\\uDBFF'", 1),
("'\\uDC00'", 1),
("'\\uDFFF'", 1),
("'\\uE000'", 0),
("'\\uD800\\uD800\\uD800'", 3),
("'a\\uD800a\\uD800a\\uD800a'", 3),
("'\\uDFFF\\uDBFF'", 2),
pytest.mark.skipif(sys.maxunicode == 0xFFFF,
("'\\uDBFF\\uDFFF'", 2),
reason="narrow Python")])
def test_invalid_codepoints_surrogates(inp, num):
inp = eval(inp) # pylint:disable=eval-used
fp = StringIO(inp)
if ord(max(fp.read())) > 0xFFFF:
pytest.skip("StringIO altered string")
fp.seek(0)
stream = HTMLUnicodeInputStream(fp)
for _i in range(len(inp)):
stream.char()
assert len(stream.errors) == num

@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals
from . import support # noqa
import html5lib
from html5lib.treeadapters import sax
from html5lib.treewalkers import getTreeWalker
def test_to_sax():
handler = support.TracingSaxHandler()
tree = html5lib.parse("""<html xml:lang="en">
<title>Directory Listing</title>
<a href="/"><b/></p>
""", treebuilder="etree")
walker = getTreeWalker("etree")
sax.to_sax(walker(tree), handler)
expected = [
'startDocument',
('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
'html', {(None, 'xml:lang'): 'en'}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
('characters', 'Directory Listing'),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
('characters', '\n '),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
('characters', '\n '),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
'endDocument',
]
assert expected == handler.visited

@ -0,0 +1,136 @@
from __future__ import absolute_import, division, unicode_literals
import itertools
import pytest
try:
import lxml.etree
except ImportError:
pass
from .support import treeTypes
from html5lib import html5parser, treewalkers
from html5lib.filters.lint import Filter as Lint
import re
attrlist = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+", re.M)
def sortattrs(x):
lines = x.group(0).split("\n")
lines.sort()
return "\n".join(lines)
def test_all_tokens():
expected = [
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'data': 'a', 'type': 'Characters'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'data': 'b', 'type': 'Characters'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'data': 'c', 'type': 'Characters'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
]
for _, treeCls in sorted(treeTypes.items()):
if treeCls is None:
continue
p = html5parser.HTMLParser(tree=treeCls["builder"])
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
document = treeCls.get("adapter", lambda x: x)(document)
output = Lint(treeCls["walker"](document))
for expectedToken, outputToken in zip(expected, output):
assert expectedToken == outputToken
def set_attribute_on_first_child(docfrag, name, value, treeName):
"""naively sets an attribute on the first child of the document
fragment passed in"""
setter = {'ElementTree': lambda d: d[0].set,
'DOM': lambda d: d.firstChild.setAttribute}
setter['cElementTree'] = setter['ElementTree']
try:
setter.get(treeName, setter['DOM'])(docfrag)(name, value)
except AttributeError:
setter['ElementTree'](docfrag)(name, value)
def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
"""tests what happens when we add attributes to the intext"""
treeName, treeClass = tree
if treeClass is None:
pytest.skip("Treebuilder not loaded")
parser = html5parser.HTMLParser(tree=treeClass["builder"])
document = parser.parseFragment(intext)
for nom, val in attrs_to_add:
set_attribute_on_first_child(document, nom, val, treeName)
document = treeClass.get("adapter", lambda x: x)(document)
output = treewalkers.pprint(treeClass["walker"](document))
output = attrlist.sub(sortattrs, output)
if output not in expected:
raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
def test_treewalker_six_mix():
"""Str/Unicode mix. If str attrs added to tree"""
# On Python 2.x string literals are of type str. Unless, like this
# file, the programmer imports unicode_literals from __future__.
# In that case, string literals become objects of type unicode.
# This test simulates a Py2 user, modifying attributes on a document
# fragment but not using the u'' syntax nor importing unicode_literals
sm_tests = [
('<a href="http://example.com">Example</a>',
[(str('class'), str('test123'))],
'<a>\n class="test123"\n href="http://example.com"\n "Example"'),
('<link href="http://example.com/cow">',
[(str('rel'), str('alternate'))],
'<link>\n href="http://example.com/cow"\n rel="alternate"\n "Example"')
]
for tree in sorted(treeTypes.items()):
for intext, attrs, expected in sm_tests:
yield runTreewalkerEditTest, intext, expected, attrs, tree
@pytest.mark.parametrize("tree,char", itertools.product(sorted(treeTypes.items()), ["x", "\u1234"]))
def test_fragment_single_char(tree, char):
expected = [
{'data': char, 'type': 'Characters'}
]
treeName, treeClass = tree
if treeClass is None:
pytest.skip("Treebuilder not loaded")
parser = html5parser.HTMLParser(tree=treeClass["builder"])
document = parser.parseFragment(char)
document = treeClass.get("adapter", lambda x: x)(document)
output = Lint(treeClass["walker"](document))
assert list(output) == expected
@pytest.mark.skipif(treeTypes["lxml"] is None, reason="lxml not importable")
def test_lxml_xml():
expected = [
{'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'},
{'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'},
{'name': 'div', 'namespace': None, 'type': 'EndTag'},
{'name': 'div', 'namespace': None, 'type': 'EndTag'}
]
lxmltree = lxml.etree.fromstring('<div><div></div></div>')
walker = treewalkers.getTreeWalker('lxml')
output = Lint(walker(lxmltree))
assert list(output) == expected

@ -0,0 +1,125 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib.filters.whitespace import Filter
from html5lib.constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
def runTest(input, expected):
output = list(Filter(input))
errorMsg = "\n".join(["\n\nInput:", str(input),
"\nExpected:", str(expected),
"\nReceived:", str(output)])
assert expected == output, errorMsg
def runTestUnmodifiedOutput(input):
runTest(input, input)
def testPhrasingElements():
runTestUnmodifiedOutput(
[{"type": "Characters", "data": "This is a "},
{"type": "StartTag", "name": "span", "data": []},
{"type": "Characters", "data": "phrase"},
{"type": "EndTag", "name": "span", "data": []},
{"type": "SpaceCharacters", "data": " "},
{"type": "Characters", "data": "with"},
{"type": "SpaceCharacters", "data": " "},
{"type": "StartTag", "name": "em", "data": []},
{"type": "Characters", "data": "emphasised text"},
{"type": "EndTag", "name": "em", "data": []},
{"type": "Characters", "data": " and an "},
{"type": "StartTag", "name": "img", "data": [["alt", "image"]]},
{"type": "Characters", "data": "."}])
def testLeadingWhitespace():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "SpaceCharacters", "data": spaceCharacters},
{"type": "Characters", "data": "foo"},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "SpaceCharacters", "data": " "},
{"type": "Characters", "data": "foo"},
{"type": "EndTag", "name": "p", "data": []}])
def testLeadingWhitespaceAsCharacters():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": spaceCharacters + "foo"},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": " foo"},
{"type": "EndTag", "name": "p", "data": []}])
def testTrailingWhitespace():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo"},
{"type": "SpaceCharacters", "data": spaceCharacters},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo"},
{"type": "SpaceCharacters", "data": " "},
{"type": "EndTag", "name": "p", "data": []}])
def testTrailingWhitespaceAsCharacters():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo" + spaceCharacters},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo "},
{"type": "EndTag", "name": "p", "data": []}])
def testWhitespace():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo" + spaceCharacters + "bar"},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo bar"},
{"type": "EndTag", "name": "p", "data": []}])
def testLeadingWhitespaceInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "SpaceCharacters", "data": spaceCharacters},
{"type": "Characters", "data": "foo"},
{"type": "EndTag", "name": "pre", "data": []}])
def testLeadingWhitespaceAsCharactersInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "Characters", "data": spaceCharacters + "foo"},
{"type": "EndTag", "name": "pre", "data": []}])
def testTrailingWhitespaceInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "Characters", "data": "foo"},
{"type": "SpaceCharacters", "data": spaceCharacters},
{"type": "EndTag", "name": "pre", "data": []}])
def testTrailingWhitespaceAsCharactersInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "Characters", "data": "foo" + spaceCharacters},
{"type": "EndTag", "name": "pre", "data": []}])
def testWhitespaceInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "Characters", "data": "foo" + spaceCharacters + "bar"},
{"type": "EndTag", "name": "pre", "data": []}])

View file

@ -0,0 +1,252 @@
from __future__ import absolute_import, division, unicode_literals
import codecs
import json
import warnings
import re
import pytest
from six import unichr
from html5lib._tokenizer import HTMLTokenizer
from html5lib import constants, _utils
class TokenizerTestParser(object):
def __init__(self, initialState, lastStartTag=None):
self.tokenizer = HTMLTokenizer
self._state = initialState
self._lastStartTag = lastStartTag
def parse(self, stream, encoding=None, innerHTML=False):
# pylint:disable=unused-argument
tokenizer = self.tokenizer(stream, encoding)
self.outputTokens = []
tokenizer.state = getattr(tokenizer, self._state)
if self._lastStartTag is not None:
tokenizer.currentToken = {"type": "startTag",
"name": self._lastStartTag}
types = dict((v, k) for k, v in constants.tokenTypes.items())
for token in tokenizer:
getattr(self, 'process%s' % types[token["type"]])(token)
return self.outputTokens
def processDoctype(self, token):
self.outputTokens.append(["DOCTYPE", token["name"], token["publicId"],
token["systemId"], token["correct"]])
def processStartTag(self, token):
self.outputTokens.append(["StartTag", token["name"],
dict(token["data"][::-1]), token["selfClosing"]])
def processEmptyTag(self, token):
if token["name"] not in constants.voidElements:
self.outputTokens.append("ParseError")
self.outputTokens.append(["StartTag", token["name"], dict(token["data"][::-1])])
def processEndTag(self, token):
self.outputTokens.append(["EndTag", token["name"],
token["selfClosing"]])
def processComment(self, token):
self.outputTokens.append(["Comment", token["data"]])
def processSpaceCharacters(self, token):
self.outputTokens.append(["Character", token["data"]])
self.processSpaceCharacters = self.processCharacters
def processCharacters(self, token):
self.outputTokens.append(["Character", token["data"]])
def processEOF(self, token):
pass
def processParseError(self, token):
self.outputTokens.append(["ParseError", token["data"]])
def concatenateCharacterTokens(tokens):
outputTokens = []
for token in tokens:
if "ParseError" not in token and token[0] == "Character":
if (outputTokens and "ParseError" not in outputTokens[-1] and
outputTokens[-1][0] == "Character"):
outputTokens[-1][1] += token[1]
else:
outputTokens.append(token)
else:
outputTokens.append(token)
return outputTokens
def normalizeTokens(tokens):
# TODO: convert tests to reflect arrays
for i, token in enumerate(tokens):
if token[0] == 'ParseError':
tokens[i] = token[0]
return tokens
def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
ignoreErrors=False):
"""Test whether the test has passed or failed
If the ignoreErrorOrder flag is set to true, we don't test the relative
positions of parse errors and non-parse errors
"""
checkSelfClosing = False
for token in expectedTokens:
if (token[0] == "StartTag" and len(token) == 4 or
token[0] == "EndTag" and len(token) == 3):
checkSelfClosing = True
break
if not checkSelfClosing:
for token in receivedTokens:
if token[0] == "StartTag" or token[0] == "EndTag":
token.pop()
if not ignoreErrorOrder and not ignoreErrors:
expectedTokens = concatenateCharacterTokens(expectedTokens)
return expectedTokens == receivedTokens
else:
# Sort the tokens into two groups; non-parse errors and parse errors
tokens = {"expected": [[], []], "received": [[], []]}
for tokenType, tokenList in zip(list(tokens.keys()),
(expectedTokens, receivedTokens)):
for token in tokenList:
if token != "ParseError":
tokens[tokenType][0].append(token)
else:
if not ignoreErrors:
tokens[tokenType][1].append(token)
tokens[tokenType][0] = concatenateCharacterTokens(tokens[tokenType][0])
return tokens["expected"] == tokens["received"]
_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
def unescape(test):
def decode(inp):
"""Decode \\uXXXX escapes
This decodes \\uXXXX escapes, possibly into non-BMP characters when
two surrogate character escapes are adjacent to each other.
"""
# This cannot be implemented using the unicode_escape codec
# because that requires its input be ISO-8859-1, and we need
# arbitrary unicode as input.
def repl(m):
if m.group(2) is not None:
high = int(m.group(1), 16)
low = int(m.group(2), 16)
if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
return unichr(cp)
else:
return unichr(high) + unichr(low)
else:
return unichr(int(m.group(1), 16))
try:
return _surrogateRe.sub(repl, inp)
except ValueError:
# This occurs when unichr throws ValueError, which should
# only be for a lone-surrogate.
if _utils.supports_lone_surrogates:
raise
return None
test["input"] = decode(test["input"])
for token in test["output"]:
if token == "ParseError":
continue
else:
token[1] = decode(token[1])
if len(token) > 2:
for key, value in list(token[2].items()):
del token[2][key]
token[2][decode(key)] = decode(value)
return test
def _doCapitalize(match):
return match.group(1).upper()
_capitalizeRe = re.compile(r"\W+(\w)").sub
def capitalize(s):
s = s.lower()
s = _capitalizeRe(_doCapitalize, s)
return s
class TokenizerFile(pytest.File):
def collect(self):
with codecs.open(str(self.fspath), "r", encoding="utf-8") as fp:
tests = json.load(fp)
if 'tests' in tests:
for i, test in enumerate(tests['tests']):
yield TokenizerTestCollector(str(i), self, testdata=test)
class TokenizerTestCollector(pytest.Collector):
def __init__(self, name, parent=None, config=None, session=None, testdata=None):
super(TokenizerTestCollector, self).__init__(name, parent, config, session)
if 'initialStates' not in testdata:
testdata["initialStates"] = ["Data state"]
if 'doubleEscaped' in testdata:
testdata = unescape(testdata)
self.testdata = testdata
def collect(self):
for initialState in self.testdata["initialStates"]:
initialState = capitalize(initialState)
item = TokenizerTest(initialState,
self,
self.testdata,
initialState)
if self.testdata["input"] is None:
item.add_marker(pytest.mark.skipif(True, reason="Relies on lone surrogates"))
yield item
class TokenizerTest(pytest.Item):
def __init__(self, name, parent, test, initialState):
super(TokenizerTest, self).__init__(name, parent)
self.obj = lambda: 1 # this is to hack around skipif needing a function!
self.test = test
self.initialState = initialState
def runtest(self):
warnings.resetwarnings()
warnings.simplefilter("error")
expected = self.test['output']
if 'lastStartTag' not in self.test:
self.test['lastStartTag'] = None
parser = TokenizerTestParser(self.initialState,
self.test['lastStartTag'])
tokens = parser.parse(self.test['input'])
received = normalizeTokens(tokens)
errorMsg = "\n".join(["\n\nInitial state:",
self.initialState,
"\nInput:", self.test['input'],
"\nExpected:", repr(expected),
"\nreceived:", repr(tokens)])
ignoreErrorOrder = self.test.get('ignoreErrorOrder', False)
assert tokensMatch(expected, received, ignoreErrorOrder, True), errorMsg
def repr_failure(self, excinfo):
traceback = excinfo.traceback
ntraceback = traceback.cut(path=__file__)
excinfo.traceback = ntraceback.filter()
return excinfo.getrepr(funcargs=True,
showlocals=False,
style="short", tbfilter=False)

View file

@ -0,0 +1,68 @@
from __future__ import absolute_import, division, unicode_literals
import sys
import os
import json
import re
import html5lib
from . import support
from . import test_tokenizer
p = html5lib.HTMLParser()
unnamespaceExpected = re.compile(r"^(\|\s*)<html ([^>]+)>", re.M).sub
def main(out_path):
if not os.path.exists(out_path):
sys.stderr.write("Path %s does not exist" % out_path)
sys.exit(1)
for filename in support.get_data_files('tokenizer', '*.test'):
run_file(filename, out_path)
def run_file(filename, out_path):
try:
tests_data = json.load(open(filename, "r"))
except ValueError:
sys.stderr.write("Failed to load %s\n" % filename)
return
name = os.path.splitext(os.path.split(filename)[1])[0]
output_file = open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w")
if 'tests' in tests_data:
for test_data in tests_data['tests']:
if 'initialStates' not in test_data:
test_data["initialStates"] = ["Data state"]
for initial_state in test_data["initialStates"]:
if initial_state != "Data state":
# don't support this yet
continue
test = make_test(test_data)
output_file.write(test)
output_file.close()
def make_test(test_data):
if 'doubleEscaped' in test_data:
test_data = test_tokenizer.unescape(test_data)
rv = []
rv.append("#data")
rv.append(test_data["input"].encode("utf8"))
rv.append("#errors")
tree = p.parse(test_data["input"])
output = p.tree.testSerializer(tree)
output = "\n".join(("| " + line[3:]) if line.startswith("| ") else line
for line in output.split("\n"))
output = unnamespaceExpected(r"\1<\2>", output)
rv.append(output.encode("utf8"))
rv.append("")
return "\n".join(rv)
if __name__ == "__main__":
main(sys.argv[1])
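
A hedged invocation sketch for the converter above; the module path html5lib.tests.tokenizertotree is an assumption inferred from the relative imports, and the output directory is hypothetical (it must already exist):

    from html5lib.tests import tokenizertotree

    # Writes one tokenizer_<name>.dat file per tokenizer test file.
    tokenizertotree.main("/tmp/tokenizer-dats")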

View file

@ -0,0 +1,204 @@
from __future__ import absolute_import, division, unicode_literals
import itertools
import re
import warnings
from difflib import unified_diff
import pytest
from .support import TestData, convert, convertExpected, treeTypes
from html5lib import html5parser, constants, treewalkers
from html5lib.filters.lint import Filter as Lint
_attrlist_re = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+", re.M)
def sortattrs(s):
def replace(m):
lines = m.group(0).split("\n")
lines.sort()
return "\n".join(lines)
return _attrlist_re.sub(replace, s)
class TreeConstructionFile(pytest.File):
def collect(self):
tests = TestData(str(self.fspath), "data")
for i, test in enumerate(tests):
yield TreeConstructionTest(str(i), self, testdata=test)
class TreeConstructionTest(pytest.Collector):
def __init__(self, name, parent=None, config=None, session=None, testdata=None):
super(TreeConstructionTest, self).__init__(name, parent, config, session)
self.testdata = testdata
def collect(self):
for treeName, treeAPIs in sorted(treeTypes.items()):
for x in itertools.chain(self._getParserTests(treeName, treeAPIs),
self._getTreeWalkerTests(treeName, treeAPIs)):
yield x
def _getParserTests(self, treeName, treeAPIs):
if treeAPIs is not None and "adapter" in treeAPIs:
return
for namespaceHTMLElements in (True, False):
if namespaceHTMLElements:
nodeid = "%s::parser::namespaced" % treeName
else:
nodeid = "%s::parser::void-namespace" % treeName
item = ParserTest(nodeid,
self,
self.testdata,
treeAPIs["builder"] if treeAPIs is not None else None,
namespaceHTMLElements)
item.add_marker(getattr(pytest.mark, treeName))
item.add_marker(pytest.mark.parser)
if namespaceHTMLElements:
item.add_marker(pytest.mark.namespaced)
if treeAPIs is None:
item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
yield item
def _getTreeWalkerTests(self, treeName, treeAPIs):
nodeid = "%s::treewalker" % treeName
item = TreeWalkerTest(nodeid,
self,
self.testdata,
treeAPIs)
item.add_marker(getattr(pytest.mark, treeName))
item.add_marker(pytest.mark.treewalker)
if treeAPIs is None:
item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
yield item
def convertTreeDump(data):
return "\n".join(convert(3)(data).split("\n")[1:])
namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
class ParserTest(pytest.Item):
def __init__(self, name, parent, test, treeClass, namespaceHTMLElements):
super(ParserTest, self).__init__(name, parent)
self.obj = lambda: 1 # this is to hack around skipif needing a function!
self.test = test
self.treeClass = treeClass
self.namespaceHTMLElements = namespaceHTMLElements
def runtest(self):
p = html5parser.HTMLParser(tree=self.treeClass,
namespaceHTMLElements=self.namespaceHTMLElements)
input = self.test['data']
fragmentContainer = self.test['document-fragment']
expected = convertExpected(self.test['document'])
expectedErrors = self.test['errors'].split("\n") if self.test['errors'] else []
scripting = False
if 'script-on' in self.test:
scripting = True
with warnings.catch_warnings():
warnings.simplefilter("error")
try:
if fragmentContainer:
document = p.parseFragment(input, fragmentContainer, scripting=scripting)
else:
document = p.parse(input, scripting=scripting)
except constants.DataLossWarning:
pytest.skip("data loss warning")
output = convertTreeDump(p.tree.testSerializer(document))
if self.namespaceHTMLElements:
expected = namespaceExpected(r"\1<html \2>", expected)
errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
"\nReceived:", output])
assert expected == output, errorMsg
errStr = []
for (line, col), errorcode, datavars in p.errors:
assert isinstance(datavars, dict), "%s, %s" % (errorcode, repr(datavars))
errStr.append("Line: %i Col: %i %s" % (line, col,
constants.E[errorcode] % datavars))
errorMsg2 = "\n".join(["\n\nInput:", input,
"\nExpected errors (" + str(len(expectedErrors)) + "):\n" + "\n".join(expectedErrors),
"\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
if False: # we're currently not testing parse errors
assert len(p.errors) == len(expectedErrors), errorMsg2
def repr_failure(self, excinfo):
traceback = excinfo.traceback
ntraceback = traceback.cut(path=__file__)
excinfo.traceback = ntraceback.filter()
return excinfo.getrepr(funcargs=True,
showlocals=False,
style="short", tbfilter=False)
class TreeWalkerTest(pytest.Item):
def __init__(self, name, parent, test, treeAPIs):
super(TreeWalkerTest, self).__init__(name, parent)
self.obj = lambda: 1 # this is to hack around skipif needing a function!
self.test = test
self.treeAPIs = treeAPIs
def runtest(self):
p = html5parser.HTMLParser(tree=self.treeAPIs["builder"])
input = self.test['data']
fragmentContainer = self.test['document-fragment']
expected = convertExpected(self.test['document'])
scripting = False
if 'script-on' in self.test:
scripting = True
with warnings.catch_warnings():
warnings.simplefilter("error")
try:
if fragmentContainer:
document = p.parseFragment(input, fragmentContainer, scripting=scripting)
else:
document = p.parse(input, scripting=scripting)
except constants.DataLossWarning:
pytest.skip("data loss warning")
poutput = convertTreeDump(p.tree.testSerializer(document))
namespace_expected = namespaceExpected(r"\1<html \2>", expected)
if poutput != namespace_expected:
pytest.skip("parser output incorrect")
document = self.treeAPIs.get("adapter", lambda x: x)(document)
try:
output = treewalkers.pprint(Lint(self.treeAPIs["walker"](document)))
output = sortattrs(output)
expected = sortattrs(expected)
diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
[line + "\n" for line in output.splitlines()],
"Expected", "Received"))
assert expected == output, "\n".join([
"", "Input:", input,
"", "Expected:", expected,
"", "Received:", output,
"", "Diff:", diff,
])
except NotImplementedError:
pytest.skip("tree walker NotImplementedError")
def repr_failure(self, excinfo):
traceback = excinfo.traceback
ntraceback = traceback.cut(path=__file__)
excinfo.traceback = ntraceback.filter()
return excinfo.getrepr(funcargs=True,
showlocals=False,
style="short", tbfilter=False)

View file

@ -0,0 +1,3 @@
<!doctype html>
<title>Test</title>
<p>Hello World!

View file

@ -0,0 +1,3 @@
<!doctype html>
<title>Test</title>
<p>Hello World! ©

View file

@ -0,0 +1,30 @@
"""Tree adapters let you convert from one tree structure to another
Example:
.. code-block:: python
import html5lib
from html5lib.treeadapters import genshi
doc = '<html><body>Hi!</body></html>'
treebuilder = html5lib.getTreeBuilder('etree')
parser = html5lib.HTMLParser(tree=treebuilder)
tree = parser.parse(doc)
TreeWalker = html5lib.getTreeWalker('etree')
genshi_tree = genshi.to_genshi(TreeWalker(tree))
"""
from __future__ import absolute_import, division, unicode_literals
from . import sax
__all__ = ["sax"]
try:
from . import genshi # noqa
except ImportError:
pass
else:
__all__.append("genshi")

View file

@ -0,0 +1,54 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName, Attrs
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
def to_genshi(walker):
"""Convert a tree to a genshi tree
:arg walker: the treewalker to use to walk the tree to convert it
:returns: generator of genshi nodes
"""
text = []
for token in walker:
type = token["type"]
if type in ("Characters", "SpaceCharacters"):
text.append(token["data"])
elif text:
yield TEXT, "".join(text), (None, -1, -1)
text = []
if type in ("StartTag", "EmptyTag"):
if token["namespace"]:
name = "{%s}%s" % (token["namespace"], token["name"])
else:
name = token["name"]
attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
for attr, value in token["data"].items()])
yield (START, (QName(name), attrs), (None, -1, -1))
if type == "EmptyTag":
type = "EndTag"
if type == "EndTag":
if token["namespace"]:
name = "{%s}%s" % (token["namespace"], token["name"])
else:
name = token["name"]
yield END, QName(name), (None, -1, -1)
elif type == "Comment":
yield COMMENT, token["data"], (None, -1, -1)
elif type == "Doctype":
yield DOCTYPE, (token["name"], token["publicId"],
token["systemId"]), (None, -1, -1)
else:
pass # FIXME: What to do?
if text:
yield TEXT, "".join(text), (None, -1, -1)

View file

@ -0,0 +1,50 @@
from __future__ import absolute_import, division, unicode_literals
from xml.sax.xmlreader import AttributesNSImpl
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
if prefix is not None:
prefix_mapping[prefix] = namespace
def to_sax(walker, handler):
"""Call SAX-like content handler based on treewalker walker
:arg walker: the treewalker to use to walk the tree to convert it
:arg handler: SAX handler to use
"""
handler.startDocument()
for prefix, namespace in prefix_mapping.items():
handler.startPrefixMapping(prefix, namespace)
for token in walker:
type = token["type"]
if type == "Doctype":
continue
elif type in ("StartTag", "EmptyTag"):
attrs = AttributesNSImpl(token["data"],
unadjustForeignAttributes)
handler.startElementNS((token["namespace"], token["name"]),
token["name"],
attrs)
if type == "EmptyTag":
handler.endElementNS((token["namespace"], token["name"]),
token["name"])
elif type == "EndTag":
handler.endElementNS((token["namespace"], token["name"]),
token["name"])
elif type in ("Characters", "SpaceCharacters"):
handler.characters(token["data"])
elif type == "Comment":
pass
else:
assert False, "Unknown token type"
for prefix, namespace in prefix_mapping.items():
handler.endPrefixMapping(prefix)
handler.endDocument()
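
A small sketch wiring to_sax to a standard-library SAX handler; the Collector class is illustrative, not part of html5lib:

    import html5lib
    from html5lib.treeadapters.sax import to_sax
    from xml.sax.handler import ContentHandler

    class Collector(ContentHandler):
        def __init__(self):
            ContentHandler.__init__(self)
            self.text = []

        def characters(self, data):
            # Called by to_sax for Characters/SpaceCharacters tokens.
            self.text.append(data)

    tree = html5lib.parse("<p>Hello, SAX!</p>")
    walker = html5lib.getTreeWalker("etree")
    collector = Collector()
    to_sax(walker(tree), collector)
    print("".join(collector.text))  # Hello, SAX!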

View file

@ -0,0 +1,88 @@
"""A collection of modules for building different kinds of trees from HTML
documents.
To create a treebuilder for a new type of tree, you need to
implement several things:
1. A set of classes for various types of elements: Document, Doctype, Comment,
Element. These must implement the interface of ``base.treebuilders.Node``
(although comment nodes have a different signature for their constructor,
see ``treebuilders.etree.Comment``) Textual content may also be implemented
as another node type, or not, as your tree implementation requires.
2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
* ``documentClass`` - the class to use for the bottommost node of a document
* ``elementClass`` - the class to use for HTML Elements
* ``commentClass`` - the class to use for comments
* ``doctypeClass`` - the class to use for doctypes
It also has one required method:
* ``getDocument`` - Returns the root node of the complete document tree
3. If you wish to run the unit tests, you must also create a ``testSerializer``
method on your treebuilder which accepts a node and returns a string
containing Node and its children serialized according to the format used in
the unittests
"""
from __future__ import absolute_import, division, unicode_literals
from .._utils import default_etree
treeBuilderCache = {}
def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of trees with built-in support
:arg treeType: the name of the tree type required (case-insensitive). Supported
values are:
* "dom" - A generic builder for DOM implementations, defaulting to a
xml.dom.minidom based implementation.
* "etree" - A generic builder for tree implementations exposing an
ElementTree-like interface, defaulting to xml.etree.cElementTree if
available and xml.etree.ElementTree if not.
* "lxml" - A etree-based builder for lxml.etree, handling limitations
of lxml's implementation.
:arg implementation: (Currently applies to the "etree" and "dom" tree
types). A module implementing the tree type e.g. xml.etree.ElementTree
or xml.etree.cElementTree.
:arg kwargs: Any additional options to pass to the TreeBuilder when
creating it.
Example:
>>> from html5lib.treebuilders import getTreeBuilder
>>> builder = getTreeBuilder('etree')
"""
treeType = treeType.lower()
if treeType not in treeBuilderCache:
if treeType == "dom":
from . import dom
# Come up with a sane default (pref. from the stdlib)
if implementation is None:
from xml.dom import minidom
implementation = minidom
# NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "lxml":
from . import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
from . import etree
if implementation is None:
implementation = default_etree
# NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
else:
raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
return treeBuilderCache.get(treeType)
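
In practice the class returned by getTreeBuilder is handed to HTMLParser; a short usage sketch:

    import html5lib
    from html5lib.treebuilders import getTreeBuilder

    TreeBuilder = getTreeBuilder("dom")  # defaults to xml.dom.minidom
    parser = html5lib.HTMLParser(tree=TreeBuilder)
    document = parser.parse("<p>Hello</p>")
    print(document.toxml())  # a minidom Document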

View file

@ -0,0 +1,417 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from ..constants import scopingElements, tableInsertModeElements, namespaces
# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None
listElementsMap = {
None: (frozenset(scopingElements), False),
"button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
"list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")])), False),
"table": (frozenset([(namespaces["html"], "html"),
(namespaces["html"], "table")]), False),
"select": (frozenset([(namespaces["html"], "optgroup"),
(namespaces["html"], "option")]), True)
}
class Node(object):
"""Represents an item in the tree"""
def __init__(self, name):
"""Creates a Node
:arg name: The tag name associated with the node
"""
# The tag name associated with the node
self.name = name
# The parent of the current node (or None for the document node)
self.parent = None
# The value of the current node (applies to text nodes and comments)
self.value = None
# A dict holding name -> value pairs for attributes of the node
self.attributes = {}
# A list of child nodes of the current node. This must include all
# elements but not necessarily other node types.
self.childNodes = []
# A list of miscellaneous flags that can be set on the node.
self._flags = []
def __str__(self):
attributesStr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in
self.attributes.items()])
if attributesStr:
return "<%s %s>" % (self.name, attributesStr)
else:
return "<%s>" % (self.name)
def __repr__(self):
return "<%s>" % (self.name)
def appendChild(self, node):
"""Insert node as a child of the current node
:arg node: the node to insert
"""
raise NotImplementedError
def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text.
:arg data: the data to insert
:arg insertBefore: the child node to insert the text before, or None to
append the text to the end of the current node's text
"""
raise NotImplementedError
def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
the current node
:arg node: the node to insert
:arg refNode: the child node to insert the node before
"""
raise NotImplementedError
def removeChild(self, node):
"""Remove node from the children of the current node
:arg node: the child node to remove
"""
raise NotImplementedError
def reparentChildren(self, newParent):
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
text in the correct way
:arg newParent: the node to move all this node's children to
"""
# XXX - should this method be made more general?
for child in self.childNodes:
newParent.appendChild(child)
self.childNodes = []
def cloneNode(self):
"""Return a shallow copy of the current node i.e. a node with the same
name and attributes but with no parent or child nodes
"""
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text, false otherwise
"""
raise NotImplementedError
class ActiveFormattingElements(list):
def append(self, node):
equalCount = 0
if node != Marker:
for element in self[::-1]:
if element == Marker:
break
if self.nodesEqual(element, node):
equalCount += 1
if equalCount == 3:
self.remove(element)
break
list.append(self, node)
def nodesEqual(self, node1, node2):
if not node1.nameTuple == node2.nameTuple:
return False
if not node1.attributes == node2.attributes:
return False
return True
class TreeBuilder(object):
"""Base treebuilder implementation
* documentClass - the class to use for the bottommost node of a document
* elementClass - the class to use for HTML Elements
* commentClass - the class to use for comments
* doctypeClass - the class to use for doctypes
"""
# pylint:disable=not-callable
# Document class
documentClass = None
# The class to use for creating a node
elementClass = None
# The class to use for creating comments
commentClass = None
# The class to use for creating doctypes
doctypeClass = None
# Fragment class
fragmentClass = None
def __init__(self, namespaceHTMLElements):
"""Create a TreeBuilder
:arg namespaceHTMLElements: whether or not to namespace HTML elements
"""
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
self.reset()
def reset(self):
self.openElements = []
self.activeFormattingElements = ActiveFormattingElements()
# XXX - rename these to headElement, formElement
self.headPointer = None
self.formPointer = None
self.insertFromTable = False
self.document = self.documentClass()
def elementInScope(self, target, variant=None):
# If we pass a node in, we match that. If we pass a string, we
# match any node with that name.
exactNode = hasattr(target, "nameTuple")
if not exactNode:
if isinstance(target, text_type):
target = (namespaces["html"], target)
assert isinstance(target, tuple)
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
if exactNode and node == target:
return True
elif not exactNode and node.nameTuple == target:
return True
elif (invert ^ (node.nameTuple in listElements)):
return False
assert False # We should never reach this point
def reconstructActiveFormattingElements(self):
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
if not self.activeFormattingElements:
return
# Step 2 and step 3: we start with the last element. So i is -1.
i = len(self.activeFormattingElements) - 1
entry = self.activeFormattingElements[i]
if entry == Marker or entry in self.openElements:
return
# Step 6
while entry != Marker and entry not in self.openElements:
if i == 0:
# This will be reset to 0 below
i = -1
break
i -= 1
# Step 5: let entry be one earlier in the list.
entry = self.activeFormattingElements[i]
while True:
# Step 7
i += 1
# Step 8
entry = self.activeFormattingElements[i]
clone = entry.cloneNode() # Mainly to get a new copy of the attributes
# Step 9
element = self.insertElement({"type": "StartTag",
"name": clone.name,
"namespace": clone.namespace,
"data": clone.attributes})
# Step 10
self.activeFormattingElements[i] = element
# Step 11
if element == self.activeFormattingElements[-1]:
break
def clearActiveFormattingElements(self):
entry = self.activeFormattingElements.pop()
while self.activeFormattingElements and entry != Marker:
entry = self.activeFormattingElements.pop()
def elementInActiveFormattingElements(self, name):
"""Check if an element exists between the end of the active
formatting elements and the last marker. If it does, return it, else
return false"""
for item in self.activeFormattingElements[::-1]:
# Check for Marker first because if it's a Marker it doesn't have a
# name attribute.
if item == Marker:
break
elif item.name == name:
return item
return False
def insertRoot(self, token):
element = self.createElement(token)
self.openElements.append(element)
self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype)
def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(token["data"]))
def createElement(self, token):
"""Create an element but don't insert it anywhere"""
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element
def _getInsertFromTable(self):
return self._insertFromTable
def _setInsertFromTable(self, value):
"""Switch the function used to insert an element from the
normal one to the misnested table one and back again"""
self._insertFromTable = value
if value:
self.insertElement = self.insertElementTable
else:
self.insertElement = self.insertElementNormal
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, token):
name = token["name"]
assert isinstance(name, text_type), "Element %s not unicode" % name
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element)
self.openElements.append(element)
return element
def insertElementTable(self, token):
"""Create an element and insert it into the tree"""
element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(token)
else:
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
if insertBefore is None:
parent.appendChild(element)
else:
parent.insertBefore(element, insertBefore)
self.openElements.append(element)
return element
def insertText(self, data, parent=None):
"""Insert text data."""
if parent is None:
parent = self.openElements[-1]
if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name
not in tableInsertModeElements)):
parent.insertText(data)
else:
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore)
def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node"""
# The foster parent element is the one which comes before the most
# recently opened table element
# XXX - this is really inelegant
lastTable = None
fosterParent = None
insertBefore = None
for elm in self.openElements[::-1]:
if elm.name == "table":
lastTable = elm
break
if lastTable:
# XXX - we should really check that this parent is actually a
# node here
if lastTable.parent:
fosterParent = lastTable.parent
insertBefore = lastTable
else:
fosterParent = self.openElements[
self.openElements.index(lastTable) - 1]
else:
fosterParent = self.openElements[0]
return fosterParent, insertBefore
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):
"""Return the final tree"""
return self.document
def getFragment(self):
"""Return the final fragment"""
# assert self.innerHTML
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
return fragment
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests
:arg node: the node from which to start serializing
"""
raise NotImplementedError
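
ActiveFormattingElements.append above implements the specification's "Noah's Ark" clause: at most three matching elements may accumulate between markers. A sketch with a stand-in node class (FakeNode is illustrative, not part of html5lib):

    from html5lib.treebuilders.base import ActiveFormattingElements

    class FakeNode(object):
        def __init__(self, name):
            self.nameTuple = ("http://www.w3.org/1999/xhtml", name)
            self.attributes = {}

    afe = ActiveFormattingElements()
    for _ in range(4):
        afe.append(FakeNode("b"))
    print(len(afe))  # 3 -- the earliest duplicate was dropped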

View file

@ -0,0 +1,236 @@
from __future__ import absolute_import, division, unicode_literals
from collections import MutableMapping
from xml.dom import minidom, Node
import weakref
from . import base
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory
def getDomBuilder(DomImplementation):
Dom = DomImplementation
class AttrList(MutableMapping):
def __init__(self, element):
self.element = element
def __iter__(self):
return iter(self.element.attributes.keys())
def __setitem__(self, name, value):
if isinstance(name, tuple):
raise NotImplementedError
else:
attr = self.element.ownerDocument.createAttribute(name)
attr.value = value
self.element.attributes[name] = attr
def __len__(self):
return len(self.element.attributes)
def items(self):
return list(self.element.attributes.items())
def values(self):
return list(self.element.attributes.values())
def __getitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.attributes[name].value
def __delitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
del self.element.attributes[name]
class NodeBuilder(base.Node):
def __init__(self, element):
base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in list(attributes.items()):
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" + name[1])
else:
qualifiedName = name[1]
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
def getNameTuple(self):
if self.namespace is None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
else:
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
if parent != self:
base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
# pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
implementation = DomImplementation
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
(' ' * indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
else:
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI is not None):
name = "%s %s" % (constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>" % (' ' * indent, name))
if element.hasAttributes():
attributes = []
for i in range(len(element.attributes)):
attr = element.attributes.item(i)
name = attr.nodeName
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s" % (constants.prefixes[ns], attr.localName)
else:
name = attr.nodeName
attributes.append((name, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
return locals()
# The actual means to get a module!
getDomModule = moduleFactoryFactory(getDomBuilder)
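
A sketch of the dump format this builder's testSerializer produces for the parser tests (output shown approximately):

    import html5lib

    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    document = parser.parse("<!DOCTYPE html><p id=a>x</p>")
    print(parser.tree.testSerializer(document))
    # #document
    # |  <!DOCTYPE html>
    # |  <html>
    # ...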

View file

@ -0,0 +1,340 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
from six import text_type
import re
from . import base
from .. import _ihatexml
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(base.Node):
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
self._element = ElementTree.Element(self._getETreeTag(name,
namespace))
if namespace is None:
self.nameTuple = namespaces["html"], self._name
else:
self.nameTuple = self._namespace, self._name
self.parent = None
self._childNodes = []
self._flags = []
def _getETreeTag(self, name, namespace):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s" % (namespace, name)
return etree_tag
def _setName(self, name):
self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self):
return self._name
name = property(_getName, _setName)
def _setNamespace(self, namespace):
self._namespace = namespace
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getNamespace(self):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
# Delete existing attributes first
# XXX - there may be a better way to do this...
for key in list(self._element.attrib.keys()):
del self._element.attrib[key]
for key, value in attributes.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or len(self._element))
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = list(self._element).index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._childNodes.remove(node)
self._element.remove(node._element)
node.parent = None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
# Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
# Insert the text before the specified node
children = list(self._element)
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index - 1].tail:
self._element[index - 1].tail = ""
self._element[index - 1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = type(self)(self.name, self.namespace)
for name, value in self.attributes.items():
element.attributes[name] = value
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
if not newParent._element.text:
newParent._element.text = ""
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
# Use the superclass constructor to set all properties on the
# wrapper element
self._element = ElementTree.Comment(data)
self.parent = None
self._childNodes = []
self._flags = []
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self):
return self._element.get("publicId", "")
def _setPublicId(self, value):
if value is not None:
self._element.set("publicId", value)
publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self):
return self._element.get("systemId", "")
def _setSystemId(self, value):
if value is not None:
self._element.set("systemId", value)
systemId = property(_getSystemId, _setSystemId)
class Document(Element):
def __init__(self):
Element.__init__(self, "DOCUMENT_ROOT")
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, "DOCUMENT_FRAGMENT")
def testSerializer(element):
rv = []
def serializeElement(element, indent=0):
if not(hasattr(element, "tag")):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
(element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
elif element.tag == "DOCUMENT_ROOT":
rv.append("#document")
if element.text is not None:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
elif element.tag == ElementTreeCommentType:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
else:
assert isinstance(element.tag, text_type), \
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s" % (prefix, name)
rv.append("|%s<%s>" % (' ' * indent, name))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.items():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
attr_string = "%s %s" % (prefix, name)
else:
attr_string = name
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
return "\n".join(rv)
def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string"""
rv = []
filter = _ihatexml.InfosetFilter()
def serializeElement(element):
if isinstance(element, ElementTree.ElementTree):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
(element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
elif element.tag == "DOCUMENT_ROOT":
if element.text is not None:
rv.append(element.text)
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
for child in element:
serializeElement(child)
elif element.tag == ElementTreeCommentType:
rv.append("<!--%s-->" % (element.text,))
else:
# This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>" % (filter.fromXmlName(element.tag),))
else:
attr = " ".join(["%s=\"%s\"" % (
filter.fromXmlName(name), value)
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
for child in element:
serializeElement(child)
rv.append("</%s>" % (element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
return "".join(rv)
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
implementation = ElementTreeImplementation
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._element
else:
if self.defaultNamespace is not None:
return self.document._element.find(
"{%s}html" % self.defaultNamespace)
else:
return self.document._element.find("html")
def getFragment(self):
return base.TreeBuilder.getFragment(self)._element
return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)
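
A sketch of selecting an explicit ElementTree implementation instead of the default_etree fallback:

    import xml.etree.ElementTree as ET
    import html5lib

    builder = html5lib.getTreeBuilder("etree", implementation=ET)
    parser = html5lib.HTMLParser(tree=builder)
    html_root = parser.parse("<p>hi</p>")  # the <html> Element (fullTree=False)
    print(ET.tostring(html_root))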

View file

@ -0,0 +1,366 @@
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Doctypes with no name
When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings
import re
import sys
from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import _ihatexml
import lxml.etree as etree
fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")
comment_type = etree.Comment("asd").tag
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
rv = []
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
# Full tree case
rv.append("#document")
if element.docinfo.internalDTD:
if not (element.docinfo.public_id or
element.docinfo.system_url):
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
else:
dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
element.docinfo.root_name,
element.docinfo.public_id,
element.docinfo.system_url)
rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
next_element = element.getroot()
while next_element.getprevious() is not None:
next_element = next_element.getprevious()
while next_element is not None:
serializeElement(next_element, indent + 2)
next_element = next_element.getnext()
elif isinstance(element, str) or isinstance(element, bytes):
# Text in a fragment
assert isinstance(element, str) or sys.version_info[0] == 2
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
# Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent + 2)
elif element.tag == comment_type:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
else:
assert isinstance(element, etree._Element)
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>" % (' ' * indent, prefix,
infosetFilter.fromXmlName(tag)))
else:
rv.append("|%s<%s>" % (' ' * indent,
infosetFilter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.items():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
name = infosetFilter.fromXmlName(name)
prefix = constants.prefixes[ns]
attr_string = "%s %s" % (prefix, name)
else:
attr_string = infosetFilter.fromXmlName(name)
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif element.tag == comment_type:
rv.append("<!--%s-->" % (element.text,))
else:
# This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>" % (element.tag,))
else:
attr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
for child in element:
serializeElement(child)
rv.append("</%s>" % (element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
serializeElement(element)
return "".join(rv)
class TreeBuilder(base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
implementation = etree
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value=None):
if value is None:
value = {}
self._element = element
dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = infosetFilter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = infosetFilter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return infosetFilter.fromXmlName(self._name)
name = property(_getName, _setName)
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = infosetFilter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = infosetFilter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = infosetFilter.coerceComment(data)
self._element.text = data
def _getData(self):
return self._element.text
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment
base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(list(element))
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if not name:
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
self.doctype = None
else:
coercedName = self.infosetFilter.coerceElement(name)
if coercedName != name:
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(coercedName, publicId, systemId)
self.doctype = doctype
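    # Comments seen before the root element cannot be attached yet (lxml
    # builds the tree from the root down), so insertCommentInitial buffers
    # them and insertRoot later replays them via addprevious().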
def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
# Because of the way libxml2 works, it doesn't seem to be possible to
# alter information like the doctype after the tree has been parsed.
# Therefore we need to use the built-in parser to create our initial
# tree, after which we can add elements like normal
docStr = ""
if self.doctype:
assert self.doctype.name
docStr += "<!DOCTYPE %s" % self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += (' PUBLIC "%s" ' %
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
if self.doctype.systemId:
sysid = self.doctype.systemId
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
sysid = sysid.replace("'", 'U00027')
if sysid.find("'") >= 0:
docStr += '"%s"' % sysid
else:
docStr += "'%s'" % sysid
else:
docStr += "''"
docStr += ">"
if self.doctype.name != token["name"]:
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
root = etree.fromstring(docStr)
# Append the initial comments:
for comment_token in self.initial_comments:
comment = self.commentClass(comment_token["data"])
root.addprevious(comment._element)
# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Give the root element the right name
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s" % (namespace, name)
root.tag = etree_tag
# Add the root element to the internal child/open data structures
root_element = self.elementClass(name, namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
# Reset to the default insert comment function
self.insertComment = self.insertCommentMain

View file

@ -0,0 +1,154 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.
To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""
from __future__ import absolute_import, division, unicode_literals
from .. import constants
from .._utils import default_etree
__all__ = ["getTreeWalker", "pprint"]
treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support
:arg str treeType: the name of the tree type required (case-insensitive).
Supported values are:
* "dom": The xml.dom.minidom DOM implementation
* "etree": A generic walker for tree implementations exposing an
elementtree-like interface (known to work with ElementTree,
cElementTree and lxml.etree).
* "lxml": Optimized walker for lxml.etree
* "genshi": a Genshi stream
:arg implementation: A module implementing the tree type e.g.
xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
tree type only).
:arg kwargs: keyword arguments passed to the etree walker--for other
walkers, this has no effect
:returns: a TreeWalker class
"""
treeType = treeType.lower()
if treeType not in treeWalkerCache:
if treeType == "dom":
from . import dom
treeWalkerCache[treeType] = dom.TreeWalker
elif treeType == "genshi":
from . import genshi
treeWalkerCache[treeType] = genshi.TreeWalker
elif treeType == "lxml":
from . import etree_lxml
treeWalkerCache[treeType] = etree_lxml.TreeWalker
elif treeType == "etree":
from . import etree
if implementation is None:
implementation = default_etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)
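# A minimal usage sketch (illustrative only): parse a document, look up the
# walker class for the matching tree type, and iterate the token stream.
#
#     import html5lib
#     from html5lib.treewalkers import getTreeWalker
#
#     document = html5lib.parse("<p>Hello</p>", treebuilder="etree")
#     TreeWalkerClass = getTreeWalker("etree")
#     for token in TreeWalkerClass(document):
#         print(token["type"])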
def concatenateCharacterTokens(tokens):
pendingCharacters = []
for token in tokens:
type = token["type"]
if type in ("Characters", "SpaceCharacters"):
pendingCharacters.append(token["data"])
else:
if pendingCharacters:
yield {"type": "Characters", "data": "".join(pendingCharacters)}
pendingCharacters = []
yield token
if pendingCharacters:
yield {"type": "Characters", "data": "".join(pendingCharacters)}
def pprint(walker):
"""Pretty printer for tree walkers
    Takes a TreeWalker instance and returns a pretty-printed string of the
    tree it walks.
    :arg walker: a TreeWalker instance
    :returns: the pretty-printed tree as a string
"""
output = []
indent = 0
for token in concatenateCharacterTokens(walker):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
# tag name
if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
if token["namespace"] in constants.prefixes:
ns = constants.prefixes[token["namespace"]]
else:
ns = token["namespace"]
name = "%s %s" % (ns, token["name"])
else:
name = token["name"]
output.append("%s<%s>" % (" " * indent, name))
indent += 2
# attributes (sorted for consistent ordering)
attrs = token["data"]
for (namespace, localname), value in sorted(attrs.items()):
if namespace:
if namespace in constants.prefixes:
ns = constants.prefixes[namespace]
else:
ns = namespace
name = "%s %s" % (ns, localname)
else:
name = localname
output.append("%s%s=\"%s\"" % (" " * indent, name, value))
# self-closing
if type == "EmptyTag":
indent -= 2
elif type == "EndTag":
indent -= 2
elif type == "Comment":
output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
elif type == "Doctype":
if token["name"]:
if token["publicId"]:
output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
(" " * indent,
token["name"],
token["publicId"],
token["systemId"] if token["systemId"] else ""))
elif token["systemId"]:
output.append("""%s<!DOCTYPE %s "" "%s">""" %
(" " * indent,
token["name"],
token["systemId"]))
else:
output.append("%s<!DOCTYPE %s>" % (" " * indent,
token["name"]))
else:
output.append("%s<!DOCTYPE >" % (" " * indent,))
elif type == "Characters":
output.append("%s\"%s\"" % (" " * indent, token["data"]))
elif type == "SpaceCharacters":
assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
else:
raise ValueError("Unknown token type, %s" % type)
return "\n".join(output)

View file

@ -0,0 +1,252 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
spaceCharacters = "".join(spaceCharacters)
class TreeWalker(object):
"""Walks a tree yielding tokens
Tokens are dicts that all have a ``type`` field specifying the type of the
token.
"""
def __init__(self, tree):
"""Creates a TreeWalker
:arg tree: the tree to walk
"""
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
"""Generates an error token with the given message
:arg msg: the error message
:returns: SerializeError token
"""
return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False):
"""Generates an EmptyTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
        :arg hasChildren: whether or not to yield a SerializeError token
            because this tag shouldn't have children
:returns: EmptyTag token
"""
yield {"type": "EmptyTag", "name": name,
"namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
"""Generates a StartTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
:returns: StartTag token
"""
return {"type": "StartTag",
"name": name,
"namespace": namespace,
"data": attrs}
def endTag(self, namespace, name):
"""Generates an EndTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:returns: EndTag token
"""
return {"type": "EndTag",
"name": name,
"namespace": namespace}
def text(self, data):
"""Generates SpaceCharacters and Characters tokens
Depending on what's in the data, this generates one or more
``SpaceCharacters`` and ``Characters`` tokens.
For example:
>>> from html5lib.treewalkers.base import TreeWalker
>>> # Give it an empty tree just so it instantiates
>>> walker = TreeWalker([])
>>> list(walker.text(''))
[]
>>> list(walker.text(' '))
[{u'data': ' ', u'type': u'SpaceCharacters'}]
>>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
[{u'data': ' ', u'type': u'SpaceCharacters'},
{u'data': u'abc', u'type': u'Characters'},
{u'data': u' ', u'type': u'SpaceCharacters'}]
:arg data: the text data
:returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
"""
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
yield {"type": "SpaceCharacters", "data": left}
data = middle
middle = data.rstrip(spaceCharacters)
right = data[len(middle):]
if middle:
yield {"type": "Characters", "data": middle}
if right:
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
"""Generates a Comment token
:arg data: the comment
:returns: Comment token
"""
return {"type": "Comment", "data": data}
def doctype(self, name, publicId=None, systemId=None):
"""Generates a Doctype token
        :arg name: the doctype name
        :arg publicId: the doctype public identifier, if any
        :arg systemId: the doctype system identifier, if any
:returns: the Doctype token
"""
return {"type": "Doctype",
"name": name,
"publicId": publicId,
"systemId": systemId}
def entity(self, name):
"""Generates an Entity token
:arg name: the entity name
:returns: an Entity token
"""
return {"type": "Entity", "name": name}
def unknown(self, nodeType):
"""Handles unknown node types"""
return self.error("Unknown node type: " + nodeType)
class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node):
raise NotImplementedError
def getFirstChild(self, node):
raise NotImplementedError
def getNextSibling(self, node):
raise NotImplementedError
def getParentNode(self, node):
raise NotImplementedError
def __iter__(self):
currentNode = self.tree
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
if type == DOCTYPE:
yield self.doctype(*details)
elif type == TEXT:
for token in self.text(*details):
yield token
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
hasChildren = False
else:
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT:
hasChildren = True
else:
yield self.unknown(details[0])
if hasChildren:
firstChild = self.getFirstChild(currentNode)
else:
firstChild = None
if firstChild is not None:
currentNode = firstChild
else:
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
break
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
else:
currentNode = self.getParentNode(currentNode)
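# To support a new tree type, subclass NonRecursiveTreeWalker and supply the
# four navigation hooks; __iter__ above does the rest.  Hedged sketch for a
# hypothetical DOM-like tree whose nodes carry .kind, .name, .attrs plus
# .firstChild/.nextSibling/.parentNode pointers (none of these names come
# from html5lib itself):
#
#     class MyTreeWalker(NonRecursiveTreeWalker):
#         def getNodeDetails(self, node):
#             if node.kind == "document":
#                 return (DOCUMENT,)
#             if node.kind == "text":
#                 return TEXT, node.data
#             return (ELEMENT, None, node.name, node.attrs,
#                     node.firstChild is not None)
#
#         def getFirstChild(self, node):
#             return node.firstChild
#
#         def getNextSibling(self, node):
#             return node.nextSibling
#
#         def getParentNode(self, node):
#             return node.parentNode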

View file

@ -0,0 +1,43 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from . import base
class TreeWalker(base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
return base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
return base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
attrs = {}
for attr in list(node.attributes.keys()):
attr = node.getAttributeNode(attr)
if attr.namespaceURI:
attrs[(attr.namespaceURI, attr.localName)] = attr.value
else:
attrs[(None, attr.name)] = attr.value
return (base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
return base.COMMENT, node.nodeValue
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
return (base.DOCUMENT,)
else:
return base.UNKNOWN, node.nodeType
def getFirstChild(self, node):
return node.firstChild
def getNextSibling(self, node):
return node.nextSibling
def getParentNode(self, node):
return node.parentNode
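# Usage sketch: this walker expects an xml.dom.minidom tree, e.g. one built
# with html5lib's "dom" treebuilder:
#
#     import html5lib
#     from html5lib.treewalkers.dom import TreeWalker
#
#     doc = html5lib.parse("<p>hi</p>", treebuilder="dom")
#     tokens = list(TreeWalker(doc))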

View file

@ -0,0 +1,130 @@
from __future__ import absolute_import, division, unicode_literals
from collections import OrderedDict
import re
from six import string_types
from . import base
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
1. The current element
2. The index of the element relative to its parent
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, _, _, flag = node
if flag in ("text", "tail"):
return base.TEXT, getattr(elt, flag)
else:
node = elt
            if not hasattr(node, "tag"):
node = node.getroot()
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
return (base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif node.tag == ElementTreeCommentType:
return base.COMMENT, node.text
else:
assert isinstance(node.tag, string_types), type(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
attrs = OrderedDict()
for name, value in list(node.attrib.items()):
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
element, key, parents, flag = node, None, [], None
if flag in ("text", "tail"):
return None
else:
if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key + 1], key + 1, parents, None
else:
return None
def getParentNode(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
assert list(parents[-1]).count(parent) == 1
return parent, list(parents[-1]).index(parent), parents, None
return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)
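# getETreeModule memoizes one walker module per ElementTree implementation;
# e.g. (sketch, standard library):
#
#     import xml.etree.ElementTree as ElementTree
#     TreeWalker = getETreeModule(ElementTree).TreeWalker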

View file

@ -0,0 +1,213 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from lxml import etree
from ..treebuilders.etree import tag_regexp
from . import base
from .. import _ihatexml
def ensure_str(s):
if s is None:
return None
elif isinstance(s, text_type):
return s
else:
return s.decode("ascii", "strict")
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
try:
if et.docinfo.internalDTD:
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
except AttributeError:
pass
try:
node = et.getroot()
except AttributeError:
node = et
while node.getprevious() is not None:
node = node.getprevious()
while node is not None:
self.children.append(node)
node = node.getnext()
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
def getnext(self):
return None
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
self.text = self.tail = None
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = ensure_str(self.obj.text)
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = ensure_str(self.obj.tail)
else:
self.tail = None
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
if idx < len(siblings) - 1:
return siblings[idx + 1]
else:
return None
def __getitem__(self, key):
return self.obj[key]
def __bool__(self):
return bool(self.obj)
def getparent(self):
return None
def __str__(self):
return str(self.obj)
def __unicode__(self):
return str(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(base.NonRecursiveTreeWalker):
def __init__(self, tree):
# pylint:disable=redefined-variable-type
if isinstance(tree, list):
self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
else:
self.fragmentChildren = set()
tree = Root(tree)
base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = _ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
return (base.DOCUMENT,)
elif isinstance(node, Doctype):
return base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
return base.TEXT, ensure_str(node.obj)
elif node.tag == etree.Comment:
return base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else:
# This is assumed to be an ordinary element
match = tag_regexp.match(ensure_str(node.tag))
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = ensure_str(node.tag)
attrs = {}
for name, value in list(node.attrib.items()):
name = ensure_str(name)
value = ensure_str(value)
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), "Text nodes have no children"
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text")
else:
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
if len(node):
return node[0]
else:
return None
else: # tail
return node.getnext()
return (node, "tail") if node.tail else node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
return node
# else: fallback to "normal" processing
elif node in self.fragmentChildren:
return None
return node.getparent()

View file

@ -0,0 +1,69 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from . import base
from ..constants import voidElements, namespaces
class TreeWalker(base.TreeWalker):
def __iter__(self):
# Buffer the events so we can pass in the following one
previous = None
for event in self.tree:
if previous is not None:
for token in self.tokens(previous, event):
yield token
previous = event
# Don't forget the final event!
if previous is not None:
for token in self.tokens(previous, None):
yield token
def tokens(self, event, next):
kind, data, _ = event
if kind == START:
tag, attribs = data
name = tag.localname
namespace = tag.namespace
converted_attribs = {}
for k, v in attribs:
if isinstance(k, QName):
converted_attribs[(k.namespace, k.localname)] = v
else:
converted_attribs[(None, k)] = v
if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
not next or next[0] != END or
next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, converted_attribs)
elif kind == END:
name = data.localname
namespace = data.namespace
if namespace != namespaces["html"] or name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
yield self.comment(data)
elif kind == TEXT:
for token in self.text(data):
yield token
elif kind == DOCTYPE:
yield self.doctype(*data)
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
START_CDATA, END_CDATA, PI):
pass
else:
yield self.unknown(kind)
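# Usage sketch (assumes Genshi is installed; genshi.input.HTML is Genshi's
# own stream constructor, not part of html5lib):
#
#     from genshi.input import HTML
#
#     stream = HTML("<p>hello</p>")
#     tokens = list(TreeWalker(stream))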

View file

@ -0,0 +1,244 @@
#!/usr/bin/env python
"""usage: %prog [options] filename
Parse a document to a tree, with optional profiling
"""
import sys
import traceback
from optparse import OptionParser
from html5lib import html5parser
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
from html5lib import _utils
def parse():
optParser = getOptParser()
opts, args = optParser.parse_args()
encoding = "utf8"
try:
f = args[-1]
# Try opening from the internet
if f.startswith('http://'):
try:
import urllib.request
import urllib.parse
import urllib.error
import cgi
f = urllib.request.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
(mediaType, params) = cgi.parse_header(contentType)
encoding = params.get('charset')
except:
pass
elif f == '-':
f = sys.stdin
if sys.version_info[0] >= 3:
encoding = None
else:
try:
# Try opening from file system
f = open(f, "rb")
except IOError as e:
sys.stderr.write("Unable to open file: %s\n" % e)
sys.exit(1)
except IndexError:
sys.stderr.write("No filename provided. Use -h for help\n")
sys.exit(1)
treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
p = html5parser.HTMLParser(tree=treebuilder, debug=opts.log)
if opts.fragment:
parseMethod = p.parseFragment
else:
parseMethod = p.parse
if opts.profile:
import cProfile
import pstats
cProfile.runctx("run(parseMethod, f, encoding, scripting)", None,
{"run": run,
"parseMethod": parseMethod,
"f": f,
"encoding": encoding,
"scripting": opts.scripting},
"stats.prof")
# XXX - We should use a temp file here
stats = pstats.Stats('stats.prof')
stats.strip_dirs()
stats.sort_stats('time')
stats.print_stats()
elif opts.time:
import time
t0 = time.time()
document = run(parseMethod, f, encoding, opts.scripting)
t1 = time.time()
if document:
printOutput(p, document, opts)
t2 = time.time()
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
else:
sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
else:
document = run(parseMethod, f, encoding, opts.scripting)
if document:
printOutput(p, document, opts)
def run(parseMethod, f, encoding, scripting):
try:
document = parseMethod(f, override_encoding=encoding, scripting=scripting)
except:
document = None
traceback.print_exc()
return document
def printOutput(parser, document, opts):
if opts.encoding:
print("Encoding:", parser.tokenizer.stream.charEncoding)
for item in parser.log:
print(item)
if document is not None:
if opts.xml:
tb = opts.treebuilder.lower()
if tb == "dom":
document.writexml(sys.stdout, encoding="utf-8")
elif tb == "lxml":
import lxml.etree
sys.stdout.write(lxml.etree.tostring(document, encoding="unicode"))
elif tb == "etree":
sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode"))
elif opts.tree:
if not hasattr(document, '__getitem__'):
document = [document]
for fragment in document:
print(parser.tree.testSerializer(fragment))
elif opts.html:
kwargs = {}
for opt in serializer.HTMLSerializer.options:
try:
kwargs[opt] = getattr(opts, opt)
except:
pass
if not kwargs['quote_char']:
del kwargs['quote_char']
if opts.sanitize:
kwargs["sanitize"] = True
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
if sys.version_info[0] >= 3:
encoding = None
else:
encoding = "utf-8"
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
sys.stdout.write(text)
if not text.endswith('\n'):
sys.stdout.write('\n')
if opts.error:
errList = []
for pos, errorcode, datavars in parser.errors:
errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
def getOptParser():
parser = OptionParser(usage=__doc__)
parser.add_option("-p", "--profile", action="store_true", default=False,
dest="profile", help="Use the hotshot profiler to "
"produce a detailed log of the run")
parser.add_option("-t", "--time",
action="store_true", default=False, dest="time",
help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
parser.add_option("-b", "--treebuilder", action="store", type="string",
dest="treebuilder", default="etree")
parser.add_option("-e", "--error", action="store_true", default=False,
dest="error", help="Print a list of parse errors")
parser.add_option("-f", "--fragment", action="store_true", default=False,
dest="fragment", help="Parse as a fragment")
parser.add_option("-s", "--scripting", action="store_true", default=False,
dest="scripting", help="Handle noscript tags as if scripting was enabled")
parser.add_option("", "--tree", action="store_true", default=False,
dest="tree", help="Output as debug tree")
parser.add_option("-x", "--xml", action="store_true", default=False,
dest="xml", help="Output as xml")
parser.add_option("", "--no-html", action="store_false", default=True,
dest="html", help="Don't output html")
parser.add_option("-c", "--encoding", action="store_true", default=False,
dest="encoding", help="Print character encoding used")
parser.add_option("", "--inject-meta-charset", action="store_true",
default=False, dest="inject_meta_charset",
help="inject <meta charset>")
parser.add_option("", "--strip-whitespace", action="store_true",
default=False, dest="strip_whitespace",
help="strip whitespace")
parser.add_option("", "--omit-optional-tags", action="store_true",
default=False, dest="omit_optional_tags",
help="omit optional tags")
parser.add_option("", "--quote-attr-values", action="store_true",
default=False, dest="quote_attr_values",
help="quote attribute values")
parser.add_option("", "--use-best-quote-char", action="store_true",
default=False, dest="use_best_quote_char",
help="use best quote character")
parser.add_option("", "--quote-char", action="store",
default=None, dest="quote_char",
help="quote character")
parser.add_option("", "--no-minimize-boolean-attributes",
action="store_false", default=True,
dest="minimize_boolean_attributes",
help="minimize boolean attributes")
parser.add_option("", "--use-trailing-solidus", action="store_true",
default=False, dest="use_trailing_solidus",
help="use trailing solidus")
parser.add_option("", "--space-before-trailing-solidus",
action="store_true", default=False,
dest="space_before_trailing_solidus",
help="add space before trailing solidus")
parser.add_option("", "--escape-lt-in-attrs", action="store_true",
default=False, dest="escape_lt_in_attrs",
help="escape less than signs in attribute values")
parser.add_option("", "--escape-rcdata", action="store_true",
default=False, dest="escape_rcdata",
help="escape rcdata element values")
parser.add_option("", "--sanitize", action="store_true", default=False,
dest="sanitize", help="sanitize")
parser.add_option("-l", "--log", action="store_true", default=False,
dest="log", help="log state transitions")
return parser
if __name__ == "__main__":
parse()
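# Example invocations (sketch):
#   python parse.py --tree mydoc.html          # print the debug tree
#   python parse.py -b dom -x mydoc.html       # build a DOM tree, emit XML
#   python parse.py -p http://example.org/     # profile a fetch-and-parse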

View file

@ -0,0 +1,17 @@
[pytest]
# Output fails, errors, xpass, and warnings; ignore doctest; make warnings errors
addopts = -rfEXw -p no:doctest --strict
# Make xpass results be considered fail
xfail_strict = true
# Document our markers
markers =
DOM: mark a test as a DOM tree test
ElementTree: mark a test as a ElementTree tree test
cElementTree: mark a test as a cElementTree tree test
lxml: mark a test as a lxml tree test
genshi: mark a test as a genshi tree test
parser: mark a test as a parser test
namespaced: mark a test as a namespaced parser test
treewalker: mark a test as a treewalker test

View file

@ -0,0 +1,15 @@
#!/bin/bash -ex
if [[ $SIX_VERSION ]]; then
pip install six==$SIX_VERSION
fi
pip install -r requirements-test.txt
if [[ $USE_OPTIONAL == "true" ]]; then
pip install -r requirements-optional.txt
fi
if [[ $CI == "true" ]]; then
pip install codecov
fi

View file

@ -0,0 +1,17 @@
-r requirements.txt
# We support a Genshi treewalker that can be used to serialize Genshi
# streams.
genshi
# chardet can be used as a fallback in case we are unable to determine
# the encoding of a document.
chardet>=2.2
# lxml is supported with its own treebuilder ("lxml") and otherwise
# uses the standard ElementTree support
lxml ; platform_python_implementation == 'CPython'
# DATrie can be used in place of our Python trie implementation for
# slightly better parsing performance.
datrie ; platform_python_implementation == 'CPython'

View file

@ -0,0 +1,10 @@
-r requirements.txt
tox
flake8<3.0
pytest==3.2.5
coverage
pytest-expect>=1.1,<2.0
mock

View file

@ -0,0 +1,2 @@
six>=1.9
webencodings

View file

@ -0,0 +1,14 @@
[bdist_wheel]
universal = 1
[pep8]
ignore = N
max-line-length = 139
exclude = .git,__pycache__,.tox,doc
[flake8]
ignore = N
max-line-length = 139
[metadata]
license_file = LICENSE

View file

@ -0,0 +1,125 @@
from __future__ import print_function
import ast
import codecs
import sys
from os.path import join, dirname
from setuptools import setup, find_packages, __version__ as setuptools_version
from pkg_resources import parse_version
import pkg_resources
try:
import _markerlib.markers
except ImportError:
_markerlib = None
# _markerlib.default_environment() obtains its data from _VARS
# and wraps it in another dict, but _markerlib_evaluate writes
# to the dict while it is iterating the keys, causing an error
# on Python 3 only.
# Replace _markerlib.default_environment to return a custom dict
# that has all the necessary markers, and ignores any writes.
class Python3MarkerDict(dict):
def __setitem__(self, key, value):
pass
def pop(self, i=-1):
return self[i]
if _markerlib and sys.version_info[0] == 3:
env = _markerlib.markers._VARS
for key in list(env.keys()):
new_key = key.replace('.', '_')
if new_key != key:
env[new_key] = env[key]
_markerlib.markers._VARS = Python3MarkerDict(env)
def default_environment():
return _markerlib.markers._VARS
_markerlib.default_environment = default_environment
# Avoid the very buggy pkg_resources.parser, which doesn't consistently
# recognise the markers needed by this setup.py
# Change this to setuptools 20.10.0 to support all markers.
if pkg_resources:
if parse_version(setuptools_version) < parse_version('18.5'):
MarkerEvaluation = pkg_resources.MarkerEvaluation
del pkg_resources.parser
pkg_resources.evaluate_marker = MarkerEvaluation._markerlib_evaluate
MarkerEvaluation.evaluate_marker = MarkerEvaluation._markerlib_evaluate
classifiers = [
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Markup :: HTML'
]
here = dirname(__file__)
with codecs.open(join(here, 'README.rst'), 'r', 'utf8') as readme_file:
with codecs.open(join(here, 'CHANGES.rst'), 'r', 'utf8') as changes_file:
long_description = readme_file.read() + '\n' + changes_file.read()
version = None
with open(join(here, "html5lib", "__init__.py"), "rb") as init_file:
t = ast.parse(init_file.read(), filename="__init__.py", mode="exec")
assert isinstance(t, ast.Module)
assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)
for a in assignments:
if (len(a.targets) == 1 and
isinstance(a.targets[0], ast.Name) and
a.targets[0].id == "__version__" and
isinstance(a.value, ast.Str)):
version = a.value.s
setup(name='html5lib',
version=version,
url='https://github.com/html5lib/html5lib-python',
license="MIT License",
description='HTML parser based on the WHATWG HTML specification',
long_description=long_description,
classifiers=classifiers,
maintainer='James Graham',
maintainer_email='james@hoppipolla.co.uk',
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=[
'six>=1.9',
'webencodings',
],
extras_require={
# A conditional extra will only install these items when the extra is
# requested and the condition matches.
"datrie:platform_python_implementation == 'CPython'": ["datrie"],
"lxml:platform_python_implementation == 'CPython'": ["lxml"],
# Standard extras, will be installed when the extra is requested.
"genshi": ["genshi"],
"chardet": ["chardet>=2.2"],
# The all extra combines a standard extra which will be used anytime
# the all extra is requested, and it extends it with a conditional
# extra that will be installed whenever the condition matches and the
# all extra is requested.
"all": ["genshi", "chardet>=2.2"],
"all:platform_python_implementation == 'CPython'": ["datrie", "lxml"],
},
)
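# The extras declared above are selected at install time, e.g. (sketch):
#
#     pip install html5lib[all]
#
# pulls in genshi and chardet everywhere, plus datrie and lxml on CPython.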

View file

@ -0,0 +1,23 @@
[tox]
envlist = {py27,py33,py34,py35,py36,pypy}-{base,six19,optional}
[testenv]
deps =
optional: -r{toxinidir}/requirements-optional.txt
-r{toxinidir}/requirements-test.txt
doc: Sphinx
passenv =
PYTEST_COMMAND
COVERAGE_RUN_OPTIONS
commands =
six19: pip install six==1.9
{env:PYTEST_COMMAND:{envbindir}/py.test} {posargs}
flake8 {toxinidir}
[testenv:doc]
changedir = doc
commands = sphinx-build -b html . _build
[flake8]
exclude = ./.tox
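# Example: run a single environment locally with, e.g.:
#   tox -e py36-optional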

View file

@ -0,0 +1,100 @@
import json
import html5lib
def parse(path="html5ents.xml"):
return html5lib.parse(open(path), treebuilder="lxml")
def entity_table(tree):
return dict((entity_name("".join(tr[0].xpath(".//text()"))),
entity_characters(tr[1].text))
for tr in tree.xpath("//h:tbody/h:tr",
namespaces={"h": "http://www.w3.org/1999/xhtml"}))
def entity_name(inp):
return inp.strip()
def entity_characters(inp):
return "".join(codepoint_to_character(item)
for item in inp.split()
if item)
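# NB: the str.decode call below makes this helper Python 2-only; a Python 3
# equivalent (an assumption, not in this script) would be
# chr(int(inp[2:], 16)).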
def codepoint_to_character(inp):
return ("\\U000" + inp[2:]).decode("unicode-escape")
def make_tests_json(entities):
test_list = make_test_list(entities)
tests_json = {"tests":
[make_test(*item) for item in test_list]
}
return tests_json
def make_test(name, characters, good):
return {
"description": test_description(name, good),
"input": "&%s" % name,
"output": test_expected(name, characters, good)
}
def test_description(name, good):
with_semicolon = name.endswith(";")
semicolon_text = {True: "with a semi-colon",
False: "without a semi-colon"}[with_semicolon]
if good:
text = "Named entity: %s %s" % (name, semicolon_text)
else:
text = "Bad named entity: %s %s" % (name, semicolon_text)
return text
def test_expected(name, characters, good):
rv = []
if not good or not name.endswith(";"):
rv.append("ParseError")
rv.append(["Character", characters])
return rv
def make_test_list(entities):
tests = []
for entity_name, characters in entities.items():
if entity_name.endswith(";") and not subentity_exists(entity_name, entities):
tests.append((entity_name[:-1], "&" + entity_name[:-1], False))
tests.append((entity_name, characters, True))
return sorted(tests)
def subentity_exists(entity_name, entities):
for i in range(1, len(entity_name)):
if entity_name[:-i] in entities:
return True
return False
def make_entities_code(entities):
entities_text = "\n".join(" \"%s\": u\"%s\"," % (
name, entities[name].encode(
"unicode-escape").replace("\"", "\\\""))
for name in sorted(entities.keys()))
return """entities = {
%s
}""" % entities_text
def main():
entities = entity_table(parse())
tests_json = make_tests_json(entities)
json.dump(tests_json, open("namedEntities.test", "w"), indent=4)
code = make_entities_code(entities)
open("entities_constants.py", "w").write(code)
if __name__ == "__main__":
main()