Update web-platform-tests to revision 132d12daea699ce266324e79eecbe59b10e56502

This commit is contained in:
WPT Sync Bot 2018-06-08 21:05:21 -04:00
parent 527d874bc1
commit fe00a63040
1004 changed files with 18598 additions and 92770 deletions


@@ -0,0 +1,31 @@
# To activate, change the Appveyor settings to use `.appveyor.yml`.
environment:
global:
PATH: "C:\\Python27\\Scripts\\;%PATH%"
PYTEST_COMMAND: "coverage run -m pytest"
matrix:
- TOXENV: py27-base
- TOXENV: py27-optional
- TOXENV: py33-base
- TOXENV: py33-optional
- TOXENV: py34-base
- TOXENV: py34-optional
- TOXENV: py35-base
- TOXENV: py35-optional
- TOXENV: py36-base
- TOXENV: py36-optional
install:
- git submodule update --init --recursive
- python -m pip install tox codecov
build: off
test_script:
- tox
after_test:
- python debug-info.py
on_success:
- codecov


@@ -0,0 +1,8 @@
[run]
branch = True
source = html5lib
[paths]
source =
html5lib
.tox/*/lib/python*/site-packages/html5lib


@@ -0,0 +1,85 @@
# Copyright (c) 2014 GitHub, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
doc/_build/
# PyBuilder
target/
# Generated by parse.py -p
stats.prof
# IDE
.idea


@@ -0,0 +1,3 @@
[submodule "testdata"]
path = html5lib/tests/testdata
url = https://github.com/html5lib/html5lib-tests.git


@@ -0,0 +1,21 @@
strictness: veryhigh
doc-warnings: false
test-warnings: false
max-line-length: 139
requirements:
- requirements.txt
- requirements-test.txt
- requirements-optional.txt
ignore-paths:
- parse.py
- utils/
python-targets:
- 2
- 3
mccabe:
run: false


@@ -0,0 +1,10 @@
[MASTER]
ignore=tests
[MESSAGES CONTROL]
# messages up to fixme should probably be fixed somehow
disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda
[FORMAT]
max-line-length=139
single-line-if-stmt=no

File diff suppressed because it is too large


@@ -0,0 +1,32 @@
language: python
python:
- "pypy"
- "3.6"
- "3.5"
- "3.4"
- "3.3"
- "2.7"
sudo: false
cache: pip
env:
global:
- PYTEST_COMMAND="coverage run -m pytest"
matrix:
- TOXENV=optional
- TOXENV=base
- TOXENV=six19-optional
install:
- pip install tox codecov
script:
- tox
after_script:
- python debug-info.py
after_success:
- codecov


@@ -0,0 +1,66 @@
Credits
=======
``html5lib`` is written and maintained by:
- James Graham
- Geoffrey Sneddon
- Łukasz Langa
- Will Kahn-Greene
Patches and suggestions
-----------------------
(In chronological order, by first commit:)
- Anne van Kesteren
- Lachlan Hunt
- lantis63
- Sam Ruby
- Thomas Broyer
- Tim Fletcher
- Mark Pilgrim
- Ryan King
- Philip Taylor
- Edward Z. Yang
- fantasai
- Philip Jägenstedt
- Ms2ger
- Mohammad Taha Jahangir
- Andy Wingo
- Andreas Madsack
- Karim Valiev
- Juan Carlos Garcia Segovia
- Mike West
- Marc DM
- Simon Sapin
- Michael[tm] Smith
- Ritwik Gupta
- Marc Abramowitz
- Tony Lopes
- lilbludevil
- Kevin
- Drew Hubl
- Austin Kumbera
- Jim Baker
- Jon Dufresne
- Donald Stufft
- Alex Gaynor
- Nik Nyby
- Jakub Wilk
- Sigmund Cherem
- Gabi Davar
- Florian Mounier
- neumond
- Vitalik Verhovodov
- Kovid Goyal
- Adam Chainz
- John Vandenberg
- Eric Amorde
- Benedikt Morbach
- Jonathan Vanasco
- Tom Most
- Ville Skyttä
- Hugo van Kemenade
- Mark Vasilkov


@@ -0,0 +1,335 @@
Change Log
----------
1.0.1
~~~~~
Released on December 7, 2017
Breaking changes:
* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)
Features:
* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
Will Kahn-Greene!)
* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon
Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!)
* Semver-compliant version number.
Bug fixes:
* Add support for setuptools < 18.5 to support environment markers. (Thank you,
John Vandenberg!)
* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
Kahn-Greene!)
* Include license file in generated wheel package. (#350) (Thank you, Jon
Dufresne!)
* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
* Allow uppercase hex characters in CSS colour check. (#377) (Thank you,
Komal Dembla, Hugo!)
1.0
~~~
Released and unreleased on December 7, 2017. Badly packaged release.
0.999999999/1.0b10
~~~~~~~~~~~~~~~~~~
Released on July 15, 2016
* Fix attribute order going to the tree builder to be document order
instead of reverse document order(!).
0.99999999/1.0b9
~~~~~~~~~~~~~~~~
Released on July 14, 2016
* **Added ordereddict as a mandatory dependency on Python 2.6.**
* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
extras that will do the right thing based on the specific
interpreter implementation.
* Now requires the ``mock`` package for the testsuite.
* Cease supporting DATrie under PyPy.
* **Remove PullDOM support, as this hasn't ever been properly
tested, doesn't entirely work, and as far as I can tell is
completely unused by anyone.**
* Move testsuite to ``py.test``.
* **Fix #124: move to webencodings for decoding the input byte stream;
this makes html5lib compliant with the Encoding Standard, and
introduces a required dependency on webencodings.**
* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**
* **Fix comments containing double-dash with lxml 3.5 and above.**
* **Use scripting disabled by default (as we don't implement
scripting).**
* **Fix #11, avoiding the XSS bug potentially caused by serializer
allowing attribute values to be escaped out of in old browser versions,
changing the quote_attr_values option on serializer to take one of
three values, "always" (the old True value), "legacy" (the new option,
and the new default), and "spec" (the old False value, and the old
default).**
* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
(instead of the tokenizer); as such, this will require amending all
callers of it to use it via the treewalker API.**
* **Drop support of charade, now that chardet is supported once more.**
* **Replace the charset keyword argument on parse and related methods
with a set of keyword arguments: override_encoding, transport_encoding,
same_origin_parent_encoding, likely_encoding, and default_encoding.**
* **Move filters._base, treebuilder._base, and treewalkers._base to .base
to clarify their status as public.**
* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
sanitizer.htmlsanitizer module and move that to sanitizer. This means
anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
code changes.**
* **Rename treewalkers.lxmletree to .etree_lxml and
treewalkers.genshistream to .genshi to have a consistent API.**
* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
utils) to be underscore prefixed to clarify their status as private.
0.9999999/1.0b8
~~~~~~~~~~~~~~~
Released on September 10, 2015
* Fix #195: fix the sanitizer to drop broken URLs (it threw an
exception between 0.9999 and 0.999999).
0.999999/1.0b7
~~~~~~~~~~~~~~
Released on July 7, 2015
* Fix #189: fix the sanitizer to allow relative URLs again (as it did
prior to 0.9999/1.0b5).
0.99999/1.0b6
~~~~~~~~~~~~~
Released on April 30, 2015
* Fix #188: fix the sanitizer to not throw an exception when sanitizing
bogus data URLs.
0.9999/1.0b5
~~~~~~~~~~~~
Released on April 29, 2015
* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
this sounds, this has no known security implications. No known version
of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
Chrome (1 to current), or Opera (12 to current) will run any script
provided in these attributes.
* Pass error message to the ParseError exception in strict parsing mode.
* Allow data URIs in the sanitizer, with a whitelist of content-types.
* Add support for Python implementations that don't support lone
surrogates (read: Jython). Fixes #2.
* Remove localization of error messages. This functionality was totally
unused (and untested that everything was localizable), so we may as
well follow numerous browsers in not supporting translating technical
strings.
* Expose treewalkers.pprint as a public API.
* Add a documentEncoding property to HTML5Parser, fix #121.
0.999
~~~~~
Released on December 23, 2013
* Fix #127: add work-around for CPython issue #20007: .read(0) on
http.client.HTTPResponse drops the rest of the content.
* Fix #115: lxml treewalker can now deal with fragments containing, at
their root level, text nodes with non-ASCII characters on Python 2.
0.99
~~~~
Released on September 10, 2013
* No library changes from 1.0b3; released as 0.99 as pip has changed
behaviour from 1.4 to avoid installing pre-release versions per
PEP 440.
1.0b3
~~~~~
Released on July 24, 2013
* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
implementation using it should be moved to
``NonRecursiveTreeWalker``, as everything bundled with html5lib has
for years.
* Fix #67 so that ``BufferedStream`` correctly returns a bytes
object, thereby fixing any case where html5lib is passed a
non-seekable RawIOBase-like object.
1.0b2
~~~~~
Released on June 27, 2013
* Removed reordering of attributes within the serializer. There is now
an ``alphabetical_attributes`` option which preserves the previous
behaviour through a new filter. This allows attribute order to be
preserved through html5lib if the tree builder preserves order.
* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
``treeadapters.sax.to_sax`` which is generic and supports any
treewalker; it also resolves all known bugs with ``dom2sax``.
* Fix treewalker assertions on hitting bytes strings on
Python 2. Previous to 1.0b1, treewalkers coped with mixed
bytes/unicode data on Python 2; this reintroduces this prior
behaviour on Python 2. Behaviour is unchanged on Python 3.
1.0b1
~~~~~
Released on May 17, 2013
* Implementation updated to implement the `HTML specification
<http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).
* Python 3.2+ supported in a single codebase using the ``six`` library.
* Removed support for Python 2.5 and older.
* Removed the deprecated Beautiful Soup 3 treebuilder.
``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
since it doesn't support namespaces, foreign content like SVG and
MathML is parsed incorrectly.
* Removed ``simpletree`` from the package. The default tree builder is
now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
available, and ``xml.etree.ElementTree`` otherwise).
* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
output was well-formed XML, and hence provided little of use.
* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
return the default DOM treebuilder, which uses ``xml.dom.minidom``.
* Optional heuristic character encoding detection now based on
``charade`` for Python 2.6 - 3.3 compatibility.
* Optional ``Genshi`` treewalker support fixed.
* Many bugfixes, including:
* #33: null in attribute value breaks XML AttValue;
* #4: nested, indirect descendant, <button> causes infinite loop;
* `Google Code 215
<http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
detect seekable streams;
* `Google Code 206
<http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
support for <video preload=...>, <audio preload=...>;
* `Google Code 205
<http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
support for <video poster=...>;
* `Google Code 202
<http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
file breaks InputStream.
* Source code is now mostly PEP 8 compliant.
* Test harness has been improved and now depends on ``nose``.
* Documentation updated and moved to https://html5lib.readthedocs.io/.
0.95
~~~~
Released on February 11, 2012
0.90
~~~~
Released on January 17, 2010
0.11.1
~~~~~~
Released on June 12, 2008
0.11
~~~~
Released on June 10, 2008
0.10
~~~~
Released on October 7, 2007
0.9
~~~
Released on March 11, 2007
0.2
~~~
Released on January 8, 2007


@@ -0,0 +1,60 @@
Contributing
============
Pull requests are more than welcome — both to the library and to the
documentation. Some useful information:
- We aim to follow PEP 8 in the library, but ignoring the
79-character-per-line limit, instead following a soft limit of 99,
but allowing lines over this where it is the readable thing to do.
- We aim to follow PEP 257 for all docstrings, and make them properly
parseable by Sphinx while generating API documentation.
- We keep ``pyflakes`` reporting no errors or warnings at all times.
- We keep the master branch passing all tests at all times on all
supported versions.
`Travis CI <https://travis-ci.org/html5lib/html5lib-python/>`_ is run
against all pull requests and should enforce all of the above.
We use `Opera Critic <https://critic.hoppipolla.co.uk/>`_ as an external
code-review tool, which uses your GitHub login to authenticate. You'll
get email notifications for issues raised in the review.
Patch submission guidelines
---------------------------
- **Create a new Git branch specific to your change.** Do not put
multiple fixes/features in the same pull request. If you find an
unrelated bug, create a distinct branch and submit a separate pull
request for the bugfix. This makes life much easier for maintainers
and will speed up merging your patches.
- **Write a test** whenever possible. Following existing tests is often
easiest, and a good way to tell whether the feature you're modifying
is easily testable.
- **Make sure documentation is updated.** Keep docstrings current, and
if necessary, update the Sphinx documentation in ``doc/``.
- **Add a changelog entry** at the top of ``CHANGES.rst`` following
existing entries' styles.
- **Run tests with tox** if possible, to make sure your changes are
compatible with all supported Python versions.
- **Squash commits** before submitting the pull request so that a single
commit contains the entire change, and only that change (see the first
bullet).
- **Don't rebase after creating the pull request.** Merge with upstream,
if necessary, and use ``git commit --fixup`` for fixing issues raised
in a Critic review or by a failing Travis build. The reviewer will
squash and rebase your pull request while accepting it. Even though
GitHub won't recognize the pull request as accepted, the squashed
commits will properly specify you as the author.
- **Attribute yourself** in ``AUTHORS.rst``.


@@ -0,0 +1,20 @@
Copyright (c) 2006-2013 James Graham and other contributors
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


@@ -0,0 +1,10 @@
include LICENSE
include AUTHORS.rst
include CHANGES.rst
include README.rst
include requirements*.txt
include .pytest.expect
include tox.ini
include pytest.ini
graft html5lib/tests/testdata
recursive-include html5lib/tests *.py


@@ -0,0 +1,151 @@
html5lib
========
.. image:: https://travis-ci.org/html5lib/html5lib-python.png?branch=master
:target: https://travis-ci.org/html5lib/html5lib-python
html5lib is a pure-python library for parsing HTML. It is designed to
conform to the WHATWG HTML specification, as is implemented by all major
web browsers.
Usage
-----
Simple usage follows this pattern:
.. code-block:: python
import html5lib
with open("mydocument.html", "rb") as f:
document = html5lib.parse(f)
or:
.. code-block:: python
import html5lib
document = html5lib.parse("<p>Hello World!")
By default, the ``document`` will be an ``xml.etree`` element instance.
Whenever possible, html5lib chooses the accelerated ``ElementTree``
implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
Two other tree types are supported: ``xml.dom.minidom`` and
``lxml.etree``. To use an alternative format, specify the name of
a treebuilder:
.. code-block:: python
import html5lib
with open("mydocument.html", "rb") as f:
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
When used with ``urllib2`` (Python 2), the charset from HTTP should be
passed into html5lib as follows:
.. code-block:: python
from contextlib import closing
from urllib2 import urlopen
import html5lib
with closing(urlopen("http://example.com/")) as f:
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
When used with ``urllib.request`` (Python 3), the charset from HTTP
should be passed into html5lib as follows:
.. code-block:: python
from urllib.request import urlopen
import html5lib
with urlopen("http://example.com/") as f:
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:
.. code-block:: python
import html5lib
with open("mydocument.html", "rb") as f:
parser = html5lib.HTMLParser(strict=True)
document = parser.parse(f)
When you're instantiating parser objects explicitly, pass a treebuilder
class as the ``tree`` keyword argument to use an alternative document
format:
.. code-block:: python
import html5lib
parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
minidom_document = parser.parse("<p>Hello World!")
More documentation is available at https://html5lib.readthedocs.io/.
Installation
------------
html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it,
use:
.. code-block:: bash
$ pip install html5lib
Optional Dependencies
---------------------
The following third-party libraries may be used for additional
functionality:
- ``datrie`` can be used under CPython to improve parsing performance
(though in almost all cases the improvement is marginal);
- ``lxml`` is supported as a tree format (for both building and
walking) under CPython (but *not* PyPy where it is known to cause
segfaults);
- ``genshi`` has a treewalker (but not builder); and
- ``chardet`` can be used as a fallback when character encoding cannot
be determined.
Bugs
----
Please report any bugs on the `issue tracker
<https://github.com/html5lib/html5lib-python/issues>`_.
Tests
-----
Unit tests require the ``pytest`` and ``mock`` libraries and can be
run using the ``py.test`` command in the root directory.
Test data are contained in a separate `html5lib-tests
<https://github.com/html5lib/html5lib-tests>`_ repository and included
as a submodule; for git checkouts it must therefore be initialized::
$ git submodule init
$ git submodule update
If you have all compatible Python implementations available on your
system, you can run tests on all of them using the ``tox`` utility,
which can be found on PyPI.
Questions?
----------
There's a mailing list available for support on Google Groups,
`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
though you may get a quicker response asking on IRC in `#whatwg on
irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.


@@ -0,0 +1,37 @@
from __future__ import print_function, unicode_literals
import platform
import sys
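# Basic facts about the running interpreter, included in the report below.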
info = {
"impl": platform.python_implementation(),
"version": platform.python_version(),
"revision": platform.python_revision(),
"maxunicode": sys.maxunicode,
"maxsize": sys.maxsize
}
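# Probe which relevant third-party modules are importable in this environment.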
search_modules = ["chardet", "datrie", "genshi", "html5lib", "lxml", "six"]
found_modules = []
for m in search_modules:
try:
__import__(m)
except ImportError:
pass
else:
found_modules.append(m)
info["modules"] = ", ".join(found_modules)
print("""html5lib debug info:
Python %(version)s (revision: %(revision)s)
Implementation: %(impl)s
sys.maxunicode: %(maxunicode)X
sys.maxsize: %(maxsize)X
Installed modules: %(modules)s""" % info)


@@ -0,0 +1,177 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/html5lib.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/html5lib.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/html5lib"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/html5lib"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."


@@ -0,0 +1,3 @@
.. :changelog:
.. include:: ../CHANGES.rst


@@ -0,0 +1,280 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# html5lib documentation build configuration file, created by
# sphinx-quickstart on Wed May 8 00:04:49 2013.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
# -- General configuration -----------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.viewcode']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'html5lib'
copyright = '2006 - 2013, James Graham, Geoffrey Sneddon, and contributors'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1.0'
# The full version, including alpha/beta/rc tags.
sys.path.append(os.path.abspath('..'))
from html5lib import __version__
release = __version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = 'en'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build', 'theme']
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
#html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'html5libdoc'
# -- Options for LaTeX output --------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'html5lib.tex', 'html5lib Documentation',
'James Graham, Geoffrey Sneddon, and contributors', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output --------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'html5lib', 'html5lib Documentation',
['James Graham, Geoffrey Sneddon, and contributors'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output ------------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'html5lib', 'html5lib Documentation',
'James Graham, Geoffrey Sneddon, and contributors', 'html5lib', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
class CExtMock(object):
"""Required for autodoc on readthedocs.org where you cannot build C extensions."""
def __init__(self, *args, **kwargs):
pass
def __call__(self, *args, **kwargs):
return CExtMock()
@classmethod
def __getattr__(cls, name):
if name in ('__file__', '__path__'):
return '/dev/null'
else:
return CExtMock()
try:
import lxml # flake8: noqa
except ImportError:
sys.modules['lxml'] = CExtMock()
sys.modules['lxml.etree'] = CExtMock()
print("warning: lxml modules mocked.")
try:
import genshi # flake8: noqa
except ImportError:
sys.modules['genshi'] = CExtMock()
sys.modules['genshi.core'] = CExtMock()
print("warning: genshi modules mocked.")


@@ -0,0 +1,58 @@
filters Package
===============
:mod:`base` Module
-------------------
.. automodule:: html5lib.filters.base
:members:
:show-inheritance:
:special-members: __init__
:mod:`alphabeticalattributes` Module
------------------------------------
.. automodule:: html5lib.filters.alphabeticalattributes
:members:
:show-inheritance:
:special-members: __init__
:mod:`inject_meta_charset` Module
---------------------------------
.. automodule:: html5lib.filters.inject_meta_charset
:members:
:show-inheritance:
:special-members: __init__
:mod:`lint` Module
------------------
.. automodule:: html5lib.filters.lint
:members:
:show-inheritance:
:special-members: __init__
:mod:`optionaltags` Module
--------------------------
.. automodule:: html5lib.filters.optionaltags
:members:
:show-inheritance:
:special-members: __init__
:mod:`sanitizer` Module
-----------------------
.. automodule:: html5lib.filters.sanitizer
:members:
:show-inheritance:
:special-members: __init__
:mod:`whitespace` Module
------------------------
.. automodule:: html5lib.filters.whitespace
:members:
:show-inheritance:
:special-members: __init__


@@ -0,0 +1,38 @@
html5lib Package
================
.. automodule:: html5lib
:members: __version__
:mod:`constants` Module
-----------------------
.. automodule:: html5lib.constants
:members:
:show-inheritance:
:mod:`html5parser` Module
-------------------------
.. automodule:: html5lib.html5parser
:members:
:show-inheritance:
:special-members: __init__
:mod:`serializer` Module
------------------------
.. automodule:: html5lib.serializer
:members:
:show-inheritance:
:special-members: __init__
Subpackages
-----------
.. toctree::
html5lib.filters
html5lib.treebuilders
html5lib.treewalkers
html5lib.treeadapters


@@ -0,0 +1,20 @@
treeadapters Package
====================
:mod:`~html5lib.treeadapters` Package
-------------------------------------
.. automodule:: html5lib.treeadapters
:members:
:show-inheritance:
:special-members: __init__
.. automodule:: html5lib.treeadapters.genshi
:members:
:show-inheritance:
:special-members: __init__
.. automodule:: html5lib.treeadapters.sax
:members:
:show-inheritance:
:special-members: __init__


@@ -0,0 +1,42 @@
treebuilders Package
====================
:mod:`treebuilders` Package
---------------------------
.. automodule:: html5lib.treebuilders
:members:
:show-inheritance:
:special-members: __init__
:mod:`base` Module
-------------------
.. automodule:: html5lib.treebuilders.base
:members:
:show-inheritance:
:special-members: __init__
:mod:`dom` Module
-----------------
.. automodule:: html5lib.treebuilders.dom
:members:
:show-inheritance:
:special-members: __init__
:mod:`etree` Module
-------------------
.. automodule:: html5lib.treebuilders.etree
:members:
:show-inheritance:
:special-members: __init__
:mod:`etree_lxml` Module
------------------------
.. automodule:: html5lib.treebuilders.etree_lxml
:members:
:show-inheritance:
:special-members: __init__


@@ -0,0 +1,50 @@
treewalkers Package
===================
:mod:`treewalkers` Package
--------------------------
.. automodule:: html5lib.treewalkers
:members:
:show-inheritance:
:special-members: __init__
:mod:`base` Module
------------------
.. automodule:: html5lib.treewalkers.base
:members:
:show-inheritance:
:special-members: __init__
:mod:`dom` Module
-----------------
.. automodule:: html5lib.treewalkers.dom
:members:
:show-inheritance:
:special-members: __init__
:mod:`etree` Module
-------------------
.. automodule:: html5lib.treewalkers.etree
:members:
:show-inheritance:
:special-members: __init__
:mod:`etree_lxml` Module
------------------------
.. automodule:: html5lib.treewalkers.etree_lxml
:members:
:show-inheritance:
:special-members: __init__
:mod:`genshi` Module
--------------------
.. automodule:: html5lib.treewalkers.genshi
:members:
:show-inheritance:
:special-members: __init__


@@ -0,0 +1,22 @@
Overview
========
.. include:: ../README.rst
:start-line: 6
.. toctree::
:maxdepth: 2
movingparts
modules
changes
License <license>
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`


@@ -0,0 +1,4 @@
License
=======
.. include:: ../LICENSE


@@ -0,0 +1,242 @@
@ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. xml to make Docutils-native XML files
echo. pseudoxml to make pseudoxml-XML files for display purposes
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\html5lib.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\html5lib.qhc
goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdf" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdfja" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf-ja
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
if "%1" == "xml" (
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The XML files are in %BUILDDIR%/xml.
goto end
)
if "%1" == "pseudoxml" (
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
goto end
)
:end


@@ -0,0 +1,7 @@
html5lib
========
.. toctree::
:maxdepth: 4
html5lib


@@ -0,0 +1,165 @@
The moving parts
================
html5lib consists of a number of components, which are responsible for
handling its features.
Parsing uses a *tree builder* to generate a *tree*, the in-memory representation of the document.
Several tree representations are supported, as are translations to other formats via *tree adapters*.
The tree may be translated to a token stream with a *tree walker*, from which :class:`~html5lib.serializer.HTMLSerializer` produces a stream of bytes.
The token stream may also be transformed by use of *filters* to accomplish tasks like sanitization.
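A minimal sketch of that whole pipeline, assembled from the pieces
described in the sections below (the ``whitespace`` filter is just one
example of a stream transformation):

.. code-block:: python

    import html5lib
    from html5lib.filters import whitespace
    from html5lib.serializer import HTMLSerializer

    tree = html5lib.parse("<p>Hello     World!")   # tree builder
    walker = html5lib.getTreeWalker("etree")       # tree walker
    tokens = whitespace.Filter(walker(tree))       # filter: collapse whitespace runs
    print("".join(HTMLSerializer().serialize(tokens)))  # serializer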
Tree builders
-------------
The parser reads HTML by tokenizing the content and building a tree that
the user can later access. html5lib can build three types of trees:
* ``etree`` - this is the default; builds a tree based on :mod:`xml.etree`,
which can be found in the standard library. Whenever possible, the
accelerated ``ElementTree`` implementation (i.e.
``xml.etree.cElementTree`` on Python 2.x) is used.
* ``dom`` - builds a tree based on :mod:`xml.dom.minidom`.
* ``lxml`` - uses the :mod:`lxml.etree` implementation of the ``ElementTree``
API. The performance gains are relatively small compared to using the
accelerated ``ElementTree`` module.
You can specify the builder by name when using the shorthand API:
.. code-block:: python
import html5lib
with open("mydocument.html", "rb") as f:
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
To get a builder class by name, use the :func:`~html5lib.treebuilders.getTreeBuilder` function.
When instantiating a :class:`~html5lib.html5parser.HTMLParser` object, you must pass a tree builder class via the ``tree`` keyword argument:
.. code-block:: python
import html5lib
TreeBuilder = html5lib.getTreeBuilder("dom")
parser = html5lib.HTMLParser(tree=TreeBuilder)
minidom_document = parser.parse("<p>Hello World!")
The implementation of builders can be found in `html5lib/treebuilders/
<https://github.com/html5lib/html5lib-python/tree/master/html5lib/treebuilders>`_.
Tree walkers
------------
In addition to manipulating a tree directly, you can use a tree walker to generate a streaming view of it.
html5lib provides walkers for ``etree``, ``dom``, and ``lxml`` trees, as well as ``genshi`` `markup streams <https://genshi.edgewall.org/wiki/Documentation/streams.html>`_.
The implementation of walkers can be found in `html5lib/treewalkers/
<https://github.com/html5lib/html5lib-python/tree/master/html5lib/treewalkers>`_.
html5lib provides :class:`~html5lib.serializer.HTMLSerializer` for generating a stream of bytes from a token stream, and several filters which manipulate the stream.
HTMLSerializer
~~~~~~~~~~~~~~
The serializer lets you write HTML back as a stream of bytes.
.. code-block:: pycon
>>> import html5lib
>>> element = html5lib.parse('<p xml:lang="pl">Witam wszystkich')
>>> walker = html5lib.getTreeWalker("etree")
>>> stream = walker(element)
>>> s = html5lib.serializer.HTMLSerializer()
>>> output = s.serialize(stream)
>>> for item in output:
... print("%r" % item)
'<p'
' '
'xml:lang'
'='
'pl'
'>'
'Witam wszystkich'
You can customize the serializer behaviour in a variety of ways. Consult
the :class:`~html5lib.serializer.HTMLSerializer` documentation.
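For example, a minimal sketch combining two such options
(``omit_optional_tags`` keeps implied tags in the output, and
``quote_attr_values`` takes ``"always"``, ``"legacy"``, or ``"spec"``, as
noted in the changelog):

.. code-block:: python

    import html5lib

    element = html5lib.parse("<p class=greeting>Hi")
    walker = html5lib.getTreeWalker("etree")
    # Keep optional tags such as <html> and <body>, and always quote
    # attribute values.
    s = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                           quote_attr_values="always")
    print("".join(s.serialize(walker(element))))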
Filters
~~~~~~~
html5lib provides several filters:
* :class:`alphabeticalattributes.Filter
<html5lib.filters.alphabeticalattributes.Filter>` sorts attributes on
tags to be in alphabetical order
* :class:`inject_meta_charset.Filter
<html5lib.filters.inject_meta_charset.Filter>` sets a user-specified
encoding in the correct ``<meta>`` tag in the ``<head>`` section of
the document
* :class:`lint.Filter <html5lib.filters.lint.Filter>` raises
:exc:`AssertionError` exceptions on invalid tag and attribute names, invalid
PCDATA, etc.
* :class:`optionaltags.Filter <html5lib.filters.optionaltags.Filter>`
removes tags from the token stream which are not necessary to produce valid
HTML
* :class:`sanitizer.Filter <html5lib.filters.sanitizer.Filter>` removes
unsafe markup and CSS. Elements that are known to be safe are passed
through and the rest is converted to visible text. The default
configuration of the sanitizer follows the `WHATWG Sanitization Rules
<http://wiki.whatwg.org/wiki/Sanitization_rules>`_.
* :class:`whitespace.Filter <html5lib.filters.whitespace.Filter>`
collapses all whitespace characters to single spaces unless they're in
``<pre/>`` or ``<textarea/>`` tags.
To use a filter, simply wrap it around a token stream:
.. code-block:: python
>>> import html5lib
>>> from html5lib.filters import sanitizer
>>> dom = html5lib.parse("<p><script>alert('Boo!')", treebuilder="dom")
>>> walker = html5lib.getTreeWalker("dom")
>>> stream = walker(dom)
>>> clean_stream = sanitizer.Filter(stream)
Tree adapters
-------------
Tree adapters can be used to translate between tree formats.
Two adapters are provided by html5lib:
* :func:`html5lib.treeadapters.genshi.to_genshi()` generates a `Genshi markup stream <https://genshi.edgewall.org/wiki/Documentation/streams.html>`_.
* :func:`html5lib.treeadapters.sax.to_sax()` calls a SAX handler based on the tree.
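A minimal sketch of the SAX adapter (the ``Counter`` handler is a
placeholder consumer, and this assumes ``to_sax`` emits namespaced SAX
events such as ``startElementNS``):

.. code-block:: python

    import html5lib
    from html5lib.treeadapters import sax
    from xml.sax.handler import ContentHandler

    class Counter(ContentHandler):
        """Stand-in SAX consumer that counts element-start events."""
        def __init__(self):
            ContentHandler.__init__(self)
            self.count = 0

        def startElementNS(self, name, qname, attrs):
            self.count += 1

    doc = html5lib.parse("<p>Hello World!")
    walker = html5lib.getTreeWalker("etree")
    handler = Counter()
    sax.to_sax(walker(doc), handler)
    print(handler.count)  # one startElementNS per element: html, head, body, p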
Encoding discovery
------------------
Parsed trees are always Unicode. However, a large variety of input
encodings are supported. The encoding of the document is determined in
the following way (a short sketch follows this list):
* The encoding may be explicitly specified by passing the name of the
  encoding (via keyword arguments such as ``transport_encoding`` or
  ``override_encoding``) to the
  :meth:`~html5lib.html5parser.HTMLParser.parse` method on
  :class:`~html5lib.html5parser.HTMLParser` objects.
* If no encoding is specified, the parser will attempt to detect the
encoding from a ``<meta>`` element in the first 512 bytes of the
document (this is only a partial implementation of the current HTML
specification).
* If no encoding can be found and the :mod:`chardet` library is available, an
attempt will be made to sniff the encoding from the byte pattern.
* If all else fails, the default encoding will be used. This is usually
`Windows-1252 <http://en.wikipedia.org/wiki/Windows-1252>`_, which is
a common fallback used by Web browsers.
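As a short sketch of the first case (the byte string and charset here
are illustrative; ``transport_encoding`` is the same parameter the
README demonstrates for use with ``urllib``):

.. code-block:: python

    import html5lib

    # Bytes whose encoding is known out of band, e.g. from an HTTP header.
    raw = "<p>café".encode("windows-1252")
    doc = html5lib.parse(raw, transport_encoding="windows-1252")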


@@ -0,0 +1,9 @@
#!/bin/bash -e
if [[ ! -x $(which flake8) ]]; then
echo "fatal: flake8 not found on $PATH. Exiting."
exit 1
fi
flake8 `dirname $0`
exit $?


@@ -0,0 +1,35 @@
"""
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.
Example usage::
import html5lib
with open("my_document.html", "rb") as f:
tree = html5lib.parse(f)
For convenience, this module re-exports the following names:
* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""
from __future__ import absolute_import, division, unicode_literals
from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.0.1"


@@ -0,0 +1,288 @@
from __future__ import absolute_import, division, unicode_literals
import re
import warnings
from .constants import DataLossWarning
baseChar = """
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""
digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic])
# Without the ":" that XML 1.0 NameChar would otherwise allow
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\dA-F]{4})")
reCharRange = re.compile(r"\[#x([\dA-F]{4})-#x([\dA-F]{4})\]")
def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange):
match = regexp.match(item)
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1] * 2
foundMatch = True
break
if not foundMatch:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv
def normaliseCharList(charList):
charList = sorted(charList)
for item in charList:
assert item[1] >= item[0]
rv = []
i = 0
while i < len(charList):
j = 1
rv.append(charList[i])
while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i + j][1]
j += 1
i += j
return rv
# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
def missingRanges(charList):
rv = []
if charList[0][0] != 0:
rv.append([0, charList[0][0] - 1])
for i, item in enumerate(charList[:-1]):
rv.append([item[1] + 1, charList[i + 1][0] - 1])
if charList[-1][1] != max_unicode:
rv.append([charList[-1][1] + 1, max_unicode])
return rv
def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(escapeRegexp(chr(item[0])))
else:
rv.append(escapeRegexp(chr(item[0])) + "-" +
escapeRegexp(chr(item[1])))
return "[%s]" % "".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
string = string.replace(char, "\\" + char)
return string
# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
class InfosetFilter(object):
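"""Coerces an HTML infoset, which permits names and text that XML 1.0
forbids, into an XML-compatible one, issuing DataLossWarning whenever a
lossy substitution has to be made."""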
replacementRegexp = re.compile(r"U[\dA-F]{5}")
def __init__(self,
dropXmlnsLocalName=False,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
preventDashAtCommentEnd=False,
replaceFormFeedCharacters=True,
preventSingleQuotePubid=False):
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
self.preventDoubleDashComments = preventDoubleDashComments
self.preventDashAtCommentEnd = preventDashAtCommentEnd
self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.preventSingleQuotePubid = preventSingleQuotePubid
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
return None
elif (self.dropXmlnsAttrNs and
namespace == "http://www.w3.org/2000/xmlns/"):
warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
return None
else:
return self.toXmlName(name)
def coerceElement(self, name):
return self.toXmlName(name)
def coerceComment(self, data):
if self.preventDoubleDashComments:
while "--" in data:
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
data = data.replace("--", "- -")
if data.endswith("-"):
warnings.warn("Comments cannot end in a dash", DataLossWarning)
data += " "
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ")
# Other non-xml characters
return data
def coercePubid(self, data):
dataOutput = data
for char in nonPubidCharRegexp.findall(data):
warnings.warn("Coercing non-XML pubid", DataLossWarning)
replacement = self.getReplacementCharacter(char)
dataOutput = dataOutput.replace(char, replacement)
if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
warnings.warn("Pubid cannot contain single quote", DataLossWarning)
dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
return dataOutput
def toXmlName(self, name):
nameFirst = name[0]
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
warnings.warn("Coercing non-XML name", DataLossWarning)
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
warnings.warn("Coercing non-XML name", DataLossWarning)
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput
def getReplacementCharacter(self, char):
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
return replacement
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
name = name.replace(item, self.unescapeChar(item))
return name
def escapeChar(self, char):
replacement = "U%05X" % ord(char)
self.replaceCache[char] = replacement
return replacement
def unescapeChar(self, charcode):
return chr(int(charcode[1:], 16))

View file

@ -0,0 +1,923 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, binary_type
from six.moves import http_client, urllib
import codecs
import re
import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
from . import _utils
from io import StringIO
try:
from io import BytesIO
except ImportError:
BytesIO = StringIO
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
"]")
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
class BufferedStream(object):
"""Buffering for streams that do not have buffering of their own
The buffer is implemented as a list of chunks on the assumption that
joining many strings will be slow since it is O(n**2)
"""
def __init__(self, stream):
self.stream = stream
self.buffer = []
self.position = [-1, 0] # chunk number, offset
def tell(self):
pos = 0
for chunk in self.buffer[:self.position[0]]:
pos += len(chunk)
pos += self.position[1]
return pos
def seek(self, pos):
assert pos <= self._bufferedBytes()
offset = pos
i = 0
while len(self.buffer[i]) < offset:
offset -= len(self.buffer[i])
i += 1
self.position = [i, offset]
def read(self, bytes):
if not self.buffer:
return self._readStream(bytes)
elif (self.position[0] == len(self.buffer) and
self.position[1] == len(self.buffer[-1])):
return self._readStream(bytes)
else:
return self._readFromBuffer(bytes)
def _bufferedBytes(self):
return sum([len(item) for item in self.buffer])
def _readStream(self, bytes):
data = self.stream.read(bytes)
self.buffer.append(data)
self.position[0] += 1
self.position[1] = len(data)
return data
def _readFromBuffer(self, bytes):
remainingBytes = bytes
rv = []
bufferIndex = self.position[0]
bufferOffset = self.position[1]
while bufferIndex < len(self.buffer) and remainingBytes != 0:
assert remainingBytes > 0
bufferedData = self.buffer[bufferIndex]
if remainingBytes <= len(bufferedData) - bufferOffset:
bytesToRead = remainingBytes
self.position = [bufferIndex, bufferOffset + bytesToRead]
else:
bytesToRead = len(bufferedData) - bufferOffset
self.position = [bufferIndex, len(bufferedData)]
bufferIndex += 1
rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
remainingBytes -= bytesToRead
bufferOffset = 0
if remainingBytes:
rv.append(self._readStream(remainingBytes))
return b"".join(rv)
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
# Also check for addinfourl wrapping HTTPResponse
(isinstance(source, urllib.response.addbase) and
isinstance(source.fp, http_client.HTTPResponse))):
isUnicode = False
elif hasattr(source, "read"):
isUnicode = isinstance(source.read(0), text_type)
else:
isUnicode = isinstance(source, text_type)
if isUnicode:
encodings = [x for x in kwargs if x.endswith("_encoding")]
if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
return HTMLUnicodeInputStream(source, **kwargs)
else:
return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
incorrect byte-sequences and also provides column and line tracking.
"""
_defaultChunkSize = 10240
def __init__(self, source):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by html5lib.
source can be either a file-object, local filename or a string.
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
else:
self.reportCharacterErrors = self.characterErrorsUCS2
# List of where new lines occur
self.newLines = [0]
self.charEncoding = (lookupEncoding("utf-8"), "certain")
self.dataStream = self.openStream(source)
self.reset()
def reset(self):
self.chunk = ""
self.chunkSize = 0
self.chunkOffset = 0
self.errors = []
# number of (complete) lines in previous chunks
self.prevNumLines = 0
# number of columns in the last line of the previous chunk
self.prevNumCols = 0
# Deal with CR LF and surrogates split over chunk boundaries
self._bufferedCharacter = None
def openStream(self, source):
"""Produces a file object from source.
source can be either a file object, local filename or a string.
"""
# Already a file object
if hasattr(source, 'read'):
stream = source
else:
stream = StringIO(source)
return stream
def _position(self, offset):
chunk = self.chunk
nLines = chunk.count('\n', 0, offset)
positionLine = self.prevNumLines + nLines
lastLinePos = chunk.rfind('\n', 0, offset)
if lastLinePos == -1:
positionColumn = self.prevNumCols + offset
else:
positionColumn = offset - (lastLinePos + 1)
return (positionLine, positionColumn)
def position(self):
"""Returns (line, col) of the current position in the stream."""
line, col = self._position(self.chunkOffset)
return (line + 1, col)
def char(self):
""" Read one character from the stream or queue if available. Return
EOF when EOF is reached.
"""
# Read a new chunk from the input stream if necessary
if self.chunkOffset >= self.chunkSize:
if not self.readChunk():
return EOF
chunkOffset = self.chunkOffset
char = self.chunk[chunkOffset]
self.chunkOffset = chunkOffset + 1
return char
def readChunk(self, chunkSize=None):
if chunkSize is None:
chunkSize = self._defaultChunkSize
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
self.chunk = ""
self.chunkSize = 0
self.chunkOffset = 0
data = self.dataStream.read(chunkSize)
# Deal with CR LF and surrogates broken across chunks
if self._bufferedCharacter:
data = self._bufferedCharacter + data
self._bufferedCharacter = None
elif not data:
# We have no more data, bye-bye stream
return False
if len(data) > 1:
lastv = ord(data[-1])
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
self._bufferedCharacter = data[-1]
data = data[:-1]
if self.reportCharacterErrors:
self.reportCharacterErrors(data)
# Replace invalid characters
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
self.chunk = data
self.chunkSize = len(data)
return True
def characterErrorsUCS4(self, data):
for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
# Someone picked the wrong compile option
# You lose
skip = False
for match in invalid_unicode_re.finditer(data):
if skip:
# only skip the low surrogate of the pair just handled
skip = False
continue
codepoint = ord(match.group())
pos = match.start()
# Pretty sure there should be endianness issues here
if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair!
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
pos == len(data) - 1):
self.errors.append("invalid-codepoint")
else:
skip = False
self.errors.append("invalid-codepoint")
def charsUntil(self, characters, opposite=False):
""" Returns a string of characters from the stream up to but not
including any character in 'characters' or EOF. 'characters' must be
a container that supports the 'in' method and iteration over its
characters.
"""
# Use a cache of regexps to find the required characters
try:
chars = charsUntilRegEx[(characters, opposite)]
except KeyError:
if __debug__:
for c in characters:
assert(ord(c) < 128)
regex = "".join(["\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = "^%s" % regex
chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
rv = []
while True:
# Find the longest matching prefix
m = chars.match(self.chunk, self.chunkOffset)
if m is None:
# If nothing matched, and it wasn't because we ran out of chunk,
# then stop
if self.chunkOffset != self.chunkSize:
break
else:
end = m.end()
# If not the whole chunk matched, return everything
# up to the part that didn't match
if end != self.chunkSize:
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole remainder of the chunk matched,
# use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
r = "".join(rv)
return r
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
if char is not None:
if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently
# called char and charsUntil.
# So, just prepend the ungotten character onto the current
# chunk:
self.chunk = char + self.chunk
self.chunkSize += 1
else:
self.chunkOffset -= 1
assert self.chunk[self.chunkOffset] == char
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
incorrect byte-sequences and also provides column and line tracking.
"""
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by html5lib.
source can be either a file-object, local filename or a string.
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)
HTMLUnicodeInputStream.__init__(self, self.rawStream)
# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 1024
# Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None
# Call superclass
self.reset()
def reset(self):
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
HTMLUnicodeInputStream.reset(self)
def openStream(self, source):
"""Produces a file object from source.
source can be either a file object, local filename or a string.
"""
# Already a file object
if hasattr(source, 'read'):
stream = source
else:
stream = BytesIO(source)
try:
stream.seek(stream.tell())
except: # pylint:disable=bare-except
stream = BufferedStream(stream)
return stream
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding
# If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding
# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Guess with chardet, if available
if chardet:
try:
from chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
buffer = self.rawStream.read(self.numBytesChardet)
assert isinstance(buffer, bytes)
if not buffer:
break
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
if encoding is not None:
return encoding, "tentative"
# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"
def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
newEncoding = lookupEncoding(newEncoding)
if newEncoding is None:
return
if newEncoding.name in ("utf-16be", "utf-16le"):
newEncoding = lookupEncoding("utf-8")
assert newEncoding is not None
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.charEncoding = (newEncoding, "certain")
self.reset()
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
string = self.rawStream.read(4)
assert isinstance(string, bytes)
# Try detecting the BOM using bytes from the string
encoding = bomDict.get(string[:3]) # UTF-8
seek = 3
if not encoding:
# Need to detect UTF-32 before UTF-16
encoding = bomDict.get(string) # UTF-32
seek = 4
if not encoding:
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
if encoding:
self.rawStream.seek(seek)
return lookupEncoding(encoding)
else:
self.rawStream.seek(0)
return None
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
buffer = self.rawStream.read(self.numBytesMeta)
assert isinstance(buffer, bytes)
parser = EncodingParser(buffer)
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
encoding = lookupEncoding("utf-8")
return encoding
class EncodingBytes(bytes):
"""String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
def __new__(cls, value):
assert isinstance(value, bytes)
return bytes.__new__(cls, value.lower())
def __init__(self, value):
# pylint:disable=unused-argument
self._position = -1
def __iter__(self):
return self
def __next__(self):
p = self._position = self._position + 1
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
return self[p:p + 1]
def next(self):
# Py2 compat
return self.__next__()
def previous(self):
p = self._position
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
self._position = p = p - 1
return self[p:p + 1]
def setPosition(self, position):
if self._position >= len(self):
raise StopIteration
self._position = position
def getPosition(self):
if self._position >= len(self):
raise StopIteration
if self._position >= 0:
return self._position
else:
return None
position = property(getPosition, setPosition)
def getCurrentByte(self):
return self[self.position:self.position + 1]
currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharactersBytes):
"""Skip past a list of characters"""
p = self.position # use property for the error-checking
while p < len(self):
c = self[p:p + 1]
if c not in chars:
self._position = p
return c
p += 1
self._position = p
return None
def skipUntil(self, chars):
p = self.position
while p < len(self):
c = self[p:p + 1]
if c in chars:
self._position = p
return c
p += 1
self._position = p
return None
def matchBytes(self, bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
p = self.position
data = self[p:p + len(bytes)]
rv = data.startswith(bytes)
if rv:
self.position += len(bytes)
return rv
def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
# XXX: This is ugly, but I can't see a nicer way to fix this.
if self._position == -1:
self._position = 0
self._position += (newPosition + len(bytes) - 1)
return True
else:
raise StopIteration
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
def __init__(self, data):
"""string - the data to work on for encoding detection"""
self.data = EncodingBytes(data)
self.encoding = None
def getEncoding(self):
methodDispatch = (
(b"<!--", self.handleComment),
(b"<meta", self.handleMeta),
(b"</", self.handlePossibleEndTag),
(b"<!", self.handleOther),
(b"<?", self.handleOther),
(b"<", self.handlePossibleStartTag))
for _ in self.data:
keepParsing = True
for key, method in methodDispatch:
if self.data.matchBytes(key):
try:
keepParsing = method()
break
except StopIteration:
keepParsing = False
break
if not keepParsing:
break
return self.encoding
def handleComment(self):
"""Skip over comments"""
return self.data.jumpTo(b"-->")
def handleMeta(self):
if self.data.currentByte not in spaceCharactersBytes:
# if <meta is not followed by a space it does not count; just keep going
return True
# We have a valid meta element we want to search for attributes
hasPragma = False
pendingEncoding = None
while True:
# Try to find the next attribute after the current position
attr = self.getAttribute()
if attr is None:
return True
else:
if attr[0] == b"http-equiv":
hasPragma = attr[1] == b"content-type"
if hasPragma and pendingEncoding is not None:
self.encoding = pendingEncoding
return False
elif attr[0] == b"charset":
tentativeEncoding = attr[1]
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
elif attr[0] == b"content":
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if tentativeEncoding is not None:
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
if hasPragma:
self.encoding = codec
return False
else:
pendingEncoding = codec
def handlePossibleStartTag(self):
return self.handlePossibleTag(False)
def handlePossibleEndTag(self):
next(self.data)
return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag):
data = self.data
if data.currentByte not in asciiLettersBytes:
# If the next byte is not an ascii letter either ignore this
# fragment (possible start tag case) or treat it according to
# handleOther
if endTag:
data.previous()
self.handleOther()
return True
c = data.skipUntil(spacesAngleBrackets)
if c == b"<":
# return to the first step in the overall "two step" algorithm
# reprocessing the < byte
data.previous()
else:
# Read all attributes
attr = self.getAttribute()
while attr is not None:
attr = self.getAttribute()
return True
def handleOther(self):
return self.data.jumpTo(b">")
def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
data = self.data
# Step 1 (skip chars)
c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
assert c is None or len(c) == 1
# Step 2
if c in (b">", None):
return None
# Step 3
attrName = []
attrValue = []
# Step 4 attribute name
while True:
if c == b"=" and attrName:
break
elif c in spaceCharactersBytes:
# Step 6!
c = data.skip()
break
elif c in (b"/", b">"):
return b"".join(attrName), b""
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
elif c is None:
return None
else:
attrName.append(c)
# Step 5
c = next(data)
# Step 7
if c != b"=":
data.previous()
return b"".join(attrName), b""
# Step 8
next(data)
# Step 9
c = data.skip()
# Step 10
if c in (b"'", b'"'):
# 10.1
quoteChar = c
while True:
# 10.2
c = next(data)
# 10.3
if c == quoteChar:
next(data)
return b"".join(attrName), b"".join(attrValue)
# 10.4
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
# 10.5
else:
attrValue.append(c)
elif c == b">":
return b"".join(attrName), b""
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
# Step 11
while True:
c = next(data)
if c in spacesAngleBrackets:
return b"".join(attrName), b"".join(attrValue)
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
class ContentAttrParser(object):
def __init__(self, data):
assert isinstance(data, bytes)
self.data = data
def parse(self):
try:
# Check if the attr name is charset
# otherwise return
self.data.jumpTo(b"charset")
self.data.position += 1
self.data.skip()
if not self.data.currentByte == b"=":
# If there is no = sign keep looking for attrs
return None
self.data.position += 1
self.data.skip()
# Look for an encoding between matching quote marks
if self.data.currentByte in (b'"', b"'"):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
if self.data.jumpTo(quoteMark):
return self.data[oldPosition:self.data.position]
else:
return None
else:
# Unquoted value
oldPosition = self.data.position
try:
self.data.skipUntil(spaceCharactersBytes)
return self.data[oldPosition:self.data.position]
except StopIteration:
# Return the whole remaining value
return self.data[oldPosition:]
except StopIteration:
return None
def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if isinstance(encoding, binary_type):
try:
encoding = encoding.decode("ascii")
except UnicodeDecodeError:
return None
if encoding is not None:
try:
return webencodings.lookup(encoding)
except AttributeError:
return None
else:
return None
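# For example, lookupEncoding("UTF8").name == "utf-8", while
# lookupEncoding("bogus") returns None.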

File diff suppressed because it is too large

View file

@ -0,0 +1,14 @@
from __future__ import absolute_import, division, unicode_literals
from .py import Trie as PyTrie
Trie = PyTrie
# pylint:disable=wrong-import-position
try:
from .datrie import Trie as DATrie
except ImportError:
pass
else:
Trie = DATrie
# pylint:enable=wrong-import-position

View file

@ -0,0 +1,37 @@
from __future__ import absolute_import, division, unicode_literals
try:
from collections.abc import Mapping
except ImportError:  # Python 2
from collections import Mapping
class Trie(Mapping):
"""Abstract base class for tries"""
def keys(self, prefix=None):
# pylint:disable=arguments-differ
keys = super(Trie, self).keys()
if prefix is None:
return set(keys)
return {x for x in keys if x.startswith(prefix)}
def has_keys_with_prefix(self, prefix):
for key in self.keys():
if key.startswith(prefix):
return True
return False
def longest_prefix(self, prefix):
if prefix in self:
return prefix
for i in range(1, len(prefix) + 1):
if prefix[:-i] in self:
return prefix[:-i]
raise KeyError(prefix)
def longest_prefix_item(self, prefix):
lprefix = self.longest_prefix(prefix)
return (lprefix, self[lprefix])

View file

@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals
from datrie import Trie as DATrie
from six import text_type
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
def __init__(self, data):
chars = set()
for key in data.keys():
if not isinstance(key, text_type):
raise TypeError("All keys must be strings")
for char in key:
chars.add(char)
self._data = DATrie("".join(chars))
for key, value in data.items():
self._data[key] = value
def __contains__(self, key):
return key in self._data
def __len__(self):
return len(self._data)
def __iter__(self):
raise NotImplementedError()
def __getitem__(self, key):
return self._data[key]
def keys(self, prefix=None):
return self._data.keys(prefix)
def has_keys_with_prefix(self, prefix):
return self._data.has_keys_with_prefix(prefix)
def longest_prefix(self, prefix):
return self._data.longest_prefix(prefix)
def longest_prefix_item(self, prefix):
return self._data.longest_prefix_item(prefix)

View file

@ -0,0 +1,67 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from bisect import bisect_left
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
def __init__(self, data):
if not all(isinstance(x, text_type) for x in data.keys()):
raise TypeError("All keys must be strings")
self._data = data
self._keys = sorted(data.keys())
self._cachestr = ""
self._cachepoints = (0, len(data))
def __contains__(self, key):
return key in self._data
def __len__(self):
return len(self._data)
def __iter__(self):
return iter(self._data)
def __getitem__(self, key):
return self._data[key]
def keys(self, prefix=None):
if prefix is None or prefix == "" or not self._keys:
return set(self._keys)
if prefix.startswith(self._cachestr):
lo, hi = self._cachepoints
start = i = bisect_left(self._keys, prefix, lo, hi)
else:
start = i = bisect_left(self._keys, prefix)
keys = set()
if start == len(self._keys):
return keys
while i < len(self._keys) and self._keys[i].startswith(prefix):
keys.add(self._keys[i])
i += 1
self._cachestr = prefix
self._cachepoints = (start, i)
return keys
def has_keys_with_prefix(self, prefix):
if prefix in self._data:
return True
if prefix.startswith(self._cachestr):
lo, hi = self._cachepoints
i = bisect_left(self._keys, prefix, lo, hi)
else:
i = bisect_left(self._keys, prefix)
if i == len(self._keys):
return False
return self._keys[i].startswith(prefix)
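# Example of the shared ABC behaviour with this implementation:
#   Trie({"foo": 1, "foobar": 2}).longest_prefix("foobarbaz")  # -> "foobar"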

View file

@ -0,0 +1,124 @@
from __future__ import absolute_import, division, unicode_literals
from types import ModuleType
from six import text_type
try:
import xml.etree.cElementTree as default_etree
except ImportError:
import xml.etree.ElementTree as default_etree
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
_x = eval('"\\uD800"') # pylint:disable=eval-used
if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"') # pylint:disable=eval-used
assert isinstance(_x, text_type)
except: # pylint:disable=bare-except
supports_lone_surrogates = False
else:
supports_lone_surrogates = True
class MethodDispatcher(dict):
"""Dict with 2 special properties:
On initialization, keys that are lists, sets or tuples are converted to
multiple keys so accessing any one of the items in the original
list-like object returns the matching value
md = MethodDispatcher({("foo", "bar"):"baz"})
md["foo"] == "baz"
A default value which can be set through the default attribute.
"""
def __init__(self, items=()):
# Using _dictEntries instead of directly assigning to self is about
# twice as fast. Please do careful performance testing before changing
# anything here.
_dictEntries = []
for name, value in items:
if isinstance(name, (list, tuple, frozenset, set)):
for item in name:
_dictEntries.append((item, value))
else:
_dictEntries.append((name, value))
dict.__init__(self, _dictEntries)
assert len(self) == len(_dictEntries)
self.default = None
def __getitem__(self, key):
return dict.get(self, key, self.default)
# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds
def isSurrogatePair(data):
return (len(data) == 2 and
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
def surrogatePairToCodepoint(data):
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
(ord(data[1]) - 0xDC00))
return char_val
# Module Factory Factory (no, this isn't Java, I know)
# Here to stop this being duplicated all over the place.
def moduleFactoryFactory(factory):
moduleCache = {}
def moduleFactory(baseModule, *args, **kwargs):
if isinstance(ModuleType.__name__, type("")):
name = "_%s_factory" % baseModule.__name__
else:
name = b"_%s_factory" % baseModule.__name__
kwargs_tuple = tuple(kwargs.items())
try:
return moduleCache[name][args][kwargs_tuple]
except KeyError:
mod = ModuleType(name)
objs = factory(baseModule, *args, **kwargs)
mod.__dict__.update(objs)
if "name" not in moduleCache:
moduleCache[name] = {}
if "args" not in moduleCache[name]:
moduleCache[name][args] = {}
if "kwargs" not in moduleCache[name][args]:
moduleCache[name][args][kwargs_tuple] = {}
moduleCache[name][args][kwargs_tuple] = mod
return mod
return moduleFactory
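# Unbounded memoisation decorator; positional and keyword argument values
# must be hashable, since they form the cache key.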
def memoize(func):
cache = {}
def wrapped(*args, **kwargs):
key = (tuple(args), tuple(kwargs.items()))
if key not in cache:
cache[key] = func(*args, **kwargs)
return cache[key]
return wrapped

File diff suppressed because it is too large

View file

@ -0,0 +1,29 @@
from __future__ import absolute_import, division, unicode_literals
from . import base
from collections import OrderedDict
def _attr_key(attr):
"""Return an appropriate key for an attribute for sorting
Attributes have a namespace that can be either ``None`` or a string. We
can't compare the two because they're different types, so we convert
``None`` to an empty string first.
"""
return (attr[0][0] or ''), attr[0][1]
class Filter(base.Filter):
"""Alphabetizes attributes for elements"""
def __iter__(self):
for token in base.Filter.__iter__(self):
if token["type"] in ("StartTag", "EmptyTag"):
attrs = OrderedDict()
for name, value in sorted(token["data"].items(),
key=_attr_key):
attrs[name] = value
token["data"] = attrs
yield token

View file

@ -0,0 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
class Filter(object):
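"""Base class for html5lib filters: wraps a token stream and delegates
any attribute it does not define to the wrapped source."""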
def __init__(self, source):
self.source = source
def __iter__(self):
return iter(self.source)
def __getattr__(self, name):
return getattr(self.source, name)

View file

@ -0,0 +1,73 @@
from __future__ import absolute_import, division, unicode_literals
from . import base
class Filter(base.Filter):
"""Injects ``<meta charset=ENCODING>`` tag into head of document"""
def __init__(self, source, encoding):
"""Creates a Filter
:arg source: the source token stream
:arg encoding: the encoding to set
"""
base.Filter.__init__(self, source)
self.encoding = encoding
def __iter__(self):
state = "pre_head"
meta_found = (self.encoding is None)
pending = []
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == "head":
state = "in_head"
elif type == "EmptyTag":
if token["name"].lower() == "meta":
# replace charset with actual encoding
has_http_equiv_content_type = False
for (namespace, name), value in token["data"].items():
if namespace is not None:
continue
elif name.lower() == 'charset':
token["data"][(namespace, name)] = self.encoding
meta_found = True
break
elif name == 'http-equiv' and value.lower() == 'content-type':
has_http_equiv_content_type = True
else:
if has_http_equiv_content_type and (None, "content") in token["data"]:
token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
meta_found = True
elif token["name"].lower() == "head" and not meta_found:
# insert meta into empty head
yield {"type": "StartTag", "name": "head",
"data": token["data"]}
yield {"type": "EmptyTag", "name": "meta",
"data": {(None, "charset"): self.encoding}}
yield {"type": "EndTag", "name": "head"}
meta_found = True
continue
elif type == "EndTag":
if token["name"].lower() == "head" and pending:
# insert meta into head (if necessary) and flush pending queue
yield pending.pop(0)
if not meta_found:
yield {"type": "EmptyTag", "name": "meta",
"data": {(None, "charset"): self.encoding}}
while pending:
yield pending.pop(0)
meta_found = True
state = "post_head"
if state == "in_head":
pending.append(token)
else:
yield token
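# Note: the serializer applies this filter automatically when it is given an
# output encoding and its inject_meta_charset option is left enabled.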

View file

@ -0,0 +1,93 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from . import base
from ..constants import namespaces, voidElements
from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
class Filter(base.Filter):
"""Lints the token stream for errors
If it finds any errors, it'll raise an ``AssertionError``.
"""
def __init__(self, source, require_matching_tags=True):
"""Creates a Filter
:arg source: the source token stream
:arg require_matching_tags: whether or not to require matching tags
"""
super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags
def __iter__(self):
open_elements = []
for token in base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
namespace = token["namespace"]
name = token["name"]
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
assert isinstance(token["data"], dict)
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert type == "EmptyTag"
else:
assert type == "StartTag"
if type == "StartTag" and self.require_matching_tags:
open_elements.append((namespace, name))
for (namespace, name), value in token["data"].items():
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
assert isinstance(value, text_type)
elif type == "EndTag":
namespace = token["namespace"]
name = token["name"]
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
elif self.require_matching_tags:
start = open_elements.pop()
assert start == (namespace, name)
elif type == "Comment":
data = token["data"]
assert isinstance(data, text_type)
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
assert isinstance(data, text_type)
assert data != ""
if type == "SpaceCharacters":
assert data.strip(spaceCharacters) == ""
elif type == "Doctype":
name = token["name"]
assert name is None or isinstance(name, text_type)
assert token["publicId"] is None or isinstance(name, text_type)
assert token["systemId"] is None or isinstance(name, text_type)
elif type == "Entity":
assert isinstance(token["name"], text_type)
elif type == "SerializerError":
assert isinstance(token["data"], text_type)
else:
assert False, "Unknown token type: %(type)s" % {"type": type}
yield token

View file

@ -0,0 +1,207 @@
from __future__ import absolute_import, division, unicode_literals
from . import base
class Filter(base.Filter):
"""Removes optional tags from the token stream"""
def slider(self):
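"""Yield (previous, token, next) triples; previous and next are None
at the stream boundaries."""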
previous1 = previous2 = None
for token in self.source:
if previous1 is not None:
yield previous2, previous1, token
previous2 = previous1
previous1 = token
if previous1 is not None:
yield previous2, previous1, None
def __iter__(self):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if (token["data"] or
not self.is_optional_start(token["name"], previous, next)):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
yield token
else:
yield token
def is_optional_start(self, tagname, previous, next):
type = next and next["type"] or None
if tagname == 'html':
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
# XXX: we also omit the start tag if the head element is empty
if type in ("StartTag", "EmptyTag"):
return True
elif type == "EndTag":
return next["name"] == "head"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return next["name"] not in ('script', 'style')
else:
return True
elif tagname == 'colgroup':
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return next["name"] == "col"
else:
return False
elif tagname == 'tbody':
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == "StartTag":
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous['type'] == 'EndTag' and \
previous['name'] in ('tbody', 'thead', 'tfoot'):
return False
return next["name"] == 'tr'
else:
return False
return False
def is_optional_end(self, tagname, next):
type = next and next["type"] or None
if tagname in ('html', 'head', 'body'):
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname in ('li', 'optgroup', 'tr'):
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] == tagname
else:
return type == "EndTag" or type is None
elif tagname in ('dt', 'dd'):
# A dt element's end tag may be omitted if the dt element is
# immediately followed by another dt element or a dd element.
# A dd element's end tag may be omitted if the dd element is
# immediately followed by another dd element or a dt element,
# or if there is no more content in the parent element.
if type == "StartTag":
return next["name"] in ('dt', 'dd')
elif tagname == 'dd':
return type == "EndTag" or type is None
else:
return False
elif tagname == 'p':
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, article, aside,
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
# nav, ol, p, pre, section, table, or ul, element, or if
# there is no more content in the parent element.
if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside',
'blockquote', 'datagrid', 'dialog',
'dir', 'div', 'dl', 'fieldset', 'footer',
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hr', 'menu', 'nav', 'ol',
'p', 'pre', 'section', 'table', 'ul')
else:
return type == "EndTag" or type is None
elif tagname == 'option':
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if it is immediately followed by an <code>optgroup</code>
# element, or if there is no more content in the parent
# element.
if type == "StartTag":
return next["name"] in ('option', 'optgroup')
else:
return type == "EndTag" or type is None
elif tagname in ('rt', 'rp'):
# An rt element's end tag may be omitted if the rt element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
# An rp element's end tag may be omitted if the rp element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('rt', 'rp')
else:
return type == "EndTag" or type is None
elif tagname == 'colgroup':
# A colgroup element's end tag may be omitted if the colgroup
# element is not immediately followed by a space character or
# a comment.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we also look for an immediately following colgroup
# element. See is_optional_start.
return next["name"] != 'colgroup'
else:
return True
elif tagname in ('thead', 'tbody'):
# A thead element's end tag may be omitted if the thead element
# is immediately followed by a tbody or tfoot element.
# A tbody element's end tag may be omitted if the tbody element
# is immediately followed by a tbody or tfoot element, or if
# there is no more content in the parent element.
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == "StartTag":
return next["name"] in ['tbody', 'tfoot']
elif tagname == 'tbody':
return type == "EndTag" or type is None
else:
return False
elif tagname == 'tfoot':
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == "StartTag":
return next["name"] == 'tbody'
else:
return type == "EndTag" or type is None
elif tagname in ('td', 'th'):
# A td element's end tag may be omitted if the td element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
# A th element's end tag may be omitted if the th element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('td', 'th')
else:
return type == "EndTag" or type is None
return False
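
The two predicates above are what drive the serializer's omit_optional_tags option: each token is compared with its neighbours and skipped when its start or end tag is optional under the rules spelled out in the comments. A minimal sketch of the effect (assuming the html5lib API added elsewhere in this commit; the output shown is indicative, not captured from a run):

from html5lib import parse, getTreeWalker
from html5lib.serializer import HTMLSerializer

walker = getTreeWalker("etree")
tree = parse("<html><head></head><body><p>a<p>b</body></html>")
# With omission on (the default), the optional tags disappear:
print(HTMLSerializer(omit_optional_tags=True).render(walker(tree)))
# roughly: <p>a<p>b
# With omission off, every tag is written out:
print(HTMLSerializer(omit_optional_tags=False).render(walker(tree)))
# roughly: <html><head></head><body><p>a</p><p>b</p></body></html>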

View file

@ -0,0 +1,896 @@
from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from . import base
from ..constants import namespaces, prefixes
__all__ = ["Filter"]
allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
(namespaces['html'], 'acronym'),
(namespaces['html'], 'address'),
(namespaces['html'], 'area'),
(namespaces['html'], 'article'),
(namespaces['html'], 'aside'),
(namespaces['html'], 'audio'),
(namespaces['html'], 'b'),
(namespaces['html'], 'big'),
(namespaces['html'], 'blockquote'),
(namespaces['html'], 'br'),
(namespaces['html'], 'button'),
(namespaces['html'], 'canvas'),
(namespaces['html'], 'caption'),
(namespaces['html'], 'center'),
(namespaces['html'], 'cite'),
(namespaces['html'], 'code'),
(namespaces['html'], 'col'),
(namespaces['html'], 'colgroup'),
(namespaces['html'], 'command'),
(namespaces['html'], 'datagrid'),
(namespaces['html'], 'datalist'),
(namespaces['html'], 'dd'),
(namespaces['html'], 'del'),
(namespaces['html'], 'details'),
(namespaces['html'], 'dfn'),
(namespaces['html'], 'dialog'),
(namespaces['html'], 'dir'),
(namespaces['html'], 'div'),
(namespaces['html'], 'dl'),
(namespaces['html'], 'dt'),
(namespaces['html'], 'em'),
(namespaces['html'], 'event-source'),
(namespaces['html'], 'fieldset'),
(namespaces['html'], 'figcaption'),
(namespaces['html'], 'figure'),
(namespaces['html'], 'footer'),
(namespaces['html'], 'font'),
(namespaces['html'], 'form'),
(namespaces['html'], 'header'),
(namespaces['html'], 'h1'),
(namespaces['html'], 'h2'),
(namespaces['html'], 'h3'),
(namespaces['html'], 'h4'),
(namespaces['html'], 'h5'),
(namespaces['html'], 'h6'),
(namespaces['html'], 'hr'),
(namespaces['html'], 'i'),
(namespaces['html'], 'img'),
(namespaces['html'], 'input'),
(namespaces['html'], 'ins'),
(namespaces['html'], 'keygen'),
(namespaces['html'], 'kbd'),
(namespaces['html'], 'label'),
(namespaces['html'], 'legend'),
(namespaces['html'], 'li'),
(namespaces['html'], 'm'),
(namespaces['html'], 'map'),
(namespaces['html'], 'menu'),
(namespaces['html'], 'meter'),
(namespaces['html'], 'multicol'),
(namespaces['html'], 'nav'),
(namespaces['html'], 'nextid'),
(namespaces['html'], 'ol'),
(namespaces['html'], 'output'),
(namespaces['html'], 'optgroup'),
(namespaces['html'], 'option'),
(namespaces['html'], 'p'),
(namespaces['html'], 'pre'),
(namespaces['html'], 'progress'),
(namespaces['html'], 'q'),
(namespaces['html'], 's'),
(namespaces['html'], 'samp'),
(namespaces['html'], 'section'),
(namespaces['html'], 'select'),
(namespaces['html'], 'small'),
(namespaces['html'], 'sound'),
(namespaces['html'], 'source'),
(namespaces['html'], 'spacer'),
(namespaces['html'], 'span'),
(namespaces['html'], 'strike'),
(namespaces['html'], 'strong'),
(namespaces['html'], 'sub'),
(namespaces['html'], 'sup'),
(namespaces['html'], 'table'),
(namespaces['html'], 'tbody'),
(namespaces['html'], 'td'),
(namespaces['html'], 'textarea'),
(namespaces['html'], 'time'),
(namespaces['html'], 'tfoot'),
(namespaces['html'], 'th'),
(namespaces['html'], 'thead'),
(namespaces['html'], 'tr'),
(namespaces['html'], 'tt'),
(namespaces['html'], 'u'),
(namespaces['html'], 'ul'),
(namespaces['html'], 'var'),
(namespaces['html'], 'video'),
(namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'),
(namespaces['mathml'], 'mfrac'),
(namespaces['mathml'], 'mi'),
(namespaces['mathml'], 'mmultiscripts'),
(namespaces['mathml'], 'mn'),
(namespaces['mathml'], 'mo'),
(namespaces['mathml'], 'mover'),
(namespaces['mathml'], 'mpadded'),
(namespaces['mathml'], 'mphantom'),
(namespaces['mathml'], 'mprescripts'),
(namespaces['mathml'], 'mroot'),
(namespaces['mathml'], 'mrow'),
(namespaces['mathml'], 'mspace'),
(namespaces['mathml'], 'msqrt'),
(namespaces['mathml'], 'mstyle'),
(namespaces['mathml'], 'msub'),
(namespaces['mathml'], 'msubsup'),
(namespaces['mathml'], 'msup'),
(namespaces['mathml'], 'mtable'),
(namespaces['mathml'], 'mtd'),
(namespaces['mathml'], 'mtext'),
(namespaces['mathml'], 'mtr'),
(namespaces['mathml'], 'munder'),
(namespaces['mathml'], 'munderover'),
(namespaces['mathml'], 'none'),
(namespaces['svg'], 'a'),
(namespaces['svg'], 'animate'),
(namespaces['svg'], 'animateColor'),
(namespaces['svg'], 'animateMotion'),
(namespaces['svg'], 'animateTransform'),
(namespaces['svg'], 'clipPath'),
(namespaces['svg'], 'circle'),
(namespaces['svg'], 'defs'),
(namespaces['svg'], 'desc'),
(namespaces['svg'], 'ellipse'),
(namespaces['svg'], 'font-face'),
(namespaces['svg'], 'font-face-name'),
(namespaces['svg'], 'font-face-src'),
(namespaces['svg'], 'g'),
(namespaces['svg'], 'glyph'),
(namespaces['svg'], 'hkern'),
(namespaces['svg'], 'linearGradient'),
(namespaces['svg'], 'line'),
(namespaces['svg'], 'marker'),
(namespaces['svg'], 'metadata'),
(namespaces['svg'], 'missing-glyph'),
(namespaces['svg'], 'mpath'),
(namespaces['svg'], 'path'),
(namespaces['svg'], 'polygon'),
(namespaces['svg'], 'polyline'),
(namespaces['svg'], 'radialGradient'),
(namespaces['svg'], 'rect'),
(namespaces['svg'], 'set'),
(namespaces['svg'], 'stop'),
(namespaces['svg'], 'svg'),
(namespaces['svg'], 'switch'),
(namespaces['svg'], 'text'),
(namespaces['svg'], 'title'),
(namespaces['svg'], 'tspan'),
(namespaces['svg'], 'use'),
))
allowed_attributes = frozenset((
# HTML attributes
(None, 'abbr'),
(None, 'accept'),
(None, 'accept-charset'),
(None, 'accesskey'),
(None, 'action'),
(None, 'align'),
(None, 'alt'),
(None, 'autocomplete'),
(None, 'autofocus'),
(None, 'axis'),
(None, 'background'),
(None, 'balance'),
(None, 'bgcolor'),
(None, 'bgproperties'),
(None, 'border'),
(None, 'bordercolor'),
(None, 'bordercolordark'),
(None, 'bordercolorlight'),
(None, 'bottompadding'),
(None, 'cellpadding'),
(None, 'cellspacing'),
(None, 'ch'),
(None, 'challenge'),
(None, 'char'),
(None, 'charoff'),
(None, 'choff'),
(None, 'charset'),
(None, 'checked'),
(None, 'cite'),
(None, 'class'),
(None, 'clear'),
(None, 'color'),
(None, 'cols'),
(None, 'colspan'),
(None, 'compact'),
(None, 'contenteditable'),
(None, 'controls'),
(None, 'coords'),
(None, 'data'),
(None, 'datafld'),
(None, 'datapagesize'),
(None, 'datasrc'),
(None, 'datetime'),
(None, 'default'),
(None, 'delay'),
(None, 'dir'),
(None, 'disabled'),
(None, 'draggable'),
(None, 'dynsrc'),
(None, 'enctype'),
(None, 'end'),
(None, 'face'),
(None, 'for'),
(None, 'form'),
(None, 'frame'),
(None, 'galleryimg'),
(None, 'gutter'),
(None, 'headers'),
(None, 'height'),
(None, 'hidefocus'),
(None, 'hidden'),
(None, 'high'),
(None, 'href'),
(None, 'hreflang'),
(None, 'hspace'),
(None, 'icon'),
(None, 'id'),
(None, 'inputmode'),
(None, 'ismap'),
(None, 'keytype'),
(None, 'label'),
(None, 'leftspacing'),
(None, 'lang'),
(None, 'list'),
(None, 'longdesc'),
(None, 'loop'),
(None, 'loopcount'),
(None, 'loopend'),
(None, 'loopstart'),
(None, 'low'),
(None, 'lowsrc'),
(None, 'max'),
(None, 'maxlength'),
(None, 'media'),
(None, 'method'),
(None, 'min'),
(None, 'multiple'),
(None, 'name'),
(None, 'nohref'),
(None, 'noshade'),
(None, 'nowrap'),
(None, 'open'),
(None, 'optimum'),
(None, 'pattern'),
(None, 'ping'),
(None, 'point-size'),
(None, 'poster'),
(None, 'pqg'),
(None, 'preload'),
(None, 'prompt'),
(None, 'radiogroup'),
(None, 'readonly'),
(None, 'rel'),
(None, 'repeat-max'),
(None, 'repeat-min'),
(None, 'replace'),
(None, 'required'),
(None, 'rev'),
(None, 'rightspacing'),
(None, 'rows'),
(None, 'rowspan'),
(None, 'rules'),
(None, 'scope'),
(None, 'selected'),
(None, 'shape'),
(None, 'size'),
(None, 'span'),
(None, 'src'),
(None, 'start'),
(None, 'step'),
(None, 'style'),
(None, 'summary'),
(None, 'suppress'),
(None, 'tabindex'),
(None, 'target'),
(None, 'template'),
(None, 'title'),
(None, 'toppadding'),
(None, 'type'),
(None, 'unselectable'),
(None, 'usemap'),
(None, 'urn'),
(None, 'valign'),
(None, 'value'),
(None, 'variable'),
(None, 'volume'),
(None, 'vspace'),
(None, 'vrml'),
(None, 'width'),
(None, 'wrap'),
(namespaces['xml'], 'lang'),
# MathML attributes
(None, 'actiontype'),
(None, 'align'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnlines'),
(None, 'columnspacing'),
(None, 'columnspan'),
(None, 'depth'),
(None, 'display'),
(None, 'displaystyle'),
(None, 'equalcolumns'),
(None, 'equalrows'),
(None, 'fence'),
(None, 'fontstyle'),
(None, 'fontweight'),
(None, 'frame'),
(None, 'height'),
(None, 'linethickness'),
(None, 'lspace'),
(None, 'mathbackground'),
(None, 'mathcolor'),
(None, 'mathvariant'),
(None, 'mathvariant'),
(None, 'maxsize'),
(None, 'minsize'),
(None, 'other'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowlines'),
(None, 'rowspacing'),
(None, 'rowspan'),
(None, 'rspace'),
(None, 'scriptlevel'),
(None, 'selection'),
(None, 'separator'),
(None, 'stretchy'),
(None, 'width'),
(None, 'width'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'type'),
# SVG attributes
(None, 'accent-height'),
(None, 'accumulate'),
(None, 'additive'),
(None, 'alphabetic'),
(None, 'arabic-form'),
(None, 'ascent'),
(None, 'attributeName'),
(None, 'attributeType'),
(None, 'baseProfile'),
(None, 'bbox'),
(None, 'begin'),
(None, 'by'),
(None, 'calcMode'),
(None, 'cap-height'),
(None, 'class'),
(None, 'clip-path'),
(None, 'color'),
(None, 'color-rendering'),
(None, 'content'),
(None, 'cx'),
(None, 'cy'),
(None, 'd'),
(None, 'dx'),
(None, 'dy'),
(None, 'descent'),
(None, 'display'),
(None, 'dur'),
(None, 'end'),
(None, 'fill'),
(None, 'fill-opacity'),
(None, 'fill-rule'),
(None, 'font-family'),
(None, 'font-size'),
(None, 'font-stretch'),
(None, 'font-style'),
(None, 'font-variant'),
(None, 'font-weight'),
(None, 'from'),
(None, 'fx'),
(None, 'fy'),
(None, 'g1'),
(None, 'g2'),
(None, 'glyph-name'),
(None, 'gradientUnits'),
(None, 'hanging'),
(None, 'height'),
(None, 'horiz-adv-x'),
(None, 'horiz-origin-x'),
(None, 'id'),
(None, 'ideographic'),
(None, 'k'),
(None, 'keyPoints'),
(None, 'keySplines'),
(None, 'keyTimes'),
(None, 'lang'),
(None, 'marker-end'),
(None, 'marker-mid'),
(None, 'marker-start'),
(None, 'markerHeight'),
(None, 'markerUnits'),
(None, 'markerWidth'),
(None, 'mathematical'),
(None, 'max'),
(None, 'min'),
(None, 'name'),
(None, 'offset'),
(None, 'opacity'),
(None, 'orient'),
(None, 'origin'),
(None, 'overline-position'),
(None, 'overline-thickness'),
(None, 'panose-1'),
(None, 'path'),
(None, 'pathLength'),
(None, 'points'),
(None, 'preserveAspectRatio'),
(None, 'r'),
(None, 'refX'),
(None, 'refY'),
(None, 'repeatCount'),
(None, 'repeatDur'),
(None, 'requiredExtensions'),
(None, 'requiredFeatures'),
(None, 'restart'),
(None, 'rotate'),
(None, 'rx'),
(None, 'ry'),
(None, 'slope'),
(None, 'stemh'),
(None, 'stemv'),
(None, 'stop-color'),
(None, 'stop-opacity'),
(None, 'strikethrough-position'),
(None, 'strikethrough-thickness'),
(None, 'stroke'),
(None, 'stroke-dasharray'),
(None, 'stroke-dashoffset'),
(None, 'stroke-linecap'),
(None, 'stroke-linejoin'),
(None, 'stroke-miterlimit'),
(None, 'stroke-opacity'),
(None, 'stroke-width'),
(None, 'systemLanguage'),
(None, 'target'),
(None, 'text-anchor'),
(None, 'to'),
(None, 'transform'),
(None, 'type'),
(None, 'u1'),
(None, 'u2'),
(None, 'underline-position'),
(None, 'underline-thickness'),
(None, 'unicode'),
(None, 'unicode-range'),
(None, 'units-per-em'),
(None, 'values'),
(None, 'version'),
(None, 'viewBox'),
(None, 'visibility'),
(None, 'width'),
(None, 'widths'),
(None, 'x'),
(None, 'x-height'),
(None, 'x1'),
(None, 'x2'),
(namespaces['xlink'], 'actuate'),
(namespaces['xlink'], 'arcrole'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'role'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'title'),
(namespaces['xlink'], 'type'),
(namespaces['xml'], 'base'),
(namespaces['xml'], 'lang'),
(namespaces['xml'], 'space'),
(None, 'y'),
(None, 'y1'),
(None, 'y2'),
(None, 'zoomAndPan'),
))
attr_val_is_uri = frozenset((
(None, 'href'),
(None, 'src'),
(None, 'cite'),
(None, 'action'),
(None, 'longdesc'),
(None, 'poster'),
(None, 'background'),
(None, 'datasrc'),
(None, 'dynsrc'),
(None, 'lowsrc'),
(None, 'ping'),
(namespaces['xlink'], 'href'),
(namespaces['xml'], 'base'),
))
svg_attr_val_allows_ref = frozenset((
(None, 'clip-path'),
(None, 'color-profile'),
(None, 'cursor'),
(None, 'fill'),
(None, 'filter'),
(None, 'marker'),
(None, 'marker-start'),
(None, 'marker-mid'),
(None, 'marker-end'),
(None, 'mask'),
(None, 'stroke'),
))
svg_allow_local_href = frozenset((
(None, 'altGlyph'),
(None, 'animate'),
(None, 'animateColor'),
(None, 'animateMotion'),
(None, 'animateTransform'),
(None, 'cursor'),
(None, 'feImage'),
(None, 'filter'),
(None, 'linearGradient'),
(None, 'pattern'),
(None, 'radialGradient'),
(None, 'textpath'),
(None, 'tref'),
(None, 'set'),
(None, 'use')
))
allowed_css_properties = frozenset((
'azimuth',
'background-color',
'border-bottom-color',
'border-collapse',
'border-color',
'border-left-color',
'border-right-color',
'border-top-color',
'clear',
'color',
'cursor',
'direction',
'display',
'elevation',
'float',
'font',
'font-family',
'font-size',
'font-style',
'font-variant',
'font-weight',
'height',
'letter-spacing',
'line-height',
'overflow',
'pause',
'pause-after',
'pause-before',
'pitch',
'pitch-range',
'richness',
'speak',
'speak-header',
'speak-numeral',
'speak-punctuation',
'speech-rate',
'stress',
'text-align',
'text-decoration',
'text-indent',
'unicode-bidi',
'vertical-align',
'voice-family',
'volume',
'white-space',
'width',
))
allowed_css_keywords = frozenset((
'auto',
'aqua',
'black',
'block',
'blue',
'bold',
'both',
'bottom',
'brown',
'center',
'collapse',
'dashed',
'dotted',
'fuchsia',
'gray',
'green',
'!important',
'italic',
'left',
'lime',
'maroon',
'medium',
'none',
'navy',
'normal',
'nowrap',
'olive',
'pointer',
'purple',
'red',
'right',
'solid',
'silver',
'teal',
'top',
'transparent',
'underline',
'white',
'yellow',
))
allowed_svg_properties = frozenset((
'fill',
'fill-opacity',
'fill-rule',
'stroke',
'stroke-width',
'stroke-linecap',
'stroke-linejoin',
'stroke-opacity',
))
allowed_protocols = frozenset((
'ed2k',
'ftp',
'http',
'https',
'irc',
'mailto',
'news',
'gopher',
'nntp',
'telnet',
'webcal',
'xmpp',
'callto',
'feed',
'urn',
'aim',
'rsync',
'tag',
'ssh',
'sftp',
'rtsp',
'afs',
'data',
))
allowed_content_types = frozenset((
'image/png',
'image/jpeg',
'image/gif',
'image/webp',
'image/bmp',
'text/plain',
))
data_content_type = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
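# Illustrative note (not part of the original source): for a data URI such
# as data:image/png;base64,iVBORw0K... the parsed path is
# "image/png;base64,iVBORw0K...", which this pattern matches with
# content_type == "image/png". A type outside allowed_content_types, or a
# path with no comma before the data, fails the match, and the attribute is
# removed in allowed_token below.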
class Filter(base.Filter):
"""Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
def __init__(self,
source,
allowed_elements=allowed_elements,
allowed_attributes=allowed_attributes,
allowed_css_properties=allowed_css_properties,
allowed_css_keywords=allowed_css_keywords,
allowed_svg_properties=allowed_svg_properties,
allowed_protocols=allowed_protocols,
allowed_content_types=allowed_content_types,
attr_val_is_uri=attr_val_is_uri,
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
svg_allow_local_href=svg_allow_local_href):
"""Creates a Filter
:arg allowed_elements: set of elements to allow--everything else will
be escaped
:arg allowed_attributes: set of attributes to allow in
elements--everything else will be stripped
:arg allowed_css_properties: set of CSS properties to allow--everything
else will be stripped
:arg allowed_css_keywords: set of CSS keywords to allow--everything
else will be stripped
:arg allowed_svg_properties: set of SVG properties to allow--everything
else will be removed
:arg allowed_protocols: set of allowed protocols for URIs
:arg allowed_content_types: set of allowed content types for ``data`` URIs.
:arg attr_val_is_uri: set of attributes that have URI values--values
that have a scheme not listed in ``allowed_protocols`` are removed
:arg svg_attr_val_allows_ref: set of SVG attributes that can have
references
:arg svg_allow_local_href: set of SVG elements that can have local
hrefs--these are removed
"""
super(Filter, self).__init__(source)
self.allowed_elements = allowed_elements
self.allowed_attributes = allowed_attributes
self.allowed_css_properties = allowed_css_properties
self.allowed_css_keywords = allowed_css_keywords
self.allowed_svg_properties = allowed_svg_properties
self.allowed_protocols = allowed_protocols
self.allowed_content_types = allowed_content_types
self.attr_val_is_uri = attr_val_is_uri
self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
self.svg_allow_local_href = svg_allow_local_href
def __iter__(self):
for token in base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
# Sanitize the HTML, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
# are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
# ALLOWED_CSS_KEYWORDS, are allowed through. Attributes in ATTR_VAL_IS_URI
# are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
# allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in ("StartTag", "EndTag", "EmptyTag"):
name = token["name"]
namespace = token["namespace"]
if ((namespace, name) in self.allowed_elements or
(namespace is None and
(namespaces["html"], name) in self.allowed_elements)):
return self.allowed_token(token)
else:
return self.disallowed_token(token)
elif token_type == "Comment":
pass
else:
return token
def allowed_token(self, token):
if "data" in token:
attrs = token["data"]
attr_names = set(attrs.keys())
# Remove forbidden attributes
for to_remove in (attr_names - self.allowed_attributes):
del token["data"][to_remove]
attr_names.remove(to_remove)
# Remove attributes with disallowed URL values
for attr in (attr_names & self.attr_val_is_uri):
assert attr in attrs
# I don't have a clue where this regexp comes from or why it matches those
# characters, nor why we call unescape. I just know it's always been here.
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
# this will do is remove *more* than it otherwise would.
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
uri = urlparse.urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = data_content_type.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
(namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
attrs[(namespaces['xlink'], 'href')])):
del attrs[(namespaces['xlink'], 'href')]
if (None, 'style') in attrs:
attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
token["data"] = attrs
return token
def disallowed_token(self, token):
token_type = token["type"]
if token_type == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
assert token_type in ("StartTag", "EmptyTag")
attrs = []
for (ns, name), v in token["data"].items():
attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"] = token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
return token
def sanitize_css(self, style):
# disallow urls
style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet
if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = []
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if keyword not in self.allowed_css_keywords and \
not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
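
In practice the sanitizer is reached through the serializer rather than instantiated directly. A minimal sketch (the input and the expected output are illustrative):

from html5lib import parseFragment
from html5lib.serializer import serialize

fragment = parseFragment('<a href="javascript:alert(1)" title="ok">hi</a>')
# sanitize=True splices the Filter above into the token pipeline: the
# javascript: href has a disallowed scheme and is dropped, while the
# allowed title attribute survives.
print(serialize(fragment, sanitize=True))
# roughly: <a title=ok>hi</a>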

View file

@ -0,0 +1,38 @@
from __future__ import absolute_import, division, unicode_literals
import re
from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(base.Filter):
"""Collapses whitespace except in pre, textarea, and script elements"""
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self):
preserve = 0
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):
preserve += 1
elif type == "EndTag" and preserve:
preserve -= 1
elif not preserve and type == "SpaceCharacters" and token["data"]:
# Test token["data"] above so we don't introduce spaces where there were none
token["data"] = " "
elif not preserve and type == "Characters":
token["data"] = collapse_spaces(token["data"])
yield token
def collapse_spaces(text):
return SPACES_REGEX.sub(' ', text)
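
A short sketch of the effect, driven through the serializer's strip_whitespace option (the input is illustrative):

from html5lib import parse, getTreeWalker
from html5lib.serializer import HTMLSerializer

walker = getTreeWalker("etree")
tree = parse("<div>a   b\n c</div><pre>  keep   me  </pre>")
# Runs of whitespace collapse to a single space, except inside the
# space-preserving elements (pre, textarea, and the rcdata elements).
print(HTMLSerializer(strip_whitespace=True).render(walker(tree)))
# roughly: <div>a b c</div><pre>  keep   me  </pre>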

File diff suppressed because it is too large

View file

@ -0,0 +1,409 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
import re
from codecs import register_error, xmlcharrefreplace_errors
from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")
_encode_entity_map = {}
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
# skip multi-character entities
if ((_is_ucs4 and len(v) > 1) or
(not _is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = _utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if v not in _encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
_encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = _encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error("htmlentityreplace", htmlentityreplace_errors)
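# Illustrative example (assumed behavior, not part of the original module):
#     "\u2192 \xa0".encode("ascii", "htmlentityreplace")
# should yield b"&rarr; &nbsp;": each unencodable character becomes a named
# entity when one exists in _encode_entity_map, and otherwise a hexadecimal
# character reference such as &#x2603;.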
def serialize(input, tree="etree", encoding=None, **serializer_opts):
"""Serializes the input token stream using the specified treewalker
:arg input: the token stream to serialize
:arg tree: the treewalker to use
:arg encoding: the encoding to use
:arg serializer_opts: any options to pass to the
:py:class:`html5lib.serializer.HTMLSerializer` that gets created
:returns: the tree serialized as a string
Example:
>>> from html5lib.html5parser import parse
>>> from html5lib.serializer import serialize
>>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
>>> serialize(token_stream, omit_optional_tags=False)
'<html><head></head><body><p>Hi!</p></body></html>'
"""
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
s = HTMLSerializer(**serializer_opts)
return s.render(walker(input), encoding)
class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = "legacy" # be secure by default
quote_char = '"'
use_best_quote_char = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True
# miscellaneous options
alphabetical_attributes = False
inject_meta_charset = True
strip_whitespace = False
sanitize = False
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"omit_optional_tags", "minimize_boolean_attributes",
"use_trailing_solidus", "space_before_trailing_solidus",
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
"alphabetical_attributes", "inject_meta_charset",
"strip_whitespace", "sanitize")
def __init__(self, **kwargs):
"""Initialize HTMLSerializer
:arg inject_meta_charset: Whether or not to inject the meta charset.
Defaults to ``True``.
:arg quote_attr_values: Whether to quote attribute values that don't
require quoting per legacy browser behavior (``"legacy"``), when
required by the standard (``"spec"``), or always (``"always"``).
Defaults to ``"legacy"``.
:arg quote_char: Use given quote character for attribute quoting.
Defaults to ``"`` which will use double quotes unless attribute
value contains a double quote, in which case single quotes are
used.
:arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
values.
Defaults to ``False``.
:arg escape_rcdata: Whether to escape characters that need to be
escaped within normal elements within rcdata elements such as
style.
Defaults to ``False``.
:arg resolve_entities: Whether to resolve named character entities that
appear in the source tree. The XML predefined entities &lt; &gt;
&amp; &quot; &apos; are unaffected by this setting.
Defaults to ``True``.
:arg strip_whitespace: Whether to remove semantically meaningless
whitespace. (This compresses all whitespace to a single space
except within ``pre``.)
Defaults to ``False``.
:arg minimize_boolean_attributes: Shortens boolean attributes to give
just the attribute value, for example::
<input disabled="disabled">
becomes::
<input disabled>
Defaults to ``True``.
:arg use_trailing_solidus: Includes a close-tag slash at the end of the
start tag of void elements (empty elements whose end tag is
forbidden). E.g. ``<hr/>``.
Defaults to ``False``.
:arg space_before_trailing_solidus: Places a space immediately before
the closing slash in a tag using a trailing solidus. E.g.
``<hr />``. Requires ``use_trailing_solidus=True``.
Defaults to ``True``.
:arg sanitize: Strip all unsafe or unknown constructs from output.
See :py:class:`html5lib.filters.sanitizer.Filter`.
Defaults to ``False``.
:arg omit_optional_tags: Omit start/end tags that are optional.
Defaults to ``True``.
:arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
Defaults to ``False``.
"""
unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0:
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = []
self.strict = False
def encode(self, string):
assert(isinstance(string, text_type))
if self.encoding:
return string.encode(self.encoding, "htmlentityreplace")
else:
return string
def encodeStrict(self, string):
assert(isinstance(string, text_type))
if self.encoding:
return string.encode(self.encoding, "strict")
else:
return string
def serialize(self, treewalker, encoding=None):
# pylint:disable=too-many-nested-blocks
self.encoding = encoding
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
from .filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# The alphabetical-attributes filter runs here under the assumption that
# none of the later filters add attributes or change their order; it needs
# to run before the sanitizer so escaped elements come out correctly
if self.alphabetical_attributes:
from .filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
# WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiency of the latter filter
if self.strip_whitespace:
from .filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from .filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
from .filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = "<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += ' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += " SYSTEM"
if token["systemId"]:
if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0:
self.serializeError("System identifer contains both single and double quote characters")
quote_char = "'"
else:
quote_char = '"'
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += ">"
yield self.encodeStrict(doctype)
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0:
self.serializeError("Unexpected </ in CDATA")
yield self.encode(token["data"])
else:
yield self.encode(escape(token["data"]))
elif type in ("StartTag", "EmptyTag"):
name = token["name"]
yield self.encodeStrict("<%s" % name)
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
v = attr_value
yield self.encodeStrict(' ')
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) and
k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = _quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = _quoteAttributeLegacy.search(v) is not None
else:
raise ValueError("quote_attr_values must be one of: "
"'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
if quote_attr:
quote_char = self.quote_char
if self.use_best_quote_char:
if "'" in v and '"' not in v:
quote_char = '"'
elif '"' in v and "'" not in v:
quote_char = "'"
if quote_char == "'":
v = v.replace("'", "&#39;")
else:
v = v.replace('"', "&quot;")
yield self.encodeStrict(quote_char)
yield self.encode(v)
yield self.encodeStrict(quote_char)
else:
yield self.encode(v)
if name in voidElements and self.use_trailing_solidus:
if self.space_before_trailing_solidus:
yield self.encodeStrict(" /")
else:
yield self.encodeStrict("/")
yield self.encode(">")
elif type == "EndTag":
name = token["name"]
if name in rcdataElements:
in_cdata = False
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
yield self.encodeStrict("</%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
self.serializeError("Comment contains --")
yield self.encodeStrict("<!--%s-->" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if key not in entities:
self.serializeError("Entity %s not recognized" % name)
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = "&%s;" % name
yield self.encodeStrict(data)
else:
self.serializeError(token["data"])
def render(self, treewalker, encoding=None):
"""Serializes the stream from the treewalker into a string
:arg treewalker: the treewalker to serialize
:arg encoding: the string encoding to use
:returns: the serialized tree
Example:
>>> from html5lib import parse, getTreeWalker
>>> from html5lib.serializer import HTMLSerializer
>>> token_stream = parse('<html><body>Hi!</body></html>')
>>> walker = getTreeWalker('etree')
>>> serializer = HTMLSerializer(omit_optional_tags=False)
>>> serializer.render(walker(token_stream))
'<html><head></head><body>Hi!</body></html>'
"""
if encoding:
return b"".join(list(self.serialize(treewalker, encoding)))
else:
return "".join(list(self.serialize(treewalker)))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
self.errors.append(data)
if self.strict:
raise SerializeError
class SerializeError(Exception):
"""Error in serialized tree"""
pass
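
The three quote_attr_values modes described in the docstring differ only in when quoting is forced; a sketch of the contrast (expected output, not captured from a run):

from html5lib import parseFragment, getTreeWalker
from html5lib.serializer import HTMLSerializer

walker = getTreeWalker("etree")
frag = parseFragment('<span title="a=b">x</span><span title="plain">y</span>')
for mode in ("legacy", "spec", "always"):
    s = HTMLSerializer(quote_attr_values=mode, omit_optional_tags=False)
    print(mode, s.render(walker(frag)))
# roughly:
#   legacy <span title="a=b">x</span><span title=plain>y</span>
#   spec   <span title="a=b">x</span><span title=plain>y</span>
#   always <span title="a=b">x</span><span title="plain">y</span>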

View file

@ -0,0 +1 @@
from __future__ import absolute_import, division, unicode_literals

View file

@ -0,0 +1,108 @@
from __future__ import print_function
import os.path
import sys
import pkg_resources
import pytest
from .tree_construction import TreeConstructionFile
from .tokenizer import TokenizerFile
from .sanitizer import SanitizerFile
_dir = os.path.abspath(os.path.dirname(__file__))
_root = os.path.join(_dir, "..", "..")
_testdata = os.path.join(_dir, "testdata")
_tree_construction = os.path.join(_testdata, "tree-construction")
_tokenizer = os.path.join(_testdata, "tokenizer")
_sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata")
def fail_if_missing_pytest_expect():
"""Throws an exception halting pytest if pytest-expect isn't working"""
try:
from pytest_expect import expect # noqa
except ImportError:
header = '*' * 78
print(
'\n' +
header + '\n' +
'ERROR: Either pytest-expect or its dependency u-msgpack-python is not\n' +
'installed. Please install them both before running pytest.\n' +
header + '\n',
file=sys.stderr
)
raise
fail_if_missing_pytest_expect()
def pytest_configure(config):
msgs = []
if not os.path.exists(_testdata):
msg = "testdata not available! "
if os.path.exists(os.path.join(_root, ".git")):
msg += ("Please run git submodule update --init --recursive " +
"and then run tests again.")
else:
msg += ("The testdata doesn't appear to be included with this package, " +
"so finding the right version will be hard. :(")
msgs.append(msg)
if config.option.update_xfail:
# Check for optional requirements
req_file = os.path.join(_root, "requirements-optional.txt")
if os.path.exists(req_file):
with open(req_file, "r") as fp:
for line in fp:
if (line.strip() and
not (line.startswith("-r") or
line.startswith("#"))):
if ";" in line:
spec, marker = line.strip().split(";", 1)
else:
spec, marker = line.strip(), None
req = pkg_resources.Requirement.parse(spec)
if marker and not pkg_resources.evaluate_marker(marker):
msgs.append("%s not available in this environment" % spec)
else:
try:
installed = pkg_resources.working_set.find(req)
except pkg_resources.VersionConflict:
msgs.append("Outdated version of %s installed, need %s" % (req.name, spec))
else:
if not installed:
msgs.append("Need %s" % spec)
# Check cElementTree
import xml.etree.ElementTree as ElementTree
try:
import xml.etree.cElementTree as cElementTree
except ImportError:
msgs.append("cElementTree unable to be imported")
else:
if cElementTree.Element is ElementTree.Element:
msgs.append("cElementTree is just an alias for ElementTree")
if msgs:
pytest.exit("\n".join(msgs))
def pytest_collect_file(path, parent):
dir = os.path.abspath(path.dirname)
dir_and_parents = set()
while dir not in dir_and_parents:
dir_and_parents.add(dir)
dir = os.path.dirname(dir)
if _tree_construction in dir_and_parents:
if path.ext == ".dat":
return TreeConstructionFile(path, parent)
elif _tokenizer in dir_and_parents:
if path.ext == ".test":
return TokenizerFile(path, parent)
elif _sanitizer_testdata in dir_and_parents:
if path.ext == ".dat":
return SanitizerFile(path, parent)

View file

@ -0,0 +1,433 @@
[
{
"name": "IE_Comments",
"input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
"output": ""
},
{
"name": "IE_Comments_2",
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
"output": "&lt;script&gt;alert('XSS');&lt;/script&gt;"
},
{
"name": "allow_colons_in_path_component",
"input": "<a href=\"./this:that\">foo</a>",
"output": "<a href='./this:that'>foo</a>"
},
{
"name": "background_attribute",
"input": "<div background=\"javascript:alert('XSS')\"></div>",
"output": "<div></div>"
},
{
"name": "bgsound",
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
"output": "&lt;bgsound src=\"javascript:alert('XSS');\"&gt;&lt;/bgsound&gt;"
},
{
"name": "div_background_image_unicode_encoded",
"input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
"output": "<div style=''>foo</div>"
},
{
"name": "div_expression",
"input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
"output": "<div style=''>foo</div>"
},
{
"name": "double_open_angle_brackets",
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
"output": ""
},
{
"name": "double_open_angle_brackets_2",
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
"output": ""
},
{
"name": "grave_accents",
"input": "<img src=`javascript:alert('XSS')` />",
"output": "<img/>"
},
{
"name": "img_dynsrc_lowsrc",
"input": "<img dynsrc=\"javascript:alert('XSS')\" />",
"output": "<img/>"
},
{
"name": "img_vbscript",
"input": "<img src='vbscript:msgbox(\"XSS\")' />",
"output": "<img/>"
},
{
"name": "input_image",
"input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
"output": "<input type='image'/>"
},
{
"name": "link_stylesheets",
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
"output": "&lt;link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"&gt;"
},
{
"name": "link_stylesheets_2",
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
"output": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"&gt;"
},
{
"name": "list_style_image",
"input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
"output": "<li style=''>foo</li>"
},
{
"name": "no_closing_script_tags",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
},
{
"name": "non_alpha_non_digit",
"input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
},
{
"name": "non_alpha_non_digit_2",
"input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
"output": "<a>foo</a>"
},
{
"name": "non_alpha_non_digit_3",
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
"output": "<img src='http://ha.ckers.org/xss.js'/>"
},
{
"name": "non_alpha_non_digit_II",
"input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
"output": "<a>foo</a>"
},
{
"name": "non_alpha_non_digit_III",
"input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
"output": "<a>foo</a>"
},
{
"name": "platypus",
"input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
"output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
},
{
"name": "protocol_resolution_in_script_tag",
"input": "<script src=//ha.ckers.org/.j></script>",
"output": "&lt;script src=\"//ha.ckers.org/.j\"&gt;&lt;/script&gt;"
},
{
"name": "should_allow_anchors",
"input": "<a href='foo' onclick='bar'><script>baz</script></a>",
"output": "<a href='foo'>&lt;script&gt;baz&lt;/script&gt;</a>"
},
{
"name": "should_allow_image_alt_attribute",
"input": "<img alt='foo' onclick='bar' />",
"output": "<img alt='foo'/>"
},
{
"name": "should_allow_image_height_attribute",
"input": "<img height='foo' onclick='bar' />",
"output": "<img height='foo'/>"
},
{
"name": "should_allow_image_src_attribute",
"input": "<img src='foo' onclick='bar' />",
"output": "<img src='foo'/>"
},
{
"name": "should_allow_image_width_attribute",
"input": "<img width='foo' onclick='bar' />",
"output": "<img width='foo'/>"
},
{
"name": "should_handle_blank_text",
"input": "",
"output": ""
},
{
"name": "should_handle_malformed_image_tags",
"input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
"output": "<img/>&lt;script&gt;alert(\"XSS\")&lt;/script&gt;\"&gt;"
},
{
"name": "should_handle_non_html",
"input": "abc",
"output": "abc"
},
{
"name": "should_not_fall_for_ridiculous_hack",
"input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_0",
"input": "<img src=\"javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_1",
"input": "<img src=javascript:alert('XSS') />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_10",
"input": "<img src=\"jav&#x0A;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_11",
"input": "<img src=\"jav&#x0D;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_12",
"input": "<img src=\" &#14; javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_13",
"input": "<img src=\"&#x20;javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_14",
"input": "<img src=\"&#xA0;javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_2",
"input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_3",
"input": "<img src='javascript:alert(&quot;XSS&quot;)' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_4",
"input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_5",
"input": "<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_6",
"input": "<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_7",
"input": "<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_8",
"input": "<img src=\"jav\tascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_9",
"input": "<img src=\"jav&#x09;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_sanitize_half_open_scripts",
"input": "<img src=\"javascript:alert('XSS')\"",
"output": ""
},
{
"name": "should_sanitize_invalid_script_tag",
"input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
},
{
"name": "should_sanitize_script_tag_with_multiple_open_brackets",
"input": "<<script>alert(\"XSS\");//<</script>",
"output": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;"
},
{
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
"output": ""
},
{
"name": "should_sanitize_tag_broken_up_by_null",
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
"output": "&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;"
},
{
"name": "should_sanitize_unclosed_script",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
},
{
"name": "should_strip_href_attribute_in_a_with_bad_protocols",
"input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
"output": "<a title='1'>boo</a>"
},
{
"name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
"input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
"output": "<a title='1'>boo</a>"
},
{
"name": "should_strip_src_attribute_in_img_with_bad_protocols",
"input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
"output": "<img title='1'/>boo"
},
{
"name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
"input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
"output": "<img title='1'/>boo"
},
{
"name": "xml_base",
"input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
"output": "<div>foo</div>"
},
{
"name": "xul",
"input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
"output": "<p style=''>fubar</p>"
},
{
"name": "quotes_in_attributes",
"input": "<img src='foo' title='\"foo\" bar' />",
"output": "<img src='foo' title='\"foo\" bar'/>"
},
{
"name": "uri_refs_in_svg_attributes",
"input": "<svg><rect fill='url(#foo)' />",
"output": "<svg><rect fill='url(#foo)'></rect></svg>"
},
{
"name": "absolute_uri_refs_in_svg_attributes",
"input": "<svg><rect fill='url(http://bad.com/) #fff' />",
"output": "<svg><rect fill=' #fff'></rect></svg>"
},
{
"name": "uri_ref_with_space_in svg_attribute",
"input": "<svg><rect fill='url(\n#foo)' />",
"output": "<svg><rect fill='url(\n#foo)'></rect></svg>"
},
{
"name": "absolute_uri_ref_with_space_in svg_attribute",
"input": "<svg><rect fill=\"url(\nhttp://bad.com/)\" />",
"output": "<svg><rect fill=' '></rect></svg>"
},
{
"name": "allow_html5_image_tag",
"input": "<image src='foo' />",
"output": "<img src='foo'/>"
},
{
"name": "style_attr_end_with_nothing",
"input": "<div style=\"color: blue\" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "style_attr_end_with_space",
"input": "<div style=\"color: blue \" />",
"output": "<div style='color: blue ;'></div>"
},
{
"name": "style_attr_end_with_semicolon",
"input": "<div style=\"color: blue;\" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "style_attr_end_with_semicolon_space",
"input": "<div style=\"color: blue; \" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "attributes_with_embedded_quotes",
"input": "<img src=doesntexist.jpg\"'onerror=\"alert(1) />",
"output": "<img src='doesntexist.jpg\"&#39;onerror=\"alert(1)'/>"
},
{
"name": "attributes_with_embedded_quotes_II",
"input": "<img src=notthere.jpg\"\"onerror=\"alert(2) />",
"output": "<img src='notthere.jpg\"\"onerror=\"alert(2)'/>"
}
]

View file

@ -0,0 +1,50 @@
from __future__ import absolute_import, division, unicode_literals
import codecs
import json
import pytest
from html5lib import parseFragment, serialize
class SanitizerFile(pytest.File):
def collect(self):
with codecs.open(str(self.fspath), "r", encoding="utf-8") as fp:
tests = json.load(fp)
for i, test in enumerate(tests):
yield SanitizerTest(str(i), self, test=test)
class SanitizerTest(pytest.Item):
def __init__(self, name, parent, test):
super(SanitizerTest, self).__init__(name, parent)
self.obj = lambda: 1 # this is to hack around skipif needing a function!
self.test = test
def runtest(self):
input = self.test["input"]
expected = self.test["output"]
parsed = parseFragment(input)
serialized = serialize(parsed,
sanitize=True,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char="'",
alphabetical_attributes=True)
errorMsg = "\n".join(["\n\nInput:", input,
"\nExpected:", expected,
"\nReceived:", serialized])
assert expected == serialized, errorMsg
def repr_failure(self, excinfo):
traceback = excinfo.traceback
ntraceback = traceback.cut(path=__file__)
excinfo.traceback = ntraceback.filter()
return excinfo.getrepr(funcargs=True,
showlocals=False,
style="short", tbfilter=False)

View file

@ -0,0 +1,395 @@
{
"tests": [
{
"expected": [
"<span title='test \"with\" &amp;quot;'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "test \"with\" &quot;"
}
]
]
],
"description": "proper attribute value escaping"
},
{
"expected": [
"<span title=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo"
}
]
]
],
"description": "proper attribute value non-quoting"
},
{
"expected": [
"<span title=\"foo<bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo<bar"
}
]
]
],
"description": "proper attribute value non-quoting (with <)"
},
{
"expected": [
"<span title=\"foo=bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo=bar"
}
]
]
],
"description": "proper attribute value quoting (with =)"
},
{
"expected": [
"<span title=\"foo>bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo>bar"
}
]
]
],
"description": "proper attribute value quoting (with >)"
},
{
"expected": [
"<span title='foo\"bar'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\"bar"
}
]
]
],
"description": "proper attribute value quoting (with \")"
},
{
"expected": [
"<span title=\"foo'bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo'bar"
}
]
]
],
"description": "proper attribute value quoting (with ')"
},
{
"expected": [
"<span title=\"foo'bar&quot;baz\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo'bar\"baz"
}
]
]
],
"description": "proper attribute value quoting (with both \" and ')"
},
{
"expected": [
"<span title=\"foo bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo bar"
}
]
]
],
"description": "proper attribute value quoting (with space)"
},
{
"expected": [
"<span title=\"foo\tbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\tbar"
}
]
]
],
"description": "proper attribute value quoting (with tab)"
},
{
"expected": [
"<span title=\"foo\nbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\nbar"
}
]
]
],
"description": "proper attribute value quoting (with LF)"
},
{
"expected": [
"<span title=\"foo\rbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\rbar"
}
]
]
],
"description": "proper attribute value quoting (with CR)"
},
{
"expected": [
"<span title=\"foo\u000bbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\u000bbar"
}
]
]
],
"description": "proper attribute value non-quoting (with linetab)"
},
{
"expected": [
"<span title=\"foo\fbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\fbar"
}
]
]
],
"description": "proper attribute value quoting (with form feed)"
},
{
"expected": [
"<img>"
],
"input": [
[
"EmptyTag",
"img",
{}
]
],
"description": "void element (as EmptyTag token)"
},
{
"expected": [
"<!DOCTYPE foo>"
],
"input": [
[
"Doctype",
"foo"
]
],
"description": "doctype in error"
},
{
"expected": [
"a&lt;b&gt;c&amp;d"
],
"input": [
[
"Characters",
"a<b>c&d"
]
],
"description": "character data",
"options": {
"encoding": "utf-8"
}
},
{
"expected": [
"<script>a<b>c&d"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"a<b>c&d"
]
],
"description": "rcdata"
},
{
"expected": [
"<!DOCTYPE HTML>"
],
"input": [
[
"Doctype",
"HTML"
]
],
"description": "doctype"
},
{
"expected": [
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"
],
"input": [
[
"Doctype",
"HTML",
"-//W3C//DTD HTML 4.01//EN",
"http://www.w3.org/TR/html4/strict.dtd"
]
],
"description": "HTML 4.01 DOCTYPE"
},
{
"expected": [
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"
],
"input": [
[
"Doctype",
"HTML",
"-//W3C//DTD HTML 4.01//EN"
]
],
"description": "HTML 4.01 DOCTYPE without system identifer"
},
{
"expected": [
"<!DOCTYPE html SYSTEM \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">"
],
"input": [
[
"Doctype",
"html",
"",
"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
]
],
"description": "IBM DOCTYPE without public identifer"
}
]
}

@ -0,0 +1,350 @@
{
"tests": [
{
"expected": [
""
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "no encoding",
"options": {
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "empytag head",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><title>foo</title>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"title",
{}
],
[
"Characters",
"foo"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"title"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/title",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/meta-charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><meta charset=utf-8>",
"<head><meta charset=utf-8><meta charset=ascii>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/ two meta-charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><meta content=noindex name=robots>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=noindex name=robots><meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots & charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "http-equiv",
"value": "content-type"
},
{
"namespace": null,
"name": "content",
"value": "text/html; charset=ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/ charset in http-equiv content-type",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "http-equiv",
"value": "content-type"
},
{
"namespace": null,
"name": "content",
"value": "text/html; charset=ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots & charset in http-equiv content-type",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
}
]
}

@ -0,0 +1,334 @@
{
"tests": [
{
"expected": [
"<span title='test &#39;with&#39; quote_char'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "test 'with' quote_char"
}
]
]
],
"description": "quote_char=\"'\"",
"options": {
"quote_char": "'"
}
},
{
"expected": [
"<button disabled>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"button",
[
{
"namespace": null,
"name": "disabled",
"value": "disabled"
}
]
]
],
"description": "quote_attr_values='always'",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div itemscope>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": "itemscope"
}
]
]
],
"description": "quote_attr_values='always' with itemscope",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div irrelevant>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": "irrelevant"
}
]
]
],
"description": "quote_attr_values='always' with irrelevant",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=\"foo\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='always'",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='legacy'",
"options": {
"quote_attr_values": "legacy"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='spec'",
"options": {
"quote_attr_values": "spec"
}
},
{
"expected": [
"<img />"
],
"input": [
[
"EmptyTag",
"img",
{}
]
],
"description": "use_trailing_solidus=true with void element",
"options": {
"use_trailing_solidus": true
}
},
{
"expected": [
"<div>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
{}
]
],
"description": "use_trailing_solidus=true with non-void element",
"options": {
"use_trailing_solidus": true
}
},
{
"expected": [
"<div itemscope=itemscope>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": "itemscope"
}
]
]
],
"description": "minimize_boolean_attributes=false",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div irrelevant=irrelevant>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": "irrelevant"
}
]
]
],
"description": "minimize_boolean_attributes=false",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div itemscope=\"\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": ""
}
]
]
],
"description": "minimize_boolean_attributes=false with empty value",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div irrelevant=\"\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": ""
}
]
]
],
"description": "minimize_boolean_attributes=false with empty value",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<a title=\"a&lt;b>c&amp;d\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"a",
[
{
"namespace": null,
"name": "title",
"value": "a<b>c&d"
}
]
]
],
"description": "escape less than signs in attribute values",
"options": {
"escape_lt_in_attrs": true
}
},
{
"expected": [
"<script>a&lt;b&gt;c&amp;d"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"a<b>c&d"
]
],
"description": "rcdata",
"options": {
"escape_rcdata": true
}
}
]
}

@ -0,0 +1,198 @@
{
"tests": [
{
"expected": [
" foo"
],
"input": [
[
"Characters",
"\t\r\n\f foo"
]
],
"description": "bare text with leading spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"foo "
],
"input": [
[
"Characters",
"foo \t\r\n\f"
]
],
"description": "bare text with trailing spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"foo bar"
],
"input": [
[
"Characters",
"foo \t\r\n\f bar"
]
],
"description": "bare text with inner spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<pre>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</pre>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"pre",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"pre"
]
],
"description": "text within <pre>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<pre>\t\r\n\f fo<span>o \t\r\n\f b</span>ar \t\r\n\f</pre>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"pre",
{}
],
[
"Characters",
"\t\r\n\f fo"
],
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
{}
],
[
"Characters",
"o \t\r\n\f b"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"span"
],
[
"Characters",
"ar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"pre"
]
],
"description": "text within <pre>, with inner markup",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<textarea>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</textarea>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"textarea",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"textarea"
]
],
"description": "text within <textarea>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<script>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</script>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"script"
]
],
"description": "text within <script>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<style>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</style>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"style",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"style"
]
],
"description": "text within <style>",
"options": {
"strip_whitespace": true
}
}
]
}

@ -0,0 +1,198 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=wrong-import-position
import os
import sys
import codecs
import glob
import xml.sax.handler
base_path = os.path.split(__file__)[0]
test_dir = os.path.join(base_path, 'testdata')
sys.path.insert(0, os.path.abspath(os.path.join(base_path,
os.path.pardir,
os.path.pardir)))
from html5lib import treebuilders, treewalkers, treeadapters # noqa
del base_path
# Build a dict of available trees
treeTypes = {}
# DOM impls
treeTypes["DOM"] = {
"builder": treebuilders.getTreeBuilder("dom"),
"walker": treewalkers.getTreeWalker("dom")
}
# ElementTree impls
import xml.etree.ElementTree as ElementTree # noqa
treeTypes['ElementTree'] = {
"builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True),
"walker": treewalkers.getTreeWalker("etree", ElementTree)
}
try:
import xml.etree.cElementTree as cElementTree # noqa
except ImportError:
treeTypes['cElementTree'] = None
else:
# On Python 3.3 and above cElementTree is an alias, don't run them twice.
if cElementTree.Element is ElementTree.Element:
treeTypes['cElementTree'] = None
else:
treeTypes['cElementTree'] = {
"builder": treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True),
"walker": treewalkers.getTreeWalker("etree", cElementTree)
}
try:
import lxml.etree as lxml # noqa
except ImportError:
treeTypes['lxml'] = None
else:
treeTypes['lxml'] = {
"builder": treebuilders.getTreeBuilder("lxml"),
"walker": treewalkers.getTreeWalker("lxml")
}
# Genshi impls
try:
import genshi # noqa
except ImportError:
treeTypes["genshi"] = None
else:
treeTypes["genshi"] = {
"builder": treebuilders.getTreeBuilder("dom"),
"adapter": lambda tree: treeadapters.genshi.to_genshi(treewalkers.getTreeWalker("dom")(tree)),
"walker": treewalkers.getTreeWalker("genshi")
}
# pylint:enable=wrong-import-position
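# Each treeTypes entry pairs a tree builder with a compatible tree walker.
# Tests consume a pair roughly like this (illustrative sketch only):
#     parser = html5lib.HTMLParser(tree=treeTypes["DOM"]["builder"])
#     document = parser.parse("<p>hi</p>")
#     tokens = list(treeTypes["DOM"]["walker"](document))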
def get_data_files(subdirectory, files='*.dat', search_dir=test_dir):
return sorted(glob.glob(os.path.join(search_dir, subdirectory, files)))
class DefaultDict(dict):
def __init__(self, default, *args, **kwargs):
self.default = default
dict.__init__(self, *args, **kwargs)
def __getitem__(self, key):
return dict.get(self, key, self.default)
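# DefaultDict returns its default for missing keys instead of raising
# KeyError, e.g. (illustrative): DefaultDict(None, data="x")["errors"] is None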
class TestData(object):
def __init__(self, filename, newTestHeading="data", encoding="utf8"):
if encoding is None:
self.f = open(filename, mode="rb")
else:
self.f = codecs.open(filename, encoding=encoding)
self.encoding = encoding
self.newTestHeading = newTestHeading
def __iter__(self):
data = DefaultDict(None)
key = None
for line in self.f:
heading = self.isSectionHeading(line)
if heading:
if data and heading == self.newTestHeading:
# Remove trailing newline
data[key] = data[key][:-1]
yield self.normaliseOutput(data)
data = DefaultDict(None)
key = heading
data[key] = "" if self.encoding else b""
elif key is not None:
data[key] += line
if data:
yield self.normaliseOutput(data)
def isSectionHeading(self, line):
"""If the current heading is a test section heading return the heading,
otherwise return False"""
if line.startswith("#" if self.encoding else b"#"):
return line[1:].strip()
else:
return False
def normaliseOutput(self, data):
# Remove trailing newlines
for key, value in data.items():
if value.endswith("\n" if self.encoding else b"\n"):
data[key] = value[:-1]
return data
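# TestData iterates the html5lib-tests ".dat" format: "#"-prefixed headings
# open sections, and each new "#data" heading starts the next test. An
# illustrative (made-up) fragment:
#     #data
#     <p>One
#     #errors
#     #document
#     | <html>
# yields one dict per test keyed by heading, e.g. {"data": "<p>One", ...}.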
def convert(stripChars):
def convertData(data):
"""convert the output of str(document) to the format used in the testcases"""
data = data.split("\n")
rv = []
for line in data:
if line.startswith("|"):
rv.append(line[stripChars:])
else:
rv.append(line)
return "\n".join(rv)
return convertData
convertExpected = convert(2)
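# convert(2) strips the two-character "| " pprint prefix from tree dumps,
# e.g. (illustrative): convertExpected('| <html>\n|   <head>') returns
# '<html>\n  <head>'.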
def errorMessage(input, expected, actual):
msg = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n" %
(repr(input), repr(expected), repr(actual)))
if sys.version_info[0] == 2:
msg = msg.encode("ascii", "backslashreplace")
return msg
class TracingSaxHandler(xml.sax.handler.ContentHandler):
def __init__(self):
xml.sax.handler.ContentHandler.__init__(self)
self.visited = []
def startDocument(self):
self.visited.append('startDocument')
def endDocument(self):
self.visited.append('endDocument')
def startPrefixMapping(self, prefix, uri):
# These are ignored as their order is not guaranteed
pass
def endPrefixMapping(self, prefix):
# These are ignored as their order is not guaranteed
pass
def startElement(self, name, attrs):
self.visited.append(('startElement', name, attrs))
def endElement(self, name):
self.visited.append(('endElement', name))
def startElementNS(self, name, qname, attrs):
self.visited.append(('startElementNS', name, qname, dict(attrs)))
def endElementNS(self, name, qname):
self.visited.append(('endElementNS', name, qname))
def characters(self, content):
self.visited.append(('characters', content))
def ignorableWhitespace(self, whitespace):
self.visited.append(('ignorableWhitespace', whitespace))
def processingInstruction(self, target, data):
self.visited.append(('processingInstruction', target, data))
def skippedEntity(self, name):
self.visited.append(('skippedEntity', name))

@ -0,0 +1,78 @@
from __future__ import absolute_import, division, unicode_literals
from collections import OrderedDict
import pytest
import html5lib
from html5lib.filters.alphabeticalattributes import Filter
from html5lib.serializer import HTMLSerializer
@pytest.mark.parametrize('msg, attrs, expected_attrs', [
(
'no attrs',
{},
{}
),
(
'one attr',
{(None, 'alt'): 'image'},
OrderedDict([((None, 'alt'), 'image')])
),
(
'multiple attrs',
{
(None, 'src'): 'foo',
(None, 'alt'): 'image',
(None, 'style'): 'border: 1px solid black;'
},
OrderedDict([
((None, 'alt'), 'image'),
((None, 'src'), 'foo'),
((None, 'style'), 'border: 1px solid black;')
])
),
])
def test_alphabetizing(msg, attrs, expected_attrs):
tokens = [{'type': 'StartTag', 'name': 'img', 'data': attrs}]
output_tokens = list(Filter(tokens))
attrs = output_tokens[0]['data']
assert attrs == expected_attrs
def test_with_different_namespaces():
tokens = [{
'type': 'StartTag',
'name': 'pattern',
'data': {
(None, 'id'): 'patt1',
('http://www.w3.org/1999/xlink', 'href'): '#patt2'
}
}]
output_tokens = list(Filter(tokens))
attrs = output_tokens[0]['data']
assert attrs == OrderedDict([
((None, 'id'), 'patt1'),
(('http://www.w3.org/1999/xlink', 'href'), '#patt2')
])
def test_with_serializer():
"""Verify filter works in the context of everything else"""
parser = html5lib.HTMLParser()
dom = parser.parseFragment('<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
walker = html5lib.getTreeWalker('etree')
ser = HTMLSerializer(
alphabetical_attributes=True,
quote_attr_values='always'
)
# FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
# that gets fixed, we can fix this expected result.
assert (
ser.render(walker(dom)) ==
'<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
)

@ -0,0 +1,116 @@
from __future__ import absolute_import, division, unicode_literals
import os
import pytest
from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
from html5lib import HTMLParser, _inputstream
def test_basic_prescan_length():
data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
pad = 1024 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 1024 # Sanity
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'utf-8' == stream.charEncoding[0].name
def test_parser_reparse():
data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
pad = 10240 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 10240 # Sanity
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'windows-1252' == stream.charEncoding[0].name
p = HTMLParser(namespaceHTMLElements=False)
doc = p.parse(data, useChardet=False)
assert 'utf-8' == p.documentEncoding
assert doc.find(".//title").text == "Caf\u00E9"
@pytest.mark.parametrize("expected,data,kwargs", [
("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
("iso-8859-2", b"<meta charset=iso-8859-3>", {"transport_encoding": "iso-8859-2"}),
("iso-8859-2", b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
("windows-1252", b"", {}),
])
def test_parser_args(expected, data, kwargs):
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
assert expected == stream.charEncoding[0].name
p = HTMLParser()
p.parse(data, useChardet=False, **kwargs)
assert expected == p.documentEncoding
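# The cases above exercise the documented precedence: a BOM wins over
# everything, then override_encoding, transport_encoding, a <meta>
# declaration, same_origin_parent_encoding (ignored when it is a UTF-16
# variant), likely_encoding and default_encoding, with windows-1252 as the
# last-resort fallback.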
@pytest.mark.parametrize("kwargs", [
{"override_encoding": "iso-8859-2"},
{"override_encoding": None},
{"transport_encoding": "iso-8859-2"},
{"transport_encoding": None},
{"same_origin_parent_encoding": "iso-8859-2"},
{"same_origin_parent_encoding": None},
{"likely_encoding": "iso-8859-2"},
{"likely_encoding": None},
{"default_encoding": "iso-8859-2"},
{"default_encoding": None},
{"foo_encoding": "iso-8859-2"},
{"foo_encoding": None},
])
def test_parser_args_raises(kwargs):
with pytest.raises(TypeError) as exc_info:
p = HTMLParser()
p.parse("", useChardet=False, **kwargs)
assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
def runParserEncodingTest(data, encoding):
p = HTMLParser()
assert p.documentEncoding is None
p.parse(data, useChardet=False)
encoding = encoding.lower().decode("ascii")
assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def runPreScanEncodingTest(data, encoding):
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
encoding = encoding.lower().decode("ascii")
# Very crude way to ignore irrelevant tests
if len(data) > stream.numBytesMeta:
return
assert encoding == stream.charEncoding[0].name, errorMessage(data, encoding, stream.charEncoding[0].name)
def test_encoding():
for filename in get_data_files("encoding"):
tests = _TestData(filename, b"data", encoding=None)
for test in tests:
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
# pylint:disable=wrong-import-position
try:
import chardet # noqa
except ImportError:
print("chardet not found, skipping chardet tests")
else:
def test_chardet():
with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp:
encoding = _inputstream.HTMLInputStream(fp.read()).charEncoding
assert encoding[0].name == "big5"
# pylint:enable=wrong-import-position

@ -0,0 +1,41 @@
from __future__ import absolute_import, division, unicode_literals
import six
from mock import Mock
from . import support
def _createReprMock(r):
"""Creates a mock with a __repr__ returning r
Also provides __str__ mock with default mock behaviour"""
mock = Mock()
mock.__repr__ = Mock()
mock.__repr__.return_value = r
mock.__str__ = Mock(wraps=mock.__str__)
return mock
def test_errorMessage():
# Create mock objects to take repr of
input = _createReprMock("1")
expected = _createReprMock("2")
actual = _createReprMock("3")
# Run the actual test
r = support.errorMessage(input, expected, actual)
# Assertions!
if six.PY2:
assert b"Input:\n1\nExpected:\n2\nRecieved\n3\n" == r
else:
assert six.PY3
assert "Input:\n1\nExpected:\n2\nRecieved\n3\n" == r
assert input.__repr__.call_count == 1
assert expected.__repr__.call_count == 1
assert actual.__repr__.call_count == 1
assert not input.__str__.called
assert not expected.__str__.called
assert not actual.__str__.called

@ -0,0 +1,7 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib.filters.optionaltags import Filter
def test_empty():
assert list(Filter([])) == []

@ -0,0 +1,130 @@
from __future__ import absolute_import, division, unicode_literals
from six import PY2, text_type, unichr
import io
from . import support # noqa
from html5lib.constants import namespaces, tokenTypes
from html5lib import parse, parseFragment, HTMLParser
# tests that aren't autogenerated from text files
def test_assertDoctypeCloneable():
doc = parse('<!DOCTYPE HTML>', treebuilder="dom")
assert doc.cloneNode(True) is not None
def test_line_counter():
# http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
assert parse("<pre>\nx\n&gt;\n</pre>") is not None
def test_namespace_html_elements_0_dom():
doc = parse("<html></html>",
treebuilder="dom",
namespaceHTMLElements=True)
assert doc.childNodes[0].namespaceURI == namespaces["html"]
def test_namespace_html_elements_1_dom():
doc = parse("<html></html>",
treebuilder="dom",
namespaceHTMLElements=False)
assert doc.childNodes[0].namespaceURI is None
def test_namespace_html_elements_0_etree():
doc = parse("<html></html>",
treebuilder="etree",
namespaceHTMLElements=True)
assert doc.tag == "{%s}html" % (namespaces["html"],)
def test_namespace_html_elements_1_etree():
doc = parse("<html></html>",
treebuilder="etree",
namespaceHTMLElements=False)
assert doc.tag == "html"
def test_unicode_file():
assert parse(io.StringIO("a")) is not None
def test_maintain_attribute_order():
# This is here because we impl it in parser and not tokenizer
p = HTMLParser()
# generate loads to maximize the chance a hash-based mutation will occur
attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
token = {'name': 'html',
'selfClosing': False,
'selfClosingAcknowledged': False,
'type': tokenTypes["StartTag"],
'data': attrs}
out = p.normalizeToken(token)
attr_order = list(out["data"].keys())
assert attr_order == [x for x, i in attrs]
def test_duplicate_attribute():
# This is here because we impl it in parser and not tokenizer
doc = parse('<p class=a class=b>')
el = doc[1][0]
assert el.get("class") == "a"
def test_maintain_duplicate_attribute_order():
# This is here because we impl it in parser and not tokenizer
p = HTMLParser()
attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
token = {'name': 'html',
'selfClosing': False,
'selfClosingAcknowledged': False,
'type': tokenTypes["StartTag"],
'data': attrs + [('a', len(attrs))]}
out = p.normalizeToken(token)
attr_order = list(out["data"].keys())
assert attr_order == [x for x, i in attrs]
def test_debug_log():
parser = HTMLParser(debug=True)
parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")
expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]
if PY2:
for i, log in enumerate(expected):
log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
expected[i] = tuple(log)
assert parser.log == expected
def test_no_duplicate_clone():
frag = parseFragment("<b><em><foo><foob><fooc><aside></b></em>")
assert len(frag) == 2
def test_self_closing_col():
parser = HTMLParser()
parser.parseFragment('<table><colgroup><col /></colgroup></table>')
assert not parser.errors

@ -0,0 +1,127 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib import constants, parseFragment, serialize
from html5lib.filters import sanitizer
def runSanitizerTest(_, expected, input):
parsed = parseFragment(expected)
expected = serialize(parsed,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char='"',
alphabetical_attributes=True)
assert expected == sanitize_html(input)
def sanitize_html(stream):
parsed = parseFragment(stream)
serialized = serialize(parsed,
sanitize=True,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char='"',
alphabetical_attributes=True)
return serialized
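# Illustrative example: sanitize_html('<script>x</script>') escapes the
# disallowed element, returning '&lt;script&gt;x&lt;/script&gt;'.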
def test_should_handle_astral_plane_characters():
sanitized = sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
expected = '<p>\U0001d4b5 \U0001d538</p>'
assert expected == sanitized
def test_should_allow_relative_uris():
sanitized = sanitize_html('<p><a href="/example.com"></a></p>')
expected = '<p><a href="/example.com"></a></p>'
assert expected == sanitized
def test_invalid_data_uri():
sanitized = sanitize_html('<audio controls="" src="data:foobar"></audio>')
expected = '<audio controls></audio>'
assert expected == sanitized
def test_invalid_ipv6_url():
sanitized = sanitize_html('<a href="h://]">')
expected = "<a></a>"
assert expected == sanitized
def test_data_uri_disallowed_type():
sanitized = sanitize_html('<audio controls="" src="data:text/html,<html>"></audio>')
expected = "<audio controls></audio>"
assert expected == sanitized
def test_sanitizer():
for ns, tag_name in sanitizer.allowed_elements:
if ns != constants.namespaces["html"]:
continue
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td',
'tfoot', 'th', 'thead', 'tr', 'select']:
continue # TODO
if tag_name == 'image':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
elif tag_name == 'br':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
elif tag_name in constants.voidElements:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
else:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
for ns, attribute_name in sanitizer.allowed_attributes:
if ns is not None:
continue
if attribute_name != attribute_name.lower():
continue # TODO
if attribute_name == 'style':
continue
attribute_value = 'foo'
if attribute_name in sanitizer.attr_val_is_uri:
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
"<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value))
for protocol in sanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
for protocol in sanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
protocol = protocol.upper()
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
def test_lowercase_color_codes_in_style():
sanitized = sanitize_html("<p style=\"border: 1px solid #a2a2a2;\"></p>")
expected = '<p style=\"border: 1px solid #a2a2a2;\"></p>'
assert expected == sanitized
def test_uppercase_color_codes_in_style():
sanitized = sanitize_html("<p style=\"border: 1px solid #A2A2A2;\"></p>")
expected = '<p style=\"border: 1px solid #A2A2A2;\"></p>'
assert expected == sanitized

@ -0,0 +1,225 @@
from __future__ import absolute_import, division, unicode_literals
import os
import json
import pytest
from .support import get_data_files
from html5lib import constants
from html5lib.filters.lint import Filter as Lint
from html5lib.serializer import HTMLSerializer, serialize
from html5lib.treewalkers.base import TreeWalker
# pylint:disable=wrong-import-position
optionals_loaded = []
try:
from lxml import etree
optionals_loaded.append("lxml")
except ImportError:
pass
# pylint:enable=wrong-import-position
default_namespace = constants.namespaces["html"]
class JsonWalker(TreeWalker):
def __iter__(self):
for token in self.tree:
type = token[0]
if type == "StartTag":
if len(token) == 4:
namespace, name, attrib = token[1:4]
else:
namespace = default_namespace
name, attrib = token[1:3]
yield self.startTag(namespace, name, self._convertAttrib(attrib))
elif type == "EndTag":
if len(token) == 3:
namespace, name = token[1:3]
else:
namespace = default_namespace
name = token[1]
yield self.endTag(namespace, name)
elif type == "EmptyTag":
if len(token) == 4:
namespace, name, attrib = token[1:]
else:
namespace = default_namespace
name, attrib = token[1:]
for token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
yield token
elif type == "Comment":
yield self.comment(token[1])
elif type in ("Characters", "SpaceCharacters"):
for token in self.text(token[1]):
yield token
elif type == "Doctype":
if len(token) == 4:
yield self.doctype(token[1], token[2], token[3])
elif len(token) == 3:
yield self.doctype(token[1], token[2])
else:
yield self.doctype(token[1])
else:
raise ValueError("Unknown token type: " + type)
def _convertAttrib(self, attribs):
"""html5lib tree-walkers use a dict of (namespace, name): value for
attributes, but JSON cannot represent this. Convert from the format
in the serializer tests (a list of dicts with "namespace", "name",
and "value" as keys) to html5lib's tree-walker format."""
attrs = {}
for attrib in attribs:
name = (attrib["namespace"], attrib["name"])
assert(name not in attrs)
attrs[name] = attrib["value"]
return attrs
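# For example (illustrative), the serializer-test attribute list
#     [{"namespace": None, "name": "title", "value": "foo"}]
# becomes the tree-walker mapping {(None, "title"): "foo"}.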
def serialize_html(input, options):
options = dict([(str(k), v) for k, v in options.items()])
encoding = options.get("encoding", None)
if "encoding" in options:
del options["encoding"]
stream = Lint(JsonWalker(input), False)
serializer = HTMLSerializer(alphabetical_attributes=True, **options)
return serializer.render(stream, encoding)
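# Illustrative use with a hypothetical token stream:
#     serialize_html([["StartTag", "http://www.w3.org/1999/xhtml", "b", {}]], {})
# returns "<b>".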
def runSerializerTest(input, expected, options):
encoding = options.get("encoding", None)
if encoding:
expected = list(map(lambda x: x.encode(encoding), expected))
result = serialize_html(input, options)
if len(expected) == 1:
assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
elif result not in expected:
assert False, "Expected: %s, Received: %s" % (expected, result)
def throwsWithLatin1(input):
with pytest.raises(UnicodeEncodeError):
serialize_html(input, {"encoding": "iso-8859-1"})
def testDoctypeName():
throwsWithLatin1([["Doctype", "\u0101"]])
def testDoctypePublicId():
throwsWithLatin1([["Doctype", "potato", "\u0101"]])
def testDoctypeSystemId():
throwsWithLatin1([["Doctype", "potato", "potato", "\u0101"]])
def testCdataCharacters():
runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
["<style>&amacr;"], {"encoding": "iso-8859-1"})
def testCharacters():
runSerializerTest([["Characters", "\u0101"]],
["&amacr;"], {"encoding": "iso-8859-1"})
def testStartTagName():
throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]])
def testAttributeName():
throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": None, "name": "\u0101", "value": "potato"}]]])
def testAttributeValue():
runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "span",
[{"namespace": None, "name": "potato", "value": "\u0101"}]]],
["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})
def testEndTagName():
throwsWithLatin1([["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]])
def testComment():
throwsWithLatin1([["Comment", "\u0101"]])
def testThrowsUnknownOption():
with pytest.raises(TypeError):
HTMLSerializer(foobar=None)
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
def testSpecQuoteAttribute(c):
input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
[{"namespace": None, "name": "foo", "value": c}]]]
if c == '"':
output_ = ["<span foo='%s'>" % c]
else:
output_ = ['<span foo="%s">' % c]
options_ = {"quote_attr_values": "spec"}
runSerializerTest(input_, output_, options_)
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000"))
def testLegacyQuoteAttribute(c):
input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
[{"namespace": None, "name": "foo", "value": c}]]]
if c == '"':
output_ = ["<span foo='%s'>" % c]
else:
output_ = ['<span foo="%s">' % c]
options_ = {"quote_attr_values": "legacy"}
runSerializerTest(input_, output_, options_)
@pytest.fixture
def lxml_parser():
return etree.XMLParser(resolve_entities=False)
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityReplacement(lxml_parser):
doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
result = serialize(tree, tree="lxml", omit_optional_tags=False)
assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>'
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityXML(lxml_parser):
doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
result = serialize(tree, tree="lxml", omit_optional_tags=False)
assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityNoResolve(lxml_parser):
doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
result = serialize(tree, tree="lxml", omit_optional_tags=False,
resolve_entities=False)
assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
def test_serializer():
for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
with open(filename) as fp:
tests = json.load(fp)
for test in tests['tests']:
yield runSerializerTest, test["input"], test["expected"], test.get("options", {})

@ -0,0 +1,323 @@
from __future__ import absolute_import, division, unicode_literals
from . import support # noqa
import codecs
import sys
from io import BytesIO, StringIO
import pytest
import six
from six.moves import http_client, urllib
from html5lib._inputstream import (BufferedStream, HTMLInputStream,
HTMLUnicodeInputStream, HTMLBinaryInputStream)
from html5lib._utils import supports_lone_surrogates
def test_basic():
s = b"abc"
fp = BufferedStream(BytesIO(s))
read = fp.read(10)
assert read == s
def test_read_length():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
assert read1 == b"a"
read2 = fp.read(2)
assert read2 == b"bc"
read3 = fp.read(3)
assert read3 == b"def"
read4 = fp.read(4)
assert read4 == b""
def test_tell():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
assert read1 == b"a"
assert fp.tell() == 1
read2 = fp.read(2)
assert read2 == b"bc"
assert fp.tell() == 3
read3 = fp.read(3)
assert read3 == b"def"
assert fp.tell() == 6
read4 = fp.read(4)
assert read4 == b""
assert fp.tell() == 6
def test_seek():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
assert read1 == b"a"
fp.seek(0)
read2 = fp.read(1)
assert read2 == b"a"
read3 = fp.read(2)
assert read3 == b"bc"
fp.seek(2)
read4 = fp.read(2)
assert read4 == b"cd"
fp.seek(4)
read5 = fp.read(2)
assert read5 == b"ef"
def test_seek_tell():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
assert read1 == b"a"
assert fp.tell() == 1
fp.seek(0)
read2 = fp.read(1)
assert read2 == b"a"
assert fp.tell() == 1
read3 = fp.read(2)
assert read3 == b"bc"
assert fp.tell() == 3
fp.seek(2)
read4 = fp.read(2)
assert read4 == b"cd"
assert fp.tell() == 4
fp.seek(4)
read5 = fp.read(2)
assert read5 == b"ef"
assert fp.tell() == 6
class HTMLUnicodeInputStreamShortChunk(HTMLUnicodeInputStream):
_defaultChunkSize = 2
class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
_defaultChunkSize = 2
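# A two-character chunk size forces newlines and multi-character sequences
# to straddle chunk boundaries, so the tests below also exercise the
# streams' cross-chunk position bookkeeping.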
def test_char_ascii():
stream = HTMLInputStream(b"'", override_encoding='ascii')
assert stream.charEncoding[0].name == 'windows-1252'
assert stream.char() == "'"
def test_char_utf8():
stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8')
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == '\u2018'
def test_char_win1252():
stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252'))
assert stream.charEncoding[0].name == 'windows-1252'
assert stream.char() == "\xa9"
assert stream.char() == "\xf1"
assert stream.char() == "\u2019"
def test_bom():
stream = HTMLInputStream(codecs.BOM_UTF8 + b"'")
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == "'"
def test_utf_16():
stream = HTMLInputStream((' ' * 1025).encode('utf-16'))
assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be']
assert len(stream.charsUntil(' ', True)) == 1025
def test_newlines():
stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe")
assert stream.position() == (1, 0)
assert stream.charsUntil('c') == "a\nbb\n"
assert stream.position() == (3, 0)
assert stream.charsUntil('x') == "ccc\ndddd"
assert stream.position() == (4, 4)
assert stream.charsUntil('e') == "x"
assert stream.position() == (4, 5)
def test_newlines2():
size = HTMLUnicodeInputStream._defaultChunkSize
stream = HTMLInputStream("\r" * size + "\n")
assert stream.charsUntil('x') == "\n" * size
def test_position():
stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh")
assert stream.position() == (1, 0)
assert stream.charsUntil('c') == "a\nbb\n"
assert stream.position() == (3, 0)
stream.unget("\n")
assert stream.position() == (2, 2)
assert stream.charsUntil('c') == "\n"
assert stream.position() == (3, 0)
stream.unget("\n")
assert stream.position() == (2, 2)
assert stream.char() == "\n"
assert stream.position() == (3, 0)
assert stream.charsUntil('e') == "ccc\nddd"
assert stream.position() == (4, 3)
assert stream.charsUntil('h') == "e\nf\ng"
assert stream.position() == (6, 1)
def test_position2():
stream = HTMLUnicodeInputStreamShortChunk("abc\nd")
assert stream.position() == (1, 0)
assert stream.char() == "a"
assert stream.position() == (1, 1)
assert stream.char() == "b"
assert stream.position() == (1, 2)
assert stream.char() == "c"
assert stream.position() == (1, 3)
assert stream.char() == "\n"
assert stream.position() == (2, 0)
assert stream.char() == "d"
assert stream.position() == (2, 1)
def test_python_issue_20007():
"""
Make sure we have a work-around for Python bug #20007
http://bugs.python.org/issue20007
"""
class FakeSocket(object):
def makefile(self, _mode, _bufsize=None):
# pylint:disable=unused-argument
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
source = http_client.HTTPResponse(FakeSocket())
source.begin()
stream = HTMLInputStream(source)
assert stream.charsUntil(" ") == "Text"
def test_python_issue_20007_b():
"""
Make sure we have a work-around for Python bug #20007
http://bugs.python.org/issue20007
"""
if six.PY2:
return
class FakeSocket(object):
def makefile(self, _mode, _bufsize=None):
# pylint:disable=unused-argument
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
source = http_client.HTTPResponse(FakeSocket())
source.begin()
wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
stream = HTMLInputStream(wrapped)
assert stream.charsUntil(" ") == "Text"
@pytest.mark.parametrize("inp,num",
[("\u0000", 0),
("\u0001", 1),
("\u0008", 1),
("\u0009", 0),
("\u000A", 0),
("\u000B", 1),
("\u000C", 0),
("\u000D", 0),
("\u000E", 1),
("\u001F", 1),
("\u0020", 0),
("\u007E", 0),
("\u007F", 1),
("\u009F", 1),
("\u00A0", 0),
("\uFDCF", 0),
("\uFDD0", 1),
("\uFDEF", 1),
("\uFDF0", 0),
("\uFFFD", 0),
("\uFFFE", 1),
("\uFFFF", 1),
("\U0001FFFD", 0),
("\U0001FFFE", 1),
("\U0001FFFF", 1),
("\U0002FFFD", 0),
("\U0002FFFE", 1),
("\U0002FFFF", 1),
("\U0003FFFD", 0),
("\U0003FFFE", 1),
("\U0003FFFF", 1),
("\U0004FFFD", 0),
("\U0004FFFE", 1),
("\U0004FFFF", 1),
("\U0005FFFD", 0),
("\U0005FFFE", 1),
("\U0005FFFF", 1),
("\U0006FFFD", 0),
("\U0006FFFE", 1),
("\U0006FFFF", 1),
("\U0007FFFD", 0),
("\U0007FFFE", 1),
("\U0007FFFF", 1),
("\U0008FFFD", 0),
("\U0008FFFE", 1),
("\U0008FFFF", 1),
("\U0009FFFD", 0),
("\U0009FFFE", 1),
("\U0009FFFF", 1),
("\U000AFFFD", 0),
("\U000AFFFE", 1),
("\U000AFFFF", 1),
("\U000BFFFD", 0),
("\U000BFFFE", 1),
("\U000BFFFF", 1),
("\U000CFFFD", 0),
("\U000CFFFE", 1),
("\U000CFFFF", 1),
("\U000DFFFD", 0),
("\U000DFFFE", 1),
("\U000DFFFF", 1),
("\U000EFFFD", 0),
("\U000EFFFE", 1),
("\U000EFFFF", 1),
("\U000FFFFD", 0),
("\U000FFFFE", 1),
("\U000FFFFF", 1),
("\U0010FFFD", 0),
("\U0010FFFE", 1),
("\U0010FFFF", 1),
("\x01\x01\x01", 3),
("a\x01a\x01a\x01a", 3)])
def test_invalid_codepoints(inp, num):
stream = HTMLUnicodeInputStream(StringIO(inp))
for _i in range(len(inp)):
stream.char()
assert len(stream.errors) == num
@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
@pytest.mark.parametrize("inp,num",
[("'\\uD7FF'", 0),
("'\\uD800'", 1),
("'\\uDBFF'", 1),
("'\\uDC00'", 1),
("'\\uDFFF'", 1),
("'\\uE000'", 0),
("'\\uD800\\uD800\\uD800'", 3),
("'a\\uD800a\\uD800a\\uD800a'", 3),
("'\\uDFFF\\uDBFF'", 2),
pytest.mark.skipif(sys.maxunicode == 0xFFFF,
("'\\uDBFF\\uDFFF'", 2),
reason="narrow Python")])
def test_invalid_codepoints_surrogates(inp, num):
inp = eval(inp) # pylint:disable=eval-used
fp = StringIO(inp)
if ord(max(fp.read())) > 0xFFFF:
pytest.skip("StringIO altered string")
fp.seek(0)
stream = HTMLUnicodeInputStream(fp)
for _i in range(len(inp)):
stream.char()
assert len(stream.errors) == num

@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals
from . import support # noqa
import html5lib
from html5lib.treeadapters import sax
from html5lib.treewalkers import getTreeWalker
def test_to_sax():
handler = support.TracingSaxHandler()
tree = html5lib.parse("""<html xml:lang="en">
<title>Directory Listing</title>
<a href="/"><b/></p>
""", treebuilder="etree")
walker = getTreeWalker("etree")
sax.to_sax(walker(tree), handler)
expected = [
'startDocument',
('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
'html', {(None, 'xml:lang'): 'en'}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
('characters', 'Directory Listing'),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
('characters', '\n '),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
('characters', '\n '),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
'endDocument',
]
assert expected == handler.visited

@ -0,0 +1,136 @@
from __future__ import absolute_import, division, unicode_literals
import itertools
import pytest
try:
import lxml.etree
except ImportError:
pass
from .support import treeTypes
from html5lib import html5parser, treewalkers
from html5lib.filters.lint import Filter as Lint
import re
attrlist = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+", re.M)
def sortattrs(x):
lines = x.group(0).split("\n")
lines.sort()
return "\n".join(lines)
def test_all_tokens():
expected = [
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'data': 'a', 'type': 'Characters'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'data': 'b', 'type': 'Characters'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'data': 'c', 'type': 'Characters'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
]
for _, treeCls in sorted(treeTypes.items()):
if treeCls is None:
continue
p = html5parser.HTMLParser(tree=treeCls["builder"])
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
document = treeCls.get("adapter", lambda x: x)(document)
output = Lint(treeCls["walker"](document))
for expectedToken, outputToken in zip(expected, output):
assert expectedToken == outputToken
def set_attribute_on_first_child(docfrag, name, value, treeName):
"""naively sets an attribute on the first child of the document
fragment passed in"""
setter = {'ElementTree': lambda d: d[0].set,
'DOM': lambda d: d.firstChild.setAttribute}
setter['cElementTree'] = setter['ElementTree']
try:
setter.get(treeName, setter['DOM'])(docfrag)(name, value)
except AttributeError:
setter['ElementTree'](docfrag)(name, value)
def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
"""tests what happens when we add attributes to the intext"""
treeName, treeClass = tree
if treeClass is None:
pytest.skip("Treebuilder not loaded")
parser = html5parser.HTMLParser(tree=treeClass["builder"])
document = parser.parseFragment(intext)
for nom, val in attrs_to_add:
set_attribute_on_first_child(document, nom, val, treeName)
document = treeClass.get("adapter", lambda x: x)(document)
output = treewalkers.pprint(treeClass["walker"](document))
output = attrlist.sub(sortattrs, output)
if output not in expected:
raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
def test_treewalker_six_mix():
"""Str/Unicode mix. If str attrs added to tree"""
# On Python 2.x string literals are of type str. Unless, like this
# file, the programmer imports unicode_literals from __future__.
# In that case, string literals become objects of type unicode.
# This test simulates a Py2 user, modifying attributes on a document
# fragment but not using the u'' syntax nor importing unicode_literals
sm_tests = [
('<a href="http://example.com">Example</a>',
[(str('class'), str('test123'))],
'<a>\n class="test123"\n href="http://example.com"\n "Example"'),
('<link href="http://example.com/cow">',
[(str('rel'), str('alternate'))],
'<link>\n href="http://example.com/cow"\n rel="alternate"\n "Example"')
]
for tree in sorted(treeTypes.items()):
for intext, attrs, expected in sm_tests:
yield runTreewalkerEditTest, intext, expected, attrs, tree
@pytest.mark.parametrize("tree,char", itertools.product(sorted(treeTypes.items()), ["x", "\u1234"]))
def test_fragment_single_char(tree, char):
expected = [
{'data': char, 'type': 'Characters'}
]
treeName, treeClass = tree
if treeClass is None:
pytest.skip("Treebuilder not loaded")
parser = html5parser.HTMLParser(tree=treeClass["builder"])
document = parser.parseFragment(char)
document = treeClass.get("adapter", lambda x: x)(document)
output = Lint(treeClass["walker"](document))
assert list(output) == expected
@pytest.mark.skipif(treeTypes["lxml"] is None, reason="lxml not importable")
def test_lxml_xml():
expected = [
{'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'},
{'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'},
{'name': 'div', 'namespace': None, 'type': 'EndTag'},
{'name': 'div', 'namespace': None, 'type': 'EndTag'}
]
lxmltree = lxml.etree.fromstring('<div><div></div></div>')
walker = treewalkers.getTreeWalker('lxml')
output = Lint(walker(lxmltree))
assert list(output) == expected

@ -0,0 +1,125 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib.filters.whitespace import Filter
from html5lib.constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
def runTest(input, expected):
output = list(Filter(input))
errorMsg = "\n".join(["\n\nInput:", str(input),
"\nExpected:", str(expected),
"\nReceived:", str(output)])
assert expected == output, errorMsg
def runTestUnmodifiedOutput(input):
runTest(input, input)
def testPhrasingElements():
runTestUnmodifiedOutput(
[{"type": "Characters", "data": "This is a "},
{"type": "StartTag", "name": "span", "data": []},
{"type": "Characters", "data": "phrase"},
{"type": "EndTag", "name": "span", "data": []},
{"type": "SpaceCharacters", "data": " "},
{"type": "Characters", "data": "with"},
{"type": "SpaceCharacters", "data": " "},
{"type": "StartTag", "name": "em", "data": []},
{"type": "Characters", "data": "emphasised text"},
{"type": "EndTag", "name": "em", "data": []},
{"type": "Characters", "data": " and an "},
{"type": "StartTag", "name": "img", "data": [["alt", "image"]]},
{"type": "Characters", "data": "."}])
def testLeadingWhitespace():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "SpaceCharacters", "data": spaceCharacters},
{"type": "Characters", "data": "foo"},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "SpaceCharacters", "data": " "},
{"type": "Characters", "data": "foo"},
{"type": "EndTag", "name": "p", "data": []}])
def testLeadingWhitespaceAsCharacters():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": spaceCharacters + "foo"},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": " foo"},
{"type": "EndTag", "name": "p", "data": []}])
def testTrailingWhitespace():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo"},
{"type": "SpaceCharacters", "data": spaceCharacters},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo"},
{"type": "SpaceCharacters", "data": " "},
{"type": "EndTag", "name": "p", "data": []}])
def testTrailingWhitespaceAsCharacters():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo" + spaceCharacters},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo "},
{"type": "EndTag", "name": "p", "data": []}])
def testWhitespace():
runTest(
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo" + spaceCharacters + "bar"},
{"type": "EndTag", "name": "p", "data": []}],
[{"type": "StartTag", "name": "p", "data": []},
{"type": "Characters", "data": "foo bar"},
{"type": "EndTag", "name": "p", "data": []}])
def testLeadingWhitespaceInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "SpaceCharacters", "data": spaceCharacters},
{"type": "Characters", "data": "foo"},
{"type": "EndTag", "name": "pre", "data": []}])
def testLeadingWhitespaceAsCharactersInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "Characters", "data": spaceCharacters + "foo"},
{"type": "EndTag", "name": "pre", "data": []}])
def testTrailingWhitespaceInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "Characters", "data": "foo"},
{"type": "SpaceCharacters", "data": spaceCharacters},
{"type": "EndTag", "name": "pre", "data": []}])
def testTrailingWhitespaceAsCharactersInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "Characters", "data": "foo" + spaceCharacters},
{"type": "EndTag", "name": "pre", "data": []}])
def testWhitespaceInPre():
runTestUnmodifiedOutput(
[{"type": "StartTag", "name": "pre", "data": []},
{"type": "Characters", "data": "foo" + spaceCharacters + "bar"},
{"type": "EndTag", "name": "pre", "data": []}])

View file

@ -0,0 +1,252 @@
from __future__ import absolute_import, division, unicode_literals
import codecs
import json
import warnings
import re
import pytest
from six import unichr
from html5lib._tokenizer import HTMLTokenizer
from html5lib import constants, _utils
class TokenizerTestParser(object):
def __init__(self, initialState, lastStartTag=None):
self.tokenizer = HTMLTokenizer
self._state = initialState
self._lastStartTag = lastStartTag
def parse(self, stream, encoding=None, innerHTML=False):
# pylint:disable=unused-argument
tokenizer = self.tokenizer(stream, encoding)
self.outputTokens = []
tokenizer.state = getattr(tokenizer, self._state)
if self._lastStartTag is not None:
tokenizer.currentToken = {"type": "startTag",
"name": self._lastStartTag}
types = dict((v, k) for k, v in constants.tokenTypes.items())
for token in tokenizer:
getattr(self, 'process%s' % types[token["type"]])(token)
return self.outputTokens
def processDoctype(self, token):
self.outputTokens.append(["DOCTYPE", token["name"], token["publicId"],
token["systemId"], token["correct"]])
def processStartTag(self, token):
self.outputTokens.append(["StartTag", token["name"],
dict(token["data"][::-1]), token["selfClosing"]])
def processEmptyTag(self, token):
if token["name"] not in constants.voidElements:
self.outputTokens.append("ParseError")
self.outputTokens.append(["StartTag", token["name"], dict(token["data"][::-1])])
def processEndTag(self, token):
self.outputTokens.append(["EndTag", token["name"],
token["selfClosing"]])
def processComment(self, token):
self.outputTokens.append(["Comment", token["data"]])
def processSpaceCharacters(self, token):
self.outputTokens.append(["Character", token["data"]])
self.processSpaceCharacters = self.processCharacters
def processCharacters(self, token):
self.outputTokens.append(["Character", token["data"]])
def processEOF(self, token):
pass
def processParseError(self, token):
self.outputTokens.append(["ParseError", token["data"]])
def concatenateCharacterTokens(tokens):
outputTokens = []
for token in tokens:
if "ParseError" not in token and token[0] == "Character":
if (outputTokens and "ParseError" not in outputTokens[-1] and
outputTokens[-1][0] == "Character"):
outputTokens[-1][1] += token[1]
else:
outputTokens.append(token)
else:
outputTokens.append(token)
return outputTokens
def normalizeTokens(tokens):
# TODO: convert tests to reflect arrays
for i, token in enumerate(tokens):
if token[0] == 'ParseError':
tokens[i] = token[0]
return tokens
def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
ignoreErrors=False):
"""Test whether the test has passed or failed
If the ignoreErrorOrder flag is set to true, we don't test the relative
positions of parse errors and non-parse errors
"""
checkSelfClosing = False
for token in expectedTokens:
if (token[0] == "StartTag" and len(token) == 4 or
token[0] == "EndTag" and len(token) == 3):
checkSelfClosing = True
break
if not checkSelfClosing:
for token in receivedTokens:
if token[0] == "StartTag" or token[0] == "EndTag":
token.pop()
if not ignoreErrorOrder and not ignoreErrors:
expectedTokens = concatenateCharacterTokens(expectedTokens)
return expectedTokens == receivedTokens
else:
# Sort the tokens into two groups; non-parse errors and parse errors
tokens = {"expected": [[], []], "received": [[], []]}
for tokenType, tokenList in zip(list(tokens.keys()),
(expectedTokens, receivedTokens)):
for token in tokenList:
if token != "ParseError":
tokens[tokenType][0].append(token)
else:
if not ignoreErrors:
tokens[tokenType][1].append(token)
tokens[tokenType][0] = concatenateCharacterTokens(tokens[tokenType][0])
return tokens["expected"] == tokens["received"]
_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
def unescape(test):
def decode(inp):
"""Decode \\uXXXX escapes
This decodes \\uXXXX escapes, possibly into non-BMP characters when
two surrogate character escapes are adjacent to each other.
"""
# This cannot be implemented using the unicode_escape codec
# because that requires its input be ISO-8859-1, and we need
# arbitrary unicode as input.
def repl(m):
if m.group(2) is not None:
high = int(m.group(1), 16)
low = int(m.group(2), 16)
if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
return unichr(cp)
else:
return unichr(high) + unichr(low)
else:
return unichr(int(m.group(1), 16))
try:
return _surrogateRe.sub(repl, inp)
except ValueError:
# This occurs when unichr throws ValueError, which should
# only be for a lone-surrogate.
if _utils.supports_lone_surrogates:
raise
return None
test["input"] = decode(test["input"])
for token in test["output"]:
if token == "ParseError":
continue
else:
token[1] = decode(token[1])
if len(token) > 2:
for key, value in list(token[2].items()):
del token[2][key]
token[2][decode(key)] = decode(value)
return test
def _doCapitalize(match):
return match.group(1).upper()
_capitalizeRe = re.compile(r"\W+(\w)").sub
def capitalize(s):
s = s.lower()
s = _capitalizeRe(_doCapitalize, s)
return s
class TokenizerFile(pytest.File):
def collect(self):
with codecs.open(str(self.fspath), "r", encoding="utf-8") as fp:
tests = json.load(fp)
if 'tests' in tests:
for i, test in enumerate(tests['tests']):
yield TokenizerTestCollector(str(i), self, testdata=test)
class TokenizerTestCollector(pytest.Collector):
def __init__(self, name, parent=None, config=None, session=None, testdata=None):
super(TokenizerTestCollector, self).__init__(name, parent, config, session)
if 'initialStates' not in testdata:
testdata["initialStates"] = ["Data state"]
if 'doubleEscaped' in testdata:
testdata = unescape(testdata)
self.testdata = testdata
def collect(self):
for initialState in self.testdata["initialStates"]:
initialState = capitalize(initialState)
item = TokenizerTest(initialState,
self,
self.testdata,
initialState)
if self.testdata["input"] is None:
item.add_marker(pytest.mark.skipif(True, reason="Relies on lone surrogates"))
yield item
class TokenizerTest(pytest.Item):
def __init__(self, name, parent, test, initialState):
super(TokenizerTest, self).__init__(name, parent)
self.obj = lambda: 1 # this is to hack around skipif needing a function!
self.test = test
self.initialState = initialState
def runtest(self):
warnings.resetwarnings()
warnings.simplefilter("error")
expected = self.test['output']
if 'lastStartTag' not in self.test:
self.test['lastStartTag'] = None
parser = TokenizerTestParser(self.initialState,
self.test['lastStartTag'])
tokens = parser.parse(self.test['input'])
received = normalizeTokens(tokens)
errorMsg = "\n".join(["\n\nInitial state:",
self.initialState,
"\nInput:", self.test['input'],
"\nExpected:", repr(expected),
"\nreceived:", repr(tokens)])
ignoreErrorOrder = self.test.get('ignoreErrorOrder', False)
assert tokensMatch(expected, received, ignoreErrorOrder, True), errorMsg
def repr_failure(self, excinfo):
traceback = excinfo.traceback
ntraceback = traceback.cut(path=__file__)
excinfo.traceback = ntraceback.filter()
return excinfo.getrepr(funcargs=True,
showlocals=False,
style="short", tbfilter=False)

View file

@ -0,0 +1,68 @@
from __future__ import absolute_import, division, unicode_literals
import sys
import os
import json
import re
import html5lib
from . import support
from . import test_tokenizer
p = html5lib.HTMLParser()
unnamespaceExpected = re.compile(r"^(\|\s*)<html ([^>]+)>", re.M).sub
def main(out_path):
if not os.path.exists(out_path):
sys.stderr.write("Path %s does not exist" % out_path)
sys.exit(1)
for filename in support.get_data_files('tokenizer', '*.test'):
run_file(filename, out_path)
def run_file(filename, out_path):
try:
tests_data = json.load(open(filename, "r"))
except ValueError:
sys.stderr.write("Failed to load %s\n" % filename)
return
name = os.path.splitext(os.path.split(filename)[1])[0]
output_file = open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w")
if 'tests' in tests_data:
for test_data in tests_data['tests']:
if 'initialStates' not in test_data:
test_data["initialStates"] = ["Data state"]
for initial_state in test_data["initialStates"]:
if initial_state != "Data state":
# don't support this yet
continue
test = make_test(test_data)
output_file.write(test)
output_file.close()
def make_test(test_data):
if 'doubleEscaped' in test_data:
test_data = test_tokenizer.unescape(test_data)
rv = []
rv.append("#data")
rv.append(test_data["input"].encode("utf8"))
rv.append("#errors")
tree = p.parse(test_data["input"])
output = p.tree.testSerializer(tree)
output = "\n".join(("| " + line[3:]) if line.startswith("| ") else line
for line in output.split("\n"))
output = unnamespaceExpected(r"\1<\2>", output)
rv.append(output.encode("utf8"))
rv.append("")
return "\n".join(rv)
if __name__ == "__main__":
main(sys.argv[1])
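
A hedged invocation sketch for the converter above; the module path html5lib.tests.tokenizertotree is an assumption inferred from the relative imports, and the output directory is hypothetical (it must already exist):

    from html5lib.tests import tokenizertotree

    # Writes one tokenizer_<name>.dat file per tokenizer test file.
    tokenizertotree.main("/tmp/tokenizer-dats")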

View file

@ -0,0 +1,204 @@
from __future__ import absolute_import, division, unicode_literals
import itertools
import re
import warnings
from difflib import unified_diff
import pytest
from .support import TestData, convert, convertExpected, treeTypes
from html5lib import html5parser, constants, treewalkers
from html5lib.filters.lint import Filter as Lint
_attrlist_re = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+", re.M)
def sortattrs(s):
def replace(m):
lines = m.group(0).split("\n")
lines.sort()
return "\n".join(lines)
return _attrlist_re.sub(replace, s)
class TreeConstructionFile(pytest.File):
def collect(self):
tests = TestData(str(self.fspath), "data")
for i, test in enumerate(tests):
yield TreeConstructionTest(str(i), self, testdata=test)
class TreeConstructionTest(pytest.Collector):
def __init__(self, name, parent=None, config=None, session=None, testdata=None):
super(TreeConstructionTest, self).__init__(name, parent, config, session)
self.testdata = testdata
def collect(self):
for treeName, treeAPIs in sorted(treeTypes.items()):
for x in itertools.chain(self._getParserTests(treeName, treeAPIs),
self._getTreeWalkerTests(treeName, treeAPIs)):
yield x
def _getParserTests(self, treeName, treeAPIs):
if treeAPIs is not None and "adapter" in treeAPIs:
return
for namespaceHTMLElements in (True, False):
if namespaceHTMLElements:
nodeid = "%s::parser::namespaced" % treeName
else:
nodeid = "%s::parser::void-namespace" % treeName
item = ParserTest(nodeid,
self,
self.testdata,
treeAPIs["builder"] if treeAPIs is not None else None,
namespaceHTMLElements)
item.add_marker(getattr(pytest.mark, treeName))
item.add_marker(pytest.mark.parser)
if namespaceHTMLElements:
item.add_marker(pytest.mark.namespaced)
if treeAPIs is None:
item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
yield item
def _getTreeWalkerTests(self, treeName, treeAPIs):
nodeid = "%s::treewalker" % treeName
item = TreeWalkerTest(nodeid,
self,
self.testdata,
treeAPIs)
item.add_marker(getattr(pytest.mark, treeName))
item.add_marker(pytest.mark.treewalker)
if treeAPIs is None:
item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
yield item
def convertTreeDump(data):
return "\n".join(convert(3)(data).split("\n")[1:])
namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
class ParserTest(pytest.Item):
def __init__(self, name, parent, test, treeClass, namespaceHTMLElements):
super(ParserTest, self).__init__(name, parent)
self.obj = lambda: 1 # this is to hack around skipif needing a function!
self.test = test
self.treeClass = treeClass
self.namespaceHTMLElements = namespaceHTMLElements
def runtest(self):
p = html5parser.HTMLParser(tree=self.treeClass,
namespaceHTMLElements=self.namespaceHTMLElements)
input = self.test['data']
fragmentContainer = self.test['document-fragment']
expected = convertExpected(self.test['document'])
expectedErrors = self.test['errors'].split("\n") if self.test['errors'] else []
scripting = False
if 'script-on' in self.test:
scripting = True
with warnings.catch_warnings():
warnings.simplefilter("error")
try:
if fragmentContainer:
document = p.parseFragment(input, fragmentContainer, scripting=scripting)
else:
document = p.parse(input, scripting=scripting)
except constants.DataLossWarning:
pytest.skip("data loss warning")
output = convertTreeDump(p.tree.testSerializer(document))
if self.namespaceHTMLElements:
expected = namespaceExpected(r"\1<html \2>", expected)
errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
"\nReceived:", output])
assert expected == output, errorMsg
errStr = []
for (line, col), errorcode, datavars in p.errors:
assert isinstance(datavars, dict), "%s, %s" % (errorcode, repr(datavars))
errStr.append("Line: %i Col: %i %s" % (line, col,
constants.E[errorcode] % datavars))
errorMsg2 = "\n".join(["\n\nInput:", input,
"\nExpected errors (" + str(len(expectedErrors)) + "):\n" + "\n".join(expectedErrors),
"\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
if False: # we're currently not testing parse errors
assert len(p.errors) == len(expectedErrors), errorMsg2
def repr_failure(self, excinfo):
traceback = excinfo.traceback
ntraceback = traceback.cut(path=__file__)
excinfo.traceback = ntraceback.filter()
return excinfo.getrepr(funcargs=True,
showlocals=False,
style="short", tbfilter=False)
class TreeWalkerTest(pytest.Item):
def __init__(self, name, parent, test, treeAPIs):
super(TreeWalkerTest, self).__init__(name, parent)
self.obj = lambda: 1 # this is to hack around skipif needing a function!
self.test = test
self.treeAPIs = treeAPIs
def runtest(self):
p = html5parser.HTMLParser(tree=self.treeAPIs["builder"])
input = self.test['data']
fragmentContainer = self.test['document-fragment']
expected = convertExpected(self.test['document'])
scripting = False
if 'script-on' in self.test:
scripting = True
with warnings.catch_warnings():
warnings.simplefilter("error")
try:
if fragmentContainer:
document = p.parseFragment(input, fragmentContainer, scripting=scripting)
else:
document = p.parse(input, scripting=scripting)
except constants.DataLossWarning:
pytest.skip("data loss warning")
poutput = convertTreeDump(p.tree.testSerializer(document))
namespace_expected = namespaceExpected(r"\1<html \2>", expected)
if poutput != namespace_expected:
pytest.skip("parser output incorrect")
document = self.treeAPIs.get("adapter", lambda x: x)(document)
try:
output = treewalkers.pprint(Lint(self.treeAPIs["walker"](document)))
output = sortattrs(output)
expected = sortattrs(expected)
diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
[line + "\n" for line in output.splitlines()],
"Expected", "Received"))
assert expected == output, "\n".join([
"", "Input:", input,
"", "Expected:", expected,
"", "Received:", output,
"", "Diff:", diff,
])
except NotImplementedError:
pytest.skip("tree walker NotImplementedError")
def repr_failure(self, excinfo):
traceback = excinfo.traceback
ntraceback = traceback.cut(path=__file__)
excinfo.traceback = ntraceback.filter()
return excinfo.getrepr(funcargs=True,
showlocals=False,
style="short", tbfilter=False)

View file

@ -0,0 +1,3 @@
<!doctype html>
<title>Test</title>
<p>Hello World!

View file

@ -0,0 +1,3 @@
<!doctype html>
<title>Test</title>
<p>Hello World! ©

View file

@ -0,0 +1,30 @@
"""Tree adapters let you convert from one tree structure to another
Example:
.. code-block:: python
import html5lib
from html5lib.treeadapters import genshi
doc = '<html><body>Hi!</body></html>'
treebuilder = html5lib.getTreeBuilder('etree')
parser = html5lib.HTMLParser(tree=treebuilder)
tree = parser.parse(doc)
TreeWalker = html5lib.getTreeWalker('etree')
genshi_tree = genshi.to_genshi(TreeWalker(tree))
"""
from __future__ import absolute_import, division, unicode_literals
from . import sax
__all__ = ["sax"]
try:
from . import genshi # noqa
except ImportError:
pass
else:
__all__.append("genshi")

View file

@ -0,0 +1,54 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName, Attrs
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
def to_genshi(walker):
"""Convert a tree to a genshi tree
:arg walker: the treewalker to use to walk the tree to convert it
:returns: generator of genshi nodes
"""
text = []
for token in walker:
type = token["type"]
if type in ("Characters", "SpaceCharacters"):
text.append(token["data"])
elif text:
yield TEXT, "".join(text), (None, -1, -1)
text = []
if type in ("StartTag", "EmptyTag"):
if token["namespace"]:
name = "{%s}%s" % (token["namespace"], token["name"])
else:
name = token["name"]
attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
for attr, value in token["data"].items()])
yield (START, (QName(name), attrs), (None, -1, -1))
if type == "EmptyTag":
type = "EndTag"
if type == "EndTag":
if token["namespace"]:
name = "{%s}%s" % (token["namespace"], token["name"])
else:
name = token["name"]
yield END, QName(name), (None, -1, -1)
elif type == "Comment":
yield COMMENT, token["data"], (None, -1, -1)
elif type == "Doctype":
yield DOCTYPE, (token["name"], token["publicId"],
token["systemId"]), (None, -1, -1)
else:
pass # FIXME: What to do?
if text:
yield TEXT, "".join(text), (None, -1, -1)

View file

@ -0,0 +1,50 @@
from __future__ import absolute_import, division, unicode_literals
from xml.sax.xmlreader import AttributesNSImpl
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
if prefix is not None:
prefix_mapping[prefix] = namespace
def to_sax(walker, handler):
"""Call SAX-like content handler based on treewalker walker
:arg walker: the treewalker to use to walk the tree to convert it
:arg handler: SAX handler to use
"""
handler.startDocument()
for prefix, namespace in prefix_mapping.items():
handler.startPrefixMapping(prefix, namespace)
for token in walker:
type = token["type"]
if type == "Doctype":
continue
elif type in ("StartTag", "EmptyTag"):
attrs = AttributesNSImpl(token["data"],
unadjustForeignAttributes)
handler.startElementNS((token["namespace"], token["name"]),
token["name"],
attrs)
if type == "EmptyTag":
handler.endElementNS((token["namespace"], token["name"]),
token["name"])
elif type == "EndTag":
handler.endElementNS((token["namespace"], token["name"]),
token["name"])
elif type in ("Characters", "SpaceCharacters"):
handler.characters(token["data"])
elif type == "Comment":
pass
else:
assert False, "Unknown token type"
for prefix, namespace in prefix_mapping.items():
handler.endPrefixMapping(prefix)
handler.endDocument()
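
A small sketch wiring to_sax to a standard-library SAX handler; the Collector class is illustrative, not part of html5lib:

    import html5lib
    from html5lib.treeadapters.sax import to_sax
    from xml.sax.handler import ContentHandler

    class Collector(ContentHandler):
        def __init__(self):
            ContentHandler.__init__(self)
            self.text = []

        def characters(self, data):
            # Called by to_sax for Characters/SpaceCharacters tokens.
            self.text.append(data)

    tree = html5lib.parse("<p>Hello, SAX!</p>")
    walker = html5lib.getTreeWalker("etree")
    collector = Collector()
    to_sax(walker(tree), collector)
    print("".join(collector.text))  # Hello, SAX!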

View file

@ -0,0 +1,88 @@
"""A collection of modules for building different kinds of trees from HTML
documents.
To create a treebuilder for a new type of tree, you need to
implement several things:
1. A set of classes for various types of elements: Document, Doctype, Comment,
Element. These must implement the interface of ``base.treebuilders.Node``
(although comment nodes have a different signature for their constructor,
see ``treebuilders.etree.Comment``) Textual content may also be implemented
as another node type, or not, as your tree implementation requires.
2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
* ``documentClass`` - the class to use for the bottommost node of a document
* ``elementClass`` - the class to use for HTML Elements
* ``commentClass`` - the class to use for comments
* ``doctypeClass`` - the class to use for doctypes
It also has one required method:
* ``getDocument`` - Returns the root node of the complete document tree
3. If you wish to run the unit tests, you must also create a ``testSerializer``
method on your treebuilder which accepts a node and returns a string
containing Node and its children serialized according to the format used in
the unittests
"""
from __future__ import absolute_import, division, unicode_literals
from .._utils import default_etree
treeBuilderCache = {}
def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of trees with built-in support
:arg treeType: the name of the tree type required (case-insensitive). Supported
values are:
* "dom" - A generic builder for DOM implementations, defaulting to a
xml.dom.minidom based implementation.
* "etree" - A generic builder for tree implementations exposing an
ElementTree-like interface, defaulting to xml.etree.cElementTree if
available and xml.etree.ElementTree if not.
* "lxml" - A etree-based builder for lxml.etree, handling limitations
of lxml's implementation.
:arg implementation: (Currently applies to the "etree" and "dom" tree
types). A module implementing the tree type e.g. xml.etree.ElementTree
or xml.etree.cElementTree.
:arg kwargs: Any additional options to pass to the TreeBuilder when
creating it.
Example:
>>> from html5lib.treebuilders import getTreeBuilder
>>> builder = getTreeBuilder('etree')
"""
treeType = treeType.lower()
if treeType not in treeBuilderCache:
if treeType == "dom":
from . import dom
# Come up with a sane default (pref. from the stdlib)
if implementation is None:
from xml.dom import minidom
implementation = minidom
# NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "lxml":
from . import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
from . import etree
if implementation is None:
implementation = default_etree
# NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
else:
raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
return treeBuilderCache.get(treeType)
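
In practice the class returned by getTreeBuilder is handed to HTMLParser; a short usage sketch:

    import html5lib
    from html5lib.treebuilders import getTreeBuilder

    TreeBuilder = getTreeBuilder("dom")  # defaults to xml.dom.minidom
    parser = html5lib.HTMLParser(tree=TreeBuilder)
    document = parser.parse("<p>Hello</p>")
    print(document.toxml())  # a minidom Document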

View file

@ -0,0 +1,417 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from ..constants import scopingElements, tableInsertModeElements, namespaces
# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None
listElementsMap = {
None: (frozenset(scopingElements), False),
"button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
"list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")])), False),
"table": (frozenset([(namespaces["html"], "html"),
(namespaces["html"], "table")]), False),
"select": (frozenset([(namespaces["html"], "optgroup"),
(namespaces["html"], "option")]), True)
}
class Node(object):
"""Represents an item in the tree"""
def __init__(self, name):
"""Creates a Node
:arg name: The tag name associated with the node
"""
# The tag name associated with the node
self.name = name
# The parent of the current node (or None for the document node)
self.parent = None
# The value of the current node (applies to text nodes and comments)
self.value = None
# A dict holding name -> value pairs for attributes of the node
self.attributes = {}
# A list of child nodes of the current node. This must include all
# elements but not necessarily other node types.
self.childNodes = []
# A list of miscellaneous flags that can be set on the node.
self._flags = []
def __str__(self):
attributesStr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in
self.attributes.items()])
if attributesStr:
return "<%s %s>" % (self.name, attributesStr)
else:
return "<%s>" % (self.name)
def __repr__(self):
return "<%s>" % (self.name)
def appendChild(self, node):
"""Insert node as a child of the current node
:arg node: the node to insert
"""
raise NotImplementedError
def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text.
:arg data: the data to insert
:arg insertBefore: the child node to insert the text before, or None to
append the text to the end of the current node's text
"""
raise NotImplementedError
def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
the current node
:arg node: the node to insert
:arg refNode: the child node to insert the node before
"""
raise NotImplementedError
def removeChild(self, node):
"""Remove node from the children of the current node
:arg node: the child node to remove
"""
raise NotImplementedError
def reparentChildren(self, newParent):
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
text in the correct way
:arg newParent: the node to move all this node's children to
"""
# XXX - should this method be made more general?
for child in self.childNodes:
newParent.appendChild(child)
self.childNodes = []
def cloneNode(self):
"""Return a shallow copy of the current node i.e. a node with the same
name and attributes but with no parent or child nodes
"""
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text, false otherwise
"""
raise NotImplementedError
class ActiveFormattingElements(list):
def append(self, node):
equalCount = 0
if node != Marker:
for element in self[::-1]:
if element == Marker:
break
if self.nodesEqual(element, node):
equalCount += 1
if equalCount == 3:
self.remove(element)
break
list.append(self, node)
def nodesEqual(self, node1, node2):
if not node1.nameTuple == node2.nameTuple:
return False
if not node1.attributes == node2.attributes:
return False
return True
class TreeBuilder(object):
"""Base treebuilder implementation
* documentClass - the class to use for the bottommost node of a document
* elementClass - the class to use for HTML Elements
* commentClass - the class to use for comments
* doctypeClass - the class to use for doctypes
"""
# pylint:disable=not-callable
# Document class
documentClass = None
# The class to use for creating a node
elementClass = None
# The class to use for creating comments
commentClass = None
# The class to use for creating doctypes
doctypeClass = None
# Fragment class
fragmentClass = None
def __init__(self, namespaceHTMLElements):
"""Create a TreeBuilder
:arg namespaceHTMLElements: whether or not to namespace HTML elements
"""
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
self.reset()
def reset(self):
self.openElements = []
self.activeFormattingElements = ActiveFormattingElements()
# XXX - rename these to headElement, formElement
self.headPointer = None
self.formPointer = None
self.insertFromTable = False
self.document = self.documentClass()
def elementInScope(self, target, variant=None):
# If we pass a node in, we match that. If we pass a string, we
# match any node with that name.
exactNode = hasattr(target, "nameTuple")
if not exactNode:
if isinstance(target, text_type):
target = (namespaces["html"], target)
assert isinstance(target, tuple)
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
if exactNode and node == target:
return True
elif not exactNode and node.nameTuple == target:
return True
elif (invert ^ (node.nameTuple in listElements)):
return False
assert False # We should never reach this point
def reconstructActiveFormattingElements(self):
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
if not self.activeFormattingElements:
return
# Step 2 and step 3: we start with the last element. So i is -1.
i = len(self.activeFormattingElements) - 1
entry = self.activeFormattingElements[i]
if entry == Marker or entry in self.openElements:
return
# Step 6
while entry != Marker and entry not in self.openElements:
if i == 0:
# This will be reset to 0 below
i = -1
break
i -= 1
# Step 5: let entry be one earlier in the list.
entry = self.activeFormattingElements[i]
while True:
# Step 7
i += 1
# Step 8
entry = self.activeFormattingElements[i]
clone = entry.cloneNode() # Mainly to get a new copy of the attributes
# Step 9
element = self.insertElement({"type": "StartTag",
"name": clone.name,
"namespace": clone.namespace,
"data": clone.attributes})
# Step 10
self.activeFormattingElements[i] = element
# Step 11
if element == self.activeFormattingElements[-1]:
break
def clearActiveFormattingElements(self):
entry = self.activeFormattingElements.pop()
while self.activeFormattingElements and entry != Marker:
entry = self.activeFormattingElements.pop()
def elementInActiveFormattingElements(self, name):
"""Check if an element exists between the end of the active
formatting elements and the last marker. If it does, return it, else
return false"""
for item in self.activeFormattingElements[::-1]:
# Check for Marker first because if it's a Marker it doesn't have a
# name attribute.
if item == Marker:
break
elif item.name == name:
return item
return False
def insertRoot(self, token):
element = self.createElement(token)
self.openElements.append(element)
self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype)
def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(token["data"]))
def createElement(self, token):
"""Create an element but don't insert it anywhere"""
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element
def _getInsertFromTable(self):
return self._insertFromTable
def _setInsertFromTable(self, value):
"""Switch the function used to insert an element from the
normal one to the misnested table one and back again"""
self._insertFromTable = value
if value:
self.insertElement = self.insertElementTable
else:
self.insertElement = self.insertElementNormal
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, token):
name = token["name"]
assert isinstance(name, text_type), "Element %s not unicode" % name
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element)
self.openElements.append(element)
return element
def insertElementTable(self, token):
"""Create an element and insert it into the tree"""
element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(token)
else:
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
if insertBefore is None:
parent.appendChild(element)
else:
parent.insertBefore(element, insertBefore)
self.openElements.append(element)
return element
def insertText(self, data, parent=None):
"""Insert text data."""
if parent is None:
parent = self.openElements[-1]
if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name
not in tableInsertModeElements)):
parent.insertText(data)
else:
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore)
def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node"""
# The foster parent element is the one which comes before the most
# recently opened table element
# XXX - this is really inelegant
lastTable = None
fosterParent = None
insertBefore = None
for elm in self.openElements[::-1]:
if elm.name == "table":
lastTable = elm
break
if lastTable:
# XXX - we should really check that this parent is actually a
# node here
if lastTable.parent:
fosterParent = lastTable.parent
insertBefore = lastTable
else:
fosterParent = self.openElements[
self.openElements.index(lastTable) - 1]
else:
fosterParent = self.openElements[0]
return fosterParent, insertBefore
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):
"""Return the final tree"""
return self.document
def getFragment(self):
"""Return the final fragment"""
# assert self.innerHTML
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
return fragment
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests
:arg node: the node from which to start serializing
"""
raise NotImplementedError
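
ActiveFormattingElements.append above implements the specification's "Noah's Ark" clause: at most three matching elements may accumulate between markers. A sketch with a stand-in node class (FakeNode is illustrative, not part of html5lib):

    from html5lib.treebuilders.base import ActiveFormattingElements

    class FakeNode(object):
        def __init__(self, name):
            self.nameTuple = ("http://www.w3.org/1999/xhtml", name)
            self.attributes = {}

    afe = ActiveFormattingElements()
    for _ in range(4):
        afe.append(FakeNode("b"))
    print(len(afe))  # 3 -- the earliest duplicate was dropped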

View file

@ -0,0 +1,236 @@
from __future__ import absolute_import, division, unicode_literals
from collections import MutableMapping
from xml.dom import minidom, Node
import weakref
from . import base
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory
def getDomBuilder(DomImplementation):
Dom = DomImplementation
class AttrList(MutableMapping):
def __init__(self, element):
self.element = element
def __iter__(self):
return iter(self.element.attributes.keys())
def __setitem__(self, name, value):
if isinstance(name, tuple):
raise NotImplementedError
else:
attr = self.element.ownerDocument.createAttribute(name)
attr.value = value
self.element.attributes[name] = attr
def __len__(self):
return len(self.element.attributes)
def items(self):
return list(self.element.attributes.items())
def values(self):
return list(self.element.attributes.values())
def __getitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.attributes[name].value
def __delitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
del self.element.attributes[name]
class NodeBuilder(base.Node):
def __init__(self, element):
base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in list(attributes.items()):
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" + name[1])
else:
qualifiedName = name[1]
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
def getNameTuple(self):
if self.namespace is None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
else:
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
if parent != self:
base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
# pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
implementation = DomImplementation
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
(' ' * indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
else:
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI is not None):
name = "%s %s" % (constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>" % (' ' * indent, name))
if element.hasAttributes():
attributes = []
for i in range(len(element.attributes)):
attr = element.attributes.item(i)
name = attr.nodeName
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s" % (constants.prefixes[ns], attr.localName)
else:
name = attr.nodeName
attributes.append((name, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
return locals()
# The actual means to get a module!
getDomModule = moduleFactoryFactory(getDomBuilder)
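
A sketch of the dump format this builder's testSerializer produces for the parser tests (output shown approximately):

    import html5lib

    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    document = parser.parse("<!DOCTYPE html><p id=a>x</p>")
    print(parser.tree.testSerializer(document))
    # #document
    # |  <!DOCTYPE html>
    # |  <html>
    # ...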

View file

@ -0,0 +1,340 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
from six import text_type
import re
from . import base
from .. import _ihatexml
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(base.Node):
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
self._element = ElementTree.Element(self._getETreeTag(name,
namespace))
if namespace is None:
self.nameTuple = namespaces["html"], self._name
else:
self.nameTuple = self._namespace, self._name
self.parent = None
self._childNodes = []
self._flags = []
def _getETreeTag(self, name, namespace):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s" % (namespace, name)
return etree_tag
def _setName(self, name):
self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self):
return self._name
name = property(_getName, _setName)
def _setNamespace(self, namespace):
self._namespace = namespace
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getNamespace(self):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
# Delete existing attributes first
# XXX - there may be a better way to do this...
for key in list(self._element.attrib.keys()):
del self._element.attrib[key]
for key, value in attributes.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or len(self._element))
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = list(self._element).index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._childNodes.remove(node)
self._element.remove(node._element)
node.parent = None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
# Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
# Insert the text before the specified node
children = list(self._element)
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index - 1].tail:
self._element[index - 1].tail = ""
self._element[index - 1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = type(self)(self.name, self.namespace)
for name, value in self.attributes.items():
element.attributes[name] = value
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
if not newParent._element.text:
newParent._element.text = ""
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
# Use the superclass constructor to set all properties on the
# wrapper element
self._element = ElementTree.Comment(data)
self.parent = None
self._childNodes = []
self._flags = []
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self):
return self._element.get("publicId", "")
def _setPublicId(self, value):
if value is not None:
self._element.set("publicId", value)
publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self):
return self._element.get("systemId", "")
def _setSystemId(self, value):
if value is not None:
self._element.set("systemId", value)
systemId = property(_getSystemId, _setSystemId)
class Document(Element):
def __init__(self):
Element.__init__(self, "DOCUMENT_ROOT")
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, "DOCUMENT_FRAGMENT")
def testSerializer(element):
rv = []
def serializeElement(element, indent=0):
if not(hasattr(element, "tag")):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
(element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
elif element.tag == "DOCUMENT_ROOT":
rv.append("#document")
if element.text is not None:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
elif element.tag == ElementTreeCommentType:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
else:
assert isinstance(element.tag, text_type), \
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s" % (prefix, name)
rv.append("|%s<%s>" % (' ' * indent, name))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.items():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
attr_string = "%s %s" % (prefix, name)
else:
attr_string = name
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
return "\n".join(rv)
def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string"""
rv = []
filter = _ihatexml.InfosetFilter()
def serializeElement(element):
if isinstance(element, ElementTree.ElementTree):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
(element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
elif element.tag == "DOCUMENT_ROOT":
if element.text is not None:
rv.append(element.text)
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
for child in element:
serializeElement(child)
elif element.tag == ElementTreeCommentType:
rv.append("<!--%s-->" % (element.text,))
else:
# This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>" % (filter.fromXmlName(element.tag),))
else:
attr = " ".join(["%s=\"%s\"" % (
filter.fromXmlName(name), value)
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
for child in element:
serializeElement(child)
rv.append("</%s>" % (element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
return "".join(rv)
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
implementation = ElementTreeImplementation
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._element
else:
if self.defaultNamespace is not None:
return self.document._element.find(
"{%s}html" % self.defaultNamespace)
else:
return self.document._element.find("html")
def getFragment(self):
return base.TreeBuilder.getFragment(self)._element
return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)
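
A sketch of selecting an explicit ElementTree implementation instead of the default_etree fallback:

    import xml.etree.ElementTree as ET
    import html5lib

    builder = html5lib.getTreeBuilder("etree", implementation=ET)
    parser = html5lib.HTMLParser(tree=builder)
    html_root = parser.parse("<p>hi</p>")  # the <html> Element (fullTree=False)
    print(ET.tostring(html_root))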

View file

@ -0,0 +1,366 @@
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Doctypes with no name
When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings
import re
import sys
from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import _ihatexml
import lxml.etree as etree
fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")
comment_type = etree.Comment("asd").tag
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
rv = []
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
# Full tree case
rv.append("#document")
if element.docinfo.internalDTD:
if not (element.docinfo.public_id or
element.docinfo.system_url):
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
else:
dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
element.docinfo.root_name,
element.docinfo.public_id,
element.docinfo.system_url)
rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
next_element = element.getroot()
while next_element.getprevious() is not None:
next_element = next_element.getprevious()
while next_element is not None:
serializeElement(next_element, indent + 2)
next_element = next_element.getnext()
elif isinstance(element, str) or isinstance(element, bytes):
# Text in a fragment
assert isinstance(element, str) or sys.version_info[0] == 2
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
# Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent + 2)
elif element.tag == comment_type:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
else:
assert isinstance(element, etree._Element)
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>" % (' ' * indent, prefix,
infosetFilter.fromXmlName(tag)))
else:
rv.append("|%s<%s>" % (' ' * indent,
infosetFilter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.items():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
name = infosetFilter.fromXmlName(name)
prefix = constants.prefixes[ns]
attr_string = "%s %s" % (prefix, name)
else:
attr_string = infosetFilter.fromXmlName(name)
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif element.tag == comment_type:
rv.append("<!--%s-->" % (element.text,))
else:
# This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>" % (element.tag,))
else:
attr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
for child in element:
serializeElement(child)
rv.append("</%s>" % (element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
serializeElement(element)
return "".join(rv)
class TreeBuilder(base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
implementation = etree
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value=None):
if value is None:
value = {}
self._element = element
dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = infosetFilter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = infosetFilter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return infosetFilter.fromXmlName(self._name)
name = property(_getName, _setName)
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = infosetFilter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = infosetFilter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = infosetFilter.coerceComment(data)
self._element.text = data
def _getData(self):
return self._element.text
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment
base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(list(element))
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if not name:
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
self.doctype = None
else:
coercedName = self.infosetFilter.coerceElement(name)
if coercedName != name:
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(coercedName, publicId, systemId)
self.doctype = doctype
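    # Comments seen before the root element cannot be attached yet (lxml
    # builds the tree from the root down), so insertCommentInitial buffers
    # them and insertRoot later replays them via addprevious().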
def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
# Because of the way libxml2 works, it doesn't seem to be possible to
# alter information like the doctype after the tree has been parsed.
# Therefore we need to use the built-in parser to create our initial
# tree, after which we can add elements like normal
docStr = ""
if self.doctype:
assert self.doctype.name
docStr += "<!DOCTYPE %s" % self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += (' PUBLIC "%s" ' %
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
if self.doctype.systemId:
sysid = self.doctype.systemId
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
sysid = sysid.replace("'", 'U00027')
if sysid.find("'") >= 0:
docStr += '"%s"' % sysid
else:
docStr += "'%s'" % sysid
else:
docStr += "''"
docStr += ">"
if self.doctype.name != token["name"]:
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
root = etree.fromstring(docStr)
# Append the initial comments:
for comment_token in self.initial_comments:
comment = self.commentClass(comment_token["data"])
root.addprevious(comment._element)
# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Give the root element the right name
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s" % (namespace, name)
root.tag = etree_tag
# Add the root element to the internal child/open data structures
root_element = self.elementClass(name, namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
# Reset to the default insert comment function
self.insertComment = self.insertCommentMain

View file

@ -0,0 +1,154 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.
To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""
from __future__ import absolute_import, division, unicode_literals
from .. import constants
from .._utils import default_etree
__all__ = ["getTreeWalker", "pprint"]
treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support
:arg str treeType: the name of the tree type required (case-insensitive).
Supported values are:
* "dom": The xml.dom.minidom DOM implementation
* "etree": A generic walker for tree implementations exposing an
elementtree-like interface (known to work with ElementTree,
cElementTree and lxml.etree).
* "lxml": Optimized walker for lxml.etree
* "genshi": a Genshi stream
:arg implementation: A module implementing the tree type e.g.
xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
tree type only).
:arg kwargs: keyword arguments passed to the etree walker--for other
walkers, this has no effect
:returns: a TreeWalker class
"""
treeType = treeType.lower()
if treeType not in treeWalkerCache:
if treeType == "dom":
from . import dom
treeWalkerCache[treeType] = dom.TreeWalker
elif treeType == "genshi":
from . import genshi
treeWalkerCache[treeType] = genshi.TreeWalker
elif treeType == "lxml":
from . import etree_lxml
treeWalkerCache[treeType] = etree_lxml.TreeWalker
elif treeType == "etree":
from . import etree
if implementation is None:
implementation = default_etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)
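# A minimal usage sketch (illustrative only): parse a document, look up the
# walker class for the matching tree type, and iterate the token stream.
#
#     import html5lib
#     from html5lib.treewalkers import getTreeWalker
#
#     document = html5lib.parse("<p>Hello</p>", treebuilder="etree")
#     TreeWalkerClass = getTreeWalker("etree")
#     for token in TreeWalkerClass(document):
#         print(token["type"])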
def concatenateCharacterTokens(tokens):
pendingCharacters = []
for token in tokens:
type = token["type"]
if type in ("Characters", "SpaceCharacters"):
pendingCharacters.append(token["data"])
else:
if pendingCharacters:
yield {"type": "Characters", "data": "".join(pendingCharacters)}
pendingCharacters = []
yield token
if pendingCharacters:
yield {"type": "Characters", "data": "".join(pendingCharacters)}
def pprint(walker):
"""Pretty printer for tree walkers
    Takes a TreeWalker instance and returns a pretty-printed string of the
    tree it walks.
    :arg walker: a TreeWalker instance
    :returns: the pretty-printed tree as a string
"""
output = []
indent = 0
for token in concatenateCharacterTokens(walker):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
# tag name
if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
if token["namespace"] in constants.prefixes:
ns = constants.prefixes[token["namespace"]]
else:
ns = token["namespace"]
name = "%s %s" % (ns, token["name"])
else:
name = token["name"]
output.append("%s<%s>" % (" " * indent, name))
indent += 2
# attributes (sorted for consistent ordering)
attrs = token["data"]
for (namespace, localname), value in sorted(attrs.items()):
if namespace:
if namespace in constants.prefixes:
ns = constants.prefixes[namespace]
else:
ns = namespace
name = "%s %s" % (ns, localname)
else:
name = localname
output.append("%s%s=\"%s\"" % (" " * indent, name, value))
# self-closing
if type == "EmptyTag":
indent -= 2
elif type == "EndTag":
indent -= 2
elif type == "Comment":
output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
elif type == "Doctype":
if token["name"]:
if token["publicId"]:
output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
(" " * indent,
token["name"],
token["publicId"],
token["systemId"] if token["systemId"] else ""))
elif token["systemId"]:
output.append("""%s<!DOCTYPE %s "" "%s">""" %
(" " * indent,
token["name"],
token["systemId"]))
else:
output.append("%s<!DOCTYPE %s>" % (" " * indent,
token["name"]))
else:
output.append("%s<!DOCTYPE >" % (" " * indent,))
elif type == "Characters":
output.append("%s\"%s\"" % (" " * indent, token["data"]))
elif type == "SpaceCharacters":
assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
else:
raise ValueError("Unknown token type, %s" % type)
return "\n".join(output)

View file

@ -0,0 +1,252 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
spaceCharacters = "".join(spaceCharacters)
class TreeWalker(object):
"""Walks a tree yielding tokens
Tokens are dicts that all have a ``type`` field specifying the type of the
token.
"""
def __init__(self, tree):
"""Creates a TreeWalker
:arg tree: the tree to walk
"""
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
"""Generates an error token with the given message
:arg msg: the error message
:returns: SerializeError token
"""
return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False):
"""Generates an EmptyTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
        :arg hasChildren: whether or not to yield a SerializeError token
            because this tag shouldn't have children
:returns: EmptyTag token
"""
yield {"type": "EmptyTag", "name": name,
"namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
"""Generates a StartTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
:returns: StartTag token
"""
return {"type": "StartTag",
"name": name,
"namespace": namespace,
"data": attrs}
def endTag(self, namespace, name):
"""Generates an EndTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:returns: EndTag token
"""
return {"type": "EndTag",
"name": name,
"namespace": namespace}
def text(self, data):
"""Generates SpaceCharacters and Characters tokens
Depending on what's in the data, this generates one or more
``SpaceCharacters`` and ``Characters`` tokens.
For example:
>>> from html5lib.treewalkers.base import TreeWalker
>>> # Give it an empty tree just so it instantiates
>>> walker = TreeWalker([])
>>> list(walker.text(''))
[]
>>> list(walker.text(' '))
[{u'data': ' ', u'type': u'SpaceCharacters'}]
>>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
[{u'data': ' ', u'type': u'SpaceCharacters'},
{u'data': u'abc', u'type': u'Characters'},
{u'data': u' ', u'type': u'SpaceCharacters'}]
:arg data: the text data
:returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
"""
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
yield {"type": "SpaceCharacters", "data": left}
data = middle
middle = data.rstrip(spaceCharacters)
right = data[len(middle):]
if middle:
yield {"type": "Characters", "data": middle}
if right:
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
"""Generates a Comment token
:arg data: the comment
:returns: Comment token
"""
return {"type": "Comment", "data": data}
def doctype(self, name, publicId=None, systemId=None):
"""Generates a Doctype token
        :arg name: the doctype name
        :arg publicId: the doctype public identifier, if any
        :arg systemId: the doctype system identifier, if any
:returns: the Doctype token
"""
return {"type": "Doctype",
"name": name,
"publicId": publicId,
"systemId": systemId}
def entity(self, name):
"""Generates an Entity token
:arg name: the entity name
:returns: an Entity token
"""
return {"type": "Entity", "name": name}
def unknown(self, nodeType):
"""Handles unknown node types"""
return self.error("Unknown node type: " + nodeType)
class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node):
raise NotImplementedError
def getFirstChild(self, node):
raise NotImplementedError
def getNextSibling(self, node):
raise NotImplementedError
def getParentNode(self, node):
raise NotImplementedError
def __iter__(self):
currentNode = self.tree
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
if type == DOCTYPE:
yield self.doctype(*details)
elif type == TEXT:
for token in self.text(*details):
yield token
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
hasChildren = False
else:
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT:
hasChildren = True
else:
yield self.unknown(details[0])
if hasChildren:
firstChild = self.getFirstChild(currentNode)
else:
firstChild = None
if firstChild is not None:
currentNode = firstChild
else:
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
break
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
else:
currentNode = self.getParentNode(currentNode)
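# To support a new tree type, subclass NonRecursiveTreeWalker and supply the
# four navigation hooks; __iter__ above does the rest.  Hedged sketch for a
# hypothetical DOM-like tree whose nodes carry .kind, .name, .attrs plus
# .firstChild/.nextSibling/.parentNode pointers (none of these names come
# from html5lib itself):
#
#     class MyTreeWalker(NonRecursiveTreeWalker):
#         def getNodeDetails(self, node):
#             if node.kind == "document":
#                 return (DOCUMENT,)
#             if node.kind == "text":
#                 return TEXT, node.data
#             return (ELEMENT, None, node.name, node.attrs,
#                     node.firstChild is not None)
#
#         def getFirstChild(self, node):
#             return node.firstChild
#
#         def getNextSibling(self, node):
#             return node.nextSibling
#
#         def getParentNode(self, node):
#             return node.parentNode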

View file

@ -0,0 +1,43 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from . import base
class TreeWalker(base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
return base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
return base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
attrs = {}
for attr in list(node.attributes.keys()):
attr = node.getAttributeNode(attr)
if attr.namespaceURI:
attrs[(attr.namespaceURI, attr.localName)] = attr.value
else:
attrs[(None, attr.name)] = attr.value
return (base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
return base.COMMENT, node.nodeValue
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
return (base.DOCUMENT,)
else:
return base.UNKNOWN, node.nodeType
def getFirstChild(self, node):
return node.firstChild
def getNextSibling(self, node):
return node.nextSibling
def getParentNode(self, node):
return node.parentNode
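# Usage sketch: this walker expects an xml.dom.minidom tree, e.g. one built
# with html5lib's "dom" treebuilder:
#
#     import html5lib
#     from html5lib.treewalkers.dom import TreeWalker
#
#     doc = html5lib.parse("<p>hi</p>", treebuilder="dom")
#     tokens = list(TreeWalker(doc))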

View file

@ -0,0 +1,130 @@
from __future__ import absolute_import, division, unicode_literals
from collections import OrderedDict
import re
from six import string_types
from . import base
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
1. The current element
2. The index of the element relative to its parent
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, _, _, flag = node
if flag in ("text", "tail"):
return base.TEXT, getattr(elt, flag)
else:
node = elt
            if not hasattr(node, "tag"):
node = node.getroot()
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
return (base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif node.tag == ElementTreeCommentType:
return base.COMMENT, node.text
else:
assert isinstance(node.tag, string_types), type(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
attrs = OrderedDict()
for name, value in list(node.attrib.items()):
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
element, key, parents, flag = node, None, [], None
if flag in ("text", "tail"):
return None
else:
if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key + 1], key + 1, parents, None
else:
return None
def getParentNode(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
assert list(parents[-1]).count(parent) == 1
return parent, list(parents[-1]).index(parent), parents, None
return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)
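# getETreeModule memoizes one walker module per ElementTree implementation;
# e.g. (sketch, standard library):
#
#     import xml.etree.ElementTree as ElementTree
#     TreeWalker = getETreeModule(ElementTree).TreeWalker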

View file

@ -0,0 +1,213 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from lxml import etree
from ..treebuilders.etree import tag_regexp
from . import base
from .. import _ihatexml
def ensure_str(s):
if s is None:
return None
elif isinstance(s, text_type):
return s
else:
return s.decode("ascii", "strict")
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
try:
if et.docinfo.internalDTD:
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
except AttributeError:
pass
try:
node = et.getroot()
except AttributeError:
node = et
while node.getprevious() is not None:
node = node.getprevious()
while node is not None:
self.children.append(node)
node = node.getnext()
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
def getnext(self):
return None
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
self.text = self.tail = None
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = ensure_str(self.obj.text)
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = ensure_str(self.obj.tail)
else:
self.tail = None
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
if idx < len(siblings) - 1:
return siblings[idx + 1]
else:
return None
def __getitem__(self, key):
return self.obj[key]
def __bool__(self):
return bool(self.obj)
def getparent(self):
return None
def __str__(self):
return str(self.obj)
def __unicode__(self):
return str(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(base.NonRecursiveTreeWalker):
def __init__(self, tree):
# pylint:disable=redefined-variable-type
if isinstance(tree, list):
self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
else:
self.fragmentChildren = set()
tree = Root(tree)
base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = _ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
return (base.DOCUMENT,)
elif isinstance(node, Doctype):
return base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
return base.TEXT, ensure_str(node.obj)
elif node.tag == etree.Comment:
return base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else:
# This is assumed to be an ordinary element
match = tag_regexp.match(ensure_str(node.tag))
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = ensure_str(node.tag)
attrs = {}
for name, value in list(node.attrib.items()):
name = ensure_str(name)
value = ensure_str(value)
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), "Text nodes have no children"
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text")
else:
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
if len(node):
return node[0]
else:
return None
else: # tail
return node.getnext()
return (node, "tail") if node.tail else node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
return node
# else: fallback to "normal" processing
elif node in self.fragmentChildren:
return None
return node.getparent()

View file

@ -0,0 +1,69 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from . import base
from ..constants import voidElements, namespaces
class TreeWalker(base.TreeWalker):
def __iter__(self):
# Buffer the events so we can pass in the following one
previous = None
for event in self.tree:
if previous is not None:
for token in self.tokens(previous, event):
yield token
previous = event
# Don't forget the final event!
if previous is not None:
for token in self.tokens(previous, None):
yield token
def tokens(self, event, next):
kind, data, _ = event
if kind == START:
tag, attribs = data
name = tag.localname
namespace = tag.namespace
converted_attribs = {}
for k, v in attribs:
if isinstance(k, QName):
converted_attribs[(k.namespace, k.localname)] = v
else:
converted_attribs[(None, k)] = v
if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
not next or next[0] != END or
next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, converted_attribs)
elif kind == END:
name = data.localname
namespace = data.namespace
if namespace != namespaces["html"] or name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
yield self.comment(data)
elif kind == TEXT:
for token in self.text(data):
yield token
elif kind == DOCTYPE:
yield self.doctype(*data)
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
START_CDATA, END_CDATA, PI):
pass
else:
yield self.unknown(kind)
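# Usage sketch (assumes Genshi is installed; genshi.input.HTML is Genshi's
# own stream constructor, not part of html5lib):
#
#     from genshi.input import HTML
#
#     stream = HTML("<p>hello</p>")
#     tokens = list(TreeWalker(stream))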

View file

@ -0,0 +1,244 @@
#!/usr/bin/env python
"""usage: %prog [options] filename
Parse a document to a tree, with optional profiling
"""
import sys
import traceback
from optparse import OptionParser
from html5lib import html5parser
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
from html5lib import _utils
def parse():
optParser = getOptParser()
opts, args = optParser.parse_args()
encoding = "utf8"
try:
f = args[-1]
# Try opening from the internet
if f.startswith('http://'):
try:
import urllib.request
import urllib.parse
import urllib.error
import cgi
f = urllib.request.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
(mediaType, params) = cgi.parse_header(contentType)
encoding = params.get('charset')
except:
pass
elif f == '-':
f = sys.stdin
if sys.version_info[0] >= 3:
encoding = None
else:
try:
# Try opening from file system
f = open(f, "rb")
except IOError as e:
sys.stderr.write("Unable to open file: %s\n" % e)
sys.exit(1)
except IndexError:
sys.stderr.write("No filename provided. Use -h for help\n")
sys.exit(1)
treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
p = html5parser.HTMLParser(tree=treebuilder, debug=opts.log)
if opts.fragment:
parseMethod = p.parseFragment
else:
parseMethod = p.parse
if opts.profile:
import cProfile
import pstats
cProfile.runctx("run(parseMethod, f, encoding, scripting)", None,
{"run": run,
"parseMethod": parseMethod,
"f": f,
"encoding": encoding,
"scripting": opts.scripting},
"stats.prof")
# XXX - We should use a temp file here
stats = pstats.Stats('stats.prof')
stats.strip_dirs()
stats.sort_stats('time')
stats.print_stats()
elif opts.time:
import time
t0 = time.time()
document = run(parseMethod, f, encoding, opts.scripting)
t1 = time.time()
if document:
printOutput(p, document, opts)
t2 = time.time()
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
else:
sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
else:
document = run(parseMethod, f, encoding, opts.scripting)
if document:
printOutput(p, document, opts)
def run(parseMethod, f, encoding, scripting):
try:
document = parseMethod(f, override_encoding=encoding, scripting=scripting)
except:
document = None
traceback.print_exc()
return document
def printOutput(parser, document, opts):
if opts.encoding:
print("Encoding:", parser.tokenizer.stream.charEncoding)
for item in parser.log:
print(item)
if document is not None:
if opts.xml:
tb = opts.treebuilder.lower()
if tb == "dom":
document.writexml(sys.stdout, encoding="utf-8")
elif tb == "lxml":
import lxml.etree
sys.stdout.write(lxml.etree.tostring(document, encoding="unicode"))
elif tb == "etree":
sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode"))
elif opts.tree:
if not hasattr(document, '__getitem__'):
document = [document]
for fragment in document:
print(parser.tree.testSerializer(fragment))
elif opts.html:
kwargs = {}
for opt in serializer.HTMLSerializer.options:
try:
kwargs[opt] = getattr(opts, opt)
except:
pass
if not kwargs['quote_char']:
del kwargs['quote_char']
if opts.sanitize:
kwargs["sanitize"] = True
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
if sys.version_info[0] >= 3:
encoding = None
else:
encoding = "utf-8"
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
sys.stdout.write(text)
if not text.endswith('\n'):
sys.stdout.write('\n')
if opts.error:
errList = []
for pos, errorcode, datavars in parser.errors:
errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
def getOptParser():
parser = OptionParser(usage=__doc__)
parser.add_option("-p", "--profile", action="store_true", default=False,
dest="profile", help="Use the hotshot profiler to "
"produce a detailed log of the run")
parser.add_option("-t", "--time",
action="store_true", default=False, dest="time",
help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
parser.add_option("-b", "--treebuilder", action="store", type="string",
dest="treebuilder", default="etree")
parser.add_option("-e", "--error", action="store_true", default=False,
dest="error", help="Print a list of parse errors")
parser.add_option("-f", "--fragment", action="store_true", default=False,
dest="fragment", help="Parse as a fragment")
parser.add_option("-s", "--scripting", action="store_true", default=False,
dest="scripting", help="Handle noscript tags as if scripting was enabled")
parser.add_option("", "--tree", action="store_true", default=False,
dest="tree", help="Output as debug tree")
parser.add_option("-x", "--xml", action="store_true", default=False,
dest="xml", help="Output as xml")
parser.add_option("", "--no-html", action="store_false", default=True,
dest="html", help="Don't output html")
parser.add_option("-c", "--encoding", action="store_true", default=False,
dest="encoding", help="Print character encoding used")
parser.add_option("", "--inject-meta-charset", action="store_true",
default=False, dest="inject_meta_charset",
help="inject <meta charset>")
parser.add_option("", "--strip-whitespace", action="store_true",
default=False, dest="strip_whitespace",
help="strip whitespace")
parser.add_option("", "--omit-optional-tags", action="store_true",
default=False, dest="omit_optional_tags",
help="omit optional tags")
parser.add_option("", "--quote-attr-values", action="store_true",
default=False, dest="quote_attr_values",
help="quote attribute values")
parser.add_option("", "--use-best-quote-char", action="store_true",
default=False, dest="use_best_quote_char",
help="use best quote character")
parser.add_option("", "--quote-char", action="store",
default=None, dest="quote_char",
help="quote character")
parser.add_option("", "--no-minimize-boolean-attributes",
action="store_false", default=True,
dest="minimize_boolean_attributes",
help="minimize boolean attributes")
parser.add_option("", "--use-trailing-solidus", action="store_true",
default=False, dest="use_trailing_solidus",
help="use trailing solidus")
parser.add_option("", "--space-before-trailing-solidus",
action="store_true", default=False,
dest="space_before_trailing_solidus",
help="add space before trailing solidus")
parser.add_option("", "--escape-lt-in-attrs", action="store_true",
default=False, dest="escape_lt_in_attrs",
help="escape less than signs in attribute values")
parser.add_option("", "--escape-rcdata", action="store_true",
default=False, dest="escape_rcdata",
help="escape rcdata element values")
parser.add_option("", "--sanitize", action="store_true", default=False,
dest="sanitize", help="sanitize")
parser.add_option("-l", "--log", action="store_true", default=False,
dest="log", help="log state transitions")
return parser
if __name__ == "__main__":
parse()
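# Example invocations (sketch):
#   python parse.py --tree mydoc.html          # print the debug tree
#   python parse.py -b dom -x mydoc.html       # build a DOM tree, emit XML
#   python parse.py -p http://example.org/     # profile a fetch-and-parse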

View file

@ -0,0 +1,17 @@
[pytest]
# Output fails, errors, xpass, and warnings; ignore doctest; make warnings errors
addopts = -rfEXw -p no:doctest --strict
# Make xpass results be considered fail
xfail_strict = true
# Document our markers
markers =
DOM: mark a test as a DOM tree test
ElementTree: mark a test as a ElementTree tree test
cElementTree: mark a test as a cElementTree tree test
lxml: mark a test as a lxml tree test
genshi: mark a test as a genshi tree test
parser: mark a test as a parser test
namespaced: mark a test as a namespaced parser test
treewalker: mark a test as a treewalker test

View file

@ -0,0 +1,15 @@
#!/bin/bash -ex
if [[ $SIX_VERSION ]]; then
pip install six==$SIX_VERSION
fi
pip install -r requirements-test.txt
if [[ $USE_OPTIONAL == "true" ]]; then
pip install -r requirements-optional.txt
fi
if [[ $CI == "true" ]]; then
pip install codecov
fi

View file

@ -0,0 +1,17 @@
-r requirements.txt
# We support a Genshi treewalker that can be used to serialize Genshi
# streams.
genshi
# chardet can be used as a fallback in case we are unable to determine
# the encoding of a document.
chardet>=2.2
# lxml is supported with its own treebuilder ("lxml") and otherwise
# uses the standard ElementTree support
lxml ; platform_python_implementation == 'CPython'
# DATrie can be used in place of our Python trie implementation for
# slightly better parsing performance.
datrie ; platform_python_implementation == 'CPython'

View file

@ -0,0 +1,10 @@
-r requirements.txt
tox
flake8<3.0
pytest==3.2.5
coverage
pytest-expect>=1.1,<2.0
mock

View file

@ -0,0 +1,2 @@
six>=1.9
webencodings

View file

@ -0,0 +1,14 @@
[bdist_wheel]
universal = 1
[pep8]
ignore = N
max-line-length = 139
exclude = .git,__pycache__,.tox,doc
[flake8]
ignore = N
max-line-length = 139
[metadata]
license_file = LICENSE

View file

@ -0,0 +1,125 @@
from __future__ import print_function
import ast
import codecs
import sys
from os.path import join, dirname
from setuptools import setup, find_packages, __version__ as setuptools_version
from pkg_resources import parse_version
import pkg_resources
try:
import _markerlib.markers
except ImportError:
_markerlib = None
# _markerlib.default_environment() obtains its data from _VARS
# and wraps it in another dict, but _markerlib_evaluate writes
# to the dict while it is iterating the keys, causing an error
# on Python 3 only.
# Replace _markerlib.default_environment to return a custom dict
# that has all the necessary markers, and ignores any writes.
class Python3MarkerDict(dict):
def __setitem__(self, key, value):
pass
def pop(self, i=-1):
return self[i]
if _markerlib and sys.version_info[0] == 3:
env = _markerlib.markers._VARS
for key in list(env.keys()):
new_key = key.replace('.', '_')
if new_key != key:
env[new_key] = env[key]
_markerlib.markers._VARS = Python3MarkerDict(env)
def default_environment():
return _markerlib.markers._VARS
_markerlib.default_environment = default_environment
# Avoid the very buggy pkg_resources.parser, which doesn't consistently
# recognise the markers needed by this setup.py
# Change this to setuptools 20.10.0 to support all markers.
if pkg_resources:
if parse_version(setuptools_version) < parse_version('18.5'):
MarkerEvaluation = pkg_resources.MarkerEvaluation
del pkg_resources.parser
pkg_resources.evaluate_marker = MarkerEvaluation._markerlib_evaluate
MarkerEvaluation.evaluate_marker = MarkerEvaluation._markerlib_evaluate
classifiers = [
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Markup :: HTML'
]
here = dirname(__file__)
with codecs.open(join(here, 'README.rst'), 'r', 'utf8') as readme_file:
with codecs.open(join(here, 'CHANGES.rst'), 'r', 'utf8') as changes_file:
long_description = readme_file.read() + '\n' + changes_file.read()
version = None
with open(join(here, "html5lib", "__init__.py"), "rb") as init_file:
t = ast.parse(init_file.read(), filename="__init__.py", mode="exec")
assert isinstance(t, ast.Module)
assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)
for a in assignments:
if (len(a.targets) == 1 and
isinstance(a.targets[0], ast.Name) and
a.targets[0].id == "__version__" and
isinstance(a.value, ast.Str)):
version = a.value.s
setup(name='html5lib',
version=version,
url='https://github.com/html5lib/html5lib-python',
license="MIT License",
description='HTML parser based on the WHATWG HTML specification',
long_description=long_description,
classifiers=classifiers,
maintainer='James Graham',
maintainer_email='james@hoppipolla.co.uk',
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=[
'six>=1.9',
'webencodings',
],
extras_require={
# A conditional extra will only install these items when the extra is
# requested and the condition matches.
"datrie:platform_python_implementation == 'CPython'": ["datrie"],
"lxml:platform_python_implementation == 'CPython'": ["lxml"],
# Standard extras, will be installed when the extra is requested.
"genshi": ["genshi"],
"chardet": ["chardet>=2.2"],
# The all extra combines a standard extra which will be used anytime
# the all extra is requested, and it extends it with a conditional
# extra that will be installed whenever the condition matches and the
# all extra is requested.
"all": ["genshi", "chardet>=2.2"],
"all:platform_python_implementation == 'CPython'": ["datrie", "lxml"],
},
)
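# The extras declared above are selected at install time, e.g. (sketch):
#
#     pip install html5lib[all]
#
# pulls in genshi and chardet everywhere, plus datrie and lxml on CPython.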

View file

@ -0,0 +1,23 @@
[tox]
envlist = {py27,py33,py34,py35,py36,pypy}-{base,six19,optional}
[testenv]
deps =
optional: -r{toxinidir}/requirements-optional.txt
-r{toxinidir}/requirements-test.txt
doc: Sphinx
passenv =
PYTEST_COMMAND
COVERAGE_RUN_OPTIONS
commands =
six19: pip install six==1.9
{env:PYTEST_COMMAND:{envbindir}/py.test} {posargs}
flake8 {toxinidir}
[testenv:doc]
changedir = doc
commands = sphinx-build -b html . _build
[flake8]
exclude = ./.tox
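# Example: run a single environment locally with, e.g.:
#   tox -e py36-optional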

View file

@ -0,0 +1,100 @@
import json
import html5lib
def parse(path="html5ents.xml"):
return html5lib.parse(open(path), treebuilder="lxml")
def entity_table(tree):
return dict((entity_name("".join(tr[0].xpath(".//text()"))),
entity_characters(tr[1].text))
for tr in tree.xpath("//h:tbody/h:tr",
namespaces={"h": "http://www.w3.org/1999/xhtml"}))
def entity_name(inp):
return inp.strip()
def entity_characters(inp):
return "".join(codepoint_to_character(item)
for item in inp.split()
if item)
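# NB: the str.decode call below makes this helper Python 2-only; a Python 3
# equivalent (an assumption, not in this script) would be
# chr(int(inp[2:], 16)).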
def codepoint_to_character(inp):
return ("\\U000" + inp[2:]).decode("unicode-escape")
def make_tests_json(entities):
test_list = make_test_list(entities)
tests_json = {"tests":
[make_test(*item) for item in test_list]
}
return tests_json
def make_test(name, characters, good):
return {
"description": test_description(name, good),
"input": "&%s" % name,
"output": test_expected(name, characters, good)
}
def test_description(name, good):
with_semicolon = name.endswith(";")
semicolon_text = {True: "with a semi-colon",
False: "without a semi-colon"}[with_semicolon]
if good:
text = "Named entity: %s %s" % (name, semicolon_text)
else:
text = "Bad named entity: %s %s" % (name, semicolon_text)
return text
def test_expected(name, characters, good):
rv = []
if not good or not name.endswith(";"):
rv.append("ParseError")
rv.append(["Character", characters])
return rv
def make_test_list(entities):
tests = []
for entity_name, characters in entities.items():
if entity_name.endswith(";") and not subentity_exists(entity_name, entities):
tests.append((entity_name[:-1], "&" + entity_name[:-1], False))
tests.append((entity_name, characters, True))
return sorted(tests)
def subentity_exists(entity_name, entities):
for i in range(1, len(entity_name)):
if entity_name[:-i] in entities:
return True
return False
def make_entities_code(entities):
entities_text = "\n".join(" \"%s\": u\"%s\"," % (
name, entities[name].encode(
"unicode-escape").replace("\"", "\\\""))
for name in sorted(entities.keys()))
return """entities = {
%s
}""" % entities_text
def main():
entities = entity_table(parse())
tests_json = make_tests_json(entities)
json.dump(tests_json, open("namedEntities.test", "w"), indent=4)
code = make_entities_code(entities)
open("entities_constants.py", "w").write(code)
if __name__ == "__main__":
main()