Update web-platform-tests to revision 0d318188757a9c996e20b82db201fd04de5aa255
parent: b2a5225831
commit: 1a81b18b9f
12321 changed files with 544385 additions and 6 deletions
tests/wpt/web-platform-tests/tools/html5lib/utils/spider.py (new file, 122 lines)
@@ -0,0 +1,122 @@
#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and html5lib.

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import hashlib
import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser

import httplib2

import html5lib
from html5lib import treebuilders


class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        # Parse with the etree tree builder; namespaceHTMLElements=False keeps
        # plain tag names so the .//a lookup in updateURLs finds the links.
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree"),
                                namespaceHTMLElements=False)
        try:
            tree = p.parse(content)
        except Exception:
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            # Identical content has already been seen; skip reparsing it.
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Collect links we have not already visited
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and
                        url not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                # <a> element without an href attribute
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urllib.parse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
            newUrls.add(urllib.parse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # Don't know why this happens
                pass

        # Remove links not of content-type html or pages not found
        # XXX - need to deal with other status codes?
        toVisit = set(url for url in urls if url in responseHeaders and
                      "html" in responseHeaders[url].get('content-type', '') and
                      responseHeaders[url]['status'] == "200")

        # Now check we are allowed to spider the page; iterate over a copy so
        # the set can be modified while looping.
        for url in list(toVisit):
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urllib.parse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                toVisit.remove(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
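For context, a minimal usage sketch of the spider added in this file, assuming httplib2 and html5lib are installed; the seed URL and the maxURLs limit are illustrative values, not part of the commit:

    # hypothetical driver script placed next to spider.py
    import spider

    s = spider.Spider()
    s.run("http://example.com/", maxURLs=10)

    print("visited:", len(s.visitedURLs))
    print("buggy:", sorted(s.buggyURLs))

Pages whose parse raises an exception end up in s.buggyURLs, which is the set the utility exists to populate.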