#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and html5lib.

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import hashlib
import urllib.request, urllib.error, urllib.parse
import urllib.robotparser

import httplib2

import html5lib
from html5lib.treebuilders import getTreeBuilder


class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        # Cache responses on disk so repeated runs don't re-fetch every page
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        # Keep crawling until there is nothing left to visit or maxURLs is reached
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        # namespaceHTMLElements=False keeps tag names unprefixed so the
        # ".//a" lookups in updateURLs work on the resulting etree
        p = html5lib.HTMLParser(tree=getTreeBuilder("etree"),
                                namespaceHTMLElements=False)
        try:
            tree = p.parse(content)
        except Exception:
            # Any exception escaping the parser is the kind of bug we are hunting
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        # Skip pages whose content we have already seen under another URL
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Collect the links we have not already visited or queued
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and url
                        not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                # Anchor without an href attribute
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urllib.parse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                # Relative link: borrow the host from the page we are currently on
                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
            newUrls.add(urllib.parse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # Don't know why this happens
                pass

        # Remove links not of content-type html or pages not found
        # XXX - need to deal with other status codes?
        toVisit = set([url for url in urls if url in responseHeaders and
                       "html" in responseHeaders[url].get('content-type', '') and
                       responseHeaders[url]['status'] == "200"])

        # Now check we are allowed to spider the page
        disallowed = set()
        for url in toVisit:
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urllib.parse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            # can_fetch() only gives a meaningful answer once robots.txt has been read
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                disallowed.add(url)
        # Discard disallowed URLs after the loop; removing them while iterating
        # over the set would raise a RuntimeError
        toVisit -= disallowed

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
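

if __name__ == "__main__":
    # Minimal usage sketch mirroring the module docstring; the seed URL and the
    # 100-page limit are only examples, not values required by the class.
    s = Spider()
    s.run("http://www.google.com", maxURLs=100)
    print("Buggy URLs found:")
    for url in sorted(s.buggyURLs):
        print(" ", url)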