Crawler performance improvements

Author: Simon
Date: 2018-07-25 11:27:50 -04:00
Parent: fbbe952e4d
Commit: 34d1f375a8

2 changed files with 8 additions and 7 deletions

File 1 of 2

@@ -4,6 +4,7 @@ from urllib.parse import urlparse, urljoin
 from threading import Thread
 from queue import Queue, Empty
 from crawl_server import logger
+from pybloom_live import ScalableBloomFilter


 class TooManyConnectionsError(Exception):
@@ -21,11 +22,11 @@ class File:
         self.is_dir = is_dir

     def __bytes__(self):
-        return b"|".join([
+        return b"".join([
             self.name.encode(),
             b"D" if self.is_dir else b"F",
-            str(self.size).encode(),
-            str(self.mtime).encode(),
+            self.size.to_bytes(6, byteorder="little"),
+            self.mtime.to_bytes(6, byteorder="little"),
         ])

     def to_json(self):
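The old __bytes__ packed size and mtime as decimal strings joined with "|"; the new version writes them as fixed-width 6-byte little-endian integers, which is smaller and needs no separator for the numeric fields. A minimal standalone sketch of the same packing, with made-up values, assuming size and mtime are non-negative ints below 2**48:

# Minimal sketch (values are hypothetical, not from the project);
# int.to_bytes raises OverflowError if the value is negative or does not fit in 6 bytes.
name, is_dir = "report.pdf", False
size, mtime = 1_048_576, 1532532470

record = b"".join([
    name.encode(),
    b"D" if is_dir else b"F",
    size.to_bytes(6, byteorder="little"),   # always 6 bytes instead of a variable-length digit string
    mtime.to_bytes(6, byteorder="little"),  # always 6 bytes instead of a 10-digit string
])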
@@ -81,7 +82,7 @@ class RemoteDirectoryCrawler:
     def __init__(self, url, max_threads: int):
         self.url = url
         self.max_threads = max_threads
-        self.crawled_paths = list()
+        self.crawled_paths = ScalableBloomFilter(error_rate=0.0001)
         self.status_code = "success"

     def crawl_directory(self, out_file: str) -> CrawlResult:
@@ -91,7 +92,7 @@ class RemoteDirectoryCrawler:
         logger.info("Crawling directory " + self.url + " with " + str(type(directory)))
         path_id, root_listing = directory.list_dir(urlparse(self.url).path)
         if root_listing:
-            self.crawled_paths.append(path_id)
+            self.crawled_paths.add(path_id)
         else:
             logger.info("No files in root listing for " + self.url)
             return CrawlResult(0, "empty")
@@ -152,7 +153,7 @@ class RemoteDirectoryCrawler:
             try:
                 path_id, listing = directory.list_dir(path)
                 if len(listing) > 0 and path_id not in self.crawled_paths:
-                    self.crawled_paths.append(path_id)
+                    self.crawled_paths.add(path_id)

                     for f in listing:
                         if f.is_dir:
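Together, the hunks above replace the crawled_paths list with a ScalableBloomFilter from pybloom_live, so membership checks stay fast and memory stays bounded as the number of crawled directories grows, at the cost of a small false-positive rate (a false positive means a directory is wrongly treated as already crawled and skipped). A minimal sketch of the same check-then-add pattern, assuming pybloom_live is installed; the helper name is hypothetical:

from pybloom_live import ScalableBloomFilter

# error_rate bounds the false-positive probability; the filter grows as items are added.
crawled_paths = ScalableBloomFilter(error_rate=0.0001)

def mark_if_new(path_id):
    # Hypothetical helper mirroring the pattern in the diff above.
    if path_id in crawled_paths:
        return False              # probably seen before (rarely a false positive)
    crawled_paths.add(path_id)
    return True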

File 2 of 2

@@ -102,7 +102,7 @@ class HttpDirectory(RemoteDirectory):

     def list_dir(self, path):
         current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
-        path_identifier = hashlib.sha1(current_dir_name.encode())
+        path_identifier = hashlib.md5(current_dir_name.encode())
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         anchors = self._parse_links(body)