mirror of https://github.com/simon987/od-database.git
synced 2025-04-24 12:45:51 +00:00
Crawler performance improvements
This commit is contained in:
parent fbbe952e4d
commit 34d1f375a8
@@ -4,6 +4,7 @@ from urllib.parse import urlparse, urljoin
 from threading import Thread
 from queue import Queue, Empty
 from crawl_server import logger
+from pybloom_live import ScalableBloomFilter
 
 
 class TooManyConnectionsError(Exception):
@@ -21,11 +22,11 @@ class File:
         self.is_dir = is_dir
 
     def __bytes__(self):
-        return b"|".join([
+        return b"".join([
             self.name.encode(),
             b"D" if self.is_dir else b"F",
-            str(self.size).encode(),
-            str(self.mtime).encode(),
+            self.size.to_bytes(6, byteorder="little"),
+            self.mtime.to_bytes(6, byteorder="little"),
         ])
 
     def to_json(self):
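The __bytes__ change above trades pipe-separated decimal strings for fixed-width 6-byte little-endian integers, shrinking each file record. A rough illustration (not part of the commit; the name, size, and mtime values are made up):

    # Hypothetical example values for one File record.
    name = "backup.tar.gz"
    size = 1048576        # ~1 MiB
    mtime = 1529000000    # Unix timestamp

    # Old layout: pipe-separated decimal strings.
    old_record = b"|".join([name.encode(), b"F", str(size).encode(), str(mtime).encode()])

    # New layout: two fixed 6-byte little-endian integers, no separators.
    new_record = b"".join([name.encode(), b"F",
                           size.to_bytes(6, byteorder="little"),
                           mtime.to_bytes(6, byteorder="little")])

    print(len(old_record))  # 34
    print(len(new_record))  # 26

Six bytes cover values up to 2**48 - 1; anything larger would raise OverflowError, which is presumably acceptable for file sizes and modification times.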
@@ -81,7 +82,7 @@ class RemoteDirectoryCrawler:
     def __init__(self, url, max_threads: int):
         self.url = url
         self.max_threads = max_threads
-        self.crawled_paths = list()
+        self.crawled_paths = ScalableBloomFilter(error_rate=0.0001)
         self.status_code = "success"
 
     def crawl_directory(self, out_file: str) -> CrawlResult:
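Replacing the crawled_paths list with a ScalableBloomFilter keeps memory use roughly constant and makes the membership checks used later in the diff (path_id not in self.crawled_paths) cheap, at the cost of a small false-positive rate: a directory may very occasionally be skipped as already crawled, whereas a list never mis-reports but grows without bound and makes each "in" check an O(n) scan. A minimal usage sketch, assuming pybloom_live is installed and using made-up path ids:

    from pybloom_live import ScalableBloomFilter

    crawled_paths = ScalableBloomFilter(error_rate=0.0001)

    crawled_paths.add("a1b2c3")          # record a hypothetical path_id
    print("a1b2c3" in crawled_paths)     # True
    print("d4e5f6" in crawled_paths)     # False (barring a ~0.01% false positive)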
@@ -91,7 +92,7 @@ class RemoteDirectoryCrawler:
         logger.info("Crawling directory " + self.url + " with " + str(type(directory)))
         path_id, root_listing = directory.list_dir(urlparse(self.url).path)
         if root_listing:
-            self.crawled_paths.append(path_id)
+            self.crawled_paths.add(path_id)
         else:
             logger.info("No files in root listing for " + self.url)
             return CrawlResult(0, "empty")
@@ -152,7 +153,7 @@ class RemoteDirectoryCrawler:
             try:
                 path_id, listing = directory.list_dir(path)
                 if len(listing) > 0 and path_id not in self.crawled_paths:
-                    self.crawled_paths.append(path_id)
+                    self.crawled_paths.add(path_id)
 
                     for f in listing:
                         if f.is_dir:
@@ -102,7 +102,7 @@ class HttpDirectory(RemoteDirectory):
     def list_dir(self, path):
 
         current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
-        path_identifier = hashlib.sha1(current_dir_name.encode())
+        path_identifier = hashlib.md5(current_dir_name.encode())
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         anchors = self._parse_links(body)
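Switching the path identifier from SHA-1 to MD5 yields a shorter (16-byte) digest and slightly faster hashing; since the digest only serves as a deduplication key for crawled directories rather than a security measure, collision resistance presumably does not matter here. A small sketch with a made-up directory name:

    import hashlib

    current_dir_name = "linux-isos"  # hypothetical directory name

    old_id = hashlib.sha1(current_dir_name.encode())  # 20-byte digest
    new_id = hashlib.md5(current_dir_name.encode())   # 16-byte digest

    print(old_id.hexdigest())
    print(new_id.hexdigest())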