Mirror of https://github.com/simon987/od-database.git, synced 2025-04-19 18:36:44 +00:00

Commit 9d0a0a8b42 (parent adb94cf326)
Should fix memory usage problem when crawling (part two)
@@ -1,7 +1,5 @@
 import os
-import logging
 import ujson
-import logging
 from urllib.parse import urlparse
 from timeout_decorator.timeout_decorator import TimeoutError
 from threading import Thread
@@ -93,14 +91,14 @@ class RemoteDirectoryCrawler:
     def __init__(self, url, max_threads: int):
         self.url = url
         self.max_threads = max_threads
-        self.crawled_paths = set()
+        self.crawled_paths = list()

     def crawl_directory(self, out_file: str) -> CrawlResult:

         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
             root_listing = directory.list_dir("")
-            self.crawled_paths.add("")
+            self.crawled_paths.append("")
             directory.close()
         except TimeoutError:
             return CrawlResult(0, "timeout")
@@ -109,7 +107,7 @@ class RemoteDirectoryCrawler:
         files_q = Queue(maxsize=0)
         for f in root_listing:
             if f.is_dir:
-                in_q.put(f)
+                in_q.put(os.path.join(f.path, f.name, ""))
             else:
                 files_q.put(f)

@@ -143,41 +141,41 @@ class RemoteDirectoryCrawler:
         timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES

         while directory:

             try:
-                file = in_q.get(timeout=60)
+                path = in_q.get(timeout=60)
             except Empty:
+                directory.close()
                 break

-            if file is None:
+            if path is None:
                 break

             try:
-                path = os.path.join(file.path, file.name, "")
                 if path not in self.crawled_paths:
-                    self.crawled_paths.add(path)
+                    self.crawled_paths.append(path)
                     listing = directory.list_dir(path)
                     timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES

                     for f in listing:
                         if f.is_dir:
-                            in_q.put(f)
+                            in_q.put(os.path.join(f.path, f.name, ""))
                         else:
                             files_q.put(f)
+                    print("LISTED " + repr(path) + "dirs:" + str(in_q.qsize()))
             except TooManyConnectionsError:
                 print("Too many connections")
                 # Kill worker and resubmit listing task
                 directory.close()
-                in_q.put(file)
+                in_q.put(path)
                 break
             except TimeoutError:
                 if timeout_retries > 0:
                     timeout_retries -= 1
                     # TODO: Remove debug info
                     print("TIMEOUT, " + str(timeout_retries) + " retries left")
-                    in_q.put(file)
+                    in_q.put(path)
                 else:
-                    print("Dropping listing for " + os.path.join(file.path, file.name, ""))
+                    print("Dropping listing for " + path)
             finally:
                 in_q.task_done()

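Note: the crawler hunks above switch the directory queue from full File objects to plain path strings, so each queued entry is a single string rather than an object with several attributes, and workers now hand the string straight back on retry. Below is a minimal, self-contained sketch of that pattern; the File stand-in and the sample entries are illustrative, not taken from the repo.

import os
from queue import Queue, Empty


class File:
    # Stand-in for the crawler's File record; an instance like this is heavier
    # than the single string the queue now carries.
    def __init__(self, path, name, is_dir):
        self.path = path
        self.name = name
        self.is_dir = is_dir


in_q = Queue(maxsize=0)

# As in the diff: enqueue only the joined path string for directories and let
# the worker re-derive anything else it needs from that string.
for entry in (File("movies", "1080p", True), File("music", "flac", True)):
    if entry.is_dir:
        in_q.put(os.path.join(entry.path, entry.name, ""))

while True:
    try:
        path = in_q.get(timeout=1)
    except Empty:
        break
    print("would list: " + repr(path))
    in_q.task_done()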
@@ -190,7 +188,7 @@ class RemoteDirectoryCrawler:
         while True:

             try:
-                file = files_q.get(timeout=30)
+                file = files_q.get(timeout=240)
             except Empty:
                 break

@@ -202,6 +200,7 @@ class RemoteDirectoryCrawler:
            files_q.task_done()

        files_written.append(counter)
+       print("File writer done")


@@ -11,13 +11,6 @@ import config
 from dateutil.parser import parse as parse_date


-class Link:
-
-    def __init__(self, text: str, url: str):
-        self.text = text
-        self.url = url
-
-
 class HttpDirectory(RemoteDirectory):

     SCHEMES = ("http", "https",)
@@ -37,6 +30,8 @@ class HttpDirectory(RemoteDirectory):
     def __init__(self, url):
         super().__init__(url)
         self.parser = etree.HTMLParser(collect_ids=False, encoding='utf-8')
+        self.session = requests.Session()
+        self.session.headers = HttpDirectory.HEADERS

     def list_dir(self, path) -> list:
         results = []
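Note: the constructor now creates one requests.Session per HttpDirectory and sets the headers once, so the GET/HEAD calls further down reuse pooled connections instead of opening a new one per request, and the close() method added near the end of the diff releases them. A small sketch of that usage, with an illustrative User-Agent rather than the project's actual HEADERS:

import requests
from requests.exceptions import RequestException

session = requests.Session()
session.headers = {"User-Agent": "od-database-sketch"}  # illustrative value, not the repo's

try:
    # Repeated calls on the same session reuse the underlying connection pool.
    r = session.get("https://example.com/", timeout=10)
    print(r.status_code, len(r.content))
except RequestException:
    print("request failed")
finally:
    session.close()  # mirrors the close() method the diff adds to HttpDirectory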
@@ -50,20 +45,19 @@ class HttpDirectory(RemoteDirectory):
         urls_to_request = []

         for link in links:
-            if self._should_ignore(link):
+            if self._should_ignore(self.base_url, link):
                 continue

-            file_url = urljoin(path_url, link.url)
+            file_url = urljoin(path_url, link[1])
             path, file_name = os.path.split(file_url[len(self.base_url) - 1:])

             if self._isdir(link):
                 results.append(File(
                     name=file_name,
-                    mtime=0,
-                    size=-1,
-                    is_dir=True,
-                    path=path
+                    mtime=None,
+                    size=None,
+                    path=path,
+                    is_dir=True
                 ))
             else:
                 urls_to_request.append(file_url)
@@ -74,18 +68,15 @@ class HttpDirectory(RemoteDirectory):

     def request_files(self, urls_to_request: list) -> list:

-        results = []
-
-        if len(urls_to_request) > 4:
+        if len(urls_to_request) > 30:
             # Many urls, use multi-threaded solution
             pool = ThreadPool(processes=10)
             files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
             pool.close()
-            for file in files:
-                if file:
-                    results.append(file)
+            return [f for f in files if f]
         else:
             # Too few urls to create thread pool
+            results = []
             for url in urls_to_request:
                 file = self._request_file(url)
                 if file:
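Note: request_files keeps fanning _request_file out with pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request)), meaning the unbound method is called with the instance passed explicitly for every URL; only the threshold (now 30 URLs) and the None-filtering change here. A small self-contained sketch of that starmap pattern, using made-up names:

from itertools import repeat
from multiprocessing.pool import ThreadPool


class Fetcher:
    def __init__(self, prefix):
        self.prefix = prefix

    def fetch(self, url):
        # Stand-in for a network request; returns None to simulate a failure.
        if url == "bad":
            return None
        return self.prefix + url


fetcher = Fetcher("GET ")
urls = ["a", "bad", "b", "c"]

# zip(repeat(fetcher), urls) yields (fetcher, "a"), (fetcher, "bad"), ... so
# starmap calls Fetcher.fetch(fetcher, url) on worker threads.
pool = ThreadPool(processes=4)
results = pool.starmap(Fetcher.fetch, zip(repeat(fetcher), urls))
pool.close()

# Drop failed requests, like the diff's "[f for f in files if f]".
print([r for r in results if r])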
@@ -96,22 +87,21 @@ class HttpDirectory(RemoteDirectory):
     def _get_url(self, path: str):
         return urljoin(self.base_url, path)

-    @staticmethod
-    def _fetch_body(url: str):
+    def _fetch_body(self, url: str):

         retries = HttpDirectory.MAX_RETRIES
         while retries > 0:
             try:
-                r = requests.get(url, headers=HttpDirectory.HEADERS)
-                return r.text
+                r = self.session.get(url)
+                return r.content
             except RequestException:
                 retries -= 1

         return None

-    def _parse_links(self, body: bytes) -> set:
+    def _parse_links(self, body: bytes) -> list:

-        result = set()
+        result = list()
         tree = etree.HTML(body, parser=self.parser)
         links = []
         try:
@@ -120,25 +110,25 @@ class HttpDirectory(RemoteDirectory):
             pass

         for link in links:
-            result.add(Link(link.text, link.get("href")))
+            result.append((link.text, link.get("href")))

         return result

     @staticmethod
-    def _isdir(link: Link):
-        return link.url.rsplit("?", maxsplit=1)[0].endswith("/")
+    def _isdir(link: tuple):
+        return link[1].rsplit("?", maxsplit=1)[0].endswith("/")

     def _request_file(self, url):

         retries = HttpDirectory.MAX_RETRIES
         while retries > 0:
             try:
-                r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)
+                r = self.session.head(url, allow_redirects=False, timeout=50)

                 stripped_url = url[len(self.base_url) - 1:]

                 path, name = os.path.split(stripped_url)
-                date = r.headers["Date"] if "Date" in r.headers else "1970-01-01"
+                date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
                 return File(
                     path=unquote(path).strip("/"),
                     name=unquote(name),
@@ -152,6 +142,14 @@ class HttpDirectory(RemoteDirectory):
         return None

     @staticmethod
-    def _should_ignore(link: Link):
-        return link.text == "../" or link.url.endswith(HttpDirectory.BLACK_LIST)
+    def _should_ignore(base_url, link: tuple):
+        if link[0] == "../" or link[1].endswith(HttpDirectory.BLACK_LIST):
+            return True
+
+        # Ignore external links
+        if link[1].startswith("http") and not link[1].startswith(base_url):
+            return True
+
+    def close(self):
+        self.session.close()

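Note: _parse_links now collects plain (text, href) tuples in a list instead of Link instances in a set, and _isdir/_should_ignore index into the tuple. One quick way to see why the tuple is lighter (standard library only; exact byte counts vary by Python build):

import sys


class Link:
    # The helper class the diff removes; every instance also carries a __dict__.
    def __init__(self, text, url):
        self.text = text
        self.url = url


link_obj = Link("backups/", "backups/")
link_tuple = ("backups/", "backups/")

# A plain tuple is a single small object, while the instance needs the object
# header plus its attribute dictionary.
print("instance + __dict__:", sys.getsizeof(link_obj) + sys.getsizeof(link_obj.__dict__))
print("tuple:              ", sys.getsizeof(link_tuple))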
@@ -77,6 +77,7 @@ class TaskManager:
             task_result, db_path, current_tasks = result.result()
         except Exception as e:
             print("Exception during task " + str(e))
+            return

         print(task_result.status_code)
         print(task_result.file_count)
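Note: in the TaskManager hunk, if result.result() raises, task_result is never bound, so falling through to the print calls would raise a NameError on top of the original failure; the added return bails out of the callback instead. A minimal illustration with made-up names:

def on_task_complete(get_result):
    try:
        task_result = get_result()  # stand-in for result.result()
    except Exception as e:
        print("Exception during task " + str(e))
        return  # without this, the line below would hit an unbound task_result

    print(task_result)


def failing():
    raise RuntimeError("crawler died")


on_task_complete(failing)        # prints the error and stops
on_task_complete(lambda: "ok")   # prints "ok"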