Support for more than just UTF-8 and removed some debug info

Simon 2018-06-18 13:44:19 -04:00
parent 7c47b0f00c
commit 8a73142ff8
4 changed files with 38 additions and 39 deletions
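
In short: the crawler previously decoded every streamed response as UTF-8 when parsing directory listings; after this change the chunks are decoded with the encoding reported by the server, and library warnings are silenced around the HTTP calls. A minimal standalone sketch of the idea, using requests directly rather than the project's HttpDirectory class (the fallback to UTF-8 when no encoding is reported is an assumption added here, not code from this commit):

    import requests

    def stream_text(url, chunk_size=4096):
        # Yield decoded text chunks using the encoding the server reports
        # (requests derives r.encoding from the Content-Type header) instead
        # of assuming UTF-8. Undecodable bytes are dropped, as in the diff.
        r = requests.get(url, stream=True, timeout=40)
        encoding = r.encoding or "utf-8"  # assumption: fall back when nothing is reported
        for chunk in r.iter_content(chunk_size=chunk_size):
            yield chunk.decode(encoding, errors="ignore")
        r.close()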

View File

@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:
     def crawl_directory(self, out_file: str) -> CrawlResult:
-        import gc
-        gc.set_debug(gc.DEBUG_LEAK)
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
             root_listing = directory.list_dir("")
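
For context, gc.set_debug(gc.DEBUG_LEAK) is leak-hunting instrumentation: it makes the collector keep every unreachable object it finds in gc.garbage and report information about them, which costs memory and produces noise, so dropping it from the crawler is reasonable. A small standalone sketch of how that flag is normally used:

    import gc

    gc.set_debug(gc.DEBUG_LEAK)  # save unreachable objects in gc.garbage and report them

    # ... run the code suspected of leaking ...

    gc.collect()
    print(len(gc.garbage), "objects kept for inspection")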

View File

@@ -1,4 +1,5 @@
 from urllib.parse import unquote, urljoin
+import warnings
 import os
 from html.parser import HTMLParser
 from itertools import repeat
@@ -118,44 +119,47 @@ class HttpDirectory(RemoteDirectory):
     def _request_file(self, url):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.head(url, allow_redirects=False, timeout=40)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.head(url, allow_redirects=False, timeout=40)
                     stripped_url = url[len(self.base_url) - 1:]
                     path, name = os.path.split(stripped_url)
                     date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
                     return File(
                         path=unquote(path).strip("/"),
                         name=unquote(name),
                         size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
                         mtime=int(parse_date(date).timestamp()),
                         is_dir=False
                     )
                 except RequestException:
                     self.session.close()
                     retries -= 1
             return None

     def _stream_body(self, url: str):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.get(url, stream=True, timeout=40)
-                for chunk in r.iter_content(chunk_size=4096):
-                    yield chunk
-                r.close()
-                del r
-                break
-            except RequestException:
-                self.session.close()
-                retries -= 1
-        return None
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.get(url, stream=True, timeout=40)
+                    for chunk in r.iter_content(chunk_size=4096):
+                        yield chunk.decode(r.encoding, errors="ignore")
+                    r.close()
+                    del r
+                    break
+                except RequestException:
+                    self.session.close()
+                    retries -= 1
+            return None

     @staticmethod
     def _parse_links(body):
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
         parser = HTMLAnchorParser()
         for chunk in body:
-            parser.feed(chunk.decode("utf-8", errors="ignore"))
+            parser.feed(chunk)
         for anchor in parser.anchors:
             yield anchor
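
The other pattern introduced in this file is the warnings.catch_warnings() block wrapped around both request loops: it suppresses library warnings (for example those emitted by requests/urllib3) only for the duration of those calls rather than process-wide. A minimal sketch of the pattern, where fetch_head is an illustrative helper and not code from this repository:

    import warnings
    import requests

    def fetch_head(session: requests.Session, url: str):
        # Silence warnings only while this request runs; the previous warning
        # filters are restored automatically when the with-block exits.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return session.head(url, allow_redirects=False, timeout=40)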

View File

@@ -53,9 +53,6 @@ class TaskManager:
     @staticmethod
     def run_task(task, db_path, current_tasks):
-        # import gc
-        # gc.set_debug(gc.DEBUG_LEAK)
         result = TaskResult()
         result.start_time = datetime.utcnow()
         result.website_id = task.website_id

View File

@@ -4,7 +4,7 @@ import json
 payload = json.dumps({
     "website_id": 123,
-    "url": "http://liminaire.fr/TEXTES/",
+    "url": "https://computerarchive.org/files/computer/",
     # "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,