Support encodings other than just UTF-8 and remove some debug info

commit 8a73142ff8
parent 7c47b0f00c
Author: Simon
Date:   2018-06-18 13:44:19 -04:00

4 changed files with 38 additions and 39 deletions

@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:
     def crawl_directory(self, out_file: str) -> CrawlResult:
-        import gc
-        gc.set_debug(gc.DEBUG_LEAK)
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
             root_listing = directory.list_dir("")

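For reference, the two deleted lines enabled CPython's leak-hunting mode for the whole process. A minimal sketch of what that flag does (illustration only, not code from this repo):

import gc

# DEBUG_LEAK combines DEBUG_COLLECTABLE, DEBUG_UNCOLLECTABLE and DEBUG_SAVEALL:
# the collector reports every cycle it finds to stderr and keeps the objects
# reachable in gc.garbage. Useful when hunting a leak, far too noisy otherwise.
gc.set_debug(gc.DEBUG_LEAK)

class Node:
    def __init__(self):
        self.ref = self  # deliberate reference cycle

Node()
gc.collect()       # prints the collectable cycle to stderr
print(gc.garbage)  # DEBUG_SAVEALL kept the Node instance here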
@@ -1,4 +1,5 @@
 from urllib.parse import unquote, urljoin
+import warnings
 import os
 from html.parser import HTMLParser
 from itertools import repeat
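The new warnings import backs the suppression blocks added below. warnings.catch_warnings() saves the filter state on entry and restores it on exit, so the simplefilter("ignore") silences warnings only inside the with block (presumably noisy urllib3/requests warnings raised during crawls, though the commit does not say). A quick self-contained illustration:

import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    warnings.warn("swallowed inside the block")  # never reaches stderr

warnings.warn("visible again")  # previous filters are restored here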
@@ -118,44 +119,47 @@ class HttpDirectory(RemoteDirectory):
     def _request_file(self, url):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.head(url, allow_redirects=False, timeout=40)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.head(url, allow_redirects=False, timeout=40)
 
-                stripped_url = url[len(self.base_url) - 1:]
+                    stripped_url = url[len(self.base_url) - 1:]
 
-                path, name = os.path.split(stripped_url)
-                date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
-                return File(
-                    path=unquote(path).strip("/"),
-                    name=unquote(name),
-                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
-                    mtime=int(parse_date(date).timestamp()),
-                    is_dir=False
-                )
-            except RequestException:
-                self.session.close()
-                retries -= 1
+                    path, name = os.path.split(stripped_url)
+                    date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
+                    return File(
+                        path=unquote(path).strip("/"),
+                        name=unquote(name),
+                        size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
+                        mtime=int(parse_date(date).timestamp()),
+                        is_dir=False
+                    )
+                except RequestException:
+                    self.session.close()
+                    retries -= 1
 
-        return None
+            return None
 
     def _stream_body(self, url: str):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.get(url, stream=True, timeout=40)
-                for chunk in r.iter_content(chunk_size=4096):
-                    yield chunk
-                r.close()
-                del r
-                break
-            except RequestException:
-                self.session.close()
-                retries -= 1
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.get(url, stream=True, timeout=40)
+                    for chunk in r.iter_content(chunk_size=4096):
+                        yield chunk.decode(r.encoding, errors="ignore")
+                    r.close()
+                    del r
+                    break
+                except RequestException:
+                    self.session.close()
+                    retries -= 1
 
-        return None
+            return None
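One caveat with the new decode path: requests derives r.encoding from the Content-Type charset, and it may be None when the response carries no usable charset, in which case chunk.decode(r.encoding, ...) raises TypeError. A defensive variant of the loop (an assumption on my part, not what this commit does) would pin a fallback:

# Sketch: fall back to UTF-8 when the server reports no charset.
encoding = r.encoding or "utf-8"
for chunk in r.iter_content(chunk_size=4096):
    yield chunk.decode(encoding, errors="ignore")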
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
     @staticmethod
     def _parse_links(body):
         parser = HTMLAnchorParser()
         for chunk in body:
-            parser.feed(chunk.decode("utf-8", errors="ignore"))
+            parser.feed(chunk)
         for anchor in parser.anchors:
             yield anchor
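Since _parse_links now receives already-decoded str chunks from _stream_body, the stdlib parser can be fed incrementally; html.parser buffers incomplete markup across feed() calls. A minimal stand-in for HTMLAnchorParser (hypothetical, the real class lives elsewhere in this file) could look like:

from html.parser import HTMLParser

class AnchorCollector(HTMLParser):
    # Hypothetical stand-in: collects href values from <a> tags.
    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.anchors.append(value)

parser = AnchorCollector()
for chunk in ('<a href="file', '1.txt">file1</a>'):  # tag split across chunks
    parser.feed(chunk)
print(parser.anchors)  # ['file1.txt']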

@@ -53,9 +53,6 @@ class TaskManager:
 
     @staticmethod
     def run_task(task, db_path, current_tasks):
-        # import gc
-        # gc.set_debug(gc.DEBUG_LEAK)
-
         result = TaskResult()
         result.start_time = datetime.utcnow()
         result.website_id = task.website_id

@@ -4,7 +4,7 @@ import json
 
 payload = json.dumps({
     "website_id": 123,
-    "url": "http://liminaire.fr/TEXTES/",
+    "url": "https://computerarchive.org/files/computer/",
     # "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,