Added support for more than just utf-8 and removed some debug info

This commit is contained in:
Simon 2018-06-18 13:44:19 -04:00
parent 7c47b0f00c
commit 8a73142ff8
4 changed files with 38 additions and 39 deletions

View File

@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:
def crawl_directory(self, out_file: str) -> CrawlResult: def crawl_directory(self, out_file: str) -> CrawlResult:
import gc
gc.set_debug(gc.DEBUG_LEAK)
try: try:
directory = RemoteDirectoryFactory.get_directory(self.url) directory = RemoteDirectoryFactory.get_directory(self.url)
root_listing = directory.list_dir("") root_listing = directory.list_dir("")

View File

@@ -1,4 +1,5 @@
from urllib.parse import unquote, urljoin from urllib.parse import unquote, urljoin
import warnings
import os import os
from html.parser import HTMLParser from html.parser import HTMLParser
from itertools import repeat from itertools import repeat
@@ -118,6 +119,8 @@ class HttpDirectory(RemoteDirectory):
def _request_file(self, url): def _request_file(self, url):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
retries = HttpDirectory.MAX_RETRIES retries = HttpDirectory.MAX_RETRIES
while retries > 0: while retries > 0:
try: try:
@@ -141,13 +144,14 @@ class HttpDirectory(RemoteDirectory):
return None return None
def _stream_body(self, url: str): def _stream_body(self, url: str):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
retries = HttpDirectory.MAX_RETRIES retries = HttpDirectory.MAX_RETRIES
while retries > 0: while retries > 0:
try: try:
r = self.session.get(url, stream=True, timeout=40) r = self.session.get(url, stream=True, timeout=40)
for chunk in r.iter_content(chunk_size=4096): for chunk in r.iter_content(chunk_size=4096):
yield chunk yield chunk.decode(r.encoding, errors="ignore")
r.close() r.close()
del r del r
break break
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
parser = HTMLAnchorParser() parser = HTMLAnchorParser()
for chunk in body: for chunk in body:
parser.feed(chunk.decode("utf-8", errors="ignore")) parser.feed(chunk)
for anchor in parser.anchors: for anchor in parser.anchors:
yield anchor yield anchor

View File

@@ -53,9 +53,6 @@ class TaskManager:
@staticmethod @staticmethod
def run_task(task, db_path, current_tasks): def run_task(task, db_path, current_tasks):
# import gc
# gc.set_debug(gc.DEBUG_LEAK)
result = TaskResult() result = TaskResult()
result.start_time = datetime.utcnow() result.start_time = datetime.utcnow()
result.website_id = task.website_id result.website_id = task.website_id

View File

@@ -4,7 +4,7 @@ import json
payload = json.dumps({ payload = json.dumps({
"website_id": 123, "website_id": 123,
"url": "http://liminaire.fr/TEXTES/", "url": "https://computerarchive.org/files/computer/",
# "url": "http://localhost:8000/", # "url": "http://localhost:8000/",
# "url": "http://ubuntu.mirrorservice.org/", # "url": "http://ubuntu.mirrorservice.org/",
"priority": 2, "priority": 2,