Mirror of https://github.com/simon987/od-database.git, synced 2025-04-19 18:36:44 +00:00
Support for more than just utf-8 and removed some debug info

The crawler previously assumed every directory listing was UTF-8; response bodies are now decoded with the encoding reported by the server. The commit also removes gc leak-debugging calls left over from development and points the test payload at a different open directory.

parent 7c47b0f00c
commit 8a73142ff8
@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:

     def crawl_directory(self, out_file: str) -> CrawlResult:
-        import gc
-        gc.set_debug(gc.DEBUG_LEAK)
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
             root_listing = directory.list_dir("")
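The two deleted lines were leak-hunting leftovers: gc.set_debug(gc.DEBUG_LEAK) makes the cycle collector report every collectable and uncollectable object and keep them alive in gc.garbage, which is far too noisy and memory-hungry for production crawls. A minimal illustration of what that flag does, separate from the project code:

import gc

gc.set_debug(gc.DEBUG_LEAK)  # DEBUG_COLLECTABLE | DEBUG_UNCOLLECTABLE | DEBUG_SAVEALL

class Node:
    def __init__(self):
        self.ref = self  # deliberate reference cycle

Node()        # unreachable as soon as the statement ends
gc.collect()  # prints "gc: collectable <Node ...>" lines to stderr
print(len(gc.garbage))  # DEBUG_SAVEALL parked the collected objects here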
@@ -1,4 +1,5 @@
 from urllib.parse import unquote, urljoin
+import warnings
 import os
 from html.parser import HTMLParser
 from itertools import repeat
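The new import warnings exists for the catch_warnings() blocks added to _request_file and _stream_body below: the context manager snapshots the warning-filter state on entry and restores it on exit, so warnings (presumably urllib3's connection warnings, though the diff doesn't say) are muted only inside the request loops. A small standalone illustration:

import warnings

def noisy():
    warnings.warn("something non-fatal happened")

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # filter change is scoped to this block
    noisy()                          # silent

noisy()  # outside the block the original filters are restored, so this warns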
@@ -118,44 +119,47 @@ class HttpDirectory(RemoteDirectory):

     def _request_file(self, url):

-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.head(url, allow_redirects=False, timeout=40)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.head(url, allow_redirects=False, timeout=40)

-                stripped_url = url[len(self.base_url) - 1:]
+                    stripped_url = url[len(self.base_url) - 1:]

-                path, name = os.path.split(stripped_url)
-                date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
-                return File(
-                    path=unquote(path).strip("/"),
-                    name=unquote(name),
-                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
-                    mtime=int(parse_date(date).timestamp()),
-                    is_dir=False
-                )
-            except RequestException:
-                self.session.close()
-                retries -= 1
+                    path, name = os.path.split(stripped_url)
+                    date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
+                    return File(
+                        path=unquote(path).strip("/"),
+                        name=unquote(name),
+                        size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
+                        mtime=int(parse_date(date).timestamp()),
+                        is_dir=False
+                    )
+                except RequestException:
+                    self.session.close()
+                    retries -= 1

-        return None
+            return None

     def _stream_body(self, url: str):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.get(url, stream=True, timeout=40)
-                for chunk in r.iter_content(chunk_size=4096):
-                    yield chunk
-                r.close()
-                del r
-                break
-            except RequestException:
-                self.session.close()
-                retries -= 1
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.get(url, stream=True, timeout=40)
+                    for chunk in r.iter_content(chunk_size=4096):
+                        yield chunk.decode(r.encoding, errors="ignore")
+                    r.close()
+                    del r
+                    break
+                except RequestException:
+                    self.session.close()
+                    retries -= 1

-        return None
+            return None

     @staticmethod
     def _parse_links(body):
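The actual encoding fix is the yield chunk.decode(r.encoding, errors="ignore") line: requests exposes the charset declared in the response's Content-Type header as r.encoding, so listings served as latin-1, windows-1251, and so on now decode correctly instead of being forced through UTF-8. Two edge cases are worth flagging: r.encoding can be None when no charset is declared, and a multibyte character split across a 4096-byte chunk boundary is silently dropped by errors="ignore". A standalone sketch of the pattern with both handled; the UTF-8 fallback and the incremental decoder are my additions, not part of the commit:

import codecs

import requests

def stream_text(url, chunk_size=4096, timeout=40):
    """Yield the response body as text, honouring the server-declared charset."""
    r = requests.get(url, stream=True, timeout=timeout)
    encoding = r.encoding or "utf-8"  # assumption: fall back when no charset is declared
    decoder = codecs.getincrementaldecoder(encoding)(errors="ignore")
    for chunk in r.iter_content(chunk_size=chunk_size):
        # the incremental decoder buffers the bytes of a partially-received character
        yield decoder.decode(chunk)
    yield decoder.decode(b"", final=True)  # flush any buffered trailing bytes
    r.close()

for text in stream_text("http://example.com/"):
    print(text, end="")

The commit's simpler per-chunk decode is usually fine for directory listings, which are overwhelmingly ASCII; the incremental decoder just makes the corner cases explicit.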
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
         parser = HTMLAnchorParser()

         for chunk in body:
-            parser.feed(chunk.decode("utf-8", errors="ignore"))
+            parser.feed(chunk)
             for anchor in parser.anchors:
                 yield anchor
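_parse_links can now feed chunks straight to the parser because _stream_body already yields str. The HTMLAnchorParser class itself is outside this diff; a plausible minimal stand-in on top of the stdlib HTMLParser (the layout here is my guess, not the project's actual code) would be:

from html.parser import HTMLParser

class HTMLAnchorParser(HTMLParser):
    """Collect the href target of every <a> tag fed to the parser."""

    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.anchors.append(value)

parser = HTMLAnchorParser()
parser.feed('<a href="backups/">backups/</a><a href="notes.txt">notes.txt</a>')
print(parser.anchors)  # ['backups/', 'notes.txt']

Since the diff drains parser.anchors after every chunk, the real class presumably clears that collection as it is consumed (e.g. a deque that is popped), which this sketch does not attempt to replicate.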
@@ -53,9 +53,6 @@ class TaskManager:
     @staticmethod
     def run_task(task, db_path, current_tasks):

-        # import gc
-        # gc.set_debug(gc.DEBUG_LEAK)
-
         result = TaskResult()
         result.start_time = datetime.utcnow()
         result.website_id = task.website_id
@@ -4,7 +4,7 @@ import json

 payload = json.dumps({
     "website_id": 123,
-    "url": "http://liminaire.fr/TEXTES/",
+    "url": "https://computerarchive.org/files/computer/",
     # "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,