mirror of
https://github.com/simon987/od-database.git
synced 2025-12-11 14:08:51 +00:00
Updated readme and UI fixes
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import os
|
||||
import ujson
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from timeout_decorator.timeout_decorator import TimeoutError
|
||||
from threading import Thread
|
||||
from queue import Queue, Empty
|
||||
@@ -150,7 +150,7 @@ class RemoteDirectoryCrawler:
|
||||
|
||||
for f in listing:
|
||||
if f.is_dir:
|
||||
in_q.put(os.path.join(f.path, f.name, ""))
|
||||
in_q.put(urljoin(f.path, f.name, ""))
|
||||
else:
|
||||
files_q.put(f)
|
||||
import sys
|
||||
|
||||
@@ -71,7 +71,11 @@ class HttpDirectory(RemoteDirectory):
|
||||
"?DA",
|
||||
"?ND",
|
||||
"?C=N&O=A",
|
||||
"?C=N&O=A"
|
||||
"?C=N&O=A",
|
||||
"?M=A",
|
||||
"?N=D",
|
||||
"?S=A",
|
||||
"?D=A",
|
||||
)
|
||||
FILE_NAME_BLACKLIST = (
|
||||
"Parent Directory",
|
||||
|
||||
@@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token")
|
||||
|
||||
token = config.CRAWL_SERVER_TOKEN
|
||||
|
||||
tm = TaskManager("tm_db.sqlite3", 32)
|
||||
tm = TaskManager("tm_db.sqlite3", config.CRAWL_SERVER_PROCESSES)
|
||||
|
||||
|
||||
@auth.verify_token
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import config
|
||||
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
|
||||
from multiprocessing import Manager, Pool
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
@@ -59,7 +60,7 @@ class TaskManager:
|
||||
|
||||
print("Starting task " + task.url)
|
||||
|
||||
crawler = RemoteDirectoryCrawler(task.url, 20)
|
||||
crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
|
||||
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
||||
del crawler
|
||||
|
||||
|
||||
Reference in New Issue
Block a user