Updated readme and fixed UI issues

This commit is contained in:
Simon
2018-06-22 13:22:58 -04:00
parent 9d3fc2d71b
commit e824b2bf3c
8 changed files with 51 additions and 38 deletions

View File

@@ -1,6 +1,6 @@
import os
import ujson
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
from timeout_decorator.timeout_decorator import TimeoutError
from threading import Thread
from queue import Queue, Empty
@@ -150,7 +150,7 @@ class RemoteDirectoryCrawler:
for f in listing:
if f.is_dir:
in_q.put(os.path.join(f.path, f.name, ""))
in_q.put(urljoin(f.path, f.name, ""))
else:
files_q.put(f)
import sys

View File

@@ -71,7 +71,11 @@ class HttpDirectory(RemoteDirectory):
"?DA",
"?ND",
"?C=N&O=A",
"?C=N&O=A"
"?C=N&O=A",
"?M=A",
"?N=D",
"?S=A",
"?D=A",
)
FILE_NAME_BLACKLIST = (
"Parent Directory",

View File

@@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token")
token = config.CRAWL_SERVER_TOKEN
tm = TaskManager("tm_db.sqlite3", 32)
tm = TaskManager("tm_db.sqlite3", config.CRAWL_SERVER_PROCESSES)
@auth.verify_token

View File

@@ -1,3 +1,4 @@
import config
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
from multiprocessing import Manager, Pool
from apscheduler.schedulers.background import BackgroundScheduler
@@ -59,7 +60,7 @@ class TaskManager:
print("Starting task " + task.url)
crawler = RemoteDirectoryCrawler(task.url, 20)
crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
del crawler