diff --git a/README.md b/README.md
index 3a5a289..1f63eb9 100644
--- a/README.md
+++ b/README.md
@@ -7,22 +7,26 @@
 Assuming you have Python 3 and git installed:
 ```bash
 git clone https://github.com/simon987/od-database
 cd od-database
-pip3 install -r requirements.txt
+sudo pip3 install -r requirements.txt
 ```
-Create `/config.py` and fill out the parameters. Empty config:
+Create `/config.py` and fill out the parameters. Sample config:
 ```python
 CAPTCHA_SITE_KEY = ""
 CAPTCHA_SECRET_KEY = ""
 FLASK_SECRET = ""
-USE_SSL = True
 RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
-HEADERS = {}
+CRAWL_SERVER_HEADERS = {}
+CRAWL_SERVER_TOKEN = ""
+CRAWL_SERVER_PORT = 5001
+CRAWL_SERVER_PROCESSES = 3
+CRAWL_SERVER_THREADS = 20
+
 ```
-## Running
+## Running the crawl server
 ```bash
-python3 app.py
+cd od-database
+export PYTHONPATH=$(pwd)
+cd crawl_server
+python3 server.py
 ```
-You should be able to connect with your browser at `https://localhost:12345`
-
-*_Note: To use SSL you have to put the appropriate certificates in /certificates/_
diff --git a/app.py b/app.py
index 045f814..c2179ac 100644
--- a/app.py
+++ b/app.py
@@ -410,7 +410,6 @@ def admin_crawl_logs():
     if "username" in session:
 
         results = taskDispatcher.get_task_logs_by_server()
-        print(results)
 
         return render_template("crawl_logs.html", logs=results)
     else:
diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 3c1c794..6d804ed 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -1,6 +1,6 @@
 import os
 import ujson
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urljoin
 from timeout_decorator.timeout_decorator import TimeoutError
 from threading import Thread
 from queue import Queue, Empty
@@ -150,7 +150,7 @@ class RemoteDirectoryCrawler:
             for f in listing:
                 if f.is_dir:
-                    in_q.put(os.path.join(f.path, f.name, ""))
+                    in_q.put(urljoin(f.path, f.name, ""))
                 else:
                     files_q.put(f)
 
 import sys
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 54f5260..680008c 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -71,7 +71,11 @@ class HttpDirectory(RemoteDirectory):
         "?DA",
         "?ND",
         "?C=N&O=A",
-        "?C=N&O=A"
+        "?C=N&O=A",
+        "?M=A",
+        "?N=D",
+        "?S=A",
+        "?D=A",
     )
     FILE_NAME_BLACKLIST = (
         "Parent Directory",
diff --git a/crawl_server/server.py b/crawl_server/server.py
index 99fd03a..47b63a1 100644
--- a/crawl_server/server.py
+++ b/crawl_server/server.py
@@ -9,7 +9,7 @@
 auth = HTTPTokenAuth(scheme="Token")
 token = config.CRAWL_SERVER_TOKEN
 
-tm = TaskManager("tm_db.sqlite3", 32)
+tm = TaskManager("tm_db.sqlite3", config.CRAWL_SERVER_PROCESSES)
 
 
 @auth.verify_token
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index a40b624..5e4cd8b 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -1,3 +1,4 @@
+import config
 from crawl_server.database import TaskManagerDatabase, Task, TaskResult
 from multiprocessing import Manager, Pool
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -59,7 +60,7 @@ class TaskManager:
 
         print("Starting task " + task.url)
 
-        crawler = RemoteDirectoryCrawler(task.url, 20)
+        crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
         crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
         del crawler
 
diff --git a/templates/website.html b/templates/website.html
index 499e70b..a6f4d75 100644
--- a/templates/website.html
+++ b/templates/website.html
@@ -5,7 +5,7 @@ {% block body %}
-        Information for {{ website.url }}
+        Information for {{ website.url | truncate(80) }}
@@ -15,35 +15,40 @@
 (hunk body reduced to its text content: the info table rows Base url, File count,
 Total size and Last updated are reflowed onto one tag per line with the same
 values; the Link list and Summary (JSON) links are kept; the admin-only Clear,
 Delete and rescan links inside {% if "username" in session %} are reindented)
diff --git a/templates/websites.html b/templates/websites.html
index adc2357..7adbd8b 100644
--- a/templates/websites.html
+++ b/templates/websites.html
@@ -18,7 +18,7 @@
     {% for website in websites %}
-        {{ website[1] }}
+        {{ website[1] | truncate(70) }}
         {{ website[2] }}
     {% endfor %}
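A few reviewer notes on this patch follow. First, `crawl_server/task_manager.py` now reads settings via `import config` (`server.py` already does), which is why the updated README exports `PYTHONPATH=$(pwd)` from the repository root before launching the server. Assembled from the README's sample, the `config.py` this patch expects would look roughly like the sketch below; the values are placeholders and the comments are my reading of how each key is used:

```python
# config.py -- placeholder values, fill in real keys before deploying
CAPTCHA_SITE_KEY = ""      # captcha keys for the web front end (presumably reCAPTCHA)
CAPTCHA_SECRET_KEY = ""
FLASK_SECRET = ""          # secret key used by Flask to sign sessions
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)

CRAWL_SERVER_HEADERS = {}  # extra HTTP headers sent with crawl requests
CRAWL_SERVER_TOKEN = ""    # shared secret checked by the server's token auth
CRAWL_SERVER_PORT = 5001
CRAWL_SERVER_PROCESSES = 3  # worker processes in the TaskManager pool
CRAWL_SERVER_THREADS = 20   # threads per RemoteDirectoryCrawler instance
```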
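The `crawler.py` change swaps `os.path.join` for `urljoin` when queuing subdirectories, so queued paths no longer depend on the OS path separator. A minimal sketch of the difference, with made-up paths:

```python
import os
from urllib.parse import urljoin

print(os.path.join("files/music/", "rock", ""))  # files/music/rock/  (separator is OS-specific)
print(urljoin("files/music/", "rock"))           # files/music/rock
print(urljoin("files/music/", "rock/"))          # files/music/rock/
# If the base lacks a trailing slash, urljoin replaces the last segment:
print(urljoin("files/music", "rock"))            # files/rock
# Caveat: urljoin's third parameter is allow_fragments, not another path
# segment, so urljoin(a, b, "") is not the analogue of os.path.join(a, b, "").
```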
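The four new entries in the `remote_http.py` query blacklist (`?M=A`, `?N=D`, `?S=A`, `?D=A`) are the column-sorting links that Apache-style index pages place next to Name, Last modified, Size and Description; following them would just re-crawl the same listing in a different order. A toy check in the same spirit (the names here are illustrative, not the module's real API):

```python
# Skip per-column sort links emitted by directory index pages.
SORT_QUERIES = ("?C=N&O=A", "?M=A", "?N=D", "?S=A", "?D=A")

def is_sort_link(href: str) -> bool:
    # str.endswith accepts a tuple of suffixes
    return href.endswith(SORT_QUERIES)

print(is_sort_link("?N=D"))      # True  -- same directory, different sort order
print(is_sort_link("song.mp3"))  # False -- an actual file
```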
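Finally, `server.py` now sizes its pool from `CRAWL_SERVER_PROCESSES` instead of a hardcoded 32 and keeps its Flask-HTTPAuth token scheme. For anyone exercising the API by hand, a client sketch; the `/task` path is an assumption for illustration, the real routes are in `crawl_server/server.py`:

```python
import requests  # assumes the requests library is installed

import config

response = requests.get(
    "http://localhost:{}/task".format(config.CRAWL_SERVER_PORT),
    # HTTPTokenAuth(scheme="Token") expects exactly this header shape
    headers={"Authorization": "Token " + config.CRAWL_SERVER_TOKEN},
)
print(response.status_code)  # 401 unless the token matches CRAWL_SERVER_TOKEN
```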