diff --git a/README.md b/README.md
index 3a5a289..1f63eb9 100644
--- a/README.md
+++ b/README.md
@@ -7,22 +7,26 @@ Assuming you have Python 3 and git installed:
```bash
git clone https://github.com/simon987/od-database
cd od-database
-pip3 install -r requirements.txt
+sudo pip3 install -r requirements.txt
```
-Create `/config.py` and fill out the parameters. Empty config:
+Create `config.py` in the project root and fill out the parameters. Sample config:
```python
CAPTCHA_SITE_KEY = ""
CAPTCHA_SECRET_KEY = ""
FLASK_SECRET = ""
-USE_SSL = True
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
-HEADERS = {}
+CRAWL_SERVER_HEADERS = {}
+CRAWL_SERVER_TOKEN = ""
+CRAWL_SERVER_PORT = 5001
+CRAWL_SERVER_PROCESSES = 3
+CRAWL_SERVER_THREADS = 20
+
```
-## Running
+## Running the crawl server
```bash
-python3 app.py
+cd od-database
+export PYTHONPATH=$(pwd)
+cd crawl_server
+python3 server.py
```
-You should be able to connect with your browser at `https://localhost:12345`
-
-*_Note: To use SSL you have to put the appropriate certificates in /certificates/_
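
The crawl server authenticates requests with `HTTPTokenAuth(scheme="Token")` (see `crawl_server/server.py` below), so a quick liveness check is a request carrying that header. A minimal sketch, assuming `CRAWL_SERVER_TOKEN = "changeme"`, the default port from the sample config, and a hypothetical `/task` route (check `crawl_server/server.py` for the actual ones):

```python
import requests

# "changeme" stands in for whatever CRAWL_SERVER_TOKEN is set to in config.py;
# the /task path is an assumption, not a confirmed route.
resp = requests.get(
    "http://localhost:5001/task",
    headers={"Authorization": "Token changeme"},
)
print(resp.status_code)  # 401 means the token header did not match
```
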
diff --git a/app.py b/app.py
index 045f814..c2179ac 100644
--- a/app.py
+++ b/app.py
@@ -410,7 +410,6 @@ def admin_crawl_logs():
if "username" in session:
results = taskDispatcher.get_task_logs_by_server()
- print(results)
return render_template("crawl_logs.html", logs=results)
else:
diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 3c1c794..6d804ed 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -1,6 +1,6 @@
import os
import ujson
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urljoin
from timeout_decorator.timeout_decorator import TimeoutError
from threading import Thread
from queue import Queue, Empty
@@ -150,7 +150,7 @@ class RemoteDirectoryCrawler:
for f in listing:
if f.is_dir:
- in_q.put(os.path.join(f.path, f.name, ""))
+                    in_q.put(urljoin(f.path, f.name) + "/")  # keep the trailing slash directories rely on
else:
files_q.put(f)
import sys
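
The `os.path.join` → `urljoin` swap above changes semantics, not just spelling: `os.path.join` uses the OS separator (backslashes on Windows) and knows nothing about URLs, while `urljoin` resolves the name against the base the way a browser would, so a base without a trailing slash drops its last segment. Note also that `urljoin`'s third parameter is `allow_fragments`, not another path segment, which is why the trailing slash is appended explicitly rather than passing `""`. A small demo:

```python
import os
from urllib.parse import urljoin

# os.path.join concatenates blindly (and would use "\\" on Windows).
print(os.path.join("pub/linux", "kernel", ""))      # pub/linux/kernel/ (POSIX)

# urljoin resolves against the base; the base's trailing slash matters.
print(urljoin("http://host/pub/linux/", "kernel"))  # http://host/pub/linux/kernel
print(urljoin("http://host/pub/linux", "kernel"))   # http://host/pub/kernel  (!)

# Re-appending "/" keeps queued directories unambiguous for the next join.
print(urljoin("http://host/pub/linux/", "kernel") + "/")
```
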
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 54f5260..680008c 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -71,7 +71,11 @@ class HttpDirectory(RemoteDirectory):
"?DA",
"?ND",
"?C=N&O=A",
- "?C=N&O=A"
+ "?C=N&O=A",
+ "?M=A",
+ "?N=D",
+ "?S=A",
+ "?D=A",
)
FILE_NAME_BLACKLIST = (
"Parent Directory",
diff --git a/crawl_server/server.py b/crawl_server/server.py
index 99fd03a..47b63a1 100644
--- a/crawl_server/server.py
+++ b/crawl_server/server.py
@@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token")
token = config.CRAWL_SERVER_TOKEN
-tm = TaskManager("tm_db.sqlite3", 32)
+tm = TaskManager("tm_db.sqlite3", config.CRAWL_SERVER_PROCESSES)
@auth.verify_token
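
For context, the `@auth.verify_token` decorator this hunk ends on registers the callback that flask_httpauth consults on every protected route. A minimal sketch consistent with the `token = config.CRAWL_SERVER_TOKEN` line (not necessarily the repo's exact body):

```python
from flask import Flask
from flask_httpauth import HTTPTokenAuth

app = Flask(__name__)
auth = HTTPTokenAuth(scheme="Token")
token = "changeme"  # stands in for config.CRAWL_SERVER_TOKEN

@auth.verify_token
def verify_token(candidate):
    # flask_httpauth passes whatever follows "Authorization: Token ";
    # a truthy return value lets the request through.
    return candidate == token

@app.route("/ping")  # hypothetical route, for demonstration only
@auth.login_required
def ping():
    return "pong"
```
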
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index a40b624..5e4cd8b 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -1,3 +1,4 @@
+import config
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
from multiprocessing import Manager, Pool
from apscheduler.schedulers.background import BackgroundScheduler
@@ -59,7 +60,7 @@ class TaskManager:
print("Starting task " + task.url)
- crawler = RemoteDirectoryCrawler(task.url, 20)
+ crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
del crawler
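
`CRAWL_SERVER_THREADS` becomes the worker count handed to `RemoteDirectoryCrawler`; the `Thread`/`Queue` imports in `crawl_server/crawler.py` above point to the usual producer/consumer layout that this number sizes. A self-contained sketch of that pattern against a fake directory tree (illustrative names, not the crawler's internals):

```python
from queue import Queue, Empty
from threading import Thread

# Fake listings standing in for HTTP directory fetches.
LISTINGS = {
    "/": ["pub/", "readme.txt"],
    "/pub/": ["iso/", "notes.txt"],
    "/pub/iso/": ["distro.iso"],
}

def worker(in_q, files_q):
    while True:
        try:
            path = in_q.get(timeout=1)
        except Empty:
            return  # queue has been idle; assume the crawl is done
        for entry in LISTINGS.get(path, []):
            # Directories go back on the work queue, files to the results.
            (in_q if entry.endswith("/") else files_q).put(path + entry)
        in_q.task_done()

def crawl(root, max_threads=3):  # plays the role of CRAWL_SERVER_THREADS
    in_q, files_q = Queue(), Queue()
    in_q.put(root)
    for _ in range(max_threads):
        Thread(target=worker, args=(in_q, files_q), daemon=True).start()
    in_q.join()  # returns once every queued directory has been processed
    return list(files_q.queue)

print(crawl("/"))  # three file paths; order depends on scheduling
```
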
diff --git a/templates/website.html b/templates/website.html
index 499e70b..a6f4d75 100644
--- a/templates/website.html
+++ b/templates/website.html
@@ -5,7 +5,7 @@
{% block body %}
diff --git a/templates/websites.html b/templates/websites.html
index adc2357..7adbd8b 100644
--- a/templates/websites.html
+++ b/templates/websites.html
@@ -18,7 +18,7 @@
{% for website in websites %}
- {{ website[1] }} |
+ {{ website[1] | truncate(70) }} |
{{ website[2] }} |
{% endfor %}
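
`truncate` is a built-in Jinja2 filter: `truncate(70)` caps the rendered name at 70 characters including the `...` suffix (strings within Jinja's small leeway margin are left alone), so an unusually long URL can no longer blow out the table column. A quick check:

```python
from jinja2 import Template

t = Template("{{ name | truncate(70) }}")
out = t.render(name="x" * 100)
print(len(out), out[-3:])  # 70 ...
```
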