Updated readme and UI fixes

This commit is contained in:
Simon 2018-06-22 13:22:58 -04:00
parent 9d3fc2d71b
commit e824b2bf3c
8 changed files with 51 additions and 38 deletions

View File

@ -7,22 +7,26 @@ Assuming you have Python 3 and git installed:
```bash
git clone https://github.com/simon987/od-database
cd od-database
pip3 install -r requirements.txt
sudo pip3 install -r requirements.txt
```
Create `/config.py` and fill out the parameters. Empty config:
Create `/config.py` and fill out the parameters. Sample config:
```python
CAPTCHA_SITE_KEY = ""
CAPTCHA_SECRET_KEY = ""
FLASK_SECRET = ""
USE_SSL = True
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
HEADERS = {}
CRAWL_SERVER_HEADERS = {}
CRAWL_SERVER_TOKEN = ""
CRAWL_SERVER_PORT = 5001
CRAWL_SERVER_PROCESSES = 3
CRAWL_SERVER_THREADS = 20
```
## Running
## Running the crawl server
```bash
python3 app.py
cd od-database
export PYTHONPATH=$(pwd)
cd crawl_server
python3 server.py
```
You should be able to connect with your browser at `https://localhost:12345`
*_Note: To use SSL, you have to put the appropriate certificates in `/certificates/`_

1
app.py
View File

@ -410,7 +410,6 @@ def admin_crawl_logs():
if "username" in session:
results = taskDispatcher.get_task_logs_by_server()
print(results)
return render_template("crawl_logs.html", logs=results)
else:

View File

@ -1,6 +1,6 @@
import os
import ujson
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
from timeout_decorator.timeout_decorator import TimeoutError
from threading import Thread
from queue import Queue, Empty
@ -150,7 +150,7 @@ class RemoteDirectoryCrawler:
for f in listing:
if f.is_dir:
in_q.put(os.path.join(f.path, f.name, ""))
in_q.put(urljoin(f.path, f.name, ""))
else:
files_q.put(f)
import sys

View File

@ -71,7 +71,11 @@ class HttpDirectory(RemoteDirectory):
"?DA",
"?ND",
"?C=N&O=A",
"?C=N&O=A"
"?C=N&O=A",
"?M=A",
"?N=D",
"?S=A",
"?D=A",
)
FILE_NAME_BLACKLIST = (
"Parent Directory",

View File

@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token")
token = config.CRAWL_SERVER_TOKEN
tm = TaskManager("tm_db.sqlite3", 32)
tm = TaskManager("tm_db.sqlite3", config.CRAWL_SERVER_PROCESSES)
@auth.verify_token

View File

@ -1,3 +1,4 @@
import config
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
from multiprocessing import Manager, Pool
from apscheduler.schedulers.background import BackgroundScheduler
@ -59,7 +60,7 @@ class TaskManager:
print("Starting task " + task.url)
crawler = RemoteDirectoryCrawler(task.url, 20)
crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
del crawler

View File

@ -5,7 +5,7 @@
{% block body %}
<div class="container">
<div class="card">
<div class="card-header">Information for {{ website.url }}</div>
<div class="card-header">Information for {{ website.url | truncate(80) }}</div>
<div class="card-body">
<div id="chart-wrapper" style="margin-bottom: 1em">
@ -15,6 +15,7 @@
<script src="/static/js/report.js"></script>
</div>
<div class="table-responsive">
<table class="table">
<tr>
<th>Base url</th>
@ -36,14 +37,18 @@
<td id="reportTime"></td>
</tr>
</table>
</div>
<hr>
<a href="/website/{{ website.id }}/links" class="btn btn-shadow btn-primary">Link list</a>
<a href="/website/{{ website.id }}/json_chart" class="btn btn-shadow btn-primary">Summary (JSON)</a>
{% if "username" in session %}
<a href="/website/{{ website.id }}/clear" class="btn btn-danger"><i class="fas fa-exclamation"></i> Clear</a>
<a href="/website/{{ website.id }}/delete" class="btn btn-danger"><i class="fas fa-trash"></i> Delete</a>
<a href="/website/{{ website.id }}/rescan" class="btn btn-secondary"><i class="fas fa-redo"></i> rescan</a>
<a href="/website/{{ website.id }}/clear" class="btn btn-danger"><i class="fas fa-exclamation"></i>
Clear</a>
<a href="/website/{{ website.id }}/delete" class="btn btn-danger"><i class="fas fa-trash"></i>
Delete</a>
<a href="/website/{{ website.id }}/rescan" class="btn btn-secondary"><i class="fas fa-redo"></i>
rescan</a>
{% endif %}
</div>
</div>

View File

@ -18,7 +18,7 @@
{% for website in websites %}
<tr>
<td><a title="Click for stats & link list" href="/website/{{ website[0] }}">{{ website[1] }}</a></td>
<td><a title="Click for stats & link list" href="/website/{{ website[0] }}">{{ website[1] | truncate(70) }}</a></td>
<td>{{ website[2] }}</td>
</tr>
{% endfor %}