Mirror of https://github.com/simon987/od-database.git, synced 2025-04-19 02:16:47 +00:00
Commit e824b2bf3c: Updated readme and UI fixes
(parent: 9d3fc2d71b)

README.md (22 lines changed)
````diff
@@ -7,22 +7,26 @@ Assuming you have Python 3 and git installed:

 ```bash
 git clone https://github.com/simon987/od-database
 cd od-database
-pip3 install -r requirements.txt
+sudo pip3 install -r requirements.txt
 ```

-Create `/config.py` and fill out the parameters. Empty config:
+Create `/config.py` and fill out the parameters. Sample config:

 ```python
 CAPTCHA_SITE_KEY = ""
 CAPTCHA_SECRET_KEY = ""
 FLASK_SECRET = ""
 USE_SSL = True
 RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
 HEADERS = {}
 CRAWL_SERVER_HEADERS = {}
 CRAWL_SERVER_TOKEN = ""
 CRAWL_SERVER_PORT = 5001
 CRAWL_SERVER_PROCESSES = 3
 CRAWL_SERVER_THREADS = 20
 ```

-## Running
+## Running the crawl server

 ```bash
-python3 app.py
+cd od-database
+export PYTHONPATH=$(pwd)
+cd crawl_server
+python3 server.py
 ```

 You should be able to connect with your browser at `https://localhost:12345`

 *_Note: To use SSL you have to put the appropriate certificates in /certificates/_
````
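The new run instructions export PYTHONPATH before starting the server; the code hunks below show why, since modules inside `crawl_server/` import the top-level `config` module, which only resolves when the repository root is on the import path. A quick sanity check, assuming `/config.py` was created with the sample values above:

```python
# Run from the repository root (or with PYTHONPATH pointing at it) so the
# top-level config module resolves from inside subpackages.
import config

print(config.CRAWL_SERVER_PORT)     # 5001 with the sample config
print(config.CRAWL_SERVER_THREADS)  # 20
```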
app.py (1 line changed)
```diff
@@ -410,7 +410,6 @@ def admin_crawl_logs():
     if "username" in session:

         results = taskDispatcher.get_task_logs_by_server()
-        print(results)

         return render_template("crawl_logs.html", logs=results)
     else:
```
```diff
@@ -1,6 +1,6 @@
 import os
 import ujson
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urljoin
 from timeout_decorator.timeout_decorator import TimeoutError
 from threading import Thread
 from queue import Queue, Empty
@@ -150,7 +150,7 @@ class RemoteDirectoryCrawler:

         for f in listing:
             if f.is_dir:
-                in_q.put(os.path.join(f.path, f.name, ""))
+                in_q.put(urljoin(f.path, f.name, ""))
             else:
                 files_q.put(f)
```
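The switch from `os.path.join` to `urljoin` is more than cosmetic: `os.path.join` treats its arguments as filesystem paths (backslash separators on Windows, and an absolute second argument silently discards the base), while `urljoin` resolves a name against a base URL. A small illustration with made-up values; note also that the third argument kept over from the old call fills `urljoin`'s `allow_fragments` slot, so the empty string disables fragment handling rather than appending a trailing slash:

```python
import os.path
from urllib.parse import urljoin

# An absolute second component makes os.path.join throw away the base.
print(os.path.join("/pub/", "/movies/"))              # '/movies/'

# urljoin resolves relative to the base URL, keeping scheme and host.
print(urljoin("http://example.com/pub/", "movies/"))  # 'http://example.com/pub/movies/'
```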
```diff
 import sys
@@ -71,7 +71,11 @@ class HttpDirectory(RemoteDirectory):
         "?DA",
         "?ND",
         "?C=N&O=A",
-        "?C=N&O=A"
+        "?C=N&O=A",
+        "?M=A",
+        "?N=D",
+        "?S=A",
+        "?D=A",
     )
     FILE_NAME_BLACKLIST = (
         "Parent Directory",
```
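The new entries (`?M=A`, `?N=D`, `?S=A`, `?D=A`, plus the re-added `?C=N&O=A`) match the query strings that Apache-style autoindex pages append to their column-header sort links, and the change also restores the comma the previous entry was missing. A minimal sketch of how such a blacklist can be applied, with a hypothetical helper name:

```python
# Hypothetical helper: skip links that merely re-sort the same listing.
SORT_QUERY_BLACKLIST = ("?C=N&O=A", "?M=A", "?N=D", "?S=A", "?D=A")

def is_sort_link(href: str) -> bool:
    return any(href.endswith(q) for q in SORT_QUERY_BLACKLIST)

assert is_sort_link("/pub/?C=N&O=A")       # column-sort link: skip it
assert not is_sort_link("/pub/video.mkv")  # real file: keep it
```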
```diff
@@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token")

 token = config.CRAWL_SERVER_TOKEN

-tm = TaskManager("tm_db.sqlite3", 32)
+tm = TaskManager("tm_db.sqlite3", config.CRAWL_SERVER_PROCESSES)


 @auth.verify_token
```
```diff
@@ -1,3 +1,4 @@
+import config
 from crawl_server.database import TaskManagerDatabase, Task, TaskResult
 from multiprocessing import Manager, Pool
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -59,7 +60,7 @@ class TaskManager:

         print("Starting task " + task.url)

-        crawler = RemoteDirectoryCrawler(task.url, 20)
+        crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
         crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
         del crawler
```
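Together with the server change above, this moves the two hardcoded concurrency values (32 processes, 20 threads) behind the new `CRAWL_SERVER_PROCESSES` and `CRAWL_SERVER_THREADS` settings. The hunk's context shows the task manager already imports `Pool` from `multiprocessing`; a minimal sketch of the pattern, assuming `config.py` carries the sample values from the README:

```python
import config  # the project's top-level /config.py
from multiprocessing import Pool

# Size the worker pool from config instead of a hardcoded literal,
# so deployments can tune concurrency without editing code.
pool = Pool(processes=config.CRAWL_SERVER_PROCESSES)  # 3 with the sample config
pool.close()
pool.join()
```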
```diff
@@ -5,7 +5,7 @@
 {% block body %}
 <div class="container">
     <div class="card">
-        <div class="card-header">Information for {{ website.url }}</div>
+        <div class="card-header">Information for {{ website.url | truncate(80) }}</div>
         <div class="card-body">

             <div id="chart-wrapper" style="margin-bottom: 1em">
@@ -15,35 +15,40 @@
                 <script src="/static/js/report.js"></script>
             </div>

-            <table class="table">
-                <tr>
-                    <th>Base url</th>
-                    <td id="baseUrl"></td>
-                </tr>
+            <div class="table-responsive">
+                <table class="table">
+                    <tr>
+                        <th>Base url</th>
+                        <td id="baseUrl"></td>
+                    </tr>

-                <tr>
-                    <th>File count</th>
-                    <td id="fileCount"></td>
-                </tr>
+                    <tr>
+                        <th>File count</th>
+                        <td id="fileCount"></td>
+                    </tr>

-                <tr>
-                    <th>Total size</th>
-                    <td id="totalSize"></td>
-                </tr>
+                    <tr>
+                        <th>Total size</th>
+                        <td id="totalSize"></td>
+                    </tr>

-                <tr>
-                    <th>Last updated</th>
-                    <td id="reportTime"></td>
-                </tr>
-            </table>
+                    <tr>
+                        <th>Last updated</th>
+                        <td id="reportTime"></td>
+                    </tr>
+                </table>
+            </div>

             <hr>
             <a href="/website/{{ website.id }}/links" class="btn btn-shadow btn-primary">Link list</a>
             <a href="/website/{{ website.id }}/json_chart" class="btn btn-shadow btn-primary">Summary (JSON)</a>
             {% if "username" in session %}
-                <a href="/website/{{ website.id }}/clear" class="btn btn-danger"><i class="fas fa-exclamation"></i> Clear</a>
-                <a href="/website/{{ website.id }}/delete" class="btn btn-danger"><i class="fas fa-trash"></i> Delete</a>
-                <a href="/website/{{ website.id }}/rescan" class="btn btn-secondary"><i class="fas fa-redo"></i> rescan</a>
+                <a href="/website/{{ website.id }}/clear" class="btn btn-danger"><i class="fas fa-exclamation"></i>
+                    Clear</a>
+                <a href="/website/{{ website.id }}/delete" class="btn btn-danger"><i class="fas fa-trash"></i>
+                    Delete</a>
+                <a href="/website/{{ website.id }}/rescan" class="btn btn-secondary"><i class="fas fa-redo"></i>
+                    rescan</a>
             {% endif %}
         </div>
     </div>
```
```diff
@@ -18,7 +18,7 @@

     {% for website in websites %}
         <tr>
-            <td><a title="Click for stats & link list" href="/website/{{ website[0] }}">{{ website[1] }}</a></td>
+            <td><a title="Click for stats & link list" href="/website/{{ website[0] }}">{{ website[1] | truncate(70) }}</a></td>
             <td>{{ website[2] }}</td>
         </tr>
     {% endfor %}
```
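Both templates now pipe long URLs through Jinja's built-in `truncate` filter (80 characters on the details page, 70 in the site list) so oversized links cannot break the layout. A standalone sketch of the filter's behavior:

```python
from jinja2 import Environment

env = Environment()
tmpl = env.from_string("{{ url | truncate(80) }}")

# Strings longer than the limit are cut and suffixed with '...'.
print(tmpl.render(url="http://example.com/" + "a" * 200))
```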