mirror of
https://github.com/simon987/od-database.git
synced 2025-04-19 18:36:44 +00:00
Updated readme and UI fixes
This commit is contained in:
parent
9d3fc2d71b
commit
e824b2bf3c
22
README.md
22
README.md
@ -7,22 +7,26 @@ Assuming you have Python 3 and git installed:
|
|||||||
```bash
|
```bash
|
||||||
git clone https://github.com/simon987/od-database
|
git clone https://github.com/simon987/od-database
|
||||||
cd od-database
|
cd od-database
|
||||||
pip3 install -r requirements.txt
|
sudo pip3 install -r requirements.txt
|
||||||
```
|
```
|
||||||
Create `/config.py` and fill out the parameters. Empty config:
|
Create `/config.py` (in the root of the `od-database` repository) and fill out the parameters. Sample config:
|
||||||
```python
|
```python
|
||||||
CAPTCHA_SITE_KEY = ""
|
CAPTCHA_SITE_KEY = ""
|
||||||
CAPTCHA_SECRET_KEY = ""
|
CAPTCHA_SECRET_KEY = ""
|
||||||
FLASK_SECRET = ""
|
FLASK_SECRET = ""
|
||||||
USE_SSL = True
|
|
||||||
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
|
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
|
||||||
HEADERS = {}
|
CRAWL_SERVER_HEADERS = {}
|
||||||
|
CRAWL_SERVER_TOKEN = ""
|
||||||
|
CRAWL_SERVER_PORT = 5001
|
||||||
|
CRAWL_SERVER_PROCESSES = 3
|
||||||
|
CRAWL_SERVER_THREADS = 20
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Running
|
## Running the crawl server
|
||||||
```bash
|
```bash
|
||||||
python3 app.py
|
cd od-database
|
||||||
|
export PYTHONPATH=$(pwd)
|
||||||
|
cd crawl_server
|
||||||
|
python3 server.py
|
||||||
```
|
```
|
||||||
You should be able to connect with your browser at `https://localhost:12345`
|
|
||||||
|
|
||||||
*_Note: To use SSL you have to put the appropriate certificates in /certificates/_
|
|
||||||
|
1
app.py
1
app.py
@ -410,7 +410,6 @@ def admin_crawl_logs():
|
|||||||
if "username" in session:
|
if "username" in session:
|
||||||
|
|
||||||
results = taskDispatcher.get_task_logs_by_server()
|
results = taskDispatcher.get_task_logs_by_server()
|
||||||
print(results)
|
|
||||||
|
|
||||||
return render_template("crawl_logs.html", logs=results)
|
return render_template("crawl_logs.html", logs=results)
|
||||||
else:
|
else:
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import ujson
|
import ujson
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse, urljoin
|
||||||
from timeout_decorator.timeout_decorator import TimeoutError
|
from timeout_decorator.timeout_decorator import TimeoutError
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
from queue import Queue, Empty
|
from queue import Queue, Empty
|
||||||
@ -150,7 +150,7 @@ class RemoteDirectoryCrawler:
|
|||||||
|
|
||||||
for f in listing:
|
for f in listing:
|
||||||
if f.is_dir:
|
if f.is_dir:
|
||||||
in_q.put(os.path.join(f.path, f.name, ""))
|
in_q.put(urljoin(f.path, f.name, ""))
|
||||||
else:
|
else:
|
||||||
files_q.put(f)
|
files_q.put(f)
|
||||||
import sys
|
import sys
|
||||||
|
@ -71,7 +71,11 @@ class HttpDirectory(RemoteDirectory):
|
|||||||
"?DA",
|
"?DA",
|
||||||
"?ND",
|
"?ND",
|
||||||
"?C=N&O=A",
|
"?C=N&O=A",
|
||||||
"?C=N&O=A"
|
"?C=N&O=A",
|
||||||
|
"?M=A",
|
||||||
|
"?N=D",
|
||||||
|
"?S=A",
|
||||||
|
"?D=A",
|
||||||
)
|
)
|
||||||
FILE_NAME_BLACKLIST = (
|
FILE_NAME_BLACKLIST = (
|
||||||
"Parent Directory",
|
"Parent Directory",
|
||||||
|
@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token")
|
|||||||
|
|
||||||
token = config.CRAWL_SERVER_TOKEN
|
token = config.CRAWL_SERVER_TOKEN
|
||||||
|
|
||||||
tm = TaskManager("tm_db.sqlite3", 32)
|
tm = TaskManager("tm_db.sqlite3", config.CRAWL_SERVER_PROCESSES)
|
||||||
|
|
||||||
|
|
||||||
@auth.verify_token
|
@auth.verify_token
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import config
|
||||||
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
|
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
|
||||||
from multiprocessing import Manager, Pool
|
from multiprocessing import Manager, Pool
|
||||||
from apscheduler.schedulers.background import BackgroundScheduler
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
@ -59,7 +60,7 @@ class TaskManager:
|
|||||||
|
|
||||||
print("Starting task " + task.url)
|
print("Starting task " + task.url)
|
||||||
|
|
||||||
crawler = RemoteDirectoryCrawler(task.url, 20)
|
crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
|
||||||
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
||||||
del crawler
|
del crawler
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
{% block body %}
|
{% block body %}
|
||||||
<div class="container">
|
<div class="container">
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="card-header">Information for {{ website.url }}</div>
|
<div class="card-header">Information for {{ website.url | truncate(80) }}</div>
|
||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
|
|
||||||
<div id="chart-wrapper" style="margin-bottom: 1em">
|
<div id="chart-wrapper" style="margin-bottom: 1em">
|
||||||
@ -15,6 +15,7 @@
|
|||||||
<script src="/static/js/report.js"></script>
|
<script src="/static/js/report.js"></script>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="table-responsive">
|
||||||
<table class="table">
|
<table class="table">
|
||||||
<tr>
|
<tr>
|
||||||
<th>Base url</th>
|
<th>Base url</th>
|
||||||
@ -36,14 +37,18 @@
|
|||||||
<td id="reportTime"></td>
|
<td id="reportTime"></td>
|
||||||
</tr>
|
</tr>
|
||||||
</table>
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
<a href="/website/{{ website.id }}/links" class="btn btn-shadow btn-primary">Link list</a>
|
<a href="/website/{{ website.id }}/links" class="btn btn-shadow btn-primary">Link list</a>
|
||||||
<a href="/website/{{ website.id }}/json_chart" class="btn btn-shadow btn-primary">Summary (JSON)</a>
|
<a href="/website/{{ website.id }}/json_chart" class="btn btn-shadow btn-primary">Summary (JSON)</a>
|
||||||
{% if "username" in session %}
|
{% if "username" in session %}
|
||||||
<a href="/website/{{ website.id }}/clear" class="btn btn-danger"><i class="fas fa-exclamation"></i> Clear</a>
|
<a href="/website/{{ website.id }}/clear" class="btn btn-danger"><i class="fas fa-exclamation"></i>
|
||||||
<a href="/website/{{ website.id }}/delete" class="btn btn-danger"><i class="fas fa-trash"></i> Delete</a>
|
Clear</a>
|
||||||
<a href="/website/{{ website.id }}/rescan" class="btn btn-secondary"><i class="fas fa-redo"></i> rescan</a>
|
<a href="/website/{{ website.id }}/delete" class="btn btn-danger"><i class="fas fa-trash"></i>
|
||||||
|
Delete</a>
|
||||||
|
<a href="/website/{{ website.id }}/rescan" class="btn btn-secondary"><i class="fas fa-redo"></i>
|
||||||
|
rescan</a>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
{% for website in websites %}
|
{% for website in websites %}
|
||||||
<tr>
|
<tr>
|
||||||
<td><a title="Click for stats & link list" href="/website/{{ website[0] }}">{{ website[1] }}</a></td>
|
<td><a title="Click for stats & link list" href="/website/{{ website[0] }}">{{ website[1] | truncate(70) }}</a></td>
|
||||||
<td>{{ website[2] }}</td>
|
<td>{{ website[2] }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user