Mirror of https://github.com/simon987/od-database.git, synced 2025-04-20 02:46:45 +00:00

Commit c61f51cb08 (parent: 7f121d2ac0)

    add kibana & update README.md

Changed files: README.md (78 lines changed), docker-compose.yml
README.md:

@@ -1,7 +1,5 @@
 # OD-Database
 
-[](https://ci.simon987.net/job/od-database_qa/)
-
 OD-Database is a web-crawling project that aims to index a very large number of file links and their basic metadata from open directories (misconfigured Apache/Nginx/FTP servers, or more often, mirrors of various public services).
 
 Each crawler instance fetches tasks from the central server and pushes the result once completed. A single instance can crawl hundreds of websites at the same time (both FTP and HTTP(S)), and the central server is capable of ingesting thousands of new documents per second.
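The fetch-task/push-result cycle described in that paragraph is coordinated by simon987's task_tracker. A rough bash sketch of one crawler iteration is below; the `/task/get` and `/task/release` paths, the response shape, and the `crawl` function are illustrative assumptions, not the real task_tracker API (the go crawler linked later in the README implements the actual protocol). Requires curl and jq.

```bash
#!/usr/bin/env bash
# Hypothetical sketch of the fetch -> crawl -> push cycle, NOT the real API.
# TT_API and the project id come from the project's old sample config.
TT_API="http://localhost:3010"
TT_CRAWL_PROJECT=3

crawl() {            # stand-in for the actual crawler logic
  echo "{\"url\": \"$1\", \"files\": []}"
}

while true; do
  # fetch one pending crawl task from the central server
  task=$(curl -s "$TT_API/task/get?project=$TT_CRAWL_PROJECT") || break
  url=$(jq -r '.content.url' <<< "$task")

  # crawl the open directory, then push the result set back for indexing
  crawl "$url" | curl -s -X POST "$TT_API/task/release" --data-binary @-
done
```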
@@ -14,82 +12,22 @@ The data is indexed into elasticsearch and made available via the web frontend (
 ### Contributing
 Suggestions/concerns/PRs are welcome
 
-## Installation
-Assuming you have Python 3 and git installed:
+## Installation (Docker)
 ```bash
-sudo apt install libssl-dev libcurl4-openssl-dev
 git clone https://github.com/simon987/od-database
 cd od-database
-git submodule update --init --recursive
-sudo pip3 install -r requirements.txt
+docker-compose up
 ```
-Create `/config.py` and fill out the parameters. Sample config:
-```python
-# Leave default values for no CAPTCHAs
-CAPTCHA_LOGIN = False
-CAPTCHA_SUBMIT = False
-CAPTCHA_SEARCH = False
-CAPTCHA_EVERY = 10
-
-# Flask secret key for sessions
-FLASK_SECRET = ""
-RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
-# Allow ftp websites in /submit
-SUBMIT_FTP = False
-# Allow http(s) websites in /submit
-SUBMIT_HTTP = True
-
-# Number of re-crawl tasks to keep in the queue
-RECRAWL_POOL_SIZE = 10000
-# task_tracker API url
-TT_API = "http://localhost:3010"
-# task_tracker crawl project id
-TT_CRAWL_PROJECT = 3
-# task_tracker indexing project id
-TT_INDEX_PROJECT = 9
-# Number of threads to use for ES indexing
-INDEXER_THREADS = 4
-
-# ws_bucket API url
-WSB_API = "http://localhost:3020"
-# ws_bucket secret
-WSB_SECRET = "default_secret"
-# ws_bucket data directory
-WSB_PATH = "/mnt/data/github.com/simon987/ws_bucket/data"
-# od-database PostgreSQL connection string
-DB_CONN_STR = "dbname=od-database user=od-database password=xxx"
-```
+
+## Architecture
+[architecture diagram image]
 
 ## Running the crawl server
 The python crawler that was a part of this project is discontinued,
 [the go implementation](https://github.com/terorie/od-database-crawler) is currently in use.
 
-## Running the web server (debug)
-```bash
-cd od-database
-python3 app.py
-```
-
-## Running the web server with Nginx (production)
-* Install dependencies:
-```bash
-sudo apt install build-essential python-dev redis-server uwsgi-plugin-python3
-```
-* Configure nginx (on Debian 9: `/etc/nginx/sites-enabled/default`):
-```nginx
-server {
-    ...
-    include uwsgi_params;
-    location / {
-        uwsgi_pass 127.0.0.1:3031;
-    }
-    ...
-}
-```
-
-* Configure Elasticsearch
+### Configure Elasticsearch
 ```
 PUT _template/default
 {
@@ -102,9 +40,3 @@ PUT _template/default
 "routing_partition_size" : 5
 }
 }
-```
-
-* Start uwsgi:
-```bash
-uwsgi od-database.ini
-```
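The `PUT _template/default` snippet kept in the README is Kibana console syntax. Against the dockerized node, the same request could be made with curl; `default_template.json` here stands in for the full template body, which the diff truncates between the two hunks above:

```bash
# Apply the index template shown above to the dockerized Elasticsearch node.
# default_template.json is a placeholder for the full template body from the
# README (elided between the two hunks of this diff).
curl -X PUT "http://localhost:9200/_template/default" \
     -H "Content-Type: application/json" \
     --data-binary @default_template.json
```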
docker-compose.yml:

@@ -39,8 +39,6 @@ services:
     environment:
       - "POSTGRES_USER=od_database"
       - "POSTGRES_PASSWORD=changeme"
-    ports:
-      - 5021:5432
     healthcheck:
       test: ["CMD-SHELL", "pg_isready -U od_database"]
       interval: 5s
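This hunk stops publishing postgres on host port 5021 (the port is reused for Kibana in the next hunk), so the database is now reachable only on the compose network. A psql shell can still be opened through compose; the service name below is an assumption, since the hunk does not show it:

```bash
# The 5021:5432 publish is gone; connect through the compose network instead.
# "oddb_db" is an assumed service name, substitute the one in docker-compose.yml.
docker-compose exec oddb_db psql -U od_database
```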
@@ -77,19 +75,27 @@ services:
       - 3010:80
     depends_on:
       tt_db:
         condition: service_healthy
 
   es:
     image: docker.elastic.co/elasticsearch/elasticsearch:7.4.2
     environment:
       - discovery.type=single-node
       - "ES_JAVA_OPTS=-Xms1G -Xmx10G"
     volumes:
-      - /usr/share/elasticsearch/data
+      - ./es_data:/usr/share/elasticsearch/data
     healthcheck:
       test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"]
       interval: 5s
       timeout: 5s
       retries: 5
+
+  # (Optional)
+  kibana:
+    image: docker.elastic.co/kibana/kibana:7.4.2
+    environment:
+      - ELASTICSEARCH_HOSTS=http://es:9200
+      - xpack.monitoring.collection.enabled=true
+    ports:
+      - 5021:5601
+    depends_on:
+      es:
+        condition: service_healthy
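After `docker-compose up`, one plausible way to verify the stack from the host is to run the same probe the `es` healthcheck uses, plus Kibana's status endpoint through the `5021:5601` mapping added in this commit:

```bash
# Same probe the es service healthcheck runs, from the host:
curl --silent --fail localhost:9200/_cluster/health

# Kibana status endpoint, via the 5021:5601 port mapping added here:
curl --silent --fail localhost:5021/api/status
```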