mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-25 19:56:51 +00:00 
			
		
		
		
	
						commit
						853e38e46b
					
				
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -4,9 +4,9 @@ | ||||
| __pycache__/ | ||||
| captchas/ | ||||
| _stats.json | ||||
| config.py | ||||
| oddb.log | ||||
| praw.ini | ||||
| env/ | ||||
| worker.json | ||||
| search_blacklist.txt | ||||
| *.iml | ||||
|  | ||||
							
								
								
									
										11
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,11 @@ | ||||
| FROM python:3.7 | ||||
| 
 | ||||
| WORKDIR /app | ||||
| 
 | ||||
| ADD requirements.txt /app/requirements.txt | ||||
| RUN pip install -r requirements.txt | ||||
| 
 | ||||
| ENTRYPOINT ["python", "app.py"] | ||||
| 
 | ||||
| COPY . /app | ||||
| 
 | ||||
							
								
								
									
										78
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										78
									
								
								README.md
									
									
									
									
									
								
							| @ -1,7 +1,5 @@ | ||||
| # OD-Database | ||||
| 
 | ||||
| [](https://ci.simon987.net/job/od-database_qa/) | ||||
| 
 | ||||
| OD-Database is a web-crawling project that aims to index a very large number of file links and their basic metadata from open directories (misconfigured Apache/Nginx/FTP servers, or more often, mirrors of various public services). | ||||
| 
 | ||||
| Each crawler instance fetches tasks from the central server and pushes the results once each task is complete. A single instance can crawl hundreds of websites at the same time (both FTP and HTTP(S)), and the central server is capable of ingesting thousands of new documents per second. | ||||
| @ -14,82 +12,22 @@ The data is indexed into elasticsearch and made available via the web frontend ( | ||||
| ### Contributing    | ||||
| Suggestions/concerns/PRs are welcome | ||||
| 
 | ||||
| ## Installation | ||||
| Assuming you have Python 3 and git installed: | ||||
| ## Installation (Docker) | ||||
| ```bash | ||||
| sudo apt install libssl-dev libcurl4-openssl-dev | ||||
| git clone https://github.com/simon987/od-database | ||||
| cd od-database | ||||
| git submodule update --init --recursive | ||||
| sudo pip3 install -r requirements.txt | ||||
| docker-compose up | ||||
| ``` | ||||
| Create `/config.py` and fill out the parameters. Sample config: | ||||
| ```python | ||||
| # Leave default values for no CAPTCHAs | ||||
| CAPTCHA_LOGIN = False | ||||
| CAPTCHA_SUBMIT = False | ||||
| CAPTCHA_SEARCH = False | ||||
| CAPTCHA_EVERY = 10 | ||||
| 
 | ||||
| # Flask secret key for sessions | ||||
| FLASK_SECRET = "" | ||||
| RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000) | ||||
| # Allow ftp websites in /submit | ||||
| SUBMIT_FTP = False | ||||
| # Allow http(s) websites in /submit | ||||
| SUBMIT_HTTP = True | ||||
| ## Architecture | ||||
| 
 | ||||
| # Number of re-crawl tasks to keep in the queue | ||||
| RECRAWL_POOL_SIZE = 10000 | ||||
| # task_tracker API url | ||||
| TT_API = "http://localhost:3010" | ||||
| # task_tracker crawl project id | ||||
| TT_CRAWL_PROJECT = 3 | ||||
| # task_tracker indexing project id | ||||
| TT_INDEX_PROJECT = 9 | ||||
| # Number of threads to use for ES indexing | ||||
| INDEXER_THREADS = 4 | ||||
| 
 | ||||
| # ws_bucket API url | ||||
| WSB_API = "http://localhost:3020" | ||||
| # ws_bucket secret | ||||
| WSB_SECRET = "default_secret" | ||||
| # ws_bucket data directory | ||||
| WSB_PATH = "/mnt/data/github.com/simon987/ws_bucket/data" | ||||
| # od-database PostgreSQL connection string | ||||
| DB_CONN_STR = "dbname=od-database user=od-database password=xxx" | ||||
| ``` | ||||
|  | ||||
| 
 | ||||
| ## Running the crawl server | ||||
| The Python crawler that was part of this project is discontinued; | ||||
| [the Go implementation](https://github.com/terorie/od-database-crawler) is currently in use. | ||||
| 
 | ||||
| ## Running the web server (debug) | ||||
| ```bash | ||||
| cd od-database | ||||
| python3 app.py | ||||
| ``` | ||||
| 
 | ||||
| ## Running the web server with Nginx (production) | ||||
| * Install dependencies: | ||||
| ```bash | ||||
| sudo apt install build-essential python-dev redis-server uwsgi-plugin-python3 | ||||
| ``` | ||||
| * Configure nginx (on Debian 9: `/etc/nginx/sites-enabled/default`): | ||||
| ```nginx | ||||
| server { | ||||
|         ... | ||||
| 
 | ||||
|         include uwsgi_params; | ||||
|         location / { | ||||
|                 uwsgi_pass 127.0.0.1:3031; | ||||
|         } | ||||
|          | ||||
|         ... | ||||
| } | ||||
| ``` | ||||
| 
 | ||||
| * Configure Elasticsearch | ||||
| ### Configure Elasticsearch | ||||
| ``` | ||||
| PUT _template/default | ||||
| { | ||||
| @ -102,9 +40,3 @@ PUT _template/default | ||||
|     "routing_partition_size" : 5 | ||||
|   } | ||||
| } | ||||
| ``` | ||||
| * Start uwsgi: | ||||
| ```bash | ||||
| uwsgi od-database.ini | ||||
| ``` | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										3
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								app.py
									
									
									
									
									
								
							| @ -9,9 +9,8 @@ app = Flask(__name__) | ||||
| app.secret_key = config.FLASK_SECRET | ||||
| template_filters.setup_template_filters(app) | ||||
| 
 | ||||
| 
 | ||||
| views.setup_views(app) | ||||
| api.setup_api(app) | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     app.run("0.0.0.0", port=12345, threaded=True) | ||||
|     app.run("0.0.0.0", port=80, threaded=True) | ||||
|  | ||||
| @ -26,14 +26,13 @@ logger.addHandler(file_handler) | ||||
| logger.addHandler(StreamHandler(sys.stdout)) | ||||
| 
 | ||||
| taskManager = TaskManager() | ||||
| searchEngine = ElasticSearchEngine("od-database") | ||||
| searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX) | ||||
| searchEngine.start_stats_scheduler() | ||||
| db = Database(config.DB_CONN_STR) | ||||
| 
 | ||||
| redis = r.Redis() | ||||
| redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT) | ||||
| 
 | ||||
| 
 | ||||
| def require_role(role: str): | ||||
| 
 | ||||
|     if db.get_user_role(session.get("username", None)) != role: | ||||
|         abort(403) | ||||
|  | ||||
							
								
								
									
										29
									
								
								config.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								config.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,29 @@ | ||||
"""Runtime configuration for od-database, read from environment variables.

Every setting has a default, so the module can be imported with an empty
environment. Values coming from the environment are always strings, so
booleans and integers are parsed explicitly instead of being used raw.
"""
from os import environ


def env_flag(name, default=False):
    """Return the boolean value of environment variable *name*.

    ``bool("False")`` is ``True`` because any non-empty string is truthy, so
    the raw value cannot be handed to ``bool()``. An unset variable yields
    *default*; the spellings "", "0", "false", "no" and "off" (in any case)
    yield ``False``; every other value yields ``True``.
    """
    raw = environ.get(name)
    if raw is None:
        return bool(default)
    return raw.strip().lower() not in ("", "0", "false", "no", "off")


def env_int(name, default):
    """Return environment variable *name* as an ``int``, or *default* if unset.

    Raises ``ValueError`` when the variable is set to a non-integer string.
    """
    return int(environ.get(name, default))


CAPTCHA_LOGIN = env_flag("CAPTCHA_LOGIN", False)
CAPTCHA_SUBMIT = env_flag("CAPTCHA_SUBMIT", False)
CAPTCHA_SEARCH = env_flag("CAPTCHA_SEARCH", False)
CAPTCHA_EVERY = env_int("CAPTCHA_EVERY", 10)

# NOTE(review): the fallback secret is a fixed, publicly known string; set
# FLASK_SECRET explicitly for any deployment that is not a local test.
FLASK_SECRET = environ.get("FLASK_SECRET", "A very secret secret")
RESULTS_PER_PAGE = (12, 25, 50, 100, 250, 500, 1000)

SUBMIT_FTP = env_flag("SUBMIT_FTP", False)
SUBMIT_HTTP = env_flag("SUBMIT_HTTP", True)

TT_API = environ.get("TT_API", "http://localhost:3010")
TT_CRAWL_PROJECT = env_int("TT_CRAWL_PROJECT", 3)
TT_INDEX_PROJECT = env_int("TT_INDEX_PROJECT", 9)

WSB_API = environ.get("WSB_API", "http://localhost:3020")
# Read the secret from its own variable (previously this looked up WSB_API,
# which made the secret equal to the API URL whenever WSB_API was set).
WSB_SECRET = environ.get("WSB_SECRET", "default_secret")

ES_URL = environ.get("ES_URL", "http://localhost:9200")
ES_INDEX = environ.get("ES_INDEX", "od-database")

REDIS_HOST = environ.get("REDIS_HOST", "localhost")
# Parsed as int so the value has the same type whether or not it is overridden.
REDIS_PORT = env_int("REDIS_PORT", 6379)

DB_CONN_STR = environ.get("DB_CONN_STR", "dbname=od_database user=od_database password=od_database")
RECRAWL_POOL_SIZE = env_int("RECRAWL_POOL_SIZE", 10000)
# INDEXER_THREADS is the documented name; the singular INDEXER_THREAD is still
# honoured as a fallback because earlier versions of this module read it.
INDEXER_THREADS = env_int("INDEXER_THREADS", env_int("INDEXER_THREAD", 3))
							
								
								
									
										101
									
								
								docker-compose.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										101
									
								
								docker-compose.yml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,101 @@ | ||||
| version: "2.1" | ||||
| services: | ||||
|   oddb: | ||||
|     image: simon987/od-database | ||||
|     ports: | ||||
|       - 5020:80 | ||||
|     environment: | ||||
|       - "CAPTCHA_LOGIN=True" | ||||
|       - "CAPTCHA_SUBMIT=True" | ||||
|       - "CAPTCHA_SEARCH=True" | ||||
|       - "CAPTCHA_EVERY=10" | ||||
|       - "FLASK_SECRET=changeme" | ||||
|       - "SUBMIT_FTP=False" | ||||
|       - "SUBMIT_HTTP=True" | ||||
|       - "TT_API=http://tt:3010" | ||||
|       - "TT_CRAWL_PROJECT=1" | ||||
|       - "TT_INDEX_PROJECT=2" | ||||
|       - "WSB_API=http://wsb:3020" | ||||
|       - "WSB_SECRET=changeme" | ||||
|       - "REDIS_HOST=oddb_redis" | ||||
|       - "ES_URL=es:9200" | ||||
|       - "DB_CONN_STR=postgres://od_database:changeme@oddb_db/od_database?sslmode=disable" | ||||
|       - "RECRAWL_POOL_SIZE=10000" | ||||
|       - "INDEXER_THREADS=2" | ||||
|     depends_on: | ||||
|       wsb: | ||||
|         condition: service_started | ||||
|       tt: | ||||
|         condition: service_started | ||||
|       oddb_db: | ||||
|         condition: service_healthy | ||||
|       es: | ||||
|         condition: service_healthy | ||||
|     restart: always | ||||
|   oddb_db: | ||||
|     image: postgres | ||||
|     volumes: | ||||
|       - ./oddb_pg_data:/var/lib/postgresql/data | ||||
|     environment: | ||||
|       - "POSTGRES_USER=od_database" | ||||
|       - "POSTGRES_PASSWORD=changeme" | ||||
|     healthcheck: | ||||
|       test: ["CMD-SHELL", "pg_isready -U od_database"] | ||||
|       interval: 5s | ||||
|       timeout: 5s | ||||
|       retries: 5 | ||||
|   oddb_redis: | ||||
|     image: redis | ||||
|   wsb: | ||||
|     image: simon987/wsb_bucket | ||||
|     volumes: | ||||
|       - ./wsb_data:/data | ||||
|     environment: | ||||
|       - "WS_BUCKET_SECRET=changeme" | ||||
|     ports: | ||||
|       - 3020:3020 | ||||
|   tt_db: | ||||
|     image: postgres | ||||
|     volumes: | ||||
|       - ./tt_pg_data:/var/lib/postgresql/data | ||||
|     environment: | ||||
|       POSTGRES_USER: task_tracker | ||||
|       POSTGRES_PASSWORD: changeme | ||||
|     healthcheck: | ||||
|       test: ["CMD-SHELL", "pg_isready -U task_tracker"] | ||||
|       interval: 5s | ||||
|       timeout: 5s | ||||
|       retries: 5 | ||||
|   tt: | ||||
|     image: simon987/task_tracker | ||||
|     volumes: | ||||
|       - ./tt_pg_data:/var/lib/postgresql/data | ||||
|       - ./tt_config.yml:/root/config.yml | ||||
|     ports: | ||||
|       - 3010:80 | ||||
|     depends_on: | ||||
|       tt_db: | ||||
|         condition: service_healthy | ||||
|   es: | ||||
|     image: docker.elastic.co/elasticsearch/elasticsearch:7.4.2 | ||||
|     environment: | ||||
|       - discovery.type=single-node | ||||
|       - "ES_JAVA_OPTS=-Xms1G -Xmx10G" | ||||
|     volumes: | ||||
|       - ./es_data:/usr/share/elasticsearch/data | ||||
|     healthcheck: | ||||
|       test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"] | ||||
|       interval: 5s | ||||
|       timeout: 5s | ||||
|       retries: 5 | ||||
|   # (Optional) | ||||
|   kibana: | ||||
|     image: docker.elastic.co/kibana/kibana:7.4.2 | ||||
|     environment: | ||||
|       - ELASTICSEARCH_HOSTS=http://es:9200 | ||||
|       - xpack.monitoring.collection.enabled=true | ||||
|     ports: | ||||
|       - 5021:5601 | ||||
|     depends_on: | ||||
|       es: | ||||
|         condition: service_healthy | ||||
| @ -28,7 +28,7 @@ for file in os.listdir(dldir): | ||||
| print("Export started, connecting to databases...") | ||||
| 
 | ||||
| db = Database(config.DB_CONN_STR) | ||||
| es = ElasticSearchEngine("od-database") | ||||
| es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX) | ||||
| 
 | ||||
| docs_with_url = db.join_website_url(es.stream_all_docs()) | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										50
									
								
								jenkins/Jenkinsfile
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										50
									
								
								jenkins/Jenkinsfile
									
									
									
									
										vendored
									
									
								
							| @ -1,50 +0,0 @@ | ||||
| def remote = [:] | ||||
| remote.name = 'remote' | ||||
| remote.host = env.DEPLOY_HOST | ||||
| remote.user = env.DEPLOY_USER | ||||
| remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa' | ||||
| remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts' | ||||
| 
 | ||||
| pipeline { | ||||
|     agent any | ||||
|     stages { | ||||
|         stage('Build') { | ||||
| 			steps { | ||||
| 				sh './jenkins/build.sh' | ||||
| 			} | ||||
|         } | ||||
|         stage('Deploy') { | ||||
|             steps { | ||||
| 				sh 'echo $ODDB_CONFIG > config.py' | ||||
| 				sshCommand remote: remote, command: "cd od-database && rm -rf env fold_to_ascii search static task_tracker_drone templates ws_bucket_client *.py deploy.sh" | ||||
| 				sshPut remote: remote, from: 'requirements.txt', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'fold_to_ascii', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'search', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'static', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'task_tracker_drone', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'templates', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'ws_bucket_client', into: 'od-database' | ||||
| 				sshPut remote: remote, from: '__init__.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'api.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'app.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'captcha.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'common.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'database.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'export.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'init_script.sql', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'od_util.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'reddit_bot.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'tasks.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'template_filters.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'uwsgi.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'views.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'config.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'mass_import.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'od-database.ini', into: 'od-database' | ||||
| 				sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database' | ||||
| 				sshCommand remote: remote, command: 'chmod +x od-database/deploy.sh && ./od-database/deploy.sh' | ||||
|             } | ||||
|         } | ||||
|    	} | ||||
| } | ||||
| @ -1,4 +0,0 @@ | ||||
| #!/bin/bash | ||||
| 
 | ||||
| git submodule init | ||||
| git submodule update --remote --recursive | ||||
| @ -1,32 +0,0 @@ | ||||
| #!/bin/bash | ||||
| 
 | ||||
| export ODDBROOT="od-database" | ||||
| 
 | ||||
| virtualenv ${ODDBROOT}/env -p python3.7 | ||||
| source ${ODDBROOT}/env/bin/activate | ||||
| pip install -r ${ODDBROOT}/requirements.txt | ||||
| 
 | ||||
| screen -S oddb_web -X quit | ||||
| killall -9 uwsgi | ||||
| 
 | ||||
| sleep 5 | ||||
| 
 | ||||
| echo "starting oddb_web" | ||||
| screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && uwsgi od-database.ini 2> stderr.txt" | ||||
| sleep 1 | ||||
| screen -list | ||||
| 
 | ||||
| echo "Installing crontabs" | ||||
| absolute_dir=$(cd ${ODDBROOT} && pwd) | ||||
| 
 | ||||
| # Re-crawl dirs | ||||
| command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\"" | ||||
| job="*/10 * * * * $command" | ||||
| echo "$job" | ||||
| cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab - | ||||
| 
 | ||||
| # Cleanup captchas | ||||
| command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\"" | ||||
| job="*/60 * * * * $command" | ||||
| echo "$job" | ||||
| cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab - | ||||
| @ -1,10 +0,0 @@ | ||||
| [uwsgi] | ||||
| uwsgi-socket = 127.0.0.1:3031 | ||||
| wsgi-file = uwsgi.py | ||||
| processes = 2 | ||||
| threads = 16 | ||||
| stats = 127.0.0.1:9191 | ||||
| callable=app | ||||
| virtualenv=./env | ||||
| 
 | ||||
| disable-logging=True | ||||
| @ -18,7 +18,6 @@ lxml | ||||
| pillow | ||||
| Wand | ||||
| numpy | ||||
| matplotlib | ||||
| uwsgi | ||||
| redis | ||||
| psycopg2-binary | ||||
|  | ||||
| @ -2,9 +2,8 @@ import os | ||||
| import time | ||||
| from urllib.parse import urljoin | ||||
| 
 | ||||
| import ujson | ||||
| 
 | ||||
| import elasticsearch | ||||
| import ujson | ||||
| from apscheduler.schedulers.background import BackgroundScheduler | ||||
| from elasticsearch import helpers | ||||
| 
 | ||||
| @ -20,32 +19,7 @@ class IndexingError(Exception): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| class SearchEngine: | ||||
| 
 | ||||
|     def __init__(self): | ||||
|         pass | ||||
| 
 | ||||
|     def import_json(self, in_str: str, website_id: int): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, | ||||
|                date_max) -> {}: | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def reset(self): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def ping(self): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def get_stats(self, website_id: int, subdir: str = None): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def refresh(self): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
| 
 | ||||
| class ElasticSearchEngine(SearchEngine): | ||||
| class ElasticSearchEngine: | ||||
|     SORT_ORDERS = { | ||||
|         "score": ["_score"], | ||||
|         "size_asc": [{"size": {"order": "asc"}}], | ||||
| @ -55,10 +29,11 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         "none": [] | ||||
|     } | ||||
| 
 | ||||
|     def __init__(self, index_name): | ||||
|     def __init__(self, url, index_name): | ||||
|         super().__init__() | ||||
|         self.index_name = index_name | ||||
|         self.es = elasticsearch.Elasticsearch() | ||||
|         logger.info("Connecting to ES @ %s" % url) | ||||
|         self.es = elasticsearch.Elasticsearch(hosts=[url]) | ||||
|         self.filter = SearchFilter() | ||||
| 
 | ||||
|         if not self.es.indices.exists(self.index_name): | ||||
| @ -73,28 +48,31 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         logger.info("Elasticsearch first time setup") | ||||
|         if self.es.indices.exists(self.index_name): | ||||
|             self.es.indices.delete(index=self.index_name) | ||||
|         self.es.indices.create(index=self.index_name) | ||||
|         self.es.indices.close(index=self.index_name) | ||||
| 
 | ||||
|         # Index settings | ||||
|         self.es.indices.put_settings(body={ | ||||
|             "analysis": { | ||||
|                 "tokenizer": { | ||||
|                     "my_nGram_tokenizer": { | ||||
|                         "type": "nGram", "min_gram": 3, "max_gram": 3 | ||||
|                     } | ||||
|                 } | ||||
|             }}, index=self.index_name) | ||||
|         self.es.indices.put_settings(body={ | ||||
|         self.es.indices.create(index=self.index_name, body={ | ||||
|             "settings": { | ||||
|                 "index": { | ||||
|                     "number_of_shards": 50, | ||||
|                     "number_of_replicas": 0, | ||||
|                     "refresh_interval": "30s", | ||||
|                     "codec": "best_compression" | ||||
|                 }, | ||||
|                 "analysis": { | ||||
|                     "analyzer": { | ||||
|                         "my_nGram": { | ||||
|                             "tokenizer": "my_nGram_tokenizer", | ||||
|                             "filter": ["lowercase", "asciifolding"] | ||||
|                         } | ||||
|                     }, | ||||
|                     "tokenizer": { | ||||
|                         "my_nGram_tokenizer": { | ||||
|                             "type": "nGram", "min_gram": 3, "max_gram": 3 | ||||
|                         } | ||||
|             }}, index=self.index_name) | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         }) | ||||
| 
 | ||||
|         # Index Mappings | ||||
|         self.es.indices.put_mapping(body={ | ||||
|             "properties": { | ||||
|                 "path": {"analyzer": "standard", "type": "text"}, | ||||
| @ -110,12 +88,6 @@ class ElasticSearchEngine(SearchEngine): | ||||
| 
 | ||||
|         self.es.indices.open(index=self.index_name) | ||||
| 
 | ||||
|     def reset(self): | ||||
|         self.init() | ||||
| 
 | ||||
|     def ping(self): | ||||
|         return self.es.ping() | ||||
| 
 | ||||
|     def delete_docs(self, website_id): | ||||
| 
 | ||||
|         while True: | ||||
| @ -332,7 +304,8 @@ class ElasticSearchEngine(SearchEngine): | ||||
|             yield urljoin(base_url, "/") + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \ | ||||
|                   ("." if src["ext"] != "" else "") + src["ext"] | ||||
| 
 | ||||
|     def get_global_stats(self): | ||||
|     @staticmethod | ||||
|     def get_global_stats(): | ||||
| 
 | ||||
|         if os.path.exists("_stats.json"): | ||||
|             with open("_stats.json", "r") as f: | ||||
| @ -489,7 +462,7 @@ class ElasticSearchEngine(SearchEngine): | ||||
|             "query": { | ||||
|                 "match_all": {} | ||||
|             } | ||||
|         }, scroll="1m", client=self.es, index=self.index_name, request_timeout=60) | ||||
|         }, scroll="30s", client=self.es, index=self.index_name, request_timeout=30) | ||||
| 
 | ||||
|     def refresh(self): | ||||
|         self.es.indices.refresh(self.index_name) | ||||
|  | ||||
							
								
								
									
										26
									
								
								tasks.py
									
									
									
									
									
								
							
							
						
						
									
										26
									
								
								tasks.py
									
									
									
									
									
								
							| @ -3,9 +3,11 @@ import logging | ||||
| import os | ||||
| import time | ||||
| from multiprocessing.pool import ThreadPool | ||||
| from tempfile import NamedTemporaryFile | ||||
| from threading import Thread | ||||
| from uuid import uuid4 | ||||
| 
 | ||||
| import requests | ||||
| import urllib3 | ||||
| 
 | ||||
| import config | ||||
| @ -60,13 +62,13 @@ class IndexingTask: | ||||
| class TaskManager: | ||||
| 
 | ||||
|     def __init__(self): | ||||
|         self.search = ElasticSearchEngine("od-database") | ||||
|         self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX) | ||||
|         self.db = database.Database(config.DB_CONN_STR) | ||||
|         self.tracker = TaskTrackerApi(config.TT_API) | ||||
| 
 | ||||
|         self.worker = Worker.from_file(self.tracker) | ||||
|         if not self.worker: | ||||
|             self.worker = self.tracker.make_worker("oddb_master") | ||||
|             self.worker = self.tracker.make_worker("$oddb_master") | ||||
|             self.worker.dump_to_file() | ||||
|             self.worker.request_access(config.TT_CRAWL_PROJECT, False, True) | ||||
|             self.worker.request_access(config.TT_INDEX_PROJECT, True, False) | ||||
| @ -91,8 +93,9 @@ class TaskManager: | ||||
|                 try: | ||||
|                     recipe = task.json_recipe() | ||||
|                     logger.debug("Got indexing task: " + str(recipe)) | ||||
|                     filename = os.path.join(config.WSB_PATH, | ||||
|                                             format_file_name(recipe["website_id"], recipe["upload_token"])) | ||||
| 
 | ||||
|                     filename = download_file(config.WSB_API + "/slot?token=" + recipe["upload_token"]) | ||||
| 
 | ||||
|                     self._complete_task(filename, Task(recipe["website_id"], recipe["url"])) | ||||
|                 except Exception as e: | ||||
|                     self.worker.release_task(task_id=task.id, result=1, verification=0) | ||||
| @ -167,3 +170,18 @@ class TaskManager: | ||||
| 
 | ||||
| def format_file_name(website_id, token): | ||||
|     return "%d_%s.NDJSON" % (website_id, token,) | ||||
| 
 | ||||
| 
 | ||||
def download_file(url):
    """Download *url* into a temporary file and return that file's path.

    The response body is streamed to disk in 4096-byte chunks rather than
    being held in memory. The temporary file is created with
    ``delete=False``, so it outlives this call and the caller is responsible
    for removing it.

    :param url: address to fetch with an HTTP GET request
    :return: path of the temporary file holding the response body
    :raises ValueError: if the server answers with a status other than 200
    """
    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True) as r:
        if r.status_code != 200:
            raise ValueError("HTTP error %d: %s" % (r.status_code, url))

        tmp = NamedTemporaryFile(delete=False)
        try:
            with tmp:
                for chunk in r.iter_content(chunk_size=4096):
                    if chunk:
                        tmp.write(chunk)
        except Exception:
            # Do not leave a truncated download behind if the transfer fails.
            os.remove(tmp.name)
            raise

    return tmp.name
|  | ||||
| @ -26,12 +26,8 @@ | ||||
|                 <form action="/search" id="sfrm"> | ||||
| 
 | ||||
|                     <div class="form-row"> | ||||
|                         <div class="col-md-11"> | ||||
|                             <input class="form-control" name="q" id="q" placeholder="Query"> | ||||
|                         </div> | ||||
|                         <div class="col-md-1"> | ||||
|                             <input class="btn btn-primary btn-shadow" type="submit" value="Search"> | ||||
|                         </div> | ||||
|                         <input class="form-control" style="max-width: calc(100% - 80px);" name="q" id="q" placeholder="Query"> | ||||
|                         <input class="btn btn-primary btn-shadow" type="submit" value="Search" style="margin-left: 3px"> | ||||
|                     </div> | ||||
|                     {% if show_captcha %} | ||||
|                         {{ captcha.get_code()|safe }} | ||||
|  | ||||
							
								
								
									
										0
									
								
								templates/search.html
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										0
									
								
								templates/search.html
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
								
								
									
										24
									
								
								tt_config.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								tt_config.yml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,24 @@ | ||||
| server: | ||||
|   address: "0.0.0.0:3010" | ||||
| 
 | ||||
| database: | ||||
|   conn_str: "postgres://task_tracker:changeme@tt_db/task_tracker?sslmode=disable" | ||||
|   log_levels: ["error", "info", "warn"] | ||||
| 
 | ||||
| git: | ||||
|   webhook_hash: "sha256" | ||||
|   webhook_sig_header: "X-Gogs-Signature" | ||||
| 
 | ||||
| log: | ||||
|   level: "trace" | ||||
| 
 | ||||
| session: | ||||
|   cookie_name: "tt" | ||||
|   expiration: "48h" | ||||
| 
 | ||||
| monitoring: | ||||
|   snapshot_interval: "120s" | ||||
|   history_length: "1800h" | ||||
| 
 | ||||
| maintenance: | ||||
|   reset_timed_out_tasks_interval: "10m" | ||||
							
								
								
									
										9
									
								
								uwsgi.ini
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								uwsgi.ini
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,9 @@ | ||||
| [uwsgi] | ||||
| module = main | ||||
| callable = app | ||||
| 
 | ||||
| enable-threads = true | ||||
| processes = 4 | ||||
| threads = 16 | ||||
| 
 | ||||
| disable-logging = True | ||||
							
								
								
									
										14
									
								
								views.py
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								views.py
									
									
									
									
									
								
							| @ -3,19 +3,24 @@ import os | ||||
| from multiprocessing.pool import Pool | ||||
| from urllib.parse import urlparse | ||||
| 
 | ||||
| from flask import render_template, redirect, request, flash, abort, Response, session | ||||
| from flask_caching import Cache | ||||
| 
 | ||||
| import captcha | ||||
| import config | ||||
| import od_util | ||||
| from common import db, taskManager, searchEngine, logger, require_role | ||||
| from database import Website | ||||
| from flask import render_template, redirect, request, flash, abort, Response, session | ||||
| from flask_caching import Cache | ||||
| from search.search import InvalidQueryException | ||||
| from tasks import Task | ||||
| 
 | ||||
| 
 | ||||
| def setup_views(app): | ||||
|     cache = Cache(app, config={'CACHE_TYPE': 'simple'}) | ||||
|     cache = Cache(app, config={ | ||||
|         "CACHE_TYPE": "redis", | ||||
|         "CACHE_REDIS_HOST": config.REDIS_HOST, | ||||
|         "CACHE_REDIS_PORT": config.REDIS_PORT, | ||||
|     }) | ||||
| 
 | ||||
|     @app.route("/dl") | ||||
|     @cache.cached(120) | ||||
| @ -207,7 +212,8 @@ def setup_views(app): | ||||
|                     flash("Query failed, this could mean that the search server is overloaded or is not reachable. " | ||||
|                           "Please try again later", "danger") | ||||
| 
 | ||||
|                 results = hits["hits"]["total"]["value"] if not isinstance(hits["hits"]["total"], int) else hits["hits"]["total"] if hits else -1 | ||||
|                 results = hits["hits"]["total"]["value"] if not isinstance(hits["hits"]["total"], int) else \ | ||||
|                     hits["hits"]["total"] if hits else -1 | ||||
|                 took = hits["took"] if hits else -1 | ||||
|                 forwarded_for = request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user