mirror of https://github.com/simon987/od-database.git
Commit 4ffe805b8d (parent 00e3fd7340)

    Use task_tracker for task tracking
.gitignore (vendored, 4 changes)

@@ -8,4 +8,6 @@ config.py
 db.sqlite3
 oddb.log
 praw.ini
-tmp/
+env/
+worker.json
+search_blacklist.txt
.gitmodules (vendored, 6 changes)

@@ -1,3 +1,9 @@
 [submodule "fold_to_ascii"]
 	path = fold_to_ascii
 	url = https://github.com/spanishdict/fold_to_ascii
+[submodule "task_tracker_drone"]
+	path = task_tracker_drone
+	url = https://github.com/simon987/task_tracker_drone
+[submodule "ws_bucket_client"]
+	path = ws_bucket_client
+	url = https://github.com/simon987/ws_bucket_client
README.md

@@ -36,9 +36,6 @@ RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
 SUBMIT_FTP = False
 # Allow http(s) websites in /submit
 SUBMIT_HTTP = True
-
-SERVER_URL = "http://localhost/api"
-API_TOKEN = "5817926d-f2f9-4422-a411-a98f1bfe4b6c"
 ```
 
 ## Running the crawl server

@@ -54,7 +51,7 @@ python3 app.py
 ## Running the web server with Nginx (production)
 * Install dependencies:
 ```bash
-sudo apt install build-essential python-dev
+sudo apt install build-essential python-dev redis-server
 sudo pip install uwsgi
 ```
 * Adjust the path in `od-database.ini`
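With SERVER_URL and API_TOKEN dropped from the example config above, the server is instead configured through the task_tracker and ws_bucket settings that the new tasks.py reads. A sketch of the new config.py keys follows: the key names appear in this commit, but every value here is a placeholder assumption.

```python
# config.py additions (key names from this commit; values are placeholders)
TT_API = "https://tt.example.com/api"  # task_tracker API base URL (assumed)
TT_CRAWL_PROJECT = 1                   # task_tracker project crawlers pull from
TT_INDEX_PROJECT = 2                   # task_tracker project the indexer polls
WSB_API = "https://wsb.example.com"    # ws_bucket API base URL (assumed)
WSB_SECRET = "changeme"                # ws_bucket shared secret
WSB_PATH = "/srv/wsb"                  # local directory holding uploaded file lists
```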
(file name not shown in this view)

@@ -1,5 +1,2 @@
-import logging
-from logging import FileHandler, StreamHandler
-
-import sys
api.py (129 changes)

@@ -1,87 +1,15 @@
 import json
-import os
-from threading import Lock
 from uuid import uuid4
 
-from flask import request, abort, Response, send_file, session
+from flask import request, abort, send_file, session
 
-import common as oddb
 import captcha
-from callbacks import PostCrawlCallbackFactory
-from database import Task, Website
+import common as oddb
+from database import Website
 from search.search import InvalidQueryException
-from tasks import TaskResult
-
-uploadLock = Lock()
 
 
 def setup_api(app):
-    @app.route("/api/task/complete", methods=["POST"])
-    def api_complete_task():
-        # TODO: task_tracker
-        token = request.form.get("token")
-        name = oddb.db.check_api_token(token)
-
-        if name:
-            tr = json.loads(request.form.get("result"))
-            oddb.logger.debug("Task result: " + str(tr))
-            task_result = TaskResult(tr["status_code"], tr["file_count"], tr["start_time"], tr["end_time"],
-                                     tr["website_id"])
-
-            oddb.logger.info("Task for " + str(task_result.website_id) + " completed by " + name)
-            task = oddb.db.complete_task(task_result.website_id, name)
-
-            if task:
-
-                filename = "./tmp/" + str(task_result.website_id) + ".json"
-                if not os.path.exists(filename):
-                    filename = None
-                oddb.taskManager.complete_task(filename, task, task_result, name)
-
-                if filename and os.path.exists(filename):
-                    os.remove(filename)
-
-                # Handle task callback
-                callback = PostCrawlCallbackFactory.get_callback(task)
-                if callback:
-                    callback.run(task_result, oddb.search)
-
-                return "Successfully logged task result and indexed files"
-
-            else:
-                oddb.logger.error("ERROR: " + name + " indicated that task for " + str(task_result.website_id) +
-                                  " was completed but there is no such task in the database.")
-                return "No such task"
-        return abort(403)
-
-    @app.route("/api/task/upload", methods=["POST"])
-    def api_upload():
-        token = request.form.get("token")
-        name = oddb.db.check_api_token(token)
-
-        if name:
-            website_id = request.form.get("website_id")
-            oddb.logger.debug("Result part upload for '" + str(website_id) + "' by " + name)
-
-            if "file_list" in request.files:
-                file = request.files['file_list']
-
-                filename = "./tmp/" + str(website_id) + ".json"
-
-                # Read the file into memory cuz if the request fails
-                # no file is corrupted.
-                buf = file.stream.read()
-
-                # Write to file (create if not exists) when
-                # everything read successfully.
-                with uploadLock:
-                    with open(filename, "a+b") as f:
-                        f.write(buf)
-
-                oddb.logger.debug("Written chunk to file")
-            return "ok"
-        else:
-            return abort(403)
-
     @app.route("/api/website/by_url", methods=["GET"])
     def api_website_by_url():

@@ -126,52 +54,6 @@ def setup_api(app):
         else:
             return abort(403)
 
-    @app.route("/api/task/force_enqueue", methods=["POST"])
-    def api_task_enqueue():
-        try:
-            token = request.json["token"]
-        except KeyError:
-            return abort(400)
-
-        name = oddb.db.check_api_token(token)
-
-        if name:
-
-            task = Task(
-                request.json["website_id"],
-                request.json["url"],
-                request.json["priority"],
-                request.json["callback_type"],
-                json.dumps(request.json["callback_args"])
-            )
-
-            oddb.logger.info("API force enqueue by " + name + "\n(" + str(task.to_json()) + ")")
-
-            oddb.taskManager.queue_task(task)
-            return ""
-        else:
-            return abort(403)
-
-    @app.route("/api/task/try_enqueue", methods=["POST"])
-    def api_task_try_enqueue():
-        token = request.form.get("token")
-        name = oddb.db.check_api_token(token)
-
-        if name:
-
-            url = request.form.get("url")
-            # TODO: task_tracker
-            message, result = oddb.try_enqueue(url)
-
-            oddb.logger.info("API try enqueue '" + url + "' by " + name + " (" + message + ")")
-
-            return json.dumps({
-                "message": message,
-                "result": result
-            })
-        else:
-            return abort(403)
-
     @app.route("/api/website/random")
     def api_random_website():
         token = request.json["token"]

@@ -215,9 +97,10 @@ def setup_api(app):
     @app.route("/cap", methods=["GET"])
     def cap():
         word = captcha.make_captcha()
-        cap_id = uuid4()
+        cap_id = uuid4().__str__()
         session["cap"] = cap_id
-        oddb.sessionStore[cap_id] = word
+
+        oddb.redis.set(cap_id, word)
 
         return send_file(captcha.get_path(word), cache_timeout=0)
app.py (2 changes)

@@ -2,8 +2,8 @@ from flask import Flask
 
 import api
 import config
-import views
 import template_filters
+import views
 
 app = Flask(__name__)
 app.secret_key = config.FLASK_SECRET
callbacks.py (73 changes, file deleted)

@@ -1,73 +0,0 @@
-from tasks import Task, TaskResult
-from reddit_bot import RedditBot
-import praw
-from search.search import SearchEngine
-import json
-
-
-class PostCrawlCallback:
-
-    def __init__(self, task: Task):
-        self.task = task
-
-        if self.task.callback_args:
-            self.task.callback_args = json.loads(self.task.callback_args)
-
-    def run(self, task_result: TaskResult, search: SearchEngine):
-        raise NotImplementedError
-
-
-class PostCrawlCallbackFactory:
-
-    @staticmethod
-    def get_callback(task: Task):
-
-        if task.callback_type == "reddit_post":
-            return RedditPostCallback(task)
-
-        elif task.callback_type == "reddit_comment":
-            return RedditCommentCallback(task)
-
-        elif task.callback_type == "discord":
-            return DiscordCallback(task)
-
-
-class RedditCallback(PostCrawlCallback):
-
-    def __init__(self, task: Task):
-        super().__init__(task)
-
-        reddit = praw.Reddit('opendirectories-bot',
-                             user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)')
-        self.reddit_bot = RedditBot("crawled.txt", reddit)
-
-    def run(self, task_result: TaskResult, search: SearchEngine):
-        raise NotImplementedError
-
-
-class RedditPostCallback(RedditCallback):
-
-    def run(self, task_result: TaskResult, search: SearchEngine):
-        print("Reddit post callback for task " + str(self.task))
-
-
-class RedditCommentCallback(RedditCallback):
-
-    def run(self, task_result: TaskResult, search: SearchEngine):
-
-        comment_id = self.task.callback_args["comment_id"]
-        print("Editing comment comment " + comment_id)
-
-        search.refresh()  # Make sure the newly indexed documents are available before commenting
-        stats = search.get_stats(self.task.website_id)
-        message = self.reddit_bot.get_comment(stats, self.task.website_id,
-                                              message="There you go! This website was crawled in `" +
-                                                      str(int(task_result.end_time - task_result.start_time)) + "s`")
-        print(message)
-        self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message)
-
-
-class DiscordCallback(PostCrawlCallback):
-
-    def run(self, task_result: TaskResult, search: SearchEngine):
-        print("Discord callback for task " + str(self.task))
captcha.py (13 changes)

@@ -4,8 +4,8 @@ import string
 from PIL import Image, ImageDraw, ImageFont
 from flask import request, session
 
-import config
 import common as oddb
+import config
 
 
 def get_code():

@@ -36,9 +36,14 @@ def verify():
         request.args.get("cap") if "cap" in request.args else ""
     )
 
-    if "cap" in session and session["cap"] in oddb.sessionStore and oddb.sessionStore[session["cap"]] == attempt:
-        session["cap_remaining"] = config.CAPTCHA_EVERY
-        return True
+    if "cap" in session:
+        expected = oddb.redis.get(session["cap"]).decode("utf8")
+        oddb.redis.delete(session["cap"])
+
+        if expected == attempt:
+            session["cap_remaining"] = config.CAPTCHA_EVERY
+            return True
 
     return False
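The verify() rewrite turns the captcha store into a single-use Redis round trip: /cap writes the expected word under a string UUID key (hence the uuid4().__str__() change in api.py), and verify() reads and deletes it before comparing. A minimal standalone sketch of that flow with redis-py; the key and word here are illustrative:

```python
import redis
from uuid import uuid4

r = redis.Redis()  # same default connection the new common.py creates

# /cap side: remember the expected word under a fresh string key
cap_id = str(uuid4())
r.set(cap_id, "xkcd")  # "xkcd" stands in for captcha.make_captcha()

# verify() side: fetch and delete in one pass so each captcha is single-use
expected = r.get(cap_id).decode("utf8")
r.delete(cap_id)
print(expected == "xkcd")  # True
```

Note that verify() as committed assumes the key exists: r.get() returns None for a missing or already-consumed key, and the .decode("utf8") would then raise.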
common.py (10 changes)

@@ -1,12 +1,13 @@
+import logging
+import sys
 from logging import FileHandler, StreamHandler
 
-import sys
+import redis as r
+from flask import session, abort
 
 from database import Database
 from search.search import ElasticSearchEngine
 from tasks import TaskManager
-import logging
-from flask import session, abort
 
 # Disable flask logging
 flaskLogger = logging.getLogger('werkzeug')

@@ -26,8 +27,7 @@ searchEngine = ElasticSearchEngine("od-database")
 searchEngine.start_stats_scheduler()
 db = Database("db.sqlite3")
 
-# temporary hotfix...
-sessionStore = dict()
+redis = r.Redis()
 
 
 def require_role(role: str):
database.py (40 changes)

@@ -1,4 +1,3 @@
-import json
 import os
 import sqlite3
 import uuid

@@ -30,32 +29,6 @@ class ApiClient:
         self.name = name
 
 
-class Task:
-
-    def __init__(self, website_id: int, url: str, priority: int = 1,
-                 callback_type: str = None, callback_args: str = None):
-        self.website_id = website_id
-        self.url = url
-        self.priority = priority
-        self.callback_type = callback_type
-        self.callback_args = json.loads(callback_args) if callback_args else {}
-
-    def to_json(self):
-        return {
-            "website_id": self.website_id,
-            "url": self.url,
-            "priority": self.priority,
-            "callback_type": self.callback_type,
-            "callback_args": json.dumps(self.callback_args)
-        }
-
-    def __str__(self):
-        return json.dumps(self.to_json())
-
-    def __repr__(self):
-        return self.__str__()
-
-
 class Database:
 
     def __init__(self, db_path):

@@ -152,19 +125,6 @@
         website_id = cursor.fetchone()
         return website_id[0] if website_id else None
 
-    def make_task_for_oldest(self, assigned_crawler):
-
-        # TODO: task_tracker
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute("INSERT INTO QUEUE (website_id, url, assigned_crawler) SELECT Website.id, Website.url, ? FROM Website WHERE Website.id not in (SELECT website_id FROM Queue) "
-                           "ORDER BY last_modified ASC LIMIT 1", (assigned_crawler, ))
-
-            cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM Queue "
-                           "WHERE id=LAST_INSERT_ROWID()")
-            task = cursor.fetchone()
-            return Task(task[1], task[2], task[3], task[4], task[5])
-
     def delete_website(self, website_id):
 
         with sqlite3.connect(self.db_path) as conn:
export.py

@@ -1,8 +1,9 @@
-from search.search import ElasticSearchEngine
-from database import Database
 import csv
 import os
 
+from database import Database
+from search.search import ElasticSearchEngine
+
 
 def export(outfile="out.csv"):
fold_to_ascii (submodule)

@@ -1 +1 @@
-Subproject commit 2c8220921dac132e20af98f12ba724c834dcbebf
+Subproject commit d134a0f61ec177b3f2419cd1956603b247ba35b9
jenkins/Jenkinsfile (vendored, new file, 24 lines)

@@ -0,0 +1,24 @@
+def remote = [:]
+remote.name = 'remote'
+remote.host = env.DEPLOY_HOST
+remote.user = env.DEPLOY_USER
+remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa'
+remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts'
+
+pipeline {
+    agent any
+    stages {
+        stage('Build') {
+            steps {
+                sh './jenkins/build.sh'
+                stash includes: 'env/', name: 'env'
+            }
+        }
+        stage('Deploy') {
+            steps {
+                sshPut remote: remote, from: '.', into: 'oddb'
+                sshCommand remote: remote, command: 'chmod +x oddb/deploy.sh && ./oddb/deploy.sh'
+            }
+        }
+    }
+}
jenkins/build.sh (new executable file, 9 lines)

@@ -0,0 +1,9 @@
+#!/bin/bash
+
+virtualenv env --download --clear -p python3.7
+source env/bin/activate
+python --version
+
+pip install -r requirements.txt
+git submodule update --remote --recursive
+deactivate
jenkins/deploy.sh (new executable file, 9 lines)

@@ -0,0 +1,9 @@
+#!/bin/bash
+
+export ODDBROOT="od-database"
+
+screen -S oddb -X quit
+echo "starting oddb_web"
+screen -S tt_drone -d -m bash -c "cd ${ODDBROOT} && uwsgi od-database.ini"
+sleep 1
+screen -list
od-database.ini

@@ -1,8 +1,10 @@
 [uwsgi]
 socket = 127.0.0.1:3031
-chdir = /home/simon/Dropbox/data/CS/python/od-database/
 wsgi-file = uwsgi.py
 processes = 4
 threads = 4
 stats = 127.0.0.1:9191
 callable=app
+virtualenv=./env
+
+disable-logging=True
od_util.py

@@ -1,10 +1,11 @@
-import requests
-from urllib.parse import urljoin, urlparse
-from bs4 import BeautifulSoup
 import os
-import validators
 import re
 from ftplib import FTP
+from urllib.parse import urljoin, urlparse
+
+import requests
+import validators
+from bs4 import BeautifulSoup
 
 # TODO: find a better way to do this
 try:
reddit_bot.py

@@ -1,7 +1,8 @@
 import os
 import time
-import praw
 import humanfriendly
+import praw
+
 
 class RedditBot:
requirements.txt

@@ -21,4 +21,6 @@ lxml
 pillow
 Wand
 numpy
 matplotlib
+uwsgi
+redis
(file name not shown in this view)

@@ -1,6 +1,7 @@
-from search.search import ElasticSearchEngine
 import ujson
 
+from search.search import ElasticSearchEngine
+
 es = ElasticSearchEngine("od-database")
 es.reset()
 
search/__init__.py

@@ -1,7 +1,5 @@
 import logging
-from logging import FileHandler, StreamHandler
-
-import sys
+from logging import FileHandler
 
 logger = logging.getLogger("default")
 logger.setLevel(logging.DEBUG)
search/search.py

@@ -1,11 +1,10 @@
-import itertools
+import os
+import time
+import ujson
+
 import elasticsearch
-import time
-from elasticsearch import helpers
-import os
-import ujson
 from apscheduler.schedulers.background import BackgroundScheduler
+from elasticsearch import helpers
 
 from search import logger
 from search.filter import SearchFilter

@@ -318,8 +317,12 @@ class ElasticSearchEngine(SearchEngine):
                 "includes": ["path", "name", "ext"]
             },
             "query": {
-                "match_all": {}
-            }
+                "constant_score": {
+                    "filter": {
+                        "term": {"website_id": website_id}
+                    }
+                }
+            },
         },
         index=self.index_name, request_timeout=20, routing=website_id)
     for hit in hits:
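Replacing match_all with a constant_score/term query changes what the scan returns: routing=website_id only narrows the search to one shard, and several websites can hash to the same shard, so the term filter, not the routing, is what restricts the hits to the requested website (constant_score simply skips relevance scoring). A standalone sketch of the same query body; the client, index name, and id are illustrative, not from the commit:

```python
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()  # illustrative client; the real code holds its own
website_id = 123      # placeholder id

hits = helpers.scan(es,
                    query={
                        "_source": {"includes": ["path", "name", "ext"]},
                        "query": {
                            "constant_score": {
                                # filter context: no scoring, only membership
                                "filter": {"term": {"website_id": website_id}}
                            }
                        }
                    },
                    index="od-database", request_timeout=20, routing=website_id)
for hit in hits:
    print(hit["_source"])
```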
task_tracker_drone (new submodule)

@@ -0,0 +1 @@
+Subproject commit ef07c059ad30def78547562919422925e7686ca6
tasks.py (118 changes)

@@ -1,12 +1,18 @@
+import json
 import logging
 import os
+import time
+from threading import Thread
+from uuid import uuid4
 
-from apscheduler.schedulers.background import BackgroundScheduler
-from search.search import ElasticSearchEngine
-import json
-import database
 import urllib3
 
+import config
+import database
+from search.search import ElasticSearchEngine
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+from ws_bucket_client.api import WsBucketApi
+
 urllib3.disable_warnings()
 
 logger = logging.getLogger("default")

@@ -15,20 +21,22 @@ logger = logging.getLogger("default")
 class Task:
 
     def __init__(self, website_id: int, url: str, priority: int = 1,
-                 callback_type: str = None, callback_args: str = None):
+                 callback_type: str = None, callback_args: str = None,
+                 upload_token: str = None):
         self.website_id = website_id
         self.url = url
         self.priority = priority
         self.callback_type = callback_type
         self.callback_args = json.loads(callback_args) if callback_args else {}
+        self.upload_token = upload_token
 
     def to_json(self):
         return {
             "website_id": self.website_id,
             "url": self.url,
-            "priority": self.priority,
             "callback_type": self.callback_type,
-            "callback_args": json.dumps(self.callback_args)
+            "callback_args": json.dumps(self.callback_args),
+            "upload_token": self.upload_token
         }
 
     def __str__(self):

@@ -38,25 +46,13 @@ class Task:
         return self.__str__()
 
 
-class TaskResult:
+class IndexingTask:
 
-    def __init__(self, status_code=None, file_count=0, start_time=0,
-                 end_time=0, website_id=0, server_name=""):
-        self.status_code = status_code
-        self.file_count = file_count
-        self.start_time = start_time
-        self.end_time = end_time
-        self.website_id = website_id
-        self.server_name = server_name
-
-    def to_json(self):
-        return {
-            "status_code": self.status_code,
-            "file_count": self.file_count,
-            "start_time": self.start_time,
-            "end_time": self.end_time,
-            "website_id": self.website_id
-        }
+    def __init__(self, website_id: int, file_path: str, callback_type: str, callback_args):
+        self.website_id = website_id
+        self.file_path = file_path
+        self.callback_type = callback_type
+        self.callback_args = callback_args
 
 
 class TaskManager:

@@ -64,14 +60,47 @@ class TaskManager:
     def __init__(self):
         self.search = ElasticSearchEngine("od-database")
         self.db = database.Database("db.sqlite3")
+        self.tracker = TaskTrackerApi(config.TT_API)
 
-    def complete_task(self, file_list, task, task_result, crawler_name):
+        self.worker = Worker.from_file(self.tracker)
+        if not self.worker:
+            self.worker = self.tracker.make_worker("oddb_master")
+            self.worker.dump_to_file()
+        self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
+        self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
+
+        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
+
+        self._indexer_thread = Thread(target=self._do_indexing)
+        self._indexer_thread.start()
+
+    def _do_indexing(self):
+
+        while True:
+            logger.debug("Fetching indexing task...")
+            task = self.tracker.fetch_task(worker=self.worker, project_id=config.TT_INDEX_PROJECT)
+
+            if task:
+                try:
+                    recipe = task.json_recipe()
+                    logger.debug("Got indexing task: " + str(recipe))
+                    filename = os.path.join(config.WSB_PATH, format_file_name(recipe["website_id"], recipe["upload_token"]))
+                except Exception as e:
+                    print(e)
+                finally:
+                    try:
+                        self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
+                    except:
+                        pass
+            else:
+                time.sleep(5)
 
-        self.search.delete_docs(task_result.website_id)
+    def _complete_task(self, file_list, task):
+
+        self.search.delete_docs(task.website_id)
 
         if file_list:
             def iter_lines():
 
                 with open(file_list, "r") as f:
                     line = f.readline()
                     while line:

@@ -82,11 +111,38 @@ class TaskManager:
 
         self.db.update_website_date_if_exists(task.website_id)
 
-        task_result.server_id = crawler_name
-
-        self.db.log_result(task_result)
+    def fetch_indexing_task(self):
+
+        task = self.tracker.fetch_task(worker=self.worker, project_id=config.TT_INDEX_PROJECT)
+        print(task)
 
     def queue_task(self, task: Task):
-        self.db.put_task(task)
-        print("Queued task and made it available to crawlers: " + str(task.website_id))
+
+        max_assign_time = 24 * 7 * 3600
+        upload_token = uuid4().__str__()
+
+        bucket_response = self.bucket.allocate(upload_token.__str__(),
+                                               21474837499,  # 20Gib
+                                               format_file_name(task.website_id, upload_token),
+                                               to_dispose_date=int(time.time() + max_assign_time),
+                                               upload_hook="")
+        if not bucket_response:
+            return
+
+        print("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text))
+
+        task.upload_token = upload_token
+        tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
+                                                   recipe=task.__str__(),
+                                                   priority=task.priority,
+                                                   max_assign_time=max_assign_time,
+                                                   hash64=task.website_id,
+                                                   verification_count=1,
+                                                   max_retries=3
+                                                   )
+        print("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
+
+
+def format_file_name(website_id, token):
+    return "%d_%s.NDJSON" % (website_id, token, )
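The new queue_task provisions storage before publishing work: it reserves a ws_bucket slot under a fresh upload token, stores the token on the task, and submits the task's JSON recipe to the task_tracker crawl project; _do_indexing() later rebuilds the same file name from the recipe to locate the uploaded results under config.WSB_PATH. A small sketch of the naming contract that ties the two ends together (the helper is the one defined above; the ids, URL, and token are example values):

```python
from uuid import uuid4

def format_file_name(website_id, token):
    # same scheme as tasks.py: "<website_id>_<upload_token>.NDJSON"
    return "%d_%s.NDJSON" % (website_id, token)

upload_token = str(uuid4())

# queue_task() side: the bucket is allocated under this exact name...
allocated_name = format_file_name(4000, upload_token)

# ..._do_indexing() side: the recipe carries website_id and upload_token,
# so the indexer derives the identical file name from it
recipe = {"website_id": 4000, "url": "http://example.com/", "upload_token": upload_token}
assert format_file_name(recipe["website_id"], recipe["upload_token"]) == allocated_name
```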
(file name not shown in this view)

@@ -1,5 +1,6 @@
 import datetime
 import time
+
 import od_util
 
 
(file name not shown in this view, file deleted)

@@ -1 +0,0 @@
-Files currently being indexing goes here
uwsgi.py (9 changes)

@@ -1,11 +1,4 @@
 from app import app
-import config
-import ssl
 
 if __name__ == '__main__':
-    if not config.USE_SSL:
-        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
-        context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
-        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
-    else:
-        app.run("0.0.0.0", port=12345, threaded=True)
+    app.run("0.0.0.0", port=12345)
views.py (2 changes)

@@ -6,12 +6,12 @@ from urllib.parse import urlparse
 from flask import render_template, redirect, request, flash, abort, Response, session
 from flask_caching import Cache
 
+import captcha
 import config
 import od_util
 from common import db, taskManager, searchEngine, logger, require_role
 from database import Task, Website
 from search.search import InvalidQueryException
-import captcha
 
 
 def setup_views(app):
ws_bucket_client (new submodule)

@@ -0,0 +1 @@
+Subproject commit 2cd3d217a7bc57cc7a11566d8ed60f48c9ea9e9e