mirror of
https://github.com/simon987/od-database.git
synced 2025-04-20 02:46:45 +00:00
Website stats now work with Elasticsearch
This commit is contained in:
parent
4b60ac62fc
commit
e266a50197
7
app.py
7
app.py
@ -1,4 +1,5 @@
|
|||||||
from flask import Flask, render_template, redirect, request, flash, abort, Response, send_from_directory, session
|
from flask import Flask, render_template, redirect, request, flash, abort, Response, send_from_directory, session
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import ssl
|
import ssl
|
||||||
@ -69,8 +70,10 @@ def website_json_chart(website_id):
|
|||||||
|
|
||||||
print("FIXME: website_json_chart")
|
print("FIXME: website_json_chart")
|
||||||
if website:
|
if website:
|
||||||
stats = {}
|
stats = searchEngine.get_stats(website_id)
|
||||||
return stats
|
stats["base_url"] = website.url
|
||||||
|
stats["report_time"] = website.last_modified
|
||||||
|
return json.dumps(stats)
|
||||||
else:
|
else:
|
||||||
abort(404)
|
abort(404)
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from flask import Flask, request, abort, Response, send_from_directory
|
from flask import Flask, request, abort, Response, send_from_directory
|
||||||
import json
|
import json
|
||||||
from crawl_server.task_manager import TaskManager, Task, TaskResult
|
from crawl_server.task_manager import TaskManager, Task, TaskResult
|
||||||
|
import os
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
tm = TaskManager("tm_db.sqlite3")
|
tm = TaskManager("tm_db.sqlite3")
|
||||||
@ -47,7 +48,17 @@ def get_current_tasks():
|
|||||||
|
|
||||||
@app.route("/file_list/<int:website_id>/")
|
@app.route("/file_list/<int:website_id>/")
|
||||||
def get_file_list(website_id):
|
def get_file_list(website_id):
|
||||||
return send_from_directory(directory="./crawled/", filename=str(website_id) + ".json")
|
|
||||||
|
file_name = "./crawled/" + str(website_id) + ".json"
|
||||||
|
if os.path.exists(file_name):
|
||||||
|
with open(file_name, "r") as f:
|
||||||
|
file_list = f.read()
|
||||||
|
|
||||||
|
os.remove(file_name)
|
||||||
|
|
||||||
|
return file_list
|
||||||
|
else:
|
||||||
|
return abort(404)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -77,11 +77,3 @@ class TaskManager:
|
|||||||
db.log_result(task_result)
|
db.log_result(task_result)
|
||||||
print("Logged result to DB")
|
print("Logged result to DB")
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def task_error(err):
|
|
||||||
print("FIXME: Task failed (This should not happen)")
|
|
||||||
print(err)
|
|
||||||
raise err
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import elasticsearch
|
import elasticsearch
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from elasticsearch.exceptions import TransportError
|
|
||||||
|
|
||||||
|
|
||||||
class IndexingError(Exception):
|
class IndexingError(Exception):
|
||||||
@ -25,9 +24,11 @@ class SearchEngine:
|
|||||||
def ping(self):
|
def ping(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def get_stats(self, website_id: int, subdir: str = None):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class ElasticSearchEngine(SearchEngine):
|
class ElasticSearchEngine(SearchEngine):
|
||||||
|
|
||||||
SORT_ORDERS = {
|
SORT_ORDERS = {
|
||||||
"score": ["_score"],
|
"score": ["_score"],
|
||||||
"size_asc": [{"size": {"order": "asc"}}],
|
"size_asc": [{"size": {"order": "asc"}}],
|
||||||
@ -54,21 +55,21 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
|
|
||||||
# File names and paths
|
# File names and paths
|
||||||
self.es.indices.put_settings(body=
|
self.es.indices.put_settings(body=
|
||||||
{"analysis": {
|
{"analysis": {
|
||||||
"tokenizer": {
|
"tokenizer": {
|
||||||
"my_nGram_tokenizer": {
|
"my_nGram_tokenizer": {
|
||||||
"type": "nGram", "min_gram": 3, "max_gram": 3}
|
"type": "nGram", "min_gram": 3, "max_gram": 3}
|
||||||
}
|
}
|
||||||
}}, index=self.index_name)
|
}}, index=self.index_name)
|
||||||
self.es.indices.put_settings(body=
|
self.es.indices.put_settings(body=
|
||||||
{"analysis": {
|
{"analysis": {
|
||||||
"analyzer": {
|
"analyzer": {
|
||||||
"my_nGram": {
|
"my_nGram": {
|
||||||
"tokenizer": "my_nGram_tokenizer",
|
"tokenizer": "my_nGram_tokenizer",
|
||||||
"filter": ["lowercase", "asciifolding"]
|
"filter": ["lowercase", "asciifolding"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}}, index=self.index_name)
|
}}, index=self.index_name)
|
||||||
|
|
||||||
# Mappings
|
# Mappings
|
||||||
self.es.indices.put_mapping(body={"properties": {
|
self.es.indices.put_mapping(body={"properties": {
|
||||||
@ -96,7 +97,7 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
for line in in_str.splitlines():
|
for line in in_str.splitlines():
|
||||||
doc = json.loads(line)
|
doc = json.loads(line)
|
||||||
name, ext = os.path.splitext(doc["name"])
|
name, ext = os.path.splitext(doc["name"])
|
||||||
doc["ext"] = ext if ext else ""
|
doc["ext"] = ext[1:] if ext and len(ext) > 1 else ""
|
||||||
doc["name"] = name
|
doc["name"] = name
|
||||||
doc["website_id"] = website_id
|
doc["website_id"] = website_id
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
@ -151,3 +152,43 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
# todo get scroll time from config
|
# todo get scroll time from config
|
||||||
# todo get size from config
|
# todo get size from config
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
def get_stats(self, website_id: int, subdir: str = None):
|
||||||
|
|
||||||
|
stats = {}
|
||||||
|
result = self.es.search(body={
|
||||||
|
"query": {
|
||||||
|
"constant_score": {
|
||||||
|
"filter": {
|
||||||
|
"term": {"website_id": website_id}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"aggs": {
|
||||||
|
"ext_group": {
|
||||||
|
"terms": {
|
||||||
|
"field": "ext"
|
||||||
|
},
|
||||||
|
"aggs": {
|
||||||
|
"size": {
|
||||||
|
"sum": {
|
||||||
|
"field": "size"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total_size": {
|
||||||
|
"sum_bucket": {
|
||||||
|
"buckets_path": "ext_group>size"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"size": 0
|
||||||
|
})
|
||||||
|
|
||||||
|
stats["total_size"] = result["aggregations"]["total_size"]["value"]
|
||||||
|
stats["total_count"] = result["hits"]["total"]
|
||||||
|
stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
|
||||||
|
for b in result["aggregations"]["ext_group"]["buckets"]]
|
||||||
|
|
||||||
|
return stats
|
||||||
|
@ -26,17 +26,17 @@ function drawChart(rData) {
|
|||||||
var otherSize = 0;
|
var otherSize = 0;
|
||||||
var otherCount = 0;
|
var otherCount = 0;
|
||||||
|
|
||||||
for(var ext in rData["mime_stats"]) {
|
for(var ext in rData["ext_stats"]) {
|
||||||
//Ignore file sizes below 0.5%
|
//Ignore file sizes below 0.5%
|
||||||
if (!isRelevant(rData, ext)) {
|
if (!isRelevant(rData, ext)) {
|
||||||
|
|
||||||
otherSize += rData["mime_stats"][ext][0];
|
otherSize += rData["ext_stats"][ext][0];
|
||||||
otherCount += rData["mime_stats"][ext][1];
|
otherCount += rData["ext_stats"][ext][1];
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
dataSetSize.push(rData["mime_stats"][ext][0]);
|
dataSetSize.push(rData["ext_stats"][ext][0]);
|
||||||
dataSetCount.push(rData["mime_stats"][ext][1]);
|
dataSetCount.push(rData["ext_stats"][ext][1]);
|
||||||
labels.push(rData["mime_stats"][ext][2] + " x" + rData["mime_stats"][ext][1] + " (" + humanFileSize(rData["mime_stats"][ext][0]) + ")");
|
labels.push(rData["ext_stats"][ext][2] + " x" + rData["ext_stats"][ext][1] + " (" + humanFileSize(rData["ext_stats"][ext][0]) + ")");
|
||||||
colors.push(getRandomColor())
|
colors.push(getRandomColor())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -82,15 +82,15 @@ function fillTable(rData) {
|
|||||||
|
|
||||||
function isRelevant(rData, ext) {
|
function isRelevant(rData, ext) {
|
||||||
|
|
||||||
// console.log("Checking + " + rData["mime_stats"][ext][2]);
|
// console.log("Checking + " + rData["ext_stats"][ext][2]);
|
||||||
// console.log("total + " + rData["total_size"]);
|
// console.log("total + " + rData["total_size"]);
|
||||||
// console.log("size + " + rData["mime_stats"][ext][0]);
|
// console.log("size + " + rData["ext_stats"][ext][0]);
|
||||||
// console.log("min + " + 0.03 * rData["total_count"]);
|
// console.log("min + " + 0.03 * rData["total_count"]);
|
||||||
|
|
||||||
if(rData["total_size"] < 100000) {
|
if(rData["total_size"] < 100000) {
|
||||||
return rData["mime_stats"][ext][1] > 0.03 * rData["total_count"]
|
return rData["ext_stats"][ext][1] > 0.03 * rData["total_count"]
|
||||||
} else {
|
} else {
|
||||||
return rData["mime_stats"][ext][0] > 0.005 * rData["total_size"]
|
return rData["ext_stats"][ext][0] > 0.005 * rData["total_size"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user