From e266a501972bd0d0248d4e7b69ecfaa66ca26197 Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 12 Jun 2018 20:17:30 -0400 Subject: [PATCH] Website stats now works with elasticsearch --- app.py | 7 +++- crawl_server/server.py | 13 ++++++- crawl_server/task_manager.py | 8 ---- search/search.py | 75 ++++++++++++++++++++++++++++-------- static/js/report.js | 20 +++++----- 5 files changed, 85 insertions(+), 38 deletions(-) diff --git a/app.py b/app.py index 4605610..1254c55 100644 --- a/app.py +++ b/app.py @@ -1,4 +1,5 @@ from flask import Flask, render_template, redirect, request, flash, abort, Response, send_from_directory, session +import json import os import time import ssl @@ -69,8 +70,10 @@ def website_json_chart(website_id): print("FIXME: website_json_chart") if website: - stats = {} - return stats + stats = searchEngine.get_stats(website_id) + stats["base_url"] = website.url + stats["report_time"] = website.last_modified + return json.dumps(stats) else: abort(404) diff --git a/crawl_server/server.py b/crawl_server/server.py index 078077d..924f556 100644 --- a/crawl_server/server.py +++ b/crawl_server/server.py @@ -1,6 +1,7 @@ from flask import Flask, request, abort, Response, send_from_directory import json from crawl_server.task_manager import TaskManager, Task, TaskResult +import os app = Flask(__name__) tm = TaskManager("tm_db.sqlite3") @@ -47,7 +48,17 @@ def get_current_tasks(): @app.route("/file_list//") def get_file_list(website_id): - return send_from_directory(directory="./crawled/", filename=str(website_id) + ".json") + + file_name = "./crawled/" + str(website_id) + ".json" + if os.path.exists(file_name): + with open(file_name, "r") as f: + file_list = f.read() + + os.remove(file_name) + + return file_list + else: + return abort(404) if __name__ == "__main__": diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py index 5c853e1..94fbe78 100644 --- a/crawl_server/task_manager.py +++ b/crawl_server/task_manager.py @@ -77,11 +77,3 @@ class TaskManager: db.log_result(task_result) print("Logged result to DB") - @staticmethod - def task_error(err): - print("FIXME: Task failed (This should not happen)") - print(err) - raise err - - - diff --git a/search/search.py b/search/search.py index e74693e..0b0fec8 100644 --- a/search/search.py +++ b/search/search.py @@ -1,7 +1,6 @@ import elasticsearch import os import json -from elasticsearch.exceptions import TransportError class IndexingError(Exception): @@ -25,9 +24,11 @@ class SearchEngine: def ping(self): raise NotImplementedError + def get_stats(self, website_id: int, subdir: str = None): + raise NotImplementedError + class ElasticSearchEngine(SearchEngine): - SORT_ORDERS = { "score": ["_score"], "size_asc": [{"size": {"order": "asc"}}], @@ -54,21 +55,21 @@ class ElasticSearchEngine(SearchEngine): # File names and paths self.es.indices.put_settings(body= - {"analysis": { - "tokenizer": { - "my_nGram_tokenizer": { - "type": "nGram", "min_gram": 3, "max_gram": 3} - } - }}, index=self.index_name) + {"analysis": { + "tokenizer": { + "my_nGram_tokenizer": { + "type": "nGram", "min_gram": 3, "max_gram": 3} + } + }}, index=self.index_name) self.es.indices.put_settings(body= - {"analysis": { - "analyzer": { - "my_nGram": { - "tokenizer": "my_nGram_tokenizer", - "filter": ["lowercase", "asciifolding"] - } - } - }}, index=self.index_name) + {"analysis": { + "analyzer": { + "my_nGram": { + "tokenizer": "my_nGram_tokenizer", + "filter": ["lowercase", "asciifolding"] + } + } + }}, index=self.index_name) # Mappings self.es.indices.put_mapping(body={"properties": { @@ -96,7 +97,7 @@ class ElasticSearchEngine(SearchEngine): for line in in_str.splitlines(): doc = json.loads(line) name, ext = os.path.splitext(doc["name"]) - doc["ext"] = ext if ext else "" + doc["ext"] = ext[1:] if ext and len(ext) > 1 else "" doc["name"] = name doc["website_id"] = website_id docs.append(doc) @@ -151,3 +152,43 @@ class ElasticSearchEngine(SearchEngine): # todo get scroll time from config # todo get size from config return page + + def get_stats(self, website_id: int, subdir: str = None): + + stats = {} + result = self.es.search(body={ + "query": { + "constant_score": { + "filter": { + "term": {"website_id": website_id} + } + } + }, + "aggs": { + "ext_group": { + "terms": { + "field": "ext" + }, + "aggs": { + "size": { + "sum": { + "field": "size" + } + } + } + }, + "total_size": { + "sum_bucket": { + "buckets_path": "ext_group>size" + } + } + }, + "size": 0 + }) + + stats["total_size"] = result["aggregations"]["total_size"]["value"] + stats["total_count"] = result["hits"]["total"] + stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"]) + for b in result["aggregations"]["ext_group"]["buckets"]] + + return stats diff --git a/static/js/report.js b/static/js/report.js index dee2b19..3d99cbc 100644 --- a/static/js/report.js +++ b/static/js/report.js @@ -26,17 +26,17 @@ function drawChart(rData) { var otherSize = 0; var otherCount = 0; - for(var ext in rData["mime_stats"]) { + for(var ext in rData["ext_stats"]) { //Ignore file sizes below 0.5% if (!isRelevant(rData, ext)) { - otherSize += rData["mime_stats"][ext][0]; - otherCount += rData["mime_stats"][ext][1]; + otherSize += rData["ext_stats"][ext][0]; + otherCount += rData["ext_stats"][ext][1]; } else { - dataSetSize.push(rData["mime_stats"][ext][0]); - dataSetCount.push(rData["mime_stats"][ext][1]); - labels.push(rData["mime_stats"][ext][2] + " x" + rData["mime_stats"][ext][1] + " (" + humanFileSize(rData["mime_stats"][ext][0]) + ")"); + dataSetSize.push(rData["ext_stats"][ext][0]); + dataSetCount.push(rData["ext_stats"][ext][1]); + labels.push(rData["ext_stats"][ext][2] + " x" + rData["ext_stats"][ext][1] + " (" + humanFileSize(rData["ext_stats"][ext][0]) + ")"); colors.push(getRandomColor()) } } @@ -82,15 +82,15 @@ function fillTable(rData) { function isRelevant(rData, ext) { - // console.log("Checking + " + rData["mime_stats"][ext][2]); + // console.log("Checking + " + rData["ext_stats"][ext][2]); // console.log("total + " + rData["total_size"]); - // console.log("size + " + rData["mime_stats"][ext][0]); + // console.log("size + " + rData["ext_stats"][ext][0]); // console.log("min + " + 0.03 * rData["total_count"]); if(rData["total_size"] < 100000) { - return rData["mime_stats"][ext][1] > 0.03 * rData["total_count"] + return rData["ext_stats"][ext][1] > 0.03 * rData["total_count"] } else { - return rData["mime_stats"][ext][0] > 0.005 * rData["total_size"] + return rData["ext_stats"][ext][0] > 0.005 * rData["total_size"] }