mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-26 03:56:52 +00:00 
			
		
		
		
	Added filter for large files in stats
This commit is contained in:
		
							parent
							
								
									2638e47360
								
							
						
					
					
						commit
						4c9d79fdbf
					
				
							
								
								
									
										1
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								app.py
									
									
									
									
									
								
							| @ -10,7 +10,6 @@ import config | ||||
| from flask_caching import Cache | ||||
| from task import TaskDispatcher, Task, CrawlServer | ||||
| from search.search import ElasticSearchEngine | ||||
| from jinja2 import Undefined | ||||
| 
 | ||||
| app = Flask(__name__) | ||||
| recaptcha = ReCaptcha(app=app, | ||||
|  | ||||
| @ -16,7 +16,8 @@ class SearchEngine: | ||||
|     def import_json(self, in_str: str, website_id: int): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, date_max) -> {}: | ||||
|     def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, | ||||
|                date_max) -> {}: | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def reset(self): | ||||
| @ -142,7 +143,8 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         action_string = '{"index":{}}\n' | ||||
|         return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs) | ||||
| 
 | ||||
|     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, date_max) -> {}: | ||||
|     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, | ||||
|                date_max) -> {}: | ||||
| 
 | ||||
|         filters = [] | ||||
|         if extensions: | ||||
| @ -264,16 +266,18 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         size_per_ext = self.es.search(body={ | ||||
|             "query": { | ||||
|                 "bool": { | ||||
|                     "must_not": { | ||||
|                         "term": {"size": -1} | ||||
|                     } | ||||
|                     "filter": [ | ||||
|                         {"range": { | ||||
|                             "size": {"gte": 0, "lte": (1000000000000 - 1)}  # 0-1TB | ||||
|                         }} | ||||
|                     ] | ||||
|                 } | ||||
|             }, | ||||
|             "aggs": { | ||||
|                 "ext_group": { | ||||
|                     "terms": { | ||||
|                         "field": "ext", | ||||
|                         "size": 20 | ||||
|                         "size": 40 | ||||
|                     }, | ||||
|                     "aggs": { | ||||
|                         "size": { | ||||
| @ -285,14 +289,17 @@ class ElasticSearchEngine(SearchEngine): | ||||
|                 } | ||||
|             }, | ||||
|             "size": 0 | ||||
| 
 | ||||
|         }, index=self.index_name, request_timeout=30) | ||||
| 
 | ||||
|         total_stats = self.es.search(body={ | ||||
|             "query": { | ||||
|                 "bool": { | ||||
|                     "must_not": { | ||||
|                         "term": {"size": -1} | ||||
|                     } | ||||
|                     "filter": [ | ||||
|                         {"range": { | ||||
|                             "size": {"gte": 0, "lte": (1000000000000 - 1)}  # 0-1TB | ||||
|                         }} | ||||
|                     ] | ||||
|                 } | ||||
|             }, | ||||
|             "aggs": { | ||||
| @ -304,24 +311,20 @@ class ElasticSearchEngine(SearchEngine): | ||||
|                 } | ||||
|             }, | ||||
|             "size": 0 | ||||
| 
 | ||||
|         }, index=self.index_name, request_timeout=30) | ||||
| 
 | ||||
|         size_and_date_histogram = self.es.search(body={ | ||||
|             "query": { | ||||
|                 "bool": { | ||||
|                     "must_not": { | ||||
|                         "term": {"size": -1}, | ||||
|                     }, | ||||
|                     "filter": [ | ||||
|                         {"range": { | ||||
|                             "size": {"gte": 0, "lte": (1000000000000 - 1)}  # 0-1TB | ||||
|                         }}, | ||||
|                         {"range": { | ||||
|                             "mtime": { | ||||
|                                 "gt": 0  # 1970-01-01 | ||||
|                             } | ||||
|                         }}, | ||||
|                         {"range": { | ||||
|                             "size": { | ||||
|                                 "gt": 0 | ||||
|                             } | ||||
|                         }} | ||||
|                     ] | ||||
|                 } | ||||
| @ -349,9 +352,11 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         website_scatter = self.es.search(body={ | ||||
|             "query": { | ||||
|                 "bool": { | ||||
|                     "must_not": { | ||||
|                         "term": {"size": -1}, | ||||
|                     } | ||||
|                     "filter": [ | ||||
|                         {"range": { | ||||
|                             "size": {"gte": 0, "lte": (1000000000000 - 1)}  # 0-1TB | ||||
|                         }} | ||||
|                     ] | ||||
|                 } | ||||
|             }, | ||||
|             "aggs": { | ||||
| @ -379,7 +384,9 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"] | ||||
|         stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"] | ||||
|         stats["es_search_time_avg"] = stats["es_search_time"] / ( | ||||
| 
 | ||||
|             stats["es_search_count"] if stats["es_search_count"] != 0 else 1) | ||||
| 
 | ||||
|         stats["total_count"] = total_stats["hits"]["total"] | ||||
|         stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"] | ||||
|         stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"] | ||||
| @ -398,16 +405,16 @@ class ElasticSearchEngine(SearchEngine): | ||||
| 
 | ||||
|         return stats | ||||
| 
 | ||||
|     def stream_all_docs(self): | ||||
| 
 | ||||
| def stream_all_docs(self): | ||||
|     return helpers.scan(query={ | ||||
|         "query": { | ||||
|             "match_all": {} | ||||
|         } | ||||
|     }, scroll="5m", client=self.es, index=self.index_name) | ||||
| 
 | ||||
|     def are_empty(self, websites): | ||||
| 
 | ||||
| def are_empty(self, websites): | ||||
|     result = self.es.search(body={ | ||||
|         "query": { | ||||
|             "bool": { | ||||
|  | ||||
| @ -25,7 +25,7 @@ | ||||
|                     <tr> | ||||
|                         <td><a href="/get_export">out.csv.xz</a></td> | ||||
|                         <td>{{ export_file_stats.st_size |filesizeformat }}</td> | ||||
|                         <td>{{ export_file_stats.st_mtime|datetime_format }}</td> | ||||
|                         <td>{{ export_file_stats.st_mtime|datetime_format }} UTC</td> | ||||
|                     </tr> | ||||
|                     {% endif %} | ||||
|                     </tbody> | ||||
|  | ||||
| @ -10,7 +10,7 @@ | ||||
| 
 | ||||
|             {% if stats and stats["total_size"] %} | ||||
|                 <p class="lead">{{ stats["total_count"] }} files totalling | ||||
|                     ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} website(s)</p> | ||||
|                     ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites</p> | ||||
|             {% endif %} | ||||
|             {% if current_websites %} | ||||
|                 <p>Currently indexing <code>{{ current_websites }}</code><span class="vim-caret"> </span> </p> | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user