mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-26 12:06:52 +00:00 
			
		
		
		
	uWSGI config and bugfix with file extensions
This commit is contained in:
		
							parent
							
								
									e91572a06f
								
							
						
					
					
						commit
						9bde8cb629
					
				
							
								
								
									
										6
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										6
									
								
								app.py
									
									
									
									
									
								
							| @ -113,7 +113,7 @@ def search(): | ||||
|     if len(q) >= 3: | ||||
|         try: | ||||
|             hits = searchEngine.search(q, page, per_page, sort_order) | ||||
|             hits = db.join_search_result(hits) | ||||
|             hits = db.join_website_on_search_result(hits) | ||||
|         except InvalidQueryException as e: | ||||
|             flash("<strong>Invalid query:</strong> " + str(e), "warning") | ||||
|             return redirect("/search") | ||||
| @ -299,6 +299,6 @@ if __name__ == '__main__': | ||||
|     if config.USE_SSL: | ||||
|         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) | ||||
|         context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem') | ||||
|         app.run("0.0.0.0", port=12345, ssl_context=context) | ||||
|         app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True) | ||||
|     else: | ||||
|         app.run("0.0.0.0", port=12345) | ||||
|         app.run("0.0.0.0", port=12345, threaded=True) | ||||
|  | ||||
| @ -206,12 +206,15 @@ class Database: | ||||
|                 result[db_website[0]] = db_website[1] | ||||
|             return result | ||||
| 
 | ||||
|     def join_search_result(self, page: dict) -> dict: | ||||
|     def join_website_on_search_result(self, page: dict) -> dict: | ||||
| 
 | ||||
|         websites = self.get_all_websites() | ||||
| 
 | ||||
|         for hit in page["hits"]["hits"]: | ||||
|             if hit["_source"]["website_id"] in websites: | ||||
|                 hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]] | ||||
|             else: | ||||
|                 hit["_source"]["website_url"] = "NONE" | ||||
| 
 | ||||
|         return page | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										8
									
								
								od-database.ini
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								od-database.ini
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | ||||
| [uwsgi] | ||||
| socket = 127.0.0.1:3031 | ||||
| chdir = /home/simon/Dropbox/data/CS/python/od-database/ | ||||
| wsgi-file = uwsgi.py | ||||
| processes = 4 | ||||
| threads = 4 | ||||
| stats = 127.0.0.1:9191 | ||||
| callable=app | ||||
| @ -74,8 +74,8 @@ class ElasticSearchEngine(SearchEngine): | ||||
| 
 | ||||
|         # Mappings | ||||
|         self.es.indices.put_mapping(body={"properties": { | ||||
|             "path": {"analyzer": "my_nGram", "type": "text"}, | ||||
|             "name": {"analyzer": "my_nGram", "type": "text"}, | ||||
|             "path": {"analyzer": "standard", "type": "text"}, | ||||
|             "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, | ||||
|             "mtime": {"type": "date", "format": "epoch_millis"}, | ||||
|             "size": {"type": "long"}, | ||||
|             "website_id": {"type": "integer"}, | ||||
| @ -95,7 +95,7 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         if not in_str: | ||||
|             return | ||||
| 
 | ||||
|         import_every = 1000 | ||||
|         import_every = 5000 | ||||
| 
 | ||||
|         docs = [] | ||||
| 
 | ||||
| @ -138,8 +138,8 @@ class ElasticSearchEngine(SearchEngine): | ||||
|                     "must": { | ||||
|                         "multi_match": { | ||||
|                             "query": query, | ||||
|                             "fields": ["name", "path"], | ||||
|                             "operator": "and" | ||||
|                             "fields": ["name^5", "name.nGram^2", "path"], | ||||
|                             "operator": "or" | ||||
|                         } | ||||
|                     }, | ||||
|                     "filter": filters | ||||
|  | ||||
							
								
								
									
										2
									
								
								startWSGI.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								startWSGI.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,2 @@ | ||||
| #!/usr/bin/env bash | ||||
| uwsgi od-database.ini | ||||
							
								
								
									
										56
									
								
								stress_test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								stress_test.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,56 @@ | ||||
| import os | ||||
| import json | ||||
| import sys | ||||
| from search.search import ElasticSearchEngine | ||||
| from concurrent.futures import ThreadPoolExecutor | ||||
| import requests | ||||
| import random | ||||
| 
 | ||||
| 
 | ||||
| def dump_local_filesystem(root_dir: str): | ||||
| 
 | ||||
|     docs = [] | ||||
| 
 | ||||
|     for root, dirs, files in os.walk(root_dir): | ||||
| 
 | ||||
|         for filename in files: | ||||
|             full_path = os.path.join(root, filename) | ||||
|             stats = os.stat(full_path) | ||||
| 
 | ||||
|             doc = dict() | ||||
|             doc["name"] = filename | ||||
|             doc["path"] = root | ||||
|             doc["mtime"] = stats.st_mtime | ||||
|             doc["size"] = stats.st_size | ||||
| 
 | ||||
|             docs.append(doc) | ||||
| 
 | ||||
|     with open("local_filesystem.json", "w") as f: | ||||
|             f.writelines(json.dumps(doc) + "\n" for doc in docs) | ||||
| 
 | ||||
| 
 | ||||
| def index_file_list(path: str, website_id): | ||||
| 
 | ||||
|     es = ElasticSearchEngine("od-database") | ||||
|     with open(path, "r") as f: | ||||
|         es.import_json(f.read(), website_id) | ||||
| 
 | ||||
| 
 | ||||
| def search(term=""): | ||||
|     requests.get("http://localhost/?&sort_order=score&per_page=100q=" + term, verify=False) | ||||
|     print(term) | ||||
| 
 | ||||
| 
 | ||||
| def random_searches(count=10000000, max_workers=1000): | ||||
| 
 | ||||
|     terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\ | ||||
|         .text.splitlines() | ||||
| 
 | ||||
|     pool = ThreadPoolExecutor(max_workers=max_workers) | ||||
|     pool.map(search, random.choices(terms, k=count)) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| # dump_local_filesystem("/mnt/") | ||||
| # index_file_list("local_filesystem.json", 10) | ||||
| # random_searches(100000) | ||||
							
								
								
									
										21
									
								
								task.py
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								task.py
									
									
									
									
									
								
							| @ -28,7 +28,7 @@ class CrawlServer: | ||||
|         except ConnectionError: | ||||
|             return False | ||||
| 
 | ||||
|     def get_completed_tasks(self) -> list: | ||||
|     def fetch_completed_tasks(self) -> list: | ||||
| 
 | ||||
|         try: | ||||
|             r = requests.get(self.url + "/task/completed") | ||||
| @ -36,9 +36,10 @@ class CrawlServer: | ||||
|                 TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"]) | ||||
|                 for r in json.loads(r.text)] | ||||
|         except ConnectionError: | ||||
|             print("Crawl server cannot be reached " + self.url) | ||||
|             return [] | ||||
| 
 | ||||
|     def get_queued_tasks(self) -> list: | ||||
|     def fetch_queued_tasks(self) -> list: | ||||
| 
 | ||||
|         try: | ||||
|             r = requests.get(self.url + "/task/") | ||||
| @ -49,7 +50,7 @@ class CrawlServer: | ||||
|         except ConnectionError: | ||||
|             return [] | ||||
| 
 | ||||
|     def get_current_tasks(self): | ||||
|     def fetch_current_tasks(self): | ||||
| 
 | ||||
|         try: | ||||
|             r = requests.get(self.url + "/task/current") | ||||
| @ -58,14 +59,13 @@ class CrawlServer: | ||||
|                 for t in json.loads(r.text) | ||||
|             ] | ||||
|         except ConnectionError: | ||||
|             print("Server cannot be reached " + self.url) | ||||
|             return [] | ||||
| 
 | ||||
|     def get_file_list(self, website_id) -> str: | ||||
|     def fetch_website_files(self, website_id) -> str: | ||||
| 
 | ||||
|         try: | ||||
|             r = requests.get(self.url + "/file_list/" + str(website_id) + "/") | ||||
|             return r.text | ||||
|             return r.text if r.status_code == 200 else "" | ||||
|         except ConnectionError: | ||||
|             return "" | ||||
| 
 | ||||
| @ -73,6 +73,7 @@ class CrawlServer: | ||||
| class TaskDispatcher: | ||||
| 
 | ||||
|     def __init__(self): | ||||
|         # TODO: remove reddit | ||||
|         reddit = praw.Reddit('opendirectories-bot', | ||||
|                              user_agent='github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)') | ||||
|         self.reddit_bot = RedditBot("crawled.txt", reddit) | ||||
| @ -91,9 +92,9 @@ class TaskDispatcher: | ||||
|     def check_completed_tasks(self): | ||||
| 
 | ||||
|         for server in self.crawl_servers: | ||||
|             for task in server.get_completed_tasks(): | ||||
|             for task in server.fetch_completed_tasks(): | ||||
|                 print("Completed task") | ||||
|                 file_list = server.get_file_list(task.website_id) | ||||
|                 file_list = server.fetch_website_files(task.website_id) | ||||
|                 self.search.import_json(file_list, task.website_id) | ||||
| 
 | ||||
|     def dispatch_task(self, task: Task): | ||||
| @ -108,7 +109,7 @@ class TaskDispatcher: | ||||
|         queued_tasks = [] | ||||
| 
 | ||||
|         for server in self.crawl_servers: | ||||
|             queued_tasks.extend(server.get_queued_tasks()) | ||||
|             queued_tasks.extend(server.fetch_queued_tasks()) | ||||
| 
 | ||||
|         return queued_tasks | ||||
| 
 | ||||
| @ -117,7 +118,7 @@ class TaskDispatcher: | ||||
| 
 | ||||
|         current_tasks = [] | ||||
|         for server in self.crawl_servers: | ||||
|             current_tasks.extend(server.get_current_tasks()) | ||||
|             current_tasks.extend(server.fetch_current_tasks()) | ||||
| 
 | ||||
|         return current_tasks | ||||
| 
 | ||||
|  | ||||
| @ -26,7 +26,7 @@ | ||||
|                 <form action="/search"> | ||||
| 
 | ||||
|                     <div class="form-group"> | ||||
|                         <input class="form-control" name="q" id="q" placeholder="Full-text Query"> | ||||
|                         <input class="form-control" name="q" id="q" placeholder="Query"> | ||||
|                     </div> | ||||
| 
 | ||||
|                     <input class="btn btn-primary btn-shadow" type="submit" value="Search"> | ||||
| @ -34,20 +34,5 @@ | ||||
|             </div> | ||||
|         </div> | ||||
| 
 | ||||
|         <div class="card"> | ||||
|             <div class="card-header">Full-text Query Syntax</div> | ||||
|             <div class="card-body"> | ||||
| 
 | ||||
|                 <p>The following query types are allowed (More information | ||||
|                     <a href="https://www.sqlite.org/fts5.html#full_text_query_syntax">here</a>):</p> | ||||
|                 <p>Exact term: <code> "foo"</code></p> | ||||
|                 <p>Term with prefix: <code> "foo*"</code></p> | ||||
|                 <p>File names: <code> "name:foo"</code></p> | ||||
|                 <p>Paths: <code> "path:foo"</code></p> | ||||
|                 <p>Starts with term: <code> "^foo"</code></p> | ||||
|                 <p>NEAR group: <code> "NEAR(foo bar, 3)"</code></p> | ||||
|             </div> | ||||
|         </div> | ||||
| 
 | ||||
|     </div> | ||||
| {% endblock body %} | ||||
|  | ||||
| @ -58,8 +58,8 @@ | ||||
|                                 <tr> | ||||
|                                     <td> | ||||
|                                         {# File name & link #} | ||||
|                                         <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}"> | ||||
|                                             {{ hl_name |safe }}{{ src["ext"] }} | ||||
|                                         <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + ("." if src["ext"] != "" else "") + src["ext"] }}" title="{{ src["name"] + ("." if src["ext"] != "" else "") + src["ext"] }}"> | ||||
|                                             {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }} | ||||
|                                         </a> | ||||
|                                         {# File type badge #} | ||||
|                                         {% set mime = get_mime(src["path"]) %} | ||||
|  | ||||
							
								
								
									
										11
									
								
								uwsgi.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								uwsgi.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,11 @@ | ||||
| from app import app | ||||
| import config | ||||
| import ssl | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     if not config.USE_SSL: | ||||
|         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) | ||||
|         context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem') | ||||
|         app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True) | ||||
|     else: | ||||
|         app.run("0.0.0.0", port=12345, threaded=True) | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user