mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-25 03:46:52 +00:00 
			
		
		
		
	Support encodings other than UTF-8 and remove some debug info
This commit is contained in:
		
							parent
							
								
									7c47b0f00c
								
							
						
					
					
						commit
						8a73142ff8
					
				| @ -80,8 +80,6 @@ class RemoteDirectoryCrawler: | ||||
| 
 | ||||
|     def crawl_directory(self, out_file: str) -> CrawlResult: | ||||
| 
 | ||||
|         import gc | ||||
|         gc.set_debug(gc.DEBUG_LEAK) | ||||
|         try: | ||||
|             directory = RemoteDirectoryFactory.get_directory(self.url) | ||||
|             root_listing = directory.list_dir("") | ||||
|  | ||||
| @ -1,4 +1,5 @@ | ||||
| from urllib.parse import unquote, urljoin | ||||
| import warnings | ||||
| import os | ||||
| from html.parser import HTMLParser | ||||
| from itertools import repeat | ||||
| @ -118,44 +119,47 @@ class HttpDirectory(RemoteDirectory): | ||||
| 
 | ||||
|     def _request_file(self, url): | ||||
| 
 | ||||
|         retries = HttpDirectory.MAX_RETRIES | ||||
|         while retries > 0: | ||||
|             try: | ||||
|                 r = self.session.head(url, allow_redirects=False, timeout=40) | ||||
|         with warnings.catch_warnings(): | ||||
|             warnings.simplefilter("ignore") | ||||
|             retries = HttpDirectory.MAX_RETRIES | ||||
|             while retries > 0: | ||||
|                 try: | ||||
|                     r = self.session.head(url, allow_redirects=False, timeout=40) | ||||
| 
 | ||||
|                 stripped_url = url[len(self.base_url) - 1:] | ||||
|                     stripped_url = url[len(self.base_url) - 1:] | ||||
| 
 | ||||
|                 path, name = os.path.split(stripped_url) | ||||
|                 date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01" | ||||
|                 return File( | ||||
|                     path=unquote(path).strip("/"), | ||||
|                     name=unquote(name), | ||||
|                     size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1, | ||||
|                     mtime=int(parse_date(date).timestamp()), | ||||
|                     is_dir=False | ||||
|                 ) | ||||
|             except RequestException: | ||||
|                 self.session.close() | ||||
|                 retries -= 1 | ||||
|                     path, name = os.path.split(stripped_url) | ||||
|                     date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01" | ||||
|                     return File( | ||||
|                         path=unquote(path).strip("/"), | ||||
|                         name=unquote(name), | ||||
|                         size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1, | ||||
|                         mtime=int(parse_date(date).timestamp()), | ||||
|                         is_dir=False | ||||
|                     ) | ||||
|                 except RequestException: | ||||
|                     self.session.close() | ||||
|                     retries -= 1 | ||||
| 
 | ||||
|         return None | ||||
|             return None | ||||
| 
 | ||||
|     def _stream_body(self, url: str): | ||||
|         with warnings.catch_warnings(): | ||||
|             warnings.simplefilter("ignore") | ||||
|             retries = HttpDirectory.MAX_RETRIES | ||||
|             while retries > 0: | ||||
|                 try: | ||||
|                     r = self.session.get(url, stream=True, timeout=40) | ||||
|                     for chunk in r.iter_content(chunk_size=4096): | ||||
|                         yield chunk.decode(r.encoding, errors="ignore") | ||||
|                     r.close() | ||||
|                     del r | ||||
|                     break | ||||
|                 except RequestException: | ||||
|                     self.session.close() | ||||
|                     retries -= 1 | ||||
| 
 | ||||
|         retries = HttpDirectory.MAX_RETRIES | ||||
|         while retries > 0: | ||||
|             try: | ||||
|                 r = self.session.get(url, stream=True, timeout=40) | ||||
|                 for chunk in r.iter_content(chunk_size=4096): | ||||
|                     yield chunk | ||||
|                 r.close() | ||||
|                 del r | ||||
|                 break | ||||
|             except RequestException: | ||||
|                 self.session.close() | ||||
|                 retries -= 1 | ||||
| 
 | ||||
|         return None | ||||
|             return None | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _parse_links(body): | ||||
| @ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory): | ||||
|         parser = HTMLAnchorParser() | ||||
| 
 | ||||
|         for chunk in body: | ||||
|             parser.feed(chunk.decode("utf-8", errors="ignore")) | ||||
|             parser.feed(chunk) | ||||
|             for anchor in parser.anchors: | ||||
|                 yield anchor | ||||
| 
 | ||||
|  | ||||
| @ -53,9 +53,6 @@ class TaskManager: | ||||
|     @staticmethod | ||||
|     def run_task(task, db_path, current_tasks): | ||||
| 
 | ||||
|         # import gc | ||||
|         # gc.set_debug(gc.DEBUG_LEAK) | ||||
| 
 | ||||
|         result = TaskResult() | ||||
|         result.start_time = datetime.utcnow() | ||||
|         result.website_id = task.website_id | ||||
|  | ||||
| @ -4,7 +4,7 @@ import json | ||||
| 
 | ||||
| payload = json.dumps({ | ||||
|     "website_id": 123, | ||||
|     "url": "http://liminaire.fr/TEXTES/", | ||||
|     "url": "https://computerarchive.org/files/computer/", | ||||
|     # "url": "http://localhost:8000/", | ||||
|     # "url": "http://ubuntu.mirrorservice.org/", | ||||
|     "priority": 2, | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user