mirror of
				https://github.com/simon987/chan_feed.git
				synced 2025-10-25 04:56:51 +00:00 
			
		
		
		
	download & save image metadata
This commit is contained in:
		
							parent
							
								
									c08e5a0e41
								
							
						
					
					
						commit
						5441ea0809
					
				
							
								
								
									
										6
									
								
								chan.py
									
									
									
									
									
								
							
							
						
						
									
										6
									
								
								chan.py
									
									
									
									
									
								
							| @ -87,8 +87,6 @@ class HtmlChanHelper(ChanHelper): | |||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def item_mtime(item): |     def item_mtime(item): | ||||||
|         if item is None: |  | ||||||
|             return int(datetime.datetime.now().timestamp()) |  | ||||||
|         print(item) |         print(item) | ||||||
|         exit(0) |         exit(0) | ||||||
|         return 0  # TODO |         return 0  # TODO | ||||||
| @ -114,12 +112,12 @@ class HtmlChanHelper(ChanHelper): | |||||||
| 
 | 
 | ||||||
|         op_el = soup.find("div", attrs={"class": "innerOP"}) |         op_el = soup.find("div", attrs={"class": "innerOP"}) | ||||||
|         yield { |         yield { | ||||||
|             "id": int(soup.find("div", attrs={"class": "opCell"}).get("id")), |             "id": int(soup.find("div", class_="opCell").get("id")), | ||||||
|             "type": "thread", |             "type": "thread", | ||||||
|             "html": str(op_el), |             "html": str(op_el), | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         for post_el in soup.find_all("div", attrs={"class": "postCell"}): |         for post_el in soup.find_all("div", class_="postCell"): | ||||||
|             yield { |             yield { | ||||||
|                 "id": int(post_el.get("id")), |                 "id": int(post_el.get("id")), | ||||||
|                 "type": "post", |                 "type": "post", | ||||||
|  | |||||||
| @ -1,10 +1,77 @@ | |||||||
|  | import base64 | ||||||
|  | import hashlib | ||||||
| import re | import re | ||||||
|  | import zlib | ||||||
|  | from io import BytesIO | ||||||
|  | 
 | ||||||
|  | import imagehash | ||||||
|  | from PIL import Image | ||||||
|  | 
 | ||||||
|  | from util import logger | ||||||
| 
 | 
 | ||||||
| LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") | LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") | ||||||
| 
 | 
 | ||||||
|  | IMAGE_FILETYPES = ( | ||||||
|  |     # :orig for twitter cdn | ||||||
|  |     '.jpg', | ||||||
|  |     '.jpg:orig', | ||||||
|  |     '.jpeg', | ||||||
|  |     '.jpeg:orig', | ||||||
|  |     '.png', | ||||||
|  |     '.png:orig', | ||||||
|  |     '.gif', | ||||||
|  |     '.gif:orig', | ||||||
|  |     '.tiff', | ||||||
|  |     '.bmp', | ||||||
|  |     '.webp' | ||||||
|  | ) | ||||||
| 
 | 
 | ||||||
| def post_process(item, board, helper): | 
 | ||||||
|     item["_v"] = 1.3 | def _is_image(url): | ||||||
|  |     return url.lower().endswith(IMAGE_FILETYPES) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def b64hash(imhash, bcount): | ||||||
|  |     return base64.b64encode( | ||||||
|  |         sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big") | ||||||
|  |     ).decode("ascii") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def image_meta(url, url_idx, web): | ||||||
|  |     r = web.get(url) | ||||||
|  |     if not r: | ||||||
|  |         logger.warning("") | ||||||
|  |         return None | ||||||
|  |     buf = r.content | ||||||
|  | 
 | ||||||
|  |     try: | ||||||
|  |         f = BytesIO(buf) | ||||||
|  |         im = Image.open(f) | ||||||
|  |     except Exception as e: | ||||||
|  |         logger.warning("exception during image open: " + str(e)) | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     meta = { | ||||||
|  |         "url": url_idx, | ||||||
|  |         "size": len(buf), | ||||||
|  |         "width": im.width, | ||||||
|  |         "height": im.height, | ||||||
|  |         "sha1": hashlib.sha1(buf).hexdigest(), | ||||||
|  |         "md5": hashlib.md5(buf).hexdigest(), | ||||||
|  |         "crc32": format(zlib.crc32(buf), "x"), | ||||||
|  |         "dhash": b64hash(imagehash.dhash(im, hash_size=12), 18), | ||||||
|  |         "phash": b64hash(imagehash.phash(im, hash_size=12), 18), | ||||||
|  |         "ahash": b64hash(imagehash.average_hash(im, hash_size=12), 18), | ||||||
|  |         "whash": b64hash(imagehash.whash(im, hash_size=8), 8), | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     del im, r, buf | ||||||
|  | 
 | ||||||
|  |     return meta | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def post_process(item, board, helper, web): | ||||||
|  |     item["_v"] = 1.4 | ||||||
|     item["_id"] = helper.item_unique_id(item, board) |     item["_id"] = helper.item_unique_id(item, board) | ||||||
| 
 | 
 | ||||||
|     item["_board"] = board |     item["_board"] = board | ||||||
| @ -12,6 +79,8 @@ def post_process(item, board, helper): | |||||||
| 
 | 
 | ||||||
|     item["_urls"] = helper.item_urls(item, board) |     item["_urls"] = helper.item_urls(item, board) | ||||||
| 
 | 
 | ||||||
|  |     item["_img"] = [image_meta(url, i, web) for i, url in enumerate(item["_urls"]) if _is_image(url)] | ||||||
|  | 
 | ||||||
|     return item |     return item | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,3 +1,5 @@ | |||||||
|  | imagehash | ||||||
|  | Pillow | ||||||
| requests | requests | ||||||
| requests[socks] | requests[socks] | ||||||
| stem | stem | ||||||
|  | |||||||
							
								
								
									
										34
									
								
								run.py
									
									
									
									
									
								
							
							
						
						
									
										34
									
								
								run.py
									
									
									
									
									
								
							| @ -14,7 +14,7 @@ from chan import CHANS | |||||||
| from post_process import post_process | from post_process import post_process | ||||||
| from util import logger, Web | from util import logger, Web | ||||||
| 
 | 
 | ||||||
| MONITORING = False | MONITORING = True | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class ChanScanner: | class ChanScanner: | ||||||
| @ -58,9 +58,9 @@ class ChanScanner: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def once(func): | def once(func): | ||||||
|     def wrapper(item, board, helper): |     def wrapper(item, board, helper, channel, web): | ||||||
|         if not state.has_visited(helper.item_unique_id(item, board), helper): |         if not state.has_visited(helper.item_unique_id(item, board), helper): | ||||||
|             func(item, board, helper) |             func(item, board, helper, channel, web) | ||||||
|             state.mark_visited(helper.item_unique_id(item, board), helper) |             state.mark_visited(helper.item_unique_id(item, board), helper) | ||||||
| 
 | 
 | ||||||
|     return wrapper |     return wrapper | ||||||
| @ -139,10 +139,13 @@ class ChanState: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def publish_worker(queue: Queue, helper): | def publish_worker(queue: Queue, helper): | ||||||
|  |     channel = connect() | ||||||
|  |     web = Web(monitoring if MONITORING else None) | ||||||
|  | 
 | ||||||
|     while True: |     while True: | ||||||
|         try: |         try: | ||||||
|             item, board = queue.get() |             item, board = queue.get() | ||||||
|             publish(item, board, helper) |             publish(item, board, helper, channel, web) | ||||||
| 
 | 
 | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             logger.error(str(e) + ": " + traceback.format_exc()) |             logger.error(str(e) + ": " + traceback.format_exc()) | ||||||
| @ -151,16 +154,16 @@ def publish_worker(queue: Queue, helper): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @once | @once | ||||||
| def publish(item, board, helper): | def publish(item, board, helper, channel, web): | ||||||
|     item_type = helper.item_type(item) |     item_type = helper.item_type(item) | ||||||
|     post_process(item, board, helper) |     post_process(item, board, helper, web) | ||||||
| 
 | 
 | ||||||
|     while True: |     while True: | ||||||
|         try: |         try: | ||||||
|             chan_channel.basic_publish( |             channel.basic_publish( | ||||||
|                 exchange='chan', |                 exchange='chan', | ||||||
|                 routing_key="%s.%s.%s" % (chan, item_type, board), |                 routing_key="%s.%s.%s" % (chan, item_type, board), | ||||||
|                 body=json.dumps(item) |                 body=json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True) | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|             if MONITORING: |             if MONITORING: | ||||||
| @ -179,14 +182,14 @@ def publish(item, board, helper): | |||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             logger.debug(traceback.format_exc()) |             logger.debug(traceback.format_exc()) | ||||||
|             logger.error(str(e)) |             logger.error(str(e)) | ||||||
|             connect() |             channel = connect() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def connect(): | def connect(): | ||||||
|     global chan_channel |  | ||||||
|     rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host)) |     rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host)) | ||||||
|     chan_channel = rabbit.channel() |     channel = rabbit.channel() | ||||||
|     chan_channel.exchange_declare(exchange="chan", exchange_type="topic") |     channel.exchange_declare(exchange="chan", exchange_type="topic") | ||||||
|  |     return channel | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
| @ -204,10 +207,9 @@ if __name__ == "__main__": | |||||||
|     state = ChanState() |     state = ChanState() | ||||||
| 
 | 
 | ||||||
|     publish_q = Queue() |     publish_q = Queue() | ||||||
|     publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper)) |     for _ in range(5): | ||||||
|     publish_thread.start() |         publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper)) | ||||||
| 
 |         publish_thread.start() | ||||||
|     connect() |  | ||||||
| 
 | 
 | ||||||
|     s = ChanScanner(chan_helper) |     s = ChanScanner(chan_helper) | ||||||
|     while True: |     while True: | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user