From 5441ea08092160ae81c523141e8110ad94806a99 Mon Sep 17 00:00:00 2001
From: simon
Date: Fri, 6 Sep 2019 20:42:57 -0400
Subject: [PATCH] download & save image metadata

---
 chan.py          |  6 ++--
 post_process.py  | 73 ++++++++++++++++++++++++++++++++++++++++++++++--
 requirements.txt |  2 ++
 run.py           | 34 +++++++++++-----------
 4 files changed, 93 insertions(+), 22 deletions(-)

diff --git a/chan.py b/chan.py
index c65938f..75c3686 100644
--- a/chan.py
+++ b/chan.py
@@ -87,8 +87,6 @@ class HtmlChanHelper(ChanHelper):
 
     @staticmethod
     def item_mtime(item):
-        if item is None:
-            return int(datetime.datetime.now().timestamp())
         print(item)
         exit(0)
         return 0  # TODO
@@ -114,12 +112,12 @@ class HtmlChanHelper(ChanHelper):
         op_el = soup.find("div", attrs={"class": "innerOP"})
 
         yield {
-            "id": int(soup.find("div", attrs={"class": "opCell"}).get("id")),
+            "id": int(soup.find("div", class_="opCell").get("id")),
             "type": "thread",
             "html": str(op_el),
         }
 
-        for post_el in soup.find_all("div", attrs={"class": "postCell"}):
+        for post_el in soup.find_all("div", class_="postCell"):
             yield {
                 "id": int(post_el.get("id")),
                 "type": "post",
diff --git a/post_process.py b/post_process.py
index 2ba662f..513d661 100644
--- a/post_process.py
+++ b/post_process.py
@@ -1,10 +1,77 @@
+import base64
+import hashlib
 import re
+import zlib
+from io import BytesIO
+
+import imagehash
+from PIL import Image
+
+from util import logger
 
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 
+IMAGE_FILETYPES = (
+    # :orig for twitter cdn
+    '.jpg',
+    '.jpg:orig',
+    '.jpeg',
+    '.jpeg:orig',
+    '.png',
+    '.png:orig',
+    '.gif',
+    '.gif:orig',
+    '.tiff',
+    '.bmp',
+    '.webp'
+)
 
-def post_process(item, board, helper):
-    item["_v"] = 1.3
+
+def _is_image(url):
+    return url.lower().endswith(IMAGE_FILETYPES)
+
+
+def b64hash(imhash, bcount):
+    return base64.b64encode(
+        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
+    ).decode("ascii")
+
+
+def image_meta(url, url_idx, web):
+    r = web.get(url)
+    if not r:
+        logger.warning("Could not download image: " + url)
+        return None
+    buf = r.content
+
+    try:
+        f = BytesIO(buf)
+        im = Image.open(f)
+    except Exception as e:
+        logger.warning("exception during image open: " + str(e))
+        return None
+
+    meta = {
+        "url": url_idx,
+        "size": len(buf),
+        "width": im.width,
+        "height": im.height,
+        "sha1": hashlib.sha1(buf).hexdigest(),
+        "md5": hashlib.md5(buf).hexdigest(),
+        "crc32": format(zlib.crc32(buf), "x"),
+        "dhash": b64hash(imagehash.dhash(im, hash_size=12), 18),
+        "phash": b64hash(imagehash.phash(im, hash_size=12), 18),
+        "ahash": b64hash(imagehash.average_hash(im, hash_size=12), 18),
+        "whash": b64hash(imagehash.whash(im, hash_size=8), 8),
+    }
+
+    del im, r, buf
+
+    return meta
+
+
+def post_process(item, board, helper, web):
+    item["_v"] = 1.4
 
     item["_id"] = helper.item_unique_id(item, board)
     item["_board"] = board
@@ -12,6 +79,8 @@ def post_process(item, board, helper):
     item["_urls"] = helper.item_urls(item, board)
 
+    item["_img"] = [image_meta(url, i, web) for i, url in enumerate(item["_urls"]) if _is_image(url)]
+
     return item
 
 
diff --git a/requirements.txt b/requirements.txt
index 0be0b80..c0188c8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+imagehash
+Pillow
 requests
 requests[socks]
 stem
diff --git a/run.py b/run.py
index 861f58f..bc8ac57 100644
--- a/run.py
+++ b/run.py
@@ -14,7 +14,7 @@ from chan import CHANS
 from post_process import post_process
 from util import logger, Web
 
-MONITORING = False
+MONITORING = True
 
 
 class ChanScanner:
@@ -58,9 +58,9 @@ class ChanScanner:
 
 
 def once(func):
-    def wrapper(item, board, helper):
+    def wrapper(item, board, helper, channel, web):
         if not state.has_visited(helper.item_unique_id(item, board), helper):
-            func(item, board, helper)
+            func(item, board, helper, channel, web)
             state.mark_visited(helper.item_unique_id(item, board), helper)
 
     return wrapper
@@ -139,10 +139,13 @@ class ChanState:
 
 
 def publish_worker(queue: Queue, helper):
+    channel = connect()
+    web = Web(monitoring if MONITORING else None)
+
     while True:
         try:
             item, board = queue.get()
-            publish(item, board, helper)
+            publish(item, board, helper, channel, web)
         except Exception as e:
             logger.error(str(e) + ": " + traceback.format_exc())
 
@@ -151,16 +154,16 @@ def publish_worker(queue: Queue, helper):
 
 @once
-def publish(item, board, helper):
+def publish(item, board, helper, channel, web):
     item_type = helper.item_type(item)
 
-    post_process(item, board, helper)
+    post_process(item, board, helper, web)
 
     while True:
         try:
-            chan_channel.basic_publish(
+            channel.basic_publish(
                 exchange='chan',
                 routing_key="%s.%s.%s" % (chan, item_type, board),
-                body=json.dumps(item)
+                body=json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
             )
 
             if MONITORING:
@@ -179,14 +182,14 @@
         except Exception as e:
             logger.debug(traceback.format_exc())
             logger.error(str(e))
-            connect()
+            channel = connect()
 
 
 def connect():
-    global chan_channel
     rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
-    chan_channel = rabbit.channel()
-    chan_channel.exchange_declare(exchange="chan", exchange_type="topic")
+    channel = rabbit.channel()
+    channel.exchange_declare(exchange="chan", exchange_type="topic")
+    return channel
 
 
 if __name__ == "__main__":
@@ -204,10 +207,9 @@ if __name__ == "__main__":
     state = ChanState()
 
     publish_q = Queue()
-    publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
-    publish_thread.start()
-
-    connect()
+    for _ in range(5):
+        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
+        publish_thread.start()
 
     s = ChanScanner(chan_helper)
     while True:
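
Note (not part of the patch): each imagehash result is a flattened boolean matrix, and b64hash() packs those bits into an integer and base64-encodes the big-endian bytes, which is why the hash_size=12 dhash/phash/ahash (144 bits) are stored with bcount=18 and the hash_size=8 whash (64 bits) with bcount=8; base64 also keeps the stored digests shorter than hex while staying JSON-safe. A minimal sketch of that round trip, assuming the imagehash and Pillow packages from requirements.txt; the solid-color test image is made up for illustration:

import base64

import imagehash
from PIL import Image


def b64hash(imhash, bcount):
    # pack the flattened boolean hash matrix into an int, then base64 the big-endian bytes
    return base64.b64encode(
        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
    ).decode("ascii")


im = Image.new("RGB", (64, 64), color=(128, 64, 32))  # made-up test input
h = imagehash.dhash(im, hash_size=12)                 # 12x12 hash -> 144 bits
print(h.hash.size)                                    # 144 bits, fits in 18 bytes
print(b64hash(h, 18))                                 # 24-character base64 digest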