download & save image metadata

This commit is contained in:
simon 2019-09-06 20:42:57 -04:00
parent c08e5a0e41
commit 5441ea0809
4 changed files with 93 additions and 22 deletions

View File

@ -87,8 +87,6 @@ class HtmlChanHelper(ChanHelper):
@staticmethod @staticmethod
def item_mtime(item): def item_mtime(item):
if item is None:
return int(datetime.datetime.now().timestamp())
print(item) print(item)
exit(0) exit(0)
return 0 # TODO return 0 # TODO
@ -114,12 +112,12 @@ class HtmlChanHelper(ChanHelper):
op_el = soup.find("div", attrs={"class": "innerOP"}) op_el = soup.find("div", attrs={"class": "innerOP"})
yield { yield {
"id": int(soup.find("div", attrs={"class": "opCell"}).get("id")), "id": int(soup.find("div", class_="opCell").get("id")),
"type": "thread", "type": "thread",
"html": str(op_el), "html": str(op_el),
} }
for post_el in soup.find_all("div", attrs={"class": "postCell"}): for post_el in soup.find_all("div", class_="postCell"):
yield { yield {
"id": int(post_el.get("id")), "id": int(post_el.get("id")),
"type": "post", "type": "post",

View File

@ -1,10 +1,77 @@
import base64
import hashlib
import re import re
import zlib
from io import BytesIO
import imagehash
from PIL import Image
from util import logger
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
# Extensions recognised as images; passed as a tuple to str.endswith()
# in _is_image(), so matching is suffix-based and case handling is done
# by the caller (which lowercases the URL first).
IMAGE_FILETYPES = (
    # :orig for twitter cdn
    '.jpg',
    '.jpg:orig',
    '.jpeg',
    '.jpeg:orig',
    '.png',
    '.png:orig',
    '.gif',
    '.gif:orig',
    '.tiff',
    '.bmp',
    '.webp'
)
def _is_image(url):
    """Return True when *url* ends with a recognised image extension."""
    lowered = url.lower()
    return any(lowered.endswith(extension) for extension in IMAGE_FILETYPES)
def b64hash(imhash, bcount):
    """Pack the boolean bits of an imagehash result into *bcount* big-endian
    bytes and return them base64-encoded as an ASCII string.

    Bit i of the integer corresponds to element i of the flattened hash
    matrix, so the encoding is stable for a given hash size.
    """
    value = 0
    for position, bit in enumerate(imhash.hash.flatten()):
        if bit:
            value |= 1 << position
    packed = value.to_bytes(bcount, "big")
    return base64.b64encode(packed).decode("ascii")
def image_meta(url, url_idx, web):
    """Download the image at *url* and compute size, dimensions and hashes.

    :param url: absolute URL of the image to fetch
    :param url_idx: index of *url* within the item's "_urls" list, stored in
        the result so the metadata can be matched back to the original link
    :param web: Web helper used to perform the HTTP GET
    :return: metadata dict, or None when the download or image decode fails
    """
    r = web.get(url)
    if not r:
        # Fix: previously logged an empty string, which made download
        # failures impossible to trace in the logs.
        logger.warning("Could not download image: " + url)
        return None
    buf = r.content

    try:
        f = BytesIO(buf)
        im = Image.open(f)
    except Exception as e:
        # Pillow raises a variety of exception types on corrupt/unsupported
        # data; treat any decode failure as "no metadata" rather than crash.
        logger.warning("exception during image open: " + str(e))
        return None

    meta = {
        "url": url_idx,
        "size": len(buf),
        "width": im.width,
        "height": im.height,
        "sha1": hashlib.sha1(buf).hexdigest(),
        "md5": hashlib.md5(buf).hexdigest(),
        "crc32": format(zlib.crc32(buf), "x"),
        # Perceptual hashes: hash_size=12 -> 144 bits -> 18 bytes;
        # whash uses hash_size=8 -> 64 bits -> 8 bytes.
        "dhash": b64hash(imagehash.dhash(im, hash_size=12), 18),
        "phash": b64hash(imagehash.phash(im, hash_size=12), 18),
        "ahash": b64hash(imagehash.average_hash(im, hash_size=12), 18),
        "whash": b64hash(imagehash.whash(im, hash_size=8), 8),
    }
    # Drop references to the (potentially large) image buffers promptly.
    del im, r, buf
    return meta
def post_process(item, board, helper, web):
item["_v"] = 1.4
item["_id"] = helper.item_unique_id(item, board) item["_id"] = helper.item_unique_id(item, board)
item["_board"] = board item["_board"] = board
@ -12,6 +79,8 @@ def post_process(item, board, helper):
item["_urls"] = helper.item_urls(item, board) item["_urls"] = helper.item_urls(item, board)
item["_img"] = [image_meta(url, i, web) for i, url in enumerate(item["_urls"]) if _is_image(url)]
return item return item

View File

@ -1,3 +1,5 @@
imagehash
Pillow
requests requests
requests[socks] requests[socks]
stem stem

30
run.py
View File

@ -14,7 +14,7 @@ from chan import CHANS
from post_process import post_process from post_process import post_process
from util import logger, Web from util import logger, Web
MONITORING = False MONITORING = True
class ChanScanner: class ChanScanner:
@ -58,9 +58,9 @@ class ChanScanner:
def once(func): def once(func):
def wrapper(item, board, helper): def wrapper(item, board, helper, channel, web):
if not state.has_visited(helper.item_unique_id(item, board), helper): if not state.has_visited(helper.item_unique_id(item, board), helper):
func(item, board, helper) func(item, board, helper, channel, web)
state.mark_visited(helper.item_unique_id(item, board), helper) state.mark_visited(helper.item_unique_id(item, board), helper)
return wrapper return wrapper
@ -139,10 +139,13 @@ class ChanState:
def publish_worker(queue: Queue, helper): def publish_worker(queue: Queue, helper):
channel = connect()
web = Web(monitoring if MONITORING else None)
while True: while True:
try: try:
item, board = queue.get() item, board = queue.get()
publish(item, board, helper) publish(item, board, helper, channel, web)
except Exception as e: except Exception as e:
logger.error(str(e) + ": " + traceback.format_exc()) logger.error(str(e) + ": " + traceback.format_exc())
@ -151,16 +154,16 @@ def publish_worker(queue: Queue, helper):
@once @once
def publish(item, board, helper): def publish(item, board, helper, channel, web):
item_type = helper.item_type(item) item_type = helper.item_type(item)
post_process(item, board, helper) post_process(item, board, helper, web)
while True: while True:
try: try:
chan_channel.basic_publish( channel.basic_publish(
exchange='chan', exchange='chan',
routing_key="%s.%s.%s" % (chan, item_type, board), routing_key="%s.%s.%s" % (chan, item_type, board),
body=json.dumps(item) body=json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
) )
if MONITORING: if MONITORING:
@ -179,14 +182,14 @@ def publish(item, board, helper):
except Exception as e: except Exception as e:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.error(str(e)) logger.error(str(e))
connect() channel = connect()
def connect(): def connect():
global chan_channel
rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host)) rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
chan_channel = rabbit.channel() channel = rabbit.channel()
chan_channel.exchange_declare(exchange="chan", exchange_type="topic") channel.exchange_declare(exchange="chan", exchange_type="topic")
return channel
if __name__ == "__main__": if __name__ == "__main__":
@ -204,11 +207,10 @@ if __name__ == "__main__":
state = ChanState() state = ChanState()
publish_q = Queue() publish_q = Queue()
for _ in range(5):
publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper)) publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
publish_thread.start() publish_thread.start()
connect()
s = ChanScanner(chan_helper) s = ChanScanner(chan_helper)
while True: while True:
for p, b in s.all_posts(): for p, b in s.all_posts():