mirror of
https://github.com/simon987/chan_feed.git
synced 2025-04-24 12:15:50 +00:00
download & save image metadata
This commit is contained in:
parent
c08e5a0e41
commit
5441ea0809
6
chan.py
6
chan.py
@ -87,8 +87,6 @@ class HtmlChanHelper(ChanHelper):
|
||||
|
||||
@staticmethod
|
||||
def item_mtime(item):
|
||||
if item is None:
|
||||
return int(datetime.datetime.now().timestamp())
|
||||
print(item)
|
||||
exit(0)
|
||||
return 0 # TODO
|
||||
@ -114,12 +112,12 @@ class HtmlChanHelper(ChanHelper):
|
||||
|
||||
op_el = soup.find("div", attrs={"class": "innerOP"})
|
||||
yield {
|
||||
"id": int(soup.find("div", attrs={"class": "opCell"}).get("id")),
|
||||
"id": int(soup.find("div", class_="opCell").get("id")),
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
}
|
||||
|
||||
for post_el in soup.find_all("div", attrs={"class": "postCell"}):
|
||||
for post_el in soup.find_all("div", class_="postCell"):
|
||||
yield {
|
||||
"id": int(post_el.get("id")),
|
||||
"type": "post",
|
||||
|
@ -1,10 +1,77 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import re
|
||||
import zlib
|
||||
from io import BytesIO
|
||||
|
||||
import imagehash
|
||||
from PIL import Image
|
||||
|
||||
from util import logger
|
||||
|
||||
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
||||
|
||||
# URL suffixes that _is_image() accepts as pointing to an image.
# Each ":orig" entry is the twitter cdn variant of the extension before it.
IMAGE_FILETYPES = (
    '.jpg', '.jpg:orig',
    '.jpeg', '.jpeg:orig',
    '.png', '.png:orig',
    '.gif', '.gif:orig',
    '.tiff',
    '.bmp',
    '.webp',
)
|
||||
|
||||
def post_process(item, board, helper):
|
||||
item["_v"] = 1.3
|
||||
|
||||
def _is_image(url):
    """Return True when ``url`` ends with one of the IMAGE_FILETYPES suffixes.

    The comparison is done on the lower-cased URL, so it is case-insensitive.
    """
    lowered = url.lower()
    return lowered.endswith(IMAGE_FILETYPES)
|
||||
|
||||
|
||||
def b64hash(imhash, bcount=None):
    """Encode an image hash as a base64 ASCII string.

    The flattened ``imhash.hash`` array is packed into one integer: bit ``i``
    (counting from the least significant bit) is set when element ``i`` of the
    flattened array is truthy. That integer is serialised big-endian into
    ``bcount`` bytes and base64-encoded.

    :param imhash: object exposing a ``hash`` array that has ``flatten()``
    :param bcount: width of the packed value in bytes. When omitted it is
        derived from the number of hash bits (rounded up to whole bytes), so
        callers no longer have to keep it in sync with the hash size.
    :return: base64 text of the packed hash
    :raises OverflowError: if an explicit ``bcount`` is too small to hold
        the packed value
    """
    bits = imhash.hash.flatten()
    if bcount is None:
        # Smallest number of whole bytes that can hold every hash bit.
        bcount = (len(bits) + 7) // 8

    packed = sum(1 << i for i, b in enumerate(bits) if b)
    return base64.b64encode(packed.to_bytes(bcount, "big")).decode("ascii")
|
||||
|
||||
|
||||
def image_meta(url, url_idx, web):
    """Download an image and compute its size, dimensions and hashes.

    :param url: URL of the image to download
    :param url_idx: value stored under the ``"url"`` key of the result
    :param web: client whose ``get(url)`` returns a response exposing
        ``.content``, or a falsy value when the request failed
    :return: dict with the keys ``url``, ``size``, ``width``, ``height``,
        ``sha1``, ``md5``, ``crc32``, ``dhash``, ``phash``, ``ahash`` and
        ``whash``; or None when the download failed or the payload could not
        be processed as an image
    """
    r = web.get(url)
    if not r:
        # Say which URL failed instead of logging an empty message.
        logger.warning("could not download image: " + str(url))
        return None
    buf = r.content

    try:
        # Image.open() only identifies the file; pixel data is decoded later,
        # when the perceptual hashes are computed. Keeping the hash calls
        # inside the try means a truncated or corrupt image is reported and
        # skipped rather than raising out of this function.
        with Image.open(BytesIO(buf)) as im:
            return {
                "url": url_idx,
                "size": len(buf),
                "width": im.width,
                "height": im.height,
                "sha1": hashlib.sha1(buf).hexdigest(),
                "md5": hashlib.md5(buf).hexdigest(),
                "crc32": format(zlib.crc32(buf), "x"),
                "dhash": b64hash(imagehash.dhash(im, hash_size=12), 18),
                "phash": b64hash(imagehash.phash(im, hash_size=12), 18),
                "ahash": b64hash(imagehash.average_hash(im, hash_size=12), 18),
                "whash": b64hash(imagehash.whash(im, hash_size=8), 8),
            }
    except Exception as e:
        logger.warning("exception during image processing (" + str(url) + "): " + str(e))
        return None
|
||||
|
||||
|
||||
def post_process(item, board, helper, web):
|
||||
item["_v"] = 1.4
|
||||
item["_id"] = helper.item_unique_id(item, board)
|
||||
|
||||
item["_board"] = board
|
||||
@ -12,6 +79,8 @@ def post_process(item, board, helper):
|
||||
|
||||
item["_urls"] = helper.item_urls(item, board)
|
||||
|
||||
item["_img"] = [image_meta(url, i, web) for i, url in enumerate(item["_urls"]) if _is_image(url)]
|
||||
|
||||
return item
|
||||
|
||||
|
||||
|
@ -1,3 +1,5 @@
|
||||
imagehash
|
||||
Pillow
|
||||
requests
|
||||
requests[socks]
|
||||
stem
|
||||
|
34
run.py
34
run.py
@ -14,7 +14,7 @@ from chan import CHANS
|
||||
from post_process import post_process
|
||||
from util import logger, Web
|
||||
|
||||
MONITORING = False
|
||||
MONITORING = True
|
||||
|
||||
|
||||
class ChanScanner:
|
||||
@ -58,9 +58,9 @@ class ChanScanner:
|
||||
|
||||
|
||||
def once(func):
|
||||
def wrapper(item, board, helper):
|
||||
def wrapper(item, board, helper, channel, web):
|
||||
if not state.has_visited(helper.item_unique_id(item, board), helper):
|
||||
func(item, board, helper)
|
||||
func(item, board, helper, channel, web)
|
||||
state.mark_visited(helper.item_unique_id(item, board), helper)
|
||||
|
||||
return wrapper
|
||||
@ -139,10 +139,13 @@ class ChanState:
|
||||
|
||||
|
||||
def publish_worker(queue: Queue, helper):
|
||||
channel = connect()
|
||||
web = Web(monitoring if MONITORING else None)
|
||||
|
||||
while True:
|
||||
try:
|
||||
item, board = queue.get()
|
||||
publish(item, board, helper)
|
||||
publish(item, board, helper, channel, web)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(str(e) + ": " + traceback.format_exc())
|
||||
@ -151,16 +154,16 @@ def publish_worker(queue: Queue, helper):
|
||||
|
||||
|
||||
@once
|
||||
def publish(item, board, helper):
|
||||
def publish(item, board, helper, channel, web):
|
||||
item_type = helper.item_type(item)
|
||||
post_process(item, board, helper)
|
||||
post_process(item, board, helper, web)
|
||||
|
||||
while True:
|
||||
try:
|
||||
chan_channel.basic_publish(
|
||||
channel.basic_publish(
|
||||
exchange='chan',
|
||||
routing_key="%s.%s.%s" % (chan, item_type, board),
|
||||
body=json.dumps(item)
|
||||
body=json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
|
||||
)
|
||||
|
||||
if MONITORING:
|
||||
@ -179,14 +182,14 @@ def publish(item, board, helper):
|
||||
except Exception as e:
|
||||
logger.debug(traceback.format_exc())
|
||||
logger.error(str(e))
|
||||
connect()
|
||||
channel = connect()
|
||||
|
||||
|
||||
def connect():
|
||||
global chan_channel
|
||||
rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
|
||||
chan_channel = rabbit.channel()
|
||||
chan_channel.exchange_declare(exchange="chan", exchange_type="topic")
|
||||
channel = rabbit.channel()
|
||||
channel.exchange_declare(exchange="chan", exchange_type="topic")
|
||||
return channel
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -204,10 +207,9 @@ if __name__ == "__main__":
|
||||
state = ChanState()
|
||||
|
||||
publish_q = Queue()
|
||||
publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
|
||||
publish_thread.start()
|
||||
|
||||
connect()
|
||||
for _ in range(5):
|
||||
publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
|
||||
publish_thread.start()
|
||||
|
||||
s = ChanScanner(chan_helper)
|
||||
while True:
|
||||
|
Loading…
x
Reference in New Issue
Block a user