mirror of
https://github.com/simon987/chan_feed.git
synced 2025-04-24 12:15:50 +00:00
download & save image metadata
This commit is contained in:
parent
c08e5a0e41
commit
5441ea0809
6
chan.py
6
chan.py
@ -87,8 +87,6 @@ class HtmlChanHelper(ChanHelper):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def item_mtime(item):
|
def item_mtime(item):
|
||||||
if item is None:
|
|
||||||
return int(datetime.datetime.now().timestamp())
|
|
||||||
print(item)
|
print(item)
|
||||||
exit(0)
|
exit(0)
|
||||||
return 0 # TODO
|
return 0 # TODO
|
||||||
@ -114,12 +112,12 @@ class HtmlChanHelper(ChanHelper):
|
|||||||
|
|
||||||
op_el = soup.find("div", attrs={"class": "innerOP"})
|
op_el = soup.find("div", attrs={"class": "innerOP"})
|
||||||
yield {
|
yield {
|
||||||
"id": int(soup.find("div", attrs={"class": "opCell"}).get("id")),
|
"id": int(soup.find("div", class_="opCell").get("id")),
|
||||||
"type": "thread",
|
"type": "thread",
|
||||||
"html": str(op_el),
|
"html": str(op_el),
|
||||||
}
|
}
|
||||||
|
|
||||||
for post_el in soup.find_all("div", attrs={"class": "postCell"}):
|
for post_el in soup.find_all("div", class_="postCell"):
|
||||||
yield {
|
yield {
|
||||||
"id": int(post_el.get("id")),
|
"id": int(post_el.get("id")),
|
||||||
"type": "post",
|
"type": "post",
|
||||||
|
@ -1,10 +1,77 @@
|
|||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
import re
|
import re
|
||||||
|
import zlib
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import imagehash
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from util import logger
|
||||||
|
|
||||||
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
||||||
|
|
||||||
|
# URL suffixes treated as images. The tuple is passed directly to
# str.endswith() by _is_image(), which lower-cases the URL first, so every
# entry here must be lowercase.
IMAGE_FILETYPES = (
    # The ":orig" variants match Twitter CDN URLs, which append ":orig"
    # after the file extension.
    '.jpg',
    '.jpg:orig',
    '.jpeg',
    '.jpeg:orig',
    '.png',
    '.png:orig',
    '.gif',
    '.gif:orig',
    '.tiff',
    '.bmp',
    '.webp'
)
|
||||||
|
|
||||||
def post_process(item, board, helper):
|
|
||||||
item["_v"] = 1.3
|
def _is_image(url):
    """Return True when ``url`` ends with one of the IMAGE_FILETYPES suffixes.

    The comparison is case-insensitive: the URL is lower-cased before the
    suffix check.
    """
    lowered = url.lower()
    return lowered.endswith(IMAGE_FILETYPES)
|
||||||
|
|
||||||
|
|
||||||
|
def b64hash(imhash, bcount=None):
    """Encode a perceptual hash as a base64 ASCII string.

    The hash bits are read from ``imhash.hash.flatten()``; element ``i`` of
    the flattened sequence becomes bit ``i`` of an integer (element 0 is the
    least significant bit). That integer is serialized big-endian into
    ``bcount`` bytes and base64-encoded.

    :param imhash: object exposing ``.hash.flatten()``, an iterable of
        truthy/falsy bit values.
    :param bcount: number of bytes to serialize the hash into. When omitted
        (None) the smallest byte count that holds every bit is used.
    :return: base64 text of the packed hash.
    :raises OverflowError: if an explicit ``bcount`` is too small for the
        bits that are set.
    """
    bits = list(imhash.hash.flatten())
    if bcount is None:
        # Round the bit count up to a whole number of bytes.
        bcount = (len(bits) + 7) // 8

    value = sum(1 << i for i, bit in enumerate(bits) if bit)
    return base64.b64encode(value.to_bytes(bcount, "big")).decode("ascii")
|
||||||
|
|
||||||
|
|
||||||
|
def image_meta(url, url_idx, web):
    """Download an image and compute its size, dimensions and hashes.

    :param url: URL of the image to download.
    :param url_idx: value stored under the ``"url"`` key of the result;
        post_process() passes the index of ``url`` within ``item["_urls"]``.
    :param web: object whose ``get(url)`` returns a response exposing the
        body as ``.content``, or a falsy value when the download failed.
    :return: dict with the keys ``url``, ``size``, ``width``, ``height``,
        ``sha1``, ``md5``, ``crc32``, ``dhash``, ``phash``, ``ahash`` and
        ``whash``; or None when the download fails or the data cannot be
        processed as an image.
    """
    r = web.get(url)
    if not r:
        # Name the URL so the failure can be traced from the logs.
        logger.warning("failed to download image: " + str(url))
        return None
    buf = r.content

    try:
        # Pillow opens images lazily: the pixel data is only decoded when the
        # hash functions read it, so the hashing has to sit inside the same
        # try block as Image.open() for corrupt images to be caught.
        with Image.open(BytesIO(buf)) as im:
            return {
                "url": url_idx,
                "size": len(buf),
                "width": im.width,
                "height": im.height,
                "sha1": hashlib.sha1(buf).hexdigest(),
                "md5": hashlib.md5(buf).hexdigest(),
                "crc32": format(zlib.crc32(buf), "x"),
                # hash_size=12 gives 12 * 12 = 144 bits -> 18 bytes
                "dhash": b64hash(imagehash.dhash(im, hash_size=12), 18),
                "phash": b64hash(imagehash.phash(im, hash_size=12), 18),
                "ahash": b64hash(imagehash.average_hash(im, hash_size=12), 18),
                # hash_size=8 gives 8 * 8 = 64 bits -> 8 bytes
                "whash": b64hash(imagehash.whash(im, hash_size=8), 8),
            }
    except Exception as e:
        # Best effort: a single bad image must not abort the whole item.
        logger.warning("exception during image processing (" + str(url) + "): " + str(e))
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def post_process(item, board, helper, web):
|
||||||
|
item["_v"] = 1.4
|
||||||
item["_id"] = helper.item_unique_id(item, board)
|
item["_id"] = helper.item_unique_id(item, board)
|
||||||
|
|
||||||
item["_board"] = board
|
item["_board"] = board
|
||||||
@ -12,6 +79,8 @@ def post_process(item, board, helper):
|
|||||||
|
|
||||||
item["_urls"] = helper.item_urls(item, board)
|
item["_urls"] = helper.item_urls(item, board)
|
||||||
|
|
||||||
|
item["_img"] = [image_meta(url, i, web) for i, url in enumerate(item["_urls"]) if _is_image(url)]
|
||||||
|
|
||||||
return item
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
imagehash
|
||||||
|
Pillow
|
||||||
requests
|
requests
|
||||||
requests[socks]
|
requests[socks]
|
||||||
stem
|
stem
|
||||||
|
34
run.py
34
run.py
@ -14,7 +14,7 @@ from chan import CHANS
|
|||||||
from post_process import post_process
|
from post_process import post_process
|
||||||
from util import logger, Web
|
from util import logger, Web
|
||||||
|
|
||||||
MONITORING = False
|
MONITORING = True
|
||||||
|
|
||||||
|
|
||||||
class ChanScanner:
|
class ChanScanner:
|
||||||
@ -58,9 +58,9 @@ class ChanScanner:
|
|||||||
|
|
||||||
|
|
||||||
def once(func):
|
def once(func):
|
||||||
def wrapper(item, board, helper):
|
def wrapper(item, board, helper, channel, web):
|
||||||
if not state.has_visited(helper.item_unique_id(item, board), helper):
|
if not state.has_visited(helper.item_unique_id(item, board), helper):
|
||||||
func(item, board, helper)
|
func(item, board, helper, channel, web)
|
||||||
state.mark_visited(helper.item_unique_id(item, board), helper)
|
state.mark_visited(helper.item_unique_id(item, board), helper)
|
||||||
|
|
||||||
return wrapper
|
return wrapper
|
||||||
@ -139,10 +139,13 @@ class ChanState:
|
|||||||
|
|
||||||
|
|
||||||
def publish_worker(queue: Queue, helper):
|
def publish_worker(queue: Queue, helper):
|
||||||
|
channel = connect()
|
||||||
|
web = Web(monitoring if MONITORING else None)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
item, board = queue.get()
|
item, board = queue.get()
|
||||||
publish(item, board, helper)
|
publish(item, board, helper, channel, web)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(str(e) + ": " + traceback.format_exc())
|
logger.error(str(e) + ": " + traceback.format_exc())
|
||||||
@ -151,16 +154,16 @@ def publish_worker(queue: Queue, helper):
|
|||||||
|
|
||||||
|
|
||||||
@once
|
@once
|
||||||
def publish(item, board, helper):
|
def publish(item, board, helper, channel, web):
|
||||||
item_type = helper.item_type(item)
|
item_type = helper.item_type(item)
|
||||||
post_process(item, board, helper)
|
post_process(item, board, helper, web)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
chan_channel.basic_publish(
|
channel.basic_publish(
|
||||||
exchange='chan',
|
exchange='chan',
|
||||||
routing_key="%s.%s.%s" % (chan, item_type, board),
|
routing_key="%s.%s.%s" % (chan, item_type, board),
|
||||||
body=json.dumps(item)
|
body=json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
if MONITORING:
|
if MONITORING:
|
||||||
@ -179,14 +182,14 @@ def publish(item, board, helper):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(traceback.format_exc())
|
logger.debug(traceback.format_exc())
|
||||||
logger.error(str(e))
|
logger.error(str(e))
|
||||||
connect()
|
channel = connect()
|
||||||
|
|
||||||
|
|
||||||
def connect():
|
def connect():
|
||||||
global chan_channel
|
|
||||||
rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
|
rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
|
||||||
chan_channel = rabbit.channel()
|
channel = rabbit.channel()
|
||||||
chan_channel.exchange_declare(exchange="chan", exchange_type="topic")
|
channel.exchange_declare(exchange="chan", exchange_type="topic")
|
||||||
|
return channel
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -204,10 +207,9 @@ if __name__ == "__main__":
|
|||||||
state = ChanState()
|
state = ChanState()
|
||||||
|
|
||||||
publish_q = Queue()
|
publish_q = Queue()
|
||||||
publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
|
for _ in range(5):
|
||||||
publish_thread.start()
|
publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
|
||||||
|
publish_thread.start()
|
||||||
connect()
|
|
||||||
|
|
||||||
s = ChanScanner(chan_helper)
|
s = ChanScanner(chan_helper)
|
||||||
while True:
|
while True:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user