# Recovered from a git patch: commit caad7cd4584082097f4bf4d436a33404deedcf6b,
# "[PATCH] Initial commit" by simon (Mon, 22 Jan 2018 20:26:19 -0500).
# The patch creates crawler.py, templates/index.html, templates/search.html
# and webserver.py (280 insertions in total).
#
# ---- crawler.py (new file, index 479ea45) ----
"""Walk a directory tree and dump per-file metadata to a JSON file.

For every file the crawler records its MD5 checksum, containing directory,
name, size and modification time, plus the guessed MIME type and, for
readable images, the pixel dimensions.
"""
import os
import hashlib
import mimetypes
from PIL import Image
import simplejson

rootDir = "/home/simon/Documents"


# https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
def md5sum(filename, block_size=65536):
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is read in chunks of *block_size* bytes so that large files
    are never loaded into memory at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(block_size), b""):
            digest.update(block)
    return digest.hexdigest()


def _image_size(path):
    """Return ``(width, height)`` of an image, or None if it is unreadable."""
    try:
        # The context manager closes the file handle Pillow keeps open.
        with Image.open(path) as image:
            return image.size
    except (OSError, ValueError):
        return None


def _describe_file(root, filename):
    """Build the metadata dict for one file located in directory *root*."""
    full_path = os.path.join(root, filename)

    doc = {
        "md5": md5sum(full_path),
        "path": root,
        "name": filename,
        "size": os.path.getsize(full_path),
        "mtime": int(os.path.getmtime(full_path)),
    }

    mime_type = mimetypes.guess_type(full_path)[0]
    if mime_type is None:
        return doc

    if mime_type.startswith("image"):
        dimensions = _image_size(full_path)
        if dimensions is None:
            # A file that is named like an image but cannot be opened is
            # stored without any "mime" key at all.
            return doc
        doc["mime"] = mime_type
        doc["width"], doc["height"] = dimensions
    else:
        doc["mime"] = mime_type

    return doc


def crawl(root_dir, output_path="crawler.json"):
    """Collect metadata for every file below *root_dir* and save it as JSON.

    Args:
        root_dir: directory to walk recursively.
        output_path: file the JSON list of documents is written to.

    Returns:
        The list of metadata dicts that was written.

    Files that cannot be read (permission errors, broken symlinks, files
    removed while crawling) are reported and skipped instead of aborting
    the whole crawl.
    """
    docs = []

    for root, subdirs, files in os.walk(root_dir):

        print(root)

        for filename in files:
            try:
                docs.append(_describe_file(root, filename))
            except OSError as error:
                print("Skipping " + os.path.join(root, filename) + ": " + str(error))

    with open(output_path, "w") as output:
        output.write(simplejson.dumps(docs))

    return docs


if __name__ == "__main__":
    crawl(rootDir)

# Next file in the patch: diff --git a/templates/index.html
b/templates/index.html new file mode 100644 index 0000000..56fa48f --- /dev/null +++ b/templates/index.html @@ -0,0 +1,16 @@ + + + + + Title + + +
+ + + + +
+ + + \ No newline at end of file diff --git a/templates/search.html b/templates/search.html new file mode 100644 index 0000000..dc28ba9 --- /dev/null +++ b/templates/search.html @@ -0,0 +1,71 @@ + + + + + Title + + + + + +
+ + {% for doc in docs %} + {% if doc.type == "audio" %} +
+ +
+ {% else %} + +
+
+ +
+ {{doc.name}} + +
+ +
+ {% endif %} + {% endfor %} + +
# ---- webserver.py (new file in the patch, index df8f15b) ----
"""Flask front-end for searching and serving files indexed in Solr.

Routes:
    /                   search form (index.html)
    /search/            query Solr and render image/audio results
    /files/<id>/        send the original file of a Solr document
    /thumbs/<doc_id>/   send a (lazily created) thumbnail of an image document
"""
from flask import Flask, render_template, send_file, request
import pysolr
import mimetypes
import requests
import json
from PIL import Image
import os

SOLR_URL = "http://localhost:8983/solr/test/"
SOLR_TIMEOUT = 10  # seconds, shared by pysolr and the raw "get" request

THUMBNAIL_DIR = "thumbnails"
THUMBNAIL_SIZE = (1024, 1024)  # maximum (width, height) of a thumbnail

DEFAULT_QUERY = "*:*"
DEFAULT_PER_PAGE = 10

solr = pysolr.Solr(SOLR_URL, timeout=SOLR_TIMEOUT)

app = Flask(__name__)


class Document:
    """A search result as handed to the search.html template."""

    # Subclasses override this; the template branches on ``doc.type``.
    type = None

    def __init__(self, doc_id, name, path, size, md5):
        self.doc_id = doc_id
        self.name = name
        self.path = path
        self.size = size
        self.md5 = md5


class ImageDocument(Document):
    """Search result whose MIME type starts with ``image``."""

    type = "image"


class AudioClipDocument(Document):
    """Search result whose MIME type starts with ``audio``."""

    type = "audio"


def get_document(id):
    """Fetch one document from Solr's ``get`` handler.

    Returns the value of the ``"doc"`` key of the JSON response, or None
    when the response carries no such document.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-success HTTP status.
    """
    # Passing the id through ``params`` URL-encodes it, unlike string
    # concatenation.
    response = requests.get(SOLR_URL + "get", params={"id": id}, timeout=SOLR_TIMEOUT)
    response.raise_for_status()

    return json.loads(response.text).get("doc")


def _source_path(doc):
    """Return the on-disk path of the file a Solr document describes."""
    return os.path.join(doc["path"][0], doc["name"][0])


def make_thumb(doc):
    """Create the thumbnail for *doc* if it does not exist yet.

    Images larger than THUMBNAIL_SIZE in either dimension are downscaled
    and stored as JPEG (or PNG when they carry an alpha channel); smaller
    images are symlinked to the original file.

    Returns:
        The path of the thumbnail, ``thumbnails/<doc id>``.
    """
    thumb_path = THUMBNAIL_DIR + "/" + doc["id"]

    if os.path.exists(thumb_path):
        return thumb_path

    os.makedirs(THUMBNAIL_DIR, exist_ok=True)

    file_path = _source_path(doc)
    width = doc["width"][0]
    height = doc.get("height", [0])[0]

    if width > THUMBNAIL_SIZE[0] or height > THUMBNAIL_SIZE[1]:

        with Image.open(file_path) as image:
            # LANCZOS is the filter formerly exposed as ANTIALIAS.
            image.thumbnail(THUMBNAIL_SIZE, Image.LANCZOS)

            if image.mode == "RGB":
                image.save(thumb_path, "JPEG")
            elif image.mode == "RGBA":
                image.save(thumb_path, "PNG")
            else:
                # Palette, greyscale, CMYK... are normalised to RGB first.
                image.convert("RGB").save(thumb_path, "JPEG")
    else:
        print("Skipping thumbnail")
        # An absolute target keeps the link valid from inside THUMBNAIL_DIR.
        os.symlink(os.path.abspath(file_path), thumb_path)

    return thumb_path


def _thumb_mimetype(thumb_path):
    """Return the MIME type of a thumbnail by inspecting its content.

    Thumbnail paths have no file extension, so the type cannot be guessed
    from the name. Returns None when the format is unknown.
    """
    try:
        with Image.open(thumb_path) as image:
            return Image.MIME.get(image.format)
    except (OSError, ValueError):
        return None


def _document_from_result(result):
    """Convert a Solr result into a Document, or None for other MIME types."""
    mime_type = result.get("mime", [""])[0]

    if mime_type.startswith("image"):
        doc_class = ImageDocument
    elif mime_type.startswith("audio"):
        doc_class = AudioClipDocument
    else:
        return None

    return doc_class(result["id"], result["name"][0], result["path"][0],
                     result["size"], result["md5"])


@app.route("/search/")
def search():
    """Run the ``query`` request argument against Solr and render a page.

    ``page`` (zero-based) and ``per_page`` are optional; missing or
    non-numeric values fall back to 0 and DEFAULT_PER_PAGE.
    """
    query = request.args.get("query") or DEFAULT_QUERY
    page = max(request.args.get("page", 0, type=int), 0)
    per_page = max(request.args.get("per_page", DEFAULT_PER_PAGE, type=int), 1)

    results = solr.search(query, None, rows=per_page, start=per_page * page)

    docs = []
    for result in results:
        doc = _document_from_result(result)
        if doc is not None:
            docs.append(doc)

    return render_template("search.html", docs=docs)


@app.route("/")
def index():
    """Render the landing page."""
    return render_template("index.html")


@app.route("/files/<id>/")
def files(id):
    """Send the original file belonging to the Solr document *id*."""
    doc = get_document(id)

    if doc is None:
        return "File not found", 404

    file_path = _source_path(doc)
    if not os.path.isfile(file_path):
        # Indexed, but no longer present on disk.
        return "File not found", 404

    return send_file(file_path, mimetype=mimetypes.guess_type(file_path)[0])


@app.route("/thumbs/<doc_id>/")
def thumbs(doc_id):
    """Send the thumbnail of the image document *doc_id*, creating it if needed."""
    doc = get_document(doc_id)

    # Only documents indexed with image dimensions can be thumbnailed.
    if doc is None or "width" not in doc:
        return "File not found", 404

    if not os.path.isfile(_source_path(doc)):
        return "File not found", 404

    thumb_path = make_thumb(doc)

    return send_file(thumb_path, mimetype=_thumb_mimetype(thumb_path))


if __name__ == "__main__":
    app.run("0.0.0.0", 8080)