diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..c8273a1
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+spec/test_folder/* linguist-vendored
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler.py b/crawler.py
index 479ea45..79de786 100644
--- a/crawler.py
+++ b/crawler.py
@@ -1,64 +1,152 @@
 import os
 import hashlib
-import mimetypes
-from PIL import Image
-import simplejson
-
-rootDir = "/home/simon/Documents"
-# https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
-def md5sum(filename, block_size=65536):
-    hash = hashlib.md5()
-    with open(filename, "rb") as f:
-        for block in iter(lambda: f.read(block_size), b""):
-            hash.update(block)
-    return hash.hexdigest()
+class Crawler:
+    pass
-def crawl(root_dir):
-
-    docs = []
-
-    for root, subdirs, files in os.walk(root_dir):
-
-        print(root)
-
-        for filename in files:
-            full_path = os.path.join(root, filename)
-
-            doc = dict()
-
-            doc["md5"] = md5sum(os.path.join(root, filename))
-            doc["path"] = root
-            doc["name"] = filename
-            doc["size"] = os.path.getsize(full_path)
-            doc["mtime"] = int(os.path.getmtime(full_path))
-
-            mime_type = mimetypes.guess_type(full_path)[0]
-
-            if mime_type is not None:
-
-                doc["mime"] = mime_type
-
-                if mime_type.startswith("image"):
-                    try:
-                        width, height = Image.open(full_path).size
-
-                        doc["width"] = width
-                        doc["height"] = height
-                    except OSError:
-                        doc.pop('mime', None)
-                        pass
-                    except ValueError:
-                        doc.pop('mime', None)
-                        pass
-
-            docs.append(doc)
-
-    file = open("crawler.json", "w")
-    file.write(simplejson.dumps(docs))
-    file.close()
+class FileParser:
+    pass
-crawl(rootDir)
\ No newline at end of file
+class CheckSumCalculator:
+
+    def checksum(self, path: str) -> str:
+        """
+        Calculate the checksum of a file
+        :param path: path of the file
+        :return: checksum
+        """
+        raise NotImplementedError()
+
+
+class Md5CheckSumCalculator(CheckSumCalculator):
+
+    def __init__(self):
+        self.name = "md5"
+
+    def checksum(self, path: str) -> str:
+        """
+        Calculate the md5 checksum of a file
+        :param path: path of the file
+        :return: md5 checksum
+        """
+        result = hashlib.md5()
+
+        with open(path, "rb") as f:
+            for block in iter(lambda: f.read(65536), b""):
+                result.update(block)
+
+        return result.hexdigest().upper()
+
+
+class Sha1CheckSumCalculator(CheckSumCalculator):
+
+    def __init__(self):
+        self.name = "sha1"
+
+    def checksum(self, path: str) -> str:
+        """
+        Calculate the sha1 checksum of a file
+        :param path: path of the file
+        :return: sha1 checksum
+        """
+        result = hashlib.sha1()
+
+        with open(path, "rb") as f:
+            for block in iter(lambda: f.read(65536), b""):
+                result.update(block)
+
+        return result.hexdigest().upper()
+
+
+class Sha256CheckSumCalculator(CheckSumCalculator):
+
+    def __init__(self):
+        self.name = "sha256"
+
+    def checksum(self, path: str) -> str:
+        """
+        Calculate the sha256 checksum of a file
+        :param path: path of the file
+        :return: sha256 checksum
+        """
+        result = hashlib.sha256()
+
+        with open(path, "rb") as f:
+            for block in iter(lambda: f.read(65536), b""):
+                result.update(block)
+
+        return result.hexdigest().upper()
+
+
+class GenericFileParser(FileParser):
+
+    def __init__(self, checksum_calculators: list):
+        self.checksum_calculators = checksum_calculators
+
+    def parse(self, path: str) -> dict:
+        """
+        Parse a generic file
+        :param path: path of the file to parse
+        :return: dict information about the file
+        """
+
+        info = dict()
+
+        info["size"] = os.path.getsize(path)
+        info["name"] = os.path.splitext(path)[0]
+
+        for calculator in self.checksum_calculators:
+            info[calculator.name] = calculator.checksum(path)
+
+        return info
+
+
+
+
+# def crawl(root_dir: str) -> None:
+#     docs = []
+#
+#     for root, dirs, files in os.walk(root_dir):
+#
+#         print(root)
+#
+#         for filename in files:
+#             full_path = os.path.join(root, filename)
+#
+#             doc = dict()
+#
+#             doc["md5"] = md5sum(full_path)
+#             doc["path"] = root
+#             doc["name"] = filename
+#             doc["size"] = os.path.getsize(full_path)
+#             doc["mtime"] = int(os.path.getmtime(full_path))
+#
+#             mime_type = mimetypes.guess_type(full_path)[0]
+#
+#             if mime_type is not None:
+#
+#                 doc["mime"] = mime_type
+#
+#                 if mime_type.startswith("image"):
+#                     try:
+#                         width, height = Image.open(full_path).size
+#
+#                         doc["width"] = width
+#                         doc["height"] = height
+#                     except OSError:
+#                         doc.pop('mime', None)
+#                         pass
+#                     except ValueError:
+#                         doc.pop('mime', None)
+#                         pass
+#
+#             docs.append(doc)
+#
+#     file = open("crawler.json", "w")
+#     file.write(simplejson.dumps(docs))
+#     file.close()
+#
+#
\ No newline at end of file
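For illustration, here is roughly how the new checksum calculators and GenericFileParser are meant to compose. This is only a sketch and not part of the change: the choice of calculators is arbitrary, and the sample path just points at a file that happens to exist under spec/test_folder/.

    # Sketch only: compose the parser with whichever calculators are needed.
    from crawler import GenericFileParser, Md5CheckSumCalculator, Sha1CheckSumCalculator

    parser = GenericFileParser([Md5CheckSumCalculator(), Sha1CheckSumCalculator()])
    info = parser.parse("spec/test_folder/books.csv")

    print(info["size"])   # file size in bytes
    print(info["name"])   # the path with its extension stripped (see parse() above)
    print(info["md5"])    # upper-case hex digest, keyed by each calculator's .name
    print(info["sha1"])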
diff --git a/database.sql b/database.sql
new file mode 100644
index 0000000..df72374
--- /dev/null
+++ b/database.sql
@@ -0,0 +1,40 @@
+PRAGMA FOREIGN_KEYS = ON;
+
+-- Represents a directory and its sub-directories
+CREATE TABLE Directory (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    path TEXT UNIQUE,
+    enabled BOOLEAN
+);
+
+-- Represents a queued task for crawling a Directory or generating thumbnails
+CREATE TABLE Task (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    directory_id INTEGER,
+    task_type INTEGER,
+    completed BOOLEAN DEFAULT 0,
+    completed_time DATETIME,
+    FOREIGN KEY (directory_id) REFERENCES Directory(id)
+);
+
+-- You can set an option on a directory to change the crawler's behavior
+CREATE TABLE Option (
+    name STRING,
+    directory_id INTEGER,
+    FOREIGN KEY (directory_id) REFERENCES Directory(id),
+    PRIMARY KEY (name, directory_id)
+);
+
+-- User accounts
+CREATE TABLE User (
+    username TEXT PRIMARY KEY,
+    password TEXT,
+    is_admin BOOLEAN
+);
+
+CREATE TABLE User_canRead_Directory (
+    username TEXT,
+    directory_id INTEGER,
+    PRIMARY KEY (username, directory_id)
+);
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5b84407
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+Pillow
+simplejson
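The schema above is consumed through sqlite3 in storage.py later in this diff; a minimal sketch of the intended flow is below (the directory path and option name are made-up examples, and the snippet assumes it runs from the repository root next to database.sql).

    # Sketch: create the tables defined above, then register a directory with an option.
    import sqlite3

    conn = sqlite3.connect("local_storage.db")
    with open("database.sql") as f:
        conn.executescript(f.read())

    conn.execute("PRAGMA FOREIGN_KEYS = ON;")
    conn.execute("INSERT INTO Directory (path, enabled) VALUES (?, ?)", ("/some/directory", True))
    dir_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
    conn.execute("INSERT INTO Option (name, directory_id) VALUES (?, ?)", ("NoThumbnails", dir_id))
    conn.commit()
    conn.close()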
file_path = doc["path"][0] + "/" + doc["name"][0] +# +# if doc["width"][0] > size[0]: +# +# image = Image.open(file_path) +# image.thumbnail(size, Image.ANTIALIAS) +# +# if image.mode == "RGB": +# image.save(thumb_path, "JPEG") +# elif image.mode == "RGBA": +# image.save(thumb_path, "PNG") +# else: +# image = image.convert("RGB") +# image.save(thumb_path, "JPEG") +# else: +# print("Skipping thumbnail") +# os.symlink(file_path, thumb_path) +# +# return "thumbnails/" + doc["id"] +# +# +# @app.route("/search/") +# def search(): +# +# query = request.args.get("query") +# page = int(request.args.get("page")) +# per_page = int(request.args.get("per_page")) +# +# results = solr.search(query, None, rows=per_page, start=per_page * page) +# +# docs = [] +# for r in results: +# +# if "mime" in r: +# mime_type = r["mime"][0] +# else: +# mime_type = "" +# +# if mime_type.startswith("image"): +# docs.append(ImageDocument(r["id"], r["name"][0], r["path"][0], r["size"], r["md5"])) +# +# elif mime_type.startswith("audio"): +# docs.append(AudioClipDocument(r["id"], r["name"][0], r["path"][0], r["size"], r["md5"])) +# +# return render_template("search.html", docs=docs) +# +# +# @app.route("/") +# def index(): +# return render_template("index.html") +# +# +# @app.route("/files//") +# def files(id): +# +# doc = get_document(id) +# +# if doc is not None: +# file_path = doc["path"][0] + "/" + doc["name"][0] +# return send_file(file_path, mimetype=mimetypes.guess_type(file_path)[0]) +# else: +# return "File not found" +# +# +# @app.route("/thumbs//") +# def thumbs(doc_id): +# +# doc = get_document(doc_id) +# +# if doc is not None: +# +# thumb_path = make_thumb(doc) +# +# return send_file("thumbnails/" + doc_id, mimetype=mimetypes.guess_type(thumb_path)[0]) +# else: +# return "File not found" + + +@app.route("/") +def tmp_route(): + return "test" + + +if __name__ == "__main__": + app.run("0.0.0.0", 8080) diff --git a/setupDb.sh b/setupDb.sh new file mode 100755 index 0000000..15b8e89 --- /dev/null +++ b/setupDb.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +rm test.db +sqlite3 local_storage.db -init "database.sql" \ No newline at end of file diff --git a/spec/Crawler_spec.py b/spec/Crawler_spec.py new file mode 100644 index 0000000..7cd74ae --- /dev/null +++ b/spec/Crawler_spec.py @@ -0,0 +1,10 @@ +from unittest import TestCase + +from crawler import Crawler + + +class CrawlerTest(TestCase): + + def test_dir_walk(self): + c = Crawler() + diff --git a/spec/FileParser_spec.py b/spec/FileParser_spec.py new file mode 100644 index 0000000..e09ae05 --- /dev/null +++ b/spec/FileParser_spec.py @@ -0,0 +1,131 @@ +import os +from unittest import TestCase + +from crawler import GenericFileParser, Md5CheckSumCalculator, Sha1CheckSumCalculator, Sha256CheckSumCalculator + + +class GenericFileParserTest(TestCase): + + def setUp(self): + if os.path.exists("test_parse"): + os.remove("test_parse") + + test_file = open("test_parse", "w") + test_file.write("12345678") + test_file.close() + + self.parser = GenericFileParser([Md5CheckSumCalculator()]) + + def tearDown(self): + os.remove("test_parse") + + def test_parse_size(self): + result = self.parser.parse("test_parse") + + self.assertEqual(result["size"], 8) + + def test_parse_name(self): + result = self.parser.parse("test_parse") + + self.assertEqual(result["name"], "test_parse") + + def test_parse_md5(self): + result = self.parser.parse("test_parse") + + self.assertEqual(result["md5"], "25D55AD283AA400AF464C76D713C07AD") + + +class Md5CheckSumCalculatorTest(TestCase): + + def setUp(self): + 
if os.path.exists("test_md5_1"): + os.remove("test_md5_1") + + test_file = open("test_md5_1", "w") + test_file.write("789456123") + test_file.close() + + if os.path.exists("test_md5_2"): + os.remove("test_md5_2") + + test_file = open("test_md5_2", "w") + test_file.write("cj3w97n7RY378WRXEN68W7RExnw6nr8276b473824") + test_file.close() + + self.calculator = Md5CheckSumCalculator() + + def tearDown(self): + os.remove("test_md5_1") + os.remove("test_md5_2") + + def test_md5_checksum(self): + + result = self.calculator.checksum("test_md5_1") + self.assertEqual(result, "9FAB6755CD2E8817D3E73B0978CA54A6") + + result = self.calculator.checksum("test_md5_2") + self.assertEqual(result, "39A1AADE23E33A7F37C11C7FF9CDC9EC") + + +class Sha1CheckSumCalculatorTest(TestCase): + + def setUp(self): + if os.path.exists("test_sha1_1"): + os.remove("test_sha1_1") + + test_file = open("test_sha1_1", "w") + test_file.write("sxjkneycbu") + test_file.close() + + if os.path.exists("test_sha1_2"): + os.remove("test_sha1_2") + + test_file = open("test_sha1_2", "w") + test_file.write("xoimoqxy38e") + test_file.close() + + self.calculator = Sha1CheckSumCalculator() + + def tearDown(self): + os.remove("test_sha1_1") + os.remove("test_sha1_2") + + def test_md5_checksum(self): + + result = self.calculator.checksum("test_sha1_1") + self.assertEqual(result, "A80315387730DB5743061F397EB66DE0DDAE19E5") + + result = self.calculator.checksum("test_sha1_2") + self.assertEqual(result, "E7B5A2B6F6838E766A0BC7E558F640726D70A8D6") + + +class Sha256CheckSumCalculatorTest(TestCase): + + def setUp(self): + if os.path.exists("test_sha256_1"): + os.remove("test_sha256_1") + + test_file = open("test_sha256_1", "w") + test_file.write("eaur5t84nc7i") + test_file.close() + + if os.path.exists("test_sha256_2"): + os.remove("test_sha256_2") + + test_file = open("test_sha256_2", "w") + test_file.write("xkwerci47ixryw7r6wxadwd") + test_file.close() + + self.calculator = Sha256CheckSumCalculator() + + def tearDown(self): + os.remove("test_sha256_1") + os.remove("test_sha256_2") + + def test_md5_checksum(self): + + result = self.calculator.checksum("test_sha256_1") + self.assertEqual(result, "DA7606DC763306B700685A71E2E72A2D95F1291209E5DA344B82DA2508FC27C5") + + result = self.calculator.checksum("test_sha256_2") + self.assertEqual(result, "C39C7E0E7D84C9692F3C9C22E1EA0327DEBF1BF531B5738EEA8E79FE27EBC570") \ No newline at end of file diff --git a/spec/LocalStorage_spec.py b/spec/LocalStorage_spec.py new file mode 100644 index 0000000..56063d6 --- /dev/null +++ b/spec/LocalStorage_spec.py @@ -0,0 +1,47 @@ +from unittest import TestCase + +from storage import LocalStorage, Directory, DuplicateDirectoryException + + +class LocalStorageTest(TestCase): + + def setUp(self): + + s = LocalStorage() + s.init_db("../database.sql") + + def test_save_and_retrieve_dir(self): + + storage = LocalStorage() + + d = Directory("/some/directory", True, ["opt1", "opt2", "opt3"]) + + storage.save_directory(d) + + self.assertEqual(storage.dirs()["/some/directory"].enabled, True) + self.assertEqual(storage.dirs()["/some/directory"].options[0], "opt1") + + def test_save_and_retrieve_dir_persistent(self): + + s1 = LocalStorage() + + d = Directory("/some/directory", True, ["opt1", "opt2", "opt3"]) + + s1.save_directory(d) + + s2 = LocalStorage() + self.assertEqual(s2.dirs()["/some/directory"].enabled, True) + self.assertEqual(s2.dirs()["/some/directory"].options[0], "opt1") + + def test_reject_duplicate_path(self): + + s = LocalStorage() + + d1 = Directory("/some/directory", 
True, ["opt1", "opt2"]) + d2 = Directory("/some/directory", True, ["opt1", "opt2"]) + + s.save_directory(d1) + + with self.assertRaises(DuplicateDirectoryException) as e: + s.save_directory(d2) + diff --git a/spec/test_folder/books.csv b/spec/test_folder/books.csv new file mode 100644 index 0000000..8ccecbb --- /dev/null +++ b/spec/test_folder/books.csv @@ -0,0 +1,11 @@ +id,cat,name,price,inStock,author,series_t,sequence_i,genre_s +0553573403,book,A Game of Thrones,7.99,true,George R.R. Martin,"A Song of Ice and Fire",1,fantasy +0553579908,book,A Clash of Kings,7.99,true,George R.R. Martin,"A Song of Ice and Fire",2,fantasy +055357342X,book,A Storm of Swords,7.99,true,George R.R. Martin,"A Song of Ice and Fire",3,fantasy +0553293354,book,Foundation,7.99,true,Isaac Asimov,Foundation Novels,1,scifi +0812521390,book,The Black Company,6.99,false,Glen Cook,The Chronicles of The Black Company,1,fantasy +0812550706,book,Ender's Game,6.99,true,Orson Scott Card,Ender,1,scifi +0441385532,book,Jhereg,7.95,false,Steven Brust,Vlad Taltos,1,fantasy +0380014300,book,Nine Princes In Amber,6.99,true,Roger Zelazny,the Chronicles of Amber,1,fantasy +0805080481,book,The Book of Three,5.99,true,Lloyd Alexander,The Chronicles of Prydain,1,fantasy +080508049X,book,The Black Cauldron,5.99,true,Lloyd Alexander,The Chronicles of Prydain,2,fantasy diff --git a/spec/test_folder/books.json b/spec/test_folder/books.json new file mode 100644 index 0000000..f82d510 --- /dev/null +++ b/spec/test_folder/books.json @@ -0,0 +1,51 @@ +[ + { + "id" : "978-0641723445", + "cat" : ["book","hardcover"], + "name" : "The Lightning Thief", + "author" : "Rick Riordan", + "series_t" : "Percy Jackson and the Olympians", + "sequence_i" : 1, + "genre_s" : "fantasy", + "inStock" : true, + "price" : 12.50, + "pages_i" : 384 + } +, + { + "id" : "978-1423103349", + "cat" : ["book","paperback"], + "name" : "The Sea of Monsters", + "author" : "Rick Riordan", + "series_t" : "Percy Jackson and the Olympians", + "sequence_i" : 2, + "genre_s" : "fantasy", + "inStock" : true, + "price" : 6.49, + "pages_i" : 304 + } +, + { + "id" : "978-1857995879", + "cat" : ["book","paperback"], + "name" : "Sophie's World : The Greek Philosophers", + "author" : "Jostein Gaarder", + "sequence_i" : 1, + "genre_s" : "fantasy", + "inStock" : true, + "price" : 3.07, + "pages_i" : 64 + } +, + { + "id" : "978-1933988177", + "cat" : ["book","paperback"], + "name" : "Lucene in Action, Second Edition", + "author" : "Michael McCandless", + "sequence_i" : 1, + "genre_s" : "IT", + "inStock" : true, + "price" : 30.50, + "pages_i" : 475 + } +] diff --git a/spec/test_folder/gb18030-example.xml b/spec/test_folder/gb18030-example.xml new file mode 100644 index 0000000..01743d3 --- /dev/null +++ b/spec/test_folder/gb18030-example.xml @@ -0,0 +1,32 @@ + + + + + + GB18030TEST + Test with some GB18030 encoded characters + No accents here + 这是一个功能 + This is a feature (translated) + 这份文件是很有光泽 + This document is very shiny (translated) + 0.0 + true + + + diff --git a/spec/test_folder/hd.xml b/spec/test_folder/hd.xml new file mode 100644 index 0000000..9cf7d1b --- /dev/null +++ b/spec/test_folder/hd.xml @@ -0,0 +1,56 @@ + + + + + SP2514N + Samsung SpinPoint P120 SP2514N - hard drive - 250 GB - ATA-133 + Samsung Electronics Co. Ltd. 
+ + samsung + electronics + hard drive + 7200RPM, 8MB cache, IDE Ultra ATA-133 + NoiseGuard, SilentSeek technology, Fluid Dynamic Bearing (FDB) motor + 92.0 + 6 + true + 2006-02-13T15:26:37Z + + 35.0752,-97.032 + + + + 6H500F0 + Maxtor DiamondMax 11 - hard drive - 500 GB - SATA-300 + Maxtor Corp. + + maxtor + electronics + hard drive + SATA 3.0Gb/s, NCQ + 8.5ms seek + 16MB cache + 350.0 + 6 + true + + 45.17614,-93.87341 + 2006-02-13T15:26:37Z + + + diff --git a/spec/test_folder/ipod_other.xml b/spec/test_folder/ipod_other.xml new file mode 100644 index 0000000..3de32f3 --- /dev/null +++ b/spec/test_folder/ipod_other.xml @@ -0,0 +1,60 @@ + + + + + + F8V7067-APL-KIT + Belkin Mobile Power Cord for iPod w/ Dock + Belkin + + belkin + electronics + connector + car power adapter, white + 4.0 + 19.95 + 1 + false + + 45.18014,-93.87741 + 2005-08-01T16:30:25Z + + + + IW-02 + iPod & iPod Mini USB 2.0 Cable + Belkin + + belkin + electronics + connector + car power adapter for iPod, white + 2.0 + 11.50 + 1 + false + + 37.7752,-122.4232 + 2006-02-14T23:55:59Z + + + + + + + diff --git a/spec/test_folder/ipod_video.xml b/spec/test_folder/ipod_video.xml new file mode 100644 index 0000000..1ca5f6f --- /dev/null +++ b/spec/test_folder/ipod_video.xml @@ -0,0 +1,40 @@ + + + + MA147LL/A + Apple 60 GB iPod with Video Playback Black + Apple Computer Inc. + + apple + electronics + music + iTunes, Podcasts, Audiobooks + Stores up to 15,000 songs, 25,000 photos, or 150 hours of video + 2.5-inch, 320x240 color TFT LCD display with LED backlight + Up to 20 hours of battery life + Plays AAC, MP3, WAV, AIFF, Audible, Apple Lossless, H.264 video + Notes, Calendar, Phone book, Hold button, Date display, Photo wallet, Built-in games, JPEG photo playback, Upgradeable firmware, USB 2.0 compatibility, Playback speed control, Rechargeable capability, Battery level indication + earbud headphones, USB cable + 5.5 + 399.00 + 10 + true + + 37.7752,-100.0232 + 2005-10-12T08:00:00Z + diff --git a/spec/test_folder/monitor2.xml b/spec/test_folder/monitor2.xml new file mode 100644 index 0000000..eaf9e22 --- /dev/null +++ b/spec/test_folder/monitor2.xml @@ -0,0 +1,33 @@ + + + + VA902B + ViewSonic VA902B - flat panel display - TFT - 19" + ViewSonic Corp. + + viewsonic + electronics and stuff2 + 19" TFT active matrix LCD, 8ms response time, 1280 x 1024 native resolution + 190.4 + 279.95 + 6 + true + + 45.18814,-93.88541 + + diff --git a/spec/test_folder/more_books.jsonl b/spec/test_folder/more_books.jsonl new file mode 100644 index 0000000..a48ad1e --- /dev/null +++ b/spec/test_folder/more_books.jsonl @@ -0,0 +1,3 @@ +{"id":"0060248025","name":"Falling Up","inStock": true,"author": "Shel Silverstein"} +{"id":"0679805273","name":"Oh, The Places You'll Go","inStock": true,"author": "Dr. Seuss"} + diff --git a/spec/test_folder/mp500.xml b/spec/test_folder/mp500.xml new file mode 100644 index 0000000..a8f51b6 --- /dev/null +++ b/spec/test_folder/mp500.xml @@ -0,0 +1,43 @@ + + + + 0579B002 + Canon PIXMA MP500 All-In-One Photo Printer + Canon Inc. 
+ + canon + electronics + multifunction printer + printer + scanner + copier + Multifunction ink-jet color photo printer + Flatbed scanner, optical scan resolution of 1,200 x 2,400 dpi + 2.5" color LCD preview screen + Duplex Copying + Printing speed up to 29ppm black, 19ppm color + Hi-Speed USB + memory card: CompactFlash, Micro Drive, SmartMedia, Memory Stick, Memory Stick Pro, SD Card, and MultiMediaCard + 352.0 + 179.99 + 6 + true + + 45.19214,-93.89941 + + diff --git a/spec/test_folder/post.jar b/spec/test_folder/post.jar new file mode 100644 index 0000000..56f12b4 Binary files /dev/null and b/spec/test_folder/post.jar differ diff --git a/spec/test_folder/sample.html b/spec/test_folder/sample.html new file mode 100644 index 0000000..656b656 --- /dev/null +++ b/spec/test_folder/sample.html @@ -0,0 +1,13 @@ + + + Welcome to Solr + + +

+ Here is some text +

+

distinct
words

+
Here is some text in a div
+
This has a link.
+ + diff --git a/spec/test_folder/sample_1.jpg b/spec/test_folder/sample_1.jpg new file mode 100644 index 0000000..8aefd60 Binary files /dev/null and b/spec/test_folder/sample_1.jpg differ diff --git a/spec/test_folder/sample_2.jpeg b/spec/test_folder/sample_2.jpeg new file mode 100644 index 0000000..fe678e9 Binary files /dev/null and b/spec/test_folder/sample_2.jpeg differ diff --git a/spec/test_folder/sample_3.jpg b/spec/test_folder/sample_3.jpg new file mode 100644 index 0000000..e7b1cb6 Binary files /dev/null and b/spec/test_folder/sample_3.jpg differ diff --git a/spec/test_folder/sample_4.jpg b/spec/test_folder/sample_4.jpg new file mode 100644 index 0000000..400ea91 Binary files /dev/null and b/spec/test_folder/sample_4.jpg differ diff --git a/spec/test_folder/solr-word.pdf b/spec/test_folder/solr-word.pdf new file mode 100644 index 0000000..bd8b865 Binary files /dev/null and b/spec/test_folder/solr-word.pdf differ diff --git a/spec/test_folder/solr.xml b/spec/test_folder/solr.xml new file mode 100644 index 0000000..a365617 --- /dev/null +++ b/spec/test_folder/solr.xml @@ -0,0 +1,38 @@ + + + + + SOLR1000 + Solr, the Enterprise Search Server + Apache Software Foundation + software + search + Advanced Full-Text Search Capabilities using Lucene + Optimized for High Volume Web Traffic + Standards Based Open Interfaces - XML and HTTP + Comprehensive HTML Administration Interfaces + Scalability - Efficient Replication to other Solr Search Servers + Flexible and Adaptable with XML configuration and Schema + Good unicode support: héllo (hello with an accent over the e) + 0.0 + 10 + true + 2006-01-17T00:00:00.000Z + + + diff --git a/spec/test_folder/sub1/manufacturers.xml b/spec/test_folder/sub1/manufacturers.xml new file mode 100644 index 0000000..e3121d5 --- /dev/null +++ b/spec/test_folder/sub1/manufacturers.xml @@ -0,0 +1,75 @@ + + + + + adata + A-Data Technology + 46221 Landing Parkway Fremont, CA 94538 + + + apple + Apple + 1 Infinite Way, Cupertino CA + + + asus + ASUS Computer + 800 Corporate Way Fremont, CA 94539 + + + ati + ATI Technologies + 33 Commerce Valley Drive East Thornhill, ON L3T 7N6 Canada + + + belkin + Belkin + 12045 E. Waterfront Drive Playa Vista, CA 90094 + + + canon + Canon, Inc. + One Canon Plaza Lake Success, NY 11042 + + + corsair + Corsair Microsystems + 46221 Landing Parkway Fremont, CA 94538 + + + dell + Dell, Inc. + One Dell Way Round Rock, Texas 78682 + + + maxtor + Maxtor Corporation + 920 Disc Drive Scotts Valley, CA 95066 + + + samsung + Samsung Electronics Co. Ltd. + 105 Challenger Rd. Ridgefield Park, NJ 07660-0511 + + + viewsonic + ViewSonic Corp + 381 Brea Canyon Road Walnut, CA 91789-0708 + + + diff --git a/spec/test_folder/sub1/mem.xml b/spec/test_folder/sub1/mem.xml new file mode 100644 index 0000000..48af522 --- /dev/null +++ b/spec/test_folder/sub1/mem.xml @@ -0,0 +1,77 @@ + + + + + TWINX2048-3200PRO + CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail + Corsair Microsystems Inc. + + corsair + electronics + memory + CAS latency 2, 2-3-3-6 timing, 2.75v, unbuffered, heat-spreader + 185.00 + 5 + true + + 37.7752,-122.4232 + 2006-02-13T15:26:37Z + + + electronics|6.0 memory|3.0 + + + + VS1GB400C3 + CORSAIR ValueSelect 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - Retail + Corsair Microsystems Inc. 
+ + corsair + electronics + memory + 74.99 + 7 + true + + 37.7752,-100.0232 + 2006-02-13T15:26:37Z + + electronics|4.0 memory|2.0 + + + + VDBDB1A16 + A-DATA V-Series 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - OEM + A-DATA Technology Inc. + + corsair + electronics + memory + CAS latency 3, 2.7v + + 0 + true + + 45.18414,-93.88141 + 2006-02-13T15:26:37Z + + electronics|0.9 memory|0.1 + + + + diff --git a/spec/test_folder/sub1/money.xml b/spec/test_folder/sub1/money.xml new file mode 100644 index 0000000..b1b8036 --- /dev/null +++ b/spec/test_folder/sub1/money.xml @@ -0,0 +1,65 @@ + + + + + + USD + One Dollar + Bank of America + boa + currency + Coins and notes + 1,USD + true + + + + EUR + One Euro + European Union + eu + currency + Coins and notes + 1,EUR + true + + + + GBP + One British Pound + U.K. + uk + currency + Coins and notes + 1,GBP + true + + + + NOK + One Krone + Bank of Norway + nor + currency + Coins and notes + 1,NOK + true + + + + diff --git a/spec/test_folder/sub2/monitor.xml b/spec/test_folder/sub2/monitor.xml new file mode 100644 index 0000000..d0343af --- /dev/null +++ b/spec/test_folder/sub2/monitor.xml @@ -0,0 +1,34 @@ + + + + 3007WFP + Dell Widescreen UltraSharp 3007WFP + Dell, Inc. + + dell + electronics and computer1 + 30" TFT active matrix LCD, 2560 x 1600, .25mm dot pitch, 700:1 contrast + USB cable + 401.6 + 2199.0 + 6 + true + + 43.17614,-90.57341 + + diff --git a/spec/test_folder/sub2/sd500.xml b/spec/test_folder/sub2/sd500.xml new file mode 100644 index 0000000..145c6fd --- /dev/null +++ b/spec/test_folder/sub2/sd500.xml @@ -0,0 +1,38 @@ + + + + 9885A004 + Canon PowerShot SD500 + Canon Inc. + + canon + electronics + camera + 3x zoop, 7.1 megapixel Digital ELPH + movie clips up to 640x480 @30 fps + 2.0" TFT LCD, 118,000 pixels + built in flash, red-eye reduction + 32MB SD card, USB cable, AV cable, battery + 6.4 + 329.95 + 7 + true + 2006-02-13T15:26:37Z + + 45.19614,-93.90341 + diff --git a/spec/test_folder/sub2/sub_sub1/more_books.jsonl b/spec/test_folder/sub2/sub_sub1/more_books.jsonl new file mode 100644 index 0000000..a48ad1e --- /dev/null +++ b/spec/test_folder/sub2/sub_sub1/more_books.jsonl @@ -0,0 +1,3 @@ +{"id":"0060248025","name":"Falling Up","inStock": true,"author": "Shel Silverstein"} +{"id":"0679805273","name":"Oh, The Places You'll Go","inStock": true,"author": "Dr. Seuss"} + diff --git a/spec/test_folder/sub2/sub_sub1/mp500.xml b/spec/test_folder/sub2/sub_sub1/mp500.xml new file mode 100644 index 0000000..a8f51b6 --- /dev/null +++ b/spec/test_folder/sub2/sub_sub1/mp500.xml @@ -0,0 +1,43 @@ + + + + 0579B002 + Canon PIXMA MP500 All-In-One Photo Printer + Canon Inc. 
+ + canon + electronics + multifunction printer + printer + scanner + copier + Multifunction ink-jet color photo printer + Flatbed scanner, optical scan resolution of 1,200 x 2,400 dpi + 2.5" color LCD preview screen + Duplex Copying + Printing speed up to 29ppm black, 19ppm color + Hi-Speed USB + memory card: CompactFlash, Micro Drive, SmartMedia, Memory Stick, Memory Stick Pro, SD Card, and MultiMediaCard + 352.0 + 179.99 + 6 + true + + 45.19214,-93.89941 + + diff --git a/spec/test_folder/sub2/sub_sub1/post.jar b/spec/test_folder/sub2/sub_sub1/post.jar new file mode 100644 index 0000000..56f12b4 Binary files /dev/null and b/spec/test_folder/sub2/sub_sub1/post.jar differ diff --git a/spec/test_folder/test_utf8.sh b/spec/test_folder/test_utf8.sh new file mode 100755 index 0000000..9032e12 --- /dev/null +++ b/spec/test_folder/test_utf8.sh @@ -0,0 +1,93 @@ +#!/bin/sh +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#Test script to tell if the server is accepting UTF-8 +#The python writer currently escapes non-ascii chars, so it's good for testing + +SOLR_URL=http://localhost:8983/solr + +if [ ! -z $1 ]; then + SOLR_URL=$1 +fi + +curl "$SOLR_URL/select?q=hello¶ms=explicit&wt=python" 2> /dev/null | grep 'hello' > /dev/null 2>&1 +if [ $? = 0 ]; then + echo "Solr server is up." +else + echo "ERROR: Could not curl to Solr - is curl installed? Is Solr not running?" + exit 1 +fi + +curl "$SOLR_URL/select?q=h%C3%A9llo&echoParams=explicit&wt=python" 2> /dev/null | grep 'h\\u00e9llo' > /dev/null 2>&1 +if [ $? = 0 ]; then + echo "HTTP GET is accepting UTF-8" +else + echo "ERROR: HTTP GET is not accepting UTF-8" +fi + +curl $SOLR_URL/select --data-binary 'q=h%C3%A9llo&echoParams=explicit&wt=python' -H 'Content-type:application/x-www-form-urlencoded; charset=UTF-8' 2> /dev/null | grep 'h\\u00e9llo' > /dev/null 2>&1 +if [ $? = 0 ]; then + echo "HTTP POST is accepting UTF-8" +else + echo "ERROR: HTTP POST is not accepting UTF-8" +fi + +curl $SOLR_URL/select --data-binary 'q=h%C3%A9llo&echoParams=explicit&wt=python' 2> /dev/null | grep 'h\\u00e9llo' > /dev/null 2>&1 +if [ $? = 0 ]; then + echo "HTTP POST defaults to UTF-8" +else + echo "HTTP POST does not default to UTF-8" +fi + + +#A unicode character outside of the BMP (a circle with an x inside) +CHAR="饜寛" +CODEPOINT='0x10308' +#URL encoded UTF8 of the codepoint +UTF8_Q='%F0%90%8C%88' +#expected return of the python writer (currently uses UTF-16 surrogates) +EXPECTED='\\ud800\\udf08' + +curl "$SOLR_URL/select?q=$UTF8_Q&echoParams=explicit&wt=python" 2> /dev/null | grep $EXPECTED > /dev/null 2>&1 +if [ $? 
= 0 ]; then + echo "HTTP GET is accepting UTF-8 beyond the basic multilingual plane" +else + echo "ERROR: HTTP GET is not accepting UTF-8 beyond the basic multilingual plane" +fi + +curl $SOLR_URL/select --data-binary "q=$UTF8_Q&echoParams=explicit&wt=python" -H 'Content-type:application/x-www-form-urlencoded; charset=UTF-8' 2> /dev/null | grep $EXPECTED > /dev/null 2>&1 +if [ $? = 0 ]; then + echo "HTTP POST is accepting UTF-8 beyond the basic multilingual plane" +else + echo "ERROR: HTTP POST is not accepting UTF-8 beyond the basic multilingual plane" +fi + +curl "$SOLR_URL/select?q=$UTF8_Q&echoParams=explicit&wt=python" --data-binary '' 2> /dev/null | grep $EXPECTED > /dev/null 2>&1 +if [ $? = 0 ]; then + echo "HTTP POST + URL params is accepting UTF-8 beyond the basic multilingual plane" +else + echo "ERROR: HTTP POST + URL params is not accepting UTF-8 beyond the basic multilingual plane" +fi + +#curl "$SOLR_URL/select?q=$UTF8_Q&echoParams=explicit" 2> /dev/null | od -tx1 -w1000 | sed 's/ //g' | grep 'f4808198' > /dev/null 2>&1 +curl "$SOLR_URL/select?q=$UTF8_Q&echoParams=explicit" 2> /dev/null | grep "$CHAR" > /dev/null 2>&1 +if [ $? = 0 ]; then + echo "Response correctly returns UTF-8 beyond the basic multilingual plane" +else + echo "ERROR: Response can't return UTF-8 beyond the basic multilingual plane" +fi + + diff --git a/spec/test_folder/utf8-example.xml b/spec/test_folder/utf8-example.xml new file mode 100644 index 0000000..ee300a6 --- /dev/null +++ b/spec/test_folder/utf8-example.xml @@ -0,0 +1,42 @@ + + + + + + + + UTF8TEST + Test with some UTF-8 encoded characters + Apache Software Foundation + software + search + No accents here + This is an e acute: 茅 + eaiou with circumflexes: 锚芒卯么没 + eaiou with umlauts: 毛盲茂枚眉 + tag with escaped chars: <nicetag/> + escaped ampersand: Bonnie & Clyde + Outside the BMP:饜寛 codepoint=10308, a circle with an x inside. UTF8=f0908c88 UTF16=d800 df08 + 0.0 + true + + + diff --git a/spec/test_folder/vidcard.xml b/spec/test_folder/vidcard.xml new file mode 100644 index 0000000..d867d82 --- /dev/null +++ b/spec/test_folder/vidcard.xml @@ -0,0 +1,62 @@ + + + + + EN7800GTX/2DHTV/256M + ASUS Extreme N7800GTX/2DHTV (256 MB) + + ASUS Computer Inc. + + asus + electronics + graphics card + NVIDIA GeForce 7800 GTX GPU/VPU clocked at 486MHz + 256MB GDDR3 Memory clocked at 1.35GHz + PCI Express x16 + Dual DVI connectors, HDTV out, video input + OpenGL 2.0, DirectX 9.0 + 16.0 + 479.95 + 7 + 40.7143,-74.006 + false + 2006-02-13T15:26:37Z/DAY + + + + 100-435805 + ATI Radeon X1900 XTX 512 MB PCIE Video Card + ATI Technologies + + ati + electronics + graphics card + ATI RADEON X1900 GPU/VPU clocked at 650MHz + 512MB GDDR3 SDRAM clocked at 1.55GHz + PCI Express x16 + dual DVI, HDTV, svideo, composite out + OpenGL 2.0, DirectX 9.0 + 48.0 + 649.99 + 7 + false + 2006-02-13T15:26:37Z/DAY + + 40.7143,-74.006 + + diff --git a/storage.py b/storage.py new file mode 100644 index 0000000..be09824 --- /dev/null +++ b/storage.py @@ -0,0 +1,107 @@ +import sqlite3 +import os + + +class DuplicateDirectoryException(Exception): + pass + + +class Directory: + """ + Data structure to hold directory information + """ + def __init__(self, path: str, enabled: bool, options: list): + self.path = path + self.enabled = enabled + self.options = options + + def __str__(self): + return self.path + " | enabled: " + str(self.enabled) + " | opts: " + str(self.options) + + +class LocalStorage: + """ + Manages storage of application data to disk. 
+ Could be refactored into a abstract class to switch from SQLite3 to something else + """ + + cache_outdated = True + """Static variable that indicates that the database was changed since the last time it was cached in memory""" + + db_path = "../local_storage.db" + + def __init__(self): + self.cached_dirs = {} + pass + + @staticmethod + def init_db(script_path): + """Creates a blank database. Overwrites the old one""" + if os.path.isfile(LocalStorage.db_path): + os.remove(LocalStorage.db_path) + + conn = sqlite3.connect(LocalStorage.db_path) + c = conn.cursor() + with open(script_path, "r") as f: + c.executescript(f.read()) + + conn.commit() + c.close() + conn.close() + + def save_directory(self, directory: Directory): + """ + Save directory to storage + :param directory: Directory to save + :return: None + """ + + LocalStorage.cache_outdated = True + + conn = sqlite3.connect(LocalStorage.db_path) + c = conn.cursor() + c.execute("PRAGMA FOREIGN_KEYS = ON;") + try: + c.execute("INSERT INTO Directory (path, enabled) VALUES (?, ?)", (directory.path, directory.enabled)) + c.execute("SELECT last_insert_rowid()") + + dir_id = c.fetchone()[0] + + for opt in directory.options: + conn.execute("INSERT INTO Option (name, directory_id) VALUES (?, ?)", (opt, dir_id)) + + conn.commit() + except sqlite3.IntegrityError: + raise DuplicateDirectoryException("Duplicate directory path: " + directory.path) + + finally: + conn.close() + + def dirs(self): + + if LocalStorage.cache_outdated: + + self.cached_dirs = {} + + conn = sqlite3.connect(LocalStorage.db_path) + c = conn.cursor() + c.execute("SELECT id, path, enabled FROM Directory") + db_directories = c.fetchall() + c.execute("SELECT name, directory_id FROM Option") + db_options = c.fetchall() + + for db_dir in db_directories: + + options = [] + directory = Directory(db_dir[1], db_dir[2], options) + + for db_opt in db_options: + if db_opt[1] == db_dir[0]: + options.append(db_opt[0]) + + self.cached_dirs[directory.path] = directory + LocalStorage.cache_outdated = False + return self.cached_dirs + + else: + return self.cached_dirs diff --git a/templates/layout.html b/templates/layout.html new file mode 100644 index 0000000..0e6687c --- /dev/null +++ b/templates/layout.html @@ -0,0 +1,10 @@ + + + + + Layout Title + + + + + \ No newline at end of file diff --git a/webserver.py b/webserver.py deleted file mode 100644 index df8f15b..0000000 --- a/webserver.py +++ /dev/null @@ -1,129 +0,0 @@ -from flask import Flask, render_template, send_file, request -import pysolr -import mimetypes -import requests -import json -from PIL import Image -import os - -SOLR_URL = "http://localhost:8983/solr/test/" - -solr = pysolr.Solr(SOLR_URL, timeout=10) - -app = Flask(__name__) - - -class Document: - def __init__(self, doc_id, name, path, size, md5): - self.doc_id = doc_id - self.name = name - self.path = path - self.size = size - self.md5 = md5 - - -class ImageDocument(Document): - def __init__(self, doc_id, name, path, size, md5): - super().__init__(doc_id, name, path, size, md5) - self.type = "image" - - -class AudioClipDocument(Document): - def __init__(self, doc_id, name, path, size, md5): - super().__init__(doc_id, name, path, size, md5) - self.type = "audio" - - -def get_document(id): - - response = requests.get(SOLR_URL + "get?id=" + id) - - return json.loads(response.text)["doc"] - - -def make_thumb(doc): - size = (1024, 1024) - - thumb_path = "thumbnails/" + doc["id"] - - if not os.path.exists(thumb_path): - - file_path = doc["path"][0] + "/" + doc["name"][0] - - if 
doc["width"][0] > size[0]: - - image = Image.open(file_path) - image.thumbnail(size, Image.ANTIALIAS) - - if image.mode == "RGB": - image.save(thumb_path, "JPEG") - elif image.mode == "RGBA": - image.save(thumb_path, "PNG") - else: - image = image.convert("RGB") - image.save(thumb_path, "JPEG") - else: - print("Skipping thumbnail") - os.symlink(file_path, thumb_path) - - return "thumbnails/" + doc["id"] - - -@app.route("/search/") -def search(): - - query = request.args.get("query") - page = int(request.args.get("page")) - per_page = int(request.args.get("per_page")) - - results = solr.search(query, None, rows=per_page, start=per_page * page) - - docs = [] - for r in results: - - if "mime" in r: - mime_type = r["mime"][0] - else: - mime_type = "" - - if mime_type.startswith("image"): - docs.append(ImageDocument(r["id"], r["name"][0], r["path"][0], r["size"], r["md5"])) - - elif mime_type.startswith("audio"): - docs.append(AudioClipDocument(r["id"], r["name"][0], r["path"][0], r["size"], r["md5"])) - - return render_template("search.html", docs=docs) - - -@app.route("/") -def index(): - return render_template("index.html") - - -@app.route("/files//") -def files(id): - - doc = get_document(id) - - if doc is not None: - file_path = doc["path"][0] + "/" + doc["name"][0] - return send_file(file_path, mimetype=mimetypes.guess_type(file_path)[0]) - else: - return "File not found" - - -@app.route("/thumbs//") -def thumbs(doc_id): - - doc = get_document(doc_id) - - if doc is not None: - - thumb_path = make_thumb(doc) - - return send_file("thumbnails/" + doc_id, mimetype=mimetypes.guess_type(thumb_path)[0]) - else: - return "File not found" - - -app.run("0.0.0.0", 8080) \ No newline at end of file