diff --git a/.gitignore b/.gitignore index aa66ea9..7db1aee 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ __pycache__/ captchas/ _stats.json config.py -db.sqlite3 oddb.log praw.ini env/ diff --git a/database.py b/database.py index bdaf0dc..918db7f 100644 --- a/database.py +++ b/database.py @@ -242,7 +242,7 @@ class Database: return page - def join_website_on_scan(self, docs: list): + def join_website_url(self, docs): websites = self.get_all_websites() diff --git a/export.py b/export.py index 64c917b..d2558ca 100644 --- a/export.py +++ b/export.py @@ -1,39 +1,63 @@ -import csv import os +import time + +import lz4.frame import config from database import Database from search.search import ElasticSearchEngine -def export(outfile="out.csv"): - - print("Export started, connecting to databases...") - es = ElasticSearchEngine("od-database") - db = Database(config.DB_CONN_STR) - docs = es.stream_all_docs() - docs_with_website = db.join_website_on_scan(docs) - - print("Connected, writing to csv") - - with open(outfile + ".temp", "w") as out: - - csv_writer = csv.writer(out) - csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"]) - - for doc in docs_with_website: - csv_writer.writerow([doc["_source"]["website_id"], - doc["_source"]["website_url"], - doc["_source"]["path"] + "/" if doc["_source"]["path"] != "" else "", - doc["_source"]["name"], - "." + doc["_source"]["ext"] if doc["_source"]["ext"] != "" else "", - doc["_source"]["size"], - doc["_source"]["mtime"]]) - print("Wrote to csv, compressing with xz") - - os.system("xz -0 " + outfile + ".temp") - os.system("mv " + outfile + ".temp.xz " + outfile + ".xz") - print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes") +def quote(string): + if "\"" in string: + return "\"" + string.replace("\"", "\"\"") + "\"" + elif "," in string: + return "\"" + string + "\"" + else: + return string -export("static/out.csv") +outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime()) +dldir = "static/downloads/" + +print("Deleting existing dumps") +for file in os.listdir(dldir): + if file.endswith("_dump.csv.lz4"): + os.remove(os.path.join(dldir, file)) + +print("Export started, connecting to databases...") + +db = Database(config.DB_CONN_STR) +es = ElasticSearchEngine("od-database") + +docs_with_url = db.join_website_url(es.stream_all_docs()) + +print("Connected, writing to csv") + +with lz4.frame.open(outfile + ".part", mode='wb', + compression_level=9, + block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp: + fp.write((",".join( + ["website_id", "website_url", "path", "name", "ext", "size", "mtime"] + ) + "\n").encode()) + + for doc in docs_with_url: + try: + fp.write( + (",".join( + [ + str(doc["_source"]["website_id"]), + quote(doc["_source"]["website_url"]), + quote(doc["_source"]["path"]), + quote(doc["_source"]["name"]), + quote(doc["_source"]["ext"]), + str(doc["_source"]["size"]), + str(doc["_source"]["mtime"]) + ] + ) + "\n").encode()) + except Exception as e: + print(e) + print(doc) + + +os.rename(outfile + ".part", os.path.join(dldir, outfile)) diff --git a/high_level_diagram.dia b/high_level_diagram.dia new file mode 100644 index 0000000..d3b3822 Binary files /dev/null and b/high_level_diagram.dia differ diff --git a/high_level_diagram.png b/high_level_diagram.png new file mode 100644 index 0000000..c3d2c09 Binary files /dev/null and b/high_level_diagram.png differ diff --git a/requirements.txt b/requirements.txt index c4aa199..96cde60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ numpy matplotlib uwsgi redis -psycopg2-binary \ No newline at end of file +psycopg2-binary +lz4 \ No newline at end of file