Rewrite export.py, add diagram

This commit is contained in:
simon987 2019-03-27 22:09:08 -04:00
parent b9f25630b4
commit d69ed65a0c
6 changed files with 57 additions and 33 deletions

1
.gitignore vendored
View File

@ -5,7 +5,6 @@ __pycache__/
captchas/
_stats.json
config.py
db.sqlite3
oddb.log
praw.ini
env/

View File

@ -242,7 +242,7 @@ class Database:
return page
def join_website_on_scan(self, docs: list):
def join_website_url(self, docs):
websites = self.get_all_websites()

View File

@ -1,39 +1,63 @@
import csv
import os
import time
import lz4.frame
import config
from database import Database
from search.search import ElasticSearchEngine
def export(outfile="out.csv"):
    """Dump every indexed document to a CSV file and compress it with xz.

    Rows are first written to ``outfile + ".temp"``; that file is then
    compressed with ``xz -0`` and the result is moved to ``outfile + ".xz"``.

    :param outfile: base path of the export; the final artefact is
        ``outfile + ".xz"``.
    :raises subprocess.CalledProcessError: if the ``xz`` command exits with a
        non-zero status (previously such failures were silently ignored).
    """
    # Imported locally so the block does not rely on a file-level import.
    import subprocess

    print("Export started, connecting to databases...")
    es = ElasticSearchEngine("od-database")
    db = Database(config.DB_CONN_STR)
    docs = es.stream_all_docs()
    docs_with_website = db.join_website_on_scan(docs)
    print("Connected, writing to csv")

    temp_path = outfile + ".temp"
    final_path = outfile + ".xz"

    # newline="" hands line-ending control to the csv module, as its
    # documentation requires for files passed to csv.writer.
    with open(temp_path, "w", newline="") as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"])
        for doc in docs_with_website:
            source = doc["_source"]
            path = source["path"]
            ext = source["ext"]
            csv_writer.writerow([source["website_id"],
                                 source["website_url"],
                                 # Non-empty paths get a trailing slash.
                                 path + "/" if path != "" else "",
                                 source["name"],
                                 # Non-empty extensions get a leading dot.
                                 "." + ext if ext != "" else "",
                                 source["size"],
                                 source["mtime"]])

    print("Wrote to csv, compressing with xz")
    # Pass an argument list instead of a shell string so that spaces or shell
    # metacharacters in `outfile` cannot break or alter the command; "--"
    # stops a path starting with "-" from being parsed as an option.
    subprocess.check_call(["xz", "-0", "--", temp_path])
    # xz writes "<input>.xz"; move it to its final name, replacing any
    # previous export just as `mv` did.
    os.replace(temp_path + ".xz", final_path)
    print("Compressed to " + str(os.path.getsize(final_path)) + " bytes")
def quote(string):
    """Return *string* escaped for use as a single CSV field.

    A field is wrapped in double quotes when it contains a double quote, a
    comma, or a line break (``\\n`` / ``\\r``); embedded double quotes are
    doubled. Any other string is returned unchanged.

    :param string: the raw field value (must be a ``str``).
    :return: the field, quoted and escaped if necessary.
    """
    # Line breaks must be quoted too, otherwise a value containing one would
    # split a single record across several CSV rows.
    needs_quoting = any(ch in string for ch in ('"', ",", "\n", "\r"))
    if not needs_quoting:
        return string
    return '"' + string.replace('"', '""') + '"'
# Run the export at module level; export() appends ".xz" to the given path
# for the compressed result, i.e. "static/out.csv.xz".
export("static/out.csv")
# The dump is named after the current UTC time and published in `dldir`.
outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
dldir = "static/downloads/"

# Remove every previously published dump before producing the new one.
print("Deleting existing dumps")
for entry in os.listdir(dldir):
    if not entry.endswith("_dump.csv.lz4"):
        continue
    os.remove(os.path.join(dldir, entry))

print("Export started, connecting to databases...")

db = Database(config.DB_CONN_STR)
es = ElasticSearchEngine("od-database")

all_docs = es.stream_all_docs()
docs_with_url = db.join_website_url(all_docs)

print("Connected, writing to csv")

columns = ("website_id", "website_url", "path", "name", "ext", "size", "mtime")
part_path = outfile + ".part"

# Write to a ".part" file first; it is only renamed into the download
# directory once the whole dump has been written and the file is closed.
with lz4.frame.open(part_path, mode="wb",
                    compression_level=9,
                    block_size=lz4.frame.BLOCKSIZE_MAX4MB) as out:
    out.write((",".join(columns) + "\n").encode())

    for doc in docs_with_url:
        try:
            src = doc["_source"]
            fields = [
                str(src["website_id"]),
                quote(src["website_url"]),
                quote(src["path"]),
                quote(src["name"]),
                quote(src["ext"]),
                str(src["size"]),
                str(src["mtime"]),
            ]
            out.write((",".join(fields) + "\n").encode())
        except Exception as err:
            # Best effort: report the failing document and keep exporting.
            print(err)
            print(doc)

os.rename(part_path, os.path.join(dldir, outfile))

BIN
high_level_diagram.dia Normal file

Binary file not shown.

BIN
high_level_diagram.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 332 KiB

View File

@ -21,4 +21,5 @@ numpy
matplotlib
uwsgi
redis
psycopg2-binary
psycopg2-binary
lz4