Rewrite export.py, add diagram

simon987 2019-03-27 22:09:08 -04:00
parent b9f25630b4
commit d69ed65a0c
6 changed files with 57 additions and 33 deletions

.gitignore (vendored)

@@ -5,7 +5,6 @@ __pycache__/
 captchas/
 _stats.json
 config.py
-db.sqlite3
 oddb.log
 praw.ini
 env/

database.py

@@ -242,7 +242,7 @@ class Database:
         return page

-    def join_website_on_scan(self, docs: list):
+    def join_website_url(self, docs):
         websites = self.get_all_websites()

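(Illustration only, not part of the commit: a minimal sketch of what the renamed join_website_url() helper presumably does, assuming get_all_websites() returns a website_id -> URL mapping. The point is that each streamed Elasticsearch document gets its website_url attached before export.py writes it out; the actual implementation in database.py may differ.)

    def join_website_url(self, docs):
        # Hypothetical sketch: look up each document's website URL by its website_id.
        websites = self.get_all_websites()
        for doc in docs:
            doc["_source"]["website_url"] = websites.get(doc["_source"]["website_id"], "")
            yield doc
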
export.py

@@ -1,39 +1,63 @@
-import csv
 import os
+import time
+
+import lz4.frame
 
 import config
 from database import Database
 from search.search import ElasticSearchEngine
 
 
-def export(outfile="out.csv"):
-
-    print("Export started, connecting to databases...")
-    es = ElasticSearchEngine("od-database")
-    db = Database(config.DB_CONN_STR)
-    docs = es.stream_all_docs()
-    docs_with_website = db.join_website_on_scan(docs)
-    print("Connected, writing to csv")
-
-    with open(outfile + ".temp", "w") as out:
-
-        csv_writer = csv.writer(out)
-        csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"])
-
-        for doc in docs_with_website:
-            csv_writer.writerow([doc["_source"]["website_id"],
-                                 doc["_source"]["website_url"],
-                                 doc["_source"]["path"] + "/" if doc["_source"]["path"] != "" else "",
-                                 doc["_source"]["name"],
-                                 "." + doc["_source"]["ext"] if doc["_source"]["ext"] != "" else "",
-                                 doc["_source"]["size"],
-                                 doc["_source"]["mtime"]])
-
-    print("Wrote to csv, compressing with xz")
-    os.system("xz -0 " + outfile + ".temp")
-    os.system("mv " + outfile + ".temp.xz " + outfile + ".xz")
-    print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")
-
-
-export("static/out.csv")
+def quote(string):
+    if "\"" in string:
+        return "\"" + string.replace("\"", "\"\"") + "\""
+    elif "," in string:
+        return "\"" + string + "\""
+    else:
+        return string
+
+
+outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
+dldir = "static/downloads/"
+
+print("Deleting existing dumps")
+for file in os.listdir(dldir):
+    if file.endswith("_dump.csv.lz4"):
+        os.remove(os.path.join(dldir, file))
+
+print("Export started, connecting to databases...")
+db = Database(config.DB_CONN_STR)
+es = ElasticSearchEngine("od-database")
+docs_with_url = db.join_website_url(es.stream_all_docs())
+
+print("Connected, writing to csv")
+with lz4.frame.open(outfile + ".part", mode='wb',
+                    compression_level=9,
+                    block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp:
+
+    fp.write((",".join(
+        ["website_id", "website_url", "path", "name", "ext", "size", "mtime"]
+    ) + "\n").encode())
+
+    for doc in docs_with_url:
+        try:
+            fp.write(
+                (",".join(
+                    [
+                        str(doc["_source"]["website_id"]),
+                        quote(doc["_source"]["website_url"]),
+                        quote(doc["_source"]["path"]),
+                        quote(doc["_source"]["name"]),
+                        quote(doc["_source"]["ext"]),
+                        str(doc["_source"]["size"]),
+                        str(doc["_source"]["mtime"])
+                    ]
+                ) + "\n").encode())
+        except Exception as e:
+            print(e)
+            print(doc)
+
+os.rename(outfile + ".part", os.path.join(dldir, outfile))
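
(Illustration only, not part of the commit: a hedged example of reading back a dump produced by the new export.py. It relies only on what the diff above shows: a comma-separated header row, fields escaped by quote(), and LZ4 frame compression. The file name below is hypothetical; real dumps follow the timestamped *_dump.csv.lz4 pattern.)

    import csv
    import io

    import lz4.frame

    # Hypothetical path; actual dumps are written to static/downloads/ with a timestamp.
    dump_path = "static/downloads/2019-03-27_22:00:00_dump.csv.lz4"

    with lz4.frame.open(dump_path, mode="rb") as fp:
        # Decompress the whole dump into memory for simplicity; a large dump
        # could instead be streamed, e.g. by wrapping fp in io.TextIOWrapper.
        text = fp.read().decode("utf-8")

    for row in csv.DictReader(io.StringIO(text)):
        print(row["website_url"], row["path"], row["name"], row["ext"], row["size"])

Because quote() wraps fields containing commas or double quotes (and doubles any embedded quotes) the same way standard CSV escaping does, the csv module can parse the rows back without a custom reader.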

BIN  high_level_diagram.dia (new binary file)

BIN  high_level_diagram.png (new binary file, 332 KiB)

requirements.txt

@@ -21,4 +21,5 @@ numpy
 matplotlib
 uwsgi
 redis
 psycopg2-binary
+lz4