mirror of
https://github.com/simon987/od-database.git
synced 2025-04-04 06:52:59 +00:00
Rewrite export.py, add diagram
This commit is contained in:
parent
b9f25630b4
commit
d69ed65a0c
1
.gitignore
vendored
1
.gitignore
vendored
@ -5,7 +5,6 @@ __pycache__/
|
||||
captchas/
|
||||
_stats.json
|
||||
config.py
|
||||
db.sqlite3
|
||||
oddb.log
|
||||
praw.ini
|
||||
env/
|
||||
|
@ -242,7 +242,7 @@ class Database:
|
||||
|
||||
return page
|
||||
|
||||
def join_website_on_scan(self, docs: list):
|
||||
def join_website_url(self, docs):
|
||||
|
||||
websites = self.get_all_websites()
|
||||
|
||||
|
84
export.py
84
export.py
@ -1,39 +1,63 @@
|
||||
import csv
|
||||
import os
|
||||
import time
|
||||
|
||||
import lz4.frame
|
||||
|
||||
import config
|
||||
from database import Database
|
||||
from search.search import ElasticSearchEngine
|
||||
|
||||
|
||||
def export(outfile="out.csv"):
    """Dump every indexed document to an xz-compressed CSV file.

    Streams all documents from the "od-database" Elasticsearch index, passes
    them through ``Database.join_website_on_scan`` and writes one CSV row per
    document to ``outfile + ".temp"``. The temporary file is then compressed
    with the external ``xz`` tool and moved to ``outfile + ".xz"``.

    :param outfile: Base path of the dump; the final file is
        ``outfile + ".xz"``.
    :raises subprocess.CalledProcessError: if ``xz`` exits with a non-zero
        status (previously the failure was ignored and a stale file size
        could be reported).
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import list.
    import subprocess

    print("Export started, connecting to databases...")
    es = ElasticSearchEngine("od-database")
    db = Database(config.DB_CONN_STR)
    docs = es.stream_all_docs()
    # NOTE(review): the join is presumably what adds "website_url" to each
    # document's "_source" -- confirm against Database.
    docs_with_website = db.join_website_on_scan(docs)

    print("Connected, writing to csv")

    temp_path = outfile + ".temp"
    # newline="" is required by the csv module so it controls line endings
    # itself; an explicit encoding keeps the dump independent of the locale.
    with open(temp_path, "w", newline="", encoding="utf-8") as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"])

        for doc in docs_with_website:
            source = doc["_source"]
            csv_writer.writerow([
                source["website_id"],
                source["website_url"],
                # Non-empty paths get a trailing slash, non-empty extensions
                # a leading dot.
                source["path"] + "/" if source["path"] != "" else "",
                source["name"],
                "." + source["ext"] if source["ext"] != "" else "",
                source["size"],
                source["mtime"],
            ])
    print("Wrote to csv, compressing with xz")

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through untouched, and "--" stops a
    # leading "-" in the path from being parsed as an option.
    subprocess.run(["xz", "-0", "--", temp_path], check=True)
    # xz replaces "<temp>" with "<temp>.xz" in the same directory, so an
    # in-place rename is sufficient.
    os.replace(temp_path + ".xz", outfile + ".xz")
    print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")
|
||||
def quote(string):
    """Return *string* formatted as a single CSV field.

    The value is wrapped in double quotes when it contains a comma, a double
    quote, or a line break (CR or LF); embedded double quotes are doubled.
    Values without any of those characters are returned unchanged.

    :param string: Raw field value (``str``).
    :return: The field, quoted and escaped if necessary.
    """
    # Line breaks must be quoted as well, otherwise a value containing one
    # would split a record across two rows of the dump.
    if not any(special in string for special in ('"', ",", "\n", "\r")):
        return string
    return '"' + string.replace('"', '""') + '"'
|
||||
|
||||
|
||||
export("static/out.csv")
|
||||
# Name of this run's dump, stamped with the current UTC time.
outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
dldir = "static/downloads/"

# Remove every previous dump from the download directory before exporting.
print("Deleting existing dumps")
for file in os.listdir(dldir):
    if not file.endswith("_dump.csv.lz4"):
        continue
    os.remove(os.path.join(dldir, file))

print("Export started, connecting to databases...")

db = Database(config.DB_CONN_STR)
es = ElasticSearchEngine("od-database")

docs_with_url = db.join_website_url(es.stream_all_docs())

print("Connected, writing to csv")

# The dump is written under a ".part" name and only renamed into the download
# directory once every document has been processed.
with lz4.frame.open(
    outfile + ".part",
    mode="wb",
    compression_level=9,
    block_size=lz4.frame.BLOCKSIZE_MAX4MB,
) as fp:
    header = ",".join(
        ["website_id", "website_url", "path", "name", "ext", "size", "mtime"]
    ) + "\n"
    fp.write(header.encode())

    for doc in docs_with_url:
        try:
            source = doc["_source"]
            fields = [
                str(source["website_id"]),
                quote(source["website_url"]),
                quote(source["path"]),
                quote(source["name"]),
                quote(source["ext"]),
                str(source["size"]),
                str(source["mtime"]),
            ]
            fp.write((",".join(fields) + "\n").encode())
        except Exception as e:
            # Best effort: report the offending document and keep going so a
            # single bad record does not abort the whole export.
            print(e)
            print(doc)

os.rename(outfile + ".part", os.path.join(dldir, outfile))
|
||||
|
BIN
high_level_diagram.dia
Normal file
BIN
high_level_diagram.dia
Normal file
Binary file not shown.
BIN
high_level_diagram.png
Normal file
BIN
high_level_diagram.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 332 KiB |
@ -21,4 +21,5 @@ numpy
|
||||
matplotlib
|
||||
uwsgi
|
||||
redis
|
||||
psycopg2-binary
|
||||
psycopg2-binary
|
||||
lz4
|
Loading…
x
Reference in New Issue
Block a user