Elasticsearch export to csv

This commit is contained in:
Simon 2018-06-19 09:48:44 -04:00
parent 81d52a4551
commit e5e38a6faf
5 changed files with 57 additions and 11 deletions

View File

@ -218,6 +218,18 @@ class Database:
return page return page
def join_website_on_scan(self, docs: list):
    """Yield each document with a "website_url" added to its "_source".

    The websites container is fetched once via ``self.get_all_websites()``
    (presumably a mapping of website id to URL; verify against its
    definition). Each document is modified in place and then yielded, so
    ``docs`` is consumed lazily, one item at a time.

    :param docs: iterable of documents; each must have a "_source" dict
        containing a "website_id" key.
    :return: generator yielding the same document objects, with
        "_source"["website_url"] set to the stored value for the id, or to
        "[DELETED]" when the id is not present in the websites container.
    """
    known_websites = self.get_all_websites()
    for hit in docs:
        source = hit["_source"]
        site_id = source["website_id"]
        source["website_url"] = (
            known_websites[site_id] if site_id in known_websites else "[DELETED]"
        )
        yield hit

37
export.py Normal file
View File

@ -0,0 +1,37 @@
import csv
import os
import subprocess

from search.search import ElasticSearchEngine
from database import Database
def export(outfile="out.csv"):
    """Dump every indexed document to an xz-compressed CSV file.

    Streams all documents from the "od-database" Elasticsearch index, lets
    the Database opened on "db.sqlite" add a "website_url" to each one,
    writes the rows to ``outfile + ".temp"``, compresses that file with the
    external ``xz`` program and finally moves the result to
    ``outfile + ".xz"``.

    :param outfile: base path of the export; the finished file is
        ``outfile + ".xz"``.
    :raises subprocess.CalledProcessError: if ``xz`` exits with a non-zero
        status (the previously published export is then left untouched).
    :raises OSError: if a file cannot be written or moved, or ``xz`` is not
        installed.
    """
    print("Export started, connecting to databases...")
    es = ElasticSearchEngine("od-database")
    db = Database("db.sqlite")
    docs = es.stream_all_docs()
    docs_with_website = db.join_website_on_scan(docs)
    print("Connected")

    temp_path = outfile + ".temp"
    compressed_temp_path = temp_path + ".xz"
    final_path = outfile + ".xz"

    # newline="" is required by the csv module so it controls line endings
    # itself; an explicit encoding keeps non-ASCII file names from depending
    # on the locale of the process.
    with open(temp_path, "w", newline="", encoding="utf-8") as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"])
        for doc in docs_with_website:
            source = doc["_source"]
            path = source["path"]
            ext = source["ext"]
            csv_writer.writerow([
                source["website_id"],
                source["website_url"],
                # Non-empty paths get a trailing slash, non-empty extensions
                # a leading dot; empty values stay empty.
                path + "/" if path != "" else "",
                source["name"],
                "." + ext if ext != "" else "",
                source["size"],
                source["mtime"],
            ])

    print("Wrote to csv, compressing with xz")
    # Argument list instead of a shell string: the path is never parsed by a
    # shell, and check=True turns a failed compression into an exception
    # instead of silently publishing a stale archive. --force overwrites a
    # ".temp.xz" left behind by an earlier interrupted run.
    subprocess.run(["xz", "--force", temp_path], check=True)
    os.replace(compressed_temp_path, final_path)
    print("Compressed to " + str(os.path.getsize(final_path)) + " bytes")
# Module-level call: runs the export with "static/export.csv" as the base
# output path whenever this file is executed or imported.
export("static/export.csv")

View File

@ -1,6 +0,0 @@
#!/usr/bin/env bash
# Run export.sql against db.sqlite3, write the result as CSV, compress it
# with xz and move the archive to static/out.csv.xz.

# Stop at the first failing command so that a failed query or compression
# never replaces the previously published archive.
set -euo pipefail

tmp_csv="out.csv.temp"
target="static/out.csv.xz"

sqlite3 -header -csv db.sqlite3 < export.sql > "$tmp_csv"

# -header emits one heading line that is not a file row; do not count it.
line_count=$(wc -l < "$tmp_csv")
file_count=$(( line_count > 0 ? line_count - 1 : 0 ))
echo "Exported ${file_count} files"

xz "$tmp_csv"
mv "${tmp_csv}.xz" "$target"
echo "Compressed to $(stat --printf="%s" "$target") bytes"

View File

@ -1,5 +0,0 @@
-- One row per File: the website URL, the path, the file name, its size
-- and the mime value of its FileType.
SELECT
    Website.url,
    WebsitePath.path,
    File.name,
    File.size,
    MT.mime
FROM File
    INNER JOIN WebsitePath ON WebsitePath.id = File.path_id
    INNER JOIN Website ON Website.id = WebsitePath.website_id
    INNER JOIN FileType AS MT ON MT.id = File.mime_id;

View File

@ -283,3 +283,11 @@ class ElasticSearchEngine(SearchEngine):
stats["base_url"] = "entire database" stats["base_url"] = "entire database"
return stats return stats
def stream_all_docs(self):
    """Scan the whole index and return the result of ``helpers.scan``.

    Issues a ``match_all`` query against ``self.index_name`` through the
    ``self.es`` client, with a scroll value of "5m".

    :return: whatever ``helpers.scan`` returns (presumably a lazy iterable
        of hits; confirm against the Elasticsearch helpers documentation).
    """
    match_everything = {
        "query": {
            "match_all": {}
        }
    }
    return helpers.scan(
        client=self.es,
        index=self.index_name,
        query=match_everything,
        scroll="5m",
    )