Mirror of https://github.com/simon987/od-database.git (synced 2025-04-20 02:46:45 +00:00)
Elasticsearch export to csv

commit e5e38a6faf
parent 81d52a4551
database.py (+12)

@@ -218,6 +218,18 @@ class Database:
         return page

+    def join_website_on_scan(self, docs: list):
+
+        websites = self.get_all_websites()
+
+        for doc in docs:
+            if doc["_source"]["website_id"] in websites:
+                doc["_source"]["website_url"] = websites[doc["_source"]["website_id"]]
+            else:
+                doc["_source"]["website_url"] = "[DELETED]"
+
+            yield doc
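Aside: join_website_on_scan is a generator that annotates each Elasticsearch hit with its website URL, falling back to "[DELETED]" when the website row no longer exists in SQLite. Below is a minimal standalone sketch of the same idea; the {website_id: url} dict shape is an assumption about what get_all_websites() returns, since that helper is not shown in this diff.

# Minimal sketch of the join logic above, outside the Database class.
# Assumes websites is a dict {website_id: url}; get_all_websites() is
# not shown in this diff, so that shape is an assumption.
def join_website_on_scan(docs, websites):
    for doc in docs:
        website_id = doc["_source"]["website_id"]
        # Documents whose website was removed from SQLite are kept,
        # but flagged instead of joined.
        doc["_source"]["website_url"] = websites.get(website_id, "[DELETED]")
        yield doc


if __name__ == "__main__":
    hits = [{"_source": {"website_id": 1}}, {"_source": {"website_id": 2}}]
    for row in join_website_on_scan(hits, {1: "http://example.com/"}):
        print(row["_source"]["website_id"], row["_source"]["website_url"])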
export.py (new file, +37)

@@ -0,0 +1,37 @@
+from search.search import ElasticSearchEngine
+from database import Database
+import csv
+import os
+
+
+def export(outfile="out.csv"):
+
+    print("Export started, connecting to databases...")
+    es = ElasticSearchEngine("od-database")
+    db = Database("db.sqlite")
+    docs = es.stream_all_docs()
+    docs_with_website = db.join_website_on_scan(docs)
+
+    print("Connected")
+
+    with open(outfile + ".temp", "w") as out:
+
+        csv_writer = csv.writer(out)
+        csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"])
+
+        for doc in docs_with_website:
+            csv_writer.writerow([doc["_source"]["website_id"],
+                                 doc["_source"]["website_url"],
+                                 doc["_source"]["path"] + "/" if doc["_source"]["path"] != "" else "",
+                                 doc["_source"]["name"],
+                                 "." + doc["_source"]["ext"] if doc["_source"]["ext"] != "" else "",
+                                 doc["_source"]["size"],
+                                 doc["_source"]["mtime"]])
+    print("Wrote to csv, compressing with xz")
+
+    os.system("xz " + outfile + ".temp")
+    os.system("mv " + outfile + ".temp.xz " + outfile + ".xz")
+    print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")
+
+
+export("static/export.csv")
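Usage aside: export() is called with "static/export.csv" at the bottom of the file and the temporary CSV is compressed with xz, so the published artifact lands at static/export.csv.xz. As an illustration only (the path is assumed from that call), the archive can be streamed back with Python's built-in lzma module:

import csv
import lzma

# Stream the compressed export without unpacking it to disk.
# "static/export.csv.xz" assumes the export("static/export.csv") call above.
with lzma.open("static/export.csv.xz", "rt", newline="") as f:
    reader = csv.reader(f)
    header = next(reader)  # website_id, website_url, path, name, ext, size, mtime
    for i, row in enumerate(reader):
        print(dict(zip(header, row)))
        if i >= 4:  # peek at the first few rows only
            break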
Deleted file (name not shown in this view): the old sqlite-based export shell script (-6)

@@ -1,6 +0,0 @@
-#!/usr/bin/env bash
-sqlite3 -header -csv db.sqlite3 < export.sql > out.csv.temp
-echo "Exported $(wc -l < out.csv.temp) files"
-xz out.csv.temp
-mv out.csv.temp.xz static/out.csv.xz
-echo "Compressed to $(stat --printf="%s" static/out.csv.xz) bytes"
Deleted file (name not shown in this view; likely the export.sql read by the script above) (-5)

@@ -1,5 +0,0 @@
-SELECT Website.url, WebsitePath.path, File.name, File.size, MT.mime
-FROM File
-INNER JOIN WebsitePath on File.path_id = WebsitePath.id
-INNER JOIN Website on WebsitePath.website_id = Website.id
-INNER JOIN FileType MT on File.mime_id = MT.id;
Changed file (name not shown in this view; the ElasticSearchEngine class imported above from search.search) (+8)

@@ -283,3 +283,11 @@ class ElasticSearchEngine(SearchEngine):
         stats["base_url"] = "entire database"

         return stats
+
+    def stream_all_docs(self):
+
+        return helpers.scan(query={
+            "query": {
+                "match_all": {}
+            }
+        }, scroll="5m", client=self.es, index=self.index_name)
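Aside: stream_all_docs delegates to elasticsearch.helpers.scan, which pages through the index via the scroll API and yields raw hits lazily, so the export never has to hold the whole index in memory; scroll="5m" is how long each scroll context stays alive between batches. A minimal standalone sketch of the same call, with the host and index name assumed for illustration:

from elasticsearch import Elasticsearch, helpers

# Assumed local node and index name, mirroring ElasticSearchEngine("od-database").
es = Elasticsearch(["http://localhost:9200"])

count = 0
for hit in helpers.scan(client=es,
                        index="od-database",
                        query={"query": {"match_all": {}}},
                        scroll="5m"):
    count += 1  # hit["_source"] holds the indexed file document

print(count, "documents streamed")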