Mirror of https://github.com/simon987/od-database.git, synced 2025-04-20 02:46:45 +00:00.
Commit: Elasticsearch export to CSV.
This commit is contained in:
parent
81d52a4551
commit
e5e38a6faf
12
database.py
12
database.py
@ -218,6 +218,18 @@ class Database:
|
||||
|
||||
return page
|
||||
|
||||
def join_website_on_scan(self, docs: list):
    """Annotate Elasticsearch documents with their website's URL.

    Streams through *docs* and sets doc["_source"]["website_url"] by
    looking up doc["_source"]["website_id"] in the website table;
    documents whose website no longer exists get "[DELETED]".

    :param docs: iterable of Elasticsearch hit dicts (each carries "_source")
    :return: generator yielding the same dicts, mutated in place
    """
    # Fetch the id -> url mapping once, outside the loop.
    websites = self.get_all_websites()

    for doc in docs:
        # Single dict lookup with a default instead of a membership
        # test followed by two more indexed lookups.
        doc["_source"]["website_url"] = websites.get(
            doc["_source"]["website_id"], "[DELETED]")

        yield doc
|
||||
|
||||
|
||||
|
||||
|
||||
|
37
export.py
Normal file
37
export.py
Normal file
@ -0,0 +1,37 @@
|
||||
from search.search import ElasticSearchEngine
|
||||
from database import Database
|
||||
import csv
|
||||
import os
|
||||
|
||||
|
||||
def export(outfile="out.csv"):
    """Dump every indexed file from Elasticsearch to an xz-compressed CSV.

    Joins each document with its website URL from the sqlite database,
    writes <outfile>.temp, compresses it to <outfile>.xz and prints
    progress along the way.

    :param outfile: base path of the CSV; ".temp" / ".xz" suffixes are added
    """
    # Local import: only needed by this entry point.
    import subprocess

    print("Export started, connecting to databases...")
    es = ElasticSearchEngine("od-database")
    db = Database("db.sqlite")
    docs = es.stream_all_docs()
    docs_with_website = db.join_website_on_scan(docs)

    print("Connected")

    # newline="" is required by the csv module so it controls line endings
    # itself (avoids blank lines on some platforms).
    with open(outfile + ".temp", "w", newline="") as out:

        csv_writer = csv.writer(out)
        csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"])

        for doc in docs_with_website:
            # Hoist the repeated "_source" lookup out of the row expression.
            src = doc["_source"]
            csv_writer.writerow([src["website_id"],
                                 src["website_url"],
                                 src["path"] + "/" if src["path"] != "" else "",
                                 src["name"],
                                 "." + src["ext"] if src["ext"] != "" else "",
                                 src["size"],
                                 src["mtime"]])
    print("Wrote to csv, compressing with xz")

    # Argument lists instead of os.system() shell strings: no shell
    # injection if outfile ever contains metacharacters, and a non-zero
    # exit status from xz raises instead of being silently ignored.
    subprocess.run(["xz", outfile + ".temp"], check=True)
    # xz produces <outfile>.temp.xz; rename it into its final place.
    os.replace(outfile + ".temp.xz", outfile + ".xz")
    print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")


export("static/export.csv")
|
@ -1,6 +0,0 @@
|
||||
#!/usr/bin/env bash
# Export the full file listing from the sqlite database to CSV,
# then compress it with xz into the static/ directory.
sqlite3 -header -csv db.sqlite3 < export.sql > out.csv.temp
echo "Exported $(wc -l < out.csv.temp) files"
# xz replaces out.csv.temp with out.csv.temp.xz in place
xz out.csv.temp
mv out.csv.temp.xz static/out.csv.xz
echo "Compressed to $(stat --printf="%s" static/out.csv.xz) bytes"
|
@ -1,5 +0,0 @@
|
||||
-- Full file listing: one row per file with its website URL, path,
-- name, size and mime type, joined from the lookup tables.
SELECT Website.url, WebsitePath.path, File.name, File.size, MT.mime
FROM File
INNER JOIN WebsitePath on File.path_id = WebsitePath.id
INNER JOIN Website on WebsitePath.website_id = Website.id
INNER JOIN FileType MT on File.mime_id = MT.id;
|
@ -283,3 +283,11 @@ class ElasticSearchEngine(SearchEngine):
|
||||
stats["base_url"] = "entire database"
|
||||
|
||||
return stats
|
||||
|
||||
def stream_all_docs(self):
    """Yield every document in the index via the Elasticsearch scroll API."""
    # Match-all query: stream the whole index, keeping each scroll
    # context alive for 5 minutes.
    match_all = {"query": {"match_all": {}}}
    return helpers.scan(
        client=self.es,
        index=self.index_name,
        query=match_all,
        scroll="5m",
    )
|
||||
|
Loading…
x
Reference in New Issue
Block a user