diff --git a/database.py b/database.py
index 0981f3e..2d2e3a6 100644
--- a/database.py
+++ b/database.py
@@ -218,6 +218,27 @@ class Database:
 
         return page
 
+    def join_website_on_scan(self, docs: list):
+        """Lazily add a "website_url" field to each scanned document.
+
+        Each document's ``_source["website_id"]`` is looked up in the result
+        of ``get_all_websites()``; unknown ids get the URL "[DELETED]".
+
+        :param docs: Iterable of documents that have a "_source" dict
+        :return: Generator yielding the same documents, updated in place
+        """
+
+        websites = self.get_all_websites()
+
+        for doc in docs:
+            source = doc["_source"]
+            if source["website_id"] in websites:
+                source["website_url"] = websites[source["website_id"]]
+            else:
+                source["website_url"] = "[DELETED]"
+
+            yield doc
+
 
 
 
diff --git a/export.py b/export.py
new file mode 100644
index 0000000..1479034
--- /dev/null
+++ b/export.py
@@ -0,0 +1,55 @@
+from search.search import ElasticSearchEngine
+from database import Database
+import csv
+import os
+import subprocess
+
+
+def export(outfile="out.csv"):
+    """Dump every indexed document to an xz-compressed CSV file.
+
+    Streams all documents from the "od-database" search index, adds each
+    document's website URL from the "db.sqlite" database, writes the rows to
+    ``outfile + ".temp"`` and compresses that file to ``outfile + ".xz"``.
+
+    :param outfile: Path of the CSV file, without the ".xz" suffix
+    :raises subprocess.CalledProcessError: If xz exits with an error
+    """
+
+    print("Export started, connecting to databases...")
+    es = ElasticSearchEngine("od-database")
+    db = Database("db.sqlite")
+    docs = es.stream_all_docs()
+    docs_with_website = db.join_website_on_scan(docs)
+
+    print("Connected")
+
+    temp_file = outfile + ".temp"
+
+    # The csv module requires newline="" so that it controls line endings
+    # itself; an explicit encoding keeps the output independent of the locale.
+    with open(temp_file, "w", newline="", encoding="utf-8") as out:
+
+        csv_writer = csv.writer(out)
+        csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"])
+
+        for doc in docs_with_website:
+            source = doc["_source"]
+            csv_writer.writerow([source["website_id"],
+                                 source["website_url"],
+                                 source["path"] + "/" if source["path"] != "" else "",
+                                 source["name"],
+                                 "." + source["ext"] if source["ext"] != "" else "",
+                                 source["size"],
+                                 source["mtime"]])
+    print("Wrote to csv, compressing with xz")
+
+    # Argument lists avoid shell interpretation of the file name, and
+    # check=True stops the export if the compression fails.
+    subprocess.run(["xz", temp_file], check=True)
+    os.replace(temp_file + ".xz", outfile + ".xz")
+    print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")
+
+
+if __name__ == "__main__":
+    export("static/export.csv")
diff --git a/export.sh b/export.sh
deleted file mode 100755
index 81ff9d1..0000000
--- a/export.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env bash
-sqlite3 -header -csv db.sqlite3 < export.sql > out.csv.temp
-echo "Exported $(wc -l < out.csv.temp) files"
-xz out.csv.temp
-mv out.csv.temp.xz static/out.csv.xz
-echo "Compressed to $(stat --printf="%s" static/out.csv.xz) bytes"
\ No newline at end of file
diff --git a/export.sql b/export.sql
deleted file mode 100644
index c2fbeb8..0000000
--- a/export.sql
+++ /dev/null
@@ -1,5 +0,0 @@
-SELECT Website.url, WebsitePath.path, File.name, File.size, MT.mime
-  FROM File
-    INNER JOIN WebsitePath on File.path_id = WebsitePath.id
-    INNER JOIN Website on WebsitePath.website_id = Website.id
-    INNER JOIN FileType MT on File.mime_id = MT.id;
diff --git a/search/search.py b/search/search.py
index 671a89c..fe0ff2d 100644
--- a/search/search.py
+++ b/search/search.py
@@ -283,3 +283,16 @@ class ElasticSearchEngine(SearchEngine):
         stats["base_url"] = "entire database"
 
         return stats
+
+    def stream_all_docs(self):
+        """Return the result of scanning this engine's whole index.
+
+        Calls ``helpers.scan`` with a ``match_all`` query on
+        ``self.index_name`` and a scroll setting of "5m".
+        """
+
+        return helpers.scan(query={
+            "query": {
+                "match_all": {}
+            }
+        }, scroll="5m", client=self.es, index=self.index_name)