mirror of
				https://github.com/simon987/od-database.git
				synced 2025-11-04 06:56:52 +00:00 
			
		
		
		
	Rewrite export.py, add diagram
This commit is contained in:
		
							parent
							
								
									b9f25630b4
								
							
						
					
					
						commit
						d69ed65a0c
					
				
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@ -5,7 +5,6 @@ __pycache__/
 | 
				
			|||||||
captchas/
 | 
					captchas/
 | 
				
			||||||
_stats.json
 | 
					_stats.json
 | 
				
			||||||
config.py
 | 
					config.py
 | 
				
			||||||
db.sqlite3
 | 
					 | 
				
			||||||
oddb.log
 | 
					oddb.log
 | 
				
			||||||
praw.ini
 | 
					praw.ini
 | 
				
			||||||
env/
 | 
					env/
 | 
				
			||||||
 | 
				
			|||||||
@ -242,7 +242,7 @@ class Database:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        return page
 | 
					        return page
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def join_website_on_scan(self, docs: list):
 | 
					    def join_website_url(self, docs):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        websites = self.get_all_websites()
 | 
					        websites = self.get_all_websites()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										70
									
								
								export.py
									
									
									
									
									
								
							
							
						
						
									
										70
									
								
								export.py
									
									
									
									
									
								
							@ -1,39 +1,63 @@
 | 
				
			|||||||
import csv
 | 
					 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
 | 
					import time
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import lz4.frame
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import config
 | 
					import config
 | 
				
			||||||
from database import Database
 | 
					from database import Database
 | 
				
			||||||
from search.search import ElasticSearchEngine
 | 
					from search.search import ElasticSearchEngine
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def export(outfile="out.csv"):
 | 
					def quote(string):
 | 
				
			||||||
 | 
					    if "\"" in string:
 | 
				
			||||||
 | 
					        return "\"" + string.replace("\"", "\"\"") + "\""
 | 
				
			||||||
 | 
					    elif "," in string:
 | 
				
			||||||
 | 
					        return "\"" + string + "\""
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        return string
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
 | 
				
			||||||
 | 
					dldir = "static/downloads/"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					print("Deleting existing dumps")
 | 
				
			||||||
 | 
					for file in os.listdir(dldir):
 | 
				
			||||||
 | 
					    if file.endswith("_dump.csv.lz4"):
 | 
				
			||||||
 | 
					        os.remove(os.path.join(dldir, file))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
print("Export started, connecting to databases...")
 | 
					print("Export started, connecting to databases...")
 | 
				
			||||||
    es = ElasticSearchEngine("od-database")
 | 
					
 | 
				
			||||||
db = Database(config.DB_CONN_STR)
 | 
					db = Database(config.DB_CONN_STR)
 | 
				
			||||||
    docs = es.stream_all_docs()
 | 
					es = ElasticSearchEngine("od-database")
 | 
				
			||||||
    docs_with_website = db.join_website_on_scan(docs)
 | 
					
 | 
				
			||||||
 | 
					docs_with_url = db.join_website_url(es.stream_all_docs())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
print("Connected, writing to csv")
 | 
					print("Connected, writing to csv")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with open(outfile + ".temp", "w") as out:
 | 
					with lz4.frame.open(outfile + ".part", mode='wb',
 | 
				
			||||||
 | 
					                    compression_level=9,
 | 
				
			||||||
 | 
					                    block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp:
 | 
				
			||||||
 | 
					    fp.write((",".join(
 | 
				
			||||||
 | 
					        ["website_id", "website_url", "path", "name", "ext", "size", "mtime"]
 | 
				
			||||||
 | 
					    ) + "\n").encode())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        csv_writer = csv.writer(out)
 | 
					    for doc in docs_with_url:
 | 
				
			||||||
        csv_writer.writerow(["website_id", "website_url", "path", "name", "ext", "size", "mtime"])
 | 
					        try:
 | 
				
			||||||
 | 
					            fp.write(
 | 
				
			||||||
        for doc in docs_with_website:
 | 
					                (",".join(
 | 
				
			||||||
            csv_writer.writerow([doc["_source"]["website_id"],
 | 
					                    [
 | 
				
			||||||
                                 doc["_source"]["website_url"],
 | 
					                        str(doc["_source"]["website_id"]),
 | 
				
			||||||
                                 doc["_source"]["path"] + "/" if doc["_source"]["path"] != "" else "",
 | 
					                        quote(doc["_source"]["website_url"]),
 | 
				
			||||||
                                 doc["_source"]["name"],
 | 
					                        quote(doc["_source"]["path"]),
 | 
				
			||||||
                                 "." + doc["_source"]["ext"] if doc["_source"]["ext"] != "" else "",
 | 
					                        quote(doc["_source"]["name"]),
 | 
				
			||||||
                                 doc["_source"]["size"],
 | 
					                        quote(doc["_source"]["ext"]),
 | 
				
			||||||
                                 doc["_source"]["mtime"]])
 | 
					                        str(doc["_source"]["size"]),
 | 
				
			||||||
    print("Wrote to csv, compressing with xz")
 | 
					                        str(doc["_source"]["mtime"])
 | 
				
			||||||
 | 
					                    ]
 | 
				
			||||||
    os.system("xz -0 " + outfile + ".temp")
 | 
					                ) + "\n").encode())
 | 
				
			||||||
    os.system("mv " + outfile + ".temp.xz " + outfile + ".xz")
 | 
					        except Exception as e:
 | 
				
			||||||
    print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")
 | 
					            print(e)
 | 
				
			||||||
 | 
					            print(doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export("static/out.csv")
 | 
					os.rename(outfile + ".part", os.path.join(dldir, outfile))
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										
											BIN
										
									
								
								high_level_diagram.dia
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								high_level_diagram.dia
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								high_level_diagram.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								high_level_diagram.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| 
		 After Width: | Height: | Size: 332 KiB  | 
@ -22,3 +22,4 @@ matplotlib
 | 
				
			|||||||
uwsgi
 | 
					uwsgi
 | 
				
			||||||
redis
 | 
					redis
 | 
				
			||||||
psycopg2-binary
 | 
					psycopg2-binary
 | 
				
			||||||
 | 
					lz4
 | 
				
			||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user