# Ad-hoc helper script: dump a local filesystem listing to NDJSON, import a
# crawled file list into the search index, and fire random search queries
# against a local instance for load testing.
import os
import json
import sys
from search.search import ElasticSearchEngine
from concurrent.futures import ThreadPoolExecutor
import requests
import random

def dump_local_filesystem(root_dir: str):
    # Walk root_dir and write one JSON document per file (name, path, mtime,
    # size) to local_filesystem.json, one document per line (NDJSON).
    docs = []

    for root, dirs, files in os.walk(root_dir):

        for filename in files:
            full_path = os.path.join(root, filename)
            stats = os.stat(full_path)

            doc = dict()
            doc["name"] = filename
            doc["path"] = root
            doc["mtime"] = stats.st_mtime
            doc["size"] = stats.st_size

            docs.append(doc)

    with open("local_filesystem.json", "w") as f:
        f.writelines(json.dumps(doc) + "\n" for doc in docs)

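
# A minimal sanity check for the NDJSON dump written above; count_dumped_docs
# is a hypothetical helper, not part of od-database, and assumes
# dump_local_filesystem() has already produced local_filesystem.json.
def count_dumped_docs(dump_path: str = "local_filesystem.json") -> int:
    count = 0
    with open(dump_path, "r") as f:
        for line in f:
            json.loads(line)  # raises ValueError if a line is not valid JSON
            count += 1
    return count
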

def index_file_list(path: str, website_id):
    # Import a previously crawled NDJSON file list into the "od-database"
    # Elasticsearch index under the given website id.
    es = ElasticSearchEngine("od-database")
    with open(path, "r") as f:
        es.import_json(f.read(), website_id)

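
# A minimal sketch chaining the two helpers above: dump a local directory and
# import the resulting file straight into the index. index_local_dir and the
# default website_id of 1 are illustrative, not part of od-database itself.
def index_local_dir(root_dir: str, website_id=1):
    dump_local_filesystem(root_dir)
    index_file_list("local_filesystem.json", website_id)
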
def search(term=""):
|
|
requests.get("http://localhost/?&sort_order=score&per_page=100q=" + term, verify=False)
|
|
print(term)
|
|
|
|
|
|
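
# A hedged variant of search() that URL-encodes the term and reports the HTTP
# status code; search_checked is a hypothetical helper and the query-string
# layout is only inferred from the call above.
def search_checked(term=""):
    from urllib.parse import quote
    r = requests.get("http://localhost/?sort_order=score&per_page=100&q=" + quote(term), verify=False)
    print(term, r.status_code)
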

def random_searches(count=10000000, max_workers=1000):
    # Download a public word list and hammer the search endpoint with `count`
    # randomly chosen terms across a large thread pool.
    terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\
        .text.splitlines()

    pool = ThreadPoolExecutor(max_workers=max_workers)
    pool.map(search, random.choices(terms, k=count))

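
# A bounded sketch of the same load test with a context-managed pool, so the
# call blocks until every query has finished; random_searches_bounded and its
# smaller defaults are illustrative assumptions.
def random_searches_bounded(count=1000, max_workers=100):
    terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\
        .text.splitlines()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        list(pool.map(search, random.choices(terms, k=count)))
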
# dump_local_filesystem("/mnt/")
|
|
index_file_list("crawl_server/crawled/123.json", 10)
|
|
# random_searches(100000)
|