Started working on post-crawl callbacks and basic auth for crawl servers

This commit is contained in:
Simon
2018-06-14 15:05:56 -04:00
parent 1bd58468eb
commit 83ca579ec7
13 changed files with 142 additions and 56 deletions
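The commit message says work has started on post-crawl callbacks and HTTP Basic Auth for the crawl servers, but the diff excerpt below only touches the stress-test helpers. As a rough illustration of the two mechanisms named in the message, here is a minimal sketch, assuming a Flask-based crawl server and a requests-based callback to the main server; the route, credential names and CALLBACK_URL are hypothetical placeholders, not the code from this commit.

# Hypothetical sketch only: route, credentials and CALLBACK_URL are illustrative,
# not the actual od-database implementation.
from functools import wraps

import requests
from flask import Flask, request, abort

app = Flask(__name__)

CRAWL_SERVER_USER = "admin"        # assumed credential pair
CRAWL_SERVER_PASSWORD = "changeme"
CALLBACK_URL = "http://localhost:5000/task/complete"  # assumed callback target on the main server


def require_basic_auth(f):
    # Reject requests that do not carry the expected Basic Auth credentials.
    @wraps(f)
    def wrapper(*args, **kwargs):
        auth = request.authorization
        if not auth or (auth.username, auth.password) != (CRAWL_SERVER_USER, CRAWL_SERVER_PASSWORD):
            abort(401)
        return f(*args, **kwargs)
    return wrapper


@app.route("/task/put", methods=["POST"])
@require_basic_auth
def put_task():
    # ...queue the crawl task here...
    return "ok"


def post_crawl_callback(website_id: int, file_count: int):
    # Notify the main server that a crawl task finished, authenticating with the same credentials.
    requests.post(CALLBACK_URL,
                  json={"website_id": website_id, "file_count": file_count},
                  auth=(CRAWL_SERVER_USER, CRAWL_SERVER_PASSWORD))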


@@ -6,6 +6,14 @@ from concurrent.futures import ThreadPoolExecutor
import requests
import random
terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain") \
    .text.splitlines()

exts = [
    "zip", "exe", "mp3", "avi", "mp4", "rar", "7zip", "ogg", "m4a", "flac", "doc", "docx", "aac", "xls",
    "cab", "txt", "c", "java", "class", "jar", "py", "cpp", "h", "png", "jpg", "jpeg", "ttf", "torrent",
    "part", "blend", "3ds", "obj", "ico", "html", "css", "js", "ts", "ape", "asm", "nasm", "fasm", "o",
    "so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md"
]


def dump_local_filesystem(root_dir: str):
@@ -29,6 +37,31 @@ def dump_local_filesystem(root_dir: str):
        f.writelines(json.dumps(doc) + "\n" for doc in docs)


def random_path():
    return "/".join(random.choices(terms, k=random.randint(1, 5)))


def random_file_name():
    return random.choice(["_", " ", "-", ".", "#", ""]).\
        join(random.choices(terms, k=random.randint(1, 3))) + "." + random.choice(exts)


def get_random_file():
    doc = dict()
    doc["name"] = random_file_name()
    doc["path"] = random_path()
    doc["mtime"] = random.randint(0, 10000000)
    doc["size"] = random.randint(-1, 100000000000000)
    return doc


def dump_random_files(count=10):
    with open("random_dump.json", "w") as f:
        f.writelines(json.dumps(get_random_file()) + "\n" for _ in range(count))


def index_file_list(path: str, website_id):
    es = ElasticSearchEngine("od-database")
@@ -43,14 +76,12 @@ def search(term=""):
def random_searches(count=10000000, max_workers=1000):
    terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\
        .text.splitlines()

    pool = ThreadPoolExecutor(max_workers=max_workers)
    pool.map(search, random.choices(terms, k=count))


# dump_local_filesystem("/mnt/")
index_file_list("crawl_server/crawled/123.json", 10)
# index_file_list("crawl_server/crawled/123.json", 10)
# random_searches(100000)
dump_random_files(20000 * 100000)
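For a quicker local smoke test than the 2,000,000,000-entry dump the script ends with, the new helpers can be driven with small counts; the values below are arbitrary examples, not part of the commit.

dump_random_files(count=1000)                       # write 1000 fake entries to random_dump.json
index_file_list("random_dump.json", website_id=1)   # feed the dump to the "od-database" ES index
# random_searches(count=500, max_workers=20)        # optional: light load test of search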