Elasticsearch search engine (import from json)

Author: Simon
Date:   2018-06-11 22:35:49 -04:00
parent fcfd7d4acc
commit 72495275b0
9 changed files with 190 additions and 23 deletions


@@ -12,7 +12,7 @@ class TooManyConnectionsError(Exception):

 class File:
-    def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool):
+    def __init__(self, name: str, size: int, mtime: int, path: str, is_dir: bool):
         self.name = name
         self.size = size
         self.mtime = mtime
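
Storing mtime as an integer Unix timestamp rather than a string keeps the JSON export unambiguous and lets Elasticsearch map the field as a date (via the epoch_second date format) without locale-dependent parsing. The to_json method is not shown in this diff; a minimal sketch of what it might look like under the new schema, assuming it simply serializes the instance dict:

    import json

    class File:
        def __init__(self, name: str, size: int, mtime: int, path: str, is_dir: bool):
            self.name = name
            self.size = size
            self.mtime = mtime  # Unix epoch seconds; maps to an Elasticsearch date field
            self.path = path
            self.is_dir = is_dir

        def to_json(self) -> str:
            # Assumed implementation: one compact JSON object per call
            return json.dumps(self.__dict__)
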
@@ -69,8 +69,7 @@ def export_to_json(q: Queue, out_file: str) -> int:
        while True:
            try:
                next_file: File = q.get_nowait()
-                f.write(next_file.to_json())
-                f.write("\n")
+                f.write(next_file.to_json() + "\n")
                counter += 1
            except Empty:
                break
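
Merging the two writes makes each record a single atomic write, and the output is newline-delimited JSON, one document per line, which is the shape Elasticsearch bulk imports expect. The import side is not shown in these hunks; a hedged sketch of how such a file could be streamed in with the elasticsearch-py helpers (the "files" index name and default client settings are assumptions, and index/doc-type details vary by cluster version):

    import json
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import streaming_bulk

    def import_json(out_file: str, index: str = "files") -> None:
        es = Elasticsearch()  # assumes a local cluster on the default port
        with open(out_file) as f:
            actions = ({"_index": index, "_source": json.loads(line)}
                       for line in f if line.strip())
            for ok, result in streaming_bulk(es, actions):
                if not ok:
                    print(result)  # surface per-document failures
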


@@ -61,7 +61,7 @@ class FtpDirectory(RemoteDirectory):
                results.append(File(
                    name=file_name,
-                    mtime=stat.st_mtime,
+                    mtime=stat.st_mtime,  # TODO: check
                    size=-1 if is_dir else stat.st_size,
                    is_dir=is_dir,
                    path=path
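
The TODO is fair: what st_mtime holds here depends on the FTP library's stat implementation, and it often arrives as a float of epoch seconds, or with coarse server-dependent precision. A hypothetical guard that would keep the field consistent with the new int signature:

    # Hypothetical coercion for the TODO above: accept float or int epoch
    # seconds from the FTP library, defaulting to 0 when unavailable.
    mtime = int(stat.st_mtime) if stat.st_mtime else 0
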


@@ -8,6 +8,7 @@ import requests
 from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
+from dateutil.parser import parse as parse_date


 class Link:
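
dateutil's parser understands the RFC 1123 format used by HTTP Date headers without an explicit format string, which is what the new mtime conversion below relies on. A quick self-contained check:

    from dateutil.parser import parse as parse_date

    dt = parse_date("Mon, 11 Jun 2018 22:35:49 GMT")  # timezone-aware datetime
    print(int(dt.timestamp()))                        # 1528756549
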
@@ -59,7 +60,7 @@ class HttpDirectory(RemoteDirectory):
            if self._isdir(link):
                results.append(File(
                    name=file_name,
-                    mtime="",
+                    mtime=0,
                    size=-1,
                    is_dir=True,
                    path=path
@@ -79,6 +80,7 @@ class HttpDirectory(RemoteDirectory):
            # Many urls, use multi-threaded solution
            pool = ThreadPool(processes=10)
            files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
+            pool.close()
            for file in files:
                if file:
                    results.append(file)
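
Calling pool.close() once starmap has returned lets the ten worker threads shut down promptly instead of lingering until interpreter exit. An equivalent pattern, sketched with placeholder names for the real calls, is the pool's context manager; note that __exit__ calls terminate(), which is safe here because starmap blocks until all results are in:

    from multiprocessing.pool import ThreadPool

    with ThreadPool(processes=10) as pool:
        # fetch_one and jobs stand in for HttpDirectory._request_file
        # and its argument tuples in the code above
        files = pool.starmap(fetch_one, jobs)
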
@@ -132,12 +134,12 @@ class HttpDirectory(RemoteDirectory):
            stripped_url = url[len(self.base_url) - 1:]
            path, name = os.path.split(stripped_url)
+            date = r.headers["Date"] if "Date" in r.headers else "1970-01-01"
            return File(
                path=unquote(path).strip("/"),
                name=unquote(name),
                size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
-                mtime=r.headers["Date"] if "Date" in r.headers else "?",
+                mtime=int(parse_date(date).timestamp()),
                is_dir=False
            )
        except RequestException:
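
One caveat with the "1970-01-01" fallback: parse_date returns a timezone-naive datetime for it, and a naive .timestamp() is interpreted in the machine's local timezone, so the result is 0 only when the crawler runs in UTC. A hedged variant that pins the fallback to the epoch (header_to_epoch is a hypothetical helper, not part of this commit):

    from datetime import datetime, timezone
    from dateutil.parser import parse as parse_date

    def header_to_epoch(headers) -> int:
        # Parsed HTTP Date headers carry a timezone; the fallback is pinned to UTC
        if "Date" in headers:
            return int(parse_date(headers["Date"]).timestamp())
        return int(datetime(1970, 1, 1, tzinfo=timezone.utc).timestamp())  # == 0
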