Mirror of https://github.com/simon987/od-database.git
Commit fcfd7d4acc (parent d849227798): Bug fixes + export to json
@@ -1,20 +1,14 @@
 from crawl_server.database import TaskManagerDatabase, Task
 from multiprocessing import Pool
 from apscheduler.schedulers.background import BackgroundScheduler
-from enum import Enum
 from datetime import datetime
 from crawler.crawler import RemoteDirectoryCrawler


-class TaskResultStatus(Enum):
-    SUCCESS = 0
-    FAILURE = 1
-
-
 class TaskResult:

     def __init__(self):
-        self.status_code: TaskResultStatus = None
+        self.status_code: str = None
         self.file_count = 0
         self.start_time = None
         self.end_time = None
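The `TaskResultStatus` enum is gone; `TaskResult.status_code` is now a plain string, matching the `status_code: str` carried by the new `CrawlResult` class further down. A plausible motivation (my reading, not stated in the commit message) is serialization: enum members are not JSON-serializable by default, while strings round-trip for free. A minimal sketch:

import json
from enum import Enum

class TaskResultStatus(Enum):
    SUCCESS = 0

# json.dumps({"status": TaskResultStatus.SUCCESS})  # raises TypeError
print(json.dumps({"status": "success"}))            # '{"status": "success"}'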
@@ -56,7 +50,10 @@ class TaskManager:
         print("Starting task " + task.url)

         crawler = RemoteDirectoryCrawler(task.url, 10)
-        crawler.crawl_directory()
+        crawl_result = crawler.crawl_directory("12345.json")

+        result.file_count = crawl_result.file_count
+        result.status_code = crawl_result.status_code
+
         print("End task " + task.url)

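`crawl_directory` now returns a `CrawlResult` whose fields are copied onto the `TaskResult`; note the output file name "12345.json" is still a hard-coded placeholder. For orientation, a hypothetical sketch of the surrounding method (the name `execute_task` and the timing lines are assumptions; only the middle lines appear in this hunk):

def execute_task(self, task: Task):
    result = TaskResult()
    result.start_time = datetime.utcnow()  # assumed, not shown in the hunk

    print("Starting task " + task.url)
    crawler = RemoteDirectoryCrawler(task.url, 10)
    crawl_result = crawler.crawl_directory("12345.json")  # hard-coded output path

    result.file_count = crawl_result.file_count
    result.status_code = crawl_result.status_code

    result.end_time = datetime.utcnow()  # assumed
    print("End task " + task.url)
    return result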
@@ -67,6 +64,10 @@ class TaskManager:
     @staticmethod
     def task_complete(result: TaskResult):
         print("Task done " + str(result))
+        print(result.status_code)
+        print(result.file_count)
+        print(result.start_time)
+        print(result.end_time)
         # todo save in db


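The new prints expose the result fields while `# todo save in db` remains open. One hedged sketch of what the TODO could look like using plain sqlite3; `TaskManagerDatabase` is imported above but its API is not visible in this diff, and the table and column names here are invented for illustration:

import sqlite3

def save_result(db_path: str, result: TaskResult):
    # Hypothetical storage for the TODO; the real code would presumably
    # go through TaskManagerDatabase instead of raw sqlite3.
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            "INSERT INTO task_result (status_code, file_count, start_time, end_time) "
            "VALUES (?, ?, ?, ?)",
            (result.status_code, result.file_count, result.start_time, result.end_time))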
@@ -61,20 +61,46 @@ class RemoteDirectoryFactory:
                 return dir_engine(url)


+def export_to_json(q: Queue, out_file: str) -> int:
+
+    counter = 0
+
+    with open(out_file, "w") as f:
+        while True:
+            try:
+                next_file: File = q.get_nowait()
+                f.write(next_file.to_json())
+                f.write("\n")
+                counter += 1
+            except Empty:
+                break
+
+    return counter
+
+
+class CrawlResult:
+
+    def __init__(self, file_count: int, status_code: str):
+        self.file_count = file_count
+        self.status_code = status_code
+
+
 class RemoteDirectoryCrawler:

     def __init__(self, url, max_threads: int):
         self.url = url
         self.max_threads = max_threads
+        self.crawled_paths = set()

-    def crawl_directory(self):
+    def crawl_directory(self, out_file: str) -> CrawlResult:

         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("/")
+            root_listing = directory.list_dir("")
+            self.crawled_paths.add("")
             directory.close()
         except TimeoutError:
-            return
+            return CrawlResult(0, "timeout")

         in_q = Queue(maxsize=0)
         files_q = Queue(maxsize=0)
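`export_to_json` drains the files queue with `get_nowait()` until `queue.Empty` and writes one JSON object per line (newline-delimited JSON). A self-contained sketch of the same pattern, with a stub standing in for the crawler's `File` class (which this diff does not show):

import json
from queue import Queue, Empty

class StubFile:
    # Stand-in for the crawler's File; only to_json() matters here.
    def __init__(self, name):
        self.name = name

    def to_json(self):
        return json.dumps({"name": self.name})

q = Queue()
for n in ("a.txt", "b.txt"):
    q.put(StubFile(n))

counter = 0
with open("out.ndjson", "w") as f:
    while True:
        try:
            f.write(q.get_nowait().to_json() + "\n")
            counter += 1
        except Empty:
            break
print(counter)  # 2

Draining with `get_nowait()` is only safe once producers are finished; the call site below runs it after `in_q.join()` has returned.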
@@ -86,12 +112,15 @@ class RemoteDirectoryCrawler:

         threads = []
         for i in range(self.max_threads):
-            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self.url, in_q, files_q))
+            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self, self.url, in_q, files_q))
             threads.append(worker)
             worker.start()

         in_q.join()
-        print("DONE")
+        print("Done")
+
+        exported_count = export_to_json(files_q, out_file)
+        print("exported to " + out_file)

         # Kill threads
         for _ in threads:
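`export_to_json(files_q, out_file)` runs only after `in_q.join()` returns, which happens once every queued directory has been matched by a `task_done()` call in `_process_listings`. A minimal sketch of that `Queue.join()` contract:

from queue import Queue
from threading import Thread

q = Queue()

def worker():
    while True:
        item = q.get()
        # ... process item ...
        q.task_done()  # exactly one task_done() per get(), or join() never returns

Thread(target=worker, daemon=True).start()
for i in range(3):
    q.put(i)
q.join()  # blocks until all three items are marked done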
@@ -99,11 +128,9 @@ class RemoteDirectoryCrawler:
         for t in threads:
             t.join()

-        print(files_q.qsize())
-        return []
+        return CrawlResult(exported_count, "success")

-    @staticmethod
-    def _process_listings(url: str, in_q: Queue, files_q: Queue):
+    def _process_listings(self, url: str, in_q: Queue, files_q: Queue):

         directory = RemoteDirectoryFactory.get_directory(url)

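With `@staticmethod` dropped, `self` is threaded through `args=` so the workers can reach `self.crawled_paths`. Passing the instance explicitly works, but a bound method is the more idiomatic route; a runnable sketch of the alternative (not what the commit does):

from queue import Queue
from threading import Thread

class Crawler:
    def __init__(self, url):
        self.url = url
        self.crawled_paths = set()

    def _process_listings(self, url, in_q, files_q):
        print(self.url == url)  # instance state is reachable without passing self

    def crawl(self):
        in_q, files_q = Queue(), Queue()
        # target=self._process_listings is already bound to the instance.
        t = Thread(target=self._process_listings, args=(self.url, in_q, files_q))
        t.start()
        t.join()

Crawler("http://example.com/").crawl()  # prints True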
@@ -118,16 +145,21 @@ class RemoteDirectoryCrawler:
                 break

             try:
-                listing = directory.list_dir(os.path.join(file.path, file.name, ""))
+                path = os.path.join(file.path, file.name, "")
+                if path not in self.crawled_paths:
+                    listing = directory.list_dir(path)
+                    self.crawled_paths.add(path)

-                for f in listing:
-                    if f.is_dir:
-                        in_q.put(f)
-                    else:
-                        files_q.put(f)
+                    for f in listing:
+                        if f.is_dir:
+                            in_q.put(f)
+                        else:
+                            files_q.put(f)
             except TooManyConnectionsError:
                 print("Too many connections")
             except TimeoutError:
                 pass
             finally:
                 in_q.task_done()
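`crawled_paths` is an ordinary `set` shared by up to `max_threads` workers. Each individual lookup and `add` is safe under CPython's GIL, but the check-then-add pair is not atomic, so two workers can pass the `if` with the same path and list it twice, producing duplicate queue entries. A race-free variant, assuming a `threading.Lock` created in `__init__` (the lock is my addition, not part of this commit):

from threading import Lock

# assumed in __init__:  self.crawled_paths_lock = Lock()

path = os.path.join(file.path, file.name, "")
with self.crawled_paths_lock:
    first_visit = path not in self.crawled_paths
    if first_visit:
        self.crawled_paths.add(path)
if first_visit:
    listing = directory.list_dir(path)  # slow network I/O stays outside the lock
    # ... enqueue the listing as above ...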
@@ -38,10 +38,9 @@ class HttpDirectory(RemoteDirectory):
         self.parser = etree.HTMLParser(collect_ids=False)

     def list_dir(self, path) -> list:
-
         results = []

-        path_url = urljoin(self.base_url, path)
+        path_url = os.path.join(self.base_url, path.strip("/"), "")
         body = self._fetch_body(path_url)
         if not body:
             return []
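The switch from `urljoin` to `os.path.join` with `path.strip("/")` changes how `path` combines with `base_url`: `urljoin` treats a leading slash as an absolute path and discards the base URL's own path component, while the stripped join always descends below the base, and the trailing `""` guarantees a trailing slash. A quick comparison (POSIX `os.path.join` separators assumed; on Windows this would produce backslashes):

import os
from urllib.parse import urljoin

base = "http://example.com/folder/"  # illustrative value
print(urljoin(base, "/sub/dir"))                      # http://example.com/sub/dir (base path lost)
print(os.path.join(base, "/sub/dir".strip("/"), ""))  # http://example.com/folder/sub/dir/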
@@ -130,7 +129,7 @@ class HttpDirectory(RemoteDirectory):
         try:
             r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)

-            stripped_url = r.url[len(self.base_url) - 1:]
+            stripped_url = url[len(self.base_url) - 1:]

             path, name = os.path.split(stripped_url)

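Slicing from `len(self.base_url) - 1` keeps the leading slash, and using the original `url` instead of the response's `r.url` makes the result independent of any normalization requests applies to the response URL. With illustrative values:

import os

base_url = "http://example.com/"  # illustrative values
url = "http://example.com/folder/file.bin"

stripped_url = url[len(base_url) - 1:]    # "/folder/file.bin"
path, name = os.path.split(stripped_url)  # ("/folder", "file.bin")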