Bug fixes + export to json

Simon 2018-06-11 20:02:30 -04:00
parent d849227798
commit fcfd7d4acc
3 changed files with 58 additions and 26 deletions

View File

@@ -1,20 +1,14 @@
 from crawl_server.database import TaskManagerDatabase, Task
 from multiprocessing import Pool
 from apscheduler.schedulers.background import BackgroundScheduler
-from enum import Enum
 from datetime import datetime
 from crawler.crawler import RemoteDirectoryCrawler


-class TaskResultStatus(Enum):
-    SUCCESS = 0
-    FAILURE = 1
-
-
 class TaskResult:

     def __init__(self):
-        self.status_code: TaskResultStatus = None
+        self.status_code: str = None
         self.file_count = 0
         self.start_time = None
         self.end_time = None

@@ -56,7 +50,10 @@ class TaskManager:
         print("Starting task " + task.url)

         crawler = RemoteDirectoryCrawler(task.url, 10)
-        crawler.crawl_directory()
+        crawl_result = crawler.crawl_directory("12345.json")
+        result.file_count = crawl_result.file_count
+        result.status_code = crawl_result.status_code

         print("End task " + task.url)

@@ -67,6 +64,10 @@ class TaskManager:
     @staticmethod
     def task_complete(result: TaskResult):
         print("Task done " + str(result))
+        print(result.status_code)
+        print(result.file_count)
+        print(result.start_time)
+        print(result.end_time)
         # todo save in db
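
A condensed sketch of the flow after this change, written as if it lived inside the task-runner module above (TaskResult, TaskManager, RemoteDirectoryCrawler and datetime are already in scope there; the helper name and the timing lines are assumptions, not part of the commit):

# Illustration only — not part of the commit.
def run_task_sketch(url: str) -> None:
    result = TaskResult()
    result.start_time = datetime.now()                      # timing is assumed, not shown in the diff

    crawler = RemoteDirectoryCrawler(url, 10)
    crawl_result = crawler.crawl_directory("12345.json")    # output file name is hard-coded in this commit

    result.file_count = crawl_result.file_count
    result.status_code = crawl_result.status_code           # now a plain str, e.g. "success" or "timeout"
    result.end_time = datetime.now()

    TaskManager.task_complete(result)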

View File

@@ -61,20 +61,46 @@ class RemoteDirectoryFactory:
                return dir_engine(url)


+def export_to_json(q: Queue, out_file: str) -> int:
+
+    counter = 0
+
+    with open(out_file, "w") as f:
+        while True:
+            try:
+                next_file: File = q.get_nowait()
+                f.write(next_file.to_json())
+                f.write("\n")
+                counter += 1
+            except Empty:
+                break
+
+    return counter
+
+
+class CrawlResult:
+
+    def __init__(self, file_count: int, status_code: str):
+        self.file_count = file_count
+        self.status_code = status_code
+
+
 class RemoteDirectoryCrawler:

     def __init__(self, url, max_threads: int):
         self.url = url
         self.max_threads = max_threads
+        self.crawled_paths = set()

-    def crawl_directory(self):
+    def crawl_directory(self, out_file: str) -> CrawlResult:
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("/")
+            root_listing = directory.list_dir("")
+            self.crawled_paths.add("")
             directory.close()
         except TimeoutError:
-            return
+            return CrawlResult(0, "timeout")

         in_q = Queue(maxsize=0)
         files_q = Queue(maxsize=0)

@@ -86,12 +112,15 @@ class RemoteDirectoryCrawler:
         threads = []
         for i in range(self.max_threads):
-            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self.url, in_q, files_q))
+            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self, self.url, in_q, files_q))
             threads.append(worker)
             worker.start()

         in_q.join()
-        print("DONE")
+        print("Done")
+
+        exported_count = export_to_json(files_q, out_file)
+        print("exported to " + out_file)

         # Kill threads
         for _ in threads:

@@ -99,11 +128,9 @@ class RemoteDirectoryCrawler:
         for t in threads:
             t.join()

-        print(files_q.qsize())
-        return []
+        return CrawlResult(exported_count, "success")

-    @staticmethod
-    def _process_listings(url: str, in_q: Queue, files_q: Queue):
+    def _process_listings(self, url: str, in_q: Queue, files_q: Queue):

         directory = RemoteDirectoryFactory.get_directory(url)

@@ -118,16 +145,21 @@ class RemoteDirectoryCrawler:
                break

            try:
-                listing = directory.list_dir(os.path.join(file.path, file.name, ""))
+                path = os.path.join(file.path, file.name, "")
+                if path not in self.crawled_paths:
+                    listing = directory.list_dir(path)
+                    self.crawled_paths.add(path)

                    for f in listing:
                        if f.is_dir:
                            in_q.put(f)
                        else:
                            files_q.put(f)
            except TooManyConnectionsError:
                print("Too many connections")
            except TimeoutError:
                pass
            finally:
                in_q.task_done()
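
export_to_json simply drains the files queue into newline-delimited JSON and returns the number of records written, which is how crawl_directory now gets an accurate file_count. A small usage sketch (the stand-in record class below is an assumption; the real crawler enqueues its own File objects, which expose to_json()):

# Usage sketch, not part of the commit. Assumes export_to_json lives in
# crawler.crawler alongside RemoteDirectoryCrawler, as the import in the
# first file suggests.
import json
from queue import Queue
from crawler.crawler import export_to_json

class FakeFile:                              # stand-in: anything with a to_json() method works here
    def __init__(self, path, name):
        self.path, self.name = path, name
    def to_json(self):
        return json.dumps({"path": self.path, "name": self.name})

q = Queue()
q.put(FakeFile("movies", "film.mkv"))
q.put(FakeFile("music", "song.flac"))

count = export_to_json(q, "12345.json")      # writes one JSON object per line
print(str(count) + " files exported")        # -> 2 files exported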

View File

@@ -38,10 +38,9 @@ class HttpDirectory(RemoteDirectory):
         self.parser = etree.HTMLParser(collect_ids=False)

     def list_dir(self, path) -> list:
         results = []

-        path_url = urljoin(self.base_url, path)
+        path_url = os.path.join(self.base_url, path.strip("/"), "")
         body = self._fetch_body(path_url)
         if not body:
             return []

@@ -130,7 +129,7 @@ class HttpDirectory(RemoteDirectory):
         try:
             r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)
-            stripped_url = r.url[len(self.base_url) - 1:]
+            stripped_url = url[len(self.base_url) - 1:]
             path, name = os.path.split(stripped_url)
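
The list_dir change is presumably because urljoin drops the base URL's own path whenever the listing path starts with a slash, which can send the crawler back to the server root; stripping the slashes and joining manually keeps every request under base_url (this relies on POSIX "/" path joining). A quick illustration with a made-up host:

# Illustration only — why the join was changed (example URL is made up).
import os
from urllib.parse import urljoin

base = "http://example.com/pub/"

print(urljoin(base, "/movies/"))                        # http://example.com/movies/     -> /pub/ is lost
print(os.path.join(base, "/movies/".strip("/"), ""))    # http://example.com/pub/movies/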