More bugfixes for looping directories, some work on task dispatching

This commit is contained in:
Simon 2018-06-21 20:50:26 -04:00
parent 14d384e366
commit 7a4432e4d0
3 changed files with 14 additions and 12 deletions

View File

@ -87,8 +87,8 @@ class RemoteDirectoryCrawler:
try:
try:
directory = RemoteDirectoryFactory.get_directory(self.url)
path, root_listing = directory.list_dir("")
self.crawled_paths.append(path)
path_id, root_listing = directory.list_dir(urlparse(self.url).path)
self.crawled_paths.append(path_id)
directory.close()
except TimeoutError:
return CrawlResult(0, "timeout")

View File

@ -73,6 +73,11 @@ class HttpDirectory(RemoteDirectory):
"?C=N&O=A",
"?C=N&O=A"
)
FILE_NAME_BLACKLIST = (
"Parent Directory",
"../"
)
MAX_RETRIES = 3
def __init__(self, url):
@ -114,8 +119,8 @@ class HttpDirectory(RemoteDirectory):
urls_to_request.append(urljoin(path_url, anchor.href))
for file in self.request_files(urls_to_request):
files.append(file)
path_identifier.update(bytes(file))
files.append(file)
return path_identifier.hexdigest(), files
@ -197,7 +202,7 @@ class HttpDirectory(RemoteDirectory):
@staticmethod
def _should_ignore(base_url, link: Anchor):
if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
if link.text in HttpDirectory.FILE_NAME_BLACKLIST or link.href in ("../", "./", "") \
or link.href.endswith(HttpDirectory.BLACK_LIST):
return True

13
task.py
View File

@ -1,5 +1,3 @@
import random
from apscheduler.schedulers.background import BackgroundScheduler
from search.search import ElasticSearchEngine
from crawl_server.database import Task, TaskResult
@ -180,7 +178,7 @@ class TaskDispatcher:
queued_tasks_by_server = self._get_current_tasks_by_server()
server_with_most_free_slots = None
most_free_slots = 0
most_free_slots = -10000
for server in queued_tasks_by_server:
free_slots = server.slots - len(queued_tasks_by_server[server])
@ -188,12 +186,11 @@ class TaskDispatcher:
server_with_most_free_slots = server
most_free_slots = free_slots
if server_with_most_free_slots:
print("Dispatching task to '" +
server_with_most_free_slots.name + "' " +
str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
print("Dispatching task to '" +
server_with_most_free_slots.name + "' " +
str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
return self.db.get_crawl_servers()[0]
return server_with_most_free_slots
def get_queued_tasks(self) -> list: