mirror of
https://github.com/simon987/od-database.git
synced 2025-04-19 18:36:44 +00:00
More bugfixes for looping directories, some work on task dispatching
This commit is contained in:
parent
14d384e366
commit
7a4432e4d0
@ -87,8 +87,8 @@ class RemoteDirectoryCrawler:
|
|||||||
try:
|
try:
|
||||||
try:
|
try:
|
||||||
directory = RemoteDirectoryFactory.get_directory(self.url)
|
directory = RemoteDirectoryFactory.get_directory(self.url)
|
||||||
path, root_listing = directory.list_dir("")
|
path_id, root_listing = directory.list_dir(urlparse(self.url).path)
|
||||||
self.crawled_paths.append(path)
|
self.crawled_paths.append(path_id)
|
||||||
directory.close()
|
directory.close()
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
return CrawlResult(0, "timeout")
|
return CrawlResult(0, "timeout")
|
||||||
|
@ -73,6 +73,11 @@ class HttpDirectory(RemoteDirectory):
|
|||||||
"?C=N&O=A",
|
"?C=N&O=A",
|
||||||
"?C=N&O=A"
|
"?C=N&O=A"
|
||||||
)
|
)
|
||||||
|
FILE_NAME_BLACKLIST = (
|
||||||
|
"Parent Directory",
|
||||||
|
"../"
|
||||||
|
|
||||||
|
)
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
@ -114,8 +119,8 @@ class HttpDirectory(RemoteDirectory):
|
|||||||
urls_to_request.append(urljoin(path_url, anchor.href))
|
urls_to_request.append(urljoin(path_url, anchor.href))
|
||||||
|
|
||||||
for file in self.request_files(urls_to_request):
|
for file in self.request_files(urls_to_request):
|
||||||
files.append(file)
|
|
||||||
path_identifier.update(bytes(file))
|
path_identifier.update(bytes(file))
|
||||||
|
files.append(file)
|
||||||
|
|
||||||
return path_identifier.hexdigest(), files
|
return path_identifier.hexdigest(), files
|
||||||
|
|
||||||
@ -197,7 +202,7 @@ class HttpDirectory(RemoteDirectory):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _should_ignore(base_url, link: Anchor):
|
def _should_ignore(base_url, link: Anchor):
|
||||||
if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
|
if link.text in HttpDirectory.FILE_NAME_BLACKLIST or link.href in ("../", "./", "") \
|
||||||
or link.href.endswith(HttpDirectory.BLACK_LIST):
|
or link.href.endswith(HttpDirectory.BLACK_LIST):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
13
task.py
13
task.py
@ -1,5 +1,3 @@
|
|||||||
import random
|
|
||||||
|
|
||||||
from apscheduler.schedulers.background import BackgroundScheduler
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
from search.search import ElasticSearchEngine
|
from search.search import ElasticSearchEngine
|
||||||
from crawl_server.database import Task, TaskResult
|
from crawl_server.database import Task, TaskResult
|
||||||
@ -180,7 +178,7 @@ class TaskDispatcher:
|
|||||||
|
|
||||||
queued_tasks_by_server = self._get_current_tasks_by_server()
|
queued_tasks_by_server = self._get_current_tasks_by_server()
|
||||||
server_with_most_free_slots = None
|
server_with_most_free_slots = None
|
||||||
most_free_slots = 0
|
most_free_slots = -10000
|
||||||
|
|
||||||
for server in queued_tasks_by_server:
|
for server in queued_tasks_by_server:
|
||||||
free_slots = server.slots - len(queued_tasks_by_server[server])
|
free_slots = server.slots - len(queued_tasks_by_server[server])
|
||||||
@ -188,12 +186,11 @@ class TaskDispatcher:
|
|||||||
server_with_most_free_slots = server
|
server_with_most_free_slots = server
|
||||||
most_free_slots = free_slots
|
most_free_slots = free_slots
|
||||||
|
|
||||||
if server_with_most_free_slots:
|
print("Dispatching task to '" +
|
||||||
print("Dispatching task to '" +
|
server_with_most_free_slots.name + "' " +
|
||||||
server_with_most_free_slots.name + "' " +
|
str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
|
||||||
str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
|
|
||||||
|
|
||||||
return self.db.get_crawl_servers()[0]
|
return server_with_most_free_slots
|
||||||
|
|
||||||
def get_queued_tasks(self) -> list:
|
def get_queued_tasks(self) -> list:
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user