Mirror of https://github.com/simon987/od-database.git (synced 2025-04-18 01:46:46 +00:00)
More bugfixes for looping directories, some work on task dispatching
This commit is contained in:
Parent: 14d384e366
Commit: 7a4432e4d0
@ -87,8 +87,8 @@ class RemoteDirectoryCrawler:
|
||||
try:
|
||||
try:
|
||||
directory = RemoteDirectoryFactory.get_directory(self.url)
|
||||
path, root_listing = directory.list_dir("")
|
||||
self.crawled_paths.append(path)
|
||||
path_id, root_listing = directory.list_dir(urlparse(self.url).path)
|
||||
self.crawled_paths.append(path_id)
|
||||
directory.close()
|
||||
except TimeoutError:
|
||||
return CrawlResult(0, "timeout")
|
||||
|
@ -73,6 +73,11 @@ class HttpDirectory(RemoteDirectory):
|
||||
"?C=N&O=A",
|
||||
"?C=N&O=A"
|
||||
)
|
||||
FILE_NAME_BLACKLIST = (
|
||||
"Parent Directory",
|
||||
"../"
|
||||
|
||||
)
|
||||
MAX_RETRIES = 3
|
||||
|
||||
def __init__(self, url):
|
||||
@ -114,8 +119,8 @@ class HttpDirectory(RemoteDirectory):
|
||||
urls_to_request.append(urljoin(path_url, anchor.href))
|
||||
|
||||
for file in self.request_files(urls_to_request):
|
||||
files.append(file)
|
||||
path_identifier.update(bytes(file))
|
||||
files.append(file)
|
||||
|
||||
return path_identifier.hexdigest(), files
|
||||
|
||||
@ -197,7 +202,7 @@ class HttpDirectory(RemoteDirectory):
|
||||
|
||||
@staticmethod
|
||||
def _should_ignore(base_url, link: Anchor):
|
||||
if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
|
||||
if link.text in HttpDirectory.FILE_NAME_BLACKLIST or link.href in ("../", "./", "") \
|
||||
or link.href.endswith(HttpDirectory.BLACK_LIST):
|
||||
return True
|
||||
|
||||
|
13
task.py
13
task.py
@ -1,5 +1,3 @@
|
||||
import random
|
||||
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from search.search import ElasticSearchEngine
|
||||
from crawl_server.database import Task, TaskResult
|
||||
@ -180,7 +178,7 @@ class TaskDispatcher:
|
||||
|
||||
queued_tasks_by_server = self._get_current_tasks_by_server()
|
||||
server_with_most_free_slots = None
|
||||
most_free_slots = 0
|
||||
most_free_slots = -10000
|
||||
|
||||
for server in queued_tasks_by_server:
|
||||
free_slots = server.slots - len(queued_tasks_by_server[server])
|
||||
@ -188,12 +186,11 @@ class TaskDispatcher:
|
||||
server_with_most_free_slots = server
|
||||
most_free_slots = free_slots
|
||||
|
||||
if server_with_most_free_slots:
|
||||
print("Dispatching task to '" +
|
||||
server_with_most_free_slots.name + "' " +
|
||||
str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
|
||||
print("Dispatching task to '" +
|
||||
server_with_most_free_slots.name + "' " +
|
||||
str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
|
||||
|
||||
return self.db.get_crawl_servers()[0]
|
||||
return server_with_most_free_slots
|
||||
|
||||
def get_queued_tasks(self) -> list:
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user