Mirror of https://github.com/simon987/od-database.git
Attempt to handle looping directories
commit 073551df3c
parent dd93d40a55
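
This change fingerprints every directory listing so the crawler can recognize directories it has already visited (for example, symlink loops that keep yielding the same listing): File gains a __bytes__ serialization, list_dir() now returns a (path_id, listing) pair, and the crawler skips any path whose identifier it has already recorded. A minimal sketch of the idea, assuming made-up entry values and stand-in names (entry_bytes, fingerprint, seen) rather than the actual crawler classes:

    import hashlib

    # Stand-in for File.__bytes__: reduce an entry to a stable byte string.
    def entry_bytes(name, is_dir, size, mtime):
        return b"|".join([
            name.encode(),
            b"D" if is_dir else b"F",
            str(size).encode(),
            str(mtime).encode(),
        ])

    # Stand-in for the hashing done in HttpDirectory.list_dir below: hash the
    # directory name, then every entry; identical listings under the same
    # name produce the same identifier.
    def fingerprint(dir_name, listing):
        digest = hashlib.sha1(dir_name.encode())
        for entry in listing:
            digest.update(entry_bytes(*entry))
        return digest.hexdigest()

    seen = set()
    listing = [("readme.txt", False, 1024, 1524000000), ("sub", True, 0, 0)]
    path_id = fingerprint("movies", listing)
    if path_id not in seen:
        seen.add(path_id)  # crawl it; a looping directory repeats its id and is skipped
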
@@ -20,8 +20,13 @@ class File:
         self.path = path
         self.is_dir = is_dir
 
-    def __str__(self):
-        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name
+    def __bytes__(self):
+        return b"|".join([
+            self.name.encode(),
+            b"D" if self.is_dir else b"F",
+            str(self.size).encode(),
+            str(self.mtime).encode(),
+        ])
 
     def to_json(self):
         return ujson.dumps({
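
A quick illustration of what the new File.__bytes__ produces (the values here are made up); this per-entry byte string is what the SHA-1 path identifier in HttpDirectory.list_dir is later fed:

    f = File(name="readme.txt", mtime=1524000000, path="movies", is_dir=False, size=1024)
    bytes(f)  # b"readme.txt|F|1024|1524000000"
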
@@ -39,7 +44,7 @@ class RemoteDirectory:
     def __init__(self, base_url):
         self.base_url = base_url
 
-    def list_dir(self, path: str) -> list:
+    def list_dir(self, path: str):
         raise NotImplementedError
 
     def close(self):
@@ -82,8 +87,8 @@ class RemoteDirectoryCrawler:
 
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("")
-            self.crawled_paths.append("")
+            path, root_listing = directory.list_dir("")
+            self.crawled_paths.append(path)
             directory.close()
         except TimeoutError:
             return CrawlResult(0, "timeout")
@@ -136,9 +141,9 @@ class RemoteDirectoryCrawler:
                 break
 
             try:
-                if path not in self.crawled_paths:
-                    self.crawled_paths.append(path)
-                    listing = directory.list_dir(path)
+                path_id, listing = directory.list_dir(path)
+                if len(listing) > 0 and path_id not in self.crawled_paths:
+                    self.crawled_paths.append(path_id)
                     timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
 
                     for f in listing:
@@ -148,6 +153,9 @@ class RemoteDirectoryCrawler:
                             files_q.put(f)
                     import sys
                     print("LISTED " + repr(path) + "dirs:" + str(in_q.qsize()))
+                else:
+                    pass
+                    # print("SKIPPED: " + path + ", dropped " + str(len(listing)))
             except TooManyConnectionsError:
                 print("Too many connections")
                 # Kill worker and resubmit listing task
@@ -44,7 +44,7 @@ class FtpDirectory(RemoteDirectory):
                 print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno))
                 time.sleep(2 * random.uniform(0.5, 1.5))
 
-    def list_dir(self, path) -> list:
+    def list_dir(self, path):
         if not self.ftp:
             # No connection - assuming that connection was dropped because too many
             raise TooManyConnectionsError()
@@ -65,7 +65,7 @@ class FtpDirectory(RemoteDirectory):
                         is_dir=is_dir,
                         path=path
                     ))
-                return results
+                return path, results
             except ftputil.error.ParserError as e:
                 print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
                 break
@@ -82,7 +82,7 @@ class FtpDirectory(RemoteDirectory):
                 print(type(e))
                 raise e
 
-        return []
+        return path, []
 
     def try_stat(self, path):
 
@@ -9,6 +9,7 @@ from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
 from dateutil.parser import parse as parse_date
+import hashlib
 
 
 class Anchor:
@@ -66,7 +67,9 @@ class HttpDirectory(RemoteDirectory):
         "?MA",
         "?SA",
         "?DA",
-        "?ND"
+        "?ND",
+        "?C=N&O=A",
+        "?C=N&O=A"
     )
     MAX_RETRIES = 3
 
@@ -79,31 +82,40 @@ class HttpDirectory(RemoteDirectory):
 
     def list_dir(self, path):
 
+        current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
+        path_identifier = hashlib.sha1(current_dir_name.encode())
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
-            return None
+            return None, None
         anchors = self._parse_links(body)
 
         urls_to_request = []
+        files = []
 
         for anchor in anchors:
            if self._should_ignore(self.base_url, anchor):
                 continue
 
             if self._isdir(anchor):
-                yield File(
+
+                directory = File(
                     name=anchor.href,
-                    mtime=None,
-                    size=None,
+                    mtime=0,
+                    size=0,
                     path=path,
                     is_dir=True
                 )
+                path_identifier.update(bytes(directory))
+                files.append(directory)
             else:
                 urls_to_request.append(urljoin(path_url, anchor.href))
 
         for file in self.request_files(urls_to_request):
-            yield file
+            files.append(file)
+            path_identifier.update(bytes(file))
+
+        return path_identifier.hexdigest(), files
 
     def request_files(self, urls_to_request: list) -> list:
 
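
The path identifier above seeds the SHA-1 hash with the name of the directory currently being listed; the slice pulls the last path component out of a trailing-slash path. A short check with a hypothetical path:

    import hashlib

    path = "films/2018/"                                           # hypothetical path
    current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]   # -> "2018"
    path_identifier = hashlib.sha1(current_dir_name.encode())
    # For the root listing (path == "") the slice yields "", so the root's
    # identifier is derived from its entries alone; list_dir() then calls
    # path_identifier.update(bytes(entry)) for each entry and returns
    # path_identifier.hexdigest(), a 40-character hex string.
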
@@ -168,11 +180,14 @@ class HttpDirectory(RemoteDirectory):
     def _parse_links(body):
 
         parser = HTMLAnchorParser()
+        anchors = []
 
         for chunk in body:
             parser.feed(chunk)
             for anchor in parser.anchors:
-                yield anchor
+                anchors.append(anchor)
+
+        return anchors
 
     @staticmethod
     def _isdir(link: Anchor):
@@ -180,7 +195,7 @@ class HttpDirectory(RemoteDirectory):
 
     @staticmethod
     def _should_ignore(base_url, link: Anchor):
-        if link.text == "../" or link.href == "../" or link.href == "./" \
+        if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
                 or link.href.endswith(HttpDirectory.BLACK_LIST):
             return True
 