mirror of
https://github.com/simon987/od-database.git
synced 2025-12-14 23:29:04 +00:00
Logging and bugfix for http crawler
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
from crawl_server import logger
|
||||
from urllib.parse import unquote, urljoin
|
||||
import os
|
||||
from html.parser import HTMLParser
|
||||
@@ -19,6 +20,9 @@ class Anchor:
|
||||
self.text = None
|
||||
self.href = None
|
||||
|
||||
def __str__(self):
    """Return a debug representation of the anchor as ``<href, text>``.

    Both ``href`` and ``text`` are passed through ``str()`` before being
    concatenated: they start out as ``None``, and concatenating ``None``
    to a string would raise ``TypeError`` for an anchor that has not been
    populated yet. Surrounding whitespace is stripped from the text part.
    """
    return "<" + str(self.href) + ", " + str(self.text).strip() + ">"
|
||||
|
||||
|
||||
class HTMLAnchorParser(HTMLParser):
|
||||
|
||||
@@ -46,7 +50,7 @@ class HTMLAnchorParser(HTMLParser):
|
||||
self.current_anchor = None
|
||||
|
||||
def error(self, message):
    """Log an HTML parsing error at debug level instead of raising.

    NOTE(review): presumably this overrides the ``error`` hook of the
    ``HTMLParser`` base class -- confirm against the Python version in use.

    :param message: description of the parsing problem; coerced with
        ``str()`` so a non-string value cannot break the concatenation.
    """
    logger.debug("HTML Parser error: " + str(message))
|
||||
|
||||
def feed(self, data):
|
||||
self.anchors.clear()
|
||||
@@ -181,7 +185,6 @@ class HttpDirectory(RemoteDirectory):
|
||||
# Unsupported encoding
|
||||
yield chunk.decode("utf-8", errors="ignore")
|
||||
r.close()
|
||||
del r
|
||||
break
|
||||
except RequestException:
|
||||
self.session.close()
|
||||
@@ -208,7 +211,7 @@ class HttpDirectory(RemoteDirectory):
|
||||
|
||||
@staticmethod
|
||||
def _should_ignore(base_url, link: Anchor):
|
||||
if link.text in HttpDirectory.FILE_NAME_BLACKLIST or link.href in ("../", "./", "") \
|
||||
if link.text in HttpDirectory.FILE_NAME_BLACKLIST or link.href in ("../", "./", "", "..", "../../") \
|
||||
or link.href.endswith(HttpDirectory.BLACK_LIST):
|
||||
return True
|
||||
|
||||
@@ -217,7 +220,12 @@ class HttpDirectory(RemoteDirectory):
|
||||
if not full_url.startswith(base_url):
|
||||
return True
|
||||
|
||||
# Ignore parameters in url
|
||||
if "?" in link.href:
|
||||
return True
|
||||
|
||||
def close(self):
    """Close the underlying session, then log which base URL was closed."""
    # The session is closed before the message is built, so it is released
    # even if building or emitting the log line fails.
    self.session.close()
    closing_note = "Closing HTTPRemoteDirectory for " + self.base_url
    logger.debug(closing_note)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user