mirror of
https://github.com/simon987/od-database.git
synced 2025-04-20 10:56:47 +00:00
Should fix memory usage problem when crawling (part three)
This commit is contained in:
parent
86144935e3
commit
1283cc9599
2
app.py
2
app.py
@ -83,9 +83,7 @@ def website_links(website_id):
|
||||
website = db.get_website_by_id(website_id)
|
||||
|
||||
if website:
|
||||
print("FIXME: website_links")
|
||||
links = searchEngine.get_link_list(website_id, website.url)
|
||||
print(links)
|
||||
return Response("\n".join(links), mimetype="text/plain")
|
||||
else:
|
||||
abort(404)
|
||||
|
@ -37,10 +37,10 @@ class HttpDirectory(RemoteDirectory):
|
||||
results = []
|
||||
|
||||
path_url = os.path.join(self.base_url, path.strip("/"), "")
|
||||
body = self._fetch_body(path_url)
|
||||
body, encoding = self._fetch_body(path_url)
|
||||
if not body:
|
||||
return []
|
||||
links = self._parse_links(body)
|
||||
links = self._parse_links(body, encoding)
|
||||
|
||||
urls_to_request = []
|
||||
|
||||
@ -93,13 +93,13 @@ class HttpDirectory(RemoteDirectory):
|
||||
while retries > 0:
|
||||
try:
|
||||
r = self.session.get(url)
|
||||
return r.content
|
||||
return r.content, r.encoding
|
||||
except RequestException:
|
||||
retries -= 1
|
||||
|
||||
return None
|
||||
|
||||
def _parse_links(self, body: bytes) -> list:
|
||||
def _parse_links(self, body: bytes, encoding) -> list:
|
||||
|
||||
result = list()
|
||||
try:
|
||||
@ -113,7 +113,7 @@ class HttpDirectory(RemoteDirectory):
|
||||
for link in links:
|
||||
result.append((link.text, link.get("href")))
|
||||
except UnicodeDecodeError:
|
||||
tree = etree.HTML(body.decode("utf-8", errors="ignore").encode("utf-8"), parser=self.parser)
|
||||
tree = etree.HTML(body.decode(encoding, errors="ignore").encode("utf-8"), parser=self.parser)
|
||||
links = []
|
||||
try:
|
||||
links = tree.findall(".//a/[@href]")
|
||||
|
@ -4,12 +4,14 @@ import json
|
||||
|
||||
payload = json.dumps({
|
||||
"website_id": 123,
|
||||
"url": "ftp://ien11-3-88-183-194-246.fbx.proxad.net/",
|
||||
"url": "http://alphamediazone.com/data/Movies1/",
|
||||
# "url": "http://localhost:8000/",
|
||||
"priority": 2,
|
||||
"callback_type": "",
|
||||
"callback_args": "{}"
|
||||
})
|
||||
|
||||
r = requests.post("http://localhost:5001/task/put",
|
||||
headers={"Content-Type": "application/json"},
|
||||
headers={"Content-Type": "application/json",
|
||||
"Authorization": "Token abc"},
|
||||
data=payload)
|
||||
|
Loading…
x
Reference in New Issue
Block a user