simon987/od-database (mirror of https://github.com/simon987/od-database.git)

Should fix memory usage problem when crawling (part three)

commit 1283cc9599
parent 86144935e3
app.py (2 changes)
@@ -83,9 +83,7 @@ def website_links(website_id):
     website = db.get_website_by_id(website_id)
 
     if website:
-        print("FIXME: website_links")
         links = searchEngine.get_link_list(website_id, website.url)
-        print(links)
         return Response("\n".join(links), mimetype="text/plain")
     else:
         abort(404)
@@ -37,10 +37,10 @@ class HttpDirectory(RemoteDirectory):
         results = []
 
         path_url = os.path.join(self.base_url, path.strip("/"), "")
-        body = self._fetch_body(path_url)
+        body, encoding = self._fetch_body(path_url)
         if not body:
             return []
-        links = self._parse_links(body)
+        links = self._parse_links(body, encoding)
 
         urls_to_request = []
 
@@ -93,13 +93,13 @@ class HttpDirectory(RemoteDirectory):
         while retries > 0:
             try:
                 r = self.session.get(url)
-                return r.content
+                return r.content, r.encoding
             except RequestException:
                 retries -= 1
 
         return None
 
-    def _parse_links(self, body: bytes) -> list:
+    def _parse_links(self, body: bytes, encoding) -> list:
 
         result = list()
         try:
@@ -113,7 +113,7 @@ class HttpDirectory(RemoteDirectory):
             for link in links:
                 result.append((link.text, link.get("href")))
         except UnicodeDecodeError:
-            tree = etree.HTML(body.decode("utf-8", errors="ignore").encode("utf-8"), parser=self.parser)
+            tree = etree.HTML(body.decode(encoding, errors="ignore").encode("utf-8"), parser=self.parser)
             links = []
             try:
                 links = tree.findall(".//a/[@href]")
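Taken together, the hunks above move the crawler to a pattern where the fetch step returns both the raw bytes and the encoding reported by requests, and the link parser falls back to that encoding when the body is not valid UTF-8. Below is a minimal standalone sketch of that pattern, with the class stripped away and simplified error handling; the helper names and the example URL are illustrative, not the module's actual code.

import requests
from lxml import etree
from requests.exceptions import RequestException


def fetch_body(session, url, retries=3):
    # Return (raw bytes, encoding reported by requests), or (None, None)
    # once the retries are exhausted.
    while retries > 0:
        try:
            r = session.get(url)
            return r.content, r.encoding
        except RequestException:
            retries -= 1
    return None, None


def parse_links(body, encoding):
    # Decode as UTF-8 first; if that fails, fall back to the encoding the
    # server reported, dropping any bytes that still cannot be decoded.
    try:
        html = body.decode("utf-8")
    except UnicodeDecodeError:
        html = body.decode(encoding or "utf-8", errors="ignore")
    tree = etree.HTML(html.encode("utf-8"))
    if tree is None:
        return []
    return [(a.text, a.get("href")) for a in tree.findall(".//a[@href]")]


# Example usage against a hypothetical open directory listing:
session = requests.Session()
body, encoding = fetch_body(session, "http://example.com/files/")
if body:
    for name, href in parse_links(body, encoding):
        print(name, href)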
@@ -4,12 +4,14 @@ import json
 
 payload = json.dumps({
     "website_id": 123,
-    "url": "ftp://ien11-3-88-183-194-246.fbx.proxad.net/",
+    "url": "http://alphamediazone.com/data/Movies1/",
+    # "url": "http://localhost:8000/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })
 
 r = requests.post("http://localhost:5001/task/put",
-                  headers={"Content-Type": "application/json"},
+                  headers={"Content-Type": "application/json",
+                           "Authorization": "Token abc"},
                   data=payload)
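The last hunk suggests the crawl server's /task/put endpoint now expects a token in the Authorization header. Purely as an illustration of the request shape, the call can be wrapped in a small helper; the port, token value, and payload fields are taken from the test script above and are not guaranteed to match a real deployment.

import json
import requests


def put_task(server, token, website_id, url, priority=2):
    # POST a crawl task to the crawl server, authenticating with a token
    # in the Authorization header as the updated test script does.
    payload = json.dumps({
        "website_id": website_id,
        "url": url,
        "priority": priority,
        "callback_type": "",
        "callback_args": "{}"
    })
    return requests.post(server + "/task/put",
                         headers={"Content-Type": "application/json",
                                  "Authorization": "Token " + token},
                         data=payload)


# Example usage, mirroring the values in the diff:
r = put_task("http://localhost:5001", "abc", 123,
             "http://alphamediazone.com/data/Movies1/")
print(r.status_code)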