From cc4c70f4004f7c70331a8c69d663d83a6706c2cf Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 11 Aug 2018 13:05:24 -0400 Subject: [PATCH] Request content is read all at once --- crawl_server/remote_http.py | 29 ++++++++++------------------- test/files/apache_table.html | 21 +++++++++++++++++++++ test/webserver.py | 13 +++++++++++++ 3 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 test/files/apache_table.html create mode 100644 test/webserver.py diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 368aaac..37cf3c1 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -104,7 +104,7 @@ class HttpDirectory(RemoteDirectory): current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1] path_identifier = hashlib.md5(current_dir_name.encode()) path_url = urljoin(self.base_url, path, "") - body = self._stream_body(path_url) + body = self._fetch_body(path_url) anchors = self._parse_links(body) urls_to_request = [] @@ -176,19 +176,16 @@ class HttpDirectory(RemoteDirectory): logger.debug("TimeoutError - _request_file") raise TimeoutError - def _stream_body(self, url: str): + def _fetch_body(self, url: str): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.get(url, stream=True, timeout=HttpDirectory.TIMEOUT) - for chunk in r.iter_content(chunk_size=8192): - try: - yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore") - except LookupError: - # Unsupported encoding - yield chunk.decode("utf-8", errors="ignore") - r.close() - return + r = self.session.get(url, timeout=HttpDirectory.TIMEOUT) + try: + return r.content.decode(r.encoding if r.encoding else "utf-8", errors="ignore") + except LookupError: + # Unsupported encoding + return r.content.decode("utf-8", errors="ignore") except RequestException: self.session.close() retries -= 1 @@ -200,14 +197,8 @@ class HttpDirectory(RemoteDirectory): def _parse_links(body): parser = HTMLAnchorParser() - anchors = [] - - for chunk in body: - parser.feed(chunk) - for anchor in parser.anchors: - anchors.append(anchor) - - return anchors + parser.feed(body) + return parser.anchors @staticmethod def _isdir(link: Anchor): diff --git a/test/files/apache_table.html b/test/files/apache_table.html new file mode 100644 index 0000000..06e21c9 --- /dev/null +++ b/test/files/apache_table.html @@ -0,0 +1,21 @@ + + + + Index of /Public/bootstrap + + +

Index of /Public/bootstrap

+ + + + + + + + + + + +
[ICO]NameLast modifiedSizeDescription

[PARENTDIR]Parent Directory   -  
[   ]bower.json 2017-04-05 01:45 1.0K 
[DIR]css/ 2017-09-07 18:03 -  
[DIR]image/ 2017-09-07 18:03 -  
[DIR]js/ 2017-09-07 18:03 -  
[DIR]less/ 2017-09-07 18:03 -  
[   ]package.json 2017-04-05 01:45 666  

+ + diff --git a/test/webserver.py b/test/webserver.py new file mode 100644 index 0000000..a3a1c14 --- /dev/null +++ b/test/webserver.py @@ -0,0 +1,13 @@ +from flask import Flask, send_file + +app = Flask(__name__) + + +@app.route("/test1/") +def test1(): + return send_file("files/apache_table.html") + + +if __name__ == '__main__': + app.run("0.0.0.0", port=8888, threaded=True) +