diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 368aaac..37cf3c1 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -104,7 +104,7 @@ class HttpDirectory(RemoteDirectory): current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1] path_identifier = hashlib.md5(current_dir_name.encode()) path_url = urljoin(self.base_url, path, "") - body = self._stream_body(path_url) + body = self._fetch_body(path_url) anchors = self._parse_links(body) urls_to_request = [] @@ -176,19 +176,16 @@ class HttpDirectory(RemoteDirectory): logger.debug("TimeoutError - _request_file") raise TimeoutError - def _stream_body(self, url: str): + def _fetch_body(self, url: str): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.get(url, stream=True, timeout=HttpDirectory.TIMEOUT) - for chunk in r.iter_content(chunk_size=8192): - try: - yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore") - except LookupError: - # Unsupported encoding - yield chunk.decode("utf-8", errors="ignore") - r.close() - return + r = self.session.get(url, timeout=HttpDirectory.TIMEOUT) + try: + return r.content.decode(r.encoding if r.encoding else "utf-8", errors="ignore") + except LookupError: + # Unsupported encoding + return r.content.decode("utf-8", errors="ignore") except RequestException: self.session.close() retries -= 1 @@ -200,14 +197,8 @@ class HttpDirectory(RemoteDirectory): def _parse_links(body): parser = HTMLAnchorParser() - anchors = [] - - for chunk in body: - parser.feed(chunk) - for anchor in parser.anchors: - anchors.append(anchor) - - return anchors + parser.feed(body) + return parser.anchors @staticmethod def _isdir(link: Anchor): diff --git a/test/files/apache_table.html b/test/files/apache_table.html new file mode 100644 index 0000000..06e21c9 --- /dev/null +++ b/test/files/apache_table.html @@ -0,0 +1,21 @@ + + +
+![]() | Name | Last modified | Size | Description |
---|---|---|---|---|
![]() | Parent Directory | - | ||
![]() | bower.json | 2017-04-05 01:45 | 1.0K | |
![]() | css/ | 2017-09-07 18:03 | - | |
![]() | image/ | 2017-09-07 18:03 | - | |
![]() | js/ | 2017-09-07 18:03 | - | |
![]() | less/ | 2017-09-07 18:03 | - | |
![]() | package.json | 2017-04-05 01:45 | 666 | |