mirror of https://github.com/simon987/od-database.git
Refactoring on http crawler
commit 8421cc0885
parent 7f496ce7a8
@@ -16,6 +16,7 @@ CAPTCHA_SECRET_KEY = ""
 FLASK_SECRET = ""
 USE_SSL = True
 RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
+HEADERS = {}
 ```
 
 ## Running
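The new `HEADERS` option in the sample config lets a deployment send custom HTTP headers with every crawler request; the HTTP crawler picks it up as `config.HEADERS` further down. A minimal sketch of a `config.py` entry, with an illustrative User-Agent value (not a project default):

```python
# config.py -- illustrative values, not defaults shipped with the project
HEADERS = {
    "User-Agent": "od-database-crawler (+https://github.com/simon987/od-database)"
}
```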
@@ -125,8 +125,8 @@ class RemoteDirectoryCrawler:
                            in_q.put(f)
                        else:
                            files_q.put(f)
-        except TooManyConnectionsError as e:
-            print("TOO MANY CONNNS")
+        except TooManyConnectionsError:
+            print("Too many connections")
         except TimeoutError:
             pass
         finally:
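For context, these lines sit in the crawl worker's try block: directory entries go back on the input queue for further crawling, plain files go to the results queue, and connection-limit or timeout errors from the remote host no longer abort the worker. A self-contained toy version of that queue routing, using stand-in names where the hunk does not show the real ones:

```python
from queue import Queue

# Toy stand-ins: the real crawler routes File objects between worker
# threads; the names and tuples below are illustrative only.
in_q = Queue()     # directories still to be crawled
files_q = Queue()  # files discovered so far

listing = [("docs/", True), ("readme.txt", False)]  # (name, is_dir)

for name, is_dir in listing:
    if is_dir:
        in_q.put(name)     # subdirectories are queued for another pass
    else:
        files_q.put(name)  # plain files are recorded directly

print(in_q.qsize(), files_q.qsize())  # -> 1 1
```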
@@ -1,4 +1,4 @@
-from urllib.parse import urlparse, urljoin, unquote
+from urllib.parse import urljoin, unquote
 
 import os
 from lxml import etree
@@ -7,12 +7,20 @@ from crawler.crawler import RemoteDirectory, File
 import requests
 from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
+import config
+
+
+class Link:
+
+    def __init__(self, text: str, url: str):
+        self.text = text
+        self.url = url
 
 
 class HttpDirectory(RemoteDirectory):
 
     SCHEMES = ("http", "https",)
-    HEADERS = {}
+    HEADERS = config.HEADERS
     BLACK_LIST = (
         "?C=N&O=D",
         "?C=M&O=A",
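The new `Link` class replaces the `(text, href)` tuples that `_parse_links` used to emit, so call sites can say `link.text` and `link.url` instead of `link[0]` and `link[1]`. A quick illustration of the access pattern before and after:

```python
link = Link("backups/", "backups/")

# before: positional access into a (text, href) tuple
# text, url = link[0], link[1]

# after: named attributes
print(link.text, link.url)  # backups/ backups/
```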
@@ -23,6 +31,7 @@ class HttpDirectory(RemoteDirectory):
         "?C=S;O=A",
         "?C=D;O=A"
     )
+    MAX_RETRIES = 3
 
     def __init__(self, url):
         super().__init__(url)
@@ -34,6 +43,8 @@ class HttpDirectory(RemoteDirectory):
 
         path_url = urljoin(self.base_url, path)
         body = self._fetch_body(path_url)
+        if not body:
+            return []
         links = self._parse_links(body)
 
         urls_to_request = []
@@ -42,11 +53,11 @@ class HttpDirectory(RemoteDirectory):
 
             if self._should_ignore(link):
                 continue
 
             file_url = urljoin(path_url, link[1])
             path, file_name = os.path.split(file_url[len(self.base_url) - 1:])
 
             if self._isdir(link):
                 results.append(File(
                     name=file_name,
                     mtime="",
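Note that a context line above still indexes the link as a tuple (`link[1]`). If `_parse_links` now yields `Link` objects (see the hunk further down), subscripting would raise `TypeError: 'Link' object is not subscriptable`; presumably this call site wants the attribute form, sketched here as an assumed follow-up rather than anything shown in this commit:

```python
# assumed follow-up fix, not part of this commit as displayed
file_url = urljoin(path_url, link.url)
```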
@@ -57,11 +68,27 @@ class HttpDirectory(RemoteDirectory):
             else:
                 urls_to_request.append(file_url)
 
-        pool = ThreadPool(processes=10)
-        files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
-        for f in files:
-            if f:
-                results.append(f)
+        results.extend(self.request_files(urls_to_request))
+
+        return results
+
+    def request_files(self, urls_to_request: list) -> list:
+
+        results = []
+
+        if len(urls_to_request) > 3:
+            # Many urls, use multi-threaded solution
+            pool = ThreadPool(processes=10)
+            files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
+            for file in files:
+                if file:
+                    results.append(file)
+        else:
+            # Too few urls to create thread pool
+            for url in urls_to_request:
+                file = self._request_file(url)
+                if file:
+                    results.append(file)
 
         return results
 
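The extracted `request_files` keeps the `pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))` trick: `repeat(self)` (from `itertools`) pairs the same instance with every URL, so `starmap` ends up calling the unbound method as `self._request_file(url)` on worker threads, and the `len(urls_to_request) > 3` guard skips thread-pool setup when there is almost nothing to fetch. A standalone sketch of the same dispatch pattern with toy names:

```python
from itertools import repeat
from multiprocessing.pool import ThreadPool

class Fetcher:
    def fetch(self, url):
        return url.upper()  # stand-in for the real HEAD request

urls = ["a", "b", "c", "d"]
f = Fetcher()
with ThreadPool(processes=10) as pool:
    # zip(repeat(f), urls) yields (f, "a"), (f, "b"), ... so starmap
    # effectively runs f.fetch(url) once per URL on the pool's threads
    results = pool.starmap(Fetcher.fetch, zip(repeat(f), urls))
print(results)  # ['A', 'B', 'C', 'D']
```

Since a `ThreadPool` involves no pickling, `pool.map(f.fetch, urls)` with a bound method would behave the same; the `starmap`/`repeat` form simply mirrors what the commit uses.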
@@ -71,10 +98,15 @@ class HttpDirectory(RemoteDirectory):
     @staticmethod
     def _fetch_body(url: str):
 
-        # todo timeout
-        print("FETCH " + url)
-        r = requests.get(url, headers=HttpDirectory.HEADERS)
-        return r.text
+        retries = HttpDirectory.MAX_RETRIES
+        while retries > 0:
+            try:
+                r = requests.get(url, headers=HttpDirectory.HEADERS)
+                return r.text
+            except RequestException:
+                retries -= 1
+
+        return None
 
     def _parse_links(self, body: str) -> set:
 
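`_fetch_body` now retries up to `MAX_RETRIES` times and returns `None` once they are exhausted, but the old `# todo timeout` comment disappears even though this `requests.get` still has no `timeout=` argument, so a single hung server can stall a worker indefinitely. A hedged sketch of the same loop with a timeout and simple backoff (names and values illustrative, not from the project):

```python
import time
import requests
from requests.exceptions import RequestException

def fetch_body(url, headers=None, max_retries=3):
    for attempt in range(max_retries):
        try:
            # timeout= keeps an unresponsive host from blocking forever
            r = requests.get(url, headers=headers or {}, timeout=30)
            return r.text
        except RequestException:
            time.sleep(2 ** attempt)  # crude exponential backoff
    return None
```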
@@ -83,21 +115,19 @@ class HttpDirectory(RemoteDirectory):
         links = tree.findall(".//a/[@href]")
 
         for link in links:
-            result.add((link.text, link.get("href")))
+            result.add(Link(link.text, link.get("href")))
 
         return result
 
     @staticmethod
-    def _isdir(url):
-        return url[1].rsplit("?", maxsplit=1)[0].endswith("/")
+    def _isdir(link: Link):
+        return link.url.rsplit("?", maxsplit=1)[0].endswith("/")
 
     def _request_file(self, url):
 
-        # todo timeout
-        retries = 3
+        retries = HttpDirectory.MAX_RETRIES
         while retries > 0:
             try:
-                print("HEAD " + url)
                 r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)
 
                 stripped_url = r.url[len(self.base_url) - 1:]
@@ -116,8 +146,7 @@ class HttpDirectory(RemoteDirectory):
 
         return None
 
-
     @staticmethod
-    def _should_ignore(link):
-        return link[0] == "../" or link[1].endswith(HttpDirectory.BLACK_LIST)
+    def _should_ignore(link: Link):
+        return link.text == "../" or link.url.endswith(HttpDirectory.BLACK_LIST)
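Both helpers now read directly off `Link` fields: `_isdir` strips any query string and treats an href whose path part ends in `/` as a directory, while `_should_ignore` leans on `str.endswith` accepting a tuple, so a single call tests every `BLACK_LIST` suffix. Illustrative calls, assuming `Link` and `HttpDirectory` from the module above are importable:

```python
# Expected results shown as comments.
print(HttpDirectory._isdir(Link("docs/", "docs/?C=N;O=D")))    # True: path part ends in "/"
print(HttpDirectory._isdir(Link("a.iso", "a.iso")))            # False
print(HttpDirectory._should_ignore(Link("../", "../")))        # True: parent-directory link
print(HttpDirectory._should_ignore(Link("Name", "?C=N&O=D")))  # True: sort-order link in BLACK_LIST
```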