mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-25 19:56:51 +00:00 
			
		
		
		
	Slowly losing my sanity part 1: Removed scrapy dependency and moved to custom solution. Added multi-threaded ftp crawler
This commit is contained in:
		
							parent
							
								
									b649b82854
								
							
						
					
					
						commit
						7f496ce7a8
					
				
							
								
								
									
										133
									
								
								crawler/crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										133
									
								
								crawler/crawler.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,133 @@ | |||||||
|  | import os | ||||||
|  | import json | ||||||
|  | from urllib.parse import urlparse | ||||||
|  | from timeout_decorator.timeout_decorator import TimeoutError | ||||||
|  | from threading import Thread | ||||||
|  | from queue import Queue, Empty | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TooManyConnectionsError(Exception): | ||||||
|  |     pass | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class File: | ||||||
|  | 
 | ||||||
|  |     def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool): | ||||||
|  |         self.name = name | ||||||
|  |         self.size = size | ||||||
|  |         self.mtime = mtime | ||||||
|  |         self.path = path | ||||||
|  |         self.is_dir = is_dir | ||||||
|  | 
 | ||||||
|  |     def __str__(self): | ||||||
|  |         return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name | ||||||
|  | 
 | ||||||
|  |     def to_json(self): | ||||||
|  |         return json.dumps({ | ||||||
|  |             "name": self.name, | ||||||
|  |             "size": self.size, | ||||||
|  |             "mtime": self.mtime, | ||||||
|  |             "path": self.path, | ||||||
|  |         }) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class RemoteDirectory: | ||||||
|  | 
 | ||||||
|  |     SCHEMES = () | ||||||
|  | 
 | ||||||
|  |     def __init__(self, base_url): | ||||||
|  |         self.base_url = base_url | ||||||
|  | 
 | ||||||
|  |     def list_dir(self, path: str) -> list: | ||||||
|  |         raise NotImplementedError | ||||||
|  | 
 | ||||||
|  |     def close(self): | ||||||
|  |         pass | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class RemoteDirectoryFactory: | ||||||
|  | 
 | ||||||
|  |     from crawler.ftp import FtpDirectory | ||||||
|  |     from crawler.http import HttpDirectory | ||||||
|  |     DIR_ENGINES = (FtpDirectory, HttpDirectory) | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def get_directory(url) -> RemoteDirectory: | ||||||
|  | 
 | ||||||
|  |         parsed_url = urlparse(url) | ||||||
|  | 
 | ||||||
|  |         for dir_engine in RemoteDirectoryFactory.DIR_ENGINES: | ||||||
|  |             if parsed_url.scheme in dir_engine.SCHEMES: | ||||||
|  |                 return dir_engine(url) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class RemoteDirectoryCrawler: | ||||||
|  | 
 | ||||||
|  |     def __init__(self, url, max_threads: int): | ||||||
|  |         self.url = url | ||||||
|  |         self.max_threads = max_threads | ||||||
|  | 
 | ||||||
|  |     def crawl_directory(self): | ||||||
|  | 
 | ||||||
|  |         try: | ||||||
|  |             directory = RemoteDirectoryFactory.get_directory(self.url) | ||||||
|  |             root_listing = directory.list_dir("/dl2/")  # todo get path | ||||||
|  |             directory.close() | ||||||
|  |         except TimeoutError: | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         in_q = Queue(maxsize=0) | ||||||
|  |         files_q = Queue(maxsize=0) | ||||||
|  |         for f in root_listing: | ||||||
|  |             if f.is_dir: | ||||||
|  |                 in_q.put(f) | ||||||
|  |             else: | ||||||
|  |                 files_q.put(f) | ||||||
|  | 
 | ||||||
|  |         threads = [] | ||||||
|  |         for i in range(self.max_threads): | ||||||
|  |             worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self.url, in_q, files_q)) | ||||||
|  |             threads.append(worker) | ||||||
|  |             worker.start() | ||||||
|  | 
 | ||||||
|  |         in_q.join() | ||||||
|  |         print("DONE") | ||||||
|  | 
 | ||||||
|  |         # Kill threads | ||||||
|  |         for _ in threads: | ||||||
|  |             in_q.put(None) | ||||||
|  |         for t in threads: | ||||||
|  |             t.join() | ||||||
|  | 
 | ||||||
|  |         print(files_q.qsize()) | ||||||
|  |         return [] | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def _process_listings(url: str, in_q: Queue, files_q: Queue): | ||||||
|  | 
 | ||||||
|  |         directory = RemoteDirectoryFactory.get_directory(url) | ||||||
|  | 
 | ||||||
|  |         while directory: | ||||||
|  | 
 | ||||||
|  |             try: | ||||||
|  |                 file = in_q.get(timeout=60) | ||||||
|  |             except Empty: | ||||||
|  |                 break | ||||||
|  | 
 | ||||||
|  |             if file is None: | ||||||
|  |                 break | ||||||
|  | 
 | ||||||
|  |             try: | ||||||
|  |                 listing = directory.list_dir(os.path.join(file.path, file.name, "")) | ||||||
|  | 
 | ||||||
|  |                 for f in listing: | ||||||
|  |                     if f.is_dir: | ||||||
|  |                         in_q.put(f) | ||||||
|  |                     else: | ||||||
|  |                         files_q.put(f) | ||||||
|  |             except TooManyConnectionsError as e: | ||||||
|  |                 print("TOO MANY CONNNS") | ||||||
|  |             except TimeoutError: | ||||||
|  |                 pass | ||||||
|  |             finally: | ||||||
|  |                 in_q.task_done() | ||||||
							
								
								
									
										79
									
								
								crawler/ftp.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								crawler/ftp.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,79 @@ | |||||||
|  | #! /usr/bin/env python | ||||||
|  | 
 | ||||||
|  | from urllib.parse import urlparse | ||||||
|  | import os | ||||||
|  | import time | ||||||
|  | import ftputil | ||||||
|  | import ftputil.error | ||||||
|  | from ftputil.session import session_factory | ||||||
|  | import random | ||||||
|  | import timeout_decorator | ||||||
|  | from crawler.crawler import RemoteDirectory, File, TooManyConnectionsError | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class FtpDirectory(RemoteDirectory): | ||||||
|  | 
 | ||||||
|  |     SCHEMES = ("ftp", ) | ||||||
|  | 
 | ||||||
|  |     def __init__(self, url): | ||||||
|  |         host = urlparse(url).netloc | ||||||
|  |         super().__init__(host) | ||||||
|  |         self.failed_attempts = 0 | ||||||
|  |         self.max_attempts = 2 | ||||||
|  |         self.ftp = None | ||||||
|  |         self.stop_when_connected() | ||||||
|  | 
 | ||||||
|  |     def _connect(self): | ||||||
|  |         self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database", session_factory=session_factory( | ||||||
|  |             use_passive_mode=False | ||||||
|  |         )) | ||||||
|  | 
 | ||||||
|  |     def stop_when_connected(self): | ||||||
|  |         while self.failed_attempts < self.max_attempts: | ||||||
|  |             try: | ||||||
|  |                 self._connect() | ||||||
|  |                 self.failed_attempts = 0 | ||||||
|  |                 break | ||||||
|  |             except ftputil.error.FTPError as e: | ||||||
|  | 
 | ||||||
|  |                 if e.errno == 530: | ||||||
|  |                     print("Cancel connection - too many connections") | ||||||
|  |                     break | ||||||
|  | 
 | ||||||
|  |                 self.failed_attempts += 1 | ||||||
|  |                 print("Connection error; reconnecting...") | ||||||
|  |                 time.sleep(2 * random.uniform(0.5, 1.5)) | ||||||
|  |                 self.stop_when_connected() | ||||||
|  | 
 | ||||||
|  |     @timeout_decorator.timeout(15, use_signals=False) | ||||||
|  |     def list_dir(self, path) -> list: | ||||||
|  |         if not self.ftp: | ||||||
|  |             print("Conn closed") | ||||||
|  |             return [] | ||||||
|  |         results = [] | ||||||
|  |         try: | ||||||
|  |             self.ftp.chdir(path) | ||||||
|  |             file_names = self.ftp.listdir(path) | ||||||
|  | 
 | ||||||
|  |             for file_name in file_names: | ||||||
|  |                     stat = self.ftp.stat(file_name) | ||||||
|  |                     is_dir = self.ftp.path.isdir(os.path.join(path, file_name)) | ||||||
|  | 
 | ||||||
|  |                     results.append(File( | ||||||
|  |                         name=file_name, | ||||||
|  |                         mtime=stat.st_mtime, | ||||||
|  |                         size=-1 if is_dir else stat.st_size, | ||||||
|  |                         is_dir=is_dir, | ||||||
|  |                         path=path | ||||||
|  |                     )) | ||||||
|  |         except ftputil.error.FTPError as e: | ||||||
|  |             if e.errno == 530: | ||||||
|  |                 raise TooManyConnectionsError() | ||||||
|  |             pass | ||||||
|  | 
 | ||||||
|  |         return results | ||||||
|  | 
 | ||||||
|  |     def close(self): | ||||||
|  |         if self.ftp: | ||||||
|  |             self.ftp.close() | ||||||
|  | 
 | ||||||
							
								
								
									
										123
									
								
								crawler/http.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										123
									
								
								crawler/http.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,123 @@ | |||||||
|  | from urllib.parse import urlparse, urljoin, unquote | ||||||
|  | 
 | ||||||
|  | import os | ||||||
|  | from lxml import etree | ||||||
|  | from itertools import repeat | ||||||
|  | from crawler.crawler import RemoteDirectory, File | ||||||
|  | import requests | ||||||
|  | from requests.exceptions import RequestException | ||||||
|  | from multiprocessing.pool import ThreadPool | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class HttpDirectory(RemoteDirectory): | ||||||
|  | 
 | ||||||
|  |     SCHEMES = ("http", "https",) | ||||||
|  |     HEADERS = {} | ||||||
|  |     BLACK_LIST = ( | ||||||
|  |         "?C=N&O=D", | ||||||
|  |         "?C=M&O=A", | ||||||
|  |         "?C=S&O=A", | ||||||
|  |         "?C=D&O=A", | ||||||
|  |         "?C=N;O=D", | ||||||
|  |         "?C=M;O=A", | ||||||
|  |         "?C=S;O=A", | ||||||
|  |         "?C=D;O=A" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     def __init__(self, url): | ||||||
|  |         super().__init__(url) | ||||||
|  |         self.parser = etree.HTMLParser(collect_ids=False) | ||||||
|  | 
 | ||||||
|  |     def list_dir(self, path) -> list: | ||||||
|  | 
 | ||||||
|  |         results = [] | ||||||
|  | 
 | ||||||
|  |         path_url = urljoin(self.base_url, path) | ||||||
|  |         body = self._fetch_body(path_url) | ||||||
|  |         links = self._parse_links(body) | ||||||
|  | 
 | ||||||
|  |         urls_to_request = [] | ||||||
|  | 
 | ||||||
|  |         for link in links: | ||||||
|  | 
 | ||||||
|  |             if self._should_ignore(link): | ||||||
|  |                 continue | ||||||
|  |             file_url = urljoin(path_url, link[1]) | ||||||
|  |             path, file_name = os.path.split(file_url[len(self.base_url) - 1:]) | ||||||
|  | 
 | ||||||
|  |             if self._isdir(link): | ||||||
|  | 
 | ||||||
|  |                 results.append(File( | ||||||
|  |                     name=file_name, | ||||||
|  |                     mtime="", | ||||||
|  |                     size=-1, | ||||||
|  |                     is_dir=True, | ||||||
|  |                     path=path | ||||||
|  |                 )) | ||||||
|  |             else: | ||||||
|  |                 urls_to_request.append(file_url) | ||||||
|  | 
 | ||||||
|  |         pool = ThreadPool(processes=10) | ||||||
|  |         files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request)) | ||||||
|  |         for f in files: | ||||||
|  |             if f: | ||||||
|  |                 results.append(f) | ||||||
|  | 
 | ||||||
|  |         return results | ||||||
|  | 
 | ||||||
|  |     def _get_url(self, path: str): | ||||||
|  |         return urljoin(self.base_url, path) | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def _fetch_body(url: str): | ||||||
|  | 
 | ||||||
|  |         # todo timeout | ||||||
|  |         print("FETCH " + url) | ||||||
|  |         r = requests.get(url, headers=HttpDirectory.HEADERS) | ||||||
|  |         return r.text | ||||||
|  | 
 | ||||||
|  |     def _parse_links(self, body: str) -> set: | ||||||
|  | 
 | ||||||
|  |         result = set() | ||||||
|  |         tree = etree.HTML(body, parser=self.parser) | ||||||
|  |         links = tree.findall(".//a/[@href]") | ||||||
|  | 
 | ||||||
|  |         for link in links: | ||||||
|  |             result.add((link.text, link.get("href"))) | ||||||
|  | 
 | ||||||
|  |         return result | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def _isdir(url): | ||||||
|  |         return url[1].rsplit("?", maxsplit=1)[0].endswith("/") | ||||||
|  | 
 | ||||||
|  |     def _request_file(self, url): | ||||||
|  | 
 | ||||||
|  |         # todo timeout | ||||||
|  |         retries = 3 | ||||||
|  |         while retries > 0: | ||||||
|  |             try: | ||||||
|  |                 print("HEAD " + url) | ||||||
|  |                 r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50) | ||||||
|  | 
 | ||||||
|  |                 stripped_url = r.url[len(self.base_url) - 1:] | ||||||
|  | 
 | ||||||
|  |                 path, name = os.path.split(stripped_url) | ||||||
|  | 
 | ||||||
|  |                 return File( | ||||||
|  |                     path=unquote(path).strip("/"), | ||||||
|  |                     name=unquote(name), | ||||||
|  |                     size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1, | ||||||
|  |                     mtime=r.headers["Date"] if "Date" in r.headers else "?", | ||||||
|  |                     is_dir=False | ||||||
|  |                 ) | ||||||
|  |             except RequestException: | ||||||
|  |                 retries -= 1 | ||||||
|  | 
 | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def _should_ignore(link): | ||||||
|  |         return link[0] == "../" or link[1].endswith(HttpDirectory.BLACK_LIST) | ||||||
|  | 
 | ||||||
							
								
								
									
										127
									
								
								ftp_crawler.py
									
									
									
									
									
								
							
							
						
						
									
										127
									
								
								ftp_crawler.py
									
									
									
									
									
								
							| @ -1,127 +0,0 @@ | |||||||
| #! /usr/bin/env python |  | ||||||
| 
 |  | ||||||
| from threading import Thread |  | ||||||
| from queue import Queue |  | ||||||
| import os |  | ||||||
| import time |  | ||||||
| import ftputil |  | ||||||
| import ftputil.error |  | ||||||
| import random |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class File: |  | ||||||
| 
 |  | ||||||
|     def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool): |  | ||||||
|         self.name = name |  | ||||||
|         self.size = size |  | ||||||
|         self.mtime = mtime |  | ||||||
|         self.path = path |  | ||||||
|         self.is_dir = is_dir |  | ||||||
|         self.ftp = None |  | ||||||
| 
 |  | ||||||
|     def __str__(self): |  | ||||||
|         return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class FTPConnection(object): |  | ||||||
|     def __init__(self, host): |  | ||||||
|         self.host = host |  | ||||||
|         self.failed_attempts = 0 |  | ||||||
|         self.max_attempts = 2 |  | ||||||
|         self.ftp = None |  | ||||||
|         self.stop_when_connected() |  | ||||||
| 
 |  | ||||||
|     def _connect(self): |  | ||||||
|         print("Connecting to " + self.host) |  | ||||||
|         self.ftp = ftputil.FTPHost(self.host, "anonymous", "od-database") |  | ||||||
| 
 |  | ||||||
|     def stop_when_connected(self): |  | ||||||
|         while self.failed_attempts < self.max_attempts: |  | ||||||
|             try: |  | ||||||
|                 self._connect() |  | ||||||
|                 self.failed_attempts = 0 |  | ||||||
|                 break |  | ||||||
|             except ftputil.error.FTPError as e: |  | ||||||
| 
 |  | ||||||
|                 if e.errno == 530: |  | ||||||
|                     print("Cancel connection - too many connections") |  | ||||||
|                     break |  | ||||||
| 
 |  | ||||||
|                 self.failed_attempts += 1 |  | ||||||
|                 print("LIST FAILED; reconnecting...") |  | ||||||
|                 time.sleep(2 * random.uniform(0.5, 1.5)) |  | ||||||
|                 self.stop_when_connected() |  | ||||||
| 
 |  | ||||||
|     def list_dir(self, path) -> list: |  | ||||||
|         if not self.ftp: |  | ||||||
|             return [] |  | ||||||
|         results = [] |  | ||||||
|         self.ftp.chdir(path) |  | ||||||
|         try: |  | ||||||
|             file_names = self.ftp.listdir(path) |  | ||||||
| 
 |  | ||||||
|             for file_name in file_names: |  | ||||||
|                     stat = self.ftp.stat(file_name) |  | ||||||
|                     is_dir = self.ftp.path.isdir(os.path.join(path, file_name)) |  | ||||||
| 
 |  | ||||||
|                     results.append(File( |  | ||||||
|                         name=file_name, |  | ||||||
|                         mtime=stat.st_mtime, |  | ||||||
|                         size=-1 if is_dir else stat.st_size, |  | ||||||
|                         is_dir=is_dir, |  | ||||||
|                         path=path |  | ||||||
|                     )) |  | ||||||
|         except ftputil.error.FTPError: |  | ||||||
|             print("ERROR parsing " + path) |  | ||||||
| 
 |  | ||||||
|         return results |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def process_and_queue(host, q: Queue): |  | ||||||
| 
 |  | ||||||
|     ftp = FTPConnection(host) |  | ||||||
| 
 |  | ||||||
|     while ftp.ftp: |  | ||||||
|         file = q.get() |  | ||||||
| 
 |  | ||||||
|         if file.is_dir: |  | ||||||
|             print(file) |  | ||||||
|             try: |  | ||||||
|                 listing = ftp.list_dir(os.path.join(file.path, file.name)) |  | ||||||
|                 for f in listing: |  | ||||||
|                     q.put(f) |  | ||||||
|             except ftputil.error.PermanentError as e: |  | ||||||
|                 if e.errno == 530: |  | ||||||
|                     # Too many connections, retry this dir but kill this thread |  | ||||||
|                     q.put(file) |  | ||||||
|                     ftp.ftp.close() |  | ||||||
|                     print("Dropping connection because too many") |  | ||||||
|         else: |  | ||||||
|             pass |  | ||||||
| 
 |  | ||||||
|         q.task_done() |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def crawl_ftp_server(host: str, max_threads: int) -> list: |  | ||||||
| 
 |  | ||||||
|     ftp = FTPConnection(host) |  | ||||||
|     root_listing = ftp.list_dir("/") |  | ||||||
|     if ftp.ftp: |  | ||||||
|         ftp.ftp.close() |  | ||||||
| 
 |  | ||||||
|     q = Queue(maxsize=0) |  | ||||||
|     for i in range(max_threads): |  | ||||||
|         worker = Thread(target=process_and_queue, args=(host, q,)) |  | ||||||
|         worker.setDaemon(True) |  | ||||||
|         worker.start() |  | ||||||
| 
 |  | ||||||
|     for file in root_listing: |  | ||||||
|         q.put(file) |  | ||||||
| 
 |  | ||||||
|     q.join() |  | ||||||
|     return [] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| if __name__ == '__main__': |  | ||||||
|     import sys |  | ||||||
|     crawl_ftp_server(sys.argv[1], 50) |  | ||||||
| @ -75,7 +75,7 @@ def is_od(url): | |||||||
|             ftp.close() |             ftp.close() | ||||||
|             return True |             return True | ||||||
|         else: |         else: | ||||||
|             r = requests.get(url, timeout=15, allow_redirects=False) |             r = requests.get(url, timeout=30, allow_redirects=False) | ||||||
|             if r.status_code != 200: |             if r.status_code != 200: | ||||||
|                 print("No redirects allowed!") |                 print("No redirects allowed!") | ||||||
|                 return False |                 return False | ||||||
|  | |||||||
| @ -1,4 +1,3 @@ | |||||||
| scrapy |  | ||||||
| flask | flask | ||||||
| requests | requests | ||||||
| bs4 | bs4 | ||||||
| @ -9,4 +8,5 @@ praw | |||||||
| humanfriendly | humanfriendly | ||||||
| apscheduler | apscheduler | ||||||
| bcrypt | bcrypt | ||||||
| ftputil | ftputil | ||||||
|  | lxml | ||||||
| @ -1,71 +0,0 @@ | |||||||
| # -*- coding: utf-8 -*- |  | ||||||
| 
 |  | ||||||
| # For simplicity, this file contains only settings considered important or |  | ||||||
| # commonly used. You can find more settings consulting the documentation: |  | ||||||
| # |  | ||||||
| #     https://doc.scrapy.org/en/latest/topics/settings.html |  | ||||||
| #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html |  | ||||||
| #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html |  | ||||||
| 
 |  | ||||||
| BOT_NAME = 'scrapy_od_database' |  | ||||||
| 
 |  | ||||||
| SPIDER_MODULES = ['scrapy_od_database.spiders'] |  | ||||||
| NEWSPIDER_MODULE = 'scrapy_od_database.spiders' |  | ||||||
| DOWNLOAD_HANDLERS = {'ftp': 'scrapy_od_database.handlers.FtpListingHandler'} |  | ||||||
| 
 |  | ||||||
| LOG_LEVEL = 'ERROR' |  | ||||||
| FEED_FORMAT = 'json' |  | ||||||
| FEED_URI = 'data.json' |  | ||||||
| 
 |  | ||||||
| USER_AGENT = 'Mozilla/5.0 (X11; od-database.simon987.net) AppleWebKit/537.36 ' \ |  | ||||||
|              '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36' |  | ||||||
| 
 |  | ||||||
| # Obey robots.txt rules |  | ||||||
| ROBOTSTXT_OBEY = False |  | ||||||
| 
 |  | ||||||
| # Configure maximum concurrent requests performed by Scrapy (default: 16) |  | ||||||
| CONCURRENT_REQUESTS = 40 |  | ||||||
| RETRY_TIMES = 6 |  | ||||||
| DOWNLOAD_TIMEOUT = 90 |  | ||||||
| 
 |  | ||||||
| # Configure a delay for requests for the same website (default: 0) |  | ||||||
| # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay |  | ||||||
| # See also autothrottle settings and docs |  | ||||||
| #DOWNLOAD_DELAY = 3 |  | ||||||
| # The download delay setting will honor only one of: |  | ||||||
| CONCURRENT_REQUESTS_PER_DOMAIN = 40 |  | ||||||
| # CONCURRENT_REQUESTS_PER_IP = 16 |  | ||||||
| 
 |  | ||||||
| # Disable cookies (enabled by default) |  | ||||||
| #COOKIES_ENABLED = False |  | ||||||
| 
 |  | ||||||
| # Disable Telnet Console (enabled by default) |  | ||||||
| TELNETCONSOLE_ENABLED = False |  | ||||||
| 
 |  | ||||||
| # Override the default request headers: |  | ||||||
| #DEFAULT_REQUEST_HEADERS = { |  | ||||||
| #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', |  | ||||||
| #   'Accept-Language': 'en', |  | ||||||
| #} |  | ||||||
| 
 |  | ||||||
| # Enable and configure the AutoThrottle extension (disabled by default) |  | ||||||
| # See https://doc.scrapy.org/en/latest/topics/autothrottle.html |  | ||||||
| #AUTOTHROTTLE_ENABLED = True |  | ||||||
| # The initial download delay |  | ||||||
| #AUTOTHROTTLE_START_DELAY = 5 |  | ||||||
| # The maximum download delay to be set in case of high latencies |  | ||||||
| #AUTOTHROTTLE_MAX_DELAY = 60 |  | ||||||
| # The average number of requests Scrapy should be sending in parallel to |  | ||||||
| # each remote server |  | ||||||
| #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 |  | ||||||
| # Enable showing throttling stats for every response received: |  | ||||||
| #AUTOTHROTTLE_DEBUG = False |  | ||||||
| 
 |  | ||||||
| # Enable and configure HTTP caching (disabled by default) |  | ||||||
| # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings |  | ||||||
| #HTTPCACHE_ENABLED = True |  | ||||||
| #HTTPCACHE_EXPIRATION_SECS = 0 |  | ||||||
| #HTTPCACHE_DIR = 'httpcache' |  | ||||||
| #HTTPCACHE_IGNORE_HTTP_CODES = [] |  | ||||||
| #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' |  | ||||||
| 
 |  | ||||||
| @ -1,4 +0,0 @@ | |||||||
| # This package will contain the spiders of your Scrapy project |  | ||||||
| # |  | ||||||
| # Please refer to the documentation for information on how to create and manage |  | ||||||
| # your spiders. |  | ||||||
| @ -1,79 +0,0 @@ | |||||||
| import scrapy |  | ||||||
| import os |  | ||||||
| from urllib.parse import unquote |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class LinksSpider(scrapy.Spider): |  | ||||||
|     """Scrapy spider for open directories. Will gather all download links recursively""" |  | ||||||
| 
 |  | ||||||
|     name = "od_links" |  | ||||||
| 
 |  | ||||||
|     black_list = ( |  | ||||||
|         "?C=N&O=D", |  | ||||||
|         "?C=M&O=A", |  | ||||||
|         "?C=S&O=A", |  | ||||||
|         "?C=D&O=A", |  | ||||||
|         "?C=N;O=D", |  | ||||||
|         "?C=M;O=A", |  | ||||||
|         "?C=S;O=A", |  | ||||||
|         "?C=D;O=A" |  | ||||||
|     ) |  | ||||||
| 
 |  | ||||||
|     def __init__(self, **kwargs): |  | ||||||
|         super().__init__(**kwargs) |  | ||||||
|         self.crawled_links = set() |  | ||||||
| 
 |  | ||||||
|     def __index__(self, **kw): |  | ||||||
|         super(LinksSpider, self).__init__(**kw) |  | ||||||
|         self.base_url = kw.get("base_url") |  | ||||||
| 
 |  | ||||||
|     def should_ask_headers(self, link): |  | ||||||
|         """Whether or not to send HEAD request""" |  | ||||||
|         return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/") |  | ||||||
| 
 |  | ||||||
|     def should_crawl(self, link): |  | ||||||
|         """Whether or not the link should be followed""" |  | ||||||
|         if link in self.crawled_links: |  | ||||||
|             return False |  | ||||||
| 
 |  | ||||||
|         if link.endswith(tuple(self.black_list)): |  | ||||||
|             return False |  | ||||||
| 
 |  | ||||||
|         if not link.startswith(self.base_url): |  | ||||||
|             return False |  | ||||||
| 
 |  | ||||||
|         return link.rsplit("?", maxsplit=1)[0].endswith("/") |  | ||||||
| 
 |  | ||||||
|     def start_requests(self): |  | ||||||
|         yield scrapy.Request(url=self.base_url, callback=self.parse) |  | ||||||
| 
 |  | ||||||
|     def parse(self, response): |  | ||||||
|         if response.status == 200: |  | ||||||
|             links = response.xpath('//a/@href').extract() |  | ||||||
|             for link in links: |  | ||||||
|                 full_link = response.urljoin(link) |  | ||||||
| 
 |  | ||||||
|                 if self.should_ask_headers(full_link): |  | ||||||
|                     yield scrapy.Request(full_link, method="HEAD", callback=self.save_file) |  | ||||||
|                 elif self.should_crawl(full_link): |  | ||||||
|                     self.crawled_links.add(full_link) |  | ||||||
|                     yield scrapy.Request(full_link, callback=self.parse) |  | ||||||
| 
 |  | ||||||
|     def save_file(self, response): |  | ||||||
| 
 |  | ||||||
|         if response.status == 200: |  | ||||||
|             # Save file information |  | ||||||
|             stripped_url = response.url[len(self.base_url) - 1:] |  | ||||||
|             self.crawled_links.add(response.url) |  | ||||||
| 
 |  | ||||||
|             path, name = os.path.split(stripped_url) |  | ||||||
| 
 |  | ||||||
|             yield { |  | ||||||
|                 "path": unquote(path).strip("/"), |  | ||||||
|                 "name": unquote(name), |  | ||||||
|                 "size": int(response.headers["Content-Length"].decode("utf-8")) if "Content-Length" in response.headers else -1, |  | ||||||
|                 "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0] |  | ||||||
|                 if "Content-Type" in response.headers else "?", |  | ||||||
|                 "mtime": response.headers["Date"].decode("utf-8") if "Date" in response.headers else "?" |  | ||||||
|             } |  | ||||||
| 
 |  | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user