diff --git a/od_crawler_exemple.py b/od_crawler_exemple.py
deleted file mode 100644
index 724a4e8..0000000
--- a/od_crawler_exemple.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from od_util import is_od
-# Example usage. Links will be written into data.json
-
-with open("/home/simon/out.txt") as f:
-
-    for line in f:
-        print(line[:-1])
-        print(is_od(line[:-1]))
diff --git a/queue_reddit_links.py b/queue_reddit_links.py
index 46b51cc..f09d10b 100644
--- a/queue_reddit_links.py
+++ b/queue_reddit_links.py
@@ -11,7 +11,7 @@ subreddit = reddit.subreddit("opendirectories")
 
 submissions = []
 
-for submission in subreddit.new(limit=3):
+for submission in subreddit.new(limit=1):
     submissions.append(submission)
 
 bot = RedditBot("crawled.txt", reddit)
diff --git a/scrapy.cfg b/scrapy.cfg
index 6f201bb..fdd12f2 100644
--- a/scrapy.cfg
+++ b/scrapy.cfg
@@ -1,6 +1,6 @@
 [settings]
-default = scrapy_downl_od.settings
+default = scrapy_od_database.settings
 
 # Automatically created by: scrapy startproject
 #
 
@@ -9,4 +9,4 @@ default = scrapy_downl_od.settings
 
 [deploy]
 #url = http://localhost:6800/
-project = scrapy_downl_od
+project = scrapy_od_database
diff --git a/scrapy_od-database/__init__.py b/scrapy_od_database/__init__.py
similarity index 100%
rename from scrapy_od-database/__init__.py
rename to scrapy_od_database/__init__.py
diff --git a/scrapy_od-database/settings.py b/scrapy_od_database/settings.py
similarity index 92%
rename from scrapy_od-database/settings.py
rename to scrapy_od_database/settings.py
index bd23a33..093fcc9 100644
--- a/scrapy_od-database/settings.py
+++ b/scrapy_od_database/settings.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 
-# Scrapy settings for scrapy_downl_od project
-#
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
@@ -9,10 +7,10 @@
 # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'scrapy_od-database'
+BOT_NAME = 'scrapy_od_database'
 
-SPIDER_MODULES = ['scrapy_od-database.spiders']
-NEWSPIDER_MODULE = 'scrapy_od-database.spiders'
+SPIDER_MODULES = ['scrapy_od_database.spiders']
+NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
 
 LOG_LEVEL = 'ERROR'
 FEED_FORMAT = 'json'
@@ -25,9 +23,9 @@ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Ge
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-CONCURRENT_REQUESTS = 50
-RETRY_TIMES = 4
-DOWNLOAD_TIMEOUT = 40
+CONCURRENT_REQUESTS = 40
+RETRY_TIMES = 5
+DOWNLOAD_TIMEOUT = 50
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
diff --git a/scrapy_od-database/spiders/__init__.py b/scrapy_od_database/spiders/__init__.py
similarity index 100%
rename from scrapy_od-database/spiders/__init__.py
rename to scrapy_od_database/spiders/__init__.py
diff --git a/scrapy_od-database/spiders/od_links_spider.py b/scrapy_od_database/spiders/od_links_spider.py
similarity index 96%
rename from scrapy_od-database/spiders/od_links_spider.py
rename to scrapy_od_database/spiders/od_links_spider.py
index 635fcd0..4771cc9 100644
--- a/scrapy_od-database/spiders/od_links_spider.py
+++ b/scrapy_od_database/spiders/od_links_spider.py
@@ -1,7 +1,6 @@
 import scrapy
 from os import path
 from urllib.parse import unquote
-from od_util import has_extension
 
 
 class LinksSpider(scrapy.Spider):
@@ -30,7 +29,7 @@ class LinksSpider(scrapy.Spider):
 
     def should_ask_headers(self, link):
         """Whether or not to send HEAD request"""
-        return link not in self.crawled_links and has_extension(link)
+        return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/")
 
     def should_crawl(self, link):
         """Whether or not the link should be followed"""
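
Not part of the diff itself: a minimal sketch of the trailing-slash heuristic that replaces the removed `has_extension()` check in `LinksSpider.should_ask_headers`. The standalone function name `looks_like_file` is hypothetical; the expression is copied verbatim from the changed line.

```python
def looks_like_file(link: str) -> bool:
    """Decide whether a link deserves a HEAD request.

    Hypothetical standalone version of the new check in
    LinksSpider.should_ask_headers (minus the crawled_links lookup).
    Strips an optional query string with rsplit("?", maxsplit=1), then
    treats any path that does not end in "/" as a file rather than a
    sub-directory listing.
    """
    return not link.rsplit("?", maxsplit=1)[0].endswith("/")


# Quick check of the heuristic on typical open-directory links:
assert looks_like_file("http://example.com/files/movie.mkv")
assert looks_like_file("http://example.com/files/README")          # no extension
assert looks_like_file("http://example.com/files/movie.mkv?dl=1")  # query stripped first
assert not looks_like_file("http://example.com/files/")            # directory listing
assert not looks_like_file("http://example.com/files/?C=N;O=D")    # Apache sort link
```

Unlike the old `has_extension()` check, which presumably skipped extension-less links, this version also sends HEAD requests for files like `README`, at the cost of a few extra requests; directory-style links, which keep their trailing `/` even with an Apache `?C=...;O=...` sort query, are left for `should_crawl` to handle.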