mirror of https://github.com/simon987/od-database.git
synced 2025-04-18 18:06:44 +00:00
Renamed package (again) and removed unused files
This commit is contained in:
parent ca651278d0
commit 819e2fbddb
@@ -1,8 +0,0 @@
-from od_util import is_od
-
-# Example usage. Links will be written into data.json
-
-with open("/home/simon/out.txt") as f:
-    for line in f:
-        print(line[:-1])
-        print(is_od(line[:-1]))
@@ -11,7 +11,7 @@ subreddit = reddit.subreddit("opendirectories")
 
 submissions = []
 
-for submission in subreddit.new(limit=3):
+for submission in subreddit.new(limit=1):
     submissions.append(submission)
 
 bot = RedditBot("crawled.txt", reddit)
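For context, a minimal sketch of the script this hunk belongs to, as implied by the hunk header and context lines; the PRAW credentials and the RedditBot import path are placeholders/assumptions, and only the subreddit name, the new(limit=1) call and the RedditBot("crawled.txt", reddit) line actually appear in the diff:

import praw

from reddit_bot import RedditBot  # project class; import path assumed

# Placeholder credentials; the real script presumably loads them from config.
reddit = praw.Reddit(client_id="...", client_secret="...", user_agent="od-database")
subreddit = reddit.subreddit("opendirectories")

submissions = []
# After this commit only the single newest submission is fetched per run.
for submission in subreddit.new(limit=1):
    submissions.append(submission)

bot = RedditBot("crawled.txt", reddit)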
@@ -1,6 +1,6 @@
 
 [settings]
-default = scrapy_downl_od.settings
+default = scrapy_od_database.settings
 
 # Automatically created by: scrapy startproject
 #
@@ -9,4 +9,4 @@ default = scrapy_downl_od.settings
 
 [deploy]
 #url = http://localhost:6800/
-project = scrapy_downl_od
+project = scrapy_od_database
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 
-# Scrapy settings for scrapy_downl_od project
-#
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
@@ -9,10 +7,10 @@
 # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'scrapy_od-database'
+BOT_NAME = 'scrapy_od_database'
 
-SPIDER_MODULES = ['scrapy_od-database.spiders']
-NEWSPIDER_MODULE = 'scrapy_od-database.spiders'
+SPIDER_MODULES = ['scrapy_od_database.spiders']
+NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
 
 LOG_LEVEL = 'ERROR'
 FEED_FORMAT = 'json'
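The underscore rename matters because Scrapy resolves BOT_NAME and SPIDER_MODULES as dotted Python module paths, and a name containing a hyphen is not a valid identifier that can be imported. A quick standalone illustration, not part of the repo:

print("scrapy_od_database".isidentifier())   # True  -> usable as a module name
print("scrapy_od-database".isidentifier())   # False -> hyphen makes it unimportable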
@@ -25,9 +23,9 @@ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Ge
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-CONCURRENT_REQUESTS = 50
-RETRY_TIMES = 4
-DOWNLOAD_TIMEOUT = 40
+CONCURRENT_REQUESTS = 40
+RETRY_TIMES = 5
+DOWNLOAD_TIMEOUT = 50
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
@@ -1,7 +1,6 @@
 import scrapy
 from os import path
 from urllib.parse import unquote
-from od_util import has_extension
 
 
 class LinksSpider(scrapy.Spider):
@@ -30,7 +29,7 @@ class LinksSpider(scrapy.Spider):
 
     def should_ask_headers(self, link):
         """Whether or not to send HEAD request"""
-        return link not in self.crawled_links and has_extension(link)
+        return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/")
 
     def should_crawl(self, link):
         """Whether or not the link should be followed"""
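With od_util.has_extension gone, the spider now decides whether to send a HEAD request purely from the link's shape: strip any query string, and treat anything that does not end in a slash as a file. A standalone sketch of that check, with hypothetical example URLs:

def looks_like_file(link: str) -> bool:
    """Same test as the new should_ask_headers condition (minus the crawled-links check):
    drop the query string, then flag links that do not end with a slash."""
    return not link.rsplit("?", maxsplit=1)[0].endswith("/")

print(looks_like_file("http://example.com/files/"))           # False -> directory listing
print(looks_like_file("http://example.com/files/movie.mkv"))  # True  -> file, ask for headers
print(looks_like_file("http://example.com/files/?C=N;O=A"))   # False -> sort link on a listing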