Renamed package (again) and removed unused files

Simon 2018-05-31 08:30:08 -04:00
parent ca651278d0
commit 819e2fbddb
7 changed files with 10 additions and 21 deletions

View File

@@ -1,8 +0,0 @@
-from od_util import is_od
-
-# Example usage. Links will be written into data.json
-
-with open("/home/simon/out.txt") as f:
-    for line in f:
-        print(line[:-1])
-        print(is_od(line[:-1]))

View File

@@ -11,7 +11,7 @@ subreddit = reddit.subreddit("opendirectories")
 
 submissions = []
-for submission in subreddit.new(limit=3):
+for submission in subreddit.new(limit=1):
     submissions.append(submission)
 
 bot = RedditBot("crawled.txt", reddit)

View File

@@ -1,6 +1,6 @@
 [settings]
-default = scrapy_downl_od.settings
+default = scrapy_od_database.settings
 
 # Automatically created by: scrapy startproject
 #
@@ -9,4 +9,4 @@ default = scrapy_downl_od.settings
 [deploy]
 #url = http://localhost:6800/
-project = scrapy_downl_od
+project = scrapy_od_database
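
Both renamed values have to match the package directory on disk for Scrapy to find the settings module. The diff does not show file paths, so the following tree is only an assumed sketch of the standard scrapy startproject layout:

scrapy.cfg
scrapy_od_database/
    __init__.py
    settings.py
    spiders/
        __init__.py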

View File

@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 
-# Scrapy settings for scrapy_downl_od project
-#
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
@@ -9,10 +7,10 @@
 # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'scrapy_od-database'
+BOT_NAME = 'scrapy_od_database'
 
-SPIDER_MODULES = ['scrapy_od-database.spiders']
-NEWSPIDER_MODULE = 'scrapy_od-database.spiders'
+SPIDER_MODULES = ['scrapy_od_database.spiders']
+NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
 
 LOG_LEVEL = 'ERROR'
 FEED_FORMAT = 'json'
@@ -25,9 +23,9 @@ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Ge
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-CONCURRENT_REQUESTS = 50
-RETRY_TIMES = 4
-DOWNLOAD_TIMEOUT = 40
+CONCURRENT_REQUESTS = 40
+RETRY_TIMES = 5
+DOWNLOAD_TIMEOUT = 50
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
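
The hyphen-to-underscore change here is more than cosmetic: hyphens are not legal in Python module names, so the old SPIDER_MODULES and NEWSPIDER_MODULE values could never have resolved to an importable package. A minimal sketch of the failure mode, assuming the renamed package is on sys.path (the try/except is illustrative):

import importlib

try:
    # Old value: a hyphenated dotted path can never be imported.
    importlib.import_module("scrapy_od-database.spiders")
except ModuleNotFoundError as err:
    print(err)

# New value: imports cleanly once the scrapy_od_database package exists.
importlib.import_module("scrapy_od_database.spiders")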

View File

@@ -1,7 +1,6 @@
 import scrapy
 from os import path
 from urllib.parse import unquote
-from od_util import has_extension
 
 
 class LinksSpider(scrapy.Spider):
@@ -30,7 +29,7 @@ class LinksSpider(scrapy.Spider):
 
     def should_ask_headers(self, link):
         """Whether or not to send HEAD request"""
-        return link not in self.crawled_links and has_extension(link)
+        return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/")
 
     def should_crawl(self, link):
         """Whether or not the link should be followed"""