Mirror of https://github.com/simon987/od-database.git (synced 2025-10-24 19:36:52 +00:00)
Renamed package (again) and removed unused files
This commit is contained in:
Parent: ca651278d0
Commit: 819e2fbddb
@@ -1,8 +0,0 @@
|
||||
from od_util import is_od
|
||||
# Example usage. Links will be written into data.json
|
||||
|
||||
with open("/home/simon/out.txt") as f:
|
||||
|
||||
for line in f:
|
||||
print(line[:-1])
|
||||
print(is_od(line[:-1]))
|
@@ -11,7 +11,7 @@ subreddit = reddit.subreddit("opendirectories")
|
||||
|
||||
submissions = []
|
||||
|
||||
for submission in subreddit.new(limit=3):
|
||||
for submission in subreddit.new(limit=1):
|
||||
submissions.append(submission)
|
||||
|
||||
bot = RedditBot("crawled.txt", reddit)
|
||||
|
@@ -1,6 +1,6 @@
|
||||
|
||||
[settings]
|
||||
default = scrapy_downl_od.settings
|
||||
default = scrapy_od_database.settings
|
||||
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
@@ -9,4 +9,4 @@ default = scrapy_downl_od.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = scrapy_downl_od
|
||||
project = scrapy_od_database
|
||||
|
@@ -1,7 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Scrapy settings for scrapy_downl_od project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
@@ -9,10 +7,10 @@
|
||||
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = 'scrapy_od-database'
|
||||
BOT_NAME = 'scrapy_od_database'
|
||||
|
||||
SPIDER_MODULES = ['scrapy_od-database.spiders']
|
||||
NEWSPIDER_MODULE = 'scrapy_od-database.spiders'
|
||||
SPIDER_MODULES = ['scrapy_od_database.spiders']
|
||||
NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
|
||||
|
||||
LOG_LEVEL = 'ERROR'
|
||||
FEED_FORMAT = 'json'
|
||||
@@ -25,9 +23,9 @@ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Ge
|
||||
ROBOTSTXT_OBEY = False
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
CONCURRENT_REQUESTS = 50
|
||||
RETRY_TIMES = 4
|
||||
DOWNLOAD_TIMEOUT = 40
|
||||
CONCURRENT_REQUESTS = 40
|
||||
RETRY_TIMES = 5
|
||||
DOWNLOAD_TIMEOUT = 50
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
|
@@ -1,7 +1,6 @@
|
||||
import scrapy
|
||||
from os import path
|
||||
from urllib.parse import unquote
|
||||
from od_util import has_extension
|
||||
|
||||
|
||||
class LinksSpider(scrapy.Spider):
|
||||
@@ -30,7 +29,7 @@ class LinksSpider(scrapy.Spider):
|
||||
|
||||
def should_ask_headers(self, link):
|
||||
"""Whether or not to send HEAD request"""
|
||||
return link not in self.crawled_links and has_extension(link)
|
||||
return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/")
|
||||
|
||||
def should_crawl(self, link):
|
||||
"""Whether or not the link should be followed"""
|
Loading…
x
Reference in New Issue
Block a user