mirror of https://github.com/simon987/od-database.git
synced 2025-04-18 18:06:44 +00:00
Renamed package (again) and removed unused files
This commit is contained in:
parent ca651278d0
commit 819e2fbddb
@@ -1,8 +0,0 @@
-from od_util import is_od
-
-# Example usage. Links will be written into data.json
-
-with open("/home/simon/out.txt") as f:
-    for line in f:
-        print(line[:-1])
-        print(is_od(line[:-1]))
@@ -11,7 +11,7 @@ subreddit = reddit.subreddit("opendirectories")
 
 submissions = []
 
-for submission in subreddit.new(limit=3):
+for submission in subreddit.new(limit=1):
     submissions.append(submission)
 
 bot = RedditBot("crawled.txt", reddit)
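For context, a minimal sketch of the script this hunk belongs to, as implied by the hunk header and context lines; the PRAW credentials and the RedditBot import path are placeholders/assumptions, and only the subreddit name, the new(limit=1) call and the RedditBot("crawled.txt", reddit) line actually appear in the diff:

import praw

from reddit_bot import RedditBot  # project class; import path assumed

# Placeholder credentials; the real script presumably loads them from config.
reddit = praw.Reddit(client_id="...", client_secret="...", user_agent="od-database")
subreddit = reddit.subreddit("opendirectories")

submissions = []
# After this commit only the single newest submission is fetched per run.
for submission in subreddit.new(limit=1):
    submissions.append(submission)

bot = RedditBot("crawled.txt", reddit)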
@@ -1,6 +1,6 @@
 
 [settings]
-default = scrapy_downl_od.settings
+default = scrapy_od_database.settings
 
 # Automatically created by: scrapy startproject
 #
@@ -9,4 +9,4 @@ default = scrapy_downl_od.settings
 
 [deploy]
 #url = http://localhost:6800/
-project = scrapy_downl_od
+project = scrapy_od_database
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 
-# Scrapy settings for scrapy_downl_od project
-#
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
@@ -9,10 +7,10 @@
 # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'scrapy_od-database'
+BOT_NAME = 'scrapy_od_database'
 
-SPIDER_MODULES = ['scrapy_od-database.spiders']
-NEWSPIDER_MODULE = 'scrapy_od-database.spiders'
+SPIDER_MODULES = ['scrapy_od_database.spiders']
+NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
 
 LOG_LEVEL = 'ERROR'
 FEED_FORMAT = 'json'
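The underscore rename matters because Scrapy resolves BOT_NAME and SPIDER_MODULES as dotted Python module paths, and a name containing a hyphen is not a valid identifier that can be imported. A quick standalone illustration, not part of the repo:

print("scrapy_od_database".isidentifier())   # True  -> usable as a module name
print("scrapy_od-database".isidentifier())   # False -> hyphen makes it unimportable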
@@ -25,9 +23,9 @@ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Ge
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-CONCURRENT_REQUESTS = 50
-RETRY_TIMES = 4
-DOWNLOAD_TIMEOUT = 40
+CONCURRENT_REQUESTS = 40
+RETRY_TIMES = 5
+DOWNLOAD_TIMEOUT = 50
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
@@ -1,7 +1,6 @@
 import scrapy
 from os import path
 from urllib.parse import unquote
-from od_util import has_extension
 
 
 class LinksSpider(scrapy.Spider):
@@ -30,7 +29,7 @@ class LinksSpider(scrapy.Spider):
 
     def should_ask_headers(self, link):
         """Whether or not to send HEAD request"""
-        return link not in self.crawled_links and has_extension(link)
+        return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/")
 
     def should_crawl(self, link):
         """Whether or not the link should be followed"""
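With od_util.has_extension gone, the spider now decides whether to send a HEAD request purely from the link's shape: strip any query string, and treat anything that does not end in a slash as a file. A standalone sketch of that check, with hypothetical example URLs:

def looks_like_file(link: str) -> bool:
    """Same test as the new should_ask_headers condition (minus the crawled-links check):
    drop the query string, then flag links that do not end with a slash."""
    return not link.rsplit("?", maxsplit=1)[0].endswith("/")

print(looks_like_file("http://example.com/files/"))           # False -> directory listing
print(looks_like_file("http://example.com/files/movie.mkv"))  # True  -> file, ask for headers
print(looks_like_file("http://example.com/files/?C=N;O=A"))   # False -> sort link on a listing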