mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-04 06:52:59 +00:00
85 lines
1.9 KiB
YAML
85 lines
1.9 KiB
YAML
# OD-Database server settings
server:
  # Connection URL
  url: http://od-db.mine.terorie.com/api

  # Server auth token
  # Empty by default. Written as an explicit empty
  # string so it is not parsed as null.
  token: ""

  # Request timeout
  timeout: 60s

  # Recheck interval
  # The crawler periodically asks the server
  # for new jobs. Sets the minimum wait time
  # between /task/get requests to the server.
  recheck: 1s

  # Time to wait after receiving an error
  # from the server. Doesn't apply to uploads.
  cooldown: 30s

  # Upload chunk size
  # If the value is too high, the upload fails.
  upload_chunk: 1 MB

  # Upload retry settings
  # NOTE(review): presumably the number of attempts per
  # upload and the wait between attempts - confirm
  # against the code that reads these keys.
  upload_retries: 10
  upload_retry_interval: 30s

# Log output settings
output:
  # Crawl statistics
  crawl_stats: 1s

  # CPU/RAM/Job queue stats
  resource_stats: 10s

  # More output? (Every listed dir)
  verbose: false

  # Print HTTP errors (Super spammy)
  http: false

  # Log file
  # If empty, no log file is created.
  log: crawler.log

# Crawler settings
crawl:
  # Number of sites that can be processed at once
  tasks: 25

  # Number of connections per site
  # Please be careful with this setting!
  # The crawler fires fast and more than
  # ten connections can overwhelm a server.
  connections: 1

  # How often to retry getting data
  # from the site before giving up
  retries: 5

  # Time before discarding a failed connection attempt
  dial_timeout: 10s

  # Time before discarding a network request
  timeout: 30s

  # Crawler User-Agent
  # If empty, no User-Agent header is sent.
  user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"

  # Job buffer size (per task)
  # Higher values cause less disk writes
  # but require more memory.
  #
  # The job queue contains all URLs
  # that should be crawled next.
  # As it grows very large over time,
  # it's kept mainly on disk.
  # This sets how many jobs are kept
  # in memory.
  # A negative value will cause all jobs
  # to be stored in memory. (Don't do this)
  #
  # NOTE(review): the value below is negative, which is
  # the setting the warning above advises against.
  # Confirm this is intentional before deploying.
  job_buffer: -1