mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-10 05:56:42 +00:00
Add job buffer size parameter
This commit is contained in:
parent
86ec78cae1
commit
4dbe2aef2b
@ -25,6 +25,7 @@ var config struct {
|
|||||||
AllocStats time.Duration
|
AllocStats time.Duration
|
||||||
Verbose bool
|
Verbose bool
|
||||||
PrintHTTP bool
|
PrintHTTP bool
|
||||||
|
JobBufferSize int
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -39,6 +40,7 @@ const (
|
|||||||
ConfUserAgent = "crawl.user-agent"
|
ConfUserAgent = "crawl.user-agent"
|
||||||
ConfDialTimeout = "crawl.dial_timeout"
|
ConfDialTimeout = "crawl.dial_timeout"
|
||||||
ConfTimeout = "crawl.timeout"
|
ConfTimeout = "crawl.timeout"
|
||||||
|
ConfJobBufferSize = "crawl.job_buffer"
|
||||||
ConfCrawlStats = "output.crawl_stats"
|
ConfCrawlStats = "output.crawl_stats"
|
||||||
ConfAllocStats = "output.resource_stats"
|
ConfAllocStats = "output.resource_stats"
|
||||||
ConfVerbose = "output.verbose"
|
ConfVerbose = "output.verbose"
|
||||||
@ -53,6 +55,7 @@ func prepareConfig() {
|
|||||||
viper.SetDefault(ConfUserAgent, "")
|
viper.SetDefault(ConfUserAgent, "")
|
||||||
viper.SetDefault(ConfDialTimeout, 10 * time.Second)
|
viper.SetDefault(ConfDialTimeout, 10 * time.Second)
|
||||||
viper.SetDefault(ConfTimeout, 30 * time.Second)
|
viper.SetDefault(ConfTimeout, 30 * time.Second)
|
||||||
|
viper.SetDefault(ConfJobBufferSize, 5000)
|
||||||
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
|
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
|
||||||
viper.SetDefault(ConfAllocStats, 0)
|
viper.SetDefault(ConfAllocStats, 0)
|
||||||
viper.SetDefault(ConfVerbose, false)
|
viper.SetDefault(ConfVerbose, false)
|
||||||
@ -112,6 +115,8 @@ func readConfig() {
|
|||||||
|
|
||||||
setTimeout(viper.GetDuration(ConfTimeout))
|
setTimeout(viper.GetDuration(ConfTimeout))
|
||||||
|
|
||||||
|
config.JobBufferSize = viper.GetInt(ConfJobBufferSize)
|
||||||
|
|
||||||
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
|
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
|
||||||
|
|
||||||
config.AllocStats = viper.GetDuration(ConfAllocStats)
|
config.AllocStats = viper.GetDuration(ConfAllocStats)
|
||||||
|
11
config.yml
11
config.yml
@ -46,7 +46,7 @@ crawl:
|
|||||||
# Please be careful with this setting!
|
# Please be careful with this setting!
|
||||||
# The crawler fires fast and more than
|
# The crawler fires fast and more than
|
||||||
# ten connections can overwhelm a server.
|
# ten connections can overwhelm a server.
|
||||||
connections: 10
|
connections: 4
|
||||||
|
|
||||||
# How often to retry getting data
|
# How often to retry getting data
|
||||||
# from the site before giving up
|
# from the site before giving up
|
||||||
@ -65,4 +65,13 @@ crawl:
|
|||||||
# Job buffer size (per task)
|
# Job buffer size (per task)
|
||||||
# Higher values cause fewer disk writes
|
# Higher values cause fewer disk writes
|
||||||
# but require more memory.
|
# but require more memory.
|
||||||
|
#
|
||||||
|
# The job queue contains all URLs
|
||||||
|
# that should be crawled next.
|
||||||
|
# As it grows very large over time,
|
||||||
|
# it's kept mainly on disk.
|
||||||
|
# This sets how many jobs are kept
|
||||||
|
# in memory.
|
||||||
|
# A negative value will cause all jobs
|
||||||
|
# to be stored in memory. (Don't do this)
|
||||||
job_buffer: 5000
|
job_buffer: 5000
|
||||||
|
19
queue.go
19
queue.go
@ -7,10 +7,6 @@ import (
|
|||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
|
||||||
threshold = 5000
|
|
||||||
)
|
|
||||||
|
|
||||||
type BufferedQueue struct {
|
type BufferedQueue struct {
|
||||||
dataDir string
|
dataDir string
|
||||||
q *goque.Queue
|
q *goque.Queue
|
||||||
@ -20,6 +16,9 @@ type BufferedQueue struct {
|
|||||||
|
|
||||||
func OpenQueue(dataDir string) (bq *BufferedQueue, err error) {
|
func OpenQueue(dataDir string) (bq *BufferedQueue, err error) {
|
||||||
bq = new(BufferedQueue)
|
bq = new(BufferedQueue)
|
||||||
|
if config.JobBufferSize < 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
bq.dataDir = dataDir
|
bq.dataDir = dataDir
|
||||||
bq.q, err = goque.OpenQueue(dataDir)
|
bq.q, err = goque.OpenQueue(dataDir)
|
||||||
if err != nil { return nil, err }
|
if err != nil { return nil, err }
|
||||||
@ -44,6 +43,11 @@ func (q *BufferedQueue) Dequeue() (job Job, err error) {
|
|||||||
return job, nil
|
return job, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if config.JobBufferSize < 0 {
|
||||||
|
err = goque.ErrEmpty
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
var item *goque.Item
|
var item *goque.Item
|
||||||
item, err = q.q.Dequeue()
|
item, err = q.q.Dequeue()
|
||||||
if err != nil { return }
|
if err != nil { return }
|
||||||
@ -62,7 +66,8 @@ func (q *BufferedQueue) directEnqueue(job *Job) bool {
|
|||||||
q.m.Lock()
|
q.m.Lock()
|
||||||
defer q.m.Unlock()
|
defer q.m.Unlock()
|
||||||
|
|
||||||
if len(q.buf) < threshold {
|
bs := config.JobBufferSize
|
||||||
|
if len(q.buf) < bs || bs < 0 {
|
||||||
q.buf = append(q.buf, *job)
|
q.buf = append(q.buf, *job)
|
||||||
return true
|
return true
|
||||||
} else {
|
} else {
|
||||||
@ -85,6 +90,10 @@ func (q *BufferedQueue) directDequeue(job *Job) bool {
|
|||||||
|
|
||||||
// Close always returns nil; it exists so the queue implements io.Closer.
|
// Close always returns nil; it exists so the queue implements io.Closer.
|
||||||
func (q *BufferedQueue) Close() error {
|
func (q *BufferedQueue) Close() error {
|
||||||
|
if config.JobBufferSize < 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Close ignoring errors
|
// Close ignoring errors
|
||||||
q.q.Close()
|
q.q.Close()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user