From 98c7f12cf152024726fceae0f760785d0fd3dbb6 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 30 May 2019 10:03:33 -0400 Subject: [PATCH] Config glob & hierarchy, fixes --- README.md | 23 +++++++++++++++-------- config.go | 48 ++++++++++++++++++++++++++++-------------------- config.json | 18 +++++++++--------- gc.go | 11 ++++++++--- main.go | 26 ++++++++++---------------- 5 files changed, 70 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index b39d6e8..a7b54c5 100644 --- a/README.md +++ b/README.md @@ -34,18 +34,25 @@ and error handling. Built for automated web scraping. "url": "http://p1.exemple.com:8080" } ], - "hosts": { - "*": { - "every": "750ms", - "burst": 5, - "headers": {} + "hosts": [ + { + "host": "*", + "every": "500ms", + "burst": 25, + "headers": { + "User-Agent": "Some user agent", + "X-Test": "Will be overwritten" + } }, - "reddit.com": { + { + "host": "*.reddit.com", "every": "2s", "burst": 2, - "headers": {"User-Agent": "mybot_v0.1"} + "headers": { + "X-Test": "Will overwrite default" + } } - } + ] } ``` diff --git a/config.go b/config.go index fc0fb10..ecaa854 100644 --- a/config.go +++ b/config.go @@ -11,6 +11,7 @@ import ( ) type HostConfig struct { + Host string `json:"host"` EveryStr string `json:"every"` Burst int `json:"burst"` Headers map[string]string `json:"headers"` @@ -23,15 +24,16 @@ type ProxyConfig struct { } var config struct { - Addr string `json:"addr"` - TimeoutStr string `json:"timeout"` - WaitStr string `json:"wait"` - Multiplier float64 `json:"multiplier"` - Retries int `json:"retries"` - Hosts map[string]*HostConfig `json:"hosts"` - Proxies []ProxyConfig `json:"proxies"` - Wait int64 - Timeout time.Duration + Addr string `json:"addr"` + TimeoutStr string `json:"timeout"` + WaitStr string `json:"wait"` + Multiplier float64 `json:"multiplier"` + Retries int `json:"retries"` + Hosts []*HostConfig `json:"hosts"` + Proxies []ProxyConfig `json:"proxies"` + Wait int64 + Timeout time.Duration + DefaultConfig *HostConfig } func loadConfig() { @@ -52,38 +54,44 @@ func loadConfig() { config.Wait = int64(wait) for _, conf := range config.Hosts { - conf.Every, err = time.ParseDuration(conf.EveryStr) - handleErr(err) + if conf.EveryStr == "" { + conf.Every = config.DefaultConfig.Every + } else { + conf.Every, err = time.ParseDuration(conf.EveryStr) + handleErr(err) + } + + if config.DefaultConfig != nil && conf.Burst == 0 { + conf.Burst = config.DefaultConfig.Burst + } } } func validateConfig() { - hasDefaultHost := false + for _, conf := range config.Hosts { - for host, conf := range config.Hosts { - - if host == "*" { - hasDefaultHost = true + if conf.Host == "*" { + config.DefaultConfig = conf } for k := range conf.Headers { if strings.ToLower(k) == "accept-encoding" { panic(fmt.Sprintf("headers config for '%s':"+ - " Do not set the Accept-Encoding header, it breaks goproxy", host)) + " Do not set the Accept-Encoding header, it breaks goproxy", conf.Host)) } } } - if !hasDefaultHost { + if config.DefaultConfig == nil { panic("config.json: You must specify a default host ('*')") } } func applyConfig(proxy *Proxy) { - for host, conf := range config.Hosts { - proxy.Limiters[host] = &ExpiringLimiter{ + for _, conf := range config.Hosts { + proxy.Limiters[conf.Host] = &ExpiringLimiter{ rate.NewLimiter(rate.Every(conf.Every), conf.Burst), time.Now(), } diff --git a/config.json b/config.json index b3faf15..4d62d5e 100644 --- a/config.json +++ b/config.json @@ -14,22 +14,22 @@ "url": "" } ], - "hosts": { - "*": { + "hosts": [ + { + "host": "*", "every": "500s", "burst": 25, "headers": { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Cache-Control": "max-age=0", "Connection": "keep-alive", - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0" + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0", + "X-Test": "default" } }, - "reddit.com": { - "every": "2s", - "burst": 2, - "headers": { - } + { + "host": "*.reddit.com", + "every": "2s" } - } + ] } \ No newline at end of file diff --git a/gc.go b/gc.go index 7122851..e4f9023 100644 --- a/gc.go +++ b/gc.go @@ -37,7 +37,7 @@ func (b *Balancer) cleanAllExpiredLimits() { func cleanExpiredLimits(proxy *Proxy) { - const ttl = time.Second + const ttl = time.Hour limits := make(map[string]*ExpiringLimiter, 0) now := time.Now() @@ -60,6 +60,11 @@ func cleanExpiredLimits(proxy *Proxy) { func shouldPruneLimiter(host string) bool { // Don't remove hosts that are coming from the config - _, ok := config.Hosts[host] - return !ok + for _, conf := range config.Hosts { + if conf.Host == host { + return false + } + } + + return true } diff --git a/main.go b/main.go index 9d5ce06..2d483d2 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "github.com/elazarl/goproxy" "github.com/pkg/errors" + "github.com/ryanuber/go-glob" "github.com/sirupsen/logrus" "golang.org/x/time/rate" "net/http" @@ -58,11 +59,9 @@ func (p *Proxy) getLimiter(host string) *rate.Limiter { func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter { - defaultConf := config.Hosts["*"] - newExpiringLimiter := &ExpiringLimiter{ LastRead: time.Now(), - Limiter: rate.NewLimiter(rate.Every(defaultConf.Every), defaultConf.Burst), + Limiter: rate.NewLimiter(rate.Every(config.DefaultConfig.Every), config.DefaultConfig.Burst), } p.Limiters[host] = newExpiringLimiter @@ -75,16 +74,13 @@ func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter { } func simplifyHost(host string) string { - if strings.HasPrefix(host, "www.") { - host = host[4:] - } col := strings.LastIndex(host, ":") if col > 0 { host = host[:col] } - return host + return "." + host } func (b *Balancer) chooseProxy() *Proxy { @@ -126,18 +122,16 @@ func New() *Balancer { func applyHeaders(r *http.Request) *http.Request { - if conf, ok := config.Hosts["*"]; ok { - for k, v := range conf.Headers { - r.Header.Set(k, v) + sHost := simplifyHost(r.Host) + + for _, conf := range config.Hosts { + if glob.Glob(conf.Host, sHost) { + for k, v := range conf.Headers { + r.Header.Set(k, v) + } } } - sHost := simplifyHost(r.Host) - if conf, ok := config.Hosts[sHost]; ok { - for k, v := range conf.Headers { - r.Header.Set(k, v) - } - } return r }