Config glob & hierarchy, fixes

This commit is contained in:
simon 2019-05-30 10:03:33 -04:00
parent 69f28f1ff7
commit 98c7f12cf1
5 changed files with 70 additions and 56 deletions

View File

@ -34,18 +34,25 @@ and error handling. Built for automated web scraping.
"url": "http://p1.exemple.com:8080" "url": "http://p1.exemple.com:8080"
} }
], ],
"hosts": { "hosts": [
"*": { {
"every": "750ms", "host": "*",
"burst": 5, "every": "500ms",
"headers": {} "burst": 25,
"headers": {
"User-Agent": "Some user agent",
"X-Test": "Will be overwritten"
}
}, },
"reddit.com": { {
"host": "*.reddit.com",
"every": "2s", "every": "2s",
"burst": 2, "burst": 2,
"headers": {"User-Agent": "mybot_v0.1"} "headers": {
"X-Test": "Will overwrite default"
} }
} }
]
} }
``` ```

View File

@ -11,6 +11,7 @@ import (
) )
type HostConfig struct { type HostConfig struct {
Host string `json:"host"`
EveryStr string `json:"every"` EveryStr string `json:"every"`
Burst int `json:"burst"` Burst int `json:"burst"`
Headers map[string]string `json:"headers"` Headers map[string]string `json:"headers"`
@ -28,10 +29,11 @@ var config struct {
WaitStr string `json:"wait"` WaitStr string `json:"wait"`
Multiplier float64 `json:"multiplier"` Multiplier float64 `json:"multiplier"`
Retries int `json:"retries"` Retries int `json:"retries"`
Hosts map[string]*HostConfig `json:"hosts"` Hosts []*HostConfig `json:"hosts"`
Proxies []ProxyConfig `json:"proxies"` Proxies []ProxyConfig `json:"proxies"`
Wait int64 Wait int64
Timeout time.Duration Timeout time.Duration
DefaultConfig *HostConfig
} }
func loadConfig() { func loadConfig() {
@ -52,38 +54,44 @@ func loadConfig() {
config.Wait = int64(wait) config.Wait = int64(wait)
for _, conf := range config.Hosts { for _, conf := range config.Hosts {
if conf.EveryStr == "" {
conf.Every = config.DefaultConfig.Every
} else {
conf.Every, err = time.ParseDuration(conf.EveryStr) conf.Every, err = time.ParseDuration(conf.EveryStr)
handleErr(err) handleErr(err)
} }
if config.DefaultConfig != nil && conf.Burst == 0 {
conf.Burst = config.DefaultConfig.Burst
}
}
} }
func validateConfig() { func validateConfig() {
hasDefaultHost := false for _, conf := range config.Hosts {
for host, conf := range config.Hosts { if conf.Host == "*" {
config.DefaultConfig = conf
if host == "*" {
hasDefaultHost = true
} }
for k := range conf.Headers { for k := range conf.Headers {
if strings.ToLower(k) == "accept-encoding" { if strings.ToLower(k) == "accept-encoding" {
panic(fmt.Sprintf("headers config for '%s':"+ panic(fmt.Sprintf("headers config for '%s':"+
" Do not set the Accept-Encoding header, it breaks goproxy", host)) " Do not set the Accept-Encoding header, it breaks goproxy", conf.Host))
} }
} }
} }
if !hasDefaultHost { if config.DefaultConfig == nil {
panic("config.json: You must specify a default host ('*')") panic("config.json: You must specify a default host ('*')")
} }
} }
func applyConfig(proxy *Proxy) { func applyConfig(proxy *Proxy) {
for host, conf := range config.Hosts { for _, conf := range config.Hosts {
proxy.Limiters[host] = &ExpiringLimiter{ proxy.Limiters[conf.Host] = &ExpiringLimiter{
rate.NewLimiter(rate.Every(conf.Every), conf.Burst), rate.NewLimiter(rate.Every(conf.Every), conf.Burst),
time.Now(), time.Now(),
} }

View File

@ -14,22 +14,22 @@
"url": "" "url": ""
} }
], ],
"hosts": { "hosts": [
"*": { {
"host": "*",
"every": "500s", "every": "500s",
"burst": 25, "burst": 25,
"headers": { "headers": {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Cache-Control": "max-age=0", "Cache-Control": "max-age=0",
"Connection": "keep-alive", "Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0" "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0",
"X-Test": "default"
} }
}, },
"reddit.com": { {
"every": "2s", "host": "*.reddit.com",
"burst": 2, "every": "2s"
"headers": {
}
}
} }
]
} }

11
gc.go
View File

@ -37,7 +37,7 @@ func (b *Balancer) cleanAllExpiredLimits() {
func cleanExpiredLimits(proxy *Proxy) { func cleanExpiredLimits(proxy *Proxy) {
const ttl = time.Second const ttl = time.Hour
limits := make(map[string]*ExpiringLimiter, 0) limits := make(map[string]*ExpiringLimiter, 0)
now := time.Now() now := time.Now()
@ -60,6 +60,11 @@ func cleanExpiredLimits(proxy *Proxy) {
func shouldPruneLimiter(host string) bool { func shouldPruneLimiter(host string) bool {
// Don't remove hosts that are coming from the config // Don't remove hosts that are coming from the config
_, ok := config.Hosts[host] for _, conf := range config.Hosts {
return !ok if conf.Host == host {
return false
}
}
return true
} }

22
main.go
View File

@ -3,6 +3,7 @@ package main
import ( import (
"github.com/elazarl/goproxy" "github.com/elazarl/goproxy"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/ryanuber/go-glob"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"golang.org/x/time/rate" "golang.org/x/time/rate"
"net/http" "net/http"
@ -58,11 +59,9 @@ func (p *Proxy) getLimiter(host string) *rate.Limiter {
func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter { func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
defaultConf := config.Hosts["*"]
newExpiringLimiter := &ExpiringLimiter{ newExpiringLimiter := &ExpiringLimiter{
LastRead: time.Now(), LastRead: time.Now(),
Limiter: rate.NewLimiter(rate.Every(defaultConf.Every), defaultConf.Burst), Limiter: rate.NewLimiter(rate.Every(config.DefaultConfig.Every), config.DefaultConfig.Burst),
} }
p.Limiters[host] = newExpiringLimiter p.Limiters[host] = newExpiringLimiter
@ -75,16 +74,13 @@ func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
} }
func simplifyHost(host string) string { func simplifyHost(host string) string {
if strings.HasPrefix(host, "www.") {
host = host[4:]
}
col := strings.LastIndex(host, ":") col := strings.LastIndex(host, ":")
if col > 0 { if col > 0 {
host = host[:col] host = host[:col]
} }
return host return "." + host
} }
func (b *Balancer) chooseProxy() *Proxy { func (b *Balancer) chooseProxy() *Proxy {
@ -126,18 +122,16 @@ func New() *Balancer {
func applyHeaders(r *http.Request) *http.Request { func applyHeaders(r *http.Request) *http.Request {
if conf, ok := config.Hosts["*"]; ok { sHost := simplifyHost(r.Host)
for _, conf := range config.Hosts {
if glob.Glob(conf.Host, sHost) {
for k, v := range conf.Headers { for k, v := range conf.Headers {
r.Header.Set(k, v) r.Header.Set(k, v)
} }
} }
}
sHost := simplifyHost(r.Host)
if conf, ok := config.Hosts[sHost]; ok {
for k, v := range conf.Headers {
r.Header.Set(k, v)
}
}
return r return r
} }