Config glob & hierarchy, fixes

This commit is contained in:
simon 2019-05-30 10:03:33 -04:00
parent 69f28f1ff7
commit 98c7f12cf1
5 changed files with 70 additions and 56 deletions

View File

@ -34,18 +34,25 @@ and error handling. Built for automated web scraping.
"url": "http://p1.example.com:8080"
}
],
"hosts": {
"*": {
"every": "750ms",
"burst": 5,
"headers": {}
"hosts": [
{
"host": "*",
"every": "500ms",
"burst": 25,
"headers": {
"User-Agent": "Some user agent",
"X-Test": "Will be overwritten"
}
},
"reddit.com": {
{
"host": "*.reddit.com",
"every": "2s",
"burst": 2,
"headers": {"User-Agent": "mybot_v0.1"}
"headers": {
"X-Test": "Will overwrite default"
}
}
}
]
}
```

View File

@ -11,6 +11,7 @@ import (
)
type HostConfig struct {
Host string `json:"host"`
EveryStr string `json:"every"`
Burst int `json:"burst"`
Headers map[string]string `json:"headers"`
@ -23,15 +24,16 @@ type ProxyConfig struct {
}
var config struct {
Addr string `json:"addr"`
TimeoutStr string `json:"timeout"`
WaitStr string `json:"wait"`
Multiplier float64 `json:"multiplier"`
Retries int `json:"retries"`
Hosts map[string]*HostConfig `json:"hosts"`
Proxies []ProxyConfig `json:"proxies"`
Wait int64
Timeout time.Duration
Addr string `json:"addr"`
TimeoutStr string `json:"timeout"`
WaitStr string `json:"wait"`
Multiplier float64 `json:"multiplier"`
Retries int `json:"retries"`
Hosts []*HostConfig `json:"hosts"`
Proxies []ProxyConfig `json:"proxies"`
Wait int64
Timeout time.Duration
DefaultConfig *HostConfig
}
func loadConfig() {
@ -52,38 +54,44 @@ func loadConfig() {
config.Wait = int64(wait)
for _, conf := range config.Hosts {
conf.Every, err = time.ParseDuration(conf.EveryStr)
handleErr(err)
// An empty "every" falls back to the default host's interval; otherwise the
// string is parsed as a Go duration.
// NOTE(review): unlike the Burst fallback below, this branch dereferences
// config.DefaultConfig without a nil check, so it panics if no default has
// been assigned yet. DefaultConfig is assigned in validateConfig; confirm
// that it runs before this loop.
// NOTE(review): the default entry's own Every is parsed by this same loop, so
// an entry listed before "*" would presumably inherit a zero Every; verify
// the ordering of config.Hosts.
if conf.EveryStr == "" {
conf.Every = config.DefaultConfig.Every
} else {
conf.Every, err = time.ParseDuration(conf.EveryStr)
handleErr(err)
}
// A Burst of 0 is treated as "not set" and replaced by the default host's
// Burst when a default is available.
if config.DefaultConfig != nil && conf.Burst == 0 {
conf.Burst = config.DefaultConfig.Burst
}
}
}
func validateConfig() {
hasDefaultHost := false
for _, conf := range config.Hosts {
for host, conf := range config.Hosts {
if host == "*" {
hasDefaultHost = true
if conf.Host == "*" {
config.DefaultConfig = conf
}
for k := range conf.Headers {
if strings.ToLower(k) == "accept-encoding" {
panic(fmt.Sprintf("headers config for '%s':"+
" Do not set the Accept-Encoding header, it breaks goproxy", host))
" Do not set the Accept-Encoding header, it breaks goproxy", conf.Host))
}
}
}
if !hasDefaultHost {
if config.DefaultConfig == nil {
panic("config.json: You must specify a default host ('*')")
}
}
func applyConfig(proxy *Proxy) {
for host, conf := range config.Hosts {
proxy.Limiters[host] = &ExpiringLimiter{
for _, conf := range config.Hosts {
proxy.Limiters[conf.Host] = &ExpiringLimiter{
rate.NewLimiter(rate.Every(conf.Every), conf.Burst),
time.Now(),
}

View File

@ -14,22 +14,22 @@
"url": ""
}
],
"hosts": {
"*": {
"hosts": [
{
"host": "*",
"every": "500ms",
"burst": 25,
"headers": {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0"
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0",
"X-Test": "default"
}
},
"reddit.com": {
"every": "2s",
"burst": 2,
"headers": {
}
{
"host": "*.reddit.com",
"every": "2s"
}
}
]
}

11
gc.go
View File

@ -37,7 +37,7 @@ func (b *Balancer) cleanAllExpiredLimits() {
func cleanExpiredLimits(proxy *Proxy) {
const ttl = time.Second
const ttl = time.Hour
limits := make(map[string]*ExpiringLimiter, 0)
now := time.Now()
@ -60,6 +60,11 @@ func cleanExpiredLimits(proxy *Proxy) {
func shouldPruneLimiter(host string) bool {
// Don't remove hosts that are coming from the config
_, ok := config.Hosts[host]
return !ok
for _, conf := range config.Hosts {
if conf.Host == host {
return false
}
}
return true
}

26
main.go
View File

@ -3,6 +3,7 @@ package main
import (
"github.com/elazarl/goproxy"
"github.com/pkg/errors"
"github.com/ryanuber/go-glob"
"github.com/sirupsen/logrus"
"golang.org/x/time/rate"
"net/http"
@ -58,11 +59,9 @@ func (p *Proxy) getLimiter(host string) *rate.Limiter {
func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
defaultConf := config.Hosts["*"]
newExpiringLimiter := &ExpiringLimiter{
LastRead: time.Now(),
Limiter: rate.NewLimiter(rate.Every(defaultConf.Every), defaultConf.Burst),
Limiter: rate.NewLimiter(rate.Every(config.DefaultConfig.Every), config.DefaultConfig.Burst),
}
p.Limiters[host] = newExpiringLimiter
@ -75,16 +74,13 @@ func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
}
func simplifyHost(host string) string {
if strings.HasPrefix(host, "www.") {
host = host[4:]
}
col := strings.LastIndex(host, ":")
if col > 0 {
host = host[:col]
}
return host
return "." + host
}
func (b *Balancer) chooseProxy() *Proxy {
@ -126,18 +122,16 @@ func New() *Balancer {
func applyHeaders(r *http.Request) *http.Request {
if conf, ok := config.Hosts["*"]; ok {
for k, v := range conf.Headers {
r.Header.Set(k, v)
sHost := simplifyHost(r.Host)
for _, conf := range config.Hosts {
if glob.Glob(conf.Host, sHost) {
for k, v := range conf.Headers {
r.Header.Set(k, v)
}
}
}
sHost := simplifyHost(r.Host)
if conf, ok := config.Hosts[sHost]; ok {
for k, v := range conf.Headers {
r.Header.Set(k, v)
}
}
return r
}