mirror of
https://github.com/simon987/Architeuthis.git
synced 2025-04-10 13:36:41 +00:00
Config glob & hierarchy, fixes
This commit is contained in:
parent
69f28f1ff7
commit
98c7f12cf1
23
README.md
23
README.md
@ -34,18 +34,25 @@ and error handling. Built for automated web scraping.
|
||||
"url": "http://p1.exemple.com:8080"
|
||||
}
|
||||
],
|
||||
"hosts": {
|
||||
"*": {
|
||||
"every": "750ms",
|
||||
"burst": 5,
|
||||
"headers": {}
|
||||
"hosts": [
|
||||
{
|
||||
"host": "*",
|
||||
"every": "500ms",
|
||||
"burst": 25,
|
||||
"headers": {
|
||||
"User-Agent": "Some user agent",
|
||||
"X-Test": "Will be overwritten"
|
||||
}
|
||||
},
|
||||
"reddit.com": {
|
||||
{
|
||||
"host": "*.reddit.com",
|
||||
"every": "2s",
|
||||
"burst": 2,
|
||||
"headers": {"User-Agent": "mybot_v0.1"}
|
||||
"headers": {
|
||||
"X-Test": "Will overwrite default"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
|
48
config.go
48
config.go
@ -11,6 +11,7 @@ import (
|
||||
)
|
||||
|
||||
type HostConfig struct {
|
||||
Host string `json:"host"`
|
||||
EveryStr string `json:"every"`
|
||||
Burst int `json:"burst"`
|
||||
Headers map[string]string `json:"headers"`
|
||||
@ -23,15 +24,16 @@ type ProxyConfig struct {
|
||||
}
|
||||
|
||||
var config struct {
|
||||
Addr string `json:"addr"`
|
||||
TimeoutStr string `json:"timeout"`
|
||||
WaitStr string `json:"wait"`
|
||||
Multiplier float64 `json:"multiplier"`
|
||||
Retries int `json:"retries"`
|
||||
Hosts map[string]*HostConfig `json:"hosts"`
|
||||
Proxies []ProxyConfig `json:"proxies"`
|
||||
Wait int64
|
||||
Timeout time.Duration
|
||||
Addr string `json:"addr"`
|
||||
TimeoutStr string `json:"timeout"`
|
||||
WaitStr string `json:"wait"`
|
||||
Multiplier float64 `json:"multiplier"`
|
||||
Retries int `json:"retries"`
|
||||
Hosts []*HostConfig `json:"hosts"`
|
||||
Proxies []ProxyConfig `json:"proxies"`
|
||||
Wait int64
|
||||
Timeout time.Duration
|
||||
DefaultConfig *HostConfig
|
||||
}
|
||||
|
||||
func loadConfig() {
|
||||
@ -52,38 +54,44 @@ func loadConfig() {
|
||||
config.Wait = int64(wait)
|
||||
|
||||
for _, conf := range config.Hosts {
|
||||
conf.Every, err = time.ParseDuration(conf.EveryStr)
|
||||
handleErr(err)
|
||||
if conf.EveryStr == "" {
|
||||
conf.Every = config.DefaultConfig.Every
|
||||
} else {
|
||||
conf.Every, err = time.ParseDuration(conf.EveryStr)
|
||||
handleErr(err)
|
||||
}
|
||||
|
||||
if config.DefaultConfig != nil && conf.Burst == 0 {
|
||||
conf.Burst = config.DefaultConfig.Burst
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func validateConfig() {
|
||||
|
||||
hasDefaultHost := false
|
||||
for _, conf := range config.Hosts {
|
||||
|
||||
for host, conf := range config.Hosts {
|
||||
|
||||
if host == "*" {
|
||||
hasDefaultHost = true
|
||||
if conf.Host == "*" {
|
||||
config.DefaultConfig = conf
|
||||
}
|
||||
|
||||
for k := range conf.Headers {
|
||||
if strings.ToLower(k) == "accept-encoding" {
|
||||
panic(fmt.Sprintf("headers config for '%s':"+
|
||||
" Do not set the Accept-Encoding header, it breaks goproxy", host))
|
||||
" Do not set the Accept-Encoding header, it breaks goproxy", conf.Host))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !hasDefaultHost {
|
||||
if config.DefaultConfig == nil {
|
||||
panic("config.json: You must specify a default host ('*')")
|
||||
}
|
||||
}
|
||||
|
||||
func applyConfig(proxy *Proxy) {
|
||||
|
||||
for host, conf := range config.Hosts {
|
||||
proxy.Limiters[host] = &ExpiringLimiter{
|
||||
for _, conf := range config.Hosts {
|
||||
proxy.Limiters[conf.Host] = &ExpiringLimiter{
|
||||
rate.NewLimiter(rate.Every(conf.Every), conf.Burst),
|
||||
time.Now(),
|
||||
}
|
||||
|
18
config.json
18
config.json
@ -14,22 +14,22 @@
|
||||
"url": ""
|
||||
}
|
||||
],
|
||||
"hosts": {
|
||||
"*": {
|
||||
"hosts": [
|
||||
{
|
||||
"host": "*",
|
||||
"every": "500s",
|
||||
"burst": 25,
|
||||
"headers": {
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Cache-Control": "max-age=0",
|
||||
"Connection": "keep-alive",
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0"
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0",
|
||||
"X-Test": "default"
|
||||
}
|
||||
},
|
||||
"reddit.com": {
|
||||
"every": "2s",
|
||||
"burst": 2,
|
||||
"headers": {
|
||||
}
|
||||
{
|
||||
"host": "*.reddit.com",
|
||||
"every": "2s"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
11
gc.go
11
gc.go
@ -37,7 +37,7 @@ func (b *Balancer) cleanAllExpiredLimits() {
|
||||
|
||||
func cleanExpiredLimits(proxy *Proxy) {
|
||||
|
||||
const ttl = time.Second
|
||||
const ttl = time.Hour
|
||||
|
||||
limits := make(map[string]*ExpiringLimiter, 0)
|
||||
now := time.Now()
|
||||
@ -60,6 +60,11 @@ func cleanExpiredLimits(proxy *Proxy) {
|
||||
func shouldPruneLimiter(host string) bool {
|
||||
|
||||
// Don't remove hosts that are coming from the config
|
||||
_, ok := config.Hosts[host]
|
||||
return !ok
|
||||
for _, conf := range config.Hosts {
|
||||
if conf.Host == host {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
26
main.go
26
main.go
@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"github.com/elazarl/goproxy"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/ryanuber/go-glob"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/time/rate"
|
||||
"net/http"
|
||||
@ -58,11 +59,9 @@ func (p *Proxy) getLimiter(host string) *rate.Limiter {
|
||||
|
||||
func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
|
||||
|
||||
defaultConf := config.Hosts["*"]
|
||||
|
||||
newExpiringLimiter := &ExpiringLimiter{
|
||||
LastRead: time.Now(),
|
||||
Limiter: rate.NewLimiter(rate.Every(defaultConf.Every), defaultConf.Burst),
|
||||
Limiter: rate.NewLimiter(rate.Every(config.DefaultConfig.Every), config.DefaultConfig.Burst),
|
||||
}
|
||||
|
||||
p.Limiters[host] = newExpiringLimiter
|
||||
@ -75,16 +74,13 @@ func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
|
||||
}
|
||||
|
||||
func simplifyHost(host string) string {
|
||||
if strings.HasPrefix(host, "www.") {
|
||||
host = host[4:]
|
||||
}
|
||||
|
||||
col := strings.LastIndex(host, ":")
|
||||
if col > 0 {
|
||||
host = host[:col]
|
||||
}
|
||||
|
||||
return host
|
||||
return "." + host
|
||||
}
|
||||
|
||||
func (b *Balancer) chooseProxy() *Proxy {
|
||||
@ -126,18 +122,16 @@ func New() *Balancer {
|
||||
|
||||
func applyHeaders(r *http.Request) *http.Request {
|
||||
|
||||
if conf, ok := config.Hosts["*"]; ok {
|
||||
for k, v := range conf.Headers {
|
||||
r.Header.Set(k, v)
|
||||
sHost := simplifyHost(r.Host)
|
||||
|
||||
for _, conf := range config.Hosts {
|
||||
if glob.Glob(conf.Host, sHost) {
|
||||
for k, v := range conf.Headers {
|
||||
r.Header.Set(k, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sHost := simplifyHost(r.Host)
|
||||
if conf, ok := config.Hosts[sHost]; ok {
|
||||
for k, v := range conf.Headers {
|
||||
r.Header.Set(k, v)
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user