mirror of
https://github.com/simon987/Architeuthis.git
synced 2025-04-19 15:36:42 +00:00
Config glob & hierarchy, fixes
This commit is contained in:
parent
69f28f1ff7
commit
98c7f12cf1
21
README.md
21
README.md
@ -34,18 +34,25 @@ and error handling. Built for automated web scraping.
|
|||||||
"url": "http://p1.exemple.com:8080"
|
"url": "http://p1.exemple.com:8080"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"hosts": {
|
"hosts": [
|
||||||
"*": {
|
{
|
||||||
"every": "750ms",
|
"host": "*",
|
||||||
"burst": 5,
|
"every": "500ms",
|
||||||
"headers": {}
|
"burst": 25,
|
||||||
|
"headers": {
|
||||||
|
"User-Agent": "Some user agent",
|
||||||
|
"X-Test": "Will be overwritten"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"reddit.com": {
|
{
|
||||||
|
"host": "*.reddit.com",
|
||||||
"every": "2s",
|
"every": "2s",
|
||||||
"burst": 2,
|
"burst": 2,
|
||||||
"headers": {"User-Agent": "mybot_v0.1"}
|
"headers": {
|
||||||
|
"X-Test": "Will overwrite default"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
28
config.go
28
config.go
@ -11,6 +11,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type HostConfig struct {
|
type HostConfig struct {
|
||||||
|
Host string `json:"host"`
|
||||||
EveryStr string `json:"every"`
|
EveryStr string `json:"every"`
|
||||||
Burst int `json:"burst"`
|
Burst int `json:"burst"`
|
||||||
Headers map[string]string `json:"headers"`
|
Headers map[string]string `json:"headers"`
|
||||||
@ -28,10 +29,11 @@ var config struct {
|
|||||||
WaitStr string `json:"wait"`
|
WaitStr string `json:"wait"`
|
||||||
Multiplier float64 `json:"multiplier"`
|
Multiplier float64 `json:"multiplier"`
|
||||||
Retries int `json:"retries"`
|
Retries int `json:"retries"`
|
||||||
Hosts map[string]*HostConfig `json:"hosts"`
|
Hosts []*HostConfig `json:"hosts"`
|
||||||
Proxies []ProxyConfig `json:"proxies"`
|
Proxies []ProxyConfig `json:"proxies"`
|
||||||
Wait int64
|
Wait int64
|
||||||
Timeout time.Duration
|
Timeout time.Duration
|
||||||
|
DefaultConfig *HostConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
func loadConfig() {
|
func loadConfig() {
|
||||||
@ -52,38 +54,44 @@ func loadConfig() {
|
|||||||
config.Wait = int64(wait)
|
config.Wait = int64(wait)
|
||||||
|
|
||||||
for _, conf := range config.Hosts {
|
for _, conf := range config.Hosts {
|
||||||
|
if conf.EveryStr == "" {
|
||||||
|
conf.Every = config.DefaultConfig.Every
|
||||||
|
} else {
|
||||||
conf.Every, err = time.ParseDuration(conf.EveryStr)
|
conf.Every, err = time.ParseDuration(conf.EveryStr)
|
||||||
handleErr(err)
|
handleErr(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if config.DefaultConfig != nil && conf.Burst == 0 {
|
||||||
|
conf.Burst = config.DefaultConfig.Burst
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateConfig() {
|
func validateConfig() {
|
||||||
|
|
||||||
hasDefaultHost := false
|
for _, conf := range config.Hosts {
|
||||||
|
|
||||||
for host, conf := range config.Hosts {
|
if conf.Host == "*" {
|
||||||
|
config.DefaultConfig = conf
|
||||||
if host == "*" {
|
|
||||||
hasDefaultHost = true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for k := range conf.Headers {
|
for k := range conf.Headers {
|
||||||
if strings.ToLower(k) == "accept-encoding" {
|
if strings.ToLower(k) == "accept-encoding" {
|
||||||
panic(fmt.Sprintf("headers config for '%s':"+
|
panic(fmt.Sprintf("headers config for '%s':"+
|
||||||
" Do not set the Accept-Encoding header, it breaks goproxy", host))
|
" Do not set the Accept-Encoding header, it breaks goproxy", conf.Host))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !hasDefaultHost {
|
if config.DefaultConfig == nil {
|
||||||
panic("config.json: You must specify a default host ('*')")
|
panic("config.json: You must specify a default host ('*')")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func applyConfig(proxy *Proxy) {
|
func applyConfig(proxy *Proxy) {
|
||||||
|
|
||||||
for host, conf := range config.Hosts {
|
for _, conf := range config.Hosts {
|
||||||
proxy.Limiters[host] = &ExpiringLimiter{
|
proxy.Limiters[conf.Host] = &ExpiringLimiter{
|
||||||
rate.NewLimiter(rate.Every(conf.Every), conf.Burst),
|
rate.NewLimiter(rate.Every(conf.Every), conf.Burst),
|
||||||
time.Now(),
|
time.Now(),
|
||||||
}
|
}
|
||||||
|
18
config.json
18
config.json
@ -14,22 +14,22 @@
|
|||||||
"url": ""
|
"url": ""
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"hosts": {
|
"hosts": [
|
||||||
"*": {
|
{
|
||||||
|
"host": "*",
|
||||||
"every": "500s",
|
"every": "500s",
|
||||||
"burst": 25,
|
"burst": 25,
|
||||||
"headers": {
|
"headers": {
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
"Cache-Control": "max-age=0",
|
"Cache-Control": "max-age=0",
|
||||||
"Connection": "keep-alive",
|
"Connection": "keep-alive",
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0"
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0",
|
||||||
|
"X-Test": "default"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"reddit.com": {
|
{
|
||||||
"every": "2s",
|
"host": "*.reddit.com",
|
||||||
"burst": 2,
|
"every": "2s"
|
||||||
"headers": {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
]
|
||||||
}
|
}
|
11
gc.go
11
gc.go
@ -37,7 +37,7 @@ func (b *Balancer) cleanAllExpiredLimits() {
|
|||||||
|
|
||||||
func cleanExpiredLimits(proxy *Proxy) {
|
func cleanExpiredLimits(proxy *Proxy) {
|
||||||
|
|
||||||
const ttl = time.Second
|
const ttl = time.Hour
|
||||||
|
|
||||||
limits := make(map[string]*ExpiringLimiter, 0)
|
limits := make(map[string]*ExpiringLimiter, 0)
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
@ -60,6 +60,11 @@ func cleanExpiredLimits(proxy *Proxy) {
|
|||||||
func shouldPruneLimiter(host string) bool {
|
func shouldPruneLimiter(host string) bool {
|
||||||
|
|
||||||
// Don't remove hosts that are coming from the config
|
// Don't remove hosts that are coming from the config
|
||||||
_, ok := config.Hosts[host]
|
for _, conf := range config.Hosts {
|
||||||
return !ok
|
if conf.Host == host {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true
|
||||||
}
|
}
|
||||||
|
22
main.go
22
main.go
@ -3,6 +3,7 @@ package main
|
|||||||
import (
|
import (
|
||||||
"github.com/elazarl/goproxy"
|
"github.com/elazarl/goproxy"
|
||||||
"github.com/pkg/errors"
|
"github.com/pkg/errors"
|
||||||
|
"github.com/ryanuber/go-glob"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
"golang.org/x/time/rate"
|
"golang.org/x/time/rate"
|
||||||
"net/http"
|
"net/http"
|
||||||
@ -58,11 +59,9 @@ func (p *Proxy) getLimiter(host string) *rate.Limiter {
|
|||||||
|
|
||||||
func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
|
func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
|
||||||
|
|
||||||
defaultConf := config.Hosts["*"]
|
|
||||||
|
|
||||||
newExpiringLimiter := &ExpiringLimiter{
|
newExpiringLimiter := &ExpiringLimiter{
|
||||||
LastRead: time.Now(),
|
LastRead: time.Now(),
|
||||||
Limiter: rate.NewLimiter(rate.Every(defaultConf.Every), defaultConf.Burst),
|
Limiter: rate.NewLimiter(rate.Every(config.DefaultConfig.Every), config.DefaultConfig.Burst),
|
||||||
}
|
}
|
||||||
|
|
||||||
p.Limiters[host] = newExpiringLimiter
|
p.Limiters[host] = newExpiringLimiter
|
||||||
@ -75,16 +74,13 @@ func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func simplifyHost(host string) string {
|
func simplifyHost(host string) string {
|
||||||
if strings.HasPrefix(host, "www.") {
|
|
||||||
host = host[4:]
|
|
||||||
}
|
|
||||||
|
|
||||||
col := strings.LastIndex(host, ":")
|
col := strings.LastIndex(host, ":")
|
||||||
if col > 0 {
|
if col > 0 {
|
||||||
host = host[:col]
|
host = host[:col]
|
||||||
}
|
}
|
||||||
|
|
||||||
return host
|
return "." + host
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *Balancer) chooseProxy() *Proxy {
|
func (b *Balancer) chooseProxy() *Proxy {
|
||||||
@ -126,18 +122,16 @@ func New() *Balancer {
|
|||||||
|
|
||||||
func applyHeaders(r *http.Request) *http.Request {
|
func applyHeaders(r *http.Request) *http.Request {
|
||||||
|
|
||||||
if conf, ok := config.Hosts["*"]; ok {
|
sHost := simplifyHost(r.Host)
|
||||||
|
|
||||||
|
for _, conf := range config.Hosts {
|
||||||
|
if glob.Glob(conf.Host, sHost) {
|
||||||
for k, v := range conf.Headers {
|
for k, v := range conf.Headers {
|
||||||
r.Header.Set(k, v)
|
r.Header.Set(k, v)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sHost := simplifyHost(r.Host)
|
|
||||||
if conf, ok := config.Hosts[sHost]; ok {
|
|
||||||
for k, v := range conf.Headers {
|
|
||||||
r.Header.Set(k, v)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user