mirror of
				https://github.com/simon987/Architeuthis.git
				synced 2025-10-24 20:16:51 +00:00 
			
		
		
		
	Config glob & hierarchy, fixes
This commit is contained in:
		
							parent
							
								
									69f28f1ff7
								
							
						
					
					
						commit
						98c7f12cf1
					
				
							
								
								
									
										21
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								README.md
									
									
									
									
									
								
							| @ -34,18 +34,25 @@ and error handling. Built for automated web scraping. | |||||||
|       "url": "http://p1.exemple.com:8080" |       "url": "http://p1.exemple.com:8080" | ||||||
|     } |     } | ||||||
|   ], |   ], | ||||||
|   "hosts": { |   "hosts": [ | ||||||
|     "*": { |     { | ||||||
|       "every": "750ms", |       "host": "*", | ||||||
|       "burst": 5, |       "every": "500ms", | ||||||
|       "headers": {} |       "burst": 25, | ||||||
|  |       "headers": { | ||||||
|  |         "User-Agent": "Some user agent", | ||||||
|  |         "X-Test": "Will be overwritten" | ||||||
|  |       } | ||||||
|     }, |     }, | ||||||
|     "reddit.com": { |     { | ||||||
|  |       "host": "*.reddit.com", | ||||||
|       "every": "2s", |       "every": "2s", | ||||||
|       "burst": 2, |       "burst": 2, | ||||||
|       "headers": {"User-Agent":  "mybot_v0.1"} |       "headers": { | ||||||
|  |         "X-Test": "Will overwrite default" | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |   ] | ||||||
| } | } | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										28
									
								
								config.go
									
									
									
									
									
								
							
							
						
						
									
										28
									
								
								config.go
									
									
									
									
									
								
							| @ -11,6 +11,7 @@ import ( | |||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| type HostConfig struct { | type HostConfig struct { | ||||||
|  | 	Host     string            `json:"host"` | ||||||
| 	EveryStr string            `json:"every"` | 	EveryStr string            `json:"every"` | ||||||
| 	Burst    int               `json:"burst"` | 	Burst    int               `json:"burst"` | ||||||
| 	Headers  map[string]string `json:"headers"` | 	Headers  map[string]string `json:"headers"` | ||||||
| @ -28,10 +29,11 @@ var config struct { | |||||||
| 	WaitStr       string        `json:"wait"` | 	WaitStr       string        `json:"wait"` | ||||||
| 	Multiplier    float64       `json:"multiplier"` | 	Multiplier    float64       `json:"multiplier"` | ||||||
| 	Retries       int           `json:"retries"` | 	Retries       int           `json:"retries"` | ||||||
| 	Hosts      map[string]*HostConfig `json:"hosts"` | 	Hosts         []*HostConfig `json:"hosts"` | ||||||
| 	Proxies       []ProxyConfig `json:"proxies"` | 	Proxies       []ProxyConfig `json:"proxies"` | ||||||
| 	Wait          int64 | 	Wait          int64 | ||||||
| 	Timeout       time.Duration | 	Timeout       time.Duration | ||||||
|  | 	DefaultConfig *HostConfig | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func loadConfig() { | func loadConfig() { | ||||||
| @ -52,38 +54,44 @@ func loadConfig() { | |||||||
| 	config.Wait = int64(wait) | 	config.Wait = int64(wait) | ||||||
| 
 | 
 | ||||||
| 	for _, conf := range config.Hosts { | 	for _, conf := range config.Hosts { | ||||||
|  | 		if conf.EveryStr == "" { | ||||||
|  | 			conf.Every = config.DefaultConfig.Every | ||||||
|  | 		} else { | ||||||
| 			conf.Every, err = time.ParseDuration(conf.EveryStr) | 			conf.Every, err = time.ParseDuration(conf.EveryStr) | ||||||
| 			handleErr(err) | 			handleErr(err) | ||||||
| 		} | 		} | ||||||
|  | 
 | ||||||
|  | 		if config.DefaultConfig != nil && conf.Burst == 0 { | ||||||
|  | 			conf.Burst = config.DefaultConfig.Burst | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func validateConfig() { | func validateConfig() { | ||||||
| 
 | 
 | ||||||
| 	hasDefaultHost := false | 	for _, conf := range config.Hosts { | ||||||
| 
 | 
 | ||||||
| 	for host, conf := range config.Hosts { | 		if conf.Host == "*" { | ||||||
| 
 | 			config.DefaultConfig = conf | ||||||
| 		if host == "*" { |  | ||||||
| 			hasDefaultHost = true |  | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		for k := range conf.Headers { | 		for k := range conf.Headers { | ||||||
| 			if strings.ToLower(k) == "accept-encoding" { | 			if strings.ToLower(k) == "accept-encoding" { | ||||||
| 				panic(fmt.Sprintf("headers config for '%s':"+ | 				panic(fmt.Sprintf("headers config for '%s':"+ | ||||||
| 					" Do not set the Accept-Encoding header, it breaks goproxy", host)) | 					" Do not set the Accept-Encoding header, it breaks goproxy", conf.Host)) | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if !hasDefaultHost { | 	if config.DefaultConfig == nil { | ||||||
| 		panic("config.json: You must specify a default host ('*')") | 		panic("config.json: You must specify a default host ('*')") | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func applyConfig(proxy *Proxy) { | func applyConfig(proxy *Proxy) { | ||||||
| 
 | 
 | ||||||
| 	for host, conf := range config.Hosts { | 	for _, conf := range config.Hosts { | ||||||
| 		proxy.Limiters[host] = &ExpiringLimiter{ | 		proxy.Limiters[conf.Host] = &ExpiringLimiter{ | ||||||
| 			rate.NewLimiter(rate.Every(conf.Every), conf.Burst), | 			rate.NewLimiter(rate.Every(conf.Every), conf.Burst), | ||||||
| 			time.Now(), | 			time.Now(), | ||||||
| 		} | 		} | ||||||
|  | |||||||
							
								
								
									
										18
									
								
								config.json
									
									
									
									
									
								
							
							
						
						
									
										18
									
								
								config.json
									
									
									
									
									
								
							| @ -14,22 +14,22 @@ | |||||||
|       "url": "" |       "url": "" | ||||||
|     } |     } | ||||||
|   ], |   ], | ||||||
|   "hosts": { |   "hosts": [ | ||||||
|     "*": { |     { | ||||||
|  |       "host": "*", | ||||||
|       "every": "500s", |       "every": "500s", | ||||||
|       "burst": 25, |       "burst": 25, | ||||||
|       "headers": { |       "headers": { | ||||||
|         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", |         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", | ||||||
|         "Cache-Control": "max-age=0", |         "Cache-Control": "max-age=0", | ||||||
|         "Connection": "keep-alive", |         "Connection": "keep-alive", | ||||||
|         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0" |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0", | ||||||
|  |         "X-Test": "default" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "reddit.com": { |     { | ||||||
|       "every": "2s", |       "host": "*.reddit.com", | ||||||
|       "burst": 2, |       "every": "2s" | ||||||
|       "headers": { |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     } |     } | ||||||
|  |   ] | ||||||
| } | } | ||||||
							
								
								
									
										11
									
								
								gc.go
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								gc.go
									
									
									
									
									
								
							| @ -37,7 +37,7 @@ func (b *Balancer) cleanAllExpiredLimits() { | |||||||
| 
 | 
 | ||||||
| func cleanExpiredLimits(proxy *Proxy) { | func cleanExpiredLimits(proxy *Proxy) { | ||||||
| 
 | 
 | ||||||
| 	const ttl = time.Second | 	const ttl = time.Hour | ||||||
| 
 | 
 | ||||||
| 	limits := make(map[string]*ExpiringLimiter, 0) | 	limits := make(map[string]*ExpiringLimiter, 0) | ||||||
| 	now := time.Now() | 	now := time.Now() | ||||||
| @ -60,6 +60,11 @@ func cleanExpiredLimits(proxy *Proxy) { | |||||||
| func shouldPruneLimiter(host string) bool { | func shouldPruneLimiter(host string) bool { | ||||||
| 
 | 
 | ||||||
| 	// Don't remove hosts that are coming from the config | 	// Don't remove hosts that are coming from the config | ||||||
| 	_, ok := config.Hosts[host] | 	for _, conf := range config.Hosts { | ||||||
| 	return !ok | 		if conf.Host == host { | ||||||
|  | 			return false | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return true | ||||||
| } | } | ||||||
|  | |||||||
							
								
								
									
										22
									
								
								main.go
									
									
									
									
									
								
							
							
						
						
									
										22
									
								
								main.go
									
									
									
									
									
								
							| @ -3,6 +3,7 @@ package main | |||||||
| import ( | import ( | ||||||
| 	"github.com/elazarl/goproxy" | 	"github.com/elazarl/goproxy" | ||||||
| 	"github.com/pkg/errors" | 	"github.com/pkg/errors" | ||||||
|  | 	"github.com/ryanuber/go-glob" | ||||||
| 	"github.com/sirupsen/logrus" | 	"github.com/sirupsen/logrus" | ||||||
| 	"golang.org/x/time/rate" | 	"golang.org/x/time/rate" | ||||||
| 	"net/http" | 	"net/http" | ||||||
| @ -58,11 +59,9 @@ func (p *Proxy) getLimiter(host string) *rate.Limiter { | |||||||
| 
 | 
 | ||||||
| func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter { | func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter { | ||||||
| 
 | 
 | ||||||
| 	defaultConf := config.Hosts["*"] |  | ||||||
| 
 |  | ||||||
| 	newExpiringLimiter := &ExpiringLimiter{ | 	newExpiringLimiter := &ExpiringLimiter{ | ||||||
| 		LastRead: time.Now(), | 		LastRead: time.Now(), | ||||||
| 		Limiter:  rate.NewLimiter(rate.Every(defaultConf.Every), defaultConf.Burst), | 		Limiter:  rate.NewLimiter(rate.Every(config.DefaultConfig.Every), config.DefaultConfig.Burst), | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	p.Limiters[host] = newExpiringLimiter | 	p.Limiters[host] = newExpiringLimiter | ||||||
| @ -75,16 +74,13 @@ func (p *Proxy) makeNewLimiter(host string) *ExpiringLimiter { | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func simplifyHost(host string) string { | func simplifyHost(host string) string { | ||||||
| 	if strings.HasPrefix(host, "www.") { |  | ||||||
| 		host = host[4:] |  | ||||||
| 	} |  | ||||||
| 
 | 
 | ||||||
| 	col := strings.LastIndex(host, ":") | 	col := strings.LastIndex(host, ":") | ||||||
| 	if col > 0 { | 	if col > 0 { | ||||||
| 		host = host[:col] | 		host = host[:col] | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return host | 	return "." + host | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func (b *Balancer) chooseProxy() *Proxy { | func (b *Balancer) chooseProxy() *Proxy { | ||||||
| @ -126,18 +122,16 @@ func New() *Balancer { | |||||||
| 
 | 
 | ||||||
| func applyHeaders(r *http.Request) *http.Request { | func applyHeaders(r *http.Request) *http.Request { | ||||||
| 
 | 
 | ||||||
| 	if conf, ok := config.Hosts["*"]; ok { | 	sHost := simplifyHost(r.Host) | ||||||
|  | 
 | ||||||
|  | 	for _, conf := range config.Hosts { | ||||||
|  | 		if glob.Glob(conf.Host, sHost) { | ||||||
| 			for k, v := range conf.Headers { | 			for k, v := range conf.Headers { | ||||||
| 				r.Header.Set(k, v) | 				r.Header.Set(k, v) | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	sHost := simplifyHost(r.Host) |  | ||||||
| 	if conf, ok := config.Hosts[sHost]; ok { |  | ||||||
| 		for k, v := range conf.Headers { |  | ||||||
| 			r.Header.Set(k, v) |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 	return r | 	return r | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user