mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-18 18:06:45 +00:00
Blacklist all paths with a query parameter
This commit is contained in:
parent
ffde1a9e5d
commit
ac0b8d2d0b
32
crawl.go
32
crawl.go
@@ -74,16 +74,15 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
|||||||
linkTexts = nil
|
linkTexts = nil
|
||||||
|
|
||||||
// TODO Optimized decision tree
|
// TODO Optimized decision tree
|
||||||
|
if strings.LastIndexByte(href, '?') != -1 {
|
||||||
|
goto nextToken
|
||||||
|
}
|
||||||
|
|
||||||
for _, entry := range urlBlackList {
|
for _, entry := range urlBlackList {
|
||||||
if href == entry {
|
if href == entry {
|
||||||
goto nextToken
|
goto nextToken
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, entry := range urlPartBlackList {
|
|
||||||
if strings.Contains(href, entry) {
|
|
||||||
goto nextToken
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, entry := range fileNameBlackList {
|
for _, entry := range fileNameBlackList {
|
||||||
if strings.Contains(linkText, entry) {
|
if strings.Contains(linkText, entry) {
|
||||||
goto nextToken
|
goto nextToken
|
||||||
@@ -196,29 +195,6 @@ var urlBlackList = [...]string {
|
|||||||
"/",
|
"/",
|
||||||
}
|
}
|
||||||
|
|
||||||
var urlPartBlackList = [...]string {
|
|
||||||
"?C=N&O=D",
|
|
||||||
"?C=M&O=A",
|
|
||||||
"?C=S&O=A",
|
|
||||||
"?C=D&O=A",
|
|
||||||
"?C=N;O=D",
|
|
||||||
"?C=M;O=A",
|
|
||||||
"?C=M&O=D",
|
|
||||||
"?C=S;O=A",
|
|
||||||
"?C=S&O=D",
|
|
||||||
"?C=D;O=A",
|
|
||||||
"?MA",
|
|
||||||
"?SA",
|
|
||||||
"?DA",
|
|
||||||
"?ND",
|
|
||||||
"?C=N&O=A",
|
|
||||||
"?C=N&O=A",
|
|
||||||
"?M=A",
|
|
||||||
"?N=D",
|
|
||||||
"?S=A",
|
|
||||||
"?D=A",
|
|
||||||
}
|
|
||||||
|
|
||||||
var fileNameBlackList = [...]string {
|
var fileNameBlackList = [...]string {
|
||||||
"Parent Directory",
|
"Parent Directory",
|
||||||
" Parent Directory",
|
" Parent Directory",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user