Update hexlib, bug fixes, refactor, migrate item IDs

simon987 committed 2021-03-07 14:29:36 -05:00
parent 6d0e3f0f52
commit 0133c42d62
23 changed files with 245 additions and 4045 deletions

.gitmodules (vendored)

@@ -1,3 +0,0 @@
-[submodule "docker_viz/feed_viz"]
-    path = docker_viz/feed_viz
-    url = https://github.com/simon987/feed_viz

@@ -1,15 +0,0 @@
-### chan_feed
-
-Daemon that fetches posts from compatible *chan
-image boards and publishes serialised JSON to redis
-for real-time ingest.
-
-Compatible image boards: 4chan, lainchan, uboachan,
-22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
-horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon,
-chan.org.li, hispachan, 8kun, nowere, iichan, 2chan and more.
-
-Can optionally push monitoring data to InfluxDB. Below is an
-example of Grafana being used to display it.
-
-![monitoring.png](monitoring.png)
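Note: the README deleted above is the only prose description of the feed. After this commit, run.py (further down) LPUSHes each post, serialized as JSON, to a redis list named arc.chan2.<chan>.<item_type>.<board>. A minimal consumer sketch under that convention; the host, port and the 4chan /g/ key are assumptions for illustration:

    import json
    import redis

    rdb = redis.Redis(host="localhost", port=6379)
    while True:
        _, message = rdb.blpop("arc.chan2.4chan.post.g")  # blocks until the scraper queues a post
        item = json.loads(message)
        print(item["_board"], item["no"])  # field names as used by migrate_item_ids.py below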

@@ -40,7 +40,6 @@ CHANS = {
             "news", "out", "po", "pol", "qst", "sci", "soc", "sp",
             "tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
         ),
-        rps=3 / 2
     ),
     "lainchan": JsonChanHelper(
         2,
@@ -53,7 +52,6 @@ CHANS = {
             "hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy",
             "_mega",
         ),
-        rps=1 / 60
     ),
     "uboachan": JsonChanHelper(
         3,
@@ -65,7 +63,6 @@ CHANS = {
             "yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
             "ig", "2", "ot", "hikki", "cc", "x", "sugg"
         ),
-        rps=1 / 120
     ),
     "22chan": JsonChanHelper(
         4,
@@ -77,7 +74,6 @@ CHANS = {
             "a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers",
             "sg", "t", "vg"
         ),
-        rps=1 / 120
     ),
     "wizchan": JsonChanHelper(
         5,
@@ -88,7 +84,6 @@ CHANS = {
         (
             "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
         ),
-        rps=1 / 60
     ),
     # TODO
     # "1chan": ChanHelper(
@@ -100,7 +95,6 @@ CHANS = {
     #     (
    
    #         "rails"
    
    #     ),
-    #     rps=1 / 600
     # ),
     "2chhk": RussianJsonChanHelper(
         7,
@@ -120,7 +114,6 @@ CHANS = {
             "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
             "vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
         ),
-        rps=1 / 5
     ),
     "endchan": EndchanHtmlChanHelper(
         8,
@@ -141,7 +134,6 @@ CHANS = {
             "ausneets", "qanonresearch", "polru", "yuri", "christianity",
             "kc", "rapport", "news", "brit", "webm", "4chon"
         ),
-        rps=1 / 10
     ),
     "38chan": JsonChanHelper(
         9,
@@ -152,7 +144,6 @@ CHANS = {
         (
             "a", "b", "g", "38"
         ),
-        rps=1 / 600
     ),
     "alokal": AlokalJsonChanHelper(
         10,
@@ -164,7 +155,6 @@ CHANS = {
             "b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
             "fit", "had",
         ),
-        rps=1 / 60
     ),
     "gnfos": JsonChanHelper(
         11,
@@ -175,7 +165,6 @@ CHANS = {
         (
             "jp", "drive"
         ),
-        rps=1 / 120
     ),
     "synch": SynchJsonChanHelper(
         12,
@@ -187,7 +176,6 @@ CHANS = {
             "b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg",
             "_wh", "old", "test"
         ),
-        rps=1 / 120
     ),
     "tahta": JsonChanHelper(
         13,
@@ -198,7 +186,6 @@ CHANS = {
         (
             "b", "g", "s", "v"
         ),
-        rps=1 / 300
     ),
     "awsumchan": JsonChanHelper(
         14,
@@ -209,7 +196,6 @@ CHANS = {
         (
             "an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp"
         ),
-        rps=1 / 600
     ),
     "horochan": MayuriChanHelper(
         15,
@@ -218,7 +204,6 @@ CHANS = {
         (
             "b",
         ),
-        rps=1 / 20
     ),
     "doushio": DoushioHtmlChanHelper(
         16,
@@ -229,7 +214,6 @@ CHANS = {
         (
             "moe",
         ),
-        rps=1 / 20
     ),
     "desuchan": DesuChanHtmlChanHelper(
         17,
@@ -245,7 +229,6 @@ CHANS = {
             "arrrrr", "brocastan", "gar", "gif", "media", "ot", "r", "w",
             "sandbox", "sugg"
         ),
-        rps=1 / 30
     ),
     "aurorachan": DesuChanHtmlChanHelper(
         18,
@@ -257,7 +240,6 @@ CHANS = {
             "_bm", "de", "ic", "rp", "rpi", "v", "w", "tg",
             "alt", "b", "g", "pkmn", "yuri", "fl", "mu", "sugg"
         ),
-        rps=1 / 20
     ),
     "tgchan": TgChanHtmlChanHelper(
         19,
@@ -268,7 +250,6 @@ CHANS = {
         (
             "draw", "meep", "quest", "questdis", "tg", "icons",
         ),
-        rps=1 / 600,
     ),
     "lolnada": LolNadaHtmlChanHelper(
         20,
@@ -280,7 +261,6 @@ CHANS = {
             "b", "a", "aw", "cgl", "dw", "int", "qt", "sad", "t",
             "toy", "v", "x", "34", "e", "f", "h"
         ),
-        rps=1 / 60,
     ),
     "fchan": FChanHtmlChanHelper(
         21,
@@ -291,7 +271,6 @@ CHANS = {
         (
             "f", "m", "h", "s", "toon", "a", "ah", "c", "artist", "crit", "b"
         ),
-        rps=1 / 60,
     ),
     "0chan": ZerochanHtmlChanHelper(
         22,
@@ -307,7 +286,6 @@ CHANS = {
             "poligon", "postach", "psih", "r", "rm", "s", "shrek", "shy", "t",
             "test", "tlp", "tmp", "tv", "vg", "vipe", "wh", "xikkadvach", "ynet"
         ),
-        rps=1 / 5
     ),
     "410chan": Chan410HtmlChanHelper(
         23,
@@ -318,7 +296,6 @@ CHANS = {
         (
             "d", "b", "cu", "dev", "r", "a", "ts", "ci"
         ),
-        rps=1 / 120
     ),
     "7chan": Chan7HtmlChanHelper(
         24,
@@ -335,7 +312,6 @@ CHANS = {
             "elit", "fag", "fur", "gif", "h", "men", "pco", "s",
             "sm", "ss", "unf", "v",
         ),
-        rps=1 / 30
     ),
     "chanon": ChanonHtmlChanHelper(
         25,
@@ -347,7 +323,6 @@ CHANS = {
             "a", "int", "j", "m", "pc", "pol", "prog", "tv",
             "b", "milo", "pr0n", "s", "c", "sug",
         ),
-        rps=1 / 60
     ),
     "chanorg": JsonChanHelper(
         26,
@@ -358,7 +333,6 @@ CHANS = {
         (
             "b", "goys"
         ),
-        rps=1 / 60
     ),
     "iichan": IichanHtmlChanHelper(
         27,
@@ -373,7 +347,6 @@ CHANS = {
             "aa", "abe", "c", "fi", "jp", "rm", "tan", "to", "ts",
             "vn", "vo", "misc"
         ),
-        rps=1 / 10
     ),
     "nowere": NowereHtmlChanHelper(
         28,
@@ -384,7 +357,6 @@ CHANS = {
         (
             "b", "d", "tu", "a", "ph", "wa", "cg", "t", "p"
         ),
-        rps=1 / 60
     ),
     "8kun2": JsonKunChanHelper(
         35,
@@ -392,67 +364,84 @@ CHANS = {
         "https://media.8kun.top/",
         "/res/",
         "file_store/",
-        ("1", "55chan", "_64chen", "8bantb", "8tube", "a", "_abdl2", "agdg", "amv", "aneki", "animu", "animus", "ara",
-         "arda", "arms", "asatru", "asmr", "aus", "ausneets", "__b", "__baka", "_baneposting", "__baseballbat",
-         "bcards", "bleached", "blog", "__bonehurtingjuice", "bq", "__brit", "bubblegum", "builders", "bunkers", "butt",
-         "cafechan", "caffe", "canada", "cath", "chori", "choroy", "christian", "christianity", "christianmeme",
-         "cicachan", "civicrs", "ck", "cloveros", "co", "cow", "__cuckquean", "cute", "cyber", "cyoa", "__czech",
-         "dadtalk", "danpu", "dao101", "degen", "delete", "dempart", "desu", "diaperfags", "diaperfetish", "dir",
-         "__dolphin", "dpfag", "_dpr", "druid", "_e9y", "eatme", "ebola", "eerie", "egy", "egypt", "etika", "eu",
-         "euskotxa", "__exit", "f1", "fa", "fairy", "fallen", "fast", "faygo", "feet", "femaledomination", "feri",
-         "__fightcomms", "film", "flemish", "floss", "fortnite", "freedomzine", "fukemo", "fumo", "fur", "furry", "g",
-         "gamergatehq", "genesis", "_gesu", "ggis", "girltalk", "greenbreeze", "gts", "haxxor", "hentai", "hentaiclub",
-         "__herm", "hermetics", "hgb", "hgg", "__hindu", "hisparefugio", "hissss", "hnt", "hover", "hybrids", "hydrus",
-         "hypno", "_hypnochan", "icup", "imperium", "in", "ipfs", "ircsecrets", "islam", "ita", "jaooo", "jewess",
-         "jmaatv", "joker", "jp", "k", "_kekforceusa", "kemono", "kocsog", "kohlchan", "__(komica)", "_komika", "kpop",
-         "lain", "_lego", "leo", "lewd", "lit", "lol", "loomis", "loroy", "luddite", "magick", "maka", "mde",
-         "merrychristmas", "miku", "milf", "mom", "monster", "msb", "mtb", "mtt", "mu", "n0thingness", "nanachi",
-         "natiofr", "nep", "newbrit", "newsplus", "nobody", "nofap", "nofur", "nogatco", "nothingness", "ntr", "_nuke8",
-         "oanda", "__ocb", "__ocult", "_omorashi", "opmk", "os", "otter", "p", "panconleche", "pdfs", "__peaceofmind",
-         "pen", "philosophy", "_pkmns", "pnd", "pokeporn", "polymath", "pone", "projectdcomms", "__pyatibrat", "_qm",
-         "qpatriotresearch", "__qresearch", "qrnews", "__rand21", "rec", "rmart", "rusrandom", "rzabczan", "s", "s8s",
-         "sag", "sapphic", "shousa", "sikhi", "sip", "sl", "_snowboarding", "socpl", "strek", "subs", "__sve", "t",
-         "tan", "tdt", "tech9", "techan", "techbunker", "tek", "templeos", "tenda", "teraha", "__texit", "tf2", "__tg",
-         "_thb", "thedickshow", "throat", "_tibby", "tikilounge", "tkr", "tr55", "__trashcollector", "truthlegion",
-         "tulpamancers", "turul", "tutturu", "tv", "u", "uaco", "_ucla", "underground", "__usersunion", "v", "vichan",
-         "vietkong", "vietnam", "vore", "vr", "_warposting", "wdsc", "webm", "wg", "__wga", "wikieat", "wis", "wmafsex",
-         "workrelated", "wqt", "wx", "x", "__xivl", "__xtian", "zoomerright", "zundel", "0", "55sync", "abdl",
-         "alleycat", "_arisu", "arisubunker", "_arp", "bane", "_bimbohypnosis", "_bluemoon", "bmn", "brains", "cats",
-         "_chance", "clang", "comfy", "critters", "_cursed", "_cvine", "cze", "d", "dcaco", "demonp", "_dnmd", "doomer",
-         "doot", "elitabla", "_empanada", "erp", "_falseflags", "fashionplus", "fata", "femdom", "fit", "_flg",
-         "_fr8chan", "futyitorna", "garrett", "_giantesshentai", "hentaiporn", "hmfr", "hooliedayz", "hsp", "hujszon",
-         "iep", "just", "k46", "kind", "_kiwc", "kukichan", "_lacajita", "_legos", "lgd", "liveanarchy",
-         "luciddreaming", "m", "_mapp", "mental", "_mets", "_milhis", "monarchy", "_myon", "newhomosuck", "newsci",
-         "_nine", "oes", "onepiece", "_other369", "otomad", "_penguware", "psyid", "qresearch2gen", "rule34",
-         "_satorare", "sonyeon", "split", "sunflower", "_tae", "test", "_tft", "tftg", "toy", "trap", "_vein",
-         "_virtualreality", "vivian", "voros", "wbr", "_weird", "wooo", "yuuka", "fringe", "random", "cuteboys", "tech",
-         "internatiomall", "interracial", "liberty", "htg", "mai", "komica", "cutebois", "argentina", "r", "tf",
-         "draftnote", "abcu", "k117", "britfeel", "liberty", "htg", "mai", "komica", "cutebois", "argentina", "r", "tf",
-         "draftnote", "abcu", "k117", "britfeel", "y", "an", "francofil", "portal", "royalhawk", "vdm", "bullmask",
-         "imouto", "tripfriend", "arepa", "rwby", "sw", "y", "an", "francofil", "portal", "royalhawk", "vdm",
-         "bullmask", "imouto", "tripfriend", "arepa", "rwby", "sw", "magali", "hikki", "biz", "eris", "india", "mg",
-         "magali", "hikki", "biz", "eris", "india", "mg", "out", "infinity", "tifa", "muslim", "out", "infinity",
-         "tifa", "muslim", "slackware", "archivo", "flatearth", "yaoi", "boombox", "wdp", "thedonald",
-         "libertedexpression", "khyber", "jsr", "slackware", "archivo", "flatearth", "yaoi", "boombox", "wdp",
-         "thedonald", "libertedexpression", "khyber", "jsr", "fso", "wumpawhip", "buddhismhotline", "indochinaexpats",
-         "ett", "redbar", "skyline350gt", "asc", "bazafx", "bestkorea", "covid19", "sokra", "bowsu", "qpatriotsunited",
-         "verzet", "wlctint", "cultstate", "melody", "vedic", "yhvh", "1cok", "astropolis", "fso", "wumpawhip",
-         "buddhismhotline", "indochinaexpats", "ett", "redbar", "skyline350gt", "asc", "bazafx", "bestkorea", "covid19",
-         "sokra", "bowsu", "qpatriotsunited", "verzet", "wlctint", "cultstate", "melody", "vedic", "yhvh", "1cok",
-         "astropolis", "earthlibfront", "pardochan", "stanislawowski", "thetrump", "yukkuri", "1825kun", "cryptobtc",
-         "isol", "knights", "language", "rr34", "sperg", "awaken", "belgium", "blizzard", "brain", "buddha", "dbs",
-         "deestevensvoice4you", "f4net", "fuckuchina", "gbtv", "hairygirls", "hallaca", "homeowner", "indo", "jersey",
-         "jigglypuff", "lbt", "madh4ckrs", "medcorp", "miamichan", "mrsfrisby", "mulatto", "mupro", "nhoodlink",
-         "p5porn", "patriotrevolution", "peko", "projectobject", "prop", "pups", "qanonspain", "qcastellano",
-         "earthlibfront", "pardochan", "stanislawowski", "thetrump", "yukkuri", "1825kun", "cryptobtc", "isol",
-         "knights", "language", "rr34", "sperg", "awaken", "belgium", "blizzard", "brain", "buddha", "dbs",
-         "deestevensvoice4you", "f4net", "fuckuchina", "gbtv", "hairygirls", "hallaca", "homeowner", "indo", "jersey",
-         "jigglypuff", "lbt", "madh4ckrs", "medcorp", "miamichan", "mrsfrisby", "mulatto", "mupro", "nhoodlink",
-         "p5porn", "patriotrevolution", "peko", "projectobject", "prop", "pups", "qanonspain", "qcastellano", "qsocial",
-         "resist", "revolu", "skemt", "sketheory", "spaceforce", "surro", "thehand", "transit", "vitaecryptocurrency",
-         "qsocial", "resist", "revolu", "skemt", "sketheory", "spaceforce", "surro", "thehand", "transit",
-         "vitaecryptocurrency"),
-        rps=1 / 3
+        ("1", "55chan", "_64chen", "8bantb", "8tube", "a", "_abdl2", "agdg", "_amv", "aneki", "animu", "animus", "ara",
+         "arda", "_arms", "asatru", "_asmr", "aus", "ausneets", "_b", "_baka", "_baneposting", "_baseballbat",
+         "_bcards", "bleached", "blog", "_bonehurtingjuice", "_bq", "_brit", "bubblegum", "builders", "bunkers", "butt",
+         "cafechan", "caffe", "canada", "_cath", "chori", "choroy", "christian", "christianity", "_christianmeme",
+         "cicachan", "civicrs", "ck", "cloveros", "co", "cow", "_cuckquean", "cute", "cyber", "cyoa", "_czech",
+         "_dadtalk", "danpu", "dao101", "degen", "delete", "dempart", "desu", "diaperfags", "diaperfetish", "dir",
+         "_dolphin", "_dpfag", "_dpr", "druid", "_e9y", "_eatme", "ebola", "eerie", "egy", "egypt", "_etika", "_eu",
+         "_euskotxa", "_exit", "f1", "fa", "_fairy", "fallen", "fast", "faygo", "feet", "femaledomination", "feri",
+         "_fightcomms", "film", "flemish", "_floss", "fortnite", "freedomzine", "fukemo", "fumo", "fur", "furry", "g",
+         "gamergatehq", "genesis", "_gesu", "_ggis", "girltalk", "greenbreeze", "gts", "_haxxor", "hentai",
+         "hentaiclub", "_herm", "_hermetics", "_hgb", "hgg", "_hindu", "hisparefugio", "_hissss", "hnt", "hover",
+         "hybrids", "_hydrus", "hypno", "_hypnochan", "icup", "imperium", "in", "ipfs", "ircsecrets", "islam", "ita",
+         "_jaooo", "jewess", "_jmaatv", "_joker", "jp", "k", "_kekforceusa", "kemono", "kocsog", "kohlchan",
+         "_(komica)", "_komika", "kpop", "lain", "_lego", "leo", "lewd", "lit", "_lol", "loomis", "_loroy", "luddite",
+         "magick", "maka", "mde", "_merrychristmas", "_miku", "milf", "_mom", "monster", "_msb", "mtb", "mtt", "mu",
+         "_n0thingness", "_nanachi", "natiofr", "nep", "newbrit", "newsplus", "_nobody", "nofap", "_nofur", "_nogatco",
+         "nothingness", "ntr", "_nuke8", "_oanda", "_ocb", "_ocult", "_omorashi", "_opmk", "os", "otter", "p",
+         "_panconleche", "pdfs", "_peaceofmind", "pen", "philosophy", "_pkmns", "pnd", "pokeporn", "polymath", "pone",
+         "projectdcomms", "_pyatibrat", "_qm", "qpatriotresearch", "qresearch", "qrnews", "_rand21", "rec", "rmart",
+         "_rusrandom", "rzabczan", "s", "s8s", "_sag", "sapphic", "shousa", "_sikhi", "sip", "sl", "_snowboarding",
+         "socpl", "strek", "_subs", "_sve", "t", "tan", "tdt", "_tech9", "_techan", "techbunker", "_tek", "templeos",
+         "tenda", "teraha", "_texit", "tf2", "_tg", "_thb", "_thedickshow", "throat", "_tibby", "tikilounge", "tkr",
+         "_tr55", "_trashcollector", "truthlegion", "tulpamancers", "turul", "tutturu", "tv", "u", "_uaco", "_ucla",
+         "underground", "_usersunion", "v", "vichan", "_vietkong", "vietnam", "vore", "vr", "_warposting", "wdsc",
+         "webm", "wg", "_wga", "wikieat", "wis", "wmafsex", "_workrelated", "_wqt", "wx", "x", "_xivl", "_xtian",
+         "_zoomerright", "zundel", "0", "55sync", "abdl", "alleycat", "_arisu", "_arisubunker", "_arp", "_bane",
+         "_bimbohypnosis", "_bluemoon", "bmn", "brains", "cats", "_chance", "clang", "comfy", "_critters", "_cursed",
+         "_cvine", "_cze", "d", "dcaco", "_demonp", "_dnmd", "doomer", "doot", "elitabla", "_empanada", "erp",
+         "_falseflags", "fashionplus", "_fata", "femdom", "fit", "_flg", "_fr8chan", "futyitorna", "garrett",
+         "_giantesshentai", "hentaiporn", "_hmfr", "hooliedayz", "hsp", "_hujszon", "_iep", "just", "k46", "_kind",
+         "_kiwc", "kukichan", "_lacajita", "_legos", "_lgd", "liveanarchy", "_luciddreaming", "m", "_mapp", "mental",
+         "_mets", "_milhis", "monarchy", "_myon", "newhomosuck", "newsci", "_nine", "_oes", "_onepiece", "_other369",
+         "_otomad", "_penguware", "psyid", "qresearch2gen", "rule34", "_satorare", "sonyeon", "split", "_sunflower",
+         "_tae", "test", "_tft", "tftg", "toy", "trap", "_vein", "_virtualreality", "vivian", "voros", "wbr", "_weird",
+         "wooo", "yuuka", "fringe", "random", "cuteboys", "tech", "_internatiomall", "interracial", "liberty", "htg",
+         "mai", "komica", "cutebois", "argentina", "r", "tf", "draftnote", "abcu", "_k117", "britfeel", "liberty",
+         "htg", "mai", "komica", "cutebois", "argentina", "r", "tf", "draftnote", "abcu", "_k117", "britfeel", "y",
+         "an", "francofil", "portal", "_royalhawk", "_vdm", "_bullmask", "imouto", "tripfriend", "arepa", "rwby", "sw",
+         "y", "an", "francofil", "portal", "_royalhawk", "_vdm", "_bullmask", "imouto", "tripfriend", "arepa", "rwby",
+         "sw", "magali", "hikki", "biz", "eris", "india", "mg", "magali", "hikki", "biz", "eris", "india", "mg", "out",
+         "_infinity", "tifa", "_muslim", "out", "_infinity", "tifa", "_muslim", "slackware", "archivo", "_flatearth",
+         "_yaoi", "_boombox", "_wdp", "thedonald", "libertedexpression", "_khyber", "jsr", "slackware", "archivo",
+         "_flatearth", "_yaoi", "_boombox", "_wdp", "thedonald", "libertedexpression", "_khyber", "jsr", "fso",
+         "wumpawhip", "_buddhismhotline", "indochinaexpats", "_ett", "_redbar", "_skyline350gt", "_asc", "bazafx",
+         "bestkorea", "covid19", "_sokra", "_bowsu", "_qpatriotsunited", "_verzet", "_wlctint", "_cultstate", "_melody",
+         "_vedic", "yhvh", "1cok", "_astropolis", "fso", "wumpawhip", "_buddhismhotline", "indochinaexpats", "_ett",
+         "_redbar", "_skyline350gt", "_asc", "bazafx", "bestkorea", "covid19", "_sokra", "_bowsu", "_qpatriotsunited",
+         "_verzet", "_wlctint", "_cultstate", "_melody", "_vedic", "yhvh", "1cok", "_astropolis", "_earthlibfront",
+         "_pardochan", "_stanislawowski", "_thetrump", "yukkuri", "1825kun", "cryptobtc", "_isol", "_knights",
+         "language", "_rr34", "_sperg", "_awaken", "_belgium", "_blizzard", "_brain", "buddha", "_dbs",
+         "_deestevensvoice4you", "_f4net", "_fuckuchina", "_gbtv", "hairygirls", "_hallaca", "_homeowner", "indo",
+         "_jersey", "_jigglypuff", "_lbt", "_madh4ckrs", "_medcorp", "_miamichan", "mrsfrisby", "_mulatto", "_mupro",
+         "_nhoodlink", "_p5porn", "_patriotrevolution", "_peko", "_projectobject", "_prop", "pups", "_qanonspain",
+         "_qcastellano", "_earthlibfront", "_pardochan", "_stanislawowski", "_thetrump", "yukkuri", "1825kun",
+         "cryptobtc", "_isol", "_knights", "language", "_rr34", "_sperg", "_awaken", "_belgium", "_blizzard", "_brain",
+         "buddha", "_dbs", "_deestevensvoice4you", "_f4net", "_fuckuchina", "_gbtv", "hairygirls", "_hallaca",
+         "_homeowner", "indo", "_jersey", "_jigglypuff", "_lbt", "_madh4ckrs", "_medcorp", "_miamichan", "mrsfrisby",
+         "_mulatto", "_mupro", "_nhoodlink", "_p5porn", "_patriotrevolution", "_peko", "_projectobject", "_prop",
+         "pups", "_qanonspain", "_qcastellano", "qsocial", "_resist", "_revolu", "_skemt", "_sketheory", "_spaceforce",
+         "_surro", "_thehand", "_transit", "_vitaecryptocurrency", "qsocial", "_resist", "_revolu", "_skemt",
+         "_sketheory", "_spaceforce", "_surro", "_thehand", "_transit", "_vitaecryptocurrency", "midnightriders",
+         "tingles", "1cc", "prog", "ytc", "arcagayghetto", "prog", "ytc", "arcagayghetto", "2hu", "o", "warroom", "2hu",
+         "o", "warroom", "ebon", "xiaomicha", "ebon", "xiaomicha", "gnosticwarfare", "moldnet", "zenczan", "cosplay",
+         "otakus", "nohup", "frenzone", "8dixie", "hqa", "pundit", "vrgg", "uf0", "malaysia", "gnosticwarfare",
+         "moldnet", "zenczan", "cosplay", "otakus", "nohup", "frenzone", "8dixie", "hqa", "pundit", "vrgg", "uf0",
+         "malaysia", "instruments", "unlightopen", "pso2g", "jozsicsan", "komijoke", "bmsgeu", "92k", "komicaz", "pcal",
+         "accent", "wethepatriots", "porussia", "1a", "tarhana", "bigwomen", "maths", "instruments", "unlightopen",
+         "pso2g", "jozsicsan", "komijoke", "bmsgeu", "92k", "komicaz", "pcal", "accent", "wethepatriots", "porussia",
+         "1a", "tarhana", "bigwomen", "maths", "coffeetalk", "arcader", "kingcrimson", "moonlight", "trkey", "whogen",
+         "xivlgr", "amichan", "gendercritical", "inflg", "komicalol", "capcom", "coser", "cud", "feedism", "grc",
+         "reimuchan", "stalker2", "2020istheyear", "carib", "jumpchen", "mishmash", "qbl", "sakurachan", "satsukichan",
+         "taodick", "aes", "gacha", "nfl2", "redlands", "traditionalcatholics", "tsiou", "airsoft2", "animation",
+         "cafardchan", "chrstdis", "coffeetalk", "arcader", "kingcrimson", "moonlight", "trkey", "whogen", "xivlgr",
+         "amichan", "gendercritical", "inflg", "komicalol", "capcom", "coser", "cud", "feedism", "grc", "reimuchan",
+         "stalker2", "2020istheyear", "carib", "jumpchen", "mishmash", "qbl", "sakurachan", "satsukichan", "taodick",
+         "aes", "gacha", "nfl2", "redlands", "traditionalcatholics", "tsiou", "airsoft2", "animation", "cafardchan",
+         "chrstdis", "komicamc", "marista", "neetpride", "numis", "progmusic", "retrogaminggifs", "warcraft2004",
+         "komicamc", "marista", "neetpride", "numis", "progmusic", "retrogaminggifs", "warcraft2004"),
     ),
     "hispachan": HispachanHtmlHelper(
         30,
@@ -466,7 +455,6 @@ CHANS = {
             "cl", "co", "ec", "es", "mx", "pe", "py", "uy", "ve", "d",
             "h", "o", "s", "sar", "scl", "sco", "ses", "smx", "spe", "sve",
         ),
-        rps=1 / 20
     ),
     "sushigirl": JsonChanHelper(
         31,
@@ -478,7 +466,6 @@ CHANS = {
             "archive", "wildcard", "lounge", "arcade", "kawaii",
             "kitchen", "tunes", "culture", "silicon", "yakuza", "hell", "lewd"
         ),
-        rps=1 / 30
     ),
     "4kev": Kev4PhpHelper(
         32,
@@ -491,7 +478,6 @@ CHANS = {
             "politics", "programming", "random", "technology",
             "television", "videogames",
         ),
-        rps=1 / 20
     ),
     "plus4chan": Plus4ChanHelper(
         33,
@@ -503,7 +489,6 @@ CHANS = {
             "baw", "co", "cog", "jam", "mtv",
             "coc", "draw", "pco", "coq", "cod", "a"
         ),
-        rps=1 / 15
     ),
     "2chan": Chan2Helper(
         34,
@@ -628,7 +613,6 @@ CHANS = {
             "oe",  # ??? お絵sql
             "72",  # ??? お絵sqlip
         ),
-        rps=1 / 3
     ),
     "waifuist": LynxChanHelper(
         36,
@@ -639,7 +623,6 @@ CHANS = {
         (
             "w", "starlet", "etc",
         ),
-        rps=1 / 25
     ),
     "cutiegarden": LynxChanHelper(
         37,
@@ -650,7 +633,6 @@ CHANS = {
         (
             "lg", "cozy", "meta", "test"
         ),
-        rps=1 / 25
     ),
     "9chan": JsonInfinityNextChanHelper(
         38,
@@ -737,6 +719,5 @@ CHANS = {
          "politicallyincorrect", "hockey", "randb", "traps", "vichan", "ircsecrets", "bosartest111111", "chib",
          "testing1234fake", "mdma", "virgo", "homo", "scum", "anal", "gamerhatehq", "vagina", "dump", "advert",
          "jueggin", "kike", "type", "robot", "goodguys", "ween", "bankfraudaccountloading", "vhsch"),
-        rps=1 / 10
     ),
 }
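Every hunk above removes the per-chan rps= argument: rate limiting is no longer the helper's concern. The replacement is visible in the run.py diff below; the before and after, side by side:

    # Before (util.py, deleted at the bottom of this commit): the client consumed helper.rps
    web = Web(influxdb if MONITORING else None, rps=helper.rps, get_method=helper.get_method, proxy=proxy)
    # After: a single environment-configured client from hexlib; the rate limit presumably moves into hexlib
    web = get_web()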

@@ -1,9 +1,10 @@
 import json
 from json import JSONDecodeError
 
+from hexlib.log import logger
 from chan.helper import ChanHelper
 from post_process import get_links_from_body
-from util import logger
 
 
 class JsonChanHelper(ChanHelper):

@@ -2,14 +2,13 @@ from bs4 import BeautifulSoup
 
 
 class ChanHelper:
-    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
+    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards):
         self.db_id = db_id
         self._base_url = base_url
         self._image_url = image_url
         self._thread_path = thread_path
         self._image_path = image_path
         self._boards = boards
-        self.rps = rps
 
         self.get_method = None
         self.save_folder = None
@@ -37,7 +36,7 @@ class ChanHelper:
         raise NotImplementedError
 
     def item_unique_id(self, item, board):
-        return int(self.board_hash(board) + str(self.item_id(item)))
+        return board + str(self.item_id(item))
 
     @staticmethod
     def thread_mtime(thread):
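The item_unique_id change above is the reason this commit ships migrate_item_ids.py (below): unique IDs switch from hash-prefixed integers to plain board-prefixed strings. Illustratively, for a hypothetical post 81234567 on board "g":

    old_id = int(self.board_hash("g") + "81234567")  # numeric, derived from a board hash
    new_id = "g" + "81234567"                        # -> "g81234567", matching data["_board"] + str(data["no"])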

@@ -3,9 +3,9 @@ import re
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup
+from hexlib.log import logger
 
 from chan.desuchan_html import DesuChanHtmlChanHelper
-from util import logger
 
 
 def _ts(text):

@@ -3,9 +3,10 @@ from urllib.parse import urljoin
 import json
 
+from hexlib.log import logger
 from chan.helper import ChanHelper
 from post_process import get_links_from_body
-from util import logger
 
 
 class JsonInfinityNextChanHelper(ChanHelper):

@@ -1,7 +1,7 @@
 from vanwanet_scrape.scraper import Scraper
 
 from chan.chan_json import JsonChanHelper
-from util import logger
+from hexlib.log import logger
 
 
 class JsonKunChanHelper(JsonChanHelper):
@@ -10,8 +10,8 @@ class JsonKunChanHelper(JsonChanHelper):
     def item_type(item):
         return "thread" if item["resto"] == 0 else "post"
 
-    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
-        super().__init__(db_id, base_url, image_url, thread_path, image_path, boards, rps)
+    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards):
+        super().__init__(db_id, base_url, image_url, thread_path, image_path, boards)
         self._scraper = Scraper(
             headers={

@@ -7,14 +7,14 @@ import cloudscraper
 import sys
 
 from chan.helper import ChanHelper
-from util import logger
+from hexlib.log import logger
 
 
 class LynxChanHelper(ChanHelper):
    """See https://gitgud.io/LynxChan/LynxChan/blob/master/doc/Json.txt"""
 
-    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
-        super().__init__(db_id, base_url, image_url, thread_path, image_path, boards, rps)
+    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards):
+        super().__init__(db_id, base_url, image_url, thread_path, image_path, boards)
 
         scraper = cloudscraper.create_scraper()
         if len(sys.argv) > 3:

@@ -3,13 +3,13 @@ from json import JSONDecodeError
 
 from chan.helper import ChanHelper
 from post_process import get_links_from_body
-from util import logger
+from hexlib.log import logger
 
 
 class MayuriChanHelper(ChanHelper):
-    def __init__(self, db_id, base_url, image_url, boards, rps):
-        super().__init__(db_id, base_url, image_url, None, None, boards, rps)
+    def __init__(self, db_id, base_url, image_url, boards):
+        super().__init__(db_id, base_url, image_url, None, None, boards)
 
     @staticmethod
     def item_id(item):

@@ -3,8 +3,7 @@ from json import JSONDecodeError
 
 from chan.helper import ChanHelper
 from post_process import get_links_from_body
-from util import logger
+from hexlib.log import logger
 
 
 class RussianJsonChanHelper(ChanHelper):

@@ -36,6 +36,10 @@ class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):
         posts = []
         for post_el in op_el.find_all("table", recursive=False):
             *_, time = post_el.find("label").children
+
+            if post_el.get("class") and "userdelete" in post_el.get("class"):
+                continue
+
             posts.append({
                 "id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
                 "type": "post",

@@ -1,384 +1,247 @@
-version: "2.1"
+version: "3"
 
-volumes:
-  influxdb_data:
-  pg_data:
-  pg_data_imhash:
-
 services:
-  influxdb:
-    image: influxdb:alpine
-    volumes:
-      - influxdb_data:/var/lib/influxdb
-  grafana:
-    image: grafana/grafana
-    ports:
-      - 127.0.0.1:3006:3000
-    environment:
-      - "GF_SECURITY_ADMIN_PASSWORD=changeme"
-  db:
-    image: postgres
-    volumes:
-      - pg_data:/var/lib/postgresql/data
-    environment:
-      - "POSTGRES_USER=feed_archiver"
-      - "POSTGRES_PASSWORD=changeme"
-    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U feed_archiver"]
-      interval: 5s
-      timeout: 5s
-      retries: 5
-  db_imhashdb:
-    image: simon987/pg_hamming
-    volumes:
-      - pg_data_imhash:/var/lib/postgresql/data
-    environment:
-      - "POSTGRES_USER=imhashdb"
-      - "POSTGRES_PASSWORD=changeme"
-    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U imhashdb"]
-      interval: 5s
-      timeout: 5s
-      retries: 5
-  redis:
-    image: redis
-  archiver:
-    image: simon987/feed_archiver
-    restart: always
-    depends_on:
-      db:
-        condition: service_healthy
-    environment:
-      - "FA_DB_HOST=db"
-      - "FA_DB_USER=feed_archiver"
-      - "FA_DB_PASSWORD=changeme"
-      - "FA_REDIS_ADDR=redis:6379"
-      - "FA_PATTERN=arc.*"
-  imhashdb:
-    image: simon987/imhashdb
-    restart: always
-    entrypoint: "/build/imhashdb/cli/cli hasher"
-    volumes:
-      - ${SAVE_FOLDER}:/data/
-    environment:
-      - "IMHASHDB_STORE=/data"
-      - "IMHASHDB_REDIS_ADDR=redis:6379"
-      - "IMHASHDB_PG_USER=imhashdb"
-      - "IMHASHDB_PG_PASSWORD=changeme"
-      - "IMHASHDB_PG_DATABASE=imhashdb"
-      - "IMHASHDB_PG_HOST=db_imhashdb"
-      - "IMHASHDB_HASH_CONCURRENCY=16"
-
-  # Image boards
   4chan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=4chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   0chan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=0chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   22chan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=22chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   2chan:
     image: simon987/chan_feed
    restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=2chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   2chhk:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=2chhk"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   38chan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=38chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   410chan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=410chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   4kev:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=4kev"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   7chan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=7chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   8kun:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=8kun"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   alokal:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=alokal"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   aurorachan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=aurorachan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   awsumchan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=awsumchan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   chanon:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=chanon"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   chanorg:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=chanorg"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   desuchan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=desuchan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   doushio:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=doushio"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   endchan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=endchan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   fchan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=fchan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   gnfos:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=gnfos"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   hispachan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=hispachan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   horochan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=horochan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   iichan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=iichan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   lainchan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=lainchan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   lolnada:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=lolnada"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   nowere:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=nowere"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   plus4chan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=plus4chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   sushigirl:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=sushigirl"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   synch:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=synch"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   tahta:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=tahta"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   tgchan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=tgchan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   uboachan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=uboachan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   waifuist:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=waifuist"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   wizchan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=wizchan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"
   9chan:
     image: simon987/chan_feed
     restart: always
-    user: ${CURRENT_UID}
     environment:
       - "CF_CHAN=9chan"
-      - "CF_REDIS_HOST=redis"
-      - "CF_INFLUXDB=influxdb"
+      - "REDIS_HOST=redis"

@@ -1,6 +0,0 @@
-FROM nginx:alpine
-
-COPY nginx.conf /etc/nginx/
-COPY ["/feed_viz", "/webroot"]
-
-EXPOSE 80

@@ -1 +0,0 @@
-Subproject commit c8e11a73d74e6af19cab581c94abf943daea050e

@@ -1,48 +0,0 @@
-user nginx;
-worker_processes 1;
-
-error_log /var/log/nginx/error.log warn;
-pid /var/run/nginx.pid;
-
-events {
-    worker_connections 1024;
-}
-
-http {
-    include /etc/nginx/mime.types;
-    default_type application/octet-stream;
-
-    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
-                    '$status $body_bytes_sent "$http_referer" '
-                    '"$http_user_agent" "$http_x_forwarded_for"';
-
-    access_log /var/log/nginx/access.log main;
-
-    sendfile on;
-    keepalive_timeout 65;
-
-    upstream socket {
-        server ws_adapter:3090;
-    }
-
-    server {
-        listen 80;
-        index index.html;
-        root /webroot;
-
-        location / {
-            try_files $uri $uri/ /index.html;
-        }
-
-        location /socket {
-            proxy_http_version 1.1;
-            proxy_set_header Upgrade $http_upgrade;
-            proxy_set_header Connection "Upgrade";
-            proxy_set_header Host $host;
-            proxy_read_timeout 86400;
-            proxy_pass http://socket;
-        }
-    }
-}

@@ -1,11 +1,28 @@
 import json
-import requests
+
+from hexlib.log import logger
+from vanwanet_scrape.scraper import Scraper
 
 from chan.chan import CHANS
 
 existing = CHANS["8kun2"]._boards
 updated = list(existing)
 added = set()
 
+scraper = Scraper(
+    headers={
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Referer": "https://8kun.top/index.html"
+    },
+    domains=[
+        "8kun.top",
+        "media.8kun.top",
+        "sys.8kun.net"
+    ],
+    logger=logger
+)
+
 
 def mask(board):
     for i, b in enumerate(updated):
@@ -22,8 +39,7 @@ def unmask(board):
 
 
 for i in range(0, 500, 50):
-    r = requests.get("https://sys.8kun.top/board-search.php?page=" + str(i))
+    r = scraper.get("https://sys.8kun.top/board-search.php?page=" + str(i))
     j = json.loads(r.text)
 
     for board in j["boards"]:
@@ -36,7 +52,7 @@ for i in range(0, 500, 50):
         print("[+] " + board)
 
 for board in existing:
-    if board not in added:
+    if board not in added and not board.startswith("_"):
         mask(board)
 
 print("(" + ",".join('"' + u + '"' for u in updated) + ")")

File diff suppressed because it is too large

migrate_item_ids.py (new file)

@@ -0,0 +1,73 @@
+import itertools
+
+import orjson
+import psycopg2
+from hexlib.misc import buffered
+from tqdm import tqdm
+
+from hexlib.db import pg_fetch_cursor_all
+from chan.chan import CHANS
+
+if __name__ == '__main__':
+
+    conn = psycopg2.connect(
+        host="192.168.1.70",
+        port="5432",
+        user="feed_archiver",
+        password="",
+        dbname="feed_archiver"
+    )
+    conn.set_client_encoding("utf8")
+
+    table = "chan_4chan_post"
+    new_table = "chan2_4chan_post"
+
+    print(table)
+
+    # chan_name = table.split("_")[1]
+    # chan = CHANS[chan_name]
+
+    cur = conn.cursor()
+    cur2 = conn.cursor()
+
+    cur2.execute("""
+    CREATE TABLE IF NOT EXISTS %s (
+        id TEXT PRIMARY KEY NOT NULL,
+        archived_on TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL,
+        data JSONB NOT NULL
+    );
+    """ % new_table)
+
+    cur.execute("SELECT COUNT(*) FROM %s" % table)
+    row_count = cur.fetchone()[0]
+
+    cur.execute("DECLARE cur1 CURSOR FOR SELECT * FROM %s" % table)
+    rows = pg_fetch_cursor_all(cur, name="cur1", batch_size=5000)
+
+    @buffered(batch_size=1000)
+    def pg_bulk_insert(rows):
+        val_count = len(rows[0])
+        cur2.execute(
+            "INSERT INTO %s VALUES %s ON CONFLICT DO NOTHING" %
+            (
+                new_table,
+                ", ".join(("(" + ",".join("%s" for _ in range(val_count)) + ")") for _ in rows)
+            ),
+            list(itertools.chain(*rows))
+        )
+
+    for row in tqdm(rows, total=row_count):
+        id_, archived_on, data = row
+        new_id = data["_board"] + str(data["no"])
+
+        pg_bulk_insert([
+            (new_id, archived_on, orjson.dumps(data).decode())
+        ])
+
+    pg_bulk_insert(None)
+    conn.commit()
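The script leans on hexlib.misc.buffered: calls to pg_bulk_insert are accumulated and handed to the wrapped function in batches of 1000, and the trailing pg_bulk_insert(None) flushes the remainder before the commit. A simplified equivalent of that behaviour (an assumption, not hexlib's actual code):

    def buffered(batch_size):
        def decorate(func):
            buf = []

            def wrapper(items):
                if items is None:  # sentinel: flush whatever is left
                    if buf:
                        func(list(buf))
                        buf.clear()
                    return
                buf.extend(items)
                if len(buf) >= batch_size:
                    func(list(buf))
                    buf.clear()

            return wrapper

        return decorate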

Binary file not shown (monitoring.png, 366 KiB, deleted).

run.py

@@ -1,40 +1,24 @@
-import datetime
 import json
 import os
 import time
 import traceback
+from datetime import datetime
 from queue import Queue
 from threading import Thread
 
-import redis
+from hexlib.concurrency import queue_iter
 from hexlib.db import VolatileBooleanState, VolatileState
-from hexlib.monitoring import Monitoring
+from hexlib.env import get_web, get_redis
+from hexlib.log import logger
 
 from chan.chan import CHANS
 from post_process import post_process
-from util import logger, Web
-
-BYPASS_RPS = False
-DBNAME = "chan_feed"
-
-if os.environ.get("CF_INFLUXDB"):
-    influxdb = Monitoring(DBNAME, host=os.environ.get("CF_INFLUXDB"), logger=logger, batch_size=100, flush_on_exit=True)
-    MONITORING = True
-else:
-    MONITORING = False
-
-REDIS_HOST = os.environ.get("CF_REDIS_HOST", "localhost")
-REDIS_PORT = os.environ.get("CF_REDIS_PORT", 6379)
 CHAN = os.environ.get("CF_CHAN", None)
-CF_PUBLISH = os.environ.get("CF_PUBLISH", False)
-ARC_LISTS = os.environ.get("CF_ARC_LISTS", "arc").split(",")
 
 
 class ChanScanner:
-    def __init__(self, helper, proxy):
-        self.web = Web(influxdb if MONITORING else None, rps=helper.rps, get_method=helper.get_method, proxy=proxy)
+    def __init__(self, helper):
+        self.web = get_web()
         self.helper = helper
         self.state = state
@@ -83,9 +67,8 @@ def once(func):
 
 class ChanState:
     def __init__(self, prefix):
-        self._posts = VolatileBooleanState(prefix, host=REDIS_HOST, port=REDIS_PORT)
-        self._threads = VolatileState(prefix, host=REDIS_HOST, port=REDIS_PORT)
-        print("redis host=" + REDIS_HOST)
+        self._posts = VolatileBooleanState(prefix)
+        self._threads = VolatileState(prefix)
 
     def mark_visited(self, item: int):
         self._posts["posts"][item] = True
@@ -109,18 +92,12 @@ class ChanState:
         }
 
 
-def publish_worker(queue: Queue, helper, p):
-    while True:
-        try:
-            item, board = queue.get()
-            if item is None:
-                break
-            publish(item, board, helper,)
-        except Exception as e:
-            logger.error(str(e) + ": " + traceback.format_exc())
-        finally:
-            queue.task_done()
+def publish_worker(queue: Queue, helper):
+    for item, board in queue_iter(queue):
+        try:
+            publish(item, board, helper)
+        except Exception as e:
+            logger.error(str(e) + ": " + traceback.format_exc())
 
 
 @once
@@ -131,23 +108,7 @@ def publish(item, board, helper):
     routing_key = "%s.%s.%s" % (CHAN, item_type, board)
     message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
 
-    if CF_PUBLISH:
-        rdb.publish("chan." + routing_key, message)
-
-    for arc in ARC_LISTS:
-        rdb.lpush(arc + ".chan." + routing_key, message)
-
-    if MONITORING:
-        distance = datetime.utcnow() - datetime.utcfromtimestamp(helper.item_mtime(item))
-        influxdb.log([{
-            "measurement": CHAN,
-            "time": str(datetime.utcnow()),
-            "tags": {
-                "board": board
-            },
-            "fields": {
-                "distance": distance.total_seconds()
-            }
-        }])
+    rdb.lpush("arc.chan2." + routing_key, message)
 
 
 if __name__ == "__main__":
@@ -157,30 +118,20 @@ if __name__ == "__main__":
     if save_folder:
         chan_helper.save_folder = save_folder
 
-    proxy = None
-    if os.environ.get("CF_PROXY"):
-        proxy = os.environ.get("CF_PROXY")
-        logger.info("Using proxy %s" % proxy)
-
-    if BYPASS_RPS:
-        chan_helper.rps = 10
-
     state = ChanState(CHAN)
 
-    rdb = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
+    rdb = get_redis()
 
     publish_q = Queue()
-    for _ in range(3):
-        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy))
-        publish_thread.setDaemon(True)
-        publish_thread.start()
+    publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
+    publish_thread.setDaemon(True)
+    publish_thread.start()
 
-    s = ChanScanner(chan_helper, proxy)
+    s = ChanScanner(chan_helper)
     while True:
         try:
             for p, b in s.all_posts():
                 publish_q.put((p, b))
         except KeyboardInterrupt as e:
             print("cleanup..")
-            for _ in range(3):
-                publish_q.put((None, None))
+            publish_q.put(None)
             break
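publish_worker's hand-rolled sentinel loop is replaced by hexlib.concurrency.queue_iter, and the cleanup path now pushes a single None instead of one (None, None) pair per worker thread. The iterator presumably behaves like this minimal sketch:

    def queue_iter(q):
        while True:
            item = q.get()
            if item is None:  # sentinel pushed by the KeyboardInterrupt cleanup path
                break
            yield item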

@@ -1,2 +0,0 @@
-#!/bin/bash
-CURRENT_UID=$(id -u):$(id -g) SAVE_FOLDER=$(pwd)/data docker-compose up --force-recreate

util.py (deleted)

@@ -1,86 +0,0 @@
-import logging
-import sys
-import traceback
-from datetime import datetime
-from logging import FileHandler, StreamHandler
-
-import requests
-from hexlib.misc import rate_limit
-from urllib3 import disable_warnings
-
-disable_warnings()
-
-last_time_called = dict()
-
-logger = logging.getLogger("default")
-logger.setLevel(logging.DEBUG)
-
-formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
-for h in logger.handlers:
-    logger.removeHandler(h)
-logger.addHandler(StreamHandler(sys.stdout))
-
-
-class Web:
-    def __init__(self, monitoring, rps=1 / 2, proxy=None, get_method=None):
-        self.session = requests.Session()
-        if proxy:
-            self.session.proxies = {"http": proxy, "https": proxy}
-            self.session.verify = False
-        self._rps = rps
-        self.monitoring = monitoring
-        self._get_method = get_method
-
-        @rate_limit(self._rps)
-        def _get(url, **kwargs):
-            retries = 3
-
-            while retries > 0:
-                retries -= 1
-                try:
-                    if self._get_method:
-                        return self._get_method(url, **kwargs)
-                    return self.session.get(url, **kwargs)
-                except KeyboardInterrupt as e:
-                    raise e
-                except Exception as e:
-                    logger.warning("Error with request %s: %s" % (url, str(e)))
-            raise Exception("Gave up request after maximum number of retries")
-
-        self._get = _get
-
-    def get(self, url, **kwargs):
-        try:
-            r = self._get(url, **kwargs)
-            logger.debug("GET %s <%d>" % (url, r.status_code))
-            if self.monitoring:
-                self.monitoring.log([{
-                    "measurement": "web",
-                    "time": str(datetime.utcnow()),
-                    "fields": {
-                        "status_code": r.status_code,
-                        "size": len(r.content),
-                    },
-                    "tags": {
-                        "ok": r.status_code == 200
-                    },
-                }])
-            return r
-        except KeyboardInterrupt as e:
-            raise e
-        except Exception as e:
-            logger.error(str(e) + traceback.format_exc())
-            if self.monitoring:
-                self.monitoring.log([{
-                    "measurement": "web",
-                    "time": str(datetime.utcnow()),
-                    "fields": {
-                        "status_code": 0,
-                        "size": 0,
-                    },
-                    "tags": {
-                        "ok": False
-                    },
-                }])
-            return None
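util.py goes away wholesale; its Web wrapper (retries, rate limiting, InfluxDB counters) and ad-hoc logger are superseded by hexlib. Judging from the diffs above, the only surface run.py still needs is roughly the following; the URL is hypothetical and the response shape is assumed to stay requests-like:

    from hexlib.env import get_web
    from hexlib.log import logger

    web = get_web()  # environment-configured session, replacing util.Web
    r = web.get("https://example.org/threads.json")  # the chan helpers build the real URLs
    logger.info("GET returned %d" % r.status_code)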