mirror of
https://github.com/simon987/music-graph-scripts.git
synced 2025-04-10 05:56:42 +00:00
2.0 rewrite wip
This commit is contained in:
parent
c3dc1faa31
commit
3f9382e6f7
15
.gitmodules
vendored
15
.gitmodules
vendored
@ -1,15 +0,0 @@
|
|||||||
[submodule "task_tracker_drone"]
|
|
||||||
path = task_tracker_drone
|
|
||||||
url = https://github.com/simon987/task_tracker_drone/
|
|
||||||
[submodule "last.fm"]
|
|
||||||
path = last.fm
|
|
||||||
url = https://git.simon987.net/drone/last.fm
|
|
||||||
[submodule "caa"]
|
|
||||||
path = caa
|
|
||||||
url = https://git.simon987.net/drone/caa.git
|
|
||||||
[submodule "spotify"]
|
|
||||||
path = spotify
|
|
||||||
url = https://git.simon987.net/drone/spotify
|
|
||||||
[submodule "spotify2"]
|
|
||||||
path = spotify2
|
|
||||||
url = https://git.simon987.net/drone/spotify2
|
|
14
README.md
14
README.md
@ -1,17 +1,3 @@
|
|||||||
wip
|
wip
|
||||||
|
|
||||||
|
|
||||||
### task_tracker setup:
|
|
||||||
|
|
||||||
Last.fm api calls are queued to [task_tracker](https://github.com/simon987/task_tracker/),
|
|
||||||
and results are gathered by a [task_tracker_drone](https://github.com/simon987/task_tracker_drone/)
|
|
||||||
([script](https://git.simon987.net/drone/last.fm/src/master/run)).
|
|
||||||
|
|
||||||
|
|
||||||
Project secret:
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"apikey": "<Your Last.fm api key>",
|
|
||||||
"user": "<Your Last.fm username>"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
1
caa
1
caa
@ -1 +0,0 @@
|
|||||||
Subproject commit 910f4a0bceadac37ac28fa59e7648f241c931fe6
|
|
22
config.py
Normal file
22
config.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"DB": "musicbrainz_db",
|
||||||
|
"USER": "musicbrainz",
|
||||||
|
"PASSWORD": "musicbrainz",
|
||||||
|
"HOST": "127.0.0.1",
|
||||||
|
"PORT": 5433,
|
||||||
|
|
||||||
|
"LASTFM_APIKEY": os.environ.get("LASTFM_APIKEY"),
|
||||||
|
"LASTFM_USER": os.environ.get("LASTFM_USER"),
|
||||||
|
|
||||||
|
"SPOTIFY_CLIENTID": os.environ.get("SPOTIFY_CLIENTID"),
|
||||||
|
"SPOTIFY_SECRET": os.environ.get("SPOTIFY_SECRET"),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def connstr():
|
||||||
|
return " dbname=%s user=%s password=%s host=%s port=%d" % (
|
||||||
|
config["DB"], config["USER"], config["PASSWORD"],
|
||||||
|
config["HOST"], config["PORT"]
|
||||||
|
)
|
@ -1,21 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
latest=$(curl http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/LATEST)
|
|
||||||
|
|
||||||
mkdir in 2> /dev/null
|
|
||||||
cd in
|
|
||||||
|
|
||||||
wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump.tar.bz2"
|
|
||||||
wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump-derived.tar.bz2"
|
|
||||||
|
|
||||||
tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \
|
|
||||||
mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \
|
|
||||||
mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \
|
|
||||||
mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status mbdump/l_label_release \
|
|
||||||
mbdump/l_label_release_group
|
|
||||||
tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \
|
|
||||||
mbdump/release_group_meta
|
|
||||||
|
|
||||||
mv mbdump/* .
|
|
||||||
rm -r mbdump
|
|
||||||
cd ..
|
|
@ -1,27 +0,0 @@
|
|||||||
import sqlite3
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
with sqlite3.connect(sys.argv[1]) as conn:
|
|
||||||
|
|
||||||
cursor = conn.cursor()
|
|
||||||
cursor.execute("SELECT id from covers")
|
|
||||||
|
|
||||||
cursor = conn.cursor()
|
|
||||||
cursor.execute("SELECT id from covers")
|
|
||||||
|
|
||||||
def rows():
|
|
||||||
buf = list()
|
|
||||||
for row in cursor.fetchall():
|
|
||||||
buf.append(row[0])
|
|
||||||
if len(buf) > 30:
|
|
||||||
yield buf
|
|
||||||
buf.clear()
|
|
||||||
|
|
||||||
for batch in rows():
|
|
||||||
cursor.execute("SELECT cover from covers where id in (%s)" % (",".join(("'" + b + "'") for b in batch)))
|
|
||||||
covers = cursor.fetchall()
|
|
||||||
for i, cover in enumerate(covers):
|
|
||||||
with open("./tmpcovers/" + batch[i] + ".jpg", "wb") as out:
|
|
||||||
out.write(cover[0])
|
|
||||||
print(batch[i])
|
|
@ -1,56 +0,0 @@
|
|||||||
import json
|
|
||||||
from multiprocessing.pool import ThreadPool
|
|
||||||
|
|
||||||
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
|
|
||||||
|
|
||||||
TT_API_URL = "https://tt.simon987.net/api"
|
|
||||||
TT_PROJECT = 5
|
|
||||||
|
|
||||||
|
|
||||||
done = set()
|
|
||||||
# with sqlite3.connect(sys.argv[1]) as conn:
|
|
||||||
# cur = conn.cursor()
|
|
||||||
# cur.execute("SELECT id FROM covers")
|
|
||||||
# for mbid in cur.fetchall():
|
|
||||||
# done.add(mbid[0])
|
|
||||||
|
|
||||||
api = TaskTrackerApi(TT_API_URL)
|
|
||||||
|
|
||||||
worker = Worker.from_file(api)
|
|
||||||
if not worker:
|
|
||||||
worker = api.make_worker("caa scraper")
|
|
||||||
worker.dump_to_file()
|
|
||||||
worker.request_access(TT_PROJECT, True, True)
|
|
||||||
input("Give permission to " + worker.alias)
|
|
||||||
|
|
||||||
|
|
||||||
def mktask(mbids):
|
|
||||||
res = worker.submit_task(
|
|
||||||
project=TT_PROJECT,
|
|
||||||
recipe=json.dumps(mbids),
|
|
||||||
hash64=hash(mbids[0]),
|
|
||||||
max_assign_time=60 * 30,
|
|
||||||
priority=1,
|
|
||||||
unique_str=None,
|
|
||||||
verification_count=None,
|
|
||||||
max_retries=5,
|
|
||||||
)
|
|
||||||
print(res.text)
|
|
||||||
|
|
||||||
|
|
||||||
def lines():
|
|
||||||
with open("in/release") as f:
|
|
||||||
buf = list()
|
|
||||||
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
|
|
||||||
buf.append(cols[1])
|
|
||||||
if len(buf) == 75:
|
|
||||||
a = list(buf)
|
|
||||||
buf.clear()
|
|
||||||
yield a
|
|
||||||
|
|
||||||
|
|
||||||
pool = ThreadPool(processes=20)
|
|
||||||
pool.map(func=mktask, iterable=lines())
|
|
@ -1,48 +0,0 @@
|
|||||||
import csv
|
|
||||||
import json
|
|
||||||
from multiprocessing.pool import ThreadPool
|
|
||||||
|
|
||||||
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker, LOG_TRACE
|
|
||||||
|
|
||||||
TT_API_URL = "https://tt.simon987.net/api"
|
|
||||||
TT_PROJECT = 1
|
|
||||||
|
|
||||||
api = TaskTrackerApi(TT_API_URL)
|
|
||||||
|
|
||||||
worker = Worker.from_file(api)
|
|
||||||
if not worker:
|
|
||||||
worker = api.make_worker("last.fm scraper")
|
|
||||||
worker.dump_to_file()
|
|
||||||
worker.request_access(TT_PROJECT, True, True)
|
|
||||||
input("Give permission to " + worker.alias)
|
|
||||||
|
|
||||||
with open("repo/artist.csv") as f:
|
|
||||||
reader = csv.reader(f)
|
|
||||||
|
|
||||||
def mktask(lines):
|
|
||||||
res = worker.submit_task(
|
|
||||||
project=TT_PROJECT,
|
|
||||||
recipe=json.dumps(
|
|
||||||
[{"mbid": line[0], "name": line[1]} for line in lines]
|
|
||||||
),
|
|
||||||
unique_str=lines[0][0],
|
|
||||||
max_assign_time=60 * 5,
|
|
||||||
)
|
|
||||||
print(res.text)
|
|
||||||
|
|
||||||
def lines():
|
|
||||||
line_batch = list()
|
|
||||||
|
|
||||||
for line in reader:
|
|
||||||
if "Group" in line[3]:
|
|
||||||
line_batch.append(line)
|
|
||||||
if len(line_batch) >= 30:
|
|
||||||
res = list(line_batch)
|
|
||||||
line_batch.clear()
|
|
||||||
yield res
|
|
||||||
|
|
||||||
tasks = list(lines())
|
|
||||||
|
|
||||||
pool = ThreadPool(processes=25)
|
|
||||||
pool.map(func=mktask, iterable=tasks)
|
|
||||||
|
|
@ -1,48 +0,0 @@
|
|||||||
import csv
|
|
||||||
import json
|
|
||||||
from multiprocessing.pool import ThreadPool
|
|
||||||
|
|
||||||
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
|
|
||||||
|
|
||||||
TT_API_URL = "https://tt.simon987.net/api"
|
|
||||||
TT_PROJECT = 6
|
|
||||||
|
|
||||||
api = TaskTrackerApi(TT_API_URL)
|
|
||||||
|
|
||||||
worker = Worker.from_file(api)
|
|
||||||
if not worker:
|
|
||||||
worker = api.make_worker("mm worker")
|
|
||||||
worker.dump_to_file()
|
|
||||||
worker.request_access(TT_PROJECT, True, True)
|
|
||||||
input("Give permission to " + worker.alias)
|
|
||||||
|
|
||||||
with open("repo/artist.csv") as f:
|
|
||||||
reader = csv.reader(f)
|
|
||||||
|
|
||||||
def mktask(lines):
|
|
||||||
res = worker.submit_task(
|
|
||||||
project=TT_PROJECT,
|
|
||||||
recipe=json.dumps(
|
|
||||||
[{"mbid": line[0], "name": line[1]} for line in lines]
|
|
||||||
),
|
|
||||||
unique_str=lines[0][0],
|
|
||||||
max_assign_time=60 * 5,
|
|
||||||
)
|
|
||||||
print(res.text)
|
|
||||||
|
|
||||||
def lines():
|
|
||||||
line_batch = list()
|
|
||||||
|
|
||||||
for line in reader:
|
|
||||||
line_batch.append(line)
|
|
||||||
if len(line_batch) >= 30:
|
|
||||||
res = list(line_batch)
|
|
||||||
line_batch.clear()
|
|
||||||
yield res
|
|
||||||
|
|
||||||
tasks = list(lines())
|
|
||||||
|
|
||||||
pool = ThreadPool(processes=25)
|
|
||||||
pool.map(func=mktask, iterable=tasks)
|
|
||||||
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
|||||||
import json
|
|
||||||
import sqlite3
|
|
||||||
from multiprocessing.pool import ThreadPool
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
|
|
||||||
|
|
||||||
TT_API_URL = "https://tt.simon987.net/api"
|
|
||||||
TT_PROJECT = 7
|
|
||||||
|
|
||||||
api = TaskTrackerApi(TT_API_URL)
|
|
||||||
|
|
||||||
worker = Worker.from_file(api)
|
|
||||||
if not worker:
|
|
||||||
worker = api.make_worker("mm worker")
|
|
||||||
worker.dump_to_file()
|
|
||||||
worker.request_access(TT_PROJECT, True, True)
|
|
||||||
input("Give permission to " + worker.alias)
|
|
||||||
|
|
||||||
spotids = set()
|
|
||||||
|
|
||||||
with sqlite3.connect(sys.argv[1]) as conn:
|
|
||||||
|
|
||||||
cur = conn.cursor()
|
|
||||||
cur.execute("SELECT data from artist")
|
|
||||||
for row in cur.fetchall():
|
|
||||||
j = json.loads(row[0])
|
|
||||||
if j is None or "artists" not in j or "items" not in j["artists"]:
|
|
||||||
continue
|
|
||||||
for item in j["artists"]["items"]:
|
|
||||||
spotids.add(item["id"])
|
|
||||||
|
|
||||||
|
|
||||||
def mktask(lines):
|
|
||||||
res = worker.submit_task(
|
|
||||||
project=TT_PROJECT,
|
|
||||||
recipe=json.dumps(
|
|
||||||
[{"spotid": line} for line in lines]
|
|
||||||
),
|
|
||||||
unique_str=lines[0],
|
|
||||||
max_assign_time=60 * 5,
|
|
||||||
)
|
|
||||||
print(res.text)
|
|
||||||
|
|
||||||
def ids():
|
|
||||||
id_batch = list()
|
|
||||||
|
|
||||||
for spotid in spotids:
|
|
||||||
id_batch.append(spotid)
|
|
||||||
if len(id_batch) >= 30:
|
|
||||||
res = list(id_batch)
|
|
||||||
id_batch.clear()
|
|
||||||
yield res
|
|
||||||
|
|
||||||
tasks = list(ids())
|
|
||||||
|
|
||||||
pool = ThreadPool(processes=25)
|
|
||||||
pool.map(func=mktask, iterable=tasks)
|
|
||||||
|
|
1
last.fm
1
last.fm
@ -1 +0,0 @@
|
|||||||
Subproject commit 855df64c316930062ff4f7740492d0f039788498
|
|
@ -1,31 +0,0 @@
|
|||||||
import sqlite3
|
|
||||||
|
|
||||||
release_to_release_group_map = dict()
|
|
||||||
release_groups = dict()
|
|
||||||
|
|
||||||
with open("in/release_group") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
release_groups[cols[0]] = cols[1]
|
|
||||||
|
|
||||||
with open("in/release") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
release_to_release_group_map[cols[1]] = release_groups[cols[4]]
|
|
||||||
|
|
||||||
with sqlite3.connect("mapdb.db") as conn:
|
|
||||||
|
|
||||||
cursor = conn.cursor()
|
|
||||||
cursor.execute("CREATE TABLE map (release TEXT PRIMARY KEY , release_group TEXT)")
|
|
||||||
|
|
||||||
for k, v in release_to_release_group_map.items():
|
|
||||||
cursor.execute("INSERT INTO map (release, release_group) VALUES (?,?)", (k, v))
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
"""
|
|
||||||
CREATE TABLE covers (id TEXT primary key, cover BLOB);
|
|
||||||
ATTACH 'mapdb.db' AS map;
|
|
||||||
ATTACH '/mnt/Data8/caa_tn_only.db' AS source;
|
|
||||||
INSERT OR IGNORE INTO covers SELECT release_group, cover FROM source.covers INNER JOIN map.map ON id = map.release;
|
|
||||||
"""
|
|
||||||
|
|
391
mb_scratch.sql
Normal file
391
mb_scratch.sql
Normal file
@ -0,0 +1,391 @@
|
|||||||
|
CREATE OR REPLACE FUNCTION fn_sortname(name text, mb_sortname text) RETURNS text AS
|
||||||
|
$$
|
||||||
|
declare
|
||||||
|
sn text;
|
||||||
|
BEGIN
|
||||||
|
|
||||||
|
sn = regexp_replace(name, '[^a-zA-Z0-9.\-!?&çéàâäëïöü'' ]', '_');
|
||||||
|
|
||||||
|
if length(replace(sn, '_', '')) = 0 then
|
||||||
|
return upper(regexp_replace(mb_sortname, '[^\w.\-!?& ]', '_'));
|
||||||
|
end if;
|
||||||
|
|
||||||
|
return upper(sn);
|
||||||
|
END
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION fn_sortname(name text) RETURNS text AS
|
||||||
|
$$
|
||||||
|
BEGIN
|
||||||
|
return upper(regexp_replace(name, '[^a-zA-Z0-9.\-!?&çéàâäëïöü'' ]', '_'));
|
||||||
|
END
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
CREATE TABLE mg.translate_artist_artist_rel
|
||||||
|
(
|
||||||
|
mb_name TEXT PRIMARY KEY,
|
||||||
|
mg_name TEXT
|
||||||
|
);
|
||||||
|
INSERT INTO mg.translate_artist_artist_rel
|
||||||
|
VALUES ('teacher', 'TEACHER_OF'),
|
||||||
|
('composer-in-residence', 'HAS_COMPOSER-IN-RESIDENCE_STATUS_IN'),
|
||||||
|
('member of band', 'IS_MEMBER_OF'),
|
||||||
|
('voice actor', 'IS_VOICE_ACTOR_OF'),
|
||||||
|
('tribute', 'IS_TRIBUTE_TO'),
|
||||||
|
('supporting musician', 'IS_SUPPORTING_MUSICIAN_OF'),
|
||||||
|
('instrumental supporting musician', 'IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF'),
|
||||||
|
('personal relationship', 'HAS_PERSONAL_RELATIONSHIP_WITH'),
|
||||||
|
('musical relationships', 'HAS_MUSICAL_RELATIONSHIP_WITH'),
|
||||||
|
('collaboration', 'HAS_COLLABORATED_WITH'),
|
||||||
|
('married', 'IS_MARRIED_WITH'),
|
||||||
|
('sibling', 'IS_SIBLING_OF'),
|
||||||
|
('parent', 'IS_PARENT_OF'),
|
||||||
|
('is person', 'IS'),
|
||||||
|
('conductor position', 'IS_CONDUCTOR_OF'),
|
||||||
|
('vocal supporting musician', 'DOES_VOCAL_SUPPORT_FOR'),
|
||||||
|
('artistic director', 'IS_ARTIST_DIRECTOR_OF'),
|
||||||
|
('subgroup', 'IS_SUBGROUP_OF'),
|
||||||
|
('founder', 'IS_FOUNDER_OF'),
|
||||||
|
('involved with', 'IS_INVOLVED_WITH'),
|
||||||
|
('named after', 'IS_NAMED_AFTER');
|
||||||
|
|
||||||
|
CREATE TABLE mg.translate_artist_release_rel
|
||||||
|
(
|
||||||
|
mb_name TEXT PRIMARY KEY,
|
||||||
|
mg_name text
|
||||||
|
);
|
||||||
|
INSERT INTO mg.translate_artist_release_rel
|
||||||
|
VALUES ('translator', 'TRANSLATED'),
|
||||||
|
('liner notes', 'WROTE_LINER_NOTES'),
|
||||||
|
('lyricist', 'IS_LYRICIST_FOR'),
|
||||||
|
('lacquer cut', 'DID_LACQUER_CUT_FOR'),
|
||||||
|
('samples from artist', 'HAS_SAMPLES_IN'),
|
||||||
|
('remixes and compilations', NULL),
|
||||||
|
('composition', 'COMPOSED'),
|
||||||
|
('booking', 'DID_BOOKING_FOR'),
|
||||||
|
('balance', 'DID_BALANCE_FOR'),
|
||||||
|
('misc', 'HAS_MISC_ROLE_IN'),
|
||||||
|
('conductor', 'CONDUCTED'),
|
||||||
|
('legal representation', 'PROVIDED_LEGAL_REPRESENTATION_FOR'),
|
||||||
|
('design/illustration', 'DID_DESIGN_FOR'),
|
||||||
|
('performing orchestra', 'PERFORMED_FOR'),
|
||||||
|
('producer', 'PRODUCED'),
|
||||||
|
('instrument', 'PERFORMED_INSTRUMENT_FOR'),
|
||||||
|
('writer', 'WROTE_LYRICS_FOR'),
|
||||||
|
('production', 'DID_PRODUCTION_FOR'),
|
||||||
|
('performance', 'PERFORMED_FOR'),
|
||||||
|
('composer', 'IS_COMPOSER_FOR'),
|
||||||
|
('sound', 'DID_SOUND_FOR'),
|
||||||
|
('remixer', 'DID_REMIXING_FOR'),
|
||||||
|
('orchestrator', 'IS_ORCHESTRATOR_FOR'),
|
||||||
|
('compiler', 'DID_COMPILATION_FOR'),
|
||||||
|
('vocal arranger', 'IS_ARRANGER_FOR'),
|
||||||
|
('arranger', 'IS_ARRENGER_FOR'),
|
||||||
|
('mix-DJ', 'MIXED'),
|
||||||
|
('editor', 'IS_EDITOR_FOR'),
|
||||||
|
('illustration', 'DID_ILLUSTRATION_FOR'),
|
||||||
|
('audio', 'DID_AUDIO_FOR'),
|
||||||
|
('publishing', 'IS_PUBLISHER_FOR'),
|
||||||
|
('art direction', 'DID_ART_DIRECTOR_FOR'),
|
||||||
|
('design', 'DID_DESIGN_FOR'),
|
||||||
|
('instrument arranger', 'IS_ARRANGER_FOR'),
|
||||||
|
('chorus master', 'IS_CHORUS_MASTER_FOR'),
|
||||||
|
('photography', 'DID_PHOTOGRAPHY_FOR'),
|
||||||
|
('performer', 'PERFORMED_IN'),
|
||||||
|
('graphic design', 'DID_GRAPHIC_DESIGN_FOR'),
|
||||||
|
('booklet editor', 'IS_BOOKLET_EDITOR_FOR'),
|
||||||
|
('programming', 'DID_PROGRAMING_FOR'),
|
||||||
|
('copyright', 'IS_COPYRIGHT_HOLDER_OF'),
|
||||||
|
('piano technician', 'IS_PIANO_TECNICIAN_FOR'),
|
||||||
|
('phonographic copyright', 'IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF'),
|
||||||
|
('mastering', 'DID_MASTERING_FOR'),
|
||||||
|
('vocal', 'PERFORED_VOCALS_FOR'),
|
||||||
|
('librettist', 'IS_LIBRETTIST_FOR'),
|
||||||
|
('mix', 'MIXED'),
|
||||||
|
('recording', 'DID_RECORDING_FOR'),
|
||||||
|
('concertmaster', 'IS_CONCERTMASTER_FOR'),
|
||||||
|
('engineer', 'IS_ENGINEER_FOR'),
|
||||||
|
('tribute', 'IS_TRIBUTE_TO'),
|
||||||
|
('dedicated to', 'IS_DEDICATED_TO'),
|
||||||
|
('creative direction', NULL),
|
||||||
|
('artists and repertoire', NULL);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE mg.translate_label_label_rel
|
||||||
|
(
|
||||||
|
mb_name TEXT PRIMARY KEY,
|
||||||
|
mg_name text
|
||||||
|
);
|
||||||
|
INSERT INTO mg.translate_label_label_rel
|
||||||
|
VALUES ('label rename', 'WAS_RENAMED_TO'),
|
||||||
|
('imprint', 'DOES_IMPRINT_FOR'),
|
||||||
|
('label distribution', 'DOES_DISTRIBUTION_FOR'),
|
||||||
|
('business association', 'HAS_BUSINESS_ASSOCIATION_TO'),
|
||||||
|
('label ownership', 'OWNS'),
|
||||||
|
('label reissue', 'DOES_REISSUING_FOR');
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.artist AS
|
||||||
|
SELECT gid as "id:ID(Artist)",
|
||||||
|
name,
|
||||||
|
fn_sortname(name, sort_name) as sortname,
|
||||||
|
COALESCE(begin_date_year, 0) as "year:int",
|
||||||
|
comment,
|
||||||
|
(CASE WHEN type = 2 THEN 'Group' ELSE 'Artist' END) as ":LABEL"
|
||||||
|
FROM artist;
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.artist_artist AS
|
||||||
|
SELECT a0.gid as ":START_ID(Artist)",
|
||||||
|
a1.gid as ":END_ID(Artist)",
|
||||||
|
t.mg_name as ":TYPE"
|
||||||
|
FROM l_artist_artist
|
||||||
|
INNER JOIN artist a0 ON entity0 = a0.id
|
||||||
|
INNER JOIN artist a1 ON entity1 = a1.id
|
||||||
|
INNER JOIN link l on l.id = l_artist_artist.link
|
||||||
|
INNER JOIN link_type lt ON lt.id = l.link_type
|
||||||
|
INNER JOIN mg.translate_artist_artist_rel t ON t.mb_name = lt.name;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.release AS
|
||||||
|
SELECT release_group.gid as ":id:ID(Release)",
|
||||||
|
release_group.name,
|
||||||
|
m.first_release_date_year as "year:int",
|
||||||
|
CONCAT('Release;', t.name) as ":LABEL"
|
||||||
|
FROM release_group
|
||||||
|
INNER JOIN release_group_meta m ON m.id = release_group.id
|
||||||
|
INNER JOIN release_group_primary_type t ON t.id = release_group.type;
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.artist_release AS
|
||||||
|
SELECT a.gid as ":START_ID(Artist)",
|
||||||
|
rg.gid as ":END_ID(Release)",
|
||||||
|
t.mg_name as ":TYPE"
|
||||||
|
FROM l_artist_release_group
|
||||||
|
INNER JOIN artist a on a.id = l_artist_release_group.entity0
|
||||||
|
INNER JOIN release_group rg on rg.id = l_artist_release_group.entity1
|
||||||
|
INNER JOIN link l on l.id = l_artist_release_group.link
|
||||||
|
INNER JOIN link_type lt ON lt.id = l.link_type
|
||||||
|
INNER JOIN mg.translate_artist_release_rel t ON t.mb_name = lt.name
|
||||||
|
UNION ALL
|
||||||
|
SELECT a.gid as ":START_ID(Artist)",
|
||||||
|
rg.gid as ":END_ID(Release)",
|
||||||
|
t.mg_name as ":TYPE"
|
||||||
|
FROM l_artist_release
|
||||||
|
INNER JOIN artist a on a.id = l_artist_release.entity0
|
||||||
|
INNER JOIN release r on r.id = l_artist_release.entity1
|
||||||
|
INNER JOIN release_group rg on rg.id = r.release_group
|
||||||
|
INNER JOIN link l on l.id = l_artist_release.link
|
||||||
|
INNER JOIN link_type lt ON lt.id = l.link_type
|
||||||
|
INNER JOIN mg.translate_artist_release_rel t ON t.mb_name = lt.name
|
||||||
|
UNION ALL
|
||||||
|
SELECT a.gid as ":START_ID(Artist)",
|
||||||
|
rg.gid as ":END_ID(Release)",
|
||||||
|
'CREDITED_FOR' as ":TYPE"
|
||||||
|
FROM release
|
||||||
|
INNER JOIN artist_credit_name cn ON cn.artist_credit = release.artist_credit
|
||||||
|
INNER JOIN artist a on a.id = cn.artist
|
||||||
|
INNER JOIN release_group rg on rg.id = release.release_group;
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.tag AS
|
||||||
|
WITH occurences AS (
|
||||||
|
SELECT tag, COUNT(*) as count
|
||||||
|
FROM (
|
||||||
|
SELECT tag
|
||||||
|
FROM release_group_tag
|
||||||
|
UNION ALL
|
||||||
|
SELECT tag
|
||||||
|
FROM release_tag
|
||||||
|
) as tags
|
||||||
|
GROUP BY tag
|
||||||
|
)
|
||||||
|
SELECT tag.id as "id:ID(Tag)",
|
||||||
|
tag.name,
|
||||||
|
occurences.count as "occurences:int"
|
||||||
|
FROM tag
|
||||||
|
INNER JOIN occurences ON occurences.tag = tag.id
|
||||||
|
WHERE ref_count > 0
|
||||||
|
AND occurences.count > 5;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.release_tag AS
|
||||||
|
SELECT rg.gid as ":START_ID(Release)",
|
||||||
|
release_group_tag.tag as ":END_ID(Tag)",
|
||||||
|
greatest(least(release_group_tag.count::float / 6, 1), 0.2) as "weight:float"
|
||||||
|
FROM release_group_tag
|
||||||
|
INNER JOIN release_group rg ON rg.id = release_group_tag.release_group
|
||||||
|
INNER JOIN mg.tag t ON t."id:ID(Tag)" = release_group_tag.tag
|
||||||
|
WHERE release_group_tag.count > 0
|
||||||
|
UNION ALL
|
||||||
|
SELECT rg.gid as ":START_ID(Release)",
|
||||||
|
release_tag.tag as ":END_ID(Tag)",
|
||||||
|
greatest(least(release_tag.count::float / 6, 1), 0.2) as "weight:float"
|
||||||
|
FROM release_tag
|
||||||
|
INNER JOIN release r ON r.id = release_tag.release
|
||||||
|
INNER JOIN release_group rg ON rg.id = r.release_group
|
||||||
|
INNER JOIN mg.tag t ON t."id:ID(Tag)" = release_tag.tag
|
||||||
|
WHERE release_tag.count > 0;
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.artist_tag AS
|
||||||
|
SELECT a.gid as ":START_ID(Artist)",
|
||||||
|
artist_tag.tag as ":END_ID(Tag)",
|
||||||
|
greatest(least(artist_tag.count::float / 8, 1), 0.2) as "weight:float"
|
||||||
|
FROM artist_tag
|
||||||
|
INNER JOIN artist a on artist_tag.artist = a.id
|
||||||
|
INNER JOIN mg.tag t ON t."id:ID(Tag)" = artist_tag.tag
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.tag_tag AS
|
||||||
|
SELECT tag_relation.tag1 as ":START_ID(Tag)",
|
||||||
|
tag_relation.tag2 as ":END_ID(Tag)",
|
||||||
|
greatest(least(tag_relation.weight::float / 12, 1), 0.2) as "weight:float"
|
||||||
|
FROM tag_relation;
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.label AS
|
||||||
|
SELECT label.gid as "id:ID(Label)",
|
||||||
|
label.name,
|
||||||
|
fn_sortname(label.name) as sortname,
|
||||||
|
-- label_code as code,
|
||||||
|
concat('Label;', lt.name) as ":LABEL"
|
||||||
|
FROM label
|
||||||
|
INNER JOIN label_type lt on label.type = lt.id;
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.release_label AS
|
||||||
|
SELECT l.gid as ":START_ID(Release)",
|
||||||
|
r.gid as ":END_ID(Label)"
|
||||||
|
FROM l_label_release
|
||||||
|
INNER JOIN label l on l_label_release.entity0 = l.id
|
||||||
|
INNER JOIN release r on l_label_release.entity1 = r.id;
|
||||||
|
-- UNION
|
||||||
|
-- SELECT l.gid as ":START_ID(Release)",
|
||||||
|
-- r.gid as ":END_ID(Label)"
|
||||||
|
-- FROM l_label_release_group
|
||||||
|
-- INNER JOIN label l on l_label_release_group.entity0 = l.id
|
||||||
|
-- INNER JOIN release_group rg on l_label_release_group.entity1 = rg.id
|
||||||
|
-- INNER JOIN release r on r.release_group = rg.id
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW mg.label_label AS
|
||||||
|
SELECT l0.gid as ":START_ID(Label)",
|
||||||
|
l1.gid as ":END_ID(Label)",
|
||||||
|
t.mg_name as ":TYPE"
|
||||||
|
FROM l_label_label
|
||||||
|
INNER JOIN label l0 on l_label_label.entity0 = l0.id
|
||||||
|
INNER JOIN label l1 on l_label_label.entity1 = l1.id
|
||||||
|
INNER JOIN link l on l.id = l_label_label.link
|
||||||
|
INNER JOIN link_type lt ON lt.id = l.link_type
|
||||||
|
INNER JOIN mg.translate_label_label_rel t ON t.mb_name = lt.name
|
||||||
|
|
||||||
|
|
||||||
|
--------------
|
||||||
|
|
||||||
|
CREATE TABLE mg.covers
|
||||||
|
(
|
||||||
|
mbid uuid PRIMARY KEY,
|
||||||
|
ts timestamp DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
tn bytea
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.lastfm_artist
|
||||||
|
(
|
||||||
|
name TEXT PRIMARY KEY,
|
||||||
|
mbid uuid
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.lastfm_raw_data
|
||||||
|
(
|
||||||
|
name TEXT,
|
||||||
|
mbid uuid,
|
||||||
|
ts timestamp DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
data jsonb,
|
||||||
|
PRIMARY KEY (name, mbid)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.lastfm_artist_meta
|
||||||
|
(
|
||||||
|
name TEXT PRIMARY KEY,
|
||||||
|
listeners int,
|
||||||
|
playcount int
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.lastfm_artist_tag
|
||||||
|
(
|
||||||
|
name TEXT,
|
||||||
|
tag TEXT,
|
||||||
|
PRIMARY KEY (name, tag)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.lastfm_artist_artist
|
||||||
|
(
|
||||||
|
name0 TEXT,
|
||||||
|
name1 TEXT,
|
||||||
|
weight float,
|
||||||
|
ts timestamp DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
PRIMARY KEY (name0, name1)
|
||||||
|
);
|
||||||
|
|
||||||
|
--------------
|
||||||
|
|
||||||
|
CREATE TABLE mg.spotify_artist
|
||||||
|
(
|
||||||
|
spotid TEXT PRIMARY KEY,
|
||||||
|
mbid UUID UNIQUE
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.spotify_artist_meta
|
||||||
|
(
|
||||||
|
spotid TEXT PRIMARY KEY,
|
||||||
|
name TEXT,
|
||||||
|
followers int,
|
||||||
|
popularity int,
|
||||||
|
ts timestamp DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.spotify_artist_tag
|
||||||
|
(
|
||||||
|
spotid TEXT,
|
||||||
|
tag TEXT,
|
||||||
|
PRIMARY KEY (spotid, tag)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.spotify_artist_album
|
||||||
|
(
|
||||||
|
spotid TEXT,
|
||||||
|
album TEXT,
|
||||||
|
PRIMARY KEY (spotid, album)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.spotify_artist_track
|
||||||
|
(
|
||||||
|
spotid TEXT,
|
||||||
|
track TEXT,
|
||||||
|
url TEXT,
|
||||||
|
PRIMARY KEY (spotid, track)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE mg.spotify_artist_artist
|
||||||
|
(
|
||||||
|
spotid0 TEXT,
|
||||||
|
spotid1 TEXT,
|
||||||
|
index int,
|
||||||
|
ts timestamp DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
PRIMARY KEY (spotid0, spotid1)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE mg.spotify_raw_data
|
||||||
|
(
|
||||||
|
query TEXT,
|
||||||
|
endpoint TEXT,
|
||||||
|
ts timestamp DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
data jsonb,
|
||||||
|
PRIMARY KEY (query, endpoint)
|
||||||
|
);
|
||||||
|
|
||||||
|
--------
|
||||||
|
CREATE OR REPLACE FUNCTION asciifold(text) RETURNS text
|
||||||
|
AS
|
||||||
|
'/pglib/libasciifolding.so',
|
||||||
|
'asciifold' LANGUAGE C STRICT;
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION asciifold_lower(text) RETURNS text
|
||||||
|
AS
|
||||||
|
'/pglib/libasciifolding.so',
|
||||||
|
'asciifold_lower' LANGUAGE C STRICT;
|
@ -1,102 +0,0 @@
|
|||||||
import csv
|
|
||||||
import json
|
|
||||||
import sqlite3
|
|
||||||
from collections import defaultdict
|
|
||||||
import sys
|
|
||||||
|
|
||||||
artists = set()
|
|
||||||
|
|
||||||
|
|
||||||
def disambiguate(lfm_artist, artist_release_count, name, mbid):
|
|
||||||
existing_mbid = lfm_artist.get(name, None)
|
|
||||||
|
|
||||||
if existing_mbid and mbid != existing_mbid:
|
|
||||||
if artist_release_count[existing_mbid] < artist_release_count[mbid]:
|
|
||||||
|
|
||||||
lfm_artist[name] = mbid
|
|
||||||
|
|
||||||
# print("Replacing %s (%s) with %s (%d) for %s" %
|
|
||||||
# (existing_mbid, artist_release_count[existing_mbid],
|
|
||||||
# mbid, artist_release_count[mbid],
|
|
||||||
# name))
|
|
||||||
else:
|
|
||||||
lfm_artist[name] = mbid
|
|
||||||
|
|
||||||
|
|
||||||
def patch(lastfm_data):
|
|
||||||
|
|
||||||
artist_listeners = dict()
|
|
||||||
lastfm_artist_to_mbid = dict()
|
|
||||||
artist_release_count = defaultdict(int)
|
|
||||||
related = list()
|
|
||||||
|
|
||||||
with open("repo/artist_release.csv") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split(',')
|
|
||||||
artist_release_count[cols[0]] += 1
|
|
||||||
|
|
||||||
with sqlite3.connect(lastfm_data) as conn:
|
|
||||||
cur = conn.cursor()
|
|
||||||
cur.execute("SELECT data FROM lastfmdata", )
|
|
||||||
data = list(cur.fetchall())
|
|
||||||
|
|
||||||
# A lastfm artist name can refer to multiple MBIDs
|
|
||||||
# For RELATED_TO purposes, we assume that the MBID referring
|
|
||||||
# to the artist with the most official releases is the one
|
|
||||||
|
|
||||||
for row in data:
|
|
||||||
meta = json.loads(row[0])
|
|
||||||
|
|
||||||
disambiguate(lastfm_artist_to_mbid, artist_release_count, meta["name"], meta["artist"])
|
|
||||||
|
|
||||||
for similar in [s for s in meta["similar"] if s["mbid"] is not None]:
|
|
||||||
disambiguate(lastfm_artist_to_mbid, artist_release_count, similar["name"], similar["mbid"])
|
|
||||||
|
|
||||||
# Get related links & listener counts
|
|
||||||
for row in data:
|
|
||||||
meta = json.loads(row[0])
|
|
||||||
|
|
||||||
artist_listeners[lastfm_artist_to_mbid[meta["name"]]] = \
|
|
||||||
(meta["listeners"], meta["playcount"])
|
|
||||||
|
|
||||||
for similar in [s for s in meta["similar"] if s["mbid"] is not None]:
|
|
||||||
related.append((
|
|
||||||
lastfm_artist_to_mbid[similar["name"]],
|
|
||||||
lastfm_artist_to_mbid[meta["name"]],
|
|
||||||
similar["match"]
|
|
||||||
))
|
|
||||||
|
|
||||||
with open("repo/lastfm_artist.csv", "w") as out:
|
|
||||||
writer = csv.writer(out)
|
|
||||||
writer.writerow([
|
|
||||||
"id:ID(Artist)", "name", "sortname", "year:short", "comment", ":LABEL", "listeners:int", "playcount:int"
|
|
||||||
])
|
|
||||||
|
|
||||||
with open("repo/artist.csv") as f:
|
|
||||||
reader = csv.reader(f)
|
|
||||||
|
|
||||||
reader.__next__() # Skip header
|
|
||||||
for row in reader:
|
|
||||||
writer.writerow([
|
|
||||||
row[0],
|
|
||||||
row[1],
|
|
||||||
row[2],
|
|
||||||
row[3],
|
|
||||||
row[4],
|
|
||||||
row[5],
|
|
||||||
artist_listeners.get(row[0], (0, 0))[0],
|
|
||||||
artist_listeners.get(row[0], (0, 0))[1],
|
|
||||||
])
|
|
||||||
artists.add(row[0])
|
|
||||||
|
|
||||||
with open("repo/lastfm_artist_artist.csv", "w") as out:
|
|
||||||
out.write(",".join((
|
|
||||||
":START_ID(Artist)", ":END_ID(Artist)", "weight:float"
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
for x in related:
|
|
||||||
if x[0] in artists and x[1] in artists:
|
|
||||||
out.write(",".join(x) + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
patch(sys.argv[1])
|
|
@ -1,466 +0,0 @@
|
|||||||
import os
|
|
||||||
from collections import defaultdict
|
|
||||||
import re
|
|
||||||
from statistics import median
|
|
||||||
|
|
||||||
links = dict()
|
|
||||||
link_types = dict()
|
|
||||||
areas = dict()
|
|
||||||
labels = dict()
|
|
||||||
label_types = {
|
|
||||||
"\\N": ""
|
|
||||||
}
|
|
||||||
release_groups = dict()
|
|
||||||
release_statuses = dict()
|
|
||||||
release_to_release_group_map = dict()
|
|
||||||
release_types = {
|
|
||||||
"\\N": "",
|
|
||||||
}
|
|
||||||
artists = dict()
|
|
||||||
tags = dict()
|
|
||||||
|
|
||||||
release_release_rel_map = {
|
|
||||||
"covers and versions": "",
|
|
||||||
"remixes and compilations": "",
|
|
||||||
"DJ-mix": "IS_DJ_MIX_OF",
|
|
||||||
"live performance": "IS_LIVE_PERFORMANCE_OF",
|
|
||||||
"cover": "IS_COVER_OF",
|
|
||||||
"remix": "IS_REMIX_OF",
|
|
||||||
"mashes up": "IS_MASHUP_OF",
|
|
||||||
"included in": "INCLUDED_IN",
|
|
||||||
"single from": "IS_SINGLE_FROM"
|
|
||||||
}
|
|
||||||
|
|
||||||
artist_release_rel_map = {
|
|
||||||
"translator": "TRANSLATED",
|
|
||||||
"liner notes": "WROTE_LINER_NOTES",
|
|
||||||
"lyricist": "IS_LYRICIST_FOR",
|
|
||||||
"lacquer cut": "DID_LACQUER_CUT_FOR",
|
|
||||||
"samples from artist": "HAS_SAMPLES_IN",
|
|
||||||
"remixes and compilations": "",
|
|
||||||
"composition": "COMPOSED",
|
|
||||||
"booking": "DID_BOOKING_FOR",
|
|
||||||
"balance": "DID_BALANCE_FOR",
|
|
||||||
"misc": "HAS_MISC_ROLE_IN",
|
|
||||||
"conductor": "CONDUCTED",
|
|
||||||
"legal representation": "PROVIDED_LEGAL_REPRESENTATION_FOR",
|
|
||||||
"design/illustration": "DID_DESIGN_FOR",
|
|
||||||
"performing orchestra": "PERFORMED_FOR",
|
|
||||||
"producer": "PRODUCED",
|
|
||||||
"instrument": "PERFORMED_INSTRUMENT_FOR",
|
|
||||||
"writer": "WROTE_LYRICS_FOR",
|
|
||||||
"production": "DID_PRODUCTION_FOR",
|
|
||||||
"performance": "PERFORMED_FOR",
|
|
||||||
"composer": "IS_COMPOSER_FOR",
|
|
||||||
"sound": "DID_SOUND_FOR",
|
|
||||||
"remixer": "DID_REMIXING_FOR",
|
|
||||||
"orchestrator": "IS_ORCHESTRATOR_FOR",
|
|
||||||
"compiler": "DID_COMPILATION_FOR",
|
|
||||||
"vocal arranger": "IS_ARRANGER_FOR",
|
|
||||||
"arranger": "IS_ARRENGER_FOR",
|
|
||||||
"mix-DJ": "MIXED",
|
|
||||||
"editor": "IS_EDITOR_FOR",
|
|
||||||
"illustration": "DID_ILLUSTRATION_FOR",
|
|
||||||
"audio": "DID_AUDIO_FOR",
|
|
||||||
"publishing": "IS_PUBLISHER_FOR",
|
|
||||||
"art direction": "DID_ART_DIRECTOR_FOR",
|
|
||||||
"design": "DID_DESIGN_FOR",
|
|
||||||
"instrument arranger": "IS_ARRANGER_FOR",
|
|
||||||
"chorus master": "IS_CHORUS_MASTER_FOR",
|
|
||||||
"photography": "DID_PHOTOGRAPHY_FOR",
|
|
||||||
"performer": "PERFORMED_IN",
|
|
||||||
"graphic design": "DID_GRAPHIC_DESIGN_FOR",
|
|
||||||
"booklet editor": "IS_BOOKLET_EDITOR_FOR",
|
|
||||||
"programming": "DID_PROGRAMING_FOR",
|
|
||||||
"copyright": "IS_COPYRIGHT_HOLDER_OF",
|
|
||||||
"piano technician": "IS_PIANO_TECNICIAN_FOR",
|
|
||||||
"phonographic copyright": "IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF",
|
|
||||||
"mastering": "DID_MASTERING_FOR",
|
|
||||||
"vocal": "PERFORED_VOCALS_FOR",
|
|
||||||
"librettist": "IS_LIBRETTIST_FOR",
|
|
||||||
"mix": "MIXED",
|
|
||||||
"recording": "DID_RECORDING_FOR",
|
|
||||||
"concertmaster": "IS_CONCERTMASTER_FOR",
|
|
||||||
"engineer": "IS_ENGINEER_FOR",
|
|
||||||
|
|
||||||
# release_group
|
|
||||||
"tribute": "IS_TRIBUTE_TO",
|
|
||||||
"dedicated to": "IS_DEDICATED_TO",
|
|
||||||
"creative direction": "",
|
|
||||||
"artists and repertoire": ""
|
|
||||||
}
|
|
||||||
|
|
||||||
artist_artist_rel_map = {
|
|
||||||
"teacher": "TEACHER_OF",
|
|
||||||
"composer-in-residence": "HAS_COMPOSER-IN-RESIDENCE_STATUS_IN",
|
|
||||||
"member of band": "IS_MEMBER_OF",
|
|
||||||
"voice actor": "IS_VOICE_ACTOR_OF",
|
|
||||||
"tribute": "IS_TRIBUTE_TO",
|
|
||||||
"supporting musician": "IS_SUPPORTING_MUSICIAN_OF",
|
|
||||||
"instrumental supporting musician": "IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF",
|
|
||||||
"personal relationship": "HAS_PERSONAL_RELATIONSHIP_WITH",
|
|
||||||
"musical relationships": "HAS_MUSICAL_RELATIONSHIP_WITH",
|
|
||||||
"collaboration": "HAS_COLLABORATED_WITH",
|
|
||||||
"married": "IS_MARRIED_WITH",
|
|
||||||
"sibling": "IS_SIBLING_OF",
|
|
||||||
"parent": "IS_PARENT_OF",
|
|
||||||
"is person": "IS",
|
|
||||||
"conductor position": "IS_CONDUCTOR_OF",
|
|
||||||
"vocal supporting musician": "DOES_VOCAL_SUPPORT_FOR",
|
|
||||||
"artistic director": "IS_ARTIST_DIRECTOR_OF",
|
|
||||||
"subgroup": "IS_SUBGROUP_OF",
|
|
||||||
"founder": "IS_FOUNDER_OF",
|
|
||||||
"involved with": "IS_INVOLVED_WITH",
|
|
||||||
"named after": "IS_NAMED_AFTER",
|
|
||||||
}
|
|
||||||
|
|
||||||
label_label_rel_map = {
|
|
||||||
"label rename": "WAS_RENAMED_TO",
|
|
||||||
"imprint": "DOES_IMPRINT_FOR",
|
|
||||||
"label distribution": "DOES_DISTRIBUTION_FOR",
|
|
||||||
"business association": "HAS_BUSINESS_ASSOCIATION_TO",
|
|
||||||
"label ownership": "OWNS",
|
|
||||||
"label reissue": "DOES_REISSUING_FOR"
|
|
||||||
}
|
|
||||||
|
|
||||||
if not os.path.exists("repo"):
|
|
||||||
os.mkdir("repo")
|
|
||||||
else:
|
|
||||||
os.system("rm repo/*")
|
|
||||||
if not os.path.exists("tmp"):
|
|
||||||
os.mkdir("tmp")
|
|
||||||
else:
|
|
||||||
os.system("rm tmp/*")
|
|
||||||
|
|
||||||
with open("in/link", "r") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
links[cols[0]] = cols
|
|
||||||
|
|
||||||
with open("in/release_status", "r") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
release_statuses[cols[0]] = cols
|
|
||||||
|
|
||||||
with open("in/link_type", "r") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
link_types[cols[0]] = cols
|
|
||||||
|
|
||||||
with open("in/area", "r") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
areas[cols[0]] = cols
|
|
||||||
|
|
||||||
with open("in/label_type") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
|
|
||||||
label_types[cols[0]] = ";" + cols[1].replace(" ", "")
|
|
||||||
|
|
||||||
if cols[3] != "\\N" and cols[2] in label_types:
|
|
||||||
label_types[cols[0]] += label_types[cols[2]].replace(" ", "")
|
|
||||||
|
|
||||||
with open("in/artist") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
artists[cols[0]] = cols
|
|
||||||
|
|
||||||
with open("repo/area_area.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Area),:END_ID(Area)\n")
|
|
||||||
|
|
||||||
with open("in/l_area_area", "r") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
out.write(",".join((areas[cols[3]][1],
|
|
||||||
areas[cols[2]][1]
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
with open("repo/area.csv", "w") as out:
|
|
||||||
out.write("id:ID(Area),name\n")
|
|
||||||
|
|
||||||
for k, area in areas.items():
|
|
||||||
out.write(",".join((area[1],
|
|
||||||
'"' + area[2] + '"'
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
# ------
|
|
||||||
|
|
||||||
|
|
||||||
out_artist = open("repo/artist.csv", "w")
|
|
||||||
out_artist_area = open("repo/artist_area.csv", "w")
|
|
||||||
|
|
||||||
out_artist.write("id:ID(Artist),name,sortname,year:int,comment,:LABEL\n")
|
|
||||||
out_artist_area.write(":START_ID(Artist),:END_ID(Area)\n")
|
|
||||||
|
|
||||||
ASCII_RE = re.compile(r"[^a-zA-Z0-9.\-!?& ]")
|
|
||||||
ALPHANUM_RE = re.compile(r"[^\w.\-!?& ]")
|
|
||||||
|
|
||||||
for _, artist in artists.items():
|
|
||||||
|
|
||||||
sortname = ASCII_RE.sub("_", artist[2]).upper()
|
|
||||||
if sortname.replace("_", "").strip() == "":
|
|
||||||
sortname = ALPHANUM_RE.sub("_", artist[3]).upper()
|
|
||||||
|
|
||||||
out_artist.write(",".join((
|
|
||||||
artist[1],
|
|
||||||
'"' + artist[2].replace("\"", "\"\"") + '"',
|
|
||||||
sortname,
|
|
||||||
artist[4] if artist[4] != "\\N" else "0",
|
|
||||||
('"' + artist[13].replace("\"", "\"\"") + '"') if artist[13] != "\\N" else "",
|
|
||||||
"Artist" + (";Group\n" if artist[10] == "2" else "\n")
|
|
||||||
)))
|
|
||||||
|
|
||||||
if artist[11] != "\\N":
|
|
||||||
out_artist_area.write(artist[1] + "," + areas[artist[11]][1] + "\n")
|
|
||||||
|
|
||||||
out_artist.close()
|
|
||||||
out_artist_area.close()
|
|
||||||
|
|
||||||
with open("repo/artist_artist.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Artist),:END_ID(Artist),:TYPE\n")
|
|
||||||
|
|
||||||
with open("in/l_artist_artist", "r") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
out.write(",".join((
|
|
||||||
artists[cols[2]][1],
|
|
||||||
artists[cols[3]][1],
|
|
||||||
artist_artist_rel_map[link_types[links[cols[1]][1]][6]] + "\n"
|
|
||||||
)))
|
|
||||||
|
|
||||||
# --------
|
|
||||||
|
|
||||||
with open("in/release_group_primary_type") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
release_types[cols[0]] = ";" + cols[1]
|
|
||||||
|
|
||||||
release_group_year = dict()
|
|
||||||
with open("in/release_group_meta") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
release_group_year[cols[0]] = cols[2] if cols[2] != "\\N" else "0"
|
|
||||||
|
|
||||||
with open("repo/release.csv", "w") as out:
|
|
||||||
out.write("id:ID(Release),name,year:int,:LABEL\n")
|
|
||||||
|
|
||||||
with open("in/release_group") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
out.write(",".join((
|
|
||||||
cols[1],
|
|
||||||
'"' + cols[2].replace("\"", "\"\"") + '"',
|
|
||||||
release_group_year[cols[0]],
|
|
||||||
"Release" + release_types[cols[4]],
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
release_groups[cols[0]] = cols
|
|
||||||
|
|
||||||
with open("in/release") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
if cols[5] != '\\N' and release_statuses[cols[5]][1] == "Official":
|
|
||||||
release_to_release_group_map[cols[0]] = cols[4]
|
|
||||||
|
|
||||||
credit_names = defaultdict(list)
|
|
||||||
|
|
||||||
with open("in/artist_credit_name") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
credit_names[cols[0]].append(artists[cols[2]][1])
|
|
||||||
|
|
||||||
with open("tmp/tmp_artist_release.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Artist),:END_ID(Release),:TYPE\n")
|
|
||||||
|
|
||||||
# Is this part really necessary?
|
|
||||||
with open("in/l_artist_release") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
if cols[3] in release_to_release_group_map:
|
|
||||||
out.write(",".join((
|
|
||||||
artists[cols[2]][1],
|
|
||||||
release_groups[release_to_release_group_map[cols[3]]][1],
|
|
||||||
artist_release_rel_map[link_types[links[cols[1]][1]][6]]
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
# Artist credits
|
|
||||||
with open("in/release") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
if cols[0] in release_to_release_group_map:
|
|
||||||
for credit in credit_names[cols[3]]:
|
|
||||||
out.write(",".join((
|
|
||||||
credit,
|
|
||||||
release_groups[release_to_release_group_map[cols[0]]][1],
|
|
||||||
"CREDITED_FOR"
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
# Remove dupes
|
|
||||||
os.system("(head -n 1 tmp/tmp_artist_release.csv && tail -n +2 tmp/tmp_artist_release.csv"
|
|
||||||
" | sort) | uniq > repo/artist_release.csv && rm tmp/tmp_artist_release.csv")
|
|
||||||
|
|
||||||
|
|
||||||
with open("repo/release_release.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Release),:END_ID(Release),:TYPE\n")
|
|
||||||
|
|
||||||
with open("in/l_release_group_release_group") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
out.write(",".join((
|
|
||||||
release_groups[cols[2]][1],
|
|
||||||
release_groups[cols[3]][1],
|
|
||||||
release_release_rel_map[link_types[links[cols[1]][1]][6]]
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
# ---
|
|
||||||
|
|
||||||
tag_occurence = defaultdict(int)
|
|
||||||
with open("in/release_group_tag") as f:
|
|
||||||
for line in f:
|
|
||||||
tag_occurence[line.split("\t")[1]] += 1
|
|
||||||
|
|
||||||
with open("in/tag") as f:
|
|
||||||
with open("repo/tag.csv", "w") as out:
|
|
||||||
out.write("id:ID(Tag),name, occurences\n")
|
|
||||||
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
if tag_occurence[cols[0]] < 5:
|
|
||||||
continue
|
|
||||||
tags[cols[0]] = cols
|
|
||||||
out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"," + str(tag_occurence[cols[0]]) + "\n")
|
|
||||||
|
|
||||||
with open("repo/release_tag.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Release),:END_ID(Tag),weight:float\n")
|
|
||||||
|
|
||||||
# get max count
|
|
||||||
max_count = 0
|
|
||||||
with open("in/release_group_tag") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
max_count = max(max_count, int(cols[2]))
|
|
||||||
max_count = max_count / 4
|
|
||||||
|
|
||||||
# weight is linear
|
|
||||||
with open("in/release_group_tag") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
count = int(cols[2])
|
|
||||||
if count <= 0:
|
|
||||||
continue
|
|
||||||
if cols[1] not in tags:
|
|
||||||
continue
|
|
||||||
out.write(",".join((
|
|
||||||
release_groups[cols[0]][1],
|
|
||||||
cols[1],
|
|
||||||
str(max(min(count / max_count, 1), 0.2)),
|
|
||||||
)) + "\n")
|
|
||||||
tag_occurence[cols[1]] += 1
|
|
||||||
|
|
||||||
|
|
||||||
with open("repo/artist_tag.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Artist),:END_ID(Tag),weight:float\n")
|
|
||||||
|
|
||||||
# get max count
|
|
||||||
max_count = 0
|
|
||||||
with open("in/artist_tag") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
max_count = max(max_count, int(cols[2]))
|
|
||||||
max_count = max_count / 4
|
|
||||||
|
|
||||||
# Weight is linear
|
|
||||||
with open("in/artist_tag") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
|
|
||||||
count = int(cols[2])
|
|
||||||
if count <= 0:
|
|
||||||
continue
|
|
||||||
if cols[1] not in tags:
|
|
||||||
continue
|
|
||||||
|
|
||||||
out.write(",".join((
|
|
||||||
artists[cols[0]][1],
|
|
||||||
cols[1],
|
|
||||||
str(max(min(count / max_count, 1), 0.2)),
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
with open("repo/tag_tag.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Tag),:END_ID(Tag),weight:float\n")
|
|
||||||
|
|
||||||
def weights():
    """Yield each tag-relation weight of at least 5 from in/tag_relation."""
    with open("in/tag_relation") as handle:
        for row in handle:
            value = int(row.split("\t")[2])
            # Relations weaker than 5 are considered noise.
            if value >= 5:
                yield value
|
|
||||||
weight_median = median(weights()) * 3
|
|
||||||
|
|
||||||
with open("in/tag_relation") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
|
|
||||||
weight = int(cols[2])
|
|
||||||
if weight < 5:
|
|
||||||
continue
|
|
||||||
if cols[0] not in tags or cols[1] not in tags:
|
|
||||||
continue
|
|
||||||
|
|
||||||
out.write(",".join((
|
|
||||||
cols[0],
|
|
||||||
cols[1],
|
|
||||||
str(max(min(weight / weight_median, 1), 0.2)),
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
# -----
|
|
||||||
|
|
||||||
with open("repo/labels.csv", "w") as out:
|
|
||||||
out.write("id:ID(Label),name,sortname,code,:LABEL\n")
|
|
||||||
|
|
||||||
with open("in/label") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
labels[cols[0]] = cols
|
|
||||||
|
|
||||||
sortname = ASCII_RE.sub("_", cols[2]).upper()
|
|
||||||
out.write(",".join((
|
|
||||||
cols[1],
|
|
||||||
"\"" + cols[2].replace("\"", "\"\"") + "\"",
|
|
||||||
sortname,
|
|
||||||
cols[9] if cols[9] != "\\N" else "",
|
|
||||||
"Label" + label_types[cols[10]]
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
with open("repo/release_label.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Release),:END_ID(Label)\n")
|
|
||||||
|
|
||||||
# Should I check link types here?
|
|
||||||
with open("in/l_label_release_group") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
out.write(release_groups[cols[3]][1] + "," + labels[cols[2]][1] + "\n")
|
|
||||||
|
|
||||||
with open("in/l_label_release") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
if cols[3] in release_to_release_group_map:
|
|
||||||
out.write(release_groups[release_to_release_group_map[cols[3]]][1] + "," + labels[cols[2]][1] + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
with open("repo/label_label.csv", "w") as out:
|
|
||||||
out.write(":START_ID(Label),:END_ID(Label),:TYPE\n")
|
|
||||||
|
|
||||||
with open("in/l_label_label") as f:
|
|
||||||
for line in f:
|
|
||||||
cols = line.split("\t")
|
|
||||||
|
|
||||||
out.write(",".join((
|
|
||||||
labels[cols[2]][1],
|
|
||||||
labels[cols[3]][1],
|
|
||||||
label_label_rel_map[link_types[links[cols[1]][1]][6]]
|
|
||||||
)) + "\n")
|
|
||||||
|
|
||||||
# ---
|
|
1
spotify
1
spotify
@ -1 +0,0 @@
|
|||||||
Subproject commit 4ac596b2ff7659b880ac8a3fe9c58ea6527c2efc
|
|
1
spotify2
1
spotify2
@ -1 +0,0 @@
|
|||||||
Subproject commit 0a05c69bcf7005496c2efdf5b825ffa2f443ccdf
|
|
85
task_get_cover.py
Normal file
85
task_get_cover.py
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
import requests
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
|
def should_download(image: dict):
    """Keep only images explicitly flagged as the front cover."""
    front_flag = image["front"]
    return front_flag is True
|
||||||
|
|
||||||
|
|
||||||
|
def thumb(cover_blob):
    """Decode an image blob and return a JPEG thumbnail (max 256x256) as bytes.

    Transparent images are flattened onto a black canvas before encoding,
    since JPEG has no alpha channel.
    """
    with Image.open(BytesIO(cover_blob)) as image:

        # https://stackoverflow.com/questions/43978819
        # 16-bit grayscale needs to be scaled down to 8-bit before JPEG save.
        if image.mode == "I;16":
            image.mode = "I"
            # BUG FIX: the converted image was previously computed and
            # discarded, leaving the 16-bit data untouched; keep the result.
            image = image.point(lambda i: i * (1. / 256)).convert('L')

        image.thumbnail((256, 256), Image.BICUBIC)
        canvas = Image.new("RGB", image.size, 0x000000)

        # Flatten any alpha channel onto the black canvas.
        if image.mode in ('RGBA', 'LA') or (image.mode == 'P' and 'transparency' in image.info):
            try:
                canvas.paste(image, mask=image.split()[-1])
            except ValueError:
                # Malformed alpha band: fall back to an opaque paste.
                canvas.paste(image)
        else:
            canvas.paste(image)

        blob = BytesIO()
        canvas.save(blob, "JPEG", quality=85, optimize=True)
        canvas.close()

        return blob.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def save(mbid, tn):
    """Upsert the thumbnail blob for a release group into mg.covers."""
    with psycopg2.connect(config.connstr()) as conn:
        cursor = conn.cursor()
        upsert = (
            "INSERT INTO mg.covers (mbid, tn) VALUES (%s,%s) ON CONFLICT (mbid) "
            "DO UPDATE SET tn = excluded.tn"
        )
        cursor.execute(upsert, (mbid, tn))
        conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def get_mbids(count=1):
    """Yield up to *count* release-group MBIDs that still need a cover fetch.

    Rows never attempted (NULL ts) are served first.
    """
    with psycopg2.connect(config.connstr()) as conn:
        cursor = conn.cursor()
        query = (
            "SELECT gid FROM release_group "
            "LEFT JOIN mg.covers ON gid = mbid "
            "WHERE tn IS NULL "
            "ORDER BY ts NULLS FIRST LIMIT %s"
        )
        cursor.execute(query, (count,))
        for (mbid,) in cursor:
            yield mbid
|
||||||
|
|
||||||
|
|
||||||
|
def download(mbid):
    """Fetch the 250px front cover for a release group, or None if unavailable."""
    url = "https://coverartarchive.org/release-group/%s/front-250.jpg" % mbid
    response = requests.get(url)

    if response.status_code == 200:
        return response.content
    # 404 simply means no cover art exists; anything else is worth reporting.
    if response.status_code != 404:
        print("<%d> %s" % (response.status_code, response.text))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--count", type=int, default=1)
    args = parser.parse_args()

    # Fetch (or record the absence of) a cover for each pending MBID.
    for mbid in get_mbids(args.count):
        save(mbid, download(mbid))
        print(mbid)
|
197
task_get_lastfm.py
Executable file
197
task_get_lastfm.py
Executable file
@ -0,0 +1,197 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import json
|
||||||
|
from itertools import repeat
|
||||||
|
|
||||||
|
import config
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def get_mbid(lfm_name):
    """Return the MBID currently mapped to a last.fm artist name, or None."""
    cur = conn.cursor()
    cur.execute("SELECT mbid "
                "FROM mg.lastfm_artist WHERE name=%s", (lfm_name,))
    result = cur.fetchone()
    if result is None:
        return None
    return result[0]
|
||||||
|
|
||||||
|
|
||||||
|
def set_mbid(lfm_name, mbid):
    """Upsert the last.fm name -> MBID mapping."""
    upsert = ("INSERT INTO mg.lastfm_artist VALUES (%s,%s) ON CONFLICT (name) "
              "DO UPDATE SET mbid=excluded.mbid")
    conn.cursor().execute(upsert, (lfm_name, mbid))
|
||||||
|
|
||||||
|
|
||||||
|
def save_tags(lfm_name, tags):
    """Replace the stored last.fm tag list for an artist.

    BUG FIX: the insert was previously built by interpolating values into the
    SQL string, which broke on tags containing single quotes (e.g.
    "rock 'n' roll") and was an injection risk; it is now parameterized.
    """
    if not tags:
        return
    cur = conn.cursor()

    cur.execute("DELETE FROM mg.lastfm_artist_tag WHERE name=%s", (lfm_name,))
    cur.executemany(
        "INSERT INTO mg.lastfm_artist_tag VALUES (%s,%s)",
        [(lfm_name, tag) for tag in tags]
    )
|
||||||
|
|
||||||
|
|
||||||
|
def save_data(data):
    """Persist one artist payload: MBID mapping, similar artists, tags, meta."""
    if not data:
        return

    disambiguate(data["name"], mbid=data["artist"])

    for neighbour in data["similar"]:
        # Similar artists without an MBID cannot be linked into the graph.
        if neighbour["mbid"] is None:
            continue
        disambiguate(neighbour["name"], neighbour["mbid"])
        save_similar(data["name"], neighbour["name"], neighbour["match"])

    save_tags(data["name"], data["tags"])
    save_meta(data["name"], data["listeners"], data["playcount"])
|
||||||
|
|
||||||
|
|
||||||
|
def save_similar(lfm_name, similar, weight):
    """Upsert a weighted similarity edge between two last.fm artist names."""
    upsert = (
        "INSERT INTO mg.lastfm_artist_artist (name0, name1, weight) VALUES (%s,%s,%s) "
        "ON CONFLICT (name0, name1) DO UPDATE SET weight=excluded.weight, ts=CURRENT_TIMESTAMP"
    )
    conn.cursor().execute(upsert, (lfm_name, similar, weight))
|
||||||
|
|
||||||
|
|
||||||
|
def save_meta(lfm_name, listeners, playcount):
    """Upsert listener/playcount statistics for a last.fm artist."""
    upsert = ("INSERT INTO mg.lastfm_artist_meta VALUES (%s,%s,%s) ON CONFLICT (name) "
              "DO UPDATE SET listeners=excluded.listeners, playcount=excluded.playcount")
    conn.cursor().execute(upsert, (lfm_name, listeners, playcount))
|
||||||
|
|
||||||
|
|
||||||
|
def save_raw_data(name, mbid, data):
    """Cache the raw last.fm API response for a (name, mbid) pair."""
    upsert = ("INSERT INTO mg.lastfm_raw_data (name, mbid, data) VALUES (%s,%s,%s) "
              "ON CONFLICT (name, mbid) DO UPDATE SET ts=CURRENT_TIMESTAMP, data=excluded.data")
    conn.cursor().execute(upsert, (name, mbid, json.dumps(data)))
|
||||||
|
|
||||||
|
|
||||||
|
def get_release_count(mbid):
    """Count MusicBrainz artist-release links for the given artist MBID."""
    cur = conn.cursor()
    cur.execute('SELECT COUNT(*) '
                'FROM l_artist_release '
                'INNER JOIN artist a ON entity0 = a.id '
                'WHERE a.gid = %s', (mbid,))
    result = cur.fetchone()
    if not result:
        return 0
    return result[0]
|
||||||
|
|
||||||
|
|
||||||
|
def disambiguate(name, mbid):
    """
    A lastfm artist name can refer to multiple MBIDs
    For RELATED_TO purposes, we assume that the MBID referring
    to the artist with the most official releases is the one
    """
    current = get_mbid(name)

    if not current or mbid == current:
        # No mapping yet (or same MBID): just (re)write the mapping.
        set_mbid(name, mbid)
    elif get_release_count(current) < get_release_count(mbid):
        # A different MBID with more releases wins the name.
        set_mbid(name, mbid)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_artist_data(name, mbid, max_age_days):
    """Return the cached last.fm payload if at most *max_age_days* old.

    Returns None on a cache miss. (Previously returned 0 for a miss, which
    only worked because callers test truthiness; None states the intent and
    matches get_mbid's convention.)
    """
    cur = conn.cursor()
    cur.execute("SELECT data FROM mg.lastfm_raw_data WHERE name=%s AND mbid=%s "
                "AND date_part('day', CURRENT_TIMESTAMP - ts) <= %s ",
                (name, mbid, max_age_days))

    row = cur.fetchone()
    return row[0] if row else None
|
||||||
|
|
||||||
|
|
||||||
|
def get_artist_data(name: str, mbid: str):
    """Fetch artist info + similar artists from the last.fm API.

    Serves from the local raw-data cache when the entry is under 30 days old.
    Falls back from an MBID lookup to a name lookup when last.fm does not
    know the MBID. Returns the assembled payload dict, or None when last.fm
    has no data for either key (the failure is still cached via _raw).
    Raises on rate limiting so the caller stops hammering the API.
    """
    cached_data = get_cached_artist_data(name, mbid, max_age_days=30)
    if cached_data:
        return cached_data

    # Every (url, body) pair is kept so the raw exchange can be archived.
    raw = []
    url = "https://ws.audioscrobbler.com/2.0/?method=artist.getinfo&mbid=%s&api_key=%s&format=json" % \
          (mbid, config.config["LASTFM_APIKEY"],)
    r = requests.get(url)
    raw.append((url, r.text))
    info_json = r.json()

    by_name = False

    # MBID lookup failed: retry by artist name.
    if "artist" not in info_json:
        url1 = "https://ws.audioscrobbler.com/2.0/?method=artist.getinfo&artist=%s&api_key=%s&format=json" % \
               (name, config.config["LASTFM_APIKEY"],)
        r = requests.get(url1)
        raw.append((url1, r.text))
        info_json = r.json()
        if "artist" not in info_json:
            if "Rate Limit Exceeded" in r.text:
                raise Exception("Rate Limit Exceeded!")
            # Cache the failed exchange so we don't retry for 30 days.
            data = {
                "_raw": raw
            }
            save_raw_data(name, mbid, data)
            return
        by_name = True

    # Query similar artists with whichever key worked for getinfo.
    if by_name:
        url2 = "https://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=%s&api_key=%s&format=json" % (
            name, config.config["LASTFM_APIKEY"],)
    else:
        url2 = "https://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&mbid=%s&api_key=%s&format=json" % (
            mbid, config.config["LASTFM_APIKEY"],)
    r2 = requests.get(url2)
    raw.append((url2, r2.text))
    similar_json = r2.json()

    # Normalize the two responses into one payload; optional fields
    # (mbid, tags) default to None / [] when absent.
    data = {
        "artist": mbid,
        "name": info_json["artist"]["name"],
        "mbid": info_json["artist"]["mbid"] if "mbid" in info_json["artist"] else None,
        "tags": [t["name"] for t in info_json["artist"]["tags"]["tag"]] if "tags" in info_json["artist"] and "tag" in
                info_json["artist"]["tags"] else [],
        "listeners": info_json["artist"]["stats"]["listeners"],
        "playcount": info_json["artist"]["stats"]["playcount"],
        "similar": [
            {
                "mbid": a["mbid"] if "mbid" in a else None,
                "match": a["match"],
                "name": a["name"]
            }
            for a in similar_json["similarartists"]["artist"]],
        "_raw": raw
    }

    save_raw_data(name, mbid, data)

    return data
|
||||||
|
|
||||||
|
|
||||||
|
def get_task(count=1):
    """Return up to *count* (name, gid) artist rows, stalest last.fm data first.

    BUG FIX: this previously returned cur.fetchone() — a single row — while
    the caller iterates the result (`for task in get_task(...)`), so the loop
    walked the row's two *columns* instead of rows, and LIMIT %s never
    yielded more than one artist. fetchall() returns the intended row list.
    """
    cur = conn.cursor()
    cur.execute(
        "SELECT artist.name, artist.gid FROM artist "
        "LEFT JOIN mg.lastfm_raw_data lfm ON lfm.mbid=gid AND lfm.name=artist.name "
        "ORDER BY lfm.ts NULLS FIRST LIMIT %s",
        (count,)
    )
    return cur.fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--count", type=int, default=1)
    args = parser.parse_args()

    # Single shared connection used by every helper in this module.
    conn = psycopg2.connect(config.connstr())

    # NOTE(review): get_task() returns cur.fetchone() — one row — so this
    # loop iterates that row's columns, not rows; --count has no effect past
    # the first artist. Confirm intent (fetchall() looks intended).
    for task in get_task(args.count):
        save_data(get_artist_data(*task))
        conn.commit()
        print(task[0])

    conn.close()
|
261
task_get_spotify.py
Executable file
261
task_get_spotify.py
Executable file
@ -0,0 +1,261 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
import json
|
||||||
|
from itertools import repeat
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
import spotipy
|
||||||
|
from hexlib.misc import silent_stdout
|
||||||
|
from spotipy.oauth2 import SpotifyClientCredentials
|
||||||
|
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
|
def save_raw(query, endpoint, data):
    """Archive the raw JSON returned by a Spotify endpoint for later replay."""
    upsert = (
        "INSERT INTO mg.spotify_raw_data (query, endpoint, data) VALUES (%s,%s,%s) "
        "ON CONFLICT (query, endpoint) "
        "DO UPDATE SET ts=CURRENT_TIMESTAMP, data=excluded.data"
    )
    conn.cursor().execute(upsert, (query, endpoint, json.dumps(data)))
|
||||||
|
|
||||||
|
|
||||||
|
def save_artist(data, max_age_days=30):
    """Returns True if artist is new (and therefore, its albums, tracks etc. should be fetched)"""

    cur = conn.cursor()

    # Skip artists refreshed within the freshness window.
    cur.execute("SELECT spotid FROM mg.spotify_artist_meta WHERE spotid=%s AND "
                "date_part('day', CURRENT_TIMESTAMP - ts) <= %s", (data["id"], max_age_days,))
    if cur.fetchone():
        return False

    cur.execute(
        "INSERT INTO mg.spotify_artist_meta (spotid, name, followers, popularity) "
        "VALUES (%s,%s,%s,%s) "
        "ON CONFLICT (spotid) "
        "DO UPDATE SET name=excluded.name, followers=excluded.followers, popularity=excluded.popularity",
        (data["id"], data["name"], data["followers"]["total"], data["popularity"])
    )

    # Replace the genre tags. FIX: parameterized inserts instead of manually
    # quote-escaping values into the SQL string (fragile, injection-prone).
    cur.execute("DELETE FROM mg.spotify_artist_tag WHERE spotid=%s", (data["id"],))
    if data["genres"]:
        cur.executemany(
            "INSERT INTO mg.spotify_artist_tag VALUES (%s,%s)",
            [(data["id"], genre) for genre in data["genres"]]
        )
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def get_albums(spotid):
    """Fetch and store the artist's album names; the raw response is archived.

    FIX: inserts are parameterized instead of built by string interpolation
    with manual quote-escaping.
    """
    data = silent_stdout(spotify.artist_albums, spotid, album_type="album,single,compilation")
    save_raw(spotid, "artist_albums", data)

    cur = conn.cursor()
    cur.execute("DELETE FROM mg.spotify_artist_album WHERE spotid=%s", (spotid,))
    if data["items"]:
        # De-duplicate album names, as before.
        album_names = set(album["name"] for album in data["items"])
        cur.executemany(
            "INSERT INTO mg.spotify_artist_album VALUES (%s,%s)",
            [(spotid, album) for album in album_names]
        )
    return list()
|
||||||
|
|
||||||
|
|
||||||
|
def get_tracks(spotid):
    """Fetch and store the artist's top tracks (name + preview URL).

    FIX: inserts are parameterized. The previous string-built SQL stored a
    NULL preview_url as the literal string 'None' and broke on quotes in
    track names or URLs.
    """
    data = silent_stdout(spotify.artist_top_tracks, spotid)
    save_raw(spotid, "artist_top_tracks", data)

    cur = conn.cursor()
    cur.execute("DELETE FROM mg.spotify_artist_track WHERE spotid=%s", (spotid,))

    # Keep only the first occurrence of each track name.
    unique_tracks = []
    done = set()
    for track in data["tracks"]:
        if track["name"] in done:
            continue
        unique_tracks.append((track["name"], track["preview_url"]))
        done.add(track["name"])

    if unique_tracks:
        cur.executemany(
            "INSERT INTO mg.spotify_artist_track (spotid, track, url) VALUES (%s,%s,%s)",
            [(spotid, name, url) for name, url in unique_tracks]
        )
|
||||||
|
|
||||||
|
|
||||||
|
def related(spotid):
    """Fetch, archive, and return the artists Spotify considers related."""
    response = silent_stdout(spotify.artist_related_artists, spotid)
    save_raw(spotid, "artist_related_artists", response)
    return response["artists"]
|
||||||
|
|
||||||
|
|
||||||
|
def save_artist_artist(id0, relations):
    """Store ranked related-artist edges (id0 -> related), ignoring duplicates.

    *index* preserves Spotify's relatedness ranking. FIX: values are passed
    as query parameters instead of being interpolated into the SQL string.
    """
    if not relations:
        return
    cur = conn.cursor()
    cur.executemany(
        "INSERT INTO mg.spotify_artist_artist (spotid0, spotid1, index) "
        "VALUES (%s,%s,%s) "
        "ON CONFLICT (spotid0, spotid1) "
        "DO NOTHING",
        [(id0, rel["id"], i) for i, rel in enumerate(relations)]
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_mbids_with_matching_name(name):
    """Return all artist MBIDs whose case/accent-folded name matches *name*."""
    cur = conn.cursor()
    cur.execute(
        "SELECT gid FROM artist "
        "WHERE asciifold_lower(name)=asciifold_lower(%s)",
        (name,)
    )
    return [mbid for (mbid,) in cur.fetchall()]
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_spotify_conflict(mbid, existing_spotid, new_spotid):
    """Decide which Spotify artist should stay mapped to *mbid*.

    Compares the stored album names of both Spotify artists against the
    MusicBrainz releases of *mbid* (all names accent/case-folded). The
    mapping is switched to *new_spotid* only when it shares strictly more
    album names with MusicBrainz than the current one does. When either
    artist has no stored albums, the existing mapping is kept.
    """
    cur = conn.cursor()

    def _folded_albums(spotid):
        # Folded album names we have on file for one Spotify artist.
        cur.execute(
            "SELECT asciifold_lower(album) FROM mg.spotify_artist_album WHERE spotid=%s",
            (spotid,)
        )
        return set(row[0] for row in cur.fetchall())

    new_albums = _folded_albums(new_spotid)
    if not new_albums:
        return

    existing_albums = _folded_albums(existing_spotid)
    if not existing_albums:
        return

    cur.execute(
        "SELECT DISTINCT asciifold_lower(release.name) FROM release "
        "INNER JOIN artist_credit_name cn ON cn.artist_credit = release.artist_credit "
        "INNER JOIN artist a on a.id = cn.artist "
        "WHERE a.gid=%s", (mbid,)
    )
    mb_albums = set(row[0] for row in cur.fetchall())

    if len(new_albums & mb_albums) > len(existing_albums & mb_albums):
        cur.execute("UPDATE mg.spotify_artist SET spotid = %s WHERE mbid=%s", (new_spotid, mbid))
|
||||||
|
|
||||||
|
|
||||||
|
def _release_count(cur, mbid):
    """Number of MusicBrainz releases credited to *mbid*."""
    cur.execute(
        "SELECT count(release.name) FROM release "
        "INNER JOIN artist_credit_name cn ON cn.artist_credit = release.artist_credit "
        "INNER JOIN artist a on a.id = cn.artist "
        "WHERE a.gid = %s ",
        (mbid,)
    )
    return cur.fetchone()[0]


def _release_names(cur, mbid):
    """Set of folded release names credited to *mbid*."""
    cur.execute(
        "SELECT asciifold_lower(release.name) FROM release "
        "INNER JOIN artist_credit_name cn ON cn.artist_credit = release.artist_credit "
        "INNER JOIN artist a on a.id = cn.artist "
        "WHERE a.gid = %s ",
        (mbid,)
    )
    return set(row[0] for row in cur.fetchall())


def resolve_mb_conflict(spotid, mbids):
    """Map *spotid* to the most plausible of several candidate mbids.

    When we have album names for the Spotify artist, prefer the mbid
    whose MusicBrainz releases share the most folded names with them;
    otherwise fall back to the mbid with the most releases. Ties resolve
    to the first candidate, as before.

    :param spotid: Spotify artist id.
    :param mbids: candidate MusicBrainz artist gids.
    """
    if not mbids:
        # Guard: the old code would have stored a NULL mbid mapping here.
        return

    cur = conn.cursor()
    cur.execute(
        "SELECT asciifold_lower(album) FROM mg.spotify_artist_album WHERE spotid=%s",
        (spotid,)
    )
    spot_albums = set(row[0] for row in cur.fetchall())

    if not spot_albums:
        # We can't base our conflict resolution on album names,
        # pick the one with the most releases
        best_match = max(mbids, key=lambda mbid: _release_count(cur, mbid))
    else:
        best_match = max(
            mbids,
            key=lambda mbid: len(_release_names(cur, mbid) & spot_albums)
        )

    save_spotid_to_mbid(spotid, best_match)
|
||||||
|
|
||||||
|
|
||||||
|
def save_spotid_to_mbid(spotid, mbid):
    """Record the spotid→mbid mapping, resolving clashes on mbid.

    If another Spotify artist is already mapped to *mbid*, defer to
    resolve_spotify_conflict to decide which one keeps the mapping.
    """
    cur = conn.cursor()
    cur.execute(
        "SELECT spotid FROM mg.spotify_artist WHERE mbid=%s",
        (mbid,)
    )
    existing = cur.fetchone()
    if existing is None:
        cur.execute(
            "INSERT INTO mg.spotify_artist (spotid, mbid) VALUES (%s,%s)",
            (spotid, mbid)
        )
    else:
        resolve_spotify_conflict(mbid, existing[0], spotid)
|
||||||
|
|
||||||
|
|
||||||
|
def search_artist(name):
    """Search Spotify for *name* and ingest every matching artist.

    For each result: saves the artist, fetches its albums and top tracks,
    links it to MusicBrainz when exactly one (or, via conflict
    resolution, several) artist names match, and records its
    related-artist edges.
    """
    quoted_name = "\"%s\"" % name

    data = silent_stdout(spotify.search, quoted_name, type="artist", limit=20)
    save_raw(name, "search", data)

    for artist in data["artists"]["items"]:
        if not save_artist(artist):
            continue

        spot_id = artist["id"]
        mbids = get_mbids_with_matching_name(artist["name"])

        get_albums(spot_id)
        get_tracks(spot_id)

        if len(mbids) > 1:
            resolve_mb_conflict(spot_id, mbids)
        elif len(mbids) == 1:
            save_spotid_to_mbid(spot_id, mbids[0])

        save_artist_artist(spot_id, related(spot_id))
|
||||||
|
|
||||||
|
|
||||||
|
def get_tasks(count=1):
    """Yield up to *count* artist names to crawl next.

    Orders by the spotify_artist_meta / spotify_raw_data timestamps with
    NULLS FIRST, so never-crawled artists come before stale ones.
    """
    cur = conn.cursor()
    cur.execute(
        "SELECT artist.name FROM artist "
        "LEFT JOIN mg.spotify_artist sa ON sa.mbid=gid "
        "LEFT JOIN mg.spotify_raw_data srd ON srd.query=artist.name AND endpoint='search' "
        "LEFT JOIN mg.spotify_artist_meta sam ON sa.spotid=sam.spotid "
        "ORDER BY sam.ts NULLS FIRST, srd.ts NULLS FIRST LIMIT %s",
        (count,)
    )
    yield from (artist_name for (artist_name,) in cur)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse

    # CLI: --count controls how many artists are crawled in this run.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--count", type=int, default=1)
    cli_args = arg_parser.parse_args()

    conn = psycopg2.connect(config.connstr())
    spotify = spotipy.Spotify(
        client_credentials_manager=SpotifyClientCredentials(
            client_id=config.config["SPOTIFY_CLIENTID"],
            client_secret=config.config["SPOTIFY_SECRET"]
        )
    )

    # Crawl one artist per task, committing after each so progress is
    # kept if a later task fails.
    for artist_name in get_tasks(cli_args.count):
        search_artist(artist_name)
        conn.commit()
        print(artist_name)

    conn.close()
|
@ -1 +0,0 @@
|
|||||||
Subproject commit aa15a1b29e2fc7f03dafc9301c65e32cb82e4cb4
|
|
Loading…
x
Reference in New Issue
Block a user