diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 4340f4c..0000000 --- a/.gitmodules +++ /dev/null @@ -1,15 +0,0 @@ -[submodule "task_tracker_drone"] - path = task_tracker_drone - url = https://github.com/simon987/task_tracker_drone/ -[submodule "last.fm"] - path = last.fm - url = https://git.simon987.net/drone/last.fm -[submodule "caa"] - path = caa - url = https://git.simon987.net/drone/caa.git -[submodule "spotify"] - path = spotify - url = https://git.simon987.net/drone/spotify -[submodule "spotify2"] - path = spotify2 - url = https://git.simon987.net/drone/spotify2 diff --git a/README.md b/README.md index ab49bd9..fef7b61 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,3 @@ wip -### task_tracker setup: - -Last.fm api calls are queued to [task_tracker](https://github.com/simon987/task_tracker/), - and results are gathered by a [task_tracker_drone](https://github.com/simon987/task_tracker_drone/) - ([script](https://git.simon987.net/drone/last.fm/src/master/run)). - - -Project secret: -```json -{ - "apikey": "", - "user": "" -} -``` diff --git a/caa b/caa deleted file mode 160000 index 910f4a0..0000000 --- a/caa +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 910f4a0bceadac37ac28fa59e7648f241c931fe6 diff --git a/config.py b/config.py new file mode 100644 index 0000000..3e90498 --- /dev/null +++ b/config.py @@ -0,0 +1,22 @@ +import os + +config = { + "DB": "musicbrainz_db", + "USER": "musicbrainz", + "PASSWORD": "musicbrainz", + "HOST": "127.0.0.1", + "PORT": 5433, + + "LASTFM_APIKEY": os.environ.get("LASTFM_APIKEY"), + "LASTFM_USER": os.environ.get("LASTFM_USER"), + + "SPOTIFY_CLIENTID": os.environ.get("SPOTIFY_CLIENTID"), + "SPOTIFY_SECRET": os.environ.get("SPOTIFY_SECRET"), +} + + +def connstr(): + return " dbname=%s user=%s password=%s host=%s port=%d" % ( + config["DB"], config["USER"], config["PASSWORD"], + config["HOST"], config["PORT"] + ) diff --git a/download_mb_dump.sh b/download_mb_dump.sh deleted file mode 100755 index f435b19..0000000 --- a/download_mb_dump.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash - -latest=$(curl http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/LATEST) - -mkdir in 2> /dev/null -cd in - -wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump.tar.bz2" -wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump-derived.tar.bz2" - -tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \ -mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \ -mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \ -mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status mbdump/l_label_release \ -mbdump/l_label_release_group -tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \ -mbdump/release_group_meta - -mv mbdump/* . -rm -r mbdump -cd .. 
\ No newline at end of file diff --git a/extract_covers.py b/extract_covers.py deleted file mode 100644 index e50ea6c..0000000 --- a/extract_covers.py +++ /dev/null @@ -1,27 +0,0 @@ -import sqlite3 - -import sys - -with sqlite3.connect(sys.argv[1]) as conn: - - cursor = conn.cursor() - cursor.execute("SELECT id from covers") - - cursor = conn.cursor() - cursor.execute("SELECT id from covers") - - def rows(): - buf = list() - for row in cursor.fetchall(): - buf.append(row[0]) - if len(buf) > 30: - yield buf - buf.clear() - - for batch in rows(): - cursor.execute("SELECT cover from covers where id in (%s)" % (",".join(("'" + b + "'") for b in batch))) - covers = cursor.fetchall() - for i, cover in enumerate(covers): - with open("./tmpcovers/" + batch[i] + ".jpg", "wb") as out: - out.write(cover[0]) - print(batch[i]) diff --git a/generate_caa_tasks.py b/generate_caa_tasks.py deleted file mode 100644 index cb20260..0000000 --- a/generate_caa_tasks.py +++ /dev/null @@ -1,56 +0,0 @@ -import json -from multiprocessing.pool import ThreadPool - -from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker - -TT_API_URL = "https://tt.simon987.net/api" -TT_PROJECT = 5 - - -done = set() -# with sqlite3.connect(sys.argv[1]) as conn: -# cur = conn.cursor() -# cur.execute("SELECT id FROM covers") -# for mbid in cur.fetchall(): -# done.add(mbid[0]) - -api = TaskTrackerApi(TT_API_URL) - -worker = Worker.from_file(api) -if not worker: - worker = api.make_worker("caa scraper") - worker.dump_to_file() -worker.request_access(TT_PROJECT, True, True) -input("Give permission to " + worker.alias) - - -def mktask(mbids): - res = worker.submit_task( - project=TT_PROJECT, - recipe=json.dumps(mbids), - hash64=hash(mbids[0]), - max_assign_time=60 * 30, - priority=1, - unique_str=None, - verification_count=None, - max_retries=5, - ) - print(res.text) - - -def lines(): - with open("in/release") as f: - buf = list() - - for line in f: - cols = line.split("\t") - - buf.append(cols[1]) - if len(buf) == 75: - a = list(buf) - buf.clear() - yield a - - -pool = ThreadPool(processes=20) -pool.map(func=mktask, iterable=lines()) diff --git a/generate_lastfm_tasks.py b/generate_lastfm_tasks.py deleted file mode 100644 index 444c761..0000000 --- a/generate_lastfm_tasks.py +++ /dev/null @@ -1,48 +0,0 @@ -import csv -import json -from multiprocessing.pool import ThreadPool - -from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker, LOG_TRACE - -TT_API_URL = "https://tt.simon987.net/api" -TT_PROJECT = 1 - -api = TaskTrackerApi(TT_API_URL) - -worker = Worker.from_file(api) -if not worker: - worker = api.make_worker("last.fm scraper") - worker.dump_to_file() -worker.request_access(TT_PROJECT, True, True) -input("Give permission to " + worker.alias) - -with open("repo/artist.csv") as f: - reader = csv.reader(f) - - def mktask(lines): - res = worker.submit_task( - project=TT_PROJECT, - recipe=json.dumps( - [{"mbid": line[0], "name": line[1]} for line in lines] - ), - unique_str=lines[0][0], - max_assign_time=60 * 5, - ) - print(res.text) - - def lines(): - line_batch = list() - - for line in reader: - if "Group" in line[3]: - line_batch.append(line) - if len(line_batch) >= 30: - res = list(line_batch) - line_batch.clear() - yield res - - tasks = list(lines()) - - pool = ThreadPool(processes=25) - pool.map(func=mktask, iterable=tasks) - diff --git a/generate_spotify_tasks.py b/generate_spotify_tasks.py deleted file mode 100644 index 45adb7f..0000000 --- a/generate_spotify_tasks.py +++ /dev/null @@ -1,48 +0,0 @@ 
-import csv -import json -from multiprocessing.pool import ThreadPool - -from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker - -TT_API_URL = "https://tt.simon987.net/api" -TT_PROJECT = 6 - -api = TaskTrackerApi(TT_API_URL) - -worker = Worker.from_file(api) -if not worker: - worker = api.make_worker("mm worker") - worker.dump_to_file() -worker.request_access(TT_PROJECT, True, True) -input("Give permission to " + worker.alias) - -with open("repo/artist.csv") as f: - reader = csv.reader(f) - - def mktask(lines): - res = worker.submit_task( - project=TT_PROJECT, - recipe=json.dumps( - [{"mbid": line[0], "name": line[1]} for line in lines] - ), - unique_str=lines[0][0], - max_assign_time=60 * 5, - ) - print(res.text) - - def lines(): - line_batch = list() - - for line in reader: - line_batch.append(line) - if len(line_batch) >= 30: - res = list(line_batch) - line_batch.clear() - yield res - - tasks = list(lines()) - - pool = ThreadPool(processes=25) - pool.map(func=mktask, iterable=tasks) - - diff --git a/generate_spotify_tasks_2.py b/generate_spotify_tasks_2.py deleted file mode 100644 index 9828c04..0000000 --- a/generate_spotify_tasks_2.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -import sqlite3 -from multiprocessing.pool import ThreadPool - -import sys - -from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker - -TT_API_URL = "https://tt.simon987.net/api" -TT_PROJECT = 7 - -api = TaskTrackerApi(TT_API_URL) - -worker = Worker.from_file(api) -if not worker: - worker = api.make_worker("mm worker") - worker.dump_to_file() -worker.request_access(TT_PROJECT, True, True) -input("Give permission to " + worker.alias) - -spotids = set() - -with sqlite3.connect(sys.argv[1]) as conn: - - cur = conn.cursor() - cur.execute("SELECT data from artist") - for row in cur.fetchall(): - j = json.loads(row[0]) - if j is None or "artists" not in j or "items" not in j["artists"]: - continue - for item in j["artists"]["items"]: - spotids.add(item["id"]) - - - def mktask(lines): - res = worker.submit_task( - project=TT_PROJECT, - recipe=json.dumps( - [{"spotid": line} for line in lines] - ), - unique_str=lines[0], - max_assign_time=60 * 5, - ) - print(res.text) - - def ids(): - id_batch = list() - - for spotid in spotids: - id_batch.append(spotid) - if len(id_batch) >= 30: - res = list(id_batch) - id_batch.clear() - yield res - - tasks = list(ids()) - - pool = ThreadPool(processes=25) - pool.map(func=mktask, iterable=tasks) - diff --git a/last.fm b/last.fm deleted file mode 160000 index 855df64..0000000 --- a/last.fm +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 855df64c316930062ff4f7740492d0f039788498 diff --git a/map_release_to_rg_map.py b/map_release_to_rg_map.py deleted file mode 100644 index ba21978..0000000 --- a/map_release_to_rg_map.py +++ /dev/null @@ -1,31 +0,0 @@ -import sqlite3 - -release_to_release_group_map = dict() -release_groups = dict() - -with open("in/release_group") as f: - for line in f: - cols = line.split("\t") - release_groups[cols[0]] = cols[1] - -with open("in/release") as f: - for line in f: - cols = line.split("\t") - release_to_release_group_map[cols[1]] = release_groups[cols[4]] - -with sqlite3.connect("mapdb.db") as conn: - - cursor = conn.cursor() - cursor.execute("CREATE TABLE map (release TEXT PRIMARY KEY , release_group TEXT)") - - for k, v in release_to_release_group_map.items(): - cursor.execute("INSERT INTO map (release, release_group) VALUES (?,?)", (k, v)) - conn.commit() - -""" -CREATE TABLE covers (id TEXT primary key, cover BLOB); 
-ATTACH 'mapdb.db' AS map;
-ATTACH '/mnt/Data8/caa_tn_only.db' AS source;
-INSERT OR IGNORE INTO covers SELECT release_group, cover FROM source.covers INNER JOIN map.map ON id = map.release;
-"""
-
diff --git a/mb_scratch.sql b/mb_scratch.sql
new file mode 100644
index 0000000..d07b2ce
--- /dev/null
+++ b/mb_scratch.sql
@@ -0,0 +1,391 @@
+CREATE OR REPLACE FUNCTION fn_sortname(name text, mb_sortname text) RETURNS text AS
+$$
+declare
+    sn text;
+BEGIN
+
+    -- 'g' flag: replace every disallowed character, not just the first match
+    sn = regexp_replace(name, '[^a-zA-Z0-9.\-!?&çéàâäëïöü'' ]', '_', 'g');
+
+    if length(replace(sn, '_', '')) = 0 then
+        return upper(regexp_replace(mb_sortname, '[^\w.\-!?& ]', '_', 'g'));
+    end if;
+
+    return upper(sn);
+END
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION fn_sortname(name text) RETURNS text AS
+$$
+BEGIN
+    return upper(regexp_replace(name, '[^a-zA-Z0-9.\-!?&çéàâäëïöü'' ]', '_', 'g'));
+END
+$$ LANGUAGE plpgsql;
+
+CREATE TABLE mg.translate_artist_artist_rel
+(
+    mb_name TEXT PRIMARY KEY,
+    mg_name TEXT
+);
+INSERT INTO mg.translate_artist_artist_rel
+VALUES ('teacher', 'TEACHER_OF'),
+       ('composer-in-residence', 'HAS_COMPOSER-IN-RESIDENCE_STATUS_IN'),
+       ('member of band', 'IS_MEMBER_OF'),
+       ('voice actor', 'IS_VOICE_ACTOR_OF'),
+       ('tribute', 'IS_TRIBUTE_TO'),
+       ('supporting musician', 'IS_SUPPORTING_MUSICIAN_OF'),
+       ('instrumental supporting musician', 'IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF'),
+       ('personal relationship', 'HAS_PERSONAL_RELATIONSHIP_WITH'),
+       ('musical relationships', 'HAS_MUSICAL_RELATIONSHIP_WITH'),
+       ('collaboration', 'HAS_COLLABORATED_WITH'),
+       ('married', 'IS_MARRIED_WITH'),
+       ('sibling', 'IS_SIBLING_OF'),
+       ('parent', 'IS_PARENT_OF'),
+       ('is person', 'IS'),
+       ('conductor position', 'IS_CONDUCTOR_OF'),
+       ('vocal supporting musician', 'DOES_VOCAL_SUPPORT_FOR'),
+       ('artistic director', 'IS_ARTIST_DIRECTOR_OF'),
+       ('subgroup', 'IS_SUBGROUP_OF'),
+       ('founder', 'IS_FOUNDER_OF'),
+       ('involved with', 'IS_INVOLVED_WITH'),
+       ('named after', 'IS_NAMED_AFTER');
+
+CREATE TABLE mg.translate_artist_release_rel
+(
+    mb_name TEXT PRIMARY KEY,
+    mg_name text
+);
+INSERT INTO mg.translate_artist_release_rel
+VALUES ('translator', 'TRANSLATED'),
+       ('liner notes', 'WROTE_LINER_NOTES'),
+       ('lyricist', 'IS_LYRICIST_FOR'),
+       ('lacquer cut', 'DID_LACQUER_CUT_FOR'),
+       ('samples from artist', 'HAS_SAMPLES_IN'),
+       ('remixes and compilations', NULL),
+       ('composition', 'COMPOSED'),
+       ('booking', 'DID_BOOKING_FOR'),
+       ('balance', 'DID_BALANCE_FOR'),
+       ('misc', 'HAS_MISC_ROLE_IN'),
+       ('conductor', 'CONDUCTED'),
+       ('legal representation', 'PROVIDED_LEGAL_REPRESENTATION_FOR'),
+       ('design/illustration', 'DID_DESIGN_FOR'),
+       ('performing orchestra', 'PERFORMED_FOR'),
+       ('producer', 'PRODUCED'),
+       ('instrument', 'PERFORMED_INSTRUMENT_FOR'),
+       ('writer', 'WROTE_LYRICS_FOR'),
+       ('production', 'DID_PRODUCTION_FOR'),
+       ('performance', 'PERFORMED_FOR'),
+       ('composer', 'IS_COMPOSER_FOR'),
+       ('sound', 'DID_SOUND_FOR'),
+       ('remixer', 'DID_REMIXING_FOR'),
+       ('orchestrator', 'IS_ORCHESTRATOR_FOR'),
+       ('compiler', 'DID_COMPILATION_FOR'),
+       ('vocal arranger', 'IS_ARRANGER_FOR'),
+       ('arranger', 'IS_ARRANGER_FOR'),
+       ('mix-DJ', 'MIXED'),
+       ('editor', 'IS_EDITOR_FOR'),
+       ('illustration', 'DID_ILLUSTRATION_FOR'),
+       ('audio', 'DID_AUDIO_FOR'),
+       ('publishing', 'IS_PUBLISHER_FOR'),
+       ('art direction', 'DID_ART_DIRECTOR_FOR'),
+       ('design', 'DID_DESIGN_FOR'),
+       ('instrument arranger', 'IS_ARRANGER_FOR'),
+       ('chorus master', 'IS_CHORUS_MASTER_FOR'),
+       ('photography', 'DID_PHOTOGRAPHY_FOR'),
+       ('performer', 'PERFORMED_IN'),
+       ('graphic design', 'DID_GRAPHIC_DESIGN_FOR'),
+       ('booklet editor', 'IS_BOOKLET_EDITOR_FOR'),
+       ('programming', 'DID_PROGRAMMING_FOR'),
+       ('copyright', 'IS_COPYRIGHT_HOLDER_OF'),
+       ('piano technician', 'IS_PIANO_TECHNICIAN_FOR'),
+       ('phonographic copyright', 'IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF'),
+       ('mastering', 'DID_MASTERING_FOR'),
+       ('vocal', 'PERFORMED_VOCALS_FOR'),
+       ('librettist', 'IS_LIBRETTIST_FOR'),
+       ('mix', 'MIXED'),
+       ('recording', 'DID_RECORDING_FOR'),
+       ('concertmaster', 'IS_CONCERTMASTER_FOR'),
+       ('engineer', 'IS_ENGINEER_FOR'),
+       ('tribute', 'IS_TRIBUTE_TO'),
+       ('dedicated to', 'IS_DEDICATED_TO'),
+       ('creative direction', NULL),
+       ('artists and repertoire', NULL);
+
+
+CREATE TABLE mg.translate_label_label_rel
+(
+    mb_name TEXT PRIMARY KEY,
+    mg_name text
+);
+INSERT INTO mg.translate_label_label_rel
+VALUES ('label rename', 'WAS_RENAMED_TO'),
+       ('imprint', 'DOES_IMPRINT_FOR'),
+       ('label distribution', 'DOES_DISTRIBUTION_FOR'),
+       ('business association', 'HAS_BUSINESS_ASSOCIATION_TO'),
+       ('label ownership', 'OWNS'),
+       ('label reissue', 'DOES_REISSUING_FOR');
+
+
+CREATE OR REPLACE VIEW mg.artist AS
+SELECT gid as "id:ID(Artist)",
+       name,
+       fn_sortname(name, sort_name) as sortname,
+       COALESCE(begin_date_year, 0) as "year:int",
+       comment,
+       (CASE WHEN type = 2 THEN 'Group' ELSE 'Artist' END) as ":LABEL"
+FROM artist;
+
+CREATE OR REPLACE VIEW mg.artist_artist AS
+SELECT a0.gid as ":START_ID(Artist)",
+       a1.gid as ":END_ID(Artist)",
+       t.mg_name as ":TYPE"
+FROM l_artist_artist
+         INNER JOIN artist a0 ON entity0 = a0.id
+         INNER JOIN artist a1 ON entity1 = a1.id
+         INNER JOIN link l on l.id = l_artist_artist.link
+         INNER JOIN link_type lt ON lt.id = l.link_type
+         INNER JOIN mg.translate_artist_artist_rel t ON t.mb_name = lt.name;
+
+
+CREATE OR REPLACE VIEW mg.release AS
+SELECT release_group.gid as "id:ID(Release)",
+       release_group.name,
+       m.first_release_date_year as "year:int",
+       CONCAT('Release;', t.name) as ":LABEL"
+FROM release_group
+         INNER JOIN release_group_meta m ON m.id = release_group.id
+         INNER JOIN release_group_primary_type t ON t.id = release_group.type;
+
+CREATE OR REPLACE VIEW mg.artist_release AS
+SELECT a.gid as ":START_ID(Artist)",
+       rg.gid as ":END_ID(Release)",
+       t.mg_name as ":TYPE"
+FROM l_artist_release_group
+         INNER JOIN artist a on a.id = l_artist_release_group.entity0
+         INNER JOIN release_group rg on rg.id = l_artist_release_group.entity1
+         INNER JOIN link l on l.id = l_artist_release_group.link
+         INNER JOIN link_type lt ON lt.id = l.link_type
+         INNER JOIN mg.translate_artist_release_rel t ON t.mb_name = lt.name
+UNION ALL
+SELECT a.gid as ":START_ID(Artist)",
+       rg.gid as ":END_ID(Release)",
+       t.mg_name as ":TYPE"
+FROM l_artist_release
+         INNER JOIN artist a on a.id = l_artist_release.entity0
+         INNER JOIN release r on r.id = l_artist_release.entity1
+         INNER JOIN release_group rg on rg.id = r.release_group
+         INNER JOIN link l on l.id = l_artist_release.link
+         INNER JOIN link_type lt ON lt.id = l.link_type
+         INNER JOIN mg.translate_artist_release_rel t ON t.mb_name = lt.name
+UNION ALL
+SELECT a.gid as ":START_ID(Artist)",
+       rg.gid as ":END_ID(Release)",
+       'CREDITED_FOR' as ":TYPE"
+FROM release
+         INNER JOIN artist_credit_name cn ON cn.artist_credit = release.artist_credit
+         INNER JOIN artist a on a.id = cn.artist
+         INNER JOIN release_group rg on rg.id = release.release_group;
+
+CREATE OR REPLACE VIEW mg.tag AS
+WITH occurrences AS (
+    SELECT tag, COUNT(*) as count
+    FROM (
+             SELECT tag
+             FROM release_group_tag
+             UNION ALL
+             SELECT tag
+             FROM release_tag
+         ) as tags
+    GROUP BY tag
+)
+SELECT tag.id as "id:ID(Tag)",
+       tag.name,
+       occurrences.count as "occurrences:int"
+FROM tag
+         INNER JOIN occurrences ON occurrences.tag = tag.id
+WHERE ref_count > 0
+  AND occurrences.count > 5;
+
+
+CREATE OR REPLACE VIEW mg.release_tag AS
+SELECT rg.gid as ":START_ID(Release)",
+       release_group_tag.tag as ":END_ID(Tag)",
+       greatest(least(release_group_tag.count::float / 6, 1), 0.2) as "weight:float"
+FROM release_group_tag
+         INNER JOIN release_group rg ON rg.id = release_group_tag.release_group
+         INNER JOIN mg.tag t ON t."id:ID(Tag)" = release_group_tag.tag
+WHERE release_group_tag.count > 0
+UNION ALL
+SELECT rg.gid as ":START_ID(Release)",
+       release_tag.tag as ":END_ID(Tag)",
+       greatest(least(release_tag.count::float / 6, 1), 0.2) as "weight:float"
+FROM release_tag
+         INNER JOIN release r ON r.id = release_tag.release
+         INNER JOIN release_group rg ON rg.id = r.release_group
+         INNER JOIN mg.tag t ON t."id:ID(Tag)" = release_tag.tag
+WHERE release_tag.count > 0;
+
+CREATE OR REPLACE VIEW mg.artist_tag AS
+SELECT a.gid as ":START_ID(Artist)",
+       artist_tag.tag as ":END_ID(Tag)",
+       greatest(least(artist_tag.count::float / 8, 1), 0.2) as "weight:float"
+FROM artist_tag
+         INNER JOIN artist a on artist_tag.artist = a.id
+         INNER JOIN mg.tag t ON t."id:ID(Tag)" = artist_tag.tag;
+
+CREATE OR REPLACE VIEW mg.tag_tag AS
+SELECT tag_relation.tag1 as ":START_ID(Tag)",
+       tag_relation.tag2 as ":END_ID(Tag)",
+       greatest(least(tag_relation.weight::float / 12, 1), 0.2) as "weight:float"
+FROM tag_relation;
+
+CREATE OR REPLACE VIEW mg.label AS
+SELECT label.gid as "id:ID(Label)",
+       label.name,
+       fn_sortname(label.name) as sortname,
+--     label_code as code,
+       concat('Label;', lt.name) as ":LABEL"
+FROM label
+         INNER JOIN label_type lt on label.type = lt.id;
+
+CREATE OR REPLACE VIEW mg.release_label AS
+SELECT r.gid as ":START_ID(Release)",
+       l.gid as ":END_ID(Label)"
+FROM l_label_release
+         INNER JOIN label l on l_label_release.entity0 = l.id
+         INNER JOIN release r on l_label_release.entity1 = r.id;
+-- UNION
+-- SELECT l.gid as ":START_ID(Release)",
+--        r.gid as ":END_ID(Label)"
+-- FROM l_label_release_group
+--          INNER JOIN label l on l_label_release_group.entity0 = l.id
+--          INNER JOIN release_group rg on l_label_release_group.entity1 = rg.id
+--          INNER JOIN release r on r.release_group = rg.id
+
+
+CREATE OR REPLACE VIEW mg.label_label AS
+SELECT l0.gid as ":START_ID(Label)",
+       l1.gid as ":END_ID(Label)",
+       t.mg_name as ":TYPE"
+FROM l_label_label
+         INNER JOIN label l0 on l_label_label.entity0 = l0.id
+         INNER JOIN label l1 on l_label_label.entity1 = l1.id
+         INNER JOIN link l on l.id = l_label_label.link
+         INNER JOIN link_type lt ON lt.id = l.link_type
+         INNER JOIN mg.translate_label_label_rel t ON t.mb_name = lt.name;
+
+
+--------------
+
+CREATE TABLE mg.covers
+(
+    mbid uuid PRIMARY KEY,
+    ts timestamp DEFAULT CURRENT_TIMESTAMP,
+    tn bytea
+);
+
+CREATE TABLE mg.lastfm_artist
+(
+    name TEXT PRIMARY KEY,
+    mbid uuid
+);
+
+CREATE TABLE mg.lastfm_raw_data
+(
+    name TEXT,
+    mbid uuid,
+    ts timestamp DEFAULT CURRENT_TIMESTAMP,
+    data jsonb,
+    PRIMARY KEY (name, mbid)
+);
+
+CREATE TABLE mg.lastfm_artist_meta
+(
+    name TEXT PRIMARY KEY,
+    listeners int,
+    playcount int
+);
+
+CREATE TABLE mg.lastfm_artist_tag
+(
+    name TEXT,
+    tag TEXT,
+    PRIMARY KEY (name, tag)
+);
+
+CREATE TABLE mg.lastfm_artist_artist
+(
+    name0 TEXT,
+    name1 TEXT,
+    weight float,
+    ts timestamp DEFAULT CURRENT_TIMESTAMP,
+    PRIMARY KEY (name0, name1)
+);
+
+--------------
+
+CREATE TABLE mg.spotify_artist
+(
+    spotid TEXT PRIMARY 
KEY, + mbid UUID UNIQUE +); + +CREATE TABLE mg.spotify_artist_meta +( + spotid TEXT PRIMARY KEY, + name TEXT, + followers int, + popularity int, + ts timestamp DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE mg.spotify_artist_tag +( + spotid TEXT, + tag TEXT, + PRIMARY KEY (spotid, tag) +); + +CREATE TABLE mg.spotify_artist_album +( + spotid TEXT, + album TEXT, + PRIMARY KEY (spotid, album) +); + +CREATE TABLE mg.spotify_artist_track +( + spotid TEXT, + track TEXT, + url TEXT, + PRIMARY KEY (spotid, track) +); + +CREATE TABLE mg.spotify_artist_artist +( + spotid0 TEXT, + spotid1 TEXT, + index int, + ts timestamp DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (spotid0, spotid1) +); + + +CREATE TABLE mg.spotify_raw_data +( + query TEXT, + endpoint TEXT, + ts timestamp DEFAULT CURRENT_TIMESTAMP, + data jsonb, + PRIMARY KEY (query, endpoint) +); + +-------- +CREATE OR REPLACE FUNCTION asciifold(text) RETURNS text +AS +'/pglib/libasciifolding.so', +'asciifold' LANGUAGE C STRICT; + +CREATE OR REPLACE FUNCTION asciifold_lower(text) RETURNS text +AS +'/pglib/libasciifolding.so', +'asciifold_lower' LANGUAGE C STRICT; diff --git a/process_lastfm_data.py b/process_lastfm_data.py deleted file mode 100644 index ff6c022..0000000 --- a/process_lastfm_data.py +++ /dev/null @@ -1,102 +0,0 @@ -import csv -import json -import sqlite3 -from collections import defaultdict -import sys - -artists = set() - - -def disambiguate(lfm_artist, artist_release_count, name, mbid): - existing_mbid = lfm_artist.get(name, None) - - if existing_mbid and mbid != existing_mbid: - if artist_release_count[existing_mbid] < artist_release_count[mbid]: - - lfm_artist[name] = mbid - - # print("Replacing %s (%s) with %s (%d) for %s" % - # (existing_mbid, artist_release_count[existing_mbid], - # mbid, artist_release_count[mbid], - # name)) - else: - lfm_artist[name] = mbid - - -def patch(lastfm_data): - - artist_listeners = dict() - lastfm_artist_to_mbid = dict() - artist_release_count = defaultdict(int) - related = list() - - with open("repo/artist_release.csv") as f: - for line in f: - cols = line.split(',') - artist_release_count[cols[0]] += 1 - - with sqlite3.connect(lastfm_data) as conn: - cur = conn.cursor() - cur.execute("SELECT data FROM lastfmdata", ) - data = list(cur.fetchall()) - - # A lastfm artist name can refer to multiple MBIDs - # For RELATED_TO purposes, we assume that the MBID referring - # to the artist with the most official releases is the one - - for row in data: - meta = json.loads(row[0]) - - disambiguate(lastfm_artist_to_mbid, artist_release_count, meta["name"], meta["artist"]) - - for similar in [s for s in meta["similar"] if s["mbid"] is not None]: - disambiguate(lastfm_artist_to_mbid, artist_release_count, similar["name"], similar["mbid"]) - - # Get related links & listener counts - for row in data: - meta = json.loads(row[0]) - - artist_listeners[lastfm_artist_to_mbid[meta["name"]]] = \ - (meta["listeners"], meta["playcount"]) - - for similar in [s for s in meta["similar"] if s["mbid"] is not None]: - related.append(( - lastfm_artist_to_mbid[similar["name"]], - lastfm_artist_to_mbid[meta["name"]], - similar["match"] - )) - - with open("repo/lastfm_artist.csv", "w") as out: - writer = csv.writer(out) - writer.writerow([ - "id:ID(Artist)", "name", "sortname", "year:short", "comment", ":LABEL", "listeners:int", "playcount:int" - ]) - - with open("repo/artist.csv") as f: - reader = csv.reader(f) - - reader.__next__() # Skip header - for row in reader: - writer.writerow([ - row[0], - row[1], - row[2], - row[3], - 
row[4], - row[5], - artist_listeners.get(row[0], (0, 0))[0], - artist_listeners.get(row[0], (0, 0))[1], - ]) - artists.add(row[0]) - - with open("repo/lastfm_artist_artist.csv", "w") as out: - out.write(",".join(( - ":START_ID(Artist)", ":END_ID(Artist)", "weight:float" - )) + "\n") - - for x in related: - if x[0] in artists and x[1] in artists: - out.write(",".join(x) + "\n") - - -patch(sys.argv[1]) diff --git a/process_mb_dump.py b/process_mb_dump.py deleted file mode 100644 index f84ba2b..0000000 --- a/process_mb_dump.py +++ /dev/null @@ -1,466 +0,0 @@ -import os -from collections import defaultdict -import re -from statistics import median - -links = dict() -link_types = dict() -areas = dict() -labels = dict() -label_types = { - "\\N": "" -} -release_groups = dict() -release_statuses = dict() -release_to_release_group_map = dict() -release_types = { - "\\N": "", -} -artists = dict() -tags = dict() - -release_release_rel_map = { - "covers and versions": "", - "remixes and compilations": "", - "DJ-mix": "IS_DJ_MIX_OF", - "live performance": "IS_LIVE_PERFORMANCE_OF", - "cover": "IS_COVER_OF", - "remix": "IS_REMIX_OF", - "mashes up": "IS_MASHUP_OF", - "included in": "INCLUDED_IN", - "single from": "IS_SINGLE_FROM" -} - -artist_release_rel_map = { - "translator": "TRANSLATED", - "liner notes": "WROTE_LINER_NOTES", - "lyricist": "IS_LYRICIST_FOR", - "lacquer cut": "DID_LACQUER_CUT_FOR", - "samples from artist": "HAS_SAMPLES_IN", - "remixes and compilations": "", - "composition": "COMPOSED", - "booking": "DID_BOOKING_FOR", - "balance": "DID_BALANCE_FOR", - "misc": "HAS_MISC_ROLE_IN", - "conductor": "CONDUCTED", - "legal representation": "PROVIDED_LEGAL_REPRESENTATION_FOR", - "design/illustration": "DID_DESIGN_FOR", - "performing orchestra": "PERFORMED_FOR", - "producer": "PRODUCED", - "instrument": "PERFORMED_INSTRUMENT_FOR", - "writer": "WROTE_LYRICS_FOR", - "production": "DID_PRODUCTION_FOR", - "performance": "PERFORMED_FOR", - "composer": "IS_COMPOSER_FOR", - "sound": "DID_SOUND_FOR", - "remixer": "DID_REMIXING_FOR", - "orchestrator": "IS_ORCHESTRATOR_FOR", - "compiler": "DID_COMPILATION_FOR", - "vocal arranger": "IS_ARRANGER_FOR", - "arranger": "IS_ARRENGER_FOR", - "mix-DJ": "MIXED", - "editor": "IS_EDITOR_FOR", - "illustration": "DID_ILLUSTRATION_FOR", - "audio": "DID_AUDIO_FOR", - "publishing": "IS_PUBLISHER_FOR", - "art direction": "DID_ART_DIRECTOR_FOR", - "design": "DID_DESIGN_FOR", - "instrument arranger": "IS_ARRANGER_FOR", - "chorus master": "IS_CHORUS_MASTER_FOR", - "photography": "DID_PHOTOGRAPHY_FOR", - "performer": "PERFORMED_IN", - "graphic design": "DID_GRAPHIC_DESIGN_FOR", - "booklet editor": "IS_BOOKLET_EDITOR_FOR", - "programming": "DID_PROGRAMING_FOR", - "copyright": "IS_COPYRIGHT_HOLDER_OF", - "piano technician": "IS_PIANO_TECNICIAN_FOR", - "phonographic copyright": "IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF", - "mastering": "DID_MASTERING_FOR", - "vocal": "PERFORED_VOCALS_FOR", - "librettist": "IS_LIBRETTIST_FOR", - "mix": "MIXED", - "recording": "DID_RECORDING_FOR", - "concertmaster": "IS_CONCERTMASTER_FOR", - "engineer": "IS_ENGINEER_FOR", - - # release_group - "tribute": "IS_TRIBUTE_TO", - "dedicated to": "IS_DEDICATED_TO", - "creative direction": "", - "artists and repertoire": "" -} - -artist_artist_rel_map = { - "teacher": "TEACHER_OF", - "composer-in-residence": "HAS_COMPOSER-IN-RESIDENCE_STATUS_IN", - "member of band": "IS_MEMBER_OF", - "voice actor": "IS_VOICE_ACTOR_OF", - "tribute": "IS_TRIBUTE_TO", - "supporting musician": "IS_SUPPORTING_MUSICIAN_OF", - 
"instrumental supporting musician": "IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF", - "personal relationship": "HAS_PERSONAL_RELATIONSHIP_WITH", - "musical relationships": "HAS_MUSICAL_RELATIONSHIP_WITH", - "collaboration": "HAS_COLLABORATED_WITH", - "married": "IS_MARRIED_WITH", - "sibling": "IS_SIBLING_OF", - "parent": "IS_PARENT_OF", - "is person": "IS", - "conductor position": "IS_CONDUCTOR_OF", - "vocal supporting musician": "DOES_VOCAL_SUPPORT_FOR", - "artistic director": "IS_ARTIST_DIRECTOR_OF", - "subgroup": "IS_SUBGROUP_OF", - "founder": "IS_FOUNDER_OF", - "involved with": "IS_INVOLVED_WITH", - "named after": "IS_NAMED_AFTER", -} - -label_label_rel_map = { - "label rename": "WAS_RENAMED_TO", - "imprint": "DOES_IMPRINT_FOR", - "label distribution": "DOES_DISTRIBUTION_FOR", - "business association": "HAS_BUSINESS_ASSOCIATION_TO", - "label ownership": "OWNS", - "label reissue": "DOES_REISSUING_FOR" -} - -if not os.path.exists("repo"): - os.mkdir("repo") -else: - os.system("rm repo/*") -if not os.path.exists("tmp"): - os.mkdir("tmp") -else: - os.system("rm tmp/*") - -with open("in/link", "r") as f: - for line in f: - cols = line.split("\t") - links[cols[0]] = cols - -with open("in/release_status", "r") as f: - for line in f: - cols = line.split("\t") - release_statuses[cols[0]] = cols - -with open("in/link_type", "r") as f: - for line in f: - cols = line.split("\t") - link_types[cols[0]] = cols - -with open("in/area", "r") as f: - for line in f: - cols = line.split("\t") - areas[cols[0]] = cols - -with open("in/label_type") as f: - for line in f: - cols = line.split("\t") - - label_types[cols[0]] = ";" + cols[1].replace(" ", "") - - if cols[3] != "\\N" and cols[2] in label_types: - label_types[cols[0]] += label_types[cols[2]].replace(" ", "") - -with open("in/artist") as f: - for line in f: - cols = line.split("\t") - artists[cols[0]] = cols - -with open("repo/area_area.csv", "w") as out: - out.write(":START_ID(Area),:END_ID(Area)\n") - - with open("in/l_area_area", "r") as f: - for line in f: - cols = line.split("\t") - out.write(",".join((areas[cols[3]][1], - areas[cols[2]][1] - )) + "\n") - -with open("repo/area.csv", "w") as out: - out.write("id:ID(Area),name\n") - - for k, area in areas.items(): - out.write(",".join((area[1], - '"' + area[2] + '"' - )) + "\n") - -# ------ - - -out_artist = open("repo/artist.csv", "w") -out_artist_area = open("repo/artist_area.csv", "w") - -out_artist.write("id:ID(Artist),name,sortname,year:int,comment,:LABEL\n") -out_artist_area.write(":START_ID(Artist),:END_ID(Area)\n") - -ASCII_RE = re.compile(r"[^a-zA-Z0-9.\-!?& ]") -ALPHANUM_RE = re.compile(r"[^\w.\-!?& ]") - -for _, artist in artists.items(): - - sortname = ASCII_RE.sub("_", artist[2]).upper() - if sortname.replace("_", "").strip() == "": - sortname = ALPHANUM_RE.sub("_", artist[3]).upper() - - out_artist.write(",".join(( - artist[1], - '"' + artist[2].replace("\"", "\"\"") + '"', - sortname, - artist[4] if artist[4] != "\\N" else "0", - ('"' + artist[13].replace("\"", "\"\"") + '"') if artist[13] != "\\N" else "", - "Artist" + (";Group\n" if artist[10] == "2" else "\n") - ))) - - if artist[11] != "\\N": - out_artist_area.write(artist[1] + "," + areas[artist[11]][1] + "\n") - -out_artist.close() -out_artist_area.close() - -with open("repo/artist_artist.csv", "w") as out: - out.write(":START_ID(Artist),:END_ID(Artist),:TYPE\n") - - with open("in/l_artist_artist", "r") as f: - for line in f: - cols = line.split("\t") - out.write(",".join(( - artists[cols[2]][1], - artists[cols[3]][1], - 
artist_artist_rel_map[link_types[links[cols[1]][1]][6]] + "\n" - ))) - -# -------- - -with open("in/release_group_primary_type") as f: - for line in f: - cols = line.split("\t") - release_types[cols[0]] = ";" + cols[1] - -release_group_year = dict() -with open("in/release_group_meta") as f: - for line in f: - cols = line.split("\t") - release_group_year[cols[0]] = cols[2] if cols[2] != "\\N" else "0" - -with open("repo/release.csv", "w") as out: - out.write("id:ID(Release),name,year:int,:LABEL\n") - - with open("in/release_group") as f: - for line in f: - cols = line.split("\t") - out.write(",".join(( - cols[1], - '"' + cols[2].replace("\"", "\"\"") + '"', - release_group_year[cols[0]], - "Release" + release_types[cols[4]], - )) + "\n") - - release_groups[cols[0]] = cols - -with open("in/release") as f: - for line in f: - cols = line.split("\t") - if cols[5] != '\\N' and release_statuses[cols[5]][1] == "Official": - release_to_release_group_map[cols[0]] = cols[4] - -credit_names = defaultdict(list) - -with open("in/artist_credit_name") as f: - for line in f: - cols = line.split("\t") - credit_names[cols[0]].append(artists[cols[2]][1]) - -with open("tmp/tmp_artist_release.csv", "w") as out: - out.write(":START_ID(Artist),:END_ID(Release),:TYPE\n") - - # Is this part really necessary? - with open("in/l_artist_release") as f: - for line in f: - cols = line.split("\t") - if cols[3] in release_to_release_group_map: - out.write(",".join(( - artists[cols[2]][1], - release_groups[release_to_release_group_map[cols[3]]][1], - artist_release_rel_map[link_types[links[cols[1]][1]][6]] - )) + "\n") - - # Artist credits - with open("in/release") as f: - for line in f: - cols = line.split("\t") - if cols[0] in release_to_release_group_map: - for credit in credit_names[cols[3]]: - out.write(",".join(( - credit, - release_groups[release_to_release_group_map[cols[0]]][1], - "CREDITED_FOR" - )) + "\n") - -# Remove dupes -os.system("(head -n 1 tmp/tmp_artist_release.csv && tail -n +2 tmp/tmp_artist_release.csv" - " | sort) | uniq > repo/artist_release.csv && rm tmp/tmp_artist_release.csv") - - -with open("repo/release_release.csv", "w") as out: - out.write(":START_ID(Release),:END_ID(Release),:TYPE\n") - - with open("in/l_release_group_release_group") as f: - for line in f: - cols = line.split("\t") - out.write(",".join(( - release_groups[cols[2]][1], - release_groups[cols[3]][1], - release_release_rel_map[link_types[links[cols[1]][1]][6]] - )) + "\n") - -# --- - -tag_occurence = defaultdict(int) -with open("in/release_group_tag") as f: - for line in f: - tag_occurence[line.split("\t")[1]] += 1 - -with open("in/tag") as f: - with open("repo/tag.csv", "w") as out: - out.write("id:ID(Tag),name, occurences\n") - - for line in f: - cols = line.split("\t") - if tag_occurence[cols[0]] < 5: - continue - tags[cols[0]] = cols - out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"," + str(tag_occurence[cols[0]]) + "\n") - -with open("repo/release_tag.csv", "w") as out: - out.write(":START_ID(Release),:END_ID(Tag),weight:float\n") - - # get max count - max_count = 0 - with open("in/release_group_tag") as f: - for line in f: - cols = line.split("\t") - max_count = max(max_count, int(cols[2])) - max_count = max_count / 4 - - # weight is linear - with open("in/release_group_tag") as f: - for line in f: - cols = line.split("\t") - count = int(cols[2]) - if count <= 0: - continue - if cols[1] not in tags: - continue - out.write(",".join(( - release_groups[cols[0]][1], - cols[1], - str(max(min(count / max_count, 1), 
0.2)),
-            )) + "\n")
-            tag_occurence[cols[1]] += 1
-
-
-with open("repo/artist_tag.csv", "w") as out:
-    out.write(":START_ID(Artist),:END_ID(Tag),weight:float\n")
-
-    # get max count
-    max_count = 0
-    with open("in/artist_tag") as f:
-        for line in f:
-            cols = line.split("\t")
-            max_count = max(max_count, int(cols[2]))
-    max_count = max_count / 4
-
-    # Weight is linear
-    with open("in/artist_tag") as f:
-        for line in f:
-            cols = line.split("\t")
-
-            count = int(cols[2])
-            if count <= 0:
-                continue
-            if cols[1] not in tags:
-                continue
-
-            out.write(",".join((
-                artists[cols[0]][1],
-                cols[1],
-                str(max(min(count / max_count, 1), 0.2)),
-            )) + "\n")
-
-with open("repo/tag_tag.csv", "w") as out:
-    out.write(":START_ID(Tag),:END_ID(Tag),weight:float\n")
-
-    def weights():
-        with open("in/tag_relation") as f:
-            for line in f:
-                weight = int(line.split("\t")[2])
-                if weight < 5:
-                    continue
-                yield weight
-    weight_median = median(weights()) * 3
-
-    with open("in/tag_relation") as f:
-        for line in f:
-            cols = line.split("\t")
-
-            weight = int(cols[2])
-            if weight < 5:
-                continue
-            if cols[0] not in tags or cols[1] not in tags:
-                continue
-
-            out.write(",".join((
-                cols[0],
-                cols[1],
-                str(max(min(weight / weight_median, 1), 0.2)),
-            )) + "\n")
-
-# -----
-
-with open("repo/labels.csv", "w") as out:
-    out.write("id:ID(Label),name,sortname,code,:LABEL\n")
-
-    with open("in/label") as f:
-        for line in f:
-            cols = line.split("\t")
-            labels[cols[0]] = cols
-
-            sortname = ASCII_RE.sub("_", cols[2]).upper()
-            out.write(",".join((
-                cols[1],
-                "\"" + cols[2].replace("\"", "\"\"") + "\"",
-                sortname,
-                cols[9] if cols[9] != "\\N" else "",
-                "Label" + label_types[cols[10]]
-            )) + "\n")
-
-with open("repo/release_label.csv", "w") as out:
-    out.write(":START_ID(Release),:END_ID(Label)\n")
-
-    # Should I check link types here?
-    with open("in/l_label_release_group") as f:
-        for line in f:
-            cols = line.split("\t")
-            out.write(release_groups[cols[3]][1] + "," + labels[cols[2]][1] + "\n")
-
-    with open("in/l_label_release") as f:
-        for line in f:
-            cols = line.split("\t")
-            if cols[3] in release_to_release_group_map:
-                out.write(release_groups[release_to_release_group_map[cols[3]]][1] + "," + labels[cols[2]][1] + "\n")
-
-
-with open("repo/label_label.csv", "w") as out:
-    out.write(":START_ID(Label),:END_ID(Label),:TYPE\n")
-
-    with open("in/l_label_label") as f:
-        for line in f:
-            cols = line.split("\t")
-
-            out.write(",".join((
-                labels[cols[2]][1],
-                labels[cols[3]][1],
-                label_label_rel_map[link_types[links[cols[1]][1]][6]]
-            )) + "\n")
-
-# ---
diff --git a/spotify b/spotify
deleted file mode 160000
index 4ac596b..0000000
--- a/spotify
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4ac596b2ff7659b880ac8a3fe9c58ea6527c2efc
diff --git a/spotify2 b/spotify2
deleted file mode 160000
index 0a05c69..0000000
--- a/spotify2
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 0a05c69bcf7005496c2efdf5b825ffa2f443ccdf
diff --git a/task_get_cover.py b/task_get_cover.py
new file mode 100644
index 0000000..5211d6b
--- /dev/null
+++ b/task_get_cover.py
@@ -0,0 +1,85 @@
+from io import BytesIO
+
+import psycopg2
+import requests
+from PIL import Image
+
+import config
+
+
+def should_download(image: dict):
+    return image["front"] is True
+
+
+def thumb(cover_blob):
+    with Image.open(BytesIO(cover_blob)) as image:
+
+        # https://stackoverflow.com/questions/43978819 (reassign: point() returns a new image)
+        if image.mode == "I;16":
+            image.mode = "I"
+            image = image.point(lambda i: i * (1. / 256)).convert('L')
+
+        image.thumbnail((256, 256), Image.BICUBIC)
+        canvas = Image.new("RGB", image.size, 0x000000)
+
+        if image.mode in ('RGBA', 'LA') or (image.mode == 'P' and 'transparency' in image.info):
+            try:
+                canvas.paste(image, mask=image.split()[-1])
+            except ValueError:
+                canvas.paste(image)
+        else:
+            canvas.paste(image)
+
+        blob = BytesIO()
+        canvas.save(blob, "JPEG", quality=85, optimize=True)
+        canvas.close()
+
+        return blob.getvalue()
+
+
+def save(mbid, tn):
+    with psycopg2.connect(config.connstr()) as conn:
+        cur = conn.cursor()
+        cur.execute(
+            "INSERT INTO mg.covers (mbid, tn) VALUES (%s,%s) ON CONFLICT (mbid) "
+            "DO UPDATE SET tn = excluded.tn",
+            (mbid, tn)
+        )
+        conn.commit()
+
+
+def get_mbids(count=1):
+    with psycopg2.connect(config.connstr()) as conn:
+        cur = conn.cursor()
+        cur.execute(
+            "SELECT gid FROM release_group "
+            "LEFT JOIN mg.covers ON gid = mbid "
+            "WHERE tn IS NULL "
+            "ORDER BY ts NULLS FIRST LIMIT %s",
+            (count,)
+        )
+        for row in cur:
+            yield row[0]
+
+
+def download(mbid):
+    r = requests.get("https://coverartarchive.org/release-group/%s/front-250.jpg" % mbid)
+
+    if r.status_code == 200:
+        return r.content
+    if r.status_code != 404:
+        print("<%d> %s" % (r.status_code, r.text))
+    return None
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--count", type=int, default=1)
+    args = parser.parse_args()
+
+    for mbid in get_mbids(args.count):
+        tn = download(mbid)
+        save(mbid, tn)
+        print(mbid)
diff --git a/task_get_lastfm.py b/task_get_lastfm.py
new file mode 100755
index 0000000..328e552
--- /dev/null
+++ b/task_get_lastfm.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+
+import json
+from itertools import repeat
+
+import config
+
+import psycopg2
+import requests
+
+
+def get_mbid(lfm_name):
+    cur = conn.cursor()
+    cur.execute("SELECT mbid "
+                "FROM mg.lastfm_artist WHERE name=%s", (lfm_name,))
+    row = cur.fetchone()
+    return row[0] if row else None
+
+
+def set_mbid(lfm_name, mbid):
+    cur = conn.cursor()
+    cur.execute("INSERT INTO mg.lastfm_artist VALUES (%s,%s) ON CONFLICT (name) "
+                "DO UPDATE SET mbid=excluded.mbid", (lfm_name, mbid))
+
+
+def save_tags(lfm_name, tags):
+    if not tags:
+        return
+    cur = conn.cursor()
+
+    cur.execute("DELETE FROM mg.lastfm_artist_tag WHERE name=%s", (lfm_name,))
+    cur.execute(
+        "INSERT INTO mg.lastfm_artist_tag VALUES %s" %
+        ",".join("('%s', '%s')" % (n.replace("'", "''"), t.replace("'", "''")) for (n, t) in zip(repeat(lfm_name), tags))
+    )
+
+
+def save_data(data):
+    if data:
+        disambiguate(data["name"], mbid=data["artist"])
+
+        for similar in [s for s in data["similar"] if s["mbid"] is not None]:
+            disambiguate(similar["name"], similar["mbid"])
+            save_similar(data["name"], similar["name"], similar["match"])
+
+        save_tags(data["name"], data["tags"])
+        save_meta(data["name"], data["listeners"], data["playcount"])
+
+
+def save_similar(lfm_name, similar, weight):
+    cur = conn.cursor()
+
+    cur.execute(
+        "INSERT INTO mg.lastfm_artist_artist (name0, name1, weight) VALUES (%s,%s,%s) "
+        "ON CONFLICT (name0, name1) DO UPDATE SET weight=excluded.weight, ts=CURRENT_TIMESTAMP",
+        (lfm_name, similar, weight)
+    )
+
+
+def save_meta(lfm_name, listeners, playcount):
+    cur = conn.cursor()
+    cur.execute("INSERT INTO mg.lastfm_artist_meta VALUES (%s,%s,%s) ON CONFLICT (name) "
+                "DO UPDATE SET listeners=excluded.listeners, playcount=excluded.playcount",
+                (lfm_name, listeners, playcount))
+
+
+def save_raw_data(name, mbid, data):
+    cur = conn.cursor()
+    cur.execute("INSERT INTO mg.lastfm_raw_data (name, mbid, data) VALUES (%s,%s,%s) "
+                "ON CONFLICT (name, mbid) DO UPDATE SET ts=CURRENT_TIMESTAMP, data=excluded.data",
+                (name, mbid, json.dumps(data)))
+
+
+def get_release_count(mbid):
+    cur = conn.cursor()
+    cur.execute('SELECT COUNT(*) '
+                'FROM l_artist_release '
+                'INNER JOIN artist a ON entity0 = a.id '
+                'WHERE a.gid = %s', (mbid,))
+    row = cur.fetchone()
+    return row[0] if row else 0
+
+
+def disambiguate(name, mbid):
+    """
+    A lastfm artist name can refer to multiple MBIDs
+    For RELATED_TO purposes, we assume that the MBID referring
+    to the artist with the most official releases is the one
+    """
+    existing_mbid = get_mbid(name)
+
+    if existing_mbid and mbid != existing_mbid:
+        if get_release_count(existing_mbid) < get_release_count(mbid):
+            set_mbid(name, mbid)
+    else:
+        set_mbid(name, mbid)
+
+
+def get_cached_artist_data(name, mbid, max_age_days):
+    cur = conn.cursor()
+    cur.execute("SELECT data FROM mg.lastfm_raw_data WHERE name=%s AND mbid=%s "
+                "AND date_part('day', CURRENT_TIMESTAMP - ts) <= %s ",
+                (name, mbid, max_age_days))
+
+    row = cur.fetchone()
+    return row[0] if row else 0
+
+
+def get_artist_data(name: str, mbid: str):
+    cached_data = get_cached_artist_data(name, mbid, max_age_days=30)
+    if cached_data:
+        return cached_data
+
+    raw = []
+    url = "https://ws.audioscrobbler.com/2.0/?method=artist.getinfo&mbid=%s&api_key=%s&format=json" % \
+          (mbid, config.config["LASTFM_APIKEY"],)
+    r = requests.get(url)
+    raw.append((url, r.text))
+    info_json = r.json()
+
+    by_name = False
+
+    if "artist" not in info_json:
+        url1 = "https://ws.audioscrobbler.com/2.0/?method=artist.getinfo&artist=%s&api_key=%s&format=json" % \
+               (requests.utils.quote(name), config.config["LASTFM_APIKEY"],)
+        r = requests.get(url1)
+        raw.append((url1, r.text))
+        info_json = r.json()
+        if "artist" not in info_json:
+            if "Rate Limit Exceeded" in r.text:
+                raise Exception("Rate Limit Exceeded!")
+            data = {
+                "_raw": raw
+            }
+            save_raw_data(name, mbid, data)
+            return
+        by_name = True
+
+    if by_name:
+        url2 = "https://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=%s&api_key=%s&format=json" % (
+            requests.utils.quote(name), config.config["LASTFM_APIKEY"],)
+    else:
+        url2 = "https://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&mbid=%s&api_key=%s&format=json" % (
+            mbid, config.config["LASTFM_APIKEY"],)
+    r2 = requests.get(url2)
+    raw.append((url2, r2.text))
+    similar_json = r2.json()
+
+    data = {
+        "artist": mbid,
+        "name": info_json["artist"]["name"],
+        "mbid": info_json["artist"]["mbid"] if "mbid" in info_json["artist"] else None,
+        "tags": [t["name"] for t in info_json["artist"]["tags"]["tag"]] if "tags" in info_json["artist"] and "tag" in
+                info_json["artist"]["tags"] else [],
+        "listeners": info_json["artist"]["stats"]["listeners"],
+        "playcount": info_json["artist"]["stats"]["playcount"],
+        "similar": [
+            {
+                "mbid": a["mbid"] if "mbid" in a else None,
+                "match": a["match"],
+                "name": a["name"]
+            }
+            for a in similar_json["similarartists"]["artist"]],
+        "_raw": raw
+    }
+
+    save_raw_data(name, mbid, data)
+
+    return data
+
+
+def get_task(count=1):
+    cur = conn.cursor()
+    cur.execute(
+        "SELECT artist.name, artist.gid FROM artist "
+        "LEFT JOIN mg.lastfm_raw_data lfm ON lfm.mbid=gid AND lfm.name=artist.name "
+        "ORDER BY lfm.ts NULLS FIRST LIMIT %s",
+        (count,)
+    )
+    return cur.fetchall()
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--count", type=int, default=1)
+    args = parser.parse_args()
+
+    conn = psycopg2.connect(config.connstr())
+
+    for task in 
get_task(args.count): + save_data(get_artist_data(*task)) + conn.commit() + print(task[0]) + + conn.close() diff --git a/task_get_spotify.py b/task_get_spotify.py new file mode 100755 index 0000000..95253ff --- /dev/null +++ b/task_get_spotify.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +import json +from itertools import repeat + +import psycopg2 +import spotipy +from hexlib.misc import silent_stdout +from spotipy.oauth2 import SpotifyClientCredentials + +import config + + +def save_raw(query, endpoint, data): + cur = conn.cursor() + cur.execute( + "INSERT INTO mg.spotify_raw_data (query, endpoint, data) VALUES (%s,%s,%s) " + "ON CONFLICT (query, endpoint) " + "DO UPDATE SET ts=CURRENT_TIMESTAMP, data=excluded.data", + (query, endpoint, json.dumps(data)) + ) + + +def save_artist(data, max_age_days=30): + """Returns True if artist is new (and therefore, its albums, tracks etc. should be fetched)""" + + cur = conn.cursor() + + cur.execute("SELECT spotid FROM mg.spotify_artist_meta WHERE spotid=%s AND " + "date_part('day', CURRENT_TIMESTAMP - ts) <= %s", (data["id"], max_age_days,)) + if cur.fetchone(): + return False + + cur.execute( + "INSERT INTO mg.spotify_artist_meta (spotid, name, followers, popularity) " + "VALUES (%s,%s,%s,%s) " + "ON CONFLICT (spotid) " + "DO UPDATE SET name=excluded.name, followers=excluded.followers, popularity=excluded.popularity", + (data["id"], data["name"], data["followers"]["total"], data["popularity"]) + ) + + cur.execute("DELETE FROM mg.spotify_artist_tag WHERE spotid=%s", (data["id"],)) + if data["genres"]: + cur.execute( + "INSERT INTO mg.spotify_artist_tag VALUES %s" % + ",".join("('%s', '%s')" % (n, t.replace("'", "''")) for (n, t) in zip(repeat(data["id"]), data["genres"])) + ) + return True + + +def get_albums(spotid): + data = silent_stdout(spotify.artist_albums, spotid, album_type="album,single,compilation") + save_raw(spotid, "artist_albums", data) + + cur = conn.cursor() + cur.execute("DELETE FROM mg.spotify_artist_album WHERE spotid=%s", (spotid,)) + if data["items"]: + cur.execute( + "INSERT INTO mg.spotify_artist_album VALUES %s" % + ",".join("('%s', '%s')" % (n, t.replace("'", "''")) + for (n, t) in zip(repeat(spotid), set(a["name"] for a in data["items"]))) + ) + return list() + + +def get_tracks(spotid): + data = silent_stdout(spotify.artist_top_tracks, spotid) + save_raw(spotid, "artist_top_tracks", data) + + cur = conn.cursor() + cur.execute("DELETE FROM mg.spotify_artist_track WHERE spotid=%s", (spotid,)) + + unique_tracks = [] + done = set() + for track in data["tracks"]: + if track["name"] in done: + continue + unique_tracks.append((track["name"], track["preview_url"])) + done.add(track["name"]) + + if unique_tracks: + cur.execute( + "INSERT INTO mg.spotify_artist_track (spotid, track, url) VALUES %s" % + ",".join("('%s', '%s', '%s')" % (i, t[0].replace("'", "''"), t[1]) + for (i, t) in zip(repeat(spotid), unique_tracks)) + ) + + +def related(spotid): + data = silent_stdout(spotify.artist_related_artists, spotid) + save_raw(spotid, "artist_related_artists", data) + return data["artists"] + + +def save_artist_artist(id0, relations): + if relations: + cur = conn.cursor() + cur.execute( + "INSERT INTO mg.spotify_artist_artist (spotid0, spotid1, index) " + "VALUES %s " + "ON CONFLICT (spotid0, spotid1) " + "DO NOTHING" % + ",".join("('%s', '%s', '%d')" % (r[0], r[1]["id"], i) for (i, r) in enumerate(zip(repeat(id0), relations))) + ) + + +def get_mbids_with_matching_name(name): + cur = conn.cursor() + cur.execute( + "SELECT gid FROM artist " 
+ "WHERE asciifold_lower(name)=asciifold_lower(%s)", + (name,) + ) + rows = cur.fetchall() + + return [r[0] for r in rows] + + +def resolve_spotify_conflict(mbid, existing_spotid, new_spotid): + cur = conn.cursor() + cur.execute( + "SELECT asciifold_lower(album) FROM mg.spotify_artist_album WHERE spotid=%s", + (new_spotid,) + ) + new_albums = set(row[0] for row in cur.fetchall()) + + if len(new_albums) == 0: + return + + cur.execute( + "SELECT asciifold_lower(album) FROM mg.spotify_artist_album WHERE spotid=%s", + (existing_spotid,) + ) + existing_albums = set(row[0] for row in cur.fetchall()) + + if len(existing_albums) != 0: + cur.execute( + "SELECT DISTINCT asciifold_lower(release.name) FROM release " + "INNER JOIN artist_credit_name cn ON cn.artist_credit = release.artist_credit " + "INNER JOIN artist a on a.id = cn.artist " + "WHERE a.gid=%s", (mbid,) + ) + mb_albums = set(row[0] for row in cur.fetchall()) + if len(new_albums.intersection(mb_albums)) > len(existing_albums.intersection(mb_albums)): + cur.execute("UPDATE mg.spotify_artist SET spotid = %s WHERE mbid=%s", (new_spotid, mbid)) + + +def resolve_mb_conflict(spotid, mbids): + cur = conn.cursor() + + cur.execute( + "SELECT asciifold_lower(album) FROM mg.spotify_artist_album WHERE spotid=%s", + (spotid,) + ) + spot_albums = set(row[0] for row in cur.fetchall()) + + best_match_count = -1 + best_match = None + + if len(spot_albums) == 0: + # We can't base our conflict resolution based on album names, + # pick the one with the most releases + for mbid in mbids: + cur.execute( + "SELECT count(release.name) FROM release " + "INNER JOIN artist_credit_name cn ON cn.artist_credit = release.artist_credit " + "INNER JOIN artist a on a.id = cn.artist " + "WHERE a.gid = %s ", + (mbid,) + ) + match_count = cur.fetchone()[0] + if match_count > best_match_count: + best_match_count = match_count + best_match = mbid + else: + for mbid in mbids: + cur.execute( + "SELECT asciifold_lower(release.name) FROM release " + "INNER JOIN artist_credit_name cn ON cn.artist_credit = release.artist_credit " + "INNER JOIN artist a on a.id = cn.artist " + "WHERE a.gid = %s ", + (mbid,) + ) + match_count = len(set(row[0] for row in cur.fetchall()).intersection(spot_albums)) + if match_count > best_match_count: + best_match_count = match_count + best_match = mbid + + save_spotid_to_mbid(spotid, best_match) + + +def save_spotid_to_mbid(spotid, mbid): + cur = conn.cursor() + cur.execute( + "SELECT spotid FROM mg.spotify_artist WHERE mbid=%s", + (mbid,) + ) + row = cur.fetchone() + if row: + resolve_spotify_conflict(mbid, row[0], spotid) + else: + cur.execute( + "INSERT INTO mg.spotify_artist (spotid, mbid) VALUES (%s,%s)", + (spotid, mbid) + ) + + +def search_artist(name): + quoted_name = "\"%s\"" % name + + data = silent_stdout(spotify.search, quoted_name, type="artist", limit=20) + save_raw(name, "search", data) + + for result in data["artists"]["items"]: + if save_artist(result): + mbids = get_mbids_with_matching_name(result["name"]) + + get_albums(result["id"]) + get_tracks(result["id"]) + + if len(mbids) > 1: + resolve_mb_conflict(result["id"], mbids) + elif len(mbids) == 1: + save_spotid_to_mbid(result["id"], mbids[0]) + + save_artist_artist(result["id"], related(result["id"])) + + +def get_tasks(count=1): + cur = conn.cursor() + cur.execute( + "SELECT artist.name FROM artist " + "LEFT JOIN mg.spotify_artist sa ON sa.mbid=gid " + "LEFT JOIN mg.spotify_raw_data srd ON srd.query=artist.name AND endpoint='search' " + "LEFT JOIN mg.spotify_artist_meta sam ON 
sa.spotid=sam.spotid " + "ORDER BY sam.ts NULLS FIRST, srd.ts NULLS FIRST LIMIT %s", + (count,) + ) + for row in cur: + yield row[0] + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--count", type=int, default=1) + args = parser.parse_args() + + conn = psycopg2.connect(config.connstr()) + client_credentials_manager = SpotifyClientCredentials( + client_id=config.config["SPOTIFY_CLIENTID"], + client_secret=config.config["SPOTIFY_SECRET"] + ) + spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager) + + for name in get_tasks(args.count): + search_artist(name) + conn.commit() + print(name) + + conn.close() diff --git a/task_tracker_drone b/task_tracker_drone deleted file mode 160000 index aa15a1b..0000000 --- a/task_tracker_drone +++ /dev/null @@ -1 +0,0 @@ -Subproject commit aa15a1b29e2fc7f03dafc9301c65e32cb82e4cb4
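
Note on using the new mg.* views: their column aliases ("id:ID(Artist)", ":START_ID"/":END_ID", ":TYPE", ":LABEL") follow the neo4j-admin bulk import header format, taking over from the repo/*.csv files that the deleted process_mb_dump.py used to write. A minimal export sketch, assuming the connection defaults from config.py; the repo/ staging directory and the commented neo4j-admin invocation are illustrative assumptions, not part of the patch itself:

    #!/usr/bin/env bash
    # Dump each mg.* view to CSV with its neo4j import headers (sketch, untested).
    mkdir -p repo
    for v in artist artist_artist release artist_release tag release_tag \
             artist_tag tag_tag label release_label label_label; do
        PGPASSWORD=musicbrainz psql -h 127.0.0.1 -p 5433 -U musicbrainz -d musicbrainz_db \
            -c "\copy (SELECT * FROM mg.$v) TO 'repo/$v.csv' CSV HEADER"
    done
    # Then, roughly (exact flag syntax depends on the Neo4j version):
    # neo4j-admin import --nodes=repo/artist.csv --nodes=repo/release.csv \
    #   --nodes=repo/tag.csv --nodes=repo/label.csv \
    #   --relationships=repo/artist_artist.csv --relationships=repo/artist_release.csv ...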