From c3dc1faa318ed50933e1dcc5dbd7a9561be2c69b Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 20 Jun 2019 10:26:31 -0400 Subject: [PATCH] Add label data, some work on spotify data --- .gitmodules | 6 ++++ download_mb_dump.sh | 3 +- generate_spotify_tasks.py | 48 +++++++++++++++++++++++++++++ generate_spotify_tasks_2.py | 60 +++++++++++++++++++++++++++++++++++++ make_neoj4_db.sh | 4 ++- process_lastfm_data.py | 8 ++--- process_mb_dump.py | 57 ++++++++++++++++++++++++++++++----- seed.cypher | 1 + spotify | 1 + spotify2 | 1 + 10 files changed, 176 insertions(+), 13 deletions(-) create mode 100644 generate_spotify_tasks.py create mode 100644 generate_spotify_tasks_2.py create mode 160000 spotify create mode 160000 spotify2 diff --git a/.gitmodules b/.gitmodules index 11d1bc0..4340f4c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,9 @@ [submodule "caa"] path = caa url = https://git.simon987.net/drone/caa.git +[submodule "spotify"] + path = spotify + url = https://git.simon987.net/drone/spotify +[submodule "spotify2"] + path = spotify2 + url = https://git.simon987.net/drone/spotify2 diff --git a/download_mb_dump.sh b/download_mb_dump.sh index ff3bf47..f435b19 100755 --- a/download_mb_dump.sh +++ b/download_mb_dump.sh @@ -11,7 +11,8 @@ wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/m tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \ mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \ mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \ -mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status +mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status mbdump/l_label_release \ +mbdump/l_label_release_group tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \ mbdump/release_group_meta 
diff --git a/generate_spotify_tasks.py b/generate_spotify_tasks.py
new file mode 100644
index 0000000..45adb7f
--- /dev/null
+++ b/generate_spotify_tasks.py
@@ -0,0 +1,48 @@
+import csv
+import json
+from multiprocessing.pool import ThreadPool
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 6
+
+api = TaskTrackerApi(TT_API_URL)
+
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("mm worker")
+    worker.dump_to_file()
+worker.request_access(TT_PROJECT, True, True)
+input("Give permission to " + worker.alias)
+
+with open("repo/artist.csv") as f:
+    reader = csv.reader(f)
+
+    def mktask(lines):
+        """Submit one batch of rows as a single task (columns 0/1 sent as mbid/name)."""
+        res = worker.submit_task(
+            project=TT_PROJECT,
+            recipe=json.dumps(
+                [{"mbid": line[0], "name": line[1]} for line in lines]
+            ),
+            unique_str=lines[0][0],
+            max_assign_time=60 * 5,
+        )
+        print(res.text)
+
+    def lines():
+        """Yield the rows of artist.csv in batches of at most 30 rows."""
+        line_batch = list()
+        for line in reader:
+            line_batch.append(line)
+            if len(line_batch) >= 30:
+                yield line_batch
+                line_batch = list()
+        if line_batch:  # flush the final partial batch instead of dropping it
+            yield line_batch
+
+    tasks = list(lines())
+
+    pool = ThreadPool(processes=25)
+    pool.map(func=mktask, iterable=tasks)
diff --git a/generate_spotify_tasks_2.py b/generate_spotify_tasks_2.py
new file mode 100644
index 0000000..9828c04
--- /dev/null
+++ b/generate_spotify_tasks_2.py
@@ -0,0 +1,60 @@
+import json
+import sqlite3
+from multiprocessing.pool import ThreadPool
+
+import sys
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 7
+
+api = TaskTrackerApi(TT_API_URL)
+
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("mm worker")
+    worker.dump_to_file()
+worker.request_access(TT_PROJECT, True, True)
+input("Give permission to " + worker.alias)
+
+spotids = set()
+
+with sqlite3.connect(sys.argv[1]) as conn:
+
+    cur = conn.cursor()
+    cur.execute("SELECT data from artist")
+    for row in cur.fetchall():
+        j = json.loads(row[0])
+        if j is None or "artists" not in j or "items" not in j["artists"]:
+            continue
+        for item in j["artists"]["items"]:
+            spotids.add(item["id"])
+
+    def mktask(lines):
+        """Submit one batch of Spotify ids as a single task."""
+        res = worker.submit_task(
+            project=TT_PROJECT,
+            recipe=json.dumps(
+                [{"spotid": line} for line in lines]
+            ),
+            unique_str=lines[0],
+            max_assign_time=60 * 5,
+        )
+        print(res.text)
+
+    def ids():
+        """Yield the collected Spotify ids in batches of at most 30 ids."""
+        id_batch = list()
+        for spotid in spotids:
+            id_batch.append(spotid)
+            if len(id_batch) >= 30:
+                yield id_batch
+                id_batch = list()
+        if id_batch:  # flush the final partial batch instead of dropping it
+            yield id_batch
+
+    tasks = list(ids())
+
+    pool = ThreadPool(processes=25)
+    pool.map(func=mktask, iterable=tasks)
diff --git a/make_neoj4_db.sh b/make_neoj4_db.sh
index 763154c..d6ef5fc 100755
--- a/make_neoj4_db.sh
+++ b/make_neoj4_db.sh
@@ -23,6 +23,7 @@ wget ${REPOSITORY}/release.csv
 wget ${REPOSITORY}/tag.csv
 wget ${REPOSITORY}/tag_tag.csv
 wget ${REPOSITORY}/release_tag.csv
+wget ${REPOSITORY}/release_label.csv
 wget ${REPOSITORY}/release_release.csv
 wget ${REPOSITORY}/artist_tag.csv
 wget ${REPOSITORY}/labels.csv
@@ -46,7 +47,8 @@ wget ${REPOSITORY}/lastfm_artist_artist.csv
     --relationships:IS_RELATED_TO "tag_tag.csv"\
     --relationships "label_label.csv"\
     --relationships "release_release.csv"\
-    --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"
+    --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"\
+    --relationships:RELEASE_UNDER "release_label.csv"
 
 rm *.csv
 cd ..
diff --git a/process_lastfm_data.py b/process_lastfm_data.py index 014297a..ff6c022 100644 --- a/process_lastfm_data.py +++ b/process_lastfm_data.py @@ -15,10 +15,10 @@ def disambiguate(lfm_artist, artist_release_count, name, mbid): lfm_artist[name] = mbid - print("Replacing %s (%s) with %s (%d) for %s" % - (existing_mbid, artist_release_count[existing_mbid], - mbid, artist_release_count[mbid], - name)) + # print("Replacing %s (%s) with %s (%d) for %s" % + # (existing_mbid, artist_release_count[existing_mbid], + # mbid, artist_release_count[mbid], + # name)) else: lfm_artist[name] = mbid diff --git a/process_mb_dump.py b/process_mb_dump.py index 5c610b8..f84ba2b 100644 --- a/process_mb_dump.py +++ b/process_mb_dump.py @@ -1,6 +1,7 @@ import os from collections import defaultdict import re +from statistics import median links = dict() link_types = dict() @@ -314,14 +315,21 @@ with open("repo/release_release.csv", "w") as out: # --- +tag_occurence = defaultdict(int) +with open("in/release_group_tag") as f: + for line in f: + tag_occurence[line.split("\t")[1]] += 1 + with open("in/tag") as f: with open("repo/tag.csv", "w") as out: - out.write("id:ID(Tag),name\n") + out.write("id:ID(Tag),name, occurences\n") for line in f: cols = line.split("\t") + if tag_occurence[cols[0]] < 5: + continue tags[cols[0]] = cols - out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"\n") + out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"," + str(tag_occurence[cols[0]]) + "\n") with open("repo/release_tag.csv", "w") as out: out.write(":START_ID(Release),:END_ID(Tag),weight:float\n") @@ -341,11 +349,15 @@ with open("repo/release_tag.csv", "w") as out: count = int(cols[2]) if count <= 0: continue + if cols[1] not in tags: + continue out.write(",".join(( release_groups[cols[0]][1], cols[1], str(max(min(count / max_count, 1), 0.2)), )) + "\n") + tag_occurence[cols[1]] += 1 + with open("repo/artist_tag.csv", "w") as out: 
out.write(":START_ID(Artist),:END_ID(Tag),weight:float\n") @@ -366,6 +378,8 @@ with open("repo/artist_tag.csv", "w") as out: count = int(cols[2]) if count <= 0: continue + if cols[1] not in tags: + continue out.write(",".join(( artists[cols[0]][1], @@ -374,39 +388,68 @@ with open("repo/artist_tag.csv", "w") as out: )) + "\n") with open("repo/tag_tag.csv", "w") as out: - out.write(":START_ID(Tag),:END_ID(Tag),weight:int\n") + out.write(":START_ID(Tag),:END_ID(Tag),weight:float\n") + + def weights(): + with open("in/tag_relation") as f: + for line in f: + weight = int(line.split("\t")[2]) + if weight < 5: + continue + yield weight + weight_median = median(weights()) * 3 - # TODO: normalize weight so it's between [0,1] with open("in/tag_relation") as f: for line in f: cols = line.split("\t") - if int(cols[2]) <= 0: + weight = int(cols[2]) + if weight < 5: + continue + if cols[0] not in tags or cols[1] not in tags: continue out.write(",".join(( cols[0], cols[1], - cols[2], + str(max(min(weight / weight_median, 1), 0.2)), )) + "\n") # ----- with open("repo/labels.csv", "w") as out: - out.write("id:ID(Label),name,code,:LABEL\n") + out.write("id:ID(Label),name,sortname,code,:LABEL\n") with open("in/label") as f: for line in f: cols = line.split("\t") labels[cols[0]] = cols + sortname = ASCII_RE.sub("_", cols[2]).upper() out.write(",".join(( cols[1], "\"" + cols[2].replace("\"", "\"\"") + "\"", + sortname, cols[9] if cols[9] != "\\N" else "", "Label" + label_types[cols[10]] )) + "\n") +with open("repo/release_label.csv", "w") as out: + out.write(":START_ID(Release),:END_ID(Label)\n") + + # Should I check link types here? 
+ with open("in/l_label_release_group") as f: + for line in f: + cols = line.split("\t") + out.write(release_groups[cols[3]][1] + "," + labels[cols[2]][1] + "\n") + + with open("in/l_label_release") as f: + for line in f: + cols = line.split("\t") + if cols[3] in release_to_release_group_map: + out.write(release_groups[release_to_release_group_map[cols[3]]][1] + "," + labels[cols[2]][1] + "\n") + + with open("repo/label_label.csv", "w") as out: out.write(":START_ID(Label),:END_ID(Label),:TYPE\n") diff --git a/seed.cypher b/seed.cypher index 9b1edce..1dabca7 100644 --- a/seed.cypher +++ b/seed.cypher @@ -1,3 +1,4 @@ CREATE INDEX ON :Artist(id); CREATE INDEX ON :Artist(sortname); CREATE INDEX ON :Release(id); +CREATE INDEX ON :Label(sortname); diff --git a/spotify b/spotify new file mode 160000 index 0000000..4ac596b --- /dev/null +++ b/spotify @@ -0,0 +1 @@ +Subproject commit 4ac596b2ff7659b880ac8a3fe9c58ea6527c2efc diff --git a/spotify2 b/spotify2 new file mode 160000 index 0000000..0a05c69 --- /dev/null +++ b/spotify2 @@ -0,0 +1 @@ +Subproject commit 0a05c69bcf7005496c2efdf5b825ffa2f443ccdf