Mirror of https://github.com/simon987/music-graph-scripts.git (synced 2025-04-10 05:56:42 +00:00)

Commit c3dc1faa31: Add label data, some work on spotify data
Parent: a70b9ffa61
.gitmodules (vendored): 6 lines changed
@@ -7,3 +7,9 @@
 [submodule "caa"]
 	path = caa
 	url = https://git.simon987.net/drone/caa.git
+[submodule "spotify"]
+	path = spotify
+	url = https://git.simon987.net/drone/spotify
+[submodule "spotify2"]
+	path = spotify2
+	url = https://git.simon987.net/drone/spotify2
@@ -11,7 +11,8 @@ wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/m
 tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \
     mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \
     mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \
-    mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status
+    mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status mbdump/l_label_release \
+    mbdump/l_label_release_group
 tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \
     mbdump/release_group_meta
 
generate_spotify_tasks.py (new file): 48 lines
@@ -0,0 +1,48 @@
+import csv
+import json
+from multiprocessing.pool import ThreadPool
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 6
+
+api = TaskTrackerApi(TT_API_URL)
+
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("mm worker")
+    worker.dump_to_file()
+    worker.request_access(TT_PROJECT, True, True)
+    input("Give permission to " + worker.alias)
+
+with open("repo/artist.csv") as f:
+    reader = csv.reader(f)
+
+    def mktask(lines):
+        res = worker.submit_task(
+            project=TT_PROJECT,
+            recipe=json.dumps(
+                [{"mbid": line[0], "name": line[1]} for line in lines]
+            ),
+            unique_str=lines[0][0],
+            max_assign_time=60 * 5,
+        )
+        print(res.text)
+
+    def lines():
+        line_batch = list()
+
+        for line in reader:
+            line_batch.append(line)
+            if len(line_batch) >= 30:
+                res = list(line_batch)
+                line_batch.clear()
+                yield res
+
+    tasks = list(lines())
+
+    pool = ThreadPool(processes=25)
+    pool.map(func=mktask, iterable=tasks)
+
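Note: the lines() generator above only yields full batches of 30 rows, so a trailing batch smaller than 30 is silently dropped; the same pattern appears in generate_spotify_tasks_2.py below. A minimal sketch of a batching generator that also flushes the remainder, not part of this commit:

    def batched(rows, size=30):
        # yield fixed-size batches, then flush whatever is left over
        batch = []
        for row in rows:
            batch.append(row)
            if len(batch) >= size:
                yield batch
                batch = []
        if batch:
            yield batch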
generate_spotify_tasks_2.py (new file): 60 lines
@@ -0,0 +1,60 @@
+import json
+import sqlite3
+from multiprocessing.pool import ThreadPool
+
+import sys
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 7
+
+api = TaskTrackerApi(TT_API_URL)
+
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("mm worker")
+    worker.dump_to_file()
+    worker.request_access(TT_PROJECT, True, True)
+    input("Give permission to " + worker.alias)
+
+spotids = set()
+
+with sqlite3.connect(sys.argv[1]) as conn:
+
+    cur = conn.cursor()
+    cur.execute("SELECT data from artist")
+    for row in cur.fetchall():
+        j = json.loads(row[0])
+        if j is None or "artists" not in j or "items" not in j["artists"]:
+            continue
+        for item in j["artists"]["items"]:
+            spotids.add(item["id"])
+
+
+def mktask(lines):
+    res = worker.submit_task(
+        project=TT_PROJECT,
+        recipe=json.dumps(
+            [{"spotid": line} for line in lines]
+        ),
+        unique_str=lines[0],
+        max_assign_time=60 * 5,
+    )
+    print(res.text)
+
+def ids():
+    id_batch = list()
+
+    for spotid in spotids:
+        id_batch.append(spotid)
+        if len(id_batch) >= 30:
+            res = list(id_batch)
+            id_batch.clear()
+            yield res
+
+tasks = list(ids())
+
+pool = ThreadPool(processes=25)
+pool.map(func=mktask, iterable=tasks)
+
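Note: neither script closes or joins its ThreadPool after pool.map() returns; for a short-lived script that is usually fine, but a context manager makes the cleanup explicit even when submit_task raises. A minimal sketch, not part of this commit:

    from multiprocessing.pool import ThreadPool

    with ThreadPool(processes=25) as pool:
        pool.map(func=mktask, iterable=tasks)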
@@ -23,6 +23,7 @@ wget ${REPOSITORY}/release.csv
 wget ${REPOSITORY}/tag.csv
 wget ${REPOSITORY}/tag_tag.csv
 wget ${REPOSITORY}/release_tag.csv
+wget ${REPOSITORY}/release_label.csv
 wget ${REPOSITORY}/release_release.csv
 wget ${REPOSITORY}/artist_tag.csv
 wget ${REPOSITORY}/labels.csv
@@ -46,7 +47,8 @@ wget ${REPOSITORY}/lastfm_artist_artist.csv
     --relationships:IS_RELATED_TO "tag_tag.csv"\
     --relationships "label_label.csv"\
     --relationships "release_release.csv"\
-    --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"
+    --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"\
+    --relationships:RELEASE_UNDER "release_label.csv"
 
 rm *.csv
 cd ..
@@ -15,10 +15,10 @@ def disambiguate(lfm_artist, artist_release_count, name, mbid):
 
         lfm_artist[name] = mbid
 
-        print("Replacing %s (%s) with %s (%d) for %s" %
-              (existing_mbid, artist_release_count[existing_mbid],
-               mbid, artist_release_count[mbid],
-               name))
+        # print("Replacing %s (%s) with %s (%d) for %s" %
+        #       (existing_mbid, artist_release_count[existing_mbid],
+        #        mbid, artist_release_count[mbid],
+        #        name))
     else:
         lfm_artist[name] = mbid
 
@@ -1,6 +1,7 @@
 import os
 from collections import defaultdict
 import re
+from statistics import median
 
 links = dict()
 link_types = dict()
@@ -314,14 +315,21 @@ with open("repo/release_release.csv", "w") as out:
 
 # ---
 
+tag_occurence = defaultdict(int)
+with open("in/release_group_tag") as f:
+    for line in f:
+        tag_occurence[line.split("\t")[1]] += 1
+
 with open("in/tag") as f:
     with open("repo/tag.csv", "w") as out:
-        out.write("id:ID(Tag),name\n")
+        out.write("id:ID(Tag),name, occurences\n")
 
         for line in f:
             cols = line.split("\t")
+            if tag_occurence[cols[0]] < 5:
+                continue
             tags[cols[0]] = cols
-            out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"\n")
+            out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"," + str(tag_occurence[cols[0]]) + "\n")
 
 with open("repo/release_tag.csv", "w") as out:
     out.write(":START_ID(Release),:END_ID(Tag),weight:float\n")
@@ -341,11 +349,15 @@ with open("repo/release_tag.csv", "w") as out:
             count = int(cols[2])
             if count <= 0:
                 continue
+            if cols[1] not in tags:
+                continue
             out.write(",".join((
                 release_groups[cols[0]][1],
                 cols[1],
                 str(max(min(count / max_count, 1), 0.2)),
             )) + "\n")
+            tag_occurence[cols[1]] += 1
+
 
 with open("repo/artist_tag.csv", "w") as out:
     out.write(":START_ID(Artist),:END_ID(Tag),weight:float\n")
@@ -366,6 +378,8 @@ with open("repo/artist_tag.csv", "w") as out:
             count = int(cols[2])
             if count <= 0:
                 continue
+            if cols[1] not in tags:
+                continue
 
             out.write(",".join((
                 artists[cols[0]][1],
@@ -374,39 +388,68 @@ with open("repo/artist_tag.csv", "w") as out:
             )) + "\n")
 
 with open("repo/tag_tag.csv", "w") as out:
-    out.write(":START_ID(Tag),:END_ID(Tag),weight:int\n")
+    out.write(":START_ID(Tag),:END_ID(Tag),weight:float\n")
+
+    def weights():
+        with open("in/tag_relation") as f:
+            for line in f:
+                weight = int(line.split("\t")[2])
+                if weight < 5:
+                    continue
+                yield weight
+    weight_median = median(weights()) * 3
 
-    # TODO: normalize weight so it's between [0,1]
     with open("in/tag_relation") as f:
         for line in f:
             cols = line.split("\t")
 
-            if int(cols[2]) <= 0:
+            weight = int(cols[2])
+            if weight < 5:
+                continue
+            if cols[0] not in tags or cols[1] not in tags:
                 continue
 
             out.write(",".join((
                 cols[0],
                 cols[1],
-                cols[2],
+                str(max(min(weight / weight_median, 1), 0.2)),
             )) + "\n")
 
 # -----
 
 with open("repo/labels.csv", "w") as out:
-    out.write("id:ID(Label),name,code,:LABEL\n")
+    out.write("id:ID(Label),name,sortname,code,:LABEL\n")
 
     with open("in/label") as f:
         for line in f:
             cols = line.split("\t")
             labels[cols[0]] = cols
 
+            sortname = ASCII_RE.sub("_", cols[2]).upper()
             out.write(",".join((
                 cols[1],
                 "\"" + cols[2].replace("\"", "\"\"") + "\"",
+                sortname,
                 cols[9] if cols[9] != "\\N" else "",
                 "Label" + label_types[cols[10]]
             )) + "\n")
 
+with open("repo/release_label.csv", "w") as out:
+    out.write(":START_ID(Release),:END_ID(Label)\n")
+
+    # Should I check link types here?
+    with open("in/l_label_release_group") as f:
+        for line in f:
+            cols = line.split("\t")
+            out.write(release_groups[cols[3]][1] + "," + labels[cols[2]][1] + "\n")
+
+    with open("in/l_label_release") as f:
+        for line in f:
+            cols = line.split("\t")
+            if cols[3] in release_to_release_group_map:
+                out.write(release_groups[release_to_release_group_map[cols[3]]][1] + "," + labels[cols[2]][1] + "\n")
+
 
 with open("repo/label_label.csv", "w") as out:
     out.write(":START_ID(Label),:END_ID(Label),:TYPE\n")
 
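Note: with this change the tag_tag weight column goes from a raw count (weight:int) to a normalized weight:float. weight_median is three times the median of all raw tag_relation counts of at least 5, and each weight is divided by it and clamped to the [0.2, 1] range. A small worked example with made-up counts (not data from the repo), showing how the formula behaves:

    from statistics import median

    raw_counts = [5, 6, 8, 12, 40, 90]        # hypothetical tag_relation counts
    weight_median = median(raw_counts) * 3    # (8 + 12) / 2 * 3 = 30.0

    def normalize(weight):
        # same clamping as the script: never above 1, never below 0.2
        return max(min(weight / weight_median, 1), 0.2)

    print(normalize(5))   # 0.2 (floored)
    print(normalize(12))  # 0.4
    print(normalize(90))  # 1 (capped)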
@@ -1,3 +1,4 @@
 CREATE INDEX ON :Artist(id);
 CREATE INDEX ON :Artist(sortname);
 CREATE INDEX ON :Release(id);
+CREATE INDEX ON :Label(sortname);
spotify (new submodule): 1 line
@@ -0,0 +1 @@
+Subproject commit 4ac596b2ff7659b880ac8a3fe9c58ea6527c2efc

spotify2 (new submodule): 1 line
@@ -0,0 +1 @@
+Subproject commit 0a05c69bcf7005496c2efdf5b825ffa2f443ccdf