music-graph-scripts/process_mb_dump.py

467 lines
14 KiB
Python

import os
from collections import defaultdict
import re
from statistics import median
# Lookup tables populated from the dump files further down.  Unless noted
# otherwise, each maps a row's first column to the full list of its columns.
links = {}
link_types = {}
areas = {}
labels = {}
# Label type id -> ";"-prefixed label suffix; the "\N" placeholder adds none.
label_types = {"\\N": ""}
release_groups = {}
release_statuses = {}
# Release id -> release group id, kept only for releases whose status is "Official".
release_to_release_group_map = {}
# Primary type id -> ";"-prefixed label suffix; the "\N" placeholder adds none.
release_types = {"\\N": ""}
artists = {}
tags = {}
# Link type name (column 6 of an in/link_type row) -> relationship :TYPE
# written to repo/release_release.csv.
release_release_rel_map = {
    # These two names are mapped to an empty type.
    "covers and versions": "",
    "remixes and compilations": "",
    # Named relationships.
    "DJ-mix": "IS_DJ_MIX_OF",
    "live performance": "IS_LIVE_PERFORMANCE_OF",
    "cover": "IS_COVER_OF",
    "remix": "IS_REMIX_OF",
    "mashes up": "IS_MASHUP_OF",
    "included in": "INCLUDED_IN",
    "single from": "IS_SINGLE_FROM",
}
# Link type name (column 6 of an in/link_type row) -> relationship :TYPE
# written for artist -> release group edges.
#
# "arranger" used to map to the misspelled "IS_ARRENGER_FOR" while
# "vocal arranger" and "instrument arranger" mapped to "IS_ARRANGER_FOR",
# splitting one relation into two types; all three now agree.
#
# NOTE(review): "DID_PROGRAMING_FOR", "IS_PIANO_TECNICIAN_FOR" and
# "PERFORED_VOCALS_FOR" are misspelled as well but self-consistent; they are
# left untouched because consumers of the exported data may already rely on
# them -- confirm before renaming.
artist_release_rel_map = {
    "translator": "TRANSLATED",
    "liner notes": "WROTE_LINER_NOTES",
    "lyricist": "IS_LYRICIST_FOR",
    "lacquer cut": "DID_LACQUER_CUT_FOR",
    "samples from artist": "HAS_SAMPLES_IN",
    "remixes and compilations": "",
    "composition": "COMPOSED",
    "booking": "DID_BOOKING_FOR",
    "balance": "DID_BALANCE_FOR",
    "misc": "HAS_MISC_ROLE_IN",
    "conductor": "CONDUCTED",
    "legal representation": "PROVIDED_LEGAL_REPRESENTATION_FOR",
    "design/illustration": "DID_DESIGN_FOR",
    "performing orchestra": "PERFORMED_FOR",
    "producer": "PRODUCED",
    "instrument": "PERFORMED_INSTRUMENT_FOR",
    "writer": "WROTE_LYRICS_FOR",
    "production": "DID_PRODUCTION_FOR",
    "performance": "PERFORMED_FOR",
    "composer": "IS_COMPOSER_FOR",
    "sound": "DID_SOUND_FOR",
    "remixer": "DID_REMIXING_FOR",
    "orchestrator": "IS_ORCHESTRATOR_FOR",
    "compiler": "DID_COMPILATION_FOR",
    "vocal arranger": "IS_ARRANGER_FOR",
    "arranger": "IS_ARRANGER_FOR",
    "mix-DJ": "MIXED",
    "editor": "IS_EDITOR_FOR",
    "illustration": "DID_ILLUSTRATION_FOR",
    "audio": "DID_AUDIO_FOR",
    "publishing": "IS_PUBLISHER_FOR",
    "art direction": "DID_ART_DIRECTOR_FOR",
    "design": "DID_DESIGN_FOR",
    "instrument arranger": "IS_ARRANGER_FOR",
    "chorus master": "IS_CHORUS_MASTER_FOR",
    "photography": "DID_PHOTOGRAPHY_FOR",
    "performer": "PERFORMED_IN",
    "graphic design": "DID_GRAPHIC_DESIGN_FOR",
    "booklet editor": "IS_BOOKLET_EDITOR_FOR",
    "programming": "DID_PROGRAMING_FOR",
    "copyright": "IS_COPYRIGHT_HOLDER_OF",
    "piano technician": "IS_PIANO_TECNICIAN_FOR",
    "phonographic copyright": "IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF",
    "mastering": "DID_MASTERING_FOR",
    "vocal": "PERFORED_VOCALS_FOR",
    "librettist": "IS_LIBRETTIST_FOR",
    "mix": "MIXED",
    "recording": "DID_RECORDING_FOR",
    "concertmaster": "IS_CONCERTMASTER_FOR",
    "engineer": "IS_ENGINEER_FOR",
    # release_group
    "tribute": "IS_TRIBUTE_TO",
    "dedicated to": "IS_DEDICATED_TO",
    "creative direction": "",
    "artists and repertoire": "",
}
# Link type name (column 6 of an in/link_type row) -> relationship :TYPE
# written to repo/artist_artist.csv.
artist_artist_rel_map = {
    "teacher": "TEACHER_OF",
    "composer-in-residence": "HAS_COMPOSER-IN-RESIDENCE_STATUS_IN",
    "member of band": "IS_MEMBER_OF",
    "voice actor": "IS_VOICE_ACTOR_OF",
    "tribute": "IS_TRIBUTE_TO",
    "supporting musician": "IS_SUPPORTING_MUSICIAN_OF",
    "instrumental supporting musician":
        "IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF",
    "personal relationship": "HAS_PERSONAL_RELATIONSHIP_WITH",
    "musical relationships": "HAS_MUSICAL_RELATIONSHIP_WITH",
    "collaboration": "HAS_COLLABORATED_WITH",
    "married": "IS_MARRIED_WITH",
    "sibling": "IS_SIBLING_OF",
    "parent": "IS_PARENT_OF",
    "is person": "IS",
    "conductor position": "IS_CONDUCTOR_OF",
    "vocal supporting musician": "DOES_VOCAL_SUPPORT_FOR",
    "artistic director": "IS_ARTIST_DIRECTOR_OF",
    "subgroup": "IS_SUBGROUP_OF",
    "founder": "IS_FOUNDER_OF",
    "involved with": "IS_INVOLVED_WITH",
    "named after": "IS_NAMED_AFTER",
}
# Link type name (column 6 of an in/link_type row) -> relationship :TYPE
# written to repo/label_label.csv.
label_label_rel_map = {
    "label rename": "WAS_RENAMED_TO",
    "imprint": "DOES_IMPRINT_FOR",
    "label distribution": "DOES_DISTRIBUTION_FOR",
    "business association": "HAS_BUSINESS_ASSOCIATION_TO",
    "label ownership": "OWNS",
    "label reissue": "DOES_REISSUING_FOR",
}
def reset_dir(path):
    """Create ``path`` if it is missing, otherwise delete the files inside it.

    Replaces the former ``os.system("rm <path>/*")`` call: no shell is
    needed and an already-empty directory no longer produces an ``rm``
    error.  To keep the same scope as the ``*`` glob, entries whose name
    starts with "." and sub-directories are left untouched.
    """
    if not os.path.exists(path):
        os.mkdir(path)
        return
    for name in os.listdir(path):
        if name.startswith("."):
            continue
        entry = os.path.join(path, name)
        # A symlink to a directory is still unlinked, like ``rm`` would do.
        if os.path.isdir(entry) and not os.path.islink(entry):
            continue
        os.remove(entry)


# Start every run with an existing, emptied output and scratch directory.
reset_dir("repo")
reset_dir("tmp")
# Load the plain lookup tables: every row is split on tabs and stored whole
# under its first column.  The last column keeps its trailing newline.
for dump_path, table in (
    ("in/link", links),
    ("in/release_status", release_statuses),
    ("in/link_type", link_types),
    ("in/area", areas),
):
    with open(dump_path, "r") as dump:
        for row in dump:
            fields = row.split("\t")
            table[fields[0]] = fields
# Build the ";Name" label suffix of every label type (spaces removed).
with open("in/label_type") as type_dump:
    for row in type_dump:
        fields = row.split("\t")
        type_id = fields[0]
        label_types[type_id] = ";" + fields[1].replace(" ", "")
        # NOTE(review): the null check is on column 3 while the lookup uses
        # column 2 -- possibly the wrong column; behaviour kept as-is.
        column3_set = fields[3] != "\\N"
        parent_id = fields[2]
        if column3_set and parent_id in label_types:
            # Append the suffix already registered under column 2's id.
            label_types[type_id] += label_types[parent_id].replace(" ", "")
# Load every artist row, keyed by its first column.
with open("in/artist") as artist_dump:
    artists.update(
        (fields[0], fields)
        for fields in (row.split("\t") for row in artist_dump)
    )
# Area -> area edges, one per row of in/l_area_area.
with open("repo/area_area.csv", "w") as out:
    out.write(":START_ID(Area),:END_ID(Area)\n")
    with open("in/l_area_area", "r") as f:
        for line in f:
            cols = line.split("\t")
            # Column 3 supplies the start node, column 2 the end node.
            out.write(areas[cols[3]][1] + "," + areas[cols[2]][1] + "\n")
# Area nodes.
with open("repo/area.csv", "w") as out:
    out.write("id:ID(Area),name\n")
    for area in areas.values():
        # Double any embedded quote so a name containing '"' remains a single
        # CSV field, as is already done for artist, release, tag and label
        # names elsewhere in this script.
        out.write(area[1] + ',"' + area[2].replace('"', '""') + '"\n')
# ------
# Artist nodes and artist -> area edges.
# Sort keys: every character outside the allowed set is replaced by "_".
ASCII_RE = re.compile(r"[^a-zA-Z0-9.\-!?& ]")
ALPHANUM_RE = re.compile(r"[^\w.\-!?& ]")
# Both outputs are managed by ``with`` so they are closed (and flushed) even
# if a row raises, instead of relying on explicit close() calls at the end.
with open("repo/artist.csv", "w") as out_artist:
    with open("repo/artist_area.csv", "w") as out_artist_area:
        out_artist.write("id:ID(Artist),name,sortname,year:int,comment,:LABEL\n")
        out_artist_area.write(":START_ID(Artist),:END_ID(Area)\n")
        for artist in artists.values():
            sortname = ASCII_RE.sub("_", artist[2]).upper()
            if sortname.replace("_", "").strip() == "":
                # Nothing usable survived the ASCII filter: fall back to
                # column 3, filtered with the wider \w-based pattern.
                sortname = ALPHANUM_RE.sub("_", artist[3]).upper()
            out_artist.write(",".join((
                artist[1],
                '"' + artist[2].replace("\"", "\"\"") + '"',
                sortname,
                # "\N" marks a missing value; year becomes 0, comment is empty.
                artist[4] if artist[4] != "\\N" else "0",
                ('"' + artist[13].replace("\"", "\"\"") + '"') if artist[13] != "\\N" else "",
                # Column 10 equal to "2" adds the extra "Group" label.
                "Artist" + (";Group\n" if artist[10] == "2" else "\n")
            )))
            if artist[11] != "\\N":
                out_artist_area.write(artist[1] + "," + areas[artist[11]][1] + "\n")
# Artist -> artist relationships, typed through artist_artist_rel_map.
with open("repo/artist_artist.csv", "w") as edge_file:
    edge_file.write(":START_ID(Artist),:END_ID(Artist),:TYPE\n")
    with open("in/l_artist_artist", "r") as link_dump:
        for row in link_dump:
            fields = row.split("\t")
            source_id = artists[fields[2]][1]
            target_id = artists[fields[3]][1]
            # fields[1] -> links row -> link_types row; column 6 of the
            # latter is the name looked up in the relation map.
            link_type_row = link_types[links[fields[1]][1]]
            rel_type = artist_artist_rel_map[link_type_row[6]]
            edge_file.write("%s,%s,%s\n" % (source_id, target_id, rel_type))
# --------
# Release group nodes, exported under the "Release" label.
# Primary type id -> ";Name" label suffix.
with open("in/release_group_primary_type") as type_dump:
    for row in type_dump:
        fields = row.split("\t")
        release_types[fields[0]] = ";" + fields[1]
# Release group id -> year string taken from column 2 ("0" when it is "\N").
release_group_year = {}
with open("in/release_group_meta") as meta_dump:
    for row in meta_dump:
        fields = row.split("\t")
        year = fields[2]
        if year == "\\N":
            year = "0"
        release_group_year[fields[0]] = year
with open("repo/release.csv", "w") as node_file:
    node_file.write("id:ID(Release),name,year:int,:LABEL\n")
    with open("in/release_group") as group_dump:
        for row in group_dump:
            fields = row.split("\t")
            group_id = fields[0]
            quoted_name = '"' + fields[2].replace('"', '""') + '"'
            record = [
                fields[1],
                quoted_name,
                release_group_year[group_id],
                "Release" + release_types[fields[4]],
            ]
            node_file.write(",".join(record) + "\n")
            # Keep the whole row for the relationship files written later.
            release_groups[group_id] = fields
# Remember the release group (column 4) of every release whose status row
# is named "Official"; releases with a "\N" status are skipped.
with open("in/release") as release_dump:
    for row in release_dump:
        fields = row.split("\t")
        status_id = fields[5]
        if status_id == "\\N":
            continue
        if release_statuses[status_id][1] != "Official":
            continue
        release_to_release_group_map[fields[0]] = fields[4]
# First column of in/artist_credit_name -> list of column-1 values of the
# artist rows referenced by its third column.
credit_names = defaultdict(list)
with open("in/artist_credit_name") as credit_dump:
    for row in credit_dump:
        fields = row.split("\t")
        credit_names[fields[0]].append(artists[fields[2]][1])
# Artist -> release group relationships.  Rows are first collected in a
# scratch file because both passes below can emit the same line repeatedly.
with open("tmp/tmp_artist_release.csv", "w") as edge_file:
    edge_file.write(":START_ID(Artist),:END_ID(Release),:TYPE\n")
    # Is this part really necessary?
    with open("in/l_artist_release") as link_dump:
        for row in link_dump:
            fields = row.split("\t")
            release_id = fields[3]
            if release_id not in release_to_release_group_map:
                continue
            record = (
                artists[fields[2]][1],
                release_groups[release_to_release_group_map[release_id]][1],
                artist_release_rel_map[link_types[links[fields[1]][1]][6]],
            )
            edge_file.write(",".join(record) + "\n")
    # Artist credits
    with open("in/release") as release_dump:
        for row in release_dump:
            fields = row.split("\t")
            if fields[0] not in release_to_release_group_map:
                continue
            group_id = release_to_release_group_map[fields[0]]
            for credit in credit_names[fields[3]]:
                edge_file.write("%s,%s,%s\n" % (
                    credit,
                    release_groups[group_id][1],
                    "CREDITED_FOR",
                ))
# Remove dupes
# The shell keeps the header line first, sorts and de-duplicates the rest
# into repo/artist_release.csv, then deletes the scratch file.  It runs after
# the ``with`` block so the scratch file is closed before being read.
dedupe_command = (
    "(head -n 1 tmp/tmp_artist_release.csv && tail -n +2 tmp/tmp_artist_release.csv"
    " | sort) | uniq > repo/artist_release.csv && rm tmp/tmp_artist_release.csv"
)
os.system(dedupe_command)
# Release group -> release group relationships, typed through
# release_release_rel_map.
with open("repo/release_release.csv", "w") as edge_file:
    edge_file.write(":START_ID(Release),:END_ID(Release),:TYPE\n")
    with open("in/l_release_group_release_group") as link_dump:
        for row in link_dump:
            fields = row.split("\t")
            source_id = release_groups[fields[2]][1]
            target_id = release_groups[fields[3]][1]
            link_type_row = link_types[links[fields[1]][1]]
            rel_type = release_release_rel_map[link_type_row[6]]
            edge_file.write("%s,%s,%s\n" % (source_id, target_id, rel_type))
# ---
# Number of in/release_group_tag rows that reference each tag id (column 1).
tag_occurence = defaultdict(int)
with open("in/release_group_tag") as usage_dump:
    for row in usage_dump:
        tag_id = row.split("\t")[1]
        tag_occurence[tag_id] += 1
# Tag nodes: only tags referenced by at least 5 rows are kept and exported.
with open("in/tag") as tag_dump:
    with open("repo/tag.csv", "w") as node_file:
        # NOTE(review): the header has a space before "occurences"; kept
        # byte-for-byte since importers may depend on the exact column name.
        node_file.write("id:ID(Tag),name, occurences\n")
        for row in tag_dump:
            fields = row.split("\t")
            tag_id = fields[0]
            uses = tag_occurence[tag_id]
            if uses >= 5:
                tags[tag_id] = fields
                quoted_name = '"' + fields[1].replace('"', '""') + '"'
                node_file.write(",".join((tag_id, quoted_name, str(uses))) + "\n")
# Release group -> tag edges with a weight between 0.2 and 1.
with open("repo/release_tag.csv", "w") as edge_file:
    edge_file.write(":START_ID(Release),:END_ID(Tag),weight:float\n")
    # get max count
    highest = 0
    with open("in/release_group_tag") as tag_dump:
        for row in tag_dump:
            seen = int(row.split("\t")[2])
            if seen > highest:
                highest = seen
    # A quarter of the largest count already gives the full weight of 1.
    max_count = highest / 4
    # weight is linear
    with open("in/release_group_tag") as tag_dump:
        for row in tag_dump:
            fields = row.split("\t")
            count = int(fields[2])
            tag_id = fields[1]
            # Skip non-positive counts and tags that were not exported.
            if count <= 0 or tag_id not in tags:
                continue
            group_id = release_groups[fields[0]][1]
            weight_text = str(max(min(count / max_count, 1), 0.2))
            edge_file.write("%s,%s,%s\n" % (group_id, tag_id, weight_text))
            # Written rows are counted again into tag_occurence.
            tag_occurence[tag_id] += 1
# Artist -> tag edges with a weight between 0.2 and 1.
with open("repo/artist_tag.csv", "w") as edge_file:
    edge_file.write(":START_ID(Artist),:END_ID(Tag),weight:float\n")
    # get max count
    highest = 0
    with open("in/artist_tag") as tag_dump:
        for row in tag_dump:
            seen = int(row.split("\t")[2])
            if seen > highest:
                highest = seen
    # A quarter of the largest count already gives the full weight of 1.
    max_count = highest / 4
    # Weight is linear
    with open("in/artist_tag") as tag_dump:
        for row in tag_dump:
            fields = row.split("\t")
            count = int(fields[2])
            tag_id = fields[1]
            # Skip non-positive counts and tags that were not exported.
            if count <= 0 or tag_id not in tags:
                continue
            artist_id = artists[fields[0]][1]
            weight_text = str(max(min(count / max_count, 1), 0.2))
            edge_file.write("%s,%s,%s\n" % (artist_id, tag_id, weight_text))
# Tag -> tag edges; weights are scaled against three times the median
# relation weight and clamped to the range 0.2 .. 1.
with open("repo/tag_tag.csv", "w") as edge_file:
    edge_file.write(":START_ID(Tag),:END_ID(Tag),weight:float\n")

    def weights():
        """Yield each in/tag_relation weight (column 2) that is at least 5."""
        with open("in/tag_relation") as relation_dump:
            for row in relation_dump:
                value = int(row.split("\t")[2])
                if value >= 5:
                    yield value

    weight_median = median(weights()) * 3
    with open("in/tag_relation") as relation_dump:
        for row in relation_dump:
            fields = row.split("\t")
            weight = int(fields[2])
            if weight < 5:
                continue
            # Both ends of the relation must be exported tags.
            if not (fields[0] in tags and fields[1] in tags):
                continue
            weight_text = str(max(min(weight / weight_median, 1), 0.2))
            edge_file.write("%s,%s,%s\n" % (fields[0], fields[1], weight_text))
# -----
# Label nodes; every row is also remembered in ``labels`` for the edge files.
with open("repo/labels.csv", "w") as node_file:
    node_file.write("id:ID(Label),name,sortname,code,:LABEL\n")
    with open("in/label") as label_dump:
        for row in label_dump:
            fields = row.split("\t")
            labels[fields[0]] = fields
            name = fields[2]
            # "\N" in column 9 is exported as an empty code.
            code = fields[9]
            if code == "\\N":
                code = ""
            record = [
                fields[1],
                '"' + name.replace('"', '""') + '"',
                ASCII_RE.sub("_", name).upper(),
                code,
                "Label" + label_types[fields[10]],
            ]
            node_file.write(",".join(record) + "\n")
# Release group -> label edges, from links to release groups and from links
# to releases that have a mapped release group.
with open("repo/release_label.csv", "w") as edge_file:
    edge_file.write(":START_ID(Release),:END_ID(Label)\n")
    # Should I check link types here?
    with open("in/l_label_release_group") as link_dump:
        for row in link_dump:
            fields = row.split("\t")
            edge_file.write("%s,%s\n" % (
                release_groups[fields[3]][1],
                labels[fields[2]][1],
            ))
    with open("in/l_label_release") as link_dump:
        for row in link_dump:
            fields = row.split("\t")
            release_id = fields[3]
            if release_id not in release_to_release_group_map:
                continue
            edge_file.write("%s,%s\n" % (
                release_groups[release_to_release_group_map[release_id]][1],
                labels[fields[2]][1],
            ))
# Label -> label relationships, typed through label_label_rel_map.
with open("repo/label_label.csv", "w") as edge_file:
    edge_file.write(":START_ID(Label),:END_ID(Label),:TYPE\n")
    with open("in/l_label_label") as link_dump:
        for row in link_dump:
            fields = row.split("\t")
            source_id = labels[fields[2]][1]
            target_id = labels[fields[3]][1]
            link_type_row = link_types[links[fields[1]][1]]
            rel_type = label_label_rel_map[link_type_row[6]]
            edge_file.write("%s,%s,%s\n" % (source_id, target_id, rel_type))
# ---