"""Convert MusicBrainz table dumps (read from in/) into CSV files for Neo4j bulk import (written to repo/)."""
import os
import re
from collections import defaultdict
from statistics import median

# Lookup tables keyed by MusicBrainz row id
links = dict()
link_types = dict()
areas = dict()
labels = dict()
label_types = {
    "\\N": ""
}
release_groups = dict()
release_statuses = dict()
release_to_release_group_map = dict()
release_types = {
    "\\N": "",
}
artists = dict()
tags = dict()

# MusicBrainz link type name -> Neo4j relationship type.
# Entries mapped to "" are link types we do not import.
release_release_rel_map = {
    "covers and versions": "",
    "remixes and compilations": "",
    "DJ-mix": "IS_DJ_MIX_OF",
    "live performance": "IS_LIVE_PERFORMANCE_OF",
    "cover": "IS_COVER_OF",
    "remix": "IS_REMIX_OF",
    "mashes up": "IS_MASHUP_OF",
    "included in": "INCLUDED_IN",
    "single from": "IS_SINGLE_FROM"
}

artist_release_rel_map = {
    "translator": "TRANSLATED",
    "liner notes": "WROTE_LINER_NOTES",
    "lyricist": "IS_LYRICIST_FOR",
    "lacquer cut": "DID_LACQUER_CUT_FOR",
    "samples from artist": "HAS_SAMPLES_IN",
    "remixes and compilations": "",
    "composition": "COMPOSED",
    "booking": "DID_BOOKING_FOR",
    "balance": "DID_BALANCE_FOR",
    "misc": "HAS_MISC_ROLE_IN",
    "conductor": "CONDUCTED",
    "legal representation": "PROVIDED_LEGAL_REPRESENTATION_FOR",
    "design/illustration": "DID_DESIGN_FOR",
    "performing orchestra": "PERFORMED_FOR",
    "producer": "PRODUCED",
    "instrument": "PERFORMED_INSTRUMENT_FOR",
    "writer": "WROTE_LYRICS_FOR",
    "production": "DID_PRODUCTION_FOR",
    "performance": "PERFORMED_FOR",
    "composer": "IS_COMPOSER_FOR",
    "sound": "DID_SOUND_FOR",
    "remixer": "DID_REMIXING_FOR",
    "orchestrator": "IS_ORCHESTRATOR_FOR",
    "compiler": "DID_COMPILATION_FOR",
    "vocal arranger": "IS_ARRANGER_FOR",
    "arranger": "IS_ARRANGER_FOR",
    "mix-DJ": "MIXED",
    "editor": "IS_EDITOR_FOR",
    "illustration": "DID_ILLUSTRATION_FOR",
    "audio": "DID_AUDIO_FOR",
    "publishing": "IS_PUBLISHER_FOR",
    "art direction": "DID_ART_DIRECTOR_FOR",
    "design": "DID_DESIGN_FOR",
    "instrument arranger": "IS_ARRANGER_FOR",
    "chorus master": "IS_CHORUS_MASTER_FOR",
    "photography": "DID_PHOTOGRAPHY_FOR",
    "performer": "PERFORMED_IN",
    "graphic design": "DID_GRAPHIC_DESIGN_FOR",
    "booklet editor": "IS_BOOKLET_EDITOR_FOR",
    "programming": "DID_PROGRAMMING_FOR",
    "copyright": "IS_COPYRIGHT_HOLDER_OF",
    "piano technician": "IS_PIANO_TECHNICIAN_FOR",
    "phonographic copyright": "IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF",
    "mastering": "DID_MASTERING_FOR",
    "vocal": "PERFORMED_VOCALS_FOR",
    "librettist": "IS_LIBRETTIST_FOR",
    "mix": "MIXED",
    "recording": "DID_RECORDING_FOR",
    "concertmaster": "IS_CONCERTMASTER_FOR",
    "engineer": "IS_ENGINEER_FOR",

    # release_group
    "tribute": "IS_TRIBUTE_TO",
    "dedicated to": "IS_DEDICATED_TO",
    "creative direction": "",
    "artists and repertoire": ""
}

artist_artist_rel_map = {
    "teacher": "TEACHER_OF",
    "composer-in-residence": "HAS_COMPOSER-IN-RESIDENCE_STATUS_IN",
    "member of band": "IS_MEMBER_OF",
    "voice actor": "IS_VOICE_ACTOR_OF",
    "tribute": "IS_TRIBUTE_TO",
    "supporting musician": "IS_SUPPORTING_MUSICIAN_OF",
    "instrumental supporting musician": "IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF",
    "personal relationship": "HAS_PERSONAL_RELATIONSHIP_WITH",
    "musical relationships": "HAS_MUSICAL_RELATIONSHIP_WITH",
    "collaboration": "HAS_COLLABORATED_WITH",
    "married": "IS_MARRIED_WITH",
    "sibling": "IS_SIBLING_OF",
    "parent": "IS_PARENT_OF",
    "is person": "IS",
    "conductor position": "IS_CONDUCTOR_OF",
    "vocal supporting musician": "DOES_VOCAL_SUPPORT_FOR",
    "artistic director": "IS_ARTIST_DIRECTOR_OF",
    "subgroup": "IS_SUBGROUP_OF",
    "founder": "IS_FOUNDER_OF",
    "involved with": "IS_INVOLVED_WITH",
    "named after": "IS_NAMED_AFTER",
}

label_label_rel_map = {
    "label rename": "WAS_RENAMED_TO",
    "imprint": "DOES_IMPRINT_FOR",
    "label distribution": "DOES_DISTRIBUTION_FOR",
"business association": "HAS_BUSINESS_ASSOCIATION_TO", "label ownership": "OWNS", "label reissue": "DOES_REISSUING_FOR" } if not os.path.exists("repo"): os.mkdir("repo") else: os.system("rm repo/*") if not os.path.exists("tmp"): os.mkdir("tmp") else: os.system("rm tmp/*") with open("in/link", "r") as f: for line in f: cols = line.split("\t") links[cols[0]] = cols with open("in/release_status", "r") as f: for line in f: cols = line.split("\t") release_statuses[cols[0]] = cols with open("in/link_type", "r") as f: for line in f: cols = line.split("\t") link_types[cols[0]] = cols with open("in/area", "r") as f: for line in f: cols = line.split("\t") areas[cols[0]] = cols with open("in/label_type") as f: for line in f: cols = line.split("\t") label_types[cols[0]] = ";" + cols[1].replace(" ", "") if cols[3] != "\\N" and cols[2] in label_types: label_types[cols[0]] += label_types[cols[2]].replace(" ", "") with open("in/artist") as f: for line in f: cols = line.split("\t") artists[cols[0]] = cols with open("repo/area_area.csv", "w") as out: out.write(":START_ID(Area),:END_ID(Area)\n") with open("in/l_area_area", "r") as f: for line in f: cols = line.split("\t") out.write(",".join((areas[cols[3]][1], areas[cols[2]][1] )) + "\n") with open("repo/area.csv", "w") as out: out.write("id:ID(Area),name\n") for k, area in areas.items(): out.write(",".join((area[1], '"' + area[2] + '"' )) + "\n") # ------ out_artist = open("repo/artist.csv", "w") out_artist_area = open("repo/artist_area.csv", "w") out_artist.write("id:ID(Artist),name,sortname,year:int,comment,:LABEL\n") out_artist_area.write(":START_ID(Artist),:END_ID(Area)\n") ASCII_RE = re.compile(r"[^a-zA-Z0-9.\-!?& ]") ALPHANUM_RE = re.compile(r"[^\w.\-!?& ]") for _, artist in artists.items(): sortname = ASCII_RE.sub("_", artist[2]).upper() if sortname.replace("_", "").strip() == "": sortname = ALPHANUM_RE.sub("_", artist[3]).upper() out_artist.write(",".join(( artist[1], '"' + artist[2].replace("\"", "\"\"") + '"', sortname, artist[4] if artist[4] != "\\N" else "0", ('"' + artist[13].replace("\"", "\"\"") + '"') if artist[13] != "\\N" else "", "Artist" + (";Group\n" if artist[10] == "2" else "\n") ))) if artist[11] != "\\N": out_artist_area.write(artist[1] + "," + areas[artist[11]][1] + "\n") out_artist.close() out_artist_area.close() with open("repo/artist_artist.csv", "w") as out: out.write(":START_ID(Artist),:END_ID(Artist),:TYPE\n") with open("in/l_artist_artist", "r") as f: for line in f: cols = line.split("\t") out.write(",".join(( artists[cols[2]][1], artists[cols[3]][1], artist_artist_rel_map[link_types[links[cols[1]][1]][6]] + "\n" ))) # -------- with open("in/release_group_primary_type") as f: for line in f: cols = line.split("\t") release_types[cols[0]] = ";" + cols[1] release_group_year = dict() with open("in/release_group_meta") as f: for line in f: cols = line.split("\t") release_group_year[cols[0]] = cols[2] if cols[2] != "\\N" else "0" with open("repo/release.csv", "w") as out: out.write("id:ID(Release),name,year:int,:LABEL\n") with open("in/release_group") as f: for line in f: cols = line.split("\t") out.write(",".join(( cols[1], '"' + cols[2].replace("\"", "\"\"") + '"', release_group_year[cols[0]], "Release" + release_types[cols[4]], )) + "\n") release_groups[cols[0]] = cols with open("in/release") as f: for line in f: cols = line.split("\t") if cols[5] != '\\N' and release_statuses[cols[5]][1] == "Official": release_to_release_group_map[cols[0]] = cols[4] credit_names = defaultdict(list) with open("in/artist_credit_name") as f: for 
    for line in f:
        cols = line.split("\t")
        credit_names[cols[0]].append(artists[cols[2]][1])

with open("tmp/tmp_artist_release.csv", "w") as out:
    out.write(":START_ID(Artist),:END_ID(Release),:TYPE\n")

    # Is this part really necessary?
    with open("in/l_artist_release") as f:
        for line in f:
            cols = line.split("\t")
            if cols[3] in release_to_release_group_map:
                rel_type = artist_release_rel_map[link_types[links[cols[1]][1]][6]]
                if rel_type == "":
                    # Link types deliberately mapped to "" are not imported
                    continue
                out.write(",".join((
                    artists[cols[2]][1],
                    release_groups[release_to_release_group_map[cols[3]]][1],
                    rel_type
                )) + "\n")

    # Artist credits
    with open("in/release") as f:
        for line in f:
            cols = line.split("\t")
            if cols[0] in release_to_release_group_map:
                for credit in credit_names[cols[3]]:
                    out.write(",".join((
                        credit,
                        release_groups[release_to_release_group_map[cols[0]]][1],
                        "CREDITED_FOR"
                    )) + "\n")

# Remove dupes
os.system("(head -n 1 tmp/tmp_artist_release.csv && tail -n +2 tmp/tmp_artist_release.csv"
          " | sort) | uniq > repo/artist_release.csv && rm tmp/tmp_artist_release.csv")

with open("repo/release_release.csv", "w") as out:
    out.write(":START_ID(Release),:END_ID(Release),:TYPE\n")
    with open("in/l_release_group_release_group") as f:
        for line in f:
            cols = line.split("\t")
            rel_type = release_release_rel_map[link_types[links[cols[1]][1]][6]]
            if rel_type == "":
                continue
            out.write(",".join((
                release_groups[cols[2]][1],
                release_groups[cols[3]][1],
                rel_type
            )) + "\n")

# --- Tags ---
tag_occurence = defaultdict(int)
with open("in/release_group_tag") as f:
    for line in f:
        tag_occurence[line.split("\t")[1]] += 1

# Only keep tags attached to at least 5 release groups
with open("in/tag") as f:
    with open("repo/tag.csv", "w") as out:
        out.write("id:ID(Tag),name,occurrences\n")
        for line in f:
            cols = line.split("\t")
            if tag_occurence[cols[0]] < 5:
                continue
            tags[cols[0]] = cols
            out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"," + str(tag_occurence[cols[0]]) + "\n")

with open("repo/release_tag.csv", "w") as out:
    out.write(":START_ID(Release),:END_ID(Tag),weight:float\n")

    # get max count
    max_count = 0
    with open("in/release_group_tag") as f:
        for line in f:
            cols = line.split("\t")
            max_count = max(max_count, int(cols[2]))
    max_count = max_count / 4

    # Weight is linear, clamped to [0.2, 1]
    with open("in/release_group_tag") as f:
        for line in f:
            cols = line.split("\t")
            count = int(cols[2])
            if count <= 0:
                continue
            if cols[1] not in tags:
                continue
            out.write(",".join((
                release_groups[cols[0]][1],
                cols[1],
                str(max(min(count / max_count, 1), 0.2)),
            )) + "\n")
            tag_occurence[cols[1]] += 1

with open("repo/artist_tag.csv", "w") as out:
    out.write(":START_ID(Artist),:END_ID(Tag),weight:float\n")

    # get max count
    max_count = 0
    with open("in/artist_tag") as f:
        for line in f:
            cols = line.split("\t")
            max_count = max(max_count, int(cols[2]))
    max_count = max_count / 4

    # Weight is linear, clamped to [0.2, 1]
    with open("in/artist_tag") as f:
        for line in f:
            cols = line.split("\t")
            count = int(cols[2])
            if count <= 0:
                continue
            if cols[1] not in tags:
                continue
            out.write(",".join((
                artists[cols[0]][1],
                cols[1],
                str(max(min(count / max_count, 1), 0.2)),
            )) + "\n")

with open("repo/tag_tag.csv", "w") as out:
    out.write(":START_ID(Tag),:END_ID(Tag),weight:float\n")

    def weights():
        with open("in/tag_relation") as f:
            for line in f:
                weight = int(line.split("\t")[2])
                if weight < 5:
                    continue
                yield weight

    weight_median = median(weights()) * 3

    with open("in/tag_relation") as f:
        for line in f:
            cols = line.split("\t")
            weight = int(cols[2])
            if weight < 5:
                continue
            if cols[0] not in tags or cols[1] not in tags:
                continue
            out.write(",".join((
                cols[0],
                cols[1],
                str(max(min(weight / weight_median, 1), 0.2)),
            )) + "\n")

# ----- Labels -----
with open("repo/labels.csv", "w") as out:
out.write("id:ID(Label),name,sortname,code,:LABEL\n") with open("in/label") as f: for line in f: cols = line.split("\t") labels[cols[0]] = cols sortname = ASCII_RE.sub("_", cols[2]).upper() out.write(",".join(( cols[1], "\"" + cols[2].replace("\"", "\"\"") + "\"", sortname, cols[9] if cols[9] != "\\N" else "", "Label" + label_types[cols[10]] )) + "\n") with open("repo/release_label.csv", "w") as out: out.write(":START_ID(Release),:END_ID(Label)\n") # Should I check link types here? with open("in/l_label_release_group") as f: for line in f: cols = line.split("\t") out.write(release_groups[cols[3]][1] + "," + labels[cols[2]][1] + "\n") with open("in/l_label_release") as f: for line in f: cols = line.split("\t") if cols[3] in release_to_release_group_map: out.write(release_groups[release_to_release_group_map[cols[3]]][1] + "," + labels[cols[2]][1] + "\n") with open("repo/label_label.csv", "w") as out: out.write(":START_ID(Label),:END_ID(Label),:TYPE\n") with open("in/l_label_label") as f: for line in f: cols = line.split("\t") out.write(",".join(( labels[cols[2]][1], labels[cols[3]][1], label_label_rel_map[link_types[links[cols[1]][1]][6]] )) + "\n") # ---