diff --git a/.gitignore b/.gitignore index 894a44c..a8c339e 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,11 @@ venv.bak/ # mypy .mypy_cache/ + +.idea/ +in/ +repo/ +tmp/ +workspace/ +worker.json +*.db diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..11d1bc0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,9 @@ +[submodule "task_tracker_drone"] + path = task_tracker_drone + url = https://github.com/simon987/task_tracker_drone/ +[submodule "last.fm"] + path = last.fm + url = https://git.simon987.net/drone/last.fm +[submodule "caa"] + path = caa + url = https://git.simon987.net/drone/caa.git diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..28a804d --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..3e8c63a --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/music-graph-scripts.iml b/.idea/music-graph-scripts.iml new file mode 100644 index 0000000..d6ebd48 --- /dev/null +++ b/.idea/music-graph-scripts.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/caa b/caa new file mode 160000 index 0000000..910f4a0 --- /dev/null +++ b/caa @@ -0,0 +1 @@ +Subproject commit 910f4a0bceadac37ac28fa59e7648f241c931fe6 diff --git a/download_mb_dump.sh b/download_mb_dump.sh new file mode 100755 index 0000000..ff3bf47 --- /dev/null +++ b/download_mb_dump.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +latest=$(curl http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/LATEST) + 
+mkdir in 2> /dev/null
+cd in
+
+wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump.tar.bz2"
+wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump-derived.tar.bz2"
+
+tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \
+mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \
+mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \
+mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status
+tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \
+mbdump/release_group_meta
+
+mv mbdump/* .
+rm -r mbdump
+cd ..
\ No newline at end of file
diff --git a/extract_covers.py b/extract_covers.py
new file mode 100644
index 0000000..e50ea6c
--- /dev/null
+++ b/extract_covers.py
@@ -0,0 +1,26 @@
+import sqlite3
+
+import sys
+
+with sqlite3.connect(sys.argv[1]) as conn:
+
+    cursor = conn.cursor()
+    cursor.execute("SELECT id from covers")
+
+    def rows():
+        buf = list()
+        for row in cursor.fetchall():
+            buf.append(row[0])
+            if len(buf) > 30:
+                yield buf
+                buf.clear()
+        if buf:
+            yield buf
+
+    for batch in rows():
+        cursor.execute("SELECT cover from covers where id in (%s)" % (",".join(("'" + b + "'") for b in batch)))
+        covers = cursor.fetchall()
+        for i, cover in enumerate(covers):
+            with open("./tmpcovers/" + batch[i] + ".jpg", "wb") as out:
+                out.write(cover[0])
+            print(batch[i])
diff --git a/generate_caa_tasks.py b/generate_caa_tasks.py
new file mode 100644
index 0000000..cb20260
--- /dev/null
+++ b/generate_caa_tasks.py
@@ -0,0 +1,56 @@
+import json
+from multiprocessing.pool import ThreadPool
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 5
+
+
+done = set()
+# with sqlite3.connect(sys.argv[1]) as conn: +# cur = conn.cursor() +# cur.execute("SELECT id FROM covers") +# for mbid in cur.fetchall(): +# done.add(mbid[0]) + +api = TaskTrackerApi(TT_API_URL) + +worker = Worker.from_file(api) +if not worker: + worker = api.make_worker("caa scraper") + worker.dump_to_file() +worker.request_access(TT_PROJECT, True, True) +input("Give permission to " + worker.alias) + + +def mktask(mbids): + res = worker.submit_task( + project=TT_PROJECT, + recipe=json.dumps(mbids), + hash64=hash(mbids[0]), + max_assign_time=60 * 30, + priority=1, + unique_str=None, + verification_count=None, + max_retries=5, + ) + print(res.text) + + +def lines(): + with open("in/release") as f: + buf = list() + + for line in f: + cols = line.split("\t") + + buf.append(cols[1]) + if len(buf) == 75: + a = list(buf) + buf.clear() + yield a + + +pool = ThreadPool(processes=20) +pool.map(func=mktask, iterable=lines()) diff --git a/generate_lastfm_tasks.py b/generate_lastfm_tasks.py new file mode 100644 index 0000000..1d8051c --- /dev/null +++ b/generate_lastfm_tasks.py @@ -0,0 +1,48 @@ +import csv +import json +from multiprocessing.pool import ThreadPool + +from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker + +TT_API_URL = "https://tt.simon987.net/api" +TT_PROJECT = 1 + +api = TaskTrackerApi(TT_API_URL) + +worker = Worker.from_file(api) +if not worker: + worker = api.make_worker("last.fm scraper") + worker.dump_to_file() +worker.request_access(TT_PROJECT, True, True) +input("Give permission to " + worker.alias) + +with open("repo/artist.csv") as f: + reader = csv.reader(f) + + def mktask(lines): + res = worker.submit_task( + project=TT_PROJECT, + recipe=json.dumps( + [{"mbid": line[0], "name": line[1]} for line in lines] + ), + unique_str=lines[0][0], + max_assign_time=60 * 5, + ) + print(res.text) + + def lines(): + line_batch = list() + + for line in reader: + if "Group" in line[3]: + line_batch.append(line) + if len(line_batch) >= 30: + res = 
list(line_batch) + line_batch.clear() + yield res + + tasks = list(lines()) + + pool = ThreadPool(processes=25) + pool.map(func=mktask, iterable=tasks) + diff --git a/last.fm b/last.fm new file mode 160000 index 0000000..855df64 --- /dev/null +++ b/last.fm @@ -0,0 +1 @@ +Subproject commit 855df64c316930062ff4f7740492d0f039788498 diff --git a/make_neoj4_db.sh b/make_neoj4_db.sh new file mode 100755 index 0000000..85930e7 --- /dev/null +++ b/make_neoj4_db.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3" +export REPOSITORY="http://localhost:9999" +export DATABASE="graph.db" + +rm -rf "${NEO4J_HOME}/data/databases/${DATABASE}" + +cp ${NEO4J_HOME}/conf/neo4j.conf ${NEO4J_HOME}/conf/neo4j.conf.bak +echo "dbms.security.auth_enabled=false" >> ${NEO4J_HOME}/conf/neo4j.conf + +mkdir workspace 2> /dev/null +cd workspace +rm *.csv + +wget ${REPOSITORY}/area.csv +wget ${REPOSITORY}/area_area.csv +wget ${REPOSITORY}/lastfm_artist.csv +wget ${REPOSITORY}/artist_area.csv +wget ${REPOSITORY}/artist_artist.csv +wget ${REPOSITORY}/artist_release.csv +wget ${REPOSITORY}/release.csv +wget ${REPOSITORY}/tag.csv +wget ${REPOSITORY}/tag_tag.csv +wget ${REPOSITORY}/release_tag.csv +wget ${REPOSITORY}/release_release.csv +wget ${REPOSITORY}/artist_tag.csv +wget ${REPOSITORY}/labels.csv +wget ${REPOSITORY}/label_label.csv +wget ${REPOSITORY}/lastfm_artist_artist.csv + +. 
${NEO4J_HOME}/bin/neo4j-admin import \ + --database ${DATABASE}\ + --high-io=true\ + --nodes:Area:MusicBrainzEntity "area.csv"\ + --nodes:MusicBrainzEntity "release.csv"\ + --nodes:MusicBrainzEntity "lastfm_artist.csv"\ + --nodes:Tag "tag.csv"\ + --nodes:MusicBrainzEntity "labels.csv"\ + --relationships:IS_PART_OF "area_area.csv"\ + --relationships:IS_BASED_IN "artist_area.csv"\ + --relationships "artist_artist.csv"\ + --relationships "artist_release.csv"\ + --relationships:IS_TAGGED "release_tag.csv"\ + --relationships:IS_TAGGED "artist_tag.csv"\ + --relationships:IS_RELATED_TO "tag_tag.csv"\ + --relationships "label_label.csv"\ + --relationships "release_release.csv"\ + --relationships:IS_RELATED_TO "lastfm_artist_artist.csv" + +rm *.csv +cd .. + diff --git a/make_release_to_rg_map.py b/make_release_to_rg_map.py new file mode 100644 index 0000000..ba21978 --- /dev/null +++ b/make_release_to_rg_map.py @@ -0,0 +1,31 @@ +import sqlite3 + +release_to_release_group_map = dict() +release_groups = dict() + +with open("in/release_group") as f: + for line in f: + cols = line.split("\t") + release_groups[cols[0]] = cols[1] + +with open("in/release") as f: + for line in f: + cols = line.split("\t") + release_to_release_group_map[cols[1]] = release_groups[cols[4]] + +with sqlite3.connect("mapdb.db") as conn: + + cursor = conn.cursor() + cursor.execute("CREATE TABLE map (release TEXT PRIMARY KEY , release_group TEXT)") + + for k, v in release_to_release_group_map.items(): + cursor.execute("INSERT INTO map (release, release_group) VALUES (?,?)", (k, v)) + conn.commit() + +""" +CREATE TABLE covers (id TEXT primary key, cover BLOB); +ATTACH 'mapdb.db' AS map; +ATTACH '/mnt/Data8/caa_tn_only.db' AS source; +INSERT OR IGNORE INTO covers SELECT release_group, cover FROM source.covers INNER JOIN map.map ON id = map.release; +""" + diff --git a/process_lastfm_data.py b/process_lastfm_data.py new file mode 100644 index 0000000..af6b1c6 --- /dev/null +++ b/process_lastfm_data.py @@ 
-0,0 +1,100 @@ +import csv +import json +import sqlite3 +from collections import defaultdict +import sys + +artists = set() + + +def disambiguate(lfm_artist, artist_release_count, name, mbid): + existing_mbid = lfm_artist.get(name, None) + + if existing_mbid and mbid != existing_mbid: + if artist_release_count[existing_mbid] < artist_release_count[mbid]: + + lfm_artist[name] = mbid + + print("Replacing %s (%s) with %s (%d) for %s" % + (existing_mbid, artist_release_count[existing_mbid], + mbid, artist_release_count[mbid], + name)) + else: + lfm_artist[name] = mbid + + +def patch(lastfm_data): + + artist_listeners = dict() + lastfm_artist_to_mbid = dict() + artist_release_count = defaultdict(int) + related = list() + + with open("repo/artist_release.csv") as f: + for line in f: + cols = line.split(',') + artist_release_count[cols[0]] += 1 + + with sqlite3.connect(lastfm_data) as conn: + cur = conn.cursor() + cur.execute("SELECT data FROM lastfmdata", ) + data = list(cur.fetchall()) + + # A lastfm artist name can refer to multiple MBIDs + # For RELATED_TO purposes, we assume that the MBID referring + # to the artist with the most official releases is the one + + for row in data: + meta = json.loads(row[0]) + + disambiguate(lastfm_artist_to_mbid, artist_release_count, meta["name"], meta["artist"]) + + for similar in [s for s in meta["similar"] if s["mbid"] is not None]: + disambiguate(lastfm_artist_to_mbid, artist_release_count, similar["name"], similar["mbid"]) + + # Get related links & listener counts + for row in data: + meta = json.loads(row[0]) + + artist_listeners[lastfm_artist_to_mbid[meta["name"]]] = \ + (meta["listeners"], meta["playcount"]) + + for similar in [s for s in meta["similar"] if s["mbid"] is not None]: + related.append(( + lastfm_artist_to_mbid[similar["name"]], + lastfm_artist_to_mbid[meta["name"]], + similar["match"] + )) + + with open("repo/lastfm_artist.csv", "w") as out: + writer = csv.writer(out) + writer.writerow([ + "id:ID(Artist)", 
"name", "year:short", ":LABEL", "listeners:int", "playcount:int" + ]) + + with open("repo/artist.csv") as f: + reader = csv.reader(f) + + reader.__next__() # Skip header + for row in reader: + writer.writerow([ + row[0], + row[1], + row[2], + row[3], + artist_listeners.get(row[0], (0, 0))[0], + artist_listeners.get(row[0], (0, 0))[1], + ]) + artists.add(row[0]) + + with open("repo/lastfm_artist_artist.csv", "w") as out: + out.write(",".join(( + ":START_ID(Artist)", ":END_ID(Artist)", "weight:float" + )) + "\n") + + for x in related: + if x[0] in artists and x[1] in artists: + out.write(",".join(x) + "\n") + + +patch(sys.argv[1]) diff --git a/process_mb_dump.py b/process_mb_dump.py new file mode 100644 index 0000000..b4d369f --- /dev/null +++ b/process_mb_dump.py @@ -0,0 +1,393 @@ +import os +from collections import defaultdict + +links = dict() +link_types = dict() +areas = dict() +labels = dict() +label_types = { + "\\N": "" +} +release_groups = dict() +release_statuses = dict() +release_to_release_group_map = dict() +release_types = { + "\\N": "", +} +artists = dict() +tags = dict() + +release_release_rel_map = { + "covers and versions": "", + "remixes and compilations": "", + "DJ-mix": "IS_DJ_MIX_OF", + "live performance": "IS_LIVE_PERFORMANCE_OF", + "cover": "IS_COVER_OF", + "remix": "IS_REMIX_OF", + "mashes up": "IS_MASHUP_OF", + "included in": "INCLUDED_IN", + "single from": "IS_SINGLE_FROM" +} + +artist_release_rel_map = { + "translator": "TRANSLATED", + "liner notes": "WROTE_LINER_NOTES", + "lyricist": "IS_LYRICIST_FOR", + "lacquer cut": "DID_LACQUER_CUT_FOR", + "samples from artist": "HAS_SAMPLES_IN", + "remixes and compilations": "", + "composition": "COMPOSED", + "booking": "DID_BOOKING_FOR", + "balance": "DID_BALANCE_FOR", + "misc": "HAS_MISC_ROLE_IN", + "conductor": "CONDUCTED", + "legal representation": "PROVIDED_LEGAL_REPRESENTATION_FOR", + "design/illustration": "DID_DESIGN_FOR", + "performing orchestra": "PERFORMED_FOR", + "producer": "PRODUCED", + 
"instrument": "PERFORMED_INSTRUMENT_FOR", + "writer": "WROTE_LYRICS_FOR", + "production": "DID_PRODUCTION_FOR", + "performance": "PERFORMED_FOR", + "composer": "IS_COMPOSER_FOR", + "sound": "DID_SOUND_FOR", + "remixer": "DID_REMIXING_FOR", + "orchestrator": "IS_ORCHESTRATOR_FOR", + "compiler": "DID_COMPILATION_FOR", + "vocal arranger": "IS_ARRANGER_FOR", + "arranger": "IS_ARRENGER_FOR", + "mix-DJ": "MIXED", + "editor": "IS_EDITOR_FOR", + "illustration": "DID_ILLUSTRATION_FOR", + "audio": "DID_AUDIO_FOR", + "publishing": "IS_PUBLISHER_FOR", + "art direction": "DID_ART_DIRECTOR_FOR", + "design": "DID_DESIGN_FOR", + "instrument arranger": "IS_ARRANGER_FOR", + "chorus master": "IS_CHORUS_MASTER_FOR", + "photography": "DID_PHOTOGRAPHY_FOR", + "performer": "PERFORMED_IN", + "graphic design": "DID_GRAPHIC_DESIGN_FOR", + "booklet editor": "IS_BOOKLET_EDITOR_FOR", + "programming": "DID_PROGRAMING_FOR", + "copyright": "IS_COPYRIGHT_HOLDER_OF", + "piano technician": "IS_PIANO_TECNICIAN_FOR", + "phonographic copyright": "IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF", + "mastering": "DID_MASTERING_FOR", + "vocal": "PERFORED_VOCALS_FOR", + "librettist": "IS_LIBRETTIST_FOR", + "mix": "MIXED", + "recording": "DID_RECORDING_FOR", + "concertmaster": "IS_CONCERTMASTER_FOR", + "engineer": "IS_ENGINEER_FOR", + + # release_group + "tribute": "IS_TRIBUTE_TO", + "dedicated to": "IS_DEDICATED_TO", + "creative direction": "", + "artists and repertoire": "" +} + +artist_artist_rel_map = { + "teacher": "TEACHER_OF", + "composer-in-residence": "HAS_COMPOSER-IN-RESIDENCE_STATUS_IN", + "member of band": "IS_MEMBER_OF", + "voice actor": "IS_VOICE_ACTOR_OF", + "tribute": "IS_TRIBUTE_TO", + "supporting musician": "IS_SUPPORTING_MUSICIAN_OF", + "instrumental supporting musician": "IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF", + "personal relationship": "HAS_PERSONAL_RELATIONSHIP_WITH", + "musical relationships": "HAS_MUSICAL_RELATIONSHIP_WITH", + "collaboration": "HAS_COLLABORATED_WITH", + "married": 
"IS_MARRIED_WITH", + "sibling": "IS_SIBLING_OF", + "parent": "IS_PARENT_OF", + "is person": "IS", + "conductor position": "IS_CONDUCTOR_OF", + "vocal supporting musician": "DOES_VOCAL_SUPPORT_FOR", + "artistic director": "IS_ARTIST_DIRECTOR_OF", + "subgroup": "IS_SUBGROUP_OF", + "founder": "IS_FOUNDER_OF", + "involved with": "IS_INVOLVED_WITH", + "named after": "IS_NAMED_AFTER", +} + +label_label_rel_map = { + "label rename": "WAS_RENAMED_TO", + "imprint": "DOES_IMPRINT_FOR", + "label distribution": "DOES_DISTRIBUTION_FOR", + "business association": "HAS_BUSINESS_ASSOCIATION_TO", + "label ownership": "OWNS", + "label reissue": "DOES_REISSUING_FOR" +} + +if not os.path.exists("repo"): + os.mkdir("repo") +else: + os.system("rm repo/*") +if not os.path.exists("tmp"): + os.mkdir("tmp") +else: + os.system("rm tmp/*") + +with open("in/link", "r") as f: + for line in f: + cols = line.split("\t") + links[cols[0]] = cols + +with open("in/release_status", "r") as f: + for line in f: + cols = line.split("\t") + release_statuses[cols[0]] = cols + +with open("in/link_type", "r") as f: + for line in f: + cols = line.split("\t") + link_types[cols[0]] = cols + +with open("in/area", "r") as f: + for line in f: + cols = line.split("\t") + areas[cols[0]] = cols + +with open("in/label_type") as f: + for line in f: + cols = line.split("\t") + + label_types[cols[0]] = ";" + cols[1].replace(" ", "") + + if cols[3] != "\\N" and cols[2] in label_types: + label_types[cols[0]] += label_types[cols[2]].replace(" ", "") + +with open("in/artist") as f: + for line in f: + cols = line.split("\t") + artists[cols[0]] = cols + +with open("repo/area_area.csv", "w") as out: + out.write(":START_ID(Area),:END_ID(Area)\n") + + with open("in/l_area_area", "r") as f: + for line in f: + cols = line.split("\t") + out.write(",".join((areas[cols[3]][1], + areas[cols[2]][1] + )) + "\n") + +with open("repo/area.csv", "w") as out: + out.write("id:ID(Area),name\n") + + for k, area in areas.items(): + 
out.write(",".join((area[1], + '"' + area[2] + '"' + )) + "\n") + +# ------ + + +out_artist = open("repo/artist.csv", "w") +out_artist_area = open("repo/artist_area.csv", "w") + +out_artist.write("id:ID(Artist),name,year:int,:LABEL\n") +out_artist_area.write(":START_ID(Artist),:END_ID(Area)\n") + +for _, artist in artists.items(): + out_artist.write(",".join(( + artist[1], + '"' + artist[2].replace("\"", "\"\"") + '"', + artist[4] if artist[4] != "\\N" else "0", + "Artist" + (";Group\n" if artist[10] == "2" else "\n") + ))) + + if artist[11] != "\\N": + out_artist_area.write(artist[1] + "," + areas[artist[11]][1] + "\n") + +out_artist.close() +out_artist_area.close() + +with open("repo/artist_artist.csv", "w") as out: + out.write(":START_ID(Artist),:END_ID(Artist),:TYPE\n") + + with open("in/l_artist_artist", "r") as f: + for line in f: + cols = line.split("\t") + out.write(",".join(( + artists[cols[2]][1], + artists[cols[3]][1], + artist_artist_rel_map[link_types[links[cols[1]][1]][6]] + "\n" + ))) + +# -------- + +with open("in/release_group_primary_type") as f: + for line in f: + cols = line.split("\t") + release_types[cols[0]] = ";" + cols[1] + +release_group_year = dict() +with open("in/release_group_meta") as f: + for line in f: + cols = line.split("\t") + release_group_year[cols[0]] = cols[2] if cols[2] != "\\N" else "0" + +with open("repo/release.csv", "w") as out: + out.write("id:ID(Release),name,year:int,:LABEL\n") + + with open("in/release_group") as f: + for line in f: + cols = line.split("\t") + out.write(",".join(( + cols[1], + '"' + cols[2].replace("\"", "\"\"") + '"', + release_group_year[cols[0]], + "Release" + release_types[cols[4]], + )) + "\n") + + release_groups[cols[0]] = cols + +with open("in/release") as f: + for line in f: + cols = line.split("\t") + if cols[5] != '\\N' and release_statuses[cols[5]][1] == "Official": + release_to_release_group_map[cols[0]] = cols[4] + +credit_names = defaultdict(list) + +with open("in/artist_credit_name") 
as f: + for line in f: + cols = line.split("\t") + credit_names[cols[0]].append(artists[cols[2]][1]) + +with open("tmp/tmp_artist_release.csv", "w") as out: + out.write(":START_ID(Artist),:END_ID(Release),:TYPE\n") + + # Is this part really necessary? + with open("in/l_artist_release") as f: + for line in f: + cols = line.split("\t") + if cols[3] in release_to_release_group_map: + out.write(",".join(( + artists[cols[2]][1], + release_groups[release_to_release_group_map[cols[3]]][1], + artist_release_rel_map[link_types[links[cols[1]][1]][6]] + )) + "\n") + + # Artist credits + with open("in/release") as f: + for line in f: + cols = line.split("\t") + if cols[0] in release_to_release_group_map: + for credit in credit_names[cols[3]]: + out.write(",".join(( + credit, + release_groups[release_to_release_group_map[cols[0]]][1], + "CREDITED_FOR" + )) + "\n") + +# Remove dupes +os.system("(head -n 1 tmp/tmp_artist_release.csv && tail -n +2 tmp/tmp_artist_release.csv" + " | sort) | uniq > repo/artist_release.csv && rm tmp/tmp_artist_release.csv") + + +with open("repo/release_release.csv", "w") as out: + out.write(":START_ID(Release),:END_ID(Release),:TYPE\n") + + with open("in/l_release_group_release_group") as f: + for line in f: + cols = line.split("\t") + out.write(",".join(( + release_groups[cols[2]][1], + release_groups[cols[3]][1], + release_release_rel_map[link_types[links[cols[1]][1]][6]] + )) + "\n") + +# --- + +with open("in/tag") as f: + with open("repo/tag.csv", "w") as out: + out.write("id:ID(Tag),name\n") + + for line in f: + cols = line.split("\t") + tags[cols[0]] = cols + out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"\n") + +with open("repo/release_tag.csv", "w") as out: + out.write(":START_ID(Release),:END_ID(Tag),weight:int\n") + + with open("in/release_group_tag") as f: + for line in f: + cols = line.split("\t") + + if int(cols[2]) <= 0: + continue + + out.write(",".join(( + release_groups[cols[0]][1], + cols[1], + cols[2], + )) + "\n") + 
+with open("repo/artist_tag.csv", "w") as out: + out.write(":START_ID(Artist),:END_ID(Tag),weight:int\n") + + with open("in/artist_tag") as f: + for line in f: + cols = line.split("\t") + + if int(cols[2]) <= 0: + continue + + out.write(",".join(( + artists[cols[0]][1], + cols[1], + cols[2], + )) + "\n") + +with open("repo/tag_tag.csv", "w") as out: + out.write(":START_ID(Tag),:END_ID(Tag),weight:int\n") + + with open("in/tag_relation") as f: + for line in f: + cols = line.split("\t") + + if int(cols[2]) <= 0: + continue + + out.write(",".join(( + cols[0], + cols[1], + cols[2], + )) + "\n") + +# ----- + +with open("repo/labels.csv", "w") as out: + out.write("id:ID(Label),name,code,:LABEL\n") + + with open("in/label") as f: + for line in f: + cols = line.split("\t") + labels[cols[0]] = cols + + out.write(",".join(( + cols[1], + "\"" + cols[2].replace("\"", "\"\"") + "\"", + cols[9] if cols[9] != "\\N" else "", + "Label" + label_types[cols[10]] + )) + "\n") + +with open("repo/label_label.csv", "w") as out: + out.write(":START_ID(Label),:END_ID(Label),:TYPE\n") + + with open("in/l_label_label") as f: + for line in f: + cols = line.split("\t") + + out.write(",".join(( + labels[cols[2]][1], + labels[cols[3]][1], + label_label_rel_map[link_types[links[cols[1]][1]][6]] + )) + "\n") + +# --- diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f229360 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +requests diff --git a/seed.cypher b/seed.cypher new file mode 100644 index 0000000..7a2fc34 --- /dev/null +++ b/seed.cypher @@ -0,0 +1,2 @@ +CREATE INDEX ON :Artist(id); +CREATE INDEX ON :Release(id); diff --git a/seed_neo4j_db.sh b/seed_neo4j_db.sh new file mode 100755 index 0000000..846b8ef --- /dev/null +++ b/seed_neo4j_db.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3" + +cat seed.cypher | ${NEO4J_HOME}/bin/cypher-shell