diff --git a/.gitignore b/.gitignore
index 894a44c..a8c339e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,11 @@ venv.bak/
# mypy
.mypy_cache/
+
+.idea/
+in/
+repo/
+tmp/
+workspace/
+worker.json
+*.db
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..11d1bc0
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,9 @@
+[submodule "task_tracker_drone"]
+ path = task_tracker_drone
+ url = https://github.com/simon987/task_tracker_drone/
+[submodule "last.fm"]
+ path = last.fm
+ url = https://git.simon987.net/drone/last.fm
+[submodule "caa"]
+ path = caa
+ url = https://git.simon987.net/drone/caa.git
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..28a804d
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..3e8c63a
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/music-graph-scripts.iml b/.idea/music-graph-scripts.iml
new file mode 100644
index 0000000..d6ebd48
--- /dev/null
+++ b/.idea/music-graph-scripts.iml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/caa b/caa
new file mode 160000
index 0000000..910f4a0
--- /dev/null
+++ b/caa
@@ -0,0 +1 @@
+Subproject commit 910f4a0bceadac37ac28fa59e7648f241c931fe6
diff --git a/download_mb_dump.sh b/download_mb_dump.sh
new file mode 100755
index 0000000..ff3bf47
--- /dev/null
+++ b/download_mb_dump.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+latest=$(curl http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/LATEST)
+
+mkdir in 2> /dev/null
+cd in
+
+wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump.tar.bz2"
+wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump-derived.tar.bz2"
+
+tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \
+mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \
+mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \
+mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status
+tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \
+mbdump/release_group_meta
+
+mv mbdump/* .
+rm -r mbdump
+cd ..
\ No newline at end of file
diff --git a/extract_covers.py b/extract_covers.py
new file mode 100644
index 0000000..e50ea6c
--- /dev/null
+++ b/extract_covers.py
@@ -0,0 +1,30 @@
+"""Dump cover blobs from an SQLite `covers` table to ./tmpcovers/<id>.jpg."""
+import os
+import sqlite3
+import sys
+
+with sqlite3.connect(sys.argv[1]) as conn:
+
+    cursor = conn.cursor()
+    cursor.execute("SELECT id from covers")
+
+    os.makedirs("./tmpcovers", exist_ok=True)
+
+    def rows():
+        # Batch ids so the IN (...) query below stays reasonably small.
+        buf = list()
+        for row in cursor.fetchall():
+            buf.append(row[0])
+            if len(buf) >= 30:
+                yield buf
+                buf.clear()
+        if buf:  # flush the trailing partial batch
+            yield buf
+
+    for batch in rows():
+        cursor.execute("SELECT id, cover from covers where id in (%s)" % (",".join(("'" + b + "'") for b in batch)))
+        # Select the id alongside the blob: IN (...) does not guarantee
+        # that rows come back in batch order.
+        for cover_id, cover in cursor.fetchall():
+            with open("./tmpcovers/" + cover_id + ".jpg", "wb") as out:
+                out.write(cover)
+            print(cover_id)
diff --git a/generate_caa_tasks.py b/generate_caa_tasks.py
new file mode 100644
index 0000000..cb20260
--- /dev/null
+++ b/generate_caa_tasks.py
@@ -0,0 +1,62 @@
+import json
+import zlib
+from multiprocessing.pool import ThreadPool
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 5
+
+
+done = set()
+# with sqlite3.connect(sys.argv[1]) as conn:
+#     cur = conn.cursor()
+#     cur.execute("SELECT id FROM covers")
+#     for mbid in cur.fetchall():
+#         done.add(mbid[0])
+
+api = TaskTrackerApi(TT_API_URL)
+
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("caa scraper")
+    worker.dump_to_file()
+worker.request_access(TT_PROJECT, True, True)
+input("Give permission to " + worker.alias)
+
+
+def mktask(mbids):
+    # crc32 is deterministic across runs, unlike builtin hash() which is
+    # randomized per process for str -- a stable value is needed so the
+    # tracker can deduplicate tasks between invocations.
+    res = worker.submit_task(
+        project=TT_PROJECT,
+        recipe=json.dumps(mbids),
+        hash64=zlib.crc32(mbids[0].encode()),
+        max_assign_time=60 * 30,
+        priority=1,
+        unique_str=None,
+        verification_count=None,
+        max_retries=5,
+    )
+    print(res.text)
+
+
+def lines():
+    with open("in/release") as f:
+        buf = list()
+
+        for line in f:
+            cols = line.split("\t")
+
+            buf.append(cols[1])
+            if len(buf) == 75:
+                a = list(buf)
+                buf.clear()
+                yield a
+
+        if buf:  # flush the trailing partial batch
+            yield buf
+
+
+pool = ThreadPool(processes=20)
+pool.map(func=mktask, iterable=lines())
diff --git a/generate_lastfm_tasks.py b/generate_lastfm_tasks.py
new file mode 100644
index 0000000..1d8051c
--- /dev/null
+++ b/generate_lastfm_tasks.py
@@ -0,0 +1,50 @@
+import csv
+import json
+from multiprocessing.pool import ThreadPool
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 1
+
+api = TaskTrackerApi(TT_API_URL)
+
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("last.fm scraper")
+    worker.dump_to_file()
+worker.request_access(TT_PROJECT, True, True)
+input("Give permission to " + worker.alias)
+
+with open("repo/artist.csv") as f:
+    reader = csv.reader(f)
+
+    def mktask(lines):
+        res = worker.submit_task(
+            project=TT_PROJECT,
+            recipe=json.dumps(
+                [{"mbid": line[0], "name": line[1]} for line in lines]
+            ),
+            unique_str=lines[0][0],
+            max_assign_time=60 * 5,
+        )
+        print(res.text)
+
+    def lines():
+        line_batch = list()
+
+        for line in reader:
+            if "Group" in line[3]:
+                line_batch.append(line)
+                if len(line_batch) >= 30:
+                    res = list(line_batch)
+                    line_batch.clear()
+                    yield res
+
+        if line_batch:  # flush the trailing partial batch
+            yield line_batch
+
+    tasks = list(lines())
+
+    pool = ThreadPool(processes=25)
+    pool.map(func=mktask, iterable=tasks)
diff --git a/last.fm b/last.fm
new file mode 160000
index 0000000..855df64
--- /dev/null
+++ b/last.fm
@@ -0,0 +1 @@
+Subproject commit 855df64c316930062ff4f7740492d0f039788498
diff --git a/make_neoj4_db.sh b/make_neoj4_db.sh
new file mode 100755
index 0000000..85930e7
--- /dev/null
+++ b/make_neoj4_db.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3"
+export REPOSITORY="http://localhost:9999"
+export DATABASE="graph.db"
+
+rm -rf "${NEO4J_HOME}/data/databases/${DATABASE}"
+
+cp ${NEO4J_HOME}/conf/neo4j.conf ${NEO4J_HOME}/conf/neo4j.conf.bak
+echo "dbms.security.auth_enabled=false" >> ${NEO4J_HOME}/conf/neo4j.conf
+
+mkdir workspace 2> /dev/null
+cd workspace
+rm *.csv
+
+wget ${REPOSITORY}/area.csv
+wget ${REPOSITORY}/area_area.csv
+wget ${REPOSITORY}/lastfm_artist.csv
+wget ${REPOSITORY}/artist_area.csv
+wget ${REPOSITORY}/artist_artist.csv
+wget ${REPOSITORY}/artist_release.csv
+wget ${REPOSITORY}/release.csv
+wget ${REPOSITORY}/tag.csv
+wget ${REPOSITORY}/tag_tag.csv
+wget ${REPOSITORY}/release_tag.csv
+wget ${REPOSITORY}/release_release.csv
+wget ${REPOSITORY}/artist_tag.csv
+wget ${REPOSITORY}/labels.csv
+wget ${REPOSITORY}/label_label.csv
+wget ${REPOSITORY}/lastfm_artist_artist.csv
+
+. ${NEO4J_HOME}/bin/neo4j-admin import \
+ --database ${DATABASE}\
+ --high-io=true\
+ --nodes:Area:MusicBrainzEntity "area.csv"\
+ --nodes:MusicBrainzEntity "release.csv"\
+ --nodes:MusicBrainzEntity "lastfm_artist.csv"\
+ --nodes:Tag "tag.csv"\
+ --nodes:MusicBrainzEntity "labels.csv"\
+ --relationships:IS_PART_OF "area_area.csv"\
+ --relationships:IS_BASED_IN "artist_area.csv"\
+ --relationships "artist_artist.csv"\
+ --relationships "artist_release.csv"\
+ --relationships:IS_TAGGED "release_tag.csv"\
+ --relationships:IS_TAGGED "artist_tag.csv"\
+ --relationships:IS_RELATED_TO "tag_tag.csv"\
+ --relationships "label_label.csv"\
+ --relationships "release_release.csv"\
+ --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"
+
+rm *.csv
+cd ..
+
diff --git a/make_release_to_rg_map.py b/make_release_to_rg_map.py
new file mode 100644
index 0000000..ba21978
--- /dev/null
+++ b/make_release_to_rg_map.py
@@ -0,0 +1,31 @@
+import sqlite3
+
+release_to_release_group_map = dict()
+release_groups = dict()
+
+with open("in/release_group") as f:
+ for line in f:
+ cols = line.split("\t")
+ release_groups[cols[0]] = cols[1]
+
+with open("in/release") as f:
+ for line in f:
+ cols = line.split("\t")
+ release_to_release_group_map[cols[1]] = release_groups[cols[4]]
+
+with sqlite3.connect("mapdb.db") as conn:
+
+ cursor = conn.cursor()
+ cursor.execute("CREATE TABLE map (release TEXT PRIMARY KEY , release_group TEXT)")
+
+ for k, v in release_to_release_group_map.items():
+ cursor.execute("INSERT INTO map (release, release_group) VALUES (?,?)", (k, v))
+ conn.commit()
+
+"""
+CREATE TABLE covers (id TEXT primary key, cover BLOB);
+ATTACH 'mapdb.db' AS map;
+ATTACH '/mnt/Data8/caa_tn_only.db' AS source;
+INSERT OR IGNORE INTO covers SELECT release_group, cover FROM source.covers INNER JOIN map.map ON id = map.release;
+"""
+
diff --git a/process_lastfm_data.py b/process_lastfm_data.py
new file mode 100644
index 0000000..af6b1c6
--- /dev/null
+++ b/process_lastfm_data.py
@@ -0,0 +1,100 @@
+import csv
+import json
+import sqlite3
+from collections import defaultdict
+import sys
+
+artists = set()
+
+
+def disambiguate(lfm_artist, artist_release_count, name, mbid):
+ existing_mbid = lfm_artist.get(name, None)
+
+ if existing_mbid and mbid != existing_mbid:
+ if artist_release_count[existing_mbid] < artist_release_count[mbid]:
+
+ lfm_artist[name] = mbid
+
+ print("Replacing %s (%s) with %s (%d) for %s" %
+ (existing_mbid, artist_release_count[existing_mbid],
+ mbid, artist_release_count[mbid],
+ name))
+ else:
+ lfm_artist[name] = mbid
+
+
+def patch(lastfm_data):
+
+ artist_listeners = dict()
+ lastfm_artist_to_mbid = dict()
+ artist_release_count = defaultdict(int)
+ related = list()
+
+ with open("repo/artist_release.csv") as f:
+ for line in f:
+ cols = line.split(',')
+ artist_release_count[cols[0]] += 1
+
+ with sqlite3.connect(lastfm_data) as conn:
+ cur = conn.cursor()
+ cur.execute("SELECT data FROM lastfmdata", )
+ data = list(cur.fetchall())
+
+ # A lastfm artist name can refer to multiple MBIDs
+ # For RELATED_TO purposes, we assume that the MBID referring
+ # to the artist with the most official releases is the one
+
+ for row in data:
+ meta = json.loads(row[0])
+
+ disambiguate(lastfm_artist_to_mbid, artist_release_count, meta["name"], meta["artist"])
+
+ for similar in [s for s in meta["similar"] if s["mbid"] is not None]:
+ disambiguate(lastfm_artist_to_mbid, artist_release_count, similar["name"], similar["mbid"])
+
+ # Get related links & listener counts
+ for row in data:
+ meta = json.loads(row[0])
+
+ artist_listeners[lastfm_artist_to_mbid[meta["name"]]] = \
+ (meta["listeners"], meta["playcount"])
+
+ for similar in [s for s in meta["similar"] if s["mbid"] is not None]:
+ related.append((
+ lastfm_artist_to_mbid[similar["name"]],
+ lastfm_artist_to_mbid[meta["name"]],
+ similar["match"]
+ ))
+
+ with open("repo/lastfm_artist.csv", "w") as out:
+ writer = csv.writer(out)
+ writer.writerow([
+ "id:ID(Artist)", "name", "year:short", ":LABEL", "listeners:int", "playcount:int"
+ ])
+
+ with open("repo/artist.csv") as f:
+ reader = csv.reader(f)
+
+ reader.__next__() # Skip header
+ for row in reader:
+ writer.writerow([
+ row[0],
+ row[1],
+ row[2],
+ row[3],
+ artist_listeners.get(row[0], (0, 0))[0],
+ artist_listeners.get(row[0], (0, 0))[1],
+ ])
+ artists.add(row[0])
+
+ with open("repo/lastfm_artist_artist.csv", "w") as out:
+ out.write(",".join((
+ ":START_ID(Artist)", ":END_ID(Artist)", "weight:float"
+ )) + "\n")
+
+ for x in related:
+ if x[0] in artists and x[1] in artists:
+ out.write(",".join(x) + "\n")
+
+
+patch(sys.argv[1])
diff --git a/process_mb_dump.py b/process_mb_dump.py
new file mode 100644
index 0000000..b4d369f
--- /dev/null
+++ b/process_mb_dump.py
@@ -0,0 +1,393 @@
+import os
+from collections import defaultdict
+
+links = dict()
+link_types = dict()
+areas = dict()
+labels = dict()
+label_types = {
+ "\\N": ""
+}
+release_groups = dict()
+release_statuses = dict()
+release_to_release_group_map = dict()
+release_types = {
+ "\\N": "",
+}
+artists = dict()
+tags = dict()
+
+release_release_rel_map = {
+ "covers and versions": "",
+ "remixes and compilations": "",
+ "DJ-mix": "IS_DJ_MIX_OF",
+ "live performance": "IS_LIVE_PERFORMANCE_OF",
+ "cover": "IS_COVER_OF",
+ "remix": "IS_REMIX_OF",
+ "mashes up": "IS_MASHUP_OF",
+ "included in": "INCLUDED_IN",
+ "single from": "IS_SINGLE_FROM"
+}
+
+artist_release_rel_map = {
+ "translator": "TRANSLATED",
+ "liner notes": "WROTE_LINER_NOTES",
+ "lyricist": "IS_LYRICIST_FOR",
+ "lacquer cut": "DID_LACQUER_CUT_FOR",
+ "samples from artist": "HAS_SAMPLES_IN",
+ "remixes and compilations": "",
+ "composition": "COMPOSED",
+ "booking": "DID_BOOKING_FOR",
+ "balance": "DID_BALANCE_FOR",
+ "misc": "HAS_MISC_ROLE_IN",
+ "conductor": "CONDUCTED",
+ "legal representation": "PROVIDED_LEGAL_REPRESENTATION_FOR",
+ "design/illustration": "DID_DESIGN_FOR",
+ "performing orchestra": "PERFORMED_FOR",
+ "producer": "PRODUCED",
+ "instrument": "PERFORMED_INSTRUMENT_FOR",
+ "writer": "WROTE_LYRICS_FOR",
+ "production": "DID_PRODUCTION_FOR",
+ "performance": "PERFORMED_FOR",
+ "composer": "IS_COMPOSER_FOR",
+ "sound": "DID_SOUND_FOR",
+ "remixer": "DID_REMIXING_FOR",
+ "orchestrator": "IS_ORCHESTRATOR_FOR",
+ "compiler": "DID_COMPILATION_FOR",
+ "vocal arranger": "IS_ARRANGER_FOR",
+    "arranger": "IS_ARRANGER_FOR",
+ "mix-DJ": "MIXED",
+ "editor": "IS_EDITOR_FOR",
+ "illustration": "DID_ILLUSTRATION_FOR",
+ "audio": "DID_AUDIO_FOR",
+ "publishing": "IS_PUBLISHER_FOR",
+ "art direction": "DID_ART_DIRECTOR_FOR",
+ "design": "DID_DESIGN_FOR",
+ "instrument arranger": "IS_ARRANGER_FOR",
+ "chorus master": "IS_CHORUS_MASTER_FOR",
+ "photography": "DID_PHOTOGRAPHY_FOR",
+ "performer": "PERFORMED_IN",
+ "graphic design": "DID_GRAPHIC_DESIGN_FOR",
+ "booklet editor": "IS_BOOKLET_EDITOR_FOR",
+    "programming": "DID_PROGRAMMING_FOR",
+ "copyright": "IS_COPYRIGHT_HOLDER_OF",
+    "piano technician": "IS_PIANO_TECHNICIAN_FOR",
+ "phonographic copyright": "IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF",
+ "mastering": "DID_MASTERING_FOR",
+    "vocal": "PERFORMED_VOCALS_FOR",
+ "librettist": "IS_LIBRETTIST_FOR",
+ "mix": "MIXED",
+ "recording": "DID_RECORDING_FOR",
+ "concertmaster": "IS_CONCERTMASTER_FOR",
+ "engineer": "IS_ENGINEER_FOR",
+
+ # release_group
+ "tribute": "IS_TRIBUTE_TO",
+ "dedicated to": "IS_DEDICATED_TO",
+ "creative direction": "",
+ "artists and repertoire": ""
+}
+
+artist_artist_rel_map = {
+ "teacher": "TEACHER_OF",
+ "composer-in-residence": "HAS_COMPOSER-IN-RESIDENCE_STATUS_IN",
+ "member of band": "IS_MEMBER_OF",
+ "voice actor": "IS_VOICE_ACTOR_OF",
+ "tribute": "IS_TRIBUTE_TO",
+ "supporting musician": "IS_SUPPORTING_MUSICIAN_OF",
+ "instrumental supporting musician": "IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF",
+ "personal relationship": "HAS_PERSONAL_RELATIONSHIP_WITH",
+ "musical relationships": "HAS_MUSICAL_RELATIONSHIP_WITH",
+ "collaboration": "HAS_COLLABORATED_WITH",
+ "married": "IS_MARRIED_WITH",
+ "sibling": "IS_SIBLING_OF",
+ "parent": "IS_PARENT_OF",
+ "is person": "IS",
+ "conductor position": "IS_CONDUCTOR_OF",
+ "vocal supporting musician": "DOES_VOCAL_SUPPORT_FOR",
+ "artistic director": "IS_ARTIST_DIRECTOR_OF",
+ "subgroup": "IS_SUBGROUP_OF",
+ "founder": "IS_FOUNDER_OF",
+ "involved with": "IS_INVOLVED_WITH",
+ "named after": "IS_NAMED_AFTER",
+}
+
+label_label_rel_map = {
+ "label rename": "WAS_RENAMED_TO",
+ "imprint": "DOES_IMPRINT_FOR",
+ "label distribution": "DOES_DISTRIBUTION_FOR",
+ "business association": "HAS_BUSINESS_ASSOCIATION_TO",
+ "label ownership": "OWNS",
+ "label reissue": "DOES_REISSUING_FOR"
+}
+
+if not os.path.exists("repo"):
+ os.mkdir("repo")
+else:
+ os.system("rm repo/*")
+if not os.path.exists("tmp"):
+ os.mkdir("tmp")
+else:
+ os.system("rm tmp/*")
+
+with open("in/link", "r") as f:
+ for line in f:
+ cols = line.split("\t")
+ links[cols[0]] = cols
+
+with open("in/release_status", "r") as f:
+ for line in f:
+ cols = line.split("\t")
+ release_statuses[cols[0]] = cols
+
+with open("in/link_type", "r") as f:
+ for line in f:
+ cols = line.split("\t")
+ link_types[cols[0]] = cols
+
+with open("in/area", "r") as f:
+ for line in f:
+ cols = line.split("\t")
+ areas[cols[0]] = cols
+
+with open("in/label_type") as f:
+ for line in f:
+ cols = line.split("\t")
+
+ label_types[cols[0]] = ";" + cols[1].replace(" ", "")
+
+ if cols[3] != "\\N" and cols[2] in label_types:
+ label_types[cols[0]] += label_types[cols[2]].replace(" ", "")
+
+with open("in/artist") as f:
+ for line in f:
+ cols = line.split("\t")
+ artists[cols[0]] = cols
+
+with open("repo/area_area.csv", "w") as out:
+ out.write(":START_ID(Area),:END_ID(Area)\n")
+
+ with open("in/l_area_area", "r") as f:
+ for line in f:
+ cols = line.split("\t")
+ out.write(",".join((areas[cols[3]][1],
+ areas[cols[2]][1]
+ )) + "\n")
+
+with open("repo/area.csv", "w") as out:
+ out.write("id:ID(Area),name\n")
+
+ for k, area in areas.items():
+ out.write(",".join((area[1],
+ '"' + area[2] + '"'
+ )) + "\n")
+
+# ------
+
+
+out_artist = open("repo/artist.csv", "w")
+out_artist_area = open("repo/artist_area.csv", "w")
+
+out_artist.write("id:ID(Artist),name,year:int,:LABEL\n")
+out_artist_area.write(":START_ID(Artist),:END_ID(Area)\n")
+
+for _, artist in artists.items():
+ out_artist.write(",".join((
+ artist[1],
+ '"' + artist[2].replace("\"", "\"\"") + '"',
+ artist[4] if artist[4] != "\\N" else "0",
+ "Artist" + (";Group\n" if artist[10] == "2" else "\n")
+ )))
+
+ if artist[11] != "\\N":
+ out_artist_area.write(artist[1] + "," + areas[artist[11]][1] + "\n")
+
+out_artist.close()
+out_artist_area.close()
+
+with open("repo/artist_artist.csv", "w") as out:
+ out.write(":START_ID(Artist),:END_ID(Artist),:TYPE\n")
+
+ with open("in/l_artist_artist", "r") as f:
+ for line in f:
+ cols = line.split("\t")
+ out.write(",".join((
+ artists[cols[2]][1],
+ artists[cols[3]][1],
+ artist_artist_rel_map[link_types[links[cols[1]][1]][6]] + "\n"
+ )))
+
+# --------
+
+with open("in/release_group_primary_type") as f:
+ for line in f:
+ cols = line.split("\t")
+ release_types[cols[0]] = ";" + cols[1]
+
+release_group_year = dict()
+with open("in/release_group_meta") as f:
+ for line in f:
+ cols = line.split("\t")
+ release_group_year[cols[0]] = cols[2] if cols[2] != "\\N" else "0"
+
+with open("repo/release.csv", "w") as out:
+ out.write("id:ID(Release),name,year:int,:LABEL\n")
+
+ with open("in/release_group") as f:
+ for line in f:
+ cols = line.split("\t")
+ out.write(",".join((
+ cols[1],
+ '"' + cols[2].replace("\"", "\"\"") + '"',
+ release_group_year[cols[0]],
+ "Release" + release_types[cols[4]],
+ )) + "\n")
+
+ release_groups[cols[0]] = cols
+
+with open("in/release") as f:
+ for line in f:
+ cols = line.split("\t")
+ if cols[5] != '\\N' and release_statuses[cols[5]][1] == "Official":
+ release_to_release_group_map[cols[0]] = cols[4]
+
+credit_names = defaultdict(list)
+
+with open("in/artist_credit_name") as f:
+ for line in f:
+ cols = line.split("\t")
+ credit_names[cols[0]].append(artists[cols[2]][1])
+
+with open("tmp/tmp_artist_release.csv", "w") as out:
+ out.write(":START_ID(Artist),:END_ID(Release),:TYPE\n")
+
+ # Is this part really necessary?
+ with open("in/l_artist_release") as f:
+ for line in f:
+ cols = line.split("\t")
+ if cols[3] in release_to_release_group_map:
+ out.write(",".join((
+ artists[cols[2]][1],
+ release_groups[release_to_release_group_map[cols[3]]][1],
+ artist_release_rel_map[link_types[links[cols[1]][1]][6]]
+ )) + "\n")
+
+ # Artist credits
+ with open("in/release") as f:
+ for line in f:
+ cols = line.split("\t")
+ if cols[0] in release_to_release_group_map:
+ for credit in credit_names[cols[3]]:
+ out.write(",".join((
+ credit,
+ release_groups[release_to_release_group_map[cols[0]]][1],
+ "CREDITED_FOR"
+ )) + "\n")
+
+# Remove dupes
+os.system("(head -n 1 tmp/tmp_artist_release.csv && tail -n +2 tmp/tmp_artist_release.csv"
+ " | sort) | uniq > repo/artist_release.csv && rm tmp/tmp_artist_release.csv")
+
+
+with open("repo/release_release.csv", "w") as out:
+ out.write(":START_ID(Release),:END_ID(Release),:TYPE\n")
+
+ with open("in/l_release_group_release_group") as f:
+ for line in f:
+ cols = line.split("\t")
+ out.write(",".join((
+ release_groups[cols[2]][1],
+ release_groups[cols[3]][1],
+ release_release_rel_map[link_types[links[cols[1]][1]][6]]
+ )) + "\n")
+
+# ---
+
+with open("in/tag") as f:
+ with open("repo/tag.csv", "w") as out:
+ out.write("id:ID(Tag),name\n")
+
+ for line in f:
+ cols = line.split("\t")
+ tags[cols[0]] = cols
+ out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"\n")
+
+with open("repo/release_tag.csv", "w") as out:
+ out.write(":START_ID(Release),:END_ID(Tag),weight:int\n")
+
+ with open("in/release_group_tag") as f:
+ for line in f:
+ cols = line.split("\t")
+
+ if int(cols[2]) <= 0:
+ continue
+
+ out.write(",".join((
+ release_groups[cols[0]][1],
+ cols[1],
+ cols[2],
+ )) + "\n")
+
+with open("repo/artist_tag.csv", "w") as out:
+ out.write(":START_ID(Artist),:END_ID(Tag),weight:int\n")
+
+ with open("in/artist_tag") as f:
+ for line in f:
+ cols = line.split("\t")
+
+ if int(cols[2]) <= 0:
+ continue
+
+ out.write(",".join((
+ artists[cols[0]][1],
+ cols[1],
+ cols[2],
+ )) + "\n")
+
+with open("repo/tag_tag.csv", "w") as out:
+ out.write(":START_ID(Tag),:END_ID(Tag),weight:int\n")
+
+ with open("in/tag_relation") as f:
+ for line in f:
+ cols = line.split("\t")
+
+ if int(cols[2]) <= 0:
+ continue
+
+ out.write(",".join((
+ cols[0],
+ cols[1],
+ cols[2],
+ )) + "\n")
+
+# -----
+
+with open("repo/labels.csv", "w") as out:
+ out.write("id:ID(Label),name,code,:LABEL\n")
+
+ with open("in/label") as f:
+ for line in f:
+ cols = line.split("\t")
+ labels[cols[0]] = cols
+
+ out.write(",".join((
+ cols[1],
+ "\"" + cols[2].replace("\"", "\"\"") + "\"",
+ cols[9] if cols[9] != "\\N" else "",
+ "Label" + label_types[cols[10]]
+ )) + "\n")
+
+with open("repo/label_label.csv", "w") as out:
+ out.write(":START_ID(Label),:END_ID(Label),:TYPE\n")
+
+ with open("in/l_label_label") as f:
+ for line in f:
+ cols = line.split("\t")
+
+ out.write(",".join((
+ labels[cols[2]][1],
+ labels[cols[3]][1],
+ label_label_rel_map[link_types[links[cols[1]][1]][6]]
+ )) + "\n")
+
+# ---
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f229360
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+requests
diff --git a/seed.cypher b/seed.cypher
new file mode 100644
index 0000000..7a2fc34
--- /dev/null
+++ b/seed.cypher
@@ -0,0 +1,2 @@
+CREATE INDEX ON :Artist(id);
+CREATE INDEX ON :Release(id);
diff --git a/seed_neo4j_db.sh b/seed_neo4j_db.sh
new file mode 100755
index 0000000..846b8ef
--- /dev/null
+++ b/seed_neo4j_db.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3"
+
+cat seed.cypher | ${NEO4J_HOME}/bin/cypher-shell