mirror of
https://github.com/simon987/music-graph-scripts.git
synced 2025-04-10 05:56:42 +00:00
Initial commit
This commit is contained in:
parent
5fb05d591a
commit
6d8c65fcd2
8
.gitignore
vendored
8
.gitignore
vendored
@ -102,3 +102,11 @@ venv.bak/
|
|||||||
|
|
||||||
# mypy
|
# mypy
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
|
||||||
|
.idea/
|
||||||
|
in/
|
||||||
|
repo/
|
||||||
|
tmp/
|
||||||
|
workspace/
|
||||||
|
worker.json
|
||||||
|
*.db
|
||||||
|
9
.gitmodules
vendored
Normal file
9
.gitmodules
vendored
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
[submodule "task_tracker_drone"]
|
||||||
|
path = task_tracker_drone
|
||||||
|
url = https://github.com/simon987/task_tracker_drone/
|
||||||
|
[submodule "last.fm"]
|
||||||
|
path = last.fm
|
||||||
|
url = https://git.simon987.net/drone/last.fm
|
||||||
|
[submodule "caa"]
|
||||||
|
path = caa
|
||||||
|
url = https://git.simon987.net/drone/caa.git
|
6
.idea/misc.xml
generated
Normal file
6
.idea/misc.xml
generated
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="JavaScriptSettings">
|
||||||
|
<option name="languageLevel" value="ES6" />
|
||||||
|
</component>
|
||||||
|
</project>
|
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/music-graph-scripts.iml" filepath="$PROJECT_DIR$/.idea/music-graph-scripts.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
9
.idea/music-graph-scripts.iml
generated
Normal file
9
.idea/music-graph-scripts.iml
generated
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="JAVA_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||||
|
<exclude-output />
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
0
__init__.py
Normal file
0
__init__.py
Normal file
1
caa
Submodule
1
caa
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 910f4a0bceadac37ac28fa59e7648f241c931fe6
|
20
download_mb_dump.sh
Executable file
20
download_mb_dump.sh
Executable file
@ -0,0 +1,20 @@
|
|||||||
|
#!/usr/bin/env bash
# Download the latest MusicBrainz full export and unpack the tables used by
# the processing scripts into ./in.

# -f: fail on an HTTP error instead of capturing the error page as the
# export name.
latest=$(curl -f http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/LATEST)
if [ -z "${latest}" ]; then
    echo "Could not determine the latest MusicBrainz export" >&2
    exit 1
fi

mkdir in 2> /dev/null
# Abort if ./in is unreachable: the mv/rm below must never run in the
# caller's directory.
cd in || exit 1

wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump.tar.bz2"
wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/mbdump-derived.tar.bz2"

tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \
    mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \
    mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \
    mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status
tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \
    mbdump/release_group_meta

# Flatten the extracted tables into ./in and drop the now-empty directory.
mv mbdump/* .
rm -r mbdump
cd ..
|
27
extract_covers.py
Normal file
27
extract_covers.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import sqlite3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Dump every cover stored in the SQLite database given as the first
# command-line argument to ./tmpcovers/<id>.jpg.
with sqlite3.connect(sys.argv[1]) as conn:

    cursor = conn.cursor()
    cursor.execute("SELECT id from covers")

    def rows():
        """Yield the cover ids in lists of at most 31 ids.

        The last, possibly shorter, list is yielded too so no cover is
        skipped.
        """
        buf = list()
        for row in cursor.fetchall():
            buf.append(row[0])
            if len(buf) > 30:
                yield buf
                # Start a fresh list instead of clearing the one just handed
                # to the consumer.
                buf = list()
        if buf:
            yield buf

    for batch in rows():
        # Bind the ids as parameters rather than splicing them into the SQL.
        placeholders = ",".join("?" * len(batch))
        cursor.execute(
            "SELECT id, cover from covers where id in (%s)" % placeholders,
            batch
        )
        # Select the id alongside the blob: the rows of an IN (...) query are
        # not guaranteed to come back in the order the ids were listed.
        for cover_id, cover in cursor.fetchall():
            with open("./tmpcovers/" + cover_id + ".jpg", "wb") as out:
                out.write(cover)
            print(cover_id)
|
56
generate_caa_tasks.py
Normal file
56
generate_caa_tasks.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
import json
|
||||||
|
from multiprocessing.pool import ThreadPool
|
||||||
|
|
||||||
|
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
|
||||||
|
|
||||||
|
TT_API_URL = "https://tt.simon987.net/api"
|
||||||
|
TT_PROJECT = 5
|
||||||
|
|
||||||
|
|
||||||
|
done = set()
|
||||||
|
# with sqlite3.connect(sys.argv[1]) as conn:
|
||||||
|
# cur = conn.cursor()
|
||||||
|
# cur.execute("SELECT id FROM covers")
|
||||||
|
# for mbid in cur.fetchall():
|
||||||
|
# done.add(mbid[0])
|
||||||
|
|
||||||
|
api = TaskTrackerApi(TT_API_URL)
|
||||||
|
|
||||||
|
worker = Worker.from_file(api)
|
||||||
|
if not worker:
|
||||||
|
worker = api.make_worker("caa scraper")
|
||||||
|
worker.dump_to_file()
|
||||||
|
worker.request_access(TT_PROJECT, True, True)
|
||||||
|
input("Give permission to " + worker.alias)
|
||||||
|
|
||||||
|
|
||||||
|
def mktask(mbids):
    """Submit one task whose recipe is the JSON-encoded list *mbids*.

    The text of the response returned by ``worker.submit_task`` is printed.
    """
    # NOTE(review): hash() of a str is salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so hash64 differs between runs -- confirm
    # whether the task tracker relies on it being stable.
    task = dict(
        project=TT_PROJECT,
        recipe=json.dumps(mbids),
        hash64=hash(mbids[0]),
        max_assign_time=60 * 30,
        priority=1,
        unique_str=None,
        verification_count=None,
        max_retries=5,
    )
    response = worker.submit_task(**task)
    print(response.text)
|
||||||
|
|
||||||
|
|
||||||
|
def lines():
    """Yield the second tab-separated column of ``in/release`` in batches.

    Each batch is a list of up to 75 values. The final batch may be shorter;
    it is yielded as well so the last rows of the file are not lost.
    """
    with open("in/release") as f:
        buf = list()

        for line in f:
            cols = line.split("\t")

            buf.append(cols[1])
            if len(buf) == 75:
                a = list(buf)
                buf.clear()
                yield a

        # Flush whatever is left once the file is exhausted.
        if buf:
            yield buf
|
||||||
|
|
||||||
|
|
||||||
|
pool = ThreadPool(processes=20)
|
||||||
|
pool.map(func=mktask, iterable=lines())
|
48
generate_lastfm_tasks.py
Normal file
48
generate_lastfm_tasks.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
import csv
|
||||||
|
import json
|
||||||
|
from multiprocessing.pool import ThreadPool
|
||||||
|
|
||||||
|
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
|
||||||
|
|
||||||
|
TT_API_URL = "https://tt.simon987.net/api"
|
||||||
|
TT_PROJECT = 1
|
||||||
|
|
||||||
|
api = TaskTrackerApi(TT_API_URL)
|
||||||
|
|
||||||
|
worker = Worker.from_file(api)
|
||||||
|
if not worker:
|
||||||
|
worker = api.make_worker("last.fm scraper")
|
||||||
|
worker.dump_to_file()
|
||||||
|
worker.request_access(TT_PROJECT, True, True)
|
||||||
|
input("Give permission to " + worker.alias)
|
||||||
|
|
||||||
|
# Read repo/artist.csv and submit the rows whose fourth column contains
# "Group" to the task tracker, in batches of up to 30 rows per task.
with open("repo/artist.csv") as f:
    reader = csv.reader(f)

    def mktask(lines):
        """Submit one task whose recipe lists the mbid and name of each row.

        The first row's first column is used as the task's ``unique_str``.
        """
        res = worker.submit_task(
            project=TT_PROJECT,
            recipe=json.dumps(
                [{"mbid": line[0], "name": line[1]} for line in lines]
            ),
            unique_str=lines[0][0],
            max_assign_time=60 * 5,
        )
        print(res.text)

    def lines():
        """Yield lists of up to 30 matching rows from ``reader``.

        The final list may be shorter; it is yielded too so the last rows of
        the file still get a task.
        """
        line_batch = list()

        for line in reader:
            if "Group" in line[3]:
                line_batch.append(line)
                if len(line_batch) >= 30:
                    res = list(line_batch)
                    line_batch.clear()
                    yield res

        # Flush the remainder once the reader is exhausted.
        if line_batch:
            yield line_batch

    # Materialise every batch while the file is still open.
    tasks = list(lines())

    pool = ThreadPool(processes=25)
    pool.map(func=mktask, iterable=tasks)
|
||||||
|
|
1
last.fm
Submodule
1
last.fm
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 855df64c316930062ff4f7740492d0f039788498
|
53
make_neoj4_db.sh
Executable file
53
make_neoj4_db.sh
Executable file
@ -0,0 +1,53 @@
|
|||||||
|
#!/bin/bash
# Rebuild the Neo4j database ${DATABASE} from the CSV files served by
# ${REPOSITORY}, using ./workspace as a scratch directory.

export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3"
export REPOSITORY="http://localhost:9999"
export DATABASE="graph.db"

rm -rf "${NEO4J_HOME}/data/databases/${DATABASE}"

cp "${NEO4J_HOME}/conf/neo4j.conf" "${NEO4J_HOME}/conf/neo4j.conf.bak"
echo "dbms.security.auth_enabled=false" >> "${NEO4J_HOME}/conf/neo4j.conf"

mkdir workspace 2> /dev/null
# Abort if the workspace is unreachable: the "rm *.csv" below must never run
# in the caller's directory.
cd workspace || exit 1
rm *.csv

for name in area area_area lastfm_artist artist_area artist_artist \
    artist_release release tag tag_tag release_tag release_release \
    artist_tag labels label_label lastfm_artist_artist
do
    wget "${REPOSITORY}/${name}.csv"
done

# NOTE(review): neo4j-admin is sourced (".") rather than executed; if that
# script ends in an exec or exit, the cleanup below never runs -- confirm.
. "${NEO4J_HOME}/bin/neo4j-admin" import \
    --database "${DATABASE}" \
    --high-io=true \
    --nodes:Area:MusicBrainzEntity "area.csv" \
    --nodes:MusicBrainzEntity "release.csv" \
    --nodes:MusicBrainzEntity "lastfm_artist.csv" \
    --nodes:Tag "tag.csv" \
    --nodes:MusicBrainzEntity "labels.csv" \
    --relationships:IS_PART_OF "area_area.csv" \
    --relationships:IS_BASED_IN "artist_area.csv" \
    --relationships "artist_artist.csv" \
    --relationships "artist_release.csv" \
    --relationships:IS_TAGGED "release_tag.csv" \
    --relationships:IS_TAGGED "artist_tag.csv" \
    --relationships:IS_RELATED_TO "tag_tag.csv" \
    --relationships "label_label.csv" \
    --relationships "release_release.csv" \
    --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"

rm *.csv
cd ..
|
||||||
|
|
31
make_release_to_rg_map.py
Normal file
31
make_release_to_rg_map.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import sqlite3
|
||||||
|
|
||||||
|
# Index in/release_group: first tab-separated column -> second column
# (presumably numeric id -> MBID; verify against the dump schema).
release_groups = dict()
with open("in/release_group") as rg_file:
    for rg_line in rg_file:
        fields = rg_line.split("\t")
        release_groups[fields[0]] = fields[1]

# Map each release (second column of in/release) to the value stored for the
# release group referenced by its fifth column.
release_to_release_group_map = dict()
with open("in/release") as release_file:
    for release_line in release_file:
        fields = release_line.split("\t")
        release_to_release_group_map[fields[1]] = release_groups[fields[4]]

# Persist the mapping as table "map" in mapdb.db.
with sqlite3.connect("mapdb.db") as conn:
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE map (release TEXT PRIMARY KEY , release_group TEXT)")
    cursor.executemany(
        "INSERT INTO map (release, release_group) VALUES (?,?)",
        release_to_release_group_map.items()
    )
    conn.commit()
|
||||||
|
|
||||||
|
"""
|
||||||
|
CREATE TABLE covers (id TEXT primary key, cover BLOB);
|
||||||
|
ATTACH 'mapdb.db' AS map;
|
||||||
|
ATTACH '/mnt/Data8/caa_tn_only.db' AS source;
|
||||||
|
INSERT OR IGNORE INTO covers SELECT release_group, cover FROM source.covers INNER JOIN map.map ON id = map.release;
|
||||||
|
"""
|
||||||
|
|
100
process_lastfm_data.py
Normal file
100
process_lastfm_data.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
from collections import defaultdict
|
||||||
|
import sys
|
||||||
|
|
||||||
|
artists = set()
|
||||||
|
|
||||||
|
|
||||||
|
def disambiguate(lfm_artist, artist_release_count, name, mbid):
    """Record which MBID the last.fm artist *name* should map to.

    If *name* has no mapping yet, or is already mapped to *mbid*, the mapping
    is simply (re)written. Otherwise the stored MBID is replaced only when
    *mbid* has a strictly higher count in *artist_release_count*, and the
    replacement is printed.

    :param lfm_artist: dict of last.fm artist name -> MBID, updated in place
    :param artist_release_count: mapping of MBID -> release count
    :param name: last.fm artist name
    :param mbid: candidate MBID for *name*
    """
    current = lfm_artist.get(name, None)

    # Nothing to arbitrate: no previous (truthy) entry, or the same MBID.
    if not current or mbid == current:
        lfm_artist[name] = mbid
        return

    current_count = artist_release_count[current]
    candidate_count = artist_release_count[mbid]
    if current_count < candidate_count:
        lfm_artist[name] = mbid
        print("Replacing %s (%s) with %s (%d) for %s" %
              (current, current_count, mbid, candidate_count, name))
|
||||||
|
|
||||||
|
|
||||||
|
def patch(lastfm_data):
    """Merge scraped last.fm data into the artist CSV files under ``repo/``.

    Reads ``repo/artist_release.csv``, ``repo/artist.csv`` and the
    ``lastfmdata`` table of the SQLite database at *lastfm_data*, then writes
    ``repo/lastfm_artist.csv`` (the artist rows extended with listener and
    play counts, 0 when unknown) and ``repo/lastfm_artist_artist.csv``
    (similar-artist links whose two ends both appear in ``repo/artist.csv``).
    Every artist id written is also added to the module-level ``artists`` set.

    :param lastfm_data: path of the SQLite database holding the scraped data
    """
    artist_listeners = dict()
    lastfm_artist_to_mbid = dict()
    artist_release_count = defaultdict(int)
    related = list()

    # Count the rows of artist_release.csv per value of the first column.
    with open("repo/artist_release.csv") as f:
        for line in f:
            cols = line.split(',')
            artist_release_count[cols[0]] += 1

    with sqlite3.connect(lastfm_data) as conn:
        cur = conn.cursor()
        cur.execute("SELECT data FROM lastfmdata")
        # Decode each JSON document once; both passes below reuse the result.
        data = [json.loads(row[0]) for row in cur.fetchall()]

    # A last.fm artist name can refer to multiple MBIDs. For RELATED_TO
    # purposes, we assume that the MBID with the most rows in
    # artist_release.csv is the one the name refers to.
    for meta in data:
        disambiguate(lastfm_artist_to_mbid, artist_release_count, meta["name"], meta["artist"])

        for similar in [s for s in meta["similar"] if s["mbid"] is not None]:
            disambiguate(lastfm_artist_to_mbid, artist_release_count, similar["name"], similar["mbid"])

    # Get related links & listener counts
    for meta in data:
        artist_listeners[lastfm_artist_to_mbid[meta["name"]]] = \
            (meta["listeners"], meta["playcount"])

        for similar in [s for s in meta["similar"] if s["mbid"] is not None]:
            related.append((
                lastfm_artist_to_mbid[similar["name"]],
                lastfm_artist_to_mbid[meta["name"]],
                similar["match"]
            ))

    # newline="" lets the csv writer control line endings itself, as the csv
    # module documentation recommends.
    with open("repo/lastfm_artist.csv", "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow([
            "id:ID(Artist)", "name", "year:short", ":LABEL", "listeners:int", "playcount:int"
        ])

        with open("repo/artist.csv") as f:
            reader = csv.reader(f)

            next(reader)  # Skip header
            for row in reader:
                listeners, playcount = artist_listeners.get(row[0], (0, 0))
                writer.writerow([
                    row[0],
                    row[1],
                    row[2],
                    row[3],
                    listeners,
                    playcount,
                ])
                artists.add(row[0])

    with open("repo/lastfm_artist_artist.csv", "w") as out:
        out.write(",".join((
            ":START_ID(Artist)", ":END_ID(Artist)", "weight:float"
        )) + "\n")

        for x in related:
            if x[0] in artists and x[1] in artists:
                # str() keeps this working when "match" was stored as a JSON
                # number rather than a string.
                out.write(",".join(map(str, x)) + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
patch(sys.argv[1])
|
393
process_mb_dump.py
Normal file
393
process_mb_dump.py
Normal file
@ -0,0 +1,393 @@
|
|||||||
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
links = dict()
|
||||||
|
link_types = dict()
|
||||||
|
areas = dict()
|
||||||
|
labels = dict()
|
||||||
|
label_types = {
|
||||||
|
"\\N": ""
|
||||||
|
}
|
||||||
|
release_groups = dict()
|
||||||
|
release_statuses = dict()
|
||||||
|
release_to_release_group_map = dict()
|
||||||
|
release_types = {
|
||||||
|
"\\N": "",
|
||||||
|
}
|
||||||
|
artists = dict()
|
||||||
|
tags = dict()
|
||||||
|
|
||||||
|
release_release_rel_map = {
|
||||||
|
"covers and versions": "",
|
||||||
|
"remixes and compilations": "",
|
||||||
|
"DJ-mix": "IS_DJ_MIX_OF",
|
||||||
|
"live performance": "IS_LIVE_PERFORMANCE_OF",
|
||||||
|
"cover": "IS_COVER_OF",
|
||||||
|
"remix": "IS_REMIX_OF",
|
||||||
|
"mashes up": "IS_MASHUP_OF",
|
||||||
|
"included in": "INCLUDED_IN",
|
||||||
|
"single from": "IS_SINGLE_FROM"
|
||||||
|
}
|
||||||
|
|
||||||
|
artist_release_rel_map = {
|
||||||
|
"translator": "TRANSLATED",
|
||||||
|
"liner notes": "WROTE_LINER_NOTES",
|
||||||
|
"lyricist": "IS_LYRICIST_FOR",
|
||||||
|
"lacquer cut": "DID_LACQUER_CUT_FOR",
|
||||||
|
"samples from artist": "HAS_SAMPLES_IN",
|
||||||
|
"remixes and compilations": "",
|
||||||
|
"composition": "COMPOSED",
|
||||||
|
"booking": "DID_BOOKING_FOR",
|
||||||
|
"balance": "DID_BALANCE_FOR",
|
||||||
|
"misc": "HAS_MISC_ROLE_IN",
|
||||||
|
"conductor": "CONDUCTED",
|
||||||
|
"legal representation": "PROVIDED_LEGAL_REPRESENTATION_FOR",
|
||||||
|
"design/illustration": "DID_DESIGN_FOR",
|
||||||
|
"performing orchestra": "PERFORMED_FOR",
|
||||||
|
"producer": "PRODUCED",
|
||||||
|
"instrument": "PERFORMED_INSTRUMENT_FOR",
|
||||||
|
"writer": "WROTE_LYRICS_FOR",
|
||||||
|
"production": "DID_PRODUCTION_FOR",
|
||||||
|
"performance": "PERFORMED_FOR",
|
||||||
|
"composer": "IS_COMPOSER_FOR",
|
||||||
|
"sound": "DID_SOUND_FOR",
|
||||||
|
"remixer": "DID_REMIXING_FOR",
|
||||||
|
"orchestrator": "IS_ORCHESTRATOR_FOR",
|
||||||
|
"compiler": "DID_COMPILATION_FOR",
|
||||||
|
"vocal arranger": "IS_ARRANGER_FOR",
|
||||||
|
"arranger": "IS_ARRENGER_FOR",
|
||||||
|
"mix-DJ": "MIXED",
|
||||||
|
"editor": "IS_EDITOR_FOR",
|
||||||
|
"illustration": "DID_ILLUSTRATION_FOR",
|
||||||
|
"audio": "DID_AUDIO_FOR",
|
||||||
|
"publishing": "IS_PUBLISHER_FOR",
|
||||||
|
"art direction": "DID_ART_DIRECTOR_FOR",
|
||||||
|
"design": "DID_DESIGN_FOR",
|
||||||
|
"instrument arranger": "IS_ARRANGER_FOR",
|
||||||
|
"chorus master": "IS_CHORUS_MASTER_FOR",
|
||||||
|
"photography": "DID_PHOTOGRAPHY_FOR",
|
||||||
|
"performer": "PERFORMED_IN",
|
||||||
|
"graphic design": "DID_GRAPHIC_DESIGN_FOR",
|
||||||
|
"booklet editor": "IS_BOOKLET_EDITOR_FOR",
|
||||||
|
"programming": "DID_PROGRAMING_FOR",
|
||||||
|
"copyright": "IS_COPYRIGHT_HOLDER_OF",
|
||||||
|
"piano technician": "IS_PIANO_TECNICIAN_FOR",
|
||||||
|
"phonographic copyright": "IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF",
|
||||||
|
"mastering": "DID_MASTERING_FOR",
|
||||||
|
"vocal": "PERFORED_VOCALS_FOR",
|
||||||
|
"librettist": "IS_LIBRETTIST_FOR",
|
||||||
|
"mix": "MIXED",
|
||||||
|
"recording": "DID_RECORDING_FOR",
|
||||||
|
"concertmaster": "IS_CONCERTMASTER_FOR",
|
||||||
|
"engineer": "IS_ENGINEER_FOR",
|
||||||
|
|
||||||
|
# release_group
|
||||||
|
"tribute": "IS_TRIBUTE_TO",
|
||||||
|
"dedicated to": "IS_DEDICATED_TO",
|
||||||
|
"creative direction": "",
|
||||||
|
"artists and repertoire": ""
|
||||||
|
}
|
||||||
|
|
||||||
|
artist_artist_rel_map = {
|
||||||
|
"teacher": "TEACHER_OF",
|
||||||
|
"composer-in-residence": "HAS_COMPOSER-IN-RESIDENCE_STATUS_IN",
|
||||||
|
"member of band": "IS_MEMBER_OF",
|
||||||
|
"voice actor": "IS_VOICE_ACTOR_OF",
|
||||||
|
"tribute": "IS_TRIBUTE_TO",
|
||||||
|
"supporting musician": "IS_SUPPORTING_MUSICIAN_OF",
|
||||||
|
"instrumental supporting musician": "IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF",
|
||||||
|
"personal relationship": "HAS_PERSONAL_RELATIONSHIP_WITH",
|
||||||
|
"musical relationships": "HAS_MUSICAL_RELATIONSHIP_WITH",
|
||||||
|
"collaboration": "HAS_COLLABORATED_WITH",
|
||||||
|
"married": "IS_MARRIED_WITH",
|
||||||
|
"sibling": "IS_SIBLING_OF",
|
||||||
|
"parent": "IS_PARENT_OF",
|
||||||
|
"is person": "IS",
|
||||||
|
"conductor position": "IS_CONDUCTOR_OF",
|
||||||
|
"vocal supporting musician": "DOES_VOCAL_SUPPORT_FOR",
|
||||||
|
"artistic director": "IS_ARTIST_DIRECTOR_OF",
|
||||||
|
"subgroup": "IS_SUBGROUP_OF",
|
||||||
|
"founder": "IS_FOUNDER_OF",
|
||||||
|
"involved with": "IS_INVOLVED_WITH",
|
||||||
|
"named after": "IS_NAMED_AFTER",
|
||||||
|
}
|
||||||
|
|
||||||
|
label_label_rel_map = {
|
||||||
|
"label rename": "WAS_RENAMED_TO",
|
||||||
|
"imprint": "DOES_IMPRINT_FOR",
|
||||||
|
"label distribution": "DOES_DISTRIBUTION_FOR",
|
||||||
|
"business association": "HAS_BUSINESS_ASSOCIATION_TO",
|
||||||
|
"label ownership": "OWNS",
|
||||||
|
"label reissue": "DOES_REISSUING_FOR"
|
||||||
|
}
|
||||||
|
|
||||||
|
if not os.path.exists("repo"):
|
||||||
|
os.mkdir("repo")
|
||||||
|
else:
|
||||||
|
os.system("rm repo/*")
|
||||||
|
if not os.path.exists("tmp"):
|
||||||
|
os.mkdir("tmp")
|
||||||
|
else:
|
||||||
|
os.system("rm tmp/*")
|
||||||
|
|
||||||
|
with open("in/link", "r") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
links[cols[0]] = cols
|
||||||
|
|
||||||
|
with open("in/release_status", "r") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
release_statuses[cols[0]] = cols
|
||||||
|
|
||||||
|
with open("in/link_type", "r") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
link_types[cols[0]] = cols
|
||||||
|
|
||||||
|
with open("in/area", "r") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
areas[cols[0]] = cols
|
||||||
|
|
||||||
|
with open("in/label_type") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
|
||||||
|
label_types[cols[0]] = ";" + cols[1].replace(" ", "")
|
||||||
|
|
||||||
|
if cols[3] != "\\N" and cols[2] in label_types:
|
||||||
|
label_types[cols[0]] += label_types[cols[2]].replace(" ", "")
|
||||||
|
|
||||||
|
with open("in/artist") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
artists[cols[0]] = cols
|
||||||
|
|
||||||
|
with open("repo/area_area.csv", "w") as out:
|
||||||
|
out.write(":START_ID(Area),:END_ID(Area)\n")
|
||||||
|
|
||||||
|
with open("in/l_area_area", "r") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
out.write(",".join((areas[cols[3]][1],
|
||||||
|
areas[cols[2]][1]
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
# Write one node row per area: second column of in/area as the id, third
# column as the quoted name.
with open("repo/area.csv", "w") as out:
    out.write("id:ID(Area),name\n")

    for k, area in areas.items():
        # Double embedded quotes, as the artist, release, tag and label
        # writers in this script do, so a name containing '"' cannot break
        # the quoted CSV field.
        out.write(",".join((area[1],
                            '"' + area[2].replace("\"", "\"\"") + '"'
                            )) + "\n")
|
||||||
|
|
||||||
|
# ------
|
||||||
|
|
||||||
|
|
||||||
|
out_artist = open("repo/artist.csv", "w")
|
||||||
|
out_artist_area = open("repo/artist_area.csv", "w")
|
||||||
|
|
||||||
|
out_artist.write("id:ID(Artist),name,year:int,:LABEL\n")
|
||||||
|
out_artist_area.write(":START_ID(Artist),:END_ID(Area)\n")
|
||||||
|
|
||||||
|
for _, artist in artists.items():
|
||||||
|
out_artist.write(",".join((
|
||||||
|
artist[1],
|
||||||
|
'"' + artist[2].replace("\"", "\"\"") + '"',
|
||||||
|
artist[4] if artist[4] != "\\N" else "0",
|
||||||
|
"Artist" + (";Group\n" if artist[10] == "2" else "\n")
|
||||||
|
)))
|
||||||
|
|
||||||
|
if artist[11] != "\\N":
|
||||||
|
out_artist_area.write(artist[1] + "," + areas[artist[11]][1] + "\n")
|
||||||
|
|
||||||
|
out_artist.close()
|
||||||
|
out_artist_area.close()
|
||||||
|
|
||||||
|
with open("repo/artist_artist.csv", "w") as out:
|
||||||
|
out.write(":START_ID(Artist),:END_ID(Artist),:TYPE\n")
|
||||||
|
|
||||||
|
with open("in/l_artist_artist", "r") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
out.write(",".join((
|
||||||
|
artists[cols[2]][1],
|
||||||
|
artists[cols[3]][1],
|
||||||
|
artist_artist_rel_map[link_types[links[cols[1]][1]][6]] + "\n"
|
||||||
|
)))
|
||||||
|
|
||||||
|
# --------
|
||||||
|
|
||||||
|
with open("in/release_group_primary_type") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
release_types[cols[0]] = ";" + cols[1]
|
||||||
|
|
||||||
|
release_group_year = dict()
|
||||||
|
with open("in/release_group_meta") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
release_group_year[cols[0]] = cols[2] if cols[2] != "\\N" else "0"
|
||||||
|
|
||||||
|
with open("repo/release.csv", "w") as out:
|
||||||
|
out.write("id:ID(Release),name,year:int,:LABEL\n")
|
||||||
|
|
||||||
|
with open("in/release_group") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
out.write(",".join((
|
||||||
|
cols[1],
|
||||||
|
'"' + cols[2].replace("\"", "\"\"") + '"',
|
||||||
|
release_group_year[cols[0]],
|
||||||
|
"Release" + release_types[cols[4]],
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
release_groups[cols[0]] = cols
|
||||||
|
|
||||||
|
with open("in/release") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
if cols[5] != '\\N' and release_statuses[cols[5]][1] == "Official":
|
||||||
|
release_to_release_group_map[cols[0]] = cols[4]
|
||||||
|
|
||||||
|
credit_names = defaultdict(list)
|
||||||
|
|
||||||
|
with open("in/artist_credit_name") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
credit_names[cols[0]].append(artists[cols[2]][1])
|
||||||
|
|
||||||
|
with open("tmp/tmp_artist_release.csv", "w") as out:
|
||||||
|
out.write(":START_ID(Artist),:END_ID(Release),:TYPE\n")
|
||||||
|
|
||||||
|
# Is this part really necessary?
|
||||||
|
with open("in/l_artist_release") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
if cols[3] in release_to_release_group_map:
|
||||||
|
out.write(",".join((
|
||||||
|
artists[cols[2]][1],
|
||||||
|
release_groups[release_to_release_group_map[cols[3]]][1],
|
||||||
|
artist_release_rel_map[link_types[links[cols[1]][1]][6]]
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
# Artist credits
|
||||||
|
with open("in/release") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
if cols[0] in release_to_release_group_map:
|
||||||
|
for credit in credit_names[cols[3]]:
|
||||||
|
out.write(",".join((
|
||||||
|
credit,
|
||||||
|
release_groups[release_to_release_group_map[cols[0]]][1],
|
||||||
|
"CREDITED_FOR"
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
# Remove dupes
|
||||||
|
os.system("(head -n 1 tmp/tmp_artist_release.csv && tail -n +2 tmp/tmp_artist_release.csv"
|
||||||
|
" | sort) | uniq > repo/artist_release.csv && rm tmp/tmp_artist_release.csv")
|
||||||
|
|
||||||
|
|
||||||
|
with open("repo/release_release.csv", "w") as out:
|
||||||
|
out.write(":START_ID(Release),:END_ID(Release),:TYPE\n")
|
||||||
|
|
||||||
|
with open("in/l_release_group_release_group") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
out.write(",".join((
|
||||||
|
release_groups[cols[2]][1],
|
||||||
|
release_groups[cols[3]][1],
|
||||||
|
release_release_rel_map[link_types[links[cols[1]][1]][6]]
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
# ---
|
||||||
|
|
||||||
|
with open("in/tag") as f:
|
||||||
|
with open("repo/tag.csv", "w") as out:
|
||||||
|
out.write("id:ID(Tag),name\n")
|
||||||
|
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
tags[cols[0]] = cols
|
||||||
|
out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"\n")
|
||||||
|
|
||||||
|
with open("repo/release_tag.csv", "w") as out:
|
||||||
|
out.write(":START_ID(Release),:END_ID(Tag),weight:int\n")
|
||||||
|
|
||||||
|
with open("in/release_group_tag") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
|
||||||
|
if int(cols[2]) <= 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
out.write(",".join((
|
||||||
|
release_groups[cols[0]][1],
|
||||||
|
cols[1],
|
||||||
|
cols[2],
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
with open("repo/artist_tag.csv", "w") as out:
|
||||||
|
out.write(":START_ID(Artist),:END_ID(Tag),weight:int\n")
|
||||||
|
|
||||||
|
with open("in/artist_tag") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
|
||||||
|
if int(cols[2]) <= 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
out.write(",".join((
|
||||||
|
artists[cols[0]][1],
|
||||||
|
cols[1],
|
||||||
|
cols[2],
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
with open("repo/tag_tag.csv", "w") as out:
|
||||||
|
out.write(":START_ID(Tag),:END_ID(Tag),weight:int\n")
|
||||||
|
|
||||||
|
with open("in/tag_relation") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
|
||||||
|
if int(cols[2]) <= 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
out.write(",".join((
|
||||||
|
cols[0],
|
||||||
|
cols[1],
|
||||||
|
cols[2],
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
# -----
|
||||||
|
|
||||||
|
with open("repo/labels.csv", "w") as out:
|
||||||
|
out.write("id:ID(Label),name,code,:LABEL\n")
|
||||||
|
|
||||||
|
with open("in/label") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
labels[cols[0]] = cols
|
||||||
|
|
||||||
|
out.write(",".join((
|
||||||
|
cols[1],
|
||||||
|
"\"" + cols[2].replace("\"", "\"\"") + "\"",
|
||||||
|
cols[9] if cols[9] != "\\N" else "",
|
||||||
|
"Label" + label_types[cols[10]]
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
with open("repo/label_label.csv", "w") as out:
|
||||||
|
out.write(":START_ID(Label),:END_ID(Label),:TYPE\n")
|
||||||
|
|
||||||
|
with open("in/l_label_label") as f:
|
||||||
|
for line in f:
|
||||||
|
cols = line.split("\t")
|
||||||
|
|
||||||
|
out.write(",".join((
|
||||||
|
labels[cols[2]][1],
|
||||||
|
labels[cols[3]][1],
|
||||||
|
label_label_rel_map[link_types[links[cols[1]][1]][6]]
|
||||||
|
)) + "\n")
|
||||||
|
|
||||||
|
# ---
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
requests
|
2
seed.cypher
Normal file
2
seed.cypher
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
CREATE INDEX ON :Artist(id);
|
||||||
|
CREATE INDEX ON :Release(id);
|
5
seed_neo4j_db.sh
Executable file
5
seed_neo4j_db.sh
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
#!/usr/bin/env bash
# Feed the statements in seed.cypher to cypher-shell.

export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3"

# Quote the path so a NEO4J_HOME containing spaces still resolves, and
# redirect the file instead of piping it through cat.
"${NEO4J_HOME}/bin/cypher-shell" < seed.cypher
|
Loading…
x
Reference in New Issue
Block a user