mirror of
				https://github.com/simon987/music-graph-scripts.git
				synced 2025-11-04 13:46:52 +00:00 
			
		
		
		
	Initial commit
This commit is contained in:
		
							parent
							
								
									5fb05d591a
								
							
						
					
					
						commit
						6d8c65fcd2
					
				
							
								
								
									
										8
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										8
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@ -102,3 +102,11 @@ venv.bak/
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# mypy
 | 
					# mypy
 | 
				
			||||||
.mypy_cache/
 | 
					.mypy_cache/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.idea/
 | 
				
			||||||
 | 
					in/
 | 
				
			||||||
 | 
					repo/
 | 
				
			||||||
 | 
					tmp/
 | 
				
			||||||
 | 
					workspace/
 | 
				
			||||||
 | 
					worker.json
 | 
				
			||||||
 | 
					*.db
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										9
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@ -0,0 +1,9 @@
 | 
				
			|||||||
 | 
					[submodule "task_tracker_drone"]
 | 
				
			||||||
 | 
						path = task_tracker_drone
 | 
				
			||||||
 | 
						url = https://github.com/simon987/task_tracker_drone/
 | 
				
			||||||
 | 
					[submodule "last.fm"]
 | 
				
			||||||
 | 
						path = last.fm
 | 
				
			||||||
 | 
						url = https://git.simon987.net/drone/last.fm
 | 
				
			||||||
 | 
					[submodule "caa"]
 | 
				
			||||||
 | 
						path = caa
 | 
				
			||||||
 | 
						url = https://git.simon987.net/drone/caa.git
 | 
				
			||||||
							
								
								
									
										6
									
								
								.idea/misc.xml
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								.idea/misc.xml
									
									
									
										generated
									
									
									
										Normal file
									
								
							@ -0,0 +1,6 @@
 | 
				
			|||||||
 | 
					<?xml version="1.0" encoding="UTF-8"?>
 | 
				
			||||||
 | 
					<project version="4">
 | 
				
			||||||
 | 
					  <component name="JavaScriptSettings">
 | 
				
			||||||
 | 
					    <option name="languageLevel" value="ES6" />
 | 
				
			||||||
 | 
					  </component>
 | 
				
			||||||
 | 
					</project>
 | 
				
			||||||
							
								
								
									
										8
									
								
								.idea/modules.xml
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								.idea/modules.xml
									
									
									
										generated
									
									
									
										Normal file
									
								
							@ -0,0 +1,8 @@
 | 
				
			|||||||
 | 
					<?xml version="1.0" encoding="UTF-8"?>
 | 
				
			||||||
 | 
					<project version="4">
 | 
				
			||||||
 | 
					  <component name="ProjectModuleManager">
 | 
				
			||||||
 | 
					    <modules>
 | 
				
			||||||
 | 
					      <module fileurl="file://$PROJECT_DIR$/.idea/music-graph-scripts.iml" filepath="$PROJECT_DIR$/.idea/music-graph-scripts.iml" />
 | 
				
			||||||
 | 
					    </modules>
 | 
				
			||||||
 | 
					  </component>
 | 
				
			||||||
 | 
					</project>
 | 
				
			||||||
							
								
								
									
										9
									
								
								.idea/music-graph-scripts.iml
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								.idea/music-graph-scripts.iml
									
									
									
										generated
									
									
									
										Normal file
									
								
							@ -0,0 +1,9 @@
 | 
				
			|||||||
 | 
					<?xml version="1.0" encoding="UTF-8"?>
 | 
				
			||||||
 | 
					<module type="JAVA_MODULE" version="4">
 | 
				
			||||||
 | 
					  <component name="NewModuleRootManager" inherit-compiler-output="true">
 | 
				
			||||||
 | 
					    <exclude-output />
 | 
				
			||||||
 | 
					    <content url="file://$MODULE_DIR$" />
 | 
				
			||||||
 | 
					    <orderEntry type="inheritedJdk" />
 | 
				
			||||||
 | 
					    <orderEntry type="sourceFolder" forTests="false" />
 | 
				
			||||||
 | 
					  </component>
 | 
				
			||||||
 | 
					</module>
 | 
				
			||||||
							
								
								
									
										6
									
								
								.idea/vcs.xml
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								.idea/vcs.xml
									
									
									
										generated
									
									
									
										Normal file
									
								
							@ -0,0 +1,6 @@
 | 
				
			|||||||
 | 
					<?xml version="1.0" encoding="UTF-8"?>
 | 
				
			||||||
 | 
					<project version="4">
 | 
				
			||||||
 | 
					  <component name="VcsDirectoryMappings">
 | 
				
			||||||
 | 
					    <mapping directory="" vcs="Git" />
 | 
				
			||||||
 | 
					  </component>
 | 
				
			||||||
 | 
					</project>
 | 
				
			||||||
							
								
								
									
										0
									
								
								__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										1
									
								
								caa
									
									
									
									
									
										Submodule
									
								
							
							
								
								
								
								
								
								
									
									
								
							
						
						
									
										1
									
								
								caa
									
									
									
									
									
										Submodule
									
								
							@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					Subproject commit 910f4a0bceadac37ac28fa59e7648f241c931fe6
 | 
				
			||||||
							
								
								
									
										20
									
								
								download_mb_dump.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										20
									
								
								download_mb_dump.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,20 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/env bash
					#
					# Download the latest MusicBrainz full export into ./in and unpack a
					# selected set of tables directly into that directory.

					BASE_URL="http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport"

					# -f makes curl fail on an HTTP error instead of capturing an error page.
					latest=$(curl -fsS "${BASE_URL}/LATEST") || exit 1
					if [ -z "${latest}" ]; then
					    echo "Could not determine the latest MusicBrainz export" >&2
					    exit 1
					fi

					mkdir -p in
					# Abort rather than extracting, moving and deleting files in the wrong directory.
					cd in || exit 1

					# -nc: skip files that were already downloaded by a previous run.
					wget -nc "${BASE_URL}/${latest}/mbdump.tar.bz2"
					wget -nc "${BASE_URL}/${latest}/mbdump-derived.tar.bz2"

					tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \
					mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \
					mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \
					mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status
					tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \
					mbdump/release_group_meta

					# Flatten: the extracted tables end up directly under ./in.
					mv mbdump/* .
					rm -r mbdump
					cd ..
 | 
				
			||||||
							
								
								
									
										27
									
								
								extract_covers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								extract_covers.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,27 @@
 | 
				
			|||||||
 | 
					import sqlite3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					with sqlite3.connect(sys.argv[1]) as conn:
					    # Write every row of the `covers` table to ./tmpcovers/<id>.jpg,
					    # reading the blobs a few rows at a time.
					    BATCH_SIZE = 30

					    cursor = conn.cursor()
					    cursor.execute("SELECT id from covers")
					    ids = [row[0] for row in cursor.fetchall()]

					    def rows():
					        """Yield the ids as consecutive lists of at most BATCH_SIZE items.

					        The last list may be shorter, so trailing ids are not skipped.
					        """
					        for start in range(0, len(ids), BATCH_SIZE):
					            yield ids[start:start + BATCH_SIZE]

					    for batch in rows():
					        # Bind the ids as parameters instead of pasting them into the SQL text.
					        placeholders = ",".join("?" * len(batch))
					        # Fetch the id together with the blob: an IN (...) query does not
					        # promise to return rows in the order the ids were listed, so
					        # pairing blobs with ids by position could mislabel the files.
					        cursor.execute(
					            "SELECT id, cover from covers where id in (%s)" % placeholders,
					            batch,
					        )
					        for cover_id, cover in cursor.fetchall():
					            with open("./tmpcovers/" + cover_id + ".jpg", "wb") as out:
					                out.write(cover)
					                print(cover_id)
 | 
				
			||||||
							
								
								
									
										56
									
								
								generate_caa_tasks.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								generate_caa_tasks.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,56 @@
 | 
				
			|||||||
 | 
					import json
 | 
				
			||||||
 | 
					from multiprocessing.pool import ThreadPool
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TT_API_URL = "https://tt.simon987.net/api"
					TT_PROJECT = 5


					# Stays empty while the sqlite lookup below remains commented out.
					done = set()
					# with sqlite3.connect(sys.argv[1]) as conn:
					#     cur = conn.cursor()
					#     cur.execute("SELECT id FROM covers")
					#     for mbid in cur.fetchall():
					#         done.add(mbid[0])


					def _load_or_register_worker(tracker, alias):
					    """Return the worker loaded from file, or create one named `alias` and dump it to file."""
					    existing = Worker.from_file(tracker)
					    if existing:
					        return existing
					    created = tracker.make_worker(alias)
					    created.dump_to_file()
					    return created


					api = TaskTrackerApi(TT_API_URL)

					worker = _load_or_register_worker(api, "caa scraper")
					# NOTE(review): the meaning of the two True flags is defined by the
					# task_tracker_drone API and is not visible here - confirm.
					worker.request_access(TT_PROJECT, True, True)
					# Block until the operator presses Enter.
					input("Give permission to " + worker.alias)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def mktask(mbids):
					    """Submit one task whose recipe is the JSON-encoded `mbids` list and print the response text."""
					    # NOTE(review): hash() of a str differs between interpreter runs unless
					    # PYTHONHASHSEED is fixed - confirm hash64 does not need to be stable.
					    task_options = dict(
					        project=TT_PROJECT,
					        recipe=json.dumps(mbids),
					        hash64=hash(mbids[0]),
					        max_assign_time=60 * 30,
					        priority=1,
					        unique_str=None,
					        verification_count=None,
					        max_retries=5,
					    )
					    response = worker.submit_task(**task_options)
					    print(response.text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def lines(path="in/release", batch_size=75):
					    """Yield the second tab-separated column of `path` in lists of `batch_size`.

					    :param path: tab-separated input file, one record per line
					    :param batch_size: number of values per yielded list
					    :return: generator of lists; the last list may be shorter than
					        `batch_size`, so trailing records are not dropped
					    """
					    with open(path) as f:
					        buf = []

					        for line in f:
					            cols = line.split("\t")

					            buf.append(cols[1])
					            if len(buf) == batch_size:
					                yield buf
					                # Start a fresh list so the one just handed out is never mutated.
					                buf = []

					        # Flush whatever is left after the last full batch.
					        if buf:
					            yield buf
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Submit the batches from 20 threads; the with-block releases the pool's
					# threads once map() has returned.
					with ThreadPool(processes=20) as pool:
					    pool.map(func=mktask, iterable=lines())
 | 
				
			||||||
							
								
								
									
										48
									
								
								generate_lastfm_tasks.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								generate_lastfm_tasks.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,48 @@
 | 
				
			|||||||
 | 
					import csv
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					from multiprocessing.pool import ThreadPool
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TT_API_URL = "https://tt.simon987.net/api"
					TT_PROJECT = 1


					def _load_or_register_worker(tracker, alias):
					    """Return the worker loaded from file, or create one named `alias` and dump it to file."""
					    existing = Worker.from_file(tracker)
					    if existing:
					        return existing
					    created = tracker.make_worker(alias)
					    created.dump_to_file()
					    return created


					api = TaskTrackerApi(TT_API_URL)

					worker = _load_or_register_worker(api, "last.fm scraper")
					# NOTE(review): the meaning of the two True flags is defined by the
					# task_tracker_drone API and is not visible here - confirm.
					worker.request_access(TT_PROJECT, True, True)
					# Block until the operator presses Enter.
					input("Give permission to " + worker.alias)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# newline="" is how the csv module asks for its input files to be opened.
					with open("repo/artist.csv", newline="") as f:
					    reader = csv.reader(f)

					    def mktask(lines):
					        """Submit one task for a batch of artist rows and print the response text.

					        The recipe is a JSON list of {"mbid", "name"} objects built from
					        columns 0 and 1 of each row; the first row's column 0 is passed
					        as the task's unique_str.
					        """
					        res = worker.submit_task(
					            project=TT_PROJECT,
					            recipe=json.dumps(
					                [{"mbid": line[0], "name": line[1]} for line in lines]
					            ),
					            unique_str=lines[0][0],
					            max_assign_time=60 * 5,
					        )
					        print(res.text)

					    def lines(batch_size=30):
					        """Yield rows whose fourth column contains "Group", `batch_size` at a time.

					        The last list may be shorter, so trailing rows are not dropped.
					        """
					        line_batch = list()

					        for line in reader:
					            if "Group" in line[3]:
					                line_batch.append(line)
					            if len(line_batch) >= batch_size:
					                yield line_batch
					                line_batch = list()

					        # Flush whatever is left after the last full batch.
					        if line_batch:
					            yield line_batch

					    # Materialise the batches while the file is still open.
					    tasks = list(lines())

					    # The with-block releases the pool's threads once map() has returned.
					    with ThreadPool(processes=25) as pool:
					        pool.map(func=mktask, iterable=tasks)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										1
									
								
								last.fm
									
									
									
									
									
										Submodule
									
								
							
							
								
								
								
								
								
								
									
									
								
							
						
						
									
										1
									
								
								last.fm
									
									
									
									
									
										Submodule
									
								
							@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					Subproject commit 855df64c316930062ff4f7740492d0f039788498
 | 
				
			||||||
							
								
								
									
										53
									
								
								make_neoj4_db.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										53
									
								
								make_neoj4_db.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,53 @@
 | 
				
			|||||||
 | 
					#!/bin/bash
					#
					# Rebuild the Neo4j database ${DATABASE} from the CSV files served at
					# ${REPOSITORY}, using ./workspace as a scratch directory.

					export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3"
					export REPOSITORY="http://localhost:9999"
					export DATABASE="graph.db"

					NEO4J_CONF_FILE="${NEO4J_HOME}/conf/neo4j.conf"

					rm -rf "${NEO4J_HOME}/data/databases/${DATABASE}"

					# Back up and patch the config only when the setting is not there yet, so
					# re-runs neither overwrite the backup with an already patched file nor
					# append the same line again.
					if ! grep -qx "dbms.security.auth_enabled=false" "${NEO4J_CONF_FILE}"; then
					    cp "${NEO4J_CONF_FILE}" "${NEO4J_CONF_FILE}.bak"
					    echo "dbms.security.auth_enabled=false" >> "${NEO4J_CONF_FILE}"
					fi

					mkdir -p workspace
					# The rm/wget steps below must never run outside the scratch directory.
					cd workspace || exit 1
					# -f: an empty workspace is not an error.
					rm -f -- *.csv

					for name in \
					    area area_area lastfm_artist artist_area artist_artist artist_release \
					    release tag tag_tag release_tag release_release artist_tag labels \
					    label_label lastfm_artist_artist
					do
					    wget "${REPOSITORY}/${name}.csv"
					done

					# Execute the importer as a child process instead of sourcing it with `.`,
					# so nothing it does (exit/exec, variable changes) can alter or end this
					# script before the cleanup below.
					"${NEO4J_HOME}/bin/neo4j-admin" import \
					    --database "${DATABASE}" \
					    --high-io=true \
					    --nodes:Area:MusicBrainzEntity "area.csv" \
					    --nodes:MusicBrainzEntity "release.csv" \
					    --nodes:MusicBrainzEntity "lastfm_artist.csv" \
					    --nodes:Tag "tag.csv" \
					    --nodes:MusicBrainzEntity "labels.csv" \
					    --relationships:IS_PART_OF "area_area.csv" \
					    --relationships:IS_BASED_IN "artist_area.csv" \
					    --relationships "artist_artist.csv" \
					    --relationships "artist_release.csv" \
					    --relationships:IS_TAGGED "release_tag.csv" \
					    --relationships:IS_TAGGED "artist_tag.csv" \
					    --relationships:IS_RELATED_TO "tag_tag.csv" \
					    --relationships "label_label.csv" \
					    --relationships "release_release.csv" \
					    --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"

					rm -f -- *.csv
					cd ..
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										31
									
								
								make_release_to_rg_map.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								make_release_to_rg_map.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,31 @@
 | 
				
			|||||||
 | 
					import sqlite3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					release_to_release_group_map = dict()
					release_groups = dict()

					# in/release_group: remember column 1 of every row under its column 0.
					with open("in/release_group") as f:
					    for line in f:
					        cols = line.split("\t")
					        release_groups[cols[0]] = cols[1]

					# in/release: column 4 is looked up in release_groups and stored under
					# the row's column 1. A missing release group raises KeyError.
					with open("in/release") as f:
					    for line in f:
					        cols = line.split("\t")
					        release_to_release_group_map[cols[1]] = release_groups[cols[4]]

					with sqlite3.connect("mapdb.db") as conn:

					    cursor = conn.cursor()
					    # IF NOT EXISTS + OR REPLACE keep the script re-runnable against an
					    # existing mapdb.db instead of failing on the second run.
					    cursor.execute("CREATE TABLE IF NOT EXISTS map (release TEXT PRIMARY KEY , release_group TEXT)")

					    # One batched statement instead of one execute() call per row.
					    cursor.executemany(
					        "INSERT OR REPLACE INTO map (release, release_group) VALUES (?,?)",
					        release_to_release_group_map.items(),
					    )
					    conn.commit()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					CREATE TABLE covers (id TEXT primary key, cover BLOB);
 | 
				
			||||||
 | 
					ATTACH 'mapdb.db' AS map;
 | 
				
			||||||
 | 
					ATTACH '/mnt/Data8/caa_tn_only.db' AS source;
 | 
				
			||||||
 | 
					INSERT OR IGNORE INTO covers SELECT release_group, cover FROM source.covers INNER JOIN map.map ON id = map.release;
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										100
									
								
								process_lastfm_data.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										100
									
								
								process_lastfm_data.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,100 @@
 | 
				
			|||||||
 | 
					import csv
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					import sqlite3
 | 
				
			||||||
 | 
					from collections import defaultdict
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					artists = set()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def disambiguate(lfm_artist, artist_release_count, name, mbid):
					    """Record `mbid` for `name` in `lfm_artist`, resolving clashes by release count.

					    When `name` is already mapped to a different MBID, the mapping is only
					    replaced (and a message printed) if `mbid` has strictly more entries
					    in `artist_release_count` than the MBID currently stored.
					    """
					    current = lfm_artist.get(name)

					    # No usable mapping yet, or the same MBID again: just (re)store it.
					    if not current or current == mbid:
					        lfm_artist[name] = mbid
					        return

					    # A different MBID is already registered; keep it unless the new one
					    # has more releases.
					    if artist_release_count[current] >= artist_release_count[mbid]:
					        return

					    lfm_artist[name] = mbid

					    details = (
					        current, artist_release_count[current],
					        mbid, artist_release_count[mbid],
					        name,
					    )
					    print("Replacing %s (%s) with %s (%d) for %s" % details)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def patch(lastfm_data):
					    """Merge scraped last.fm data into the artist CSV files under repo/.

					    Reads repo/artist_release.csv, repo/artist.csv and the `lastfmdata`
					    table of the sqlite database at `lastfm_data`, then writes:

					    * repo/lastfm_artist.csv - the first four columns of repo/artist.csv
					      plus listener and play counts (0 when no last.fm data matched);
					    * repo/lastfm_artist_artist.csv - one weighted link per "similar"
					      entry whose two ends both appear in repo/artist.csv.

					    Also adds every id from repo/artist.csv to the module-level `artists` set.

					    :param lastfm_data: path of the sqlite database holding `lastfmdata`
					    """
					    artist_listeners = dict()
					    lastfm_artist_to_mbid = dict()
					    artist_release_count = defaultdict(int)
					    related = list()

					    # Count rows per value of the first column.
					    with open("repo/artist_release.csv") as f:
					        for line in f:
					            cols = line.split(',')
					            artist_release_count[cols[0]] += 1

					    with sqlite3.connect(lastfm_data) as conn:
					        cur = conn.cursor()
					        cur.execute("SELECT data FROM lastfmdata")
					        # Decode every JSON payload once; both passes below reuse the result.
					        metas = [json.loads(row[0]) for row in cur.fetchall()]

					    # A lastfm artist name can refer to multiple MBIDs
					    # For RELATED_TO purposes, we assume that the MBID referring
					    # to the artist with the most official releases is the one
					    for meta in metas:
					        disambiguate(lastfm_artist_to_mbid, artist_release_count, meta["name"], meta["artist"])

					        for similar in meta["similar"]:
					            if similar["mbid"] is not None:
					                disambiguate(lastfm_artist_to_mbid, artist_release_count, similar["name"], similar["mbid"])

					    # Get related links & listener counts
					    for meta in metas:
					        artist_mbid = lastfm_artist_to_mbid[meta["name"]]
					        artist_listeners[artist_mbid] = (meta["listeners"], meta["playcount"])

					        for similar in meta["similar"]:
					            if similar["mbid"] is not None:
					                related.append((
					                    lastfm_artist_to_mbid[similar["name"]],
					                    artist_mbid,
					                    similar["match"]
					                ))

					    # newline="" is how the csv module asks for its files to be opened;
					    # without it the writer's own line endings can be translated again.
					    with open("repo/lastfm_artist.csv", "w", newline="") as out:
					        writer = csv.writer(out)
					        writer.writerow([
					            "id:ID(Artist)", "name", "year:short", ":LABEL", "listeners:int", "playcount:int"
					        ])

					        with open("repo/artist.csv", newline="") as f:
					            reader = csv.reader(f)

					            next(reader)  # Skip header
					            for row in reader:
					                listeners, playcount = artist_listeners.get(row[0], (0, 0))
					                writer.writerow([
					                    row[0],
					                    row[1],
					                    row[2],
					                    row[3],
					                    listeners,
					                    playcount,
					                ])
					                artists.add(row[0])

					    with open("repo/lastfm_artist_artist.csv", "w") as out:
					        out.write(",".join((
					            ":START_ID(Artist)", ":END_ID(Artist)", "weight:float"
					        )) + "\n")

					        for x in related:
					            if x[0] in artists and x[1] in artists:
					                # str() so a numeric match score cannot make join() raise.
					                out.write(",".join(map(str, x)) + "\n")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					patch(sys.argv[1])
 | 
				
			||||||
							
								
								
									
										393
									
								
								process_mb_dump.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										393
									
								
								process_mb_dump.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,393 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
 | 
					from collections import defaultdict
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Module-level lookup tables; the ones created empty here are not
					# filled in anywhere within this block.
					links = {}
					link_types = {}
					areas = {}
					labels = {}
					# "\\N" maps to an empty string - presumably the dump's NULL marker; confirm.
					label_types = {"\\N": ""}
					release_groups = {}
					release_statuses = {}
					release_to_release_group_map = {}
					# Same "\\N" -> "" entry as label_types.
					release_types = {"\\N": ""}
					artists = {}
					tags = {}

					# Link-type name -> relationship label; two entries map to an empty label.
					release_release_rel_map = dict([
					    ("covers and versions", ""),
					    ("remixes and compilations", ""),
					    ("DJ-mix", "IS_DJ_MIX_OF"),
					    ("live performance", "IS_LIVE_PERFORMANCE_OF"),
					    ("cover", "IS_COVER_OF"),
					    ("remix", "IS_REMIX_OF"),
					    ("mashes up", "IS_MASHUP_OF"),
					    ("included in", "INCLUDED_IN"),
					    ("single from", "IS_SINGLE_FROM"),
					])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Link type name -> relationship type written to the artist/release CSV.
					# Entries mapped to "" have no relationship name assigned.
					#
					# NOTE(review): several values are misspelt ("DID_PROGRAMING_FOR",
					# "IS_PIANO_TECNICIAN_FOR", "PERFORED_VOCALS_FOR"). They are kept verbatim
					# because they end up as relationship types in the generated data; renaming
					# them needs to be coordinated with whatever queries the graph.
					artist_release_rel_map = {
					    "translator": "TRANSLATED",
					    "liner notes": "WROTE_LINER_NOTES",
					    "lyricist": "IS_LYRICIST_FOR",
					    "lacquer cut": "DID_LACQUER_CUT_FOR",
					    "samples from artist": "HAS_SAMPLES_IN",
					    "remixes and compilations": "",
					    "composition": "COMPOSED",
					    "booking": "DID_BOOKING_FOR",
					    "balance": "DID_BALANCE_FOR",
					    "misc": "HAS_MISC_ROLE_IN",
					    "conductor": "CONDUCTED",
					    "legal representation": "PROVIDED_LEGAL_REPRESENTATION_FOR",
					    "design/illustration": "DID_DESIGN_FOR",
					    "performing orchestra": "PERFORMED_FOR",
					    "producer": "PRODUCED",
					    "instrument": "PERFORMED_INSTRUMENT_FOR",
					    "writer": "WROTE_LYRICS_FOR",
					    "production": "DID_PRODUCTION_FOR",
					    "performance": "PERFORMED_FOR",
					    "composer": "IS_COMPOSER_FOR",
					    "sound": "DID_SOUND_FOR",
					    "remixer": "DID_REMIXING_FOR",
					    "orchestrator": "IS_ORCHESTRATOR_FOR",
					    "compiler": "DID_COMPILATION_FOR",
					    "vocal arranger": "IS_ARRANGER_FOR",
					    # Was "IS_ARRENGER_FOR": the typo made plain arrangers a different
					    # relationship type from vocal/instrument arrangers.
					    "arranger": "IS_ARRANGER_FOR",
					    "mix-DJ": "MIXED",
					    "editor": "IS_EDITOR_FOR",
					    "illustration": "DID_ILLUSTRATION_FOR",
					    "audio": "DID_AUDIO_FOR",
					    "publishing": "IS_PUBLISHER_FOR",
					    "art direction": "DID_ART_DIRECTOR_FOR",
					    "design": "DID_DESIGN_FOR",
					    "instrument arranger": "IS_ARRANGER_FOR",
					    "chorus master": "IS_CHORUS_MASTER_FOR",
					    "photography": "DID_PHOTOGRAPHY_FOR",
					    "performer": "PERFORMED_IN",
					    "graphic design": "DID_GRAPHIC_DESIGN_FOR",
					    "booklet editor": "IS_BOOKLET_EDITOR_FOR",
					    "programming": "DID_PROGRAMING_FOR",
					    "copyright": "IS_COPYRIGHT_HOLDER_OF",
					    "piano technician": "IS_PIANO_TECNICIAN_FOR",
					    "phonographic copyright": "IS_PHONOGRAPHIC_COPYRIGHT_HOLDER_OF",
					    "mastering": "DID_MASTERING_FOR",
					    "vocal": "PERFORED_VOCALS_FOR",
					    "librettist": "IS_LIBRETTIST_FOR",
					    "mix": "MIXED",
					    "recording": "DID_RECORDING_FOR",
					    "concertmaster": "IS_CONCERTMASTER_FOR",
					    "engineer": "IS_ENGINEER_FOR",

					    # release_group
					    "tribute": "IS_TRIBUTE_TO",
					    "dedicated to": "IS_DEDICATED_TO",
					    "creative direction": "",
					    "artists and repertoire": ""
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Link type name -> relationship type written to artist_artist.csv.
					artist_artist_rel_map = dict([
					    ("teacher", "TEACHER_OF"),
					    ("composer-in-residence", "HAS_COMPOSER-IN-RESIDENCE_STATUS_IN"),
					    ("member of band", "IS_MEMBER_OF"),
					    ("voice actor", "IS_VOICE_ACTOR_OF"),
					    ("tribute", "IS_TRIBUTE_TO"),
					    ("supporting musician", "IS_SUPPORTING_MUSICIAN_OF"),
					    ("instrumental supporting musician", "IS_INSTRUMENTAL_SUPPORTING_MUSICIAN_OF"),
					    ("personal relationship", "HAS_PERSONAL_RELATIONSHIP_WITH"),
					    ("musical relationships", "HAS_MUSICAL_RELATIONSHIP_WITH"),
					    ("collaboration", "HAS_COLLABORATED_WITH"),
					    ("married", "IS_MARRIED_WITH"),
					    ("sibling", "IS_SIBLING_OF"),
					    ("parent", "IS_PARENT_OF"),
					    ("is person", "IS"),
					    ("conductor position", "IS_CONDUCTOR_OF"),
					    ("vocal supporting musician", "DOES_VOCAL_SUPPORT_FOR"),
					    ("artistic director", "IS_ARTIST_DIRECTOR_OF"),
					    ("subgroup", "IS_SUBGROUP_OF"),
					    ("founder", "IS_FOUNDER_OF"),
					    ("involved with", "IS_INVOLVED_WITH"),
					    ("named after", "IS_NAMED_AFTER"),
					])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Link type name -> relationship type written to label_label.csv.
					label_label_rel_map = {}
					label_label_rel_map["label rename"] = "WAS_RENAMED_TO"
					label_label_rel_map["imprint"] = "DOES_IMPRINT_FOR"
					label_label_rel_map["label distribution"] = "DOES_DISTRIBUTION_FOR"
					label_label_rel_map["business association"] = "HAS_BUSINESS_ASSOCIATION_TO"
					label_label_rel_map["label ownership"] = "OWNS"
					label_label_rel_map["label reissue"] = "DOES_REISSUING_FOR"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _reset_output_dir(path):
					    """Create ``path`` if it is missing, otherwise delete the files in it.

					    Replaces the former ``rm path/*`` shell call with standard-library
					    calls. Like that glob, it leaves dot-files and sub-directories alone.
					    """
					    if not os.path.exists(path):
					        os.mkdir(path)
					        return

					    for name in os.listdir(path):
					        if name.startswith("."):
					            # A shell "*" glob does not match hidden entries.
					            continue
					        entry = os.path.join(path, name)
					        if os.path.isdir(entry) and not os.path.islink(entry):
					            # Plain "rm" (no -r) refuses to delete directories.
					            continue
					        os.remove(entry)


					# "repo" receives the final CSV files, "tmp" the intermediate ones.
					for _output_dir in ("repo", "tmp"):
					    _reset_output_dir(_output_dir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Load four tab-separated input files, indexing every row by its first
					# column. Rows are stored as the raw list of fields; lines are split without
					# stripping, so the last field keeps its trailing newline.
					for _path, _table in (
					        ("in/link", links),
					        ("in/release_status", release_statuses),
					        ("in/link_type", link_types),
					        ("in/area", areas),
					):
					    with open(_path, "r") as f:
					        for line in f:
					            cols = line.split("\t")
					            _table[cols[0]] = cols
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Build the ";Name" label suffix for each label type (spaces removed from the
					# name), appending the suffix of the type referenced by column 2 when that
					# type has already been loaded.
					# NOTE(review): the guard tests column 3 while the lookup uses column 2 --
					# confirm which column was meant.
					with open("in/label_type") as f:
					    for fields in (raw.split("\t") for raw in f):
					        key, ref = fields[0], fields[2]

					        label_types[key] = ";" + fields[1].replace(" ", "")

					        if fields[3] == "\\N" or ref not in label_types:
					            continue
					        label_types[key] += label_types[ref].replace(" ", "")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Index every artist row by its first column.
					with open("in/artist") as f:
					    artists.update(
					        (fields[0], fields)
					        for fields in (raw.split("\t") for raw in f)
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Area -> Area edges: one line per row of in/l_area_area, written as
					# "<column 1 of the area in col 3>,<column 1 of the area in col 2>".
					with open("repo/area_area.csv", "w") as out:
					    out.write(":START_ID(Area),:END_ID(Area)\n")

					    with open("in/l_area_area", "r") as f:
					        for line in f:
					            cols = line.split("\t")
					            start_id = areas[cols[3]][1]
					            end_id = areas[cols[2]][1]
					            out.write("%s,%s\n" % (start_id, end_id))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Area nodes: one "<id>,<quoted name>" line per loaded area.
					with open("repo/area.csv", "w") as out:
					    out.write("id:ID(Area),name\n")

					    for area in areas.values():
					        # Double any embedded quote so the quoted field stays valid CSV,
					        # the same way the artist/release/tag/label writers do.
					        name = '"' + area[2].replace('"', '""') + '"'
					        out.write(",".join((area[1], name)) + "\n")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Artist nodes plus Artist -> Area edges. Both files are handled by a single
					# "with" so they are closed even if a lookup below raises.
					with open("repo/artist.csv", "w") as out_artist, \
					        open("repo/artist_area.csv", "w") as out_artist_area:

					    out_artist.write("id:ID(Artist),name,year:int,:LABEL\n")
					    out_artist_area.write(":START_ID(Artist),:END_ID(Area)\n")

					    for artist in artists.values():
					        out_artist.write(",".join((
					            artist[1],
					            # Quote the name, doubling embedded quotes.
					            '"' + artist[2].replace("\"", "\"\"") + '"',
					            # "\N" in column 4 is written as year 0.
					            artist[4] if artist[4] != "\\N" else "0",
					            # Column 10 equal to "2" adds the extra "Group" label; the
					            # line terminator is part of this last field.
					            "Artist" + (";Group\n" if artist[10] == "2" else "\n")
					        )))

					        # Column 11 references an area; "\N" means no edge is written.
					        if artist[11] != "\\N":
					            out_artist_area.write(artist[1] + "," + areas[artist[11]][1] + "\n")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Artist -> Artist edges, typed through artist_artist_rel_map.
					with open("repo/artist_artist.csv", "w") as out:
					    out.write(":START_ID(Artist),:END_ID(Artist),:TYPE\n")

					    with open("in/l_artist_artist", "r") as f:
					        for line in f:
					            cols = line.split("\t")
					            start_id = artists[cols[2]][1]
					            end_id = artists[cols[3]][1]
					            # link row -> link type row -> type name (column 6) -> edge type
					            link_type = link_types[links[cols[1]][1]]
					            rel = artist_artist_rel_map[link_type[6]]
					            out.write("%s,%s,%s\n" % (start_id, end_id, rel))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#  --------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Label suffix (";<name>") for every release group primary type.
					with open("in/release_group_primary_type") as f:
					    for raw in f:
					        fields = raw.split("\t")
					        release_types[fields[0]] = ";" + fields[1]

					# Column 2 of in/release_group_meta keyed by column 0, with "\N" replaced
					# by "0".
					release_group_year = {}
					with open("in/release_group_meta") as f:
					    for raw in f:
					        fields = raw.split("\t")
					        if fields[2] != "\\N":
					            release_group_year[fields[0]] = fields[2]
					        else:
					            release_group_year[fields[0]] = "0"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Release nodes are generated from in/release_group rows; each row is also
					# remembered in release_groups once its line has been written.
					with open("repo/release.csv", "w") as out:
					    out.write("id:ID(Release),name,year:int,:LABEL\n")

					    with open("in/release_group") as f:
					        for line in f:
					            cols = line.split("\t")

					            node_id = cols[1]
					            # Quote the name, doubling embedded quotes.
					            name = '"' + cols[2].replace("\"", "\"\"") + '"'
					            year = release_group_year[cols[0]]
					            node_labels = "Release" + release_types[cols[4]]

					            out.write(",".join((node_id, name, year, node_labels)) + "\n")

					            release_groups[cols[0]] = cols
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Map release (column 0) -> release group (column 4), keeping only releases
					# whose status row names "Official".
					with open("in/release") as f:
					    for line in f:
					        cols = line.split("\t")
					        if cols[5] == '\\N':
					            continue
					        if release_statuses[cols[5]][1] == "Official":
					            release_to_release_group_map[cols[0]] = cols[4]

					# Column 0 of in/artist_credit_name -> list of column 1 of the artists
					# referenced by column 2.
					credit_names = defaultdict(list)

					with open("in/artist_credit_name") as f:
					    for line in f:
					        cols = line.split("\t")
					        bucket = credit_names[cols[0]]
					        bucket.append(artists[cols[2]][1])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Artist -> Release edges go to a temporary file first because the two
					# passes below can produce the same line more than once.
					with open("tmp/tmp_artist_release.csv", "w") as out:
					    out.write(":START_ID(Artist),:END_ID(Release),:TYPE\n")

					    # Is this part really necessary?
					    with open("in/l_artist_release") as f:
					        for line in f:
					            cols = line.split("\t")
					            if cols[3] not in release_to_release_group_map:
					                continue
					            artist_id = artists[cols[2]][1]
					            release_id = release_groups[release_to_release_group_map[cols[3]]][1]
					            rel = artist_release_rel_map[link_types[links[cols[1]][1]][6]]
					            out.write("%s,%s,%s\n" % (artist_id, release_id, rel))

					    # Artist credits
					    with open("in/release") as f:
					        for line in f:
					            cols = line.split("\t")
					            if cols[0] not in release_to_release_group_map:
					                continue
					            for credit in credit_names[cols[3]]:
					                release_id = release_groups[release_to_release_group_map[cols[0]]][1]
					                out.write("%s,%s,%s\n" % (credit, release_id, "CREDITED_FOR"))

					# Remove dupes: keep the header line, sort and de-duplicate the rest into
					# the final file, then delete the temporary one.
					dedupe_command = (
					    "(head -n 1 tmp/tmp_artist_release.csv && tail -n +2 tmp/tmp_artist_release.csv"
					    " | sort) | uniq > repo/artist_release.csv && rm tmp/tmp_artist_release.csv"
					)
					os.system(dedupe_command)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Release -> Release edges, typed through release_release_rel_map.
					with open("repo/release_release.csv", "w") as out:
					    out.write(":START_ID(Release),:END_ID(Release),:TYPE\n")

					    with open("in/l_release_group_release_group") as f:
					        for line in f:
					            cols = line.split("\t")
					            start_id = release_groups[cols[2]][1]
					            end_id = release_groups[cols[3]][1]
					            rel = release_release_rel_map[link_types[links[cols[1]][1]][6]]
					            out.write("%s,%s,%s\n" % (start_id, end_id, rel))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ---
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Tag nodes: remember every row in tags and write "<id>,<quoted name>".
					with open("in/tag") as f, open("repo/tag.csv", "w") as out:
					    out.write("id:ID(Tag),name\n")

					    for line in f:
					        cols = line.split("\t")
					        tags[cols[0]] = cols
					        # Quote the name, doubling embedded quotes.
					        quoted_name = '"' + cols[1].replace('"', '""') + '"'
					        out.write(cols[0] + "," + quoted_name + "\n")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Release -> Tag edges; rows whose third column is not a positive integer
					# are dropped.
					with open("repo/release_tag.csv", "w") as out:
					    out.write(":START_ID(Release),:END_ID(Tag),weight:int\n")

					    with open("in/release_group_tag") as f:
					        for line in f:
					            cols = line.split("\t")

					            if int(cols[2]) > 0:
					                release_id = release_groups[cols[0]][1]
					                out.write("%s,%s,%s\n" % (release_id, cols[1], cols[2]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Artist -> Tag edges; rows whose third column is not a positive integer
					# are dropped.
					with open("repo/artist_tag.csv", "w") as out:
					    out.write(":START_ID(Artist),:END_ID(Tag),weight:int\n")

					    with open("in/artist_tag") as f:
					        for line in f:
					            cols = line.split("\t")

					            if int(cols[2]) > 0:
					                artist_id = artists[cols[0]][1]
					                out.write("%s,%s,%s\n" % (artist_id, cols[1], cols[2]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Tag -> Tag edges; the first three columns are copied through unchanged for
					# rows whose third column is a positive integer.
					with open("repo/tag_tag.csv", "w") as out:
					    out.write(":START_ID(Tag),:END_ID(Tag),weight:int\n")

					    with open("in/tag_relation") as f:
					        for line in f:
					            cols = line.split("\t")

					            if int(cols[2]) > 0:
					                out.write(",".join(cols[0:3]) + "\n")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# -----
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Label nodes: each row is stored in labels, then written out.
					with open("repo/labels.csv", "w") as out:
					    out.write("id:ID(Label),name,code,:LABEL\n")

					    with open("in/label") as f:
					        for line in f:
					            cols = line.split("\t")
					            labels[cols[0]] = cols

					            # Quote the name, doubling embedded quotes.
					            name = "\"" + cols[2].replace("\"", "\"\"") + "\""
					            # "\N" in column 9 becomes an empty code.
					            if cols[9] != "\\N":
					                code = cols[9]
					            else:
					                code = ""
					            node_labels = "Label" + label_types[cols[10]]

					            out.write(",".join((cols[1], name, code, node_labels)) + "\n")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Label -> Label edges, typed through label_label_rel_map.
					with open("repo/label_label.csv", "w") as out:
					    out.write(":START_ID(Label),:END_ID(Label),:TYPE\n")

					    with open("in/l_label_label") as f:
					        for line in f:
					            cols = line.split("\t")

					            start_id = labels[cols[2]][1]
					            end_id = labels[cols[3]][1]
					            rel = label_label_rel_map[link_types[links[cols[1]][1]][6]]
					            out.write("%s,%s,%s\n" % (start_id, end_id, rel))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ---
 | 
				
			||||||
							
								
								
									
										1
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					requests
 | 
				
			||||||
							
								
								
									
										2
									
								
								seed.cypher
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								seed.cypher
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,2 @@
 | 
				
			|||||||
 | 
					// Index the `id` property of Artist and Release nodes.
					CREATE INDEX ON :Artist(id);
					CREATE INDEX ON :Release(id);
 | 
				
			||||||
							
								
								
									
										5
									
								
								seed_neo4j_db.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										5
									
								
								seed_neo4j_db.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,5 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/env bash

					# Feed seed.cypher to cypher-shell.
					# NEO4J_HOME may be set by the caller; otherwise the original install path
					# is used.
					export NEO4J_HOME="${NEO4J_HOME:-/home/drone/Downloads/neo4j-community-3.5.3}"

					# Quoted so that a path containing spaces still works; input is redirected
					# instead of piped through cat.
					"${NEO4J_HOME}/bin/cypher-shell" < seed.cypher
 | 
				
			||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user