From c3dc1faa318ed50933e1dcc5dbd7a9561be2c69b Mon Sep 17 00:00:00 2001
From: simon <fortier.simon@protonmail.com>
Date: Thu, 20 Jun 2019 10:26:31 -0400
Subject: [PATCH] Add label data, some work on spotify data

---
 .gitmodules                 |  6 ++++
 download_mb_dump.sh         |  3 +-
 generate_spotify_tasks.py   | 48 +++++++++++++++++++++++++++++
 generate_spotify_tasks_2.py | 60 +++++++++++++++++++++++++++++++++++++
 make_neoj4_db.sh            |  4 ++-
 process_lastfm_data.py      |  8 ++---
 process_mb_dump.py          | 57 ++++++++++++++++++++++++++++++-----
 seed.cypher                 |  1 +
 spotify                     |  1 +
 spotify2                    |  1 +
 10 files changed, 176 insertions(+), 13 deletions(-)
 create mode 100644 generate_spotify_tasks.py
 create mode 100644 generate_spotify_tasks_2.py
 create mode 160000 spotify
 create mode 160000 spotify2

diff --git a/.gitmodules b/.gitmodules
index 11d1bc0..4340f4c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,9 @@
 [submodule "caa"]
 	path = caa
 	url = https://git.simon987.net/drone/caa.git
+[submodule "spotify"]
+	path = spotify
+	url = https://git.simon987.net/drone/spotify
+[submodule "spotify2"]
+	path = spotify2
+	url = https://git.simon987.net/drone/spotify2
diff --git a/download_mb_dump.sh b/download_mb_dump.sh
index ff3bf47..f435b19 100755
--- a/download_mb_dump.sh
+++ b/download_mb_dump.sh
@@ -11,7 +11,8 @@ wget -nc "http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/${latest}/m
 tar -xjvf mbdump.tar.bz2 mbdump/area mbdump/artist mbdump/l_area_area mbdump/l_artist_artist \
 mbdump/l_artist_release mbdump/l_artist_release_group mbdump/l_label_label mbdump/l_release_group_release_group \
 mbdump/label mbdump/label_type mbdump/link mbdump/link_type mbdump/release mbdump/release_group \
-mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status
+mbdump/release_group_primary_type mbdump/artist_credit_name mbdump/release_status mbdump/l_label_release \
+mbdump/l_label_release_group
 tar -xjvf mbdump-derived.tar.bz2 mbdump/artist_tag mbdump/release_group_tag mbdump/tag mbdump/tag_relation \
 mbdump/release_group_meta
 
diff --git a/generate_spotify_tasks.py b/generate_spotify_tasks.py
new file mode 100644
index 0000000..45adb7f
--- /dev/null
+++ b/generate_spotify_tasks.py
@@ -0,0 +1,48 @@
+import csv
+import json
+from multiprocessing.pool import ThreadPool
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 6
+
+api = TaskTrackerApi(TT_API_URL)
+# Use the worker returned by Worker.from_file(); when falsy, make one and dump it.
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("mm worker")
+    worker.dump_to_file()
+worker.request_access(TT_PROJECT, True, True)
+input("Give permission to " + worker.alias)  # pause until Enter is pressed
+
+with open("repo/artist.csv") as f:
+    reader = csv.reader(f)
+
+    def mktask(lines):
+        """Submit one task whose recipe is the JSON list of rows in `lines`."""
+        res = worker.submit_task(
+            project=TT_PROJECT,
+            recipe=json.dumps(
+                [{"mbid": line[0], "name": line[1]} for line in lines]
+            ),
+            unique_str=lines[0][0],
+            max_assign_time=60 * 5,
+        )
+        print(res.text)
+
+    def lines():
+        """Yield CSV rows in batches of 30, plus the final partial batch."""
+        line_batch = list()
+        for line in reader:
+            line_batch.append(line)
+            if len(line_batch) >= 30:
+                yield line_batch
+                line_batch = list()
+        # Rows left over after the last full batch would otherwise be dropped.
+        if line_batch:
+            yield line_batch
+
+    tasks = list(lines())
+    pool = ThreadPool(processes=25)
+    pool.map(func=mktask, iterable=tasks)
diff --git a/generate_spotify_tasks_2.py b/generate_spotify_tasks_2.py
new file mode 100644
index 0000000..9828c04
--- /dev/null
+++ b/generate_spotify_tasks_2.py
@@ -0,0 +1,60 @@
+import json
+import sqlite3
+from multiprocessing.pool import ThreadPool
+
+import sys
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+
+TT_API_URL = "https://tt.simon987.net/api"
+TT_PROJECT = 7
+
+api = TaskTrackerApi(TT_API_URL)
+# Use the worker returned by Worker.from_file(); when falsy, make one and dump it.
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("mm worker")
+    worker.dump_to_file()
+worker.request_access(TT_PROJECT, True, True)
+input("Give permission to " + worker.alias)  # pause until Enter is pressed
+
+spotids = set()
+
+with sqlite3.connect(sys.argv[1]) as conn:
+
+    cur = conn.cursor()
+    cur.execute("SELECT data from artist")
+    for row in cur.fetchall():
+        j = json.loads(row[0])
+        if j is None or "artists" not in j or "items" not in j["artists"]:
+            continue
+        for item in j["artists"]["items"]:
+            spotids.add(item["id"])
+
+    def mktask(lines):
+        """Submit one task whose recipe is the JSON list of ids in `lines`."""
+        res = worker.submit_task(
+            project=TT_PROJECT,
+            recipe=json.dumps(
+                [{"spotid": line} for line in lines]
+            ),
+            unique_str=lines[0],
+            max_assign_time=60 * 5,
+        )
+        print(res.text)
+
+    def ids():
+        """Yield the collected ids in batches of 30, plus the final partial batch."""
+        id_batch = list()
+        for spotid in spotids:
+            id_batch.append(spotid)
+            if len(id_batch) >= 30:
+                yield id_batch
+                id_batch = list()
+        # Ids left over after the last full batch would otherwise be dropped.
+        if id_batch:
+            yield id_batch
+
+    tasks = list(ids())
+    pool = ThreadPool(processes=25)
+    pool.map(func=mktask, iterable=tasks)
diff --git a/make_neoj4_db.sh b/make_neoj4_db.sh
index 763154c..d6ef5fc 100755
--- a/make_neoj4_db.sh
+++ b/make_neoj4_db.sh
@@ -23,6 +23,7 @@ wget ${REPOSITORY}/release.csv
 wget ${REPOSITORY}/tag.csv
 wget ${REPOSITORY}/tag_tag.csv
 wget ${REPOSITORY}/release_tag.csv
+wget ${REPOSITORY}/release_label.csv
 wget ${REPOSITORY}/release_release.csv
 wget ${REPOSITORY}/artist_tag.csv
 wget ${REPOSITORY}/labels.csv
@@ -46,7 +47,8 @@ wget ${REPOSITORY}/lastfm_artist_artist.csv
     --relationships:IS_RELATED_TO "tag_tag.csv"\
     --relationships "label_label.csv"\
     --relationships "release_release.csv"\
-    --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"
+    --relationships:IS_RELATED_TO "lastfm_artist_artist.csv"\
+    --relationships:RELEASE_UNDER "release_label.csv"
 
 rm *.csv
 cd ..
diff --git a/process_lastfm_data.py b/process_lastfm_data.py
index 014297a..ff6c022 100644
--- a/process_lastfm_data.py
+++ b/process_lastfm_data.py
@@ -15,10 +15,10 @@ def disambiguate(lfm_artist, artist_release_count, name, mbid):
 
             lfm_artist[name] = mbid
 
-            print("Replacing %s (%s) with %s (%d) for %s" %
-                  (existing_mbid, artist_release_count[existing_mbid],
-                   mbid, artist_release_count[mbid],
-                   name))
+            # print("Replacing %s (%s) with %s (%d) for %s" %
+            #       (existing_mbid, artist_release_count[existing_mbid],
+            #        mbid, artist_release_count[mbid],
+            #        name))
     else:
         lfm_artist[name] = mbid
 
diff --git a/process_mb_dump.py b/process_mb_dump.py
index 5c610b8..f84ba2b 100644
--- a/process_mb_dump.py
+++ b/process_mb_dump.py
@@ -1,6 +1,7 @@
 import os
 from collections import defaultdict
 import re
+from statistics import median
 
 links = dict()
 link_types = dict()
@@ -314,14 +315,21 @@ with open("repo/release_release.csv", "w") as out:
 
 # ---
 
+tag_occurence = defaultdict(int)
+with open("in/release_group_tag") as f:
+    for line in f:
+        tag_occurence[line.split("\t")[1]] += 1
+
 with open("in/tag") as f:
     with open("repo/tag.csv", "w") as out:
-        out.write("id:ID(Tag),name\n")
+        out.write("id:ID(Tag),name,occurences\n")  # no space after the comma, so the field name has no leading blank
 
         for line in f:
             cols = line.split("\t")
+            if tag_occurence[cols[0]] < 5:
+                continue
             tags[cols[0]] = cols
-            out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"\n")
+            out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"," + str(tag_occurence[cols[0]]) + "\n")
 
 with open("repo/release_tag.csv", "w") as out:
     out.write(":START_ID(Release),:END_ID(Tag),weight:float\n")
@@ -341,11 +349,15 @@ with open("repo/release_tag.csv", "w") as out:
             count = int(cols[2])
             if count <= 0:
                 continue
+            if cols[1] not in tags:
+                continue
             out.write(",".join((
                 release_groups[cols[0]][1],
                 cols[1],
                 str(max(min(count / max_count, 1), 0.2)),
             )) + "\n")
+            tag_occurence[cols[1]] += 1
+
 
 with open("repo/artist_tag.csv", "w") as out:
     out.write(":START_ID(Artist),:END_ID(Tag),weight:float\n")
@@ -366,6 +378,8 @@ with open("repo/artist_tag.csv", "w") as out:
             count = int(cols[2])
             if count <= 0:
                 continue
+            if cols[1] not in tags:
+                continue
 
             out.write(",".join((
                 artists[cols[0]][1],
@@ -374,39 +388,68 @@ with open("repo/artist_tag.csv", "w") as out:
             )) + "\n")
 
 with open("repo/tag_tag.csv", "w") as out:
-    out.write(":START_ID(Tag),:END_ID(Tag),weight:int\n")
+    out.write(":START_ID(Tag),:END_ID(Tag),weight:float\n")
+
+    def weights():
+        with open("in/tag_relation") as f:
+            for line in f:
+                weight = int(line.split("\t")[2])
+                if weight < 5:
+                    continue
+                yield weight
+    weight_median = median(weights()) * 3
 
-    # TODO: normalize weight so it's between [0,1]
     with open("in/tag_relation") as f:
         for line in f:
             cols = line.split("\t")
 
-            if int(cols[2]) <= 0:
+            weight = int(cols[2])
+            if weight < 5:
+                continue
+            if cols[0] not in tags or cols[1] not in tags:
                 continue
 
             out.write(",".join((
                 cols[0],
                 cols[1],
-                cols[2],
+                str(max(min(weight / weight_median, 1), 0.2)),
             )) + "\n")
 
 # -----
 
 with open("repo/labels.csv", "w") as out:
-    out.write("id:ID(Label),name,code,:LABEL\n")
+    out.write("id:ID(Label),name,sortname,code,:LABEL\n")
 
     with open("in/label") as f:
         for line in f:
             cols = line.split("\t")
             labels[cols[0]] = cols
 
+            sortname = ASCII_RE.sub("_", cols[2]).upper()
             out.write(",".join((
                 cols[1],
                 "\"" + cols[2].replace("\"", "\"\"") + "\"",
+                sortname,
                 cols[9] if cols[9] != "\\N" else "",
                 "Label" + label_types[cols[10]]
             )) + "\n")
 
+with open("repo/release_label.csv", "w") as out:
+    out.write(":START_ID(Release),:END_ID(Label)\n")
+    pairs = set()  # the same (release group, label) pair can come up more than once; write each once
+
+    # Should I check link types here?
+    with open("in/l_label_release_group") as f:
+        for line in f:
+            cols = line.split("\t")
+            pairs.add((release_groups[cols[3]][1], labels[cols[2]][1]))
+    with open("in/l_label_release") as f:
+        for line in f:
+            cols = line.split("\t")
+            if cols[3] in release_to_release_group_map:
+                pairs.add((release_groups[release_to_release_group_map[cols[3]]][1], labels[cols[2]][1]))
+    out.writelines(rg + "," + label + "\n" for rg, label in sorted(pairs))
+
 with open("repo/label_label.csv", "w") as out:
     out.write(":START_ID(Label),:END_ID(Label),:TYPE\n")
 
diff --git a/seed.cypher b/seed.cypher
index 9b1edce..1dabca7 100644
--- a/seed.cypher
+++ b/seed.cypher
@@ -1,3 +1,4 @@
 CREATE INDEX ON :Artist(id);
 CREATE INDEX ON :Artist(sortname);
 CREATE INDEX ON :Release(id);
+CREATE INDEX ON :Label(sortname);
diff --git a/spotify b/spotify
new file mode 160000
index 0000000..4ac596b
--- /dev/null
+++ b/spotify
@@ -0,0 +1 @@
+Subproject commit 4ac596b2ff7659b880ac8a3fe9c58ea6527c2efc
diff --git a/spotify2 b/spotify2
new file mode 160000
index 0000000..0a05c69
--- /dev/null
+++ b/spotify2
@@ -0,0 +1 @@
+Subproject commit 0a05c69bcf7005496c2efdf5b825ffa2f443ccdf