diff --git a/make_neoj4_db.sh b/make_neoj4_db.sh index 85930e7..763154c 100755 --- a/make_neoj4_db.sh +++ b/make_neoj4_db.sh @@ -1,6 +1,6 @@ -#!/bin/bash +#!/usr/bin/env bash -export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3" +export NEO4J_HOME="/home/drone/Documents/neo4j" export REPOSITORY="http://localhost:9999" export DATABASE="graph.db" diff --git a/make_release_to_rg_map.py b/map_release_to_rg_map.py similarity index 100% rename from make_release_to_rg_map.py rename to map_release_to_rg_map.py diff --git a/process_mb_dump.py b/process_mb_dump.py index b4d369f..2828a4d 100644 --- a/process_mb_dump.py +++ b/process_mb_dump.py @@ -313,40 +313,59 @@ with open("in/tag") as f: out.write(cols[0] + ",\"" + cols[1].replace("\"", "\"\"") + "\"\n") with open("repo/release_tag.csv", "w") as out: - out.write(":START_ID(Release),:END_ID(Tag),weight:int\n") + out.write(":START_ID(Release),:END_ID(Tag),weight:float\n") + # get max count + max_count = 0 with open("in/release_group_tag") as f: for line in f: cols = line.split("\t") + max_count = max(max_count, int(cols[2])) + max_count = max_count / 4 - if int(cols[2]) <= 0: + # weight is linear + with open("in/release_group_tag") as f: + for line in f: + cols = line.split("\t") + count = int(cols[2]) + if count <= 0: continue - out.write(",".join(( release_groups[cols[0]][1], cols[1], - cols[2], + str(max(min(count / max_count, 1), 0.2)), )) + "\n") with open("repo/artist_tag.csv", "w") as out: - out.write(":START_ID(Artist),:END_ID(Tag),weight:int\n") + out.write(":START_ID(Artist),:END_ID(Tag),weight:float\n") + # get max count + max_count = 0 + with open("in/artist_tag") as f: + for line in f: + cols = line.split("\t") + max_count = max(max_count, int(cols[2])) + max_count = max_count / 4 + + # Weight is linear with open("in/artist_tag") as f: for line in f: cols = line.split("\t") - if int(cols[2]) <= 0: + count = int(cols[2]) + if count <= 0: continue out.write(",".join(( artists[cols[0]][1], cols[1], - cols[2], + str(max(min(count / max_count, 1), 0.2)), )) + "\n") with open("repo/tag_tag.csv", "w") as out: out.write(":START_ID(Tag),:END_ID(Tag),weight:int\n") + # TODO: normalize weight so it's between [0,1] with open("in/tag_relation") as f: for line in f: cols = line.split("\t") diff --git a/seed.cypher b/seed.cypher index 7a2fc34..02606bf 100644 --- a/seed.cypher +++ b/seed.cypher @@ -1,2 +1,3 @@ CREATE INDEX ON :Artist(id); +CREATE INDEX ON :Artist(name); CREATE INDEX ON :Release(id); diff --git a/seed_neo4j_db.sh b/seed_neo4j_db.sh index 846b8ef..90f3eea 100755 --- a/seed_neo4j_db.sh +++ b/seed_neo4j_db.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -export NEO4J_HOME="/home/drone/Downloads/neo4j-community-3.5.3" +export NEO4J_HOME="/home/drone/Documents/neo4j" cat seed.cypher | ${NEO4J_HOME}/bin/cypher-shell