From 5385143d27e4ba9112b5503af6d407d6808c23c2 Mon Sep 17 00:00:00 2001 From: simon987 Date: Sun, 14 Jun 2020 16:55:50 -0400 Subject: [PATCH] fixes --- mb_scratch.sql | 4 ++-- task_get_lastfm.py | 43 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/mb_scratch.sql b/mb_scratch.sql index 0927353..1492763 100644 --- a/mb_scratch.sql +++ b/mb_scratch.sql @@ -4,10 +4,10 @@ declare sn text; BEGIN - sn = regexp_replace(name, '[^a-zA-Z0-9.\-!?&çéàâäëïöü'' ]', '_'); + sn = regexp_replace(name, '[^a-zA-Z0-9.\-!?&çéàâäëïöü'' ]', '_', 'g'); if length(replace(sn, '_', '')) = 0 then - return upper(regexp_replace(mb_sortname, '[^\w.\-!?& ]', '_')); + return upper(regexp_replace(mb_sortname, '[^\w.\-!?& ]', '_', 'g')); end if; return upper(sn); diff --git a/task_get_lastfm.py b/task_get_lastfm.py index 2fc74e6..95ce0d0 100755 --- a/task_get_lastfm.py +++ b/task_get_lastfm.py @@ -20,12 +20,34 @@ def get_mbid(conn, lfm_name): return row[0] if row else None +def get_names(conn, mbid): + cur = conn.cursor() + cur.execute( + "SELECT lfa.name, listeners " + "FROM mg.lastfm_artist lfa " + "INNER JOIN mg.lastfm_artist_meta lfm ON lfm.name=lfa.name " + "WHERE mbid=%s", (mbid,) + ) + + return cur.fetchall() + + def set_mbid(conn, lfm_name, mbid): cur = conn.cursor() cur.execute("INSERT INTO mg.lastfm_artist VALUES (%s,%s) ON CONFLICT (name) " "DO UPDATE SET mbid=excluded.mbid", (lfm_name, mbid)) +def delete_mbid(conn, mbid): + cur = conn.cursor() + cur.execute("DELETE FROM mg.lastfm_artist WHERE mbid=%s", (mbid,)) + + +def delete_name(conn, name): + cur = conn.cursor() + cur.execute("DELETE FROM mg.lastfm_artist WHERE name=%s", (name,)) + + def save_tags(conn, lfm_name, tags): if not tags: return @@ -34,16 +56,17 @@ def save_tags(conn, lfm_name, tags): cur.execute("DELETE FROM mg.lastfm_artist_tag WHERE name=%s", (lfm_name,)) cur.execute( "INSERT INTO mg.lastfm_artist_tag VALUES %s" % - ",".join("('%s', '%s')" % (n.replace("'", "''"), t.strip().replace("'", "''")) for (n, t) in zip(repeat(lfm_name), tags)) + ",".join("('%s', '%s')" % (n.replace("'", "''"), t.strip().replace("'", "''")) for (n, t) in + zip(repeat(lfm_name), tags)) ) def save_data(conn, data): if data: - disambiguate(conn, data["name"], mbid=data["artist"]) + disambiguate(conn, data["name"], mbid=data["artist"], listeners=data["listeners"]) for similar in [s for s in data["similar"] if s["mbid"] is not None]: - disambiguate(conn, similar["name"], similar["mbid"]) + disambiguate(conn, similar["name"], similar["mbid"], listeners=data["listeners"]) save_similar(conn, data["name"], similar["name"], similar["match"]) save_tags(conn, data["name"], set(data["tags"])) @@ -84,16 +107,26 @@ def get_release_count(conn, mbid): return row[0] if row else 0 -def disambiguate(conn, name, mbid): +def disambiguate(conn, name, mbid, listeners): """ A lastfm artist name can refer to multiple MBIDs For RELATED_TO purposes, we assume that the MBID referring - to the artist with the most official releases is the one + to the artist with the most official releases is the one. + When multiple last.fm artists refer to the same MBID, we + assume that the one with the most listeners is the real one """ existing_mbid = get_mbid(conn, name) + for existing_name, existing_listeners in get_names(conn, mbid): + if existing_listeners > listeners: + return + delete_name(conn, existing_name) + set_mbid(conn, name, mbid) + return + if existing_mbid and mbid != existing_mbid: if get_release_count(conn, existing_mbid) < get_release_count(conn, mbid): + delete_mbid(conn, existing_mbid) set_mbid(conn, name, mbid) else: set_mbid(conn, name, mbid)