mirror of
https://github.com/simon987/yt-metadata.git
synced 2025-04-04 08:23:00 +00:00
253 lines
9.5 KiB
Python
253 lines
9.5 KiB
Python
import json
|
|
import os
|
|
import psycopg2
|
|
|
|
DB_STR = "dbname=yt-meta user=yt-meta"
|
|
tags_cache = {}
|
|
categories_cache = {}
|
|
licenses_cache = {}
|
|
|
|
|
|
def init_cache():
|
|
|
|
with psycopg2.connect(DB_STR) as conn:
|
|
|
|
cursor = conn.cursor()
|
|
|
|
# Tags
|
|
cursor.execute('SELECT id, name FROM tag')
|
|
for tag in cursor.fetchall():
|
|
tags_cache[tag[1]] = tag[0]
|
|
|
|
# Categories
|
|
cursor.execute('SELECT id, name FROM category')
|
|
for category in cursor.fetchall():
|
|
categories_cache[category[1]] = category[0]
|
|
|
|
# License
|
|
cursor.execute('SELECT id, name FROM license')
|
|
for license in cursor.fetchall():
|
|
licenses_cache[license[1]] = license[0]
|
|
|
|
|
|
def create_tag(cursor, tag, video_id):
|
|
tag_id = tags_cache.get(tag, None)
|
|
if not tag_id:
|
|
cursor.execute('INSERT INTO tag (name) VALUES (%s)', (tag, ))
|
|
cursor.execute('SELECT LASTVAL()')
|
|
|
|
tag_id = cursor.fetchone()[0]
|
|
tags_cache[tag] = tag_id
|
|
print("Created tag '" + tag + "' with id " + str(tag_id))
|
|
cursor.execute('INSERT INTO video_has_tag (video_id, tag_id) VALUES (%s,%s)', (video_id, tag_id))
|
|
|
|
|
|
def create_category(cursor, category, video_id):
|
|
|
|
category_id = categories_cache.get(category, None)
|
|
if not category_id:
|
|
cursor.execute('INSERT INTO category (name) VALUES (%s)', (category, ))
|
|
cursor.execute('SELECT LASTVAL()')
|
|
|
|
category_id = cursor.fetchone()[0]
|
|
categories_cache[category] = category_id
|
|
print("Created category '" + category + "' with id " + str(category_id))
|
|
cursor.execute('INSERT INTO video_in_category (video_id, category_id) VALUES (%s,%s)', (video_id, category_id))
|
|
|
|
|
|
def create_uploader(cursor, name, ul_id, url):
|
|
|
|
cursor.execute('INSERT INTO uploader (id, url, name) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING',
|
|
(ul_id, url, name))
|
|
|
|
|
|
def create_license(cursor, name):
|
|
|
|
license_id = licenses_cache.get(name, None)
|
|
if not license_id:
|
|
cursor.execute('INSERT INTO license (name) VALUES (%s)', (name, ))
|
|
cursor.execute('SELECT LASTVAL()')
|
|
|
|
license_id = cursor.fetchone()[0]
|
|
licenses_cache[name] = license_id
|
|
print("Created license '" + name + "' with id " + str(license_id))
|
|
|
|
|
|
def create_video(cursor, **kwargs):
|
|
|
|
cursor.execute('INSERT INTO video (id, uploader_id, creator, upload_date, license_id, title, full_title,'
|
|
' alt_title, file_name, description, annotation, webpage_url, view_count, like_count, '
|
|
'dislike_count, display_id, duration, age_limit) '
|
|
'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) '
|
|
'ON CONFLICT DO NOTHING ',
|
|
(kwargs["id"],
|
|
kwargs["uploader_id"], kwargs["creator"], kwargs["upload_date"],
|
|
kwargs["license_id"],
|
|
kwargs["title"], kwargs["full_title"], kwargs["alt_title"],
|
|
kwargs["file_name"],
|
|
kwargs["description"],
|
|
kwargs["annotation"],
|
|
kwargs["webpage_url"],
|
|
kwargs["view_count"], kwargs["like_count"], kwargs["dislike_count"],
|
|
kwargs["display_id"],
|
|
kwargs["duration"],
|
|
kwargs["age_limit"]
|
|
))
|
|
|
|
print("Created video " + kwargs["id"])
|
|
|
|
|
|
def create_subtitles(cursor, filename, lang, url, video_id):
|
|
|
|
with open(filename, "r") as f:
|
|
data = f.read()
|
|
|
|
cursor.execute('INSERT INTO subtitles (language, url, data, video_id) VALUES (%s,%s,%s,%s) '
|
|
'ON CONFLICT DO NOTHING',
|
|
(lang, url, data, video_id))
|
|
print("Create subtitles " + lang + " for " + video_id)
|
|
|
|
|
|
def create_thumbnail(cursor, filename, url, tn_id, video_id):
|
|
|
|
if filename and os.path.exists(filename):
|
|
with open(filename, "rb") as f:
|
|
data = psycopg2.Binary(f.read())
|
|
else:
|
|
data = None
|
|
|
|
cursor.execute('INSERT INTO thumbnail (thumbnail_id, url, video_id, data) VALUES (%s,%s,%s,%s) '
|
|
'ON CONFLICT DO NOTHING',
|
|
(tn_id, url, video_id, data))
|
|
print("Create thumbnail " + tn_id + " for " + video_id)
|
|
|
|
|
|
def create_format(cursor, **kwargs):
|
|
cursor.execute('INSERT INTO format (name, note, format_id, url, player_url, extension, audio_codec, video_codec, '
|
|
'audio_bitrate, total_bitrate, file_size, quality, width, height, fps, video_id) '
|
|
'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) '
|
|
'ON CONFLICT DO NOTHING ',
|
|
(kwargs["name"],
|
|
kwargs["note"],
|
|
kwargs["format_id"],
|
|
kwargs["url"],
|
|
kwargs["player_url"],
|
|
kwargs["extension"],
|
|
kwargs["audio_codec"],
|
|
kwargs["video_codec"],
|
|
kwargs["audio_bitrate"],
|
|
kwargs["total_bitrate"],
|
|
kwargs["file_size"],
|
|
kwargs["quality"],
|
|
kwargs["width"], kwargs["height"],
|
|
kwargs["fps"],
|
|
kwargs["video_id"]
|
|
))
|
|
print("Create format " + kwargs["format_id"] + " for " + kwargs["video_id"])
|
|
|
|
|
|
def create_chapter(cursor, video_id, start_time, end_time, title):
|
|
|
|
cursor.execute('INSERT INTO chapter (start_time, end_time, title, video_id) VALUES (%s,%s,%s,%s) '
|
|
'ON CONFLICT DO NOTHING',
|
|
(start_time, end_time, title, video_id))
|
|
print("Created chapter for " + video_id)
|
|
|
|
|
|
def import_json(filename):
|
|
|
|
directory = filename[:filename.rfind("/") + 1]
|
|
|
|
with open(filename, "r") as f:
|
|
|
|
meta = json.load(f)
|
|
|
|
with psycopg2.connect(DB_STR) as conn:
|
|
|
|
cursor = conn.cursor()
|
|
|
|
create_license(cursor, meta["license"])
|
|
|
|
create_uploader(cursor, meta["uploader"], meta["uploader_id"], meta["uploader_url"])
|
|
|
|
create_video(cursor,
|
|
id=meta["id"],
|
|
uploader_id=meta["uploader_id"],
|
|
creator=meta["creator"],
|
|
upload_date=meta["upload_date"],
|
|
license_id=licenses_cache[meta["license"]],
|
|
title=meta["title"],
|
|
full_title=meta["fulltitle"],
|
|
alt_title=meta["alt_title"],
|
|
file_name=meta["_filename"],
|
|
description=meta["description"],
|
|
duration=meta["duration"],
|
|
age_limit=meta["age_limit"],
|
|
annotation=meta["annotations"],
|
|
webpage_url=meta["webpage_url"],
|
|
view_count=meta["view_count"],
|
|
like_count=meta["like_count"],
|
|
dislike_count=meta["dislike_count"],
|
|
display_id=meta["display_id"])
|
|
|
|
for tag in meta["tags"]:
|
|
create_tag(cursor, tag, meta["id"])
|
|
|
|
for category in meta["categories"]:
|
|
create_category(cursor, category, meta["id"])
|
|
|
|
for sub in meta["subtitles"]:
|
|
sub_filename = directory + meta["_filename"].replace(meta["ext"], sub + ".vtt")
|
|
if os.path.exists(filename):
|
|
create_subtitles(cursor, sub_filename, sub, meta["subtitles"][sub][0]["url"], meta["id"])
|
|
|
|
for tn in meta["thumbnails"]:
|
|
# With the script, only the first (default) thumbnail is saved
|
|
if tn["id"] == "0":
|
|
tn_filename = directory + meta["_filename"].replace(meta["ext"], "jpg")
|
|
else:
|
|
tn_filename = None
|
|
|
|
create_thumbnail(cursor, tn_filename, tn["url"], tn["id"], meta["id"])
|
|
|
|
for frmt in meta["formats"]:
|
|
create_format(cursor,
|
|
name=frmt["format"],
|
|
note=frmt.get("format_note", None),
|
|
format_id=frmt["format_id"],
|
|
url=frmt["url"],
|
|
player_url=frmt.get("player_url", None),
|
|
extension=frmt["ext"],
|
|
audio_codec=frmt["acodec"],
|
|
video_codec=frmt["vcodec"],
|
|
audio_bitrate=frmt.get("abr", None),
|
|
total_bitrate=frmt.get("tbr", None),
|
|
file_size=frmt.get("filesize", None),
|
|
quality=frmt.get("quality", None),
|
|
width=frmt.get("width", None),
|
|
height=frmt.get("height", None),
|
|
fps=frmt.get("fps", None),
|
|
video_id=meta["id"])
|
|
|
|
if meta["chapters"]:
|
|
for chapter in meta["chapters"]:
|
|
create_chapter(cursor, meta["id"], chapter["start_time"], chapter["end_time"], chapter["title"])
|
|
|
|
if meta["automatic_captions"]:
|
|
print(meta["id"])
|
|
quit()
|
|
|
|
|
|
def import_recursive(root_path):
|
|
|
|
for root, dirs, files in os.walk(root_path):
|
|
for file in files:
|
|
ext = os.path.splitext(file)[1]
|
|
if ext == ".json":
|
|
filename = os.path.join(root, file)
|
|
import_json(filename)
|
|
|
|
|
|
init_cache()
|
|
import_recursive("/home/simon/Downloads/yt/")
|