diff --git a/.gitmodules b/.gitmodules index a754c49..9d8a91a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "themes/whiteplain"] - path = themes/whiteplain - url = https://github.com/taikii/whiteplain.git +[submodule "themes/temple"] + path = themes/temple + url = https://github.com/aos/temple.git diff --git a/config.toml b/config.toml index 145e61f..b40f5d8 100644 --- a/config.toml +++ b/config.toml @@ -1,10 +1,26 @@ -baseURL = "https://dataarchivist.net" +baseURL = "https://dataarchivist.net/" languageCode = "en-us" title = "Data Archivist Weekly" -theme = "whiteplain" +theme = "temple" pygmentsStyle = "friendly" [params] useCDN = false showShareIcons = false + +toc = true +custom_css = ["static/custom.css"] +dateformatpretty = "2006-01-02" +pygmentsCodeFences = true + + + +# Builds a list page for each category given +[taxonomies] + tag = "tags" + +[author] + name = "simon987" + github = "simon987" + email = "me@simon987.net" diff --git a/content/posts/test.md b/content/posts/test.md deleted file mode 100644 index 27703e0..0000000 --- a/content/posts/test.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: "Test post" -date: 2019-05-11T14:19:21-04:00 -draft: false ---- - -Hello world - -This is a test - -{{}} - -for x in range(0, 1): - print(x) - -{{}} - diff --git a/content/posts/ytdl_1.md b/content/posts/ytdl_1.md new file mode 100644 index 0000000..bca3c4d --- /dev/null +++ b/content/posts/ytdl_1.md @@ -0,0 +1,160 @@ +--- +title: "Automating Youtube Archival" +date: 2019-05-11 +draft: false +tags: ["youtube-dl", "automation"] +--- + +Google has been known to terminate entire Youtube channels without +notice in the hope of staying advertiser friendly. **58 million "problematic" videos +were deleted from the platform in Q3 2018** [1](#sources). 
+This week we are exploring various Youtube archival solutions that utilize + [youtube-dl](https://github.com/ytdl-org/youtube-dl/), + a command-line program that extracts and downloads videos from web pages. + +{{< figure src="/ytdl/1.png" title="Channels removed, by removal reason">}} + + + +# Installation + +Install youtube-dl via pip to ensure that you have the latest version. Google often +pushes updates that break youtube-dl so you always want to stay up to date. You +might be able to install it via your distribution’s package manager but it’s often several +versions behind. + +``` +pip install --upgrade youtube-dl +youtube-dl --version +``` + +# Basic usage + +You can now use the youtube-dl command to download a single video, a playlist or a channel: + +``` +youtube-dl https://www.youtube.com/watch?v=XXXXXXXXXX +youtube-dl https://www.youtube.com/channel/XXXXXXXXXX +``` + +You can use the same command to download videos from various websites. Youtube-dl +is also a Python library that you can use in scripts: + +{{}} +#!/usr/bin/python + +import youtube_dl + +yt_dl = youtube_dl.YoutubeDL() +yt_dl.download(["https://www.youtube.com/watch?v=XXXXXXXXXXXX"]) +{{}} + +## Command line arguments & Scripting + +This document is not a replacement for youtube-dl’s documentation; you can find the +updated list of command line arguments on its [Github page](https://github.com/ytdl-org/youtube-dl/). + +Below is a bash script that will download everything specified in `list.txt`, a text file +with a youtube channel or video on each line. The script will save the URLs of the videos +in archive.txt as it downloads them to speed up subsequent executions. +The `--write-info-json` and `--write-thumbnail` options ensure that we also download +the video metadata such as the description and the title. 
+ +{{}} +youtube-dl -a list.txt -o '%(uploader)s/%(title)s-%(id)s.%(ext)s' \ + --write-thumbnail\ + -f "bestvideo[ext=webm]+bestaudio[ext=webm]/bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio/best"\ + --write-info-json\ + --geo-bypass\ + --ignore-errors\ + --download-archive archive.txt +{{}} + +You can decide to run the script periodically or to schedule it with cron: + +``` +crontab -e +``` + +{{}} +# Download new videos every 2 hours +0 */2 * * * +/mnt/Archive/Youtube/archive_yt.sh +{{}} + +## Live streams + +While a cron job will download all videos uploaded by a channel (if the uploader does +not delete the video between executions), it does not handle live streams. Youtube-dl +allows you to download live streams with the same command but you obviously have to +start the execution during the stream. +Below is a different approach that takes advantage of the Youtube email notification +feature. This simple Python script reads your last 3 emails and searches for a youtube +link in the email body. It will immediately start downloading the video using the youtube-dl +Python library. You can use this method to download uploaded videos as well as live +streams. If you are using a gmail account, you will need to generate an [App Password](https://security.google.com/settings/security/apppasswords) + to allow the script to log in. 
+ +{{}} +#!/usr/bin/python + +import imaplib +import re +import youtube_dl + +# Initialize the youtube-dl downloader, nooverwrites param will +# skip videos that are already downloaded or +# currently being downloaded +yt_dl = youtube_dl.YoutubeDL(params={ + "nooverwrites": True, + "nopart": True, + "format": "bestvideo[ext=webm]+bestaudio[ext=webm]/bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio/best" + }) + +# This regex pattern matches youtube video links +YT_LINK = re.compile("Fv%3D([^%]*)%") + +mail = imaplib.IMAP4_SSL("imap.gmail.com") +mail.login("username@gmail.com", "password") +mail.list() +mail.select("INBOX") + +# Fetch the last 3 emails +_, data = mail.search(None, "ALL") +last_emails = list(reversed(data[0].split()))[:3] + +for num in last_emails: + _, data = mail.fetch(num, "(RFC822)") + body = data[0][1].decode() + + # Check for pattern match + match = YT_LINK.search(body) + if match: + url = "https://youtube.com/watch?v=" + match.group(1) + yt_dl.download([url, ]) # immediately download +{{}} + + +## Automatically upload to rclone remote + +To take advantage of cloud storage, you can set up [ytdlrc](http://github.com/bardisty/ytdlrc) to automatically move +videos to an rclone remote as they are downloaded. This simple script is completely interchangeable with +youtube-dl and can be set up on a machine with low disk space. +The script uses your existing youtube-dl and rclone configuration and is ideal for +setting up automatic Youtube archival on a cheap VPS. + +## Archiving Metadata + +If you wish to save a video’s metadata without downloading the actual video, there are command line utilities dedicated to this task. 
+ +* [Youtube-MA](https://github.com/CorentinB/YouTube-MA) +* [yt-mango](https://github.com/terorie/yt-mango) + + +---- + +# Sources + +* https://transparencyreport.google.com/youtube-policy/removals + + diff --git a/layouts/partials/css/custom.css b/layouts/partials/css/custom.css new file mode 100644 index 0000000..e21e174 --- /dev/null +++ b/layouts/partials/css/custom.css @@ -0,0 +1,273 @@ +html { + min-height: 100%; + width: 100%; + position: relative; +} + +body { + background-color: rgb(252, 252, 252); + color: #484848; +} + +a { + text-decoration: none; +} + +code, kbd, pre, samp { + background-color: rgb(240, 240, 240); +} + +.nav-menu { + margin-top: 5px; + padding-bottom: 5px; + border-bottom: 1px solid #e3e3e3; +} + +.pure-menu-heading { + text-transform: none; + font-size: large; +} + +.header { + text-align: left; + color: #484848; + margin-bottom: .5em; +} + +.header ul li { + height: auto; +} + +.header ul li a { + font-weight: bold; + color: #484848; + font-family: "Source Sans Pro", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; +} + +.header { + font-weight: bold; + color: #484848; + font-family: "Source Sans Pro", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; +} + +.site-title { + color: #484848; + text-transform: none; + font-weight: normal; + font-family: "Source Sans Pro", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; +} + +.pull-right { + float: right; +} + +.posts-name { + text-transform: capitalize; + font-weight: bold; + padding-left: 1em; + margin-top: 1em; + font-family: "Source Sans Pro", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; +} + +.posts { + font-family: verdana, arial, helvetica, sans-serif; + list-style-type: none; + padding-left: 1em; +} + +.posts li p { + margin-top: 0; +} + +.posts li { + margin-bottom: 1em; +} + +.posts li > a { + color: #369; + text-decoration: none; +} + +.post-list { + font-size: large; +} + +.footnote { + font-family: verdana, 
arial, helvetica, sans-serif; + color: #575757; + font-size: 0.75em; + margin-bottom: 0; +} + +.footnote a { + color: #575757; +} + +.footnote a:hover { + text-decoration: underline; + color: #369; +} + +.footer { + position: absolute; + z-index: 2; + height: auto; + width: 100%; + bottom: 0; +} + +.footer-content { + border-top: 1px solid #e3e3e3; + font-size: 80%; + color: #bbb; +} + +.footer-content a { + color: #575757; +} + +.footer-content ul { + height: auto; + margin-top: 0; + margin-bottom: 0; + display: inline-block; + padding-left: 0; +} + +/* https://github.com/gohugoio/hugo/issues/1778#issuecomment-451552602 */ +#TableOfContents > ul { + list-style: none; + margin: 0; + padding: 0; +} + +#gototop-btn { + display: inline-block; +} + +#foot-name { + color: #484848; + text-transform: none; +} + +#foot-copyright { + padding-left: 1em; + padding-bottom: 0.5em; + margin: 0; +} + +.post { + font-family: proxima-nova, "Helvetica Neue", Helvetica, Roboto, Arial, sans-serif; + color: #484848; + letter-spacing: normal; + padding-left: .5em; +} + +.post h1, h2, h3, h4, h5, h6 { + font-weight: bold; + letter-spacing: normal; +} + +.post-content { + z-index: 9; + overflow: auto; + padding: 0; + padding-bottom: 3em; + font-size: 16px; + line-height: 1.4; +} + +.post-content img { + max-width: 100%; + height: auto; +} + +.post-content pre { + padding: 0.5em; +} + +.post a { + color: #c05b4d; + text-decoration: none; +} + +.post a:hover { + color: #a5473a; + text-decoration: underline; +} + +.post h1 { + font-size: 28px; +} + +.post h2 { + font-size: 25px; +} + +.post h3 { + font-size: 23px; +} + +.post h4 { + font-size: 21px; +} + +.post h5 { + font-size: 19px; +} + +.post h6 { + font-size: 18px; +} + +.post-title { + margin-top: 0; + margin-bottom: 2em; +} + +.post-title h1 { + font-weight: bold; + font-size: 39px; + line-height: 40px; + margin-top: 8px; + margin-bottom: 0; +} + +@media screen and (max-width: 767px) { + .desktop { + display: none; + } + + .mobile { + 
display: block; + } + + #toggle-btn { + display: inline-block; + float: right; + padding: .5em 1em; + text-decoration: none; + color: #484848; + font-weight: bold; + } + + #toggle-content li { + clear: both; + height: auto; + background-color: rgb(249, 249, 249); + } + + #toggle-home { + display: inline-block; + } +} + +@media screen and (min-width: 768px) { + .mobile { + display: none; + } + + .desktop { + display: block; + } +} diff --git a/layouts/partials/footer.html b/layouts/partials/footer.html index cdf6021..a55e3cd 100644 --- a/layouts/partials/footer.html +++ b/layouts/partials/footer.html @@ -1,17 +1,75 @@ + -{{ template "_internal/google_analytics.html" . }} + + + + +{{ if .Site.Author.gaid }} + +{{ end }} + + diff --git a/static/.gitkeep b/static/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/static/custom.css b/static/custom.css new file mode 100644 index 0000000..8a91ee8 --- /dev/null +++ b/static/custom.css @@ -0,0 +1,3 @@ +.post-title h1 { + margin-top: 10px; +} diff --git a/static/img/favicon.ico b/static/img/favicon.ico new file mode 100644 index 0000000..d68366f Binary files /dev/null and b/static/img/favicon.ico differ diff --git a/static/ytdl/1.png b/static/ytdl/1.png new file mode 100644 index 0000000..15ddf2b Binary files /dev/null and b/static/ytdl/1.png differ diff --git a/themes/temple b/themes/temple new file mode 160000 index 0000000..49cfa2b --- /dev/null +++ b/themes/temple @@ -0,0 +1 @@ +Subproject commit 49cfa2b6bc43166e99e0c68ed2e1d7901b26810f diff --git a/themes/whiteplain b/themes/whiteplain deleted file mode 160000 index ad683f7..0000000 --- a/themes/whiteplain +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ad683f76fe3314c5bc631304b5c0fce30d06d7ae