commit b380df90494a016907bd0caa0c58ed983c770440
Author: simon987 <me@simon987.net>
Date:   Mon Sep 13 11:28:07 2021 -0400

    first commit

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..11bd5c0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,10 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Zeppelin ignored files
+/ZeppelinRemoteNotebooks/
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..be4e605
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,22 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="2">
+            <item index="0" class="java.lang.String" itemvalue="numpy" />
+            <item index="1" class="java.lang.String" itemvalue="windows-curses" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="E402" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..dc40e08
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (sist2-scripts)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..2883f26
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/sist2-scripts.iml" filepath="$PROJECT_DIR$/.idea/sist2-scripts.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/sist2-scripts.iml b/.idea/sist2-scripts.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/sist2-scripts.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..761c723
--- /dev/null
+++ b/README.md
@@ -0,0 +1,8 @@
+## transcribe.py
+
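+Transcribe audio files using Amazon Transcribe (AWS). The transcript of each file is written as JSON to `<file>.s2meta`; files that already have a `.s2meta` file are skipped.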
+
+Example usage:
+```
+find /path/to/audio/files/ -name "*.mp3" | parallel -j8 python transcribe.py --bucket my-s3-bucket-name {}
+```
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5ece686
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+boto3
+requests
\ No newline at end of file
diff --git a/transcribe.py b/transcribe.py
new file mode 100644
index 0000000..c25edeb
--- /dev/null
+++ b/transcribe.py
@@ -0,0 +1,80 @@
+"""Transcribe one audio file with Amazon Transcribe.
+
+The input file is uploaded to an S3 bucket, a transcription job is started and
+polled until it finishes, and the transcript is written as JSON to
+``<input_file>.s2meta``. Inputs that already have a ``.s2meta`` file are skipped.
+"""
+import argparse
+import json
+import os
+import sys
+import uuid
+from time import sleep, time
+
+import boto3
+import requests
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description="Transcribe audio files")
+    parser.add_argument("input_file", nargs=1, help="Audio file to transcribe")
+    parser.add_argument("--bucket", dest="bucket", action='store', help="S3 bucket name", required=True)
+    args = parser.parse_args()
+
+    INPUT_FILE = args.input_file[0]
+    BUCKET = args.bucket
+
+    # Nothing to do when a previous run already produced the output file.
+    if os.path.exists(INPUT_FILE + ".s2meta"):
+        sys.exit(0)
+
+    transcribe = boto3.client("transcribe")
+    s3 = boto3.client("s3")
+
+    # The job name doubles as the S3 object key. A timestamp alone collides when
+    # several instances start within the same second (e.g. under "parallel -j8"),
+    # so a random suffix keeps every run's name unique.
+    job_name = "sist2-transcribe-%d-%s" % (int(time()), uuid.uuid4().hex)
+
+    s3.upload_file(INPUT_FILE, BUCKET, job_name)
+    try:
+        transcribe.start_transcription_job(
+            TranscriptionJobName=job_name,
+            Media={"MediaFileUri": "s3://%s/%s" % (BUCKET, job_name)},
+            # Lower-cased so that e.g. "talk.MP3" is submitted as "mp3".
+            MediaFormat=os.path.splitext(INPUT_FILE)[1][1:].lower(),
+            LanguageCode="en-US",
+            Settings={
+                "VocabularyFilterMethod": "tag"
+            }
+        )
+        try:
+            # Poll every 5 seconds until the job reaches a terminal state.
+            while True:
+                response = transcribe.get_transcription_job(TranscriptionJobName=job_name)
+                job = response["TranscriptionJob"]
+                if job["TranscriptionJobStatus"] in ("COMPLETED", "FAILED"):
+                    break
+                sleep(5)
+
+            # A failed job has no transcript to download: report it and exit
+            # with a non-zero status instead of crashing on a missing key.
+            if job["TranscriptionJobStatus"] == "FAILED":
+                reason = job.get("FailureReason", "unknown reason")
+                sys.exit("Transcription of %s failed: %s" % (INPUT_FILE, reason))
+
+            r = requests.get(job["Transcript"]["TranscriptFileUri"], timeout=60)
+            r.raise_for_status()
+            transcript = r.json()["results"]["transcripts"][0]["transcript"]
+        finally:
+            # Runs on success and on failure so no job is left behind.
+            transcribe.delete_transcription_job(TranscriptionJobName=job_name)
+    finally:
+        # Always remove the uploaded copy of the audio file from the bucket.
+        s3.delete_object(Bucket=BUCKET, Key=job_name)
+
+    with open(INPUT_FILE + ".s2meta", "w", encoding="utf-8") as f:
+        json.dump({
+            "content": transcript,
+            "_transcribed_by": "AWS/Transcribe"
+        }, f)