commit b380df90494a016907bd0caa0c58ed983c770440 Author: simon987 Date: Mon Sep 13 11:28:07 2021 -0400 first commit diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..11bd5c0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,10 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Zeppelin ignored files +/ZeppelinRemoteNotebooks/ diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..be4e605 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,22 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..dc40e08 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..2883f26 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/sist2-scripts.iml b/.idea/sist2-scripts.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/sist2-scripts.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..761c723 --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +## transcribe.py + +Transcribe audio files using AWS Transcribe + +Example usage: +``` +find /path/to/audio/files/ -name "*.mp3" | parallel -j8 python transcribe.py --bucket my-s3-bucket-name {} +``` \ No newline at end of file diff --git 
a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5ece686
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+boto3
+requests
\ No newline at end of file
diff --git a/transcribe.py b/transcribe.py
new file mode 100644
index 0000000..c25edeb
--- /dev/null
+++ b/transcribe.py
@@ -0,0 +1,81 @@
+import argparse
+import json
+import os
+import sys
+import uuid
+from time import sleep, time
+
+import boto3
+import requests
+
+# Seconds to wait between two polls of the transcription job status.
+POLL_INTERVAL = 5
+# Seconds to wait for the transcript download before giving up.
+HTTP_TIMEOUT = 60
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description="Transcribe audio files")
+    parser.add_argument("input_file", nargs=1, help="Audio file to transcribe")
+    parser.add_argument("--bucket", dest="bucket", action='store', help="S3 bucket name", required=True)
+    args = parser.parse_args()
+
+    INPUT_FILE = args.input_file[0]
+    BUCKET = args.bucket
+    META_FILE = INPUT_FILE + ".s2meta"
+
+    # The sidecar file already exists: this input was transcribed by a previous run.
+    if os.path.exists(META_FILE):
+        sys.exit(0)
+
+    transcribe = boto3.client("transcribe")
+    s3 = boto3.client("s3")
+
+    # The name doubles as the S3 key. The timestamp alone collides when several
+    # instances start within the same second (e.g. under `parallel -j8`), so a
+    # random suffix is appended to keep every job name and key unique.
+    job_name = "sist2-transcribe-%d-%s" % (int(time()), uuid.uuid4().hex)
+
+    s3.upload_file(INPUT_FILE, BUCKET, job_name)
+    job_created = False
+
+    try:
+        transcribe.start_transcription_job(
+            TranscriptionJobName=job_name,
+            Media={"MediaFileUri": "s3://%s/%s" % (BUCKET, job_name)},
+            # Lower-cased so that e.g. "song.MP3" maps to the "mp3" media format.
+            MediaFormat=os.path.splitext(INPUT_FILE)[1][1:].lower(),
+            LanguageCode="en-US",
+            Settings={
+                "VocabularyFilterMethod": "tag"
+            }
+        )
+        job_created = True
+
+        # Poll until the job reaches a terminal state.
+        while True:
+            job = transcribe.get_transcription_job(TranscriptionJobName=job_name)["TranscriptionJob"]
+            if job["TranscriptionJobStatus"] in ("COMPLETED", "FAILED"):
+                break
+            sleep(POLL_INTERVAL)
+
+        # A failed job carries no transcript; report the reason instead of
+        # crashing with a KeyError on the missing "Transcript" entry.
+        if job["TranscriptionJobStatus"] == "FAILED":
+            sys.exit("Transcription of %s failed: %s" % (INPUT_FILE, job.get("FailureReason", "unknown reason")))
+
+        r = requests.get(job["Transcript"]["TranscriptFileUri"], timeout=HTTP_TIMEOUT)
+        r.raise_for_status()
+        transcript = r.json()["results"]["transcripts"][0]["transcript"]
+    finally:
+        # Always remove the temporary upload and the job, even on failure, so
+        # that aborted runs do not leave leftovers in the bucket or job list.
+        s3.delete_object(Bucket=BUCKET, Key=job_name)
+        if job_created:
+            transcribe.delete_transcription_job(TranscriptionJobName=job_name)
+
+    with open(META_FILE, "w") as f:
+        f.write(json.dumps({
+            "content": transcript,
+            "_transcribed_by": "AWS/Transcribe"
+        }))