first commit

This commit is contained in:
simon987 2021-09-13 11:28:07 -04:00
commit b380df9049
9 changed files with 126 additions and 0 deletions

10
.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,10 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
# Zeppelin ignored files
/ZeppelinRemoteNotebooks/

View File

@ -0,0 +1,22 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="2">
<item index="0" class="java.lang.String" itemvalue="numpy" />
<item index="1" class="java.lang.String" itemvalue="windows-curses" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="E402" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml generated Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (sist2-scripts)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/sist2-scripts.iml" filepath="$PROJECT_DIR$/.idea/sist2-scripts.iml" />
</modules>
</component>
</project>

8
.idea/sist2-scripts.iml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

8
README.md Normal file
View File

@ -0,0 +1,8 @@
## transcribe.py
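Transcribe audio files using Amazon Transcribe. The transcript of each file is written next to it as `<file>.s2meta`; files that already have a `.s2meta` are skipped.

Example usage: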
```
find /path/to/audio/files/ -name "*.mp3" | parallel -j8 python transcribe.py --bucket my-s3-bucket-name {}
```

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
boto3
requests

58
transcribe.py Normal file
View File

@ -0,0 +1,58 @@
import argparse
import json
import os
from time import sleep, time
import boto3
import requests
if __name__ == '__main__':
    # Command-line entry point: upload one audio file to S3, run an Amazon
    # Transcribe job on it, and store the transcript in a "<input>.s2meta"
    # JSON sidecar next to the input file.
    #
    # Exit status: 0 when the sidecar already exists or was written; non-zero
    # (via SystemExit with a message) when the transcription job fails.
    import uuid  # local import: only needed to build a unique job name

    parser = argparse.ArgumentParser(description="Transcribe audio files")
    parser.add_argument("input_file", nargs=1, help="Audio file to transcribe")
    parser.add_argument("--bucket", dest="bucket", action='store', help="S3 bucket name", required=True)
    args = parser.parse_args()

    INPUT_FILE = args.input_file[0]
    BUCKET = args.bucket

    # Files that already have a sidecar are skipped, so re-runs are cheap.
    if os.path.exists(INPUT_FILE + ".s2meta"):
        raise SystemExit(0)

    transcribe = boto3.client("transcribe")
    s3 = boto3.client("s3")

    # The job name doubles as the temporary S3 object key. A timestamp alone
    # collides when several instances start within the same second (the
    # documented usage runs 8 in parallel), so add a random suffix.
    job_name = "sist2-transcribe-%d-%s" % (int(time()), uuid.uuid4().hex)

    s3.upload_file(INPUT_FILE, BUCKET, job_name)
    job_started = False
    try:
        job_uri = "s3://%s/%s" % (BUCKET, job_name)

        transcribe.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={"MediaFileUri": job_uri},
            # Derived from the file extension; lower-cased so "FILE.MP3"
            # yields "mp3" rather than "MP3".
            MediaFormat=os.path.splitext(INPUT_FILE)[1][1:].lower(),
            LanguageCode="en-US",
            Settings={
                # NOTE(review): no VocabularyFilterName is supplied alongside
                # this setting -- confirm it has the intended effect.
                "VocabularyFilterMethod": "tag"
            }
        )
        job_started = True

        # Poll every 5 seconds until the job reaches a terminal state.
        while True:
            status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
            job = status["TranscriptionJob"]
            if job["TranscriptionJobStatus"] in ["COMPLETED", "FAILED"]:
                break
            sleep(5)

        # A failed job carries no transcript; report the failure instead of
        # crashing with a KeyError on the missing "Transcript" entry.
        if job["TranscriptionJobStatus"] == "FAILED":
            raise SystemExit("Transcription of %s failed: %s" % (
                INPUT_FILE, job.get("FailureReason", "unknown reason")))

        transcript_url = job["Transcript"]["TranscriptFileUri"]
        r = requests.get(transcript_url, timeout=60)
        # Fail loudly on an HTTP error rather than trying to parse an error page.
        r.raise_for_status()
        transcript = r.json()["results"]["transcripts"][0]["transcript"]
    finally:
        # Always remove the temporary upload, and the job if it was created,
        # even when transcription or the download failed.
        s3.delete_object(Bucket=BUCKET, Key=job_name)
        if job_started:
            transcribe.delete_transcription_job(TranscriptionJobName=job_name)

    # The sidecar is written only after a transcript was obtained, so a failed
    # run is retried on the next invocation.
    with open(INPUT_FILE + ".s2meta", "w") as f:
        json.dump({
            "content": transcript,
            "_transcribed_by": "AWS/Transcribe"
        }, f)