mirror of
https://github.com/simon987/sist2-scripts.git
synced 2025-04-04 08:53:03 +00:00
first commit
This commit is contained in:
commit
b380df9049
10
.idea/.gitignore
generated
vendored
Normal file
10
.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Zeppelin ignored files
|
||||
/ZeppelinRemoteNotebooks/
|
22
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
22
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
@ -0,0 +1,22 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredPackages">
|
||||
<value>
|
||||
<list size="2">
|
||||
<item index="0" class="java.lang.String" itemvalue="numpy" />
|
||||
<item index="1" class="java.lang.String" itemvalue="windows-curses" />
|
||||
</list>
|
||||
</value>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
||||
<option name="ignoredErrors">
|
||||
<list>
|
||||
<option value="E402" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
4
.idea/misc.xml
generated
Normal file
4
.idea/misc.xml
generated
Normal file
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (sist2-scripts)" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/sist2-scripts.iml" filepath="$PROJECT_DIR$/.idea/sist2-scripts.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
8
.idea/sist2-scripts.iml
generated
Normal file
8
.idea/sist2-scripts.iml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
8
README.md
Normal file
8
README.md
Normal file
@ -0,0 +1,8 @@
|
||||
## transcribe.py
|
||||
|
||||
Transcribe audio files using AWS Transcribe
|
||||
|
||||
Example usage:
|
||||
```
|
||||
find /path/to/audio/files/ -name "*.mp3" | parallel -j8 python transcribe.py --bucket my-s3-bucket-name {}
|
||||
```
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
boto3
|
||||
requests
|
58
transcribe.py
Normal file
58
transcribe.py
Normal file
@ -0,0 +1,58 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from time import sleep, time
|
||||
import boto3
|
||||
|
||||
import requests
|
||||
|
||||
if __name__ == '__main__':
    # Transcribe one audio file with AWS Transcribe and store the text in a
    # "<input_file>.s2meta" JSON sidecar next to the audio file.
    #
    # Flow: upload the file to S3 -> start a transcription job -> poll until
    # it finishes -> download the transcript -> always clean up the temporary
    # S3 object and the job -> write the sidecar.

    # Imported here so this block stays self-contained; both are stdlib.
    import sys
    import uuid

    parser = argparse.ArgumentParser(description="Transcribe audio files")
    parser.add_argument("input_file", nargs=1, help="Audio file to transcribe")
    parser.add_argument("--bucket", dest="bucket", action='store', help="S3 bucket name", required=True)
    args = parser.parse_args()

    INPUT_FILE = args.input_file[0]
    BUCKET = args.bucket

    # An existing sidecar means the file was already transcribed: skip it so
    # re-running over the same directory does not redo the work.
    if os.path.exists(INPUT_FILE + ".s2meta"):
        sys.exit(0)

    transcribe = boto3.client("transcribe")
    s3 = boto3.client("s3")

    # The name doubles as the S3 object key and the transcription job name.
    # A timestamp alone collides when several instances start within the same
    # second (e.g. under `parallel -j8`), so a random suffix is appended.
    job_name = "sist2-transcribe-%d-%s" % (int(time()), uuid.uuid4().hex)
    job_uri = "s3://%s/%s" % (BUCKET, job_name)

    # AWS expects a lower-case media format; file extensions may be upper-case.
    media_format = os.path.splitext(INPUT_FILE)[1][1:].lower()

    s3.upload_file(INPUT_FILE, BUCKET, job_name)

    job_started = False
    try:
        transcribe.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={"MediaFileUri": job_uri},
            MediaFormat=media_format,
            LanguageCode="en-US",
            # NOTE(review): no VocabularyFilterName is supplied alongside the
            # filter method -- confirm the API accepts the method on its own.
            Settings={
                "VocabularyFilterMethod": "tag"
            }
        )
        job_started = True

        # Poll every 5 seconds until the job reaches a terminal state.
        while True:
            status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
            job = status["TranscriptionJob"]
            if job["TranscriptionJobStatus"] in ("COMPLETED", "FAILED"):
                break
            sleep(5)

        # A failed job carries no "Transcript" entry; report the failure
        # instead of crashing with a KeyError.
        if job["TranscriptionJobStatus"] == "FAILED":
            sys.exit("Transcription of %s failed: %s"
                     % (INPUT_FILE, job.get("FailureReason", "unknown reason")))

        transcript_url = job["Transcript"]["TranscriptFileUri"]
        r = requests.get(transcript_url, timeout=60)
        r.raise_for_status()
        transcript = r.json()["results"]["transcripts"][0]["transcript"]
    finally:
        # Always remove the temporary upload (and the job, if it was created)
        # so errors do not leave resources behind in the AWS account.
        s3.delete_object(Bucket=BUCKET, Key=job_name)
        if job_started:
            transcribe.delete_transcription_job(TranscriptionJobName=job_name)

    with open(INPUT_FILE + ".s2meta", "w") as f:
        f.write(json.dumps({
            "content": transcript,
            "_transcribed_by": "AWS/Transcribe"
        }))
|
Loading…
x
Reference in New Issue
Block a user