This commit is contained in:
simon987 2022-04-19 12:07:18 -04:00
parent bc17fc4b98
commit f87e947e02
7 changed files with 292 additions and 45 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
.idea/
.idea/
*.zip

View File

@ -1,8 +1,38 @@
Create conda env with:
```
conda create -y -n sist2-scripts -c conda-forge python=3.7 cudnn=8.1 cudatoolkit=11.2
conda clean --force-pkgs-dirs -y && conda clean --all -y
conda activate sist2-scripts
pip install -r requirements.txt
```
## transcribe.py
Transcribe audio files using transformers STT
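Example usage (process one file at a time; do not run several instances in parallel, because the script loads the model onto a single GPU and de-noising uses fixed temporary paths under `/tmp`):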
```
find /path/to/audio/files/ -name "*.mp3" -exec python transcribe.py {} \;
```
## transcribe_aws.py
Transcribe audio files using AWS Transcribe
Example usage:
```
find /path/to/audio/files/ -name "*.mp3" | parallel -j8 python transcribe_aws.py --bucket my-s3-bucket-name {}
```
## export_meta.py
Save all .s2meta files to a zip archive for easy sharing
Example usage:
```
python export_meta.py [--json] /path/to/dataset/
```
find /path/to/audio/files/ -name "*.mp3" | parallel -j8 python transcribe.py --bucket my-s3-bucket-name {}
```

22
data/lq.rnnn Normal file

File diff suppressed because one or more lines are too long

36
export_meta.py Normal file
View File

@ -0,0 +1,36 @@
import os
import argparse
import zipfile
def ftw(path):
    """Walk *path* recursively and yield the path of every file below it.

    Each yielded value is the directory reported by ``os.walk`` joined with
    the file name, so it is prefixed by *path* exactly as it was passed in.
    Directories themselves are never yielded.
    """
    for dirpath, _subdirs, filenames in os.walk(path):
        yield from (os.path.join(dirpath, name) for name in filenames)
# Suffix of the sidecar metadata files collected by this script.
_META_SUFFIX = ".s2meta"


def archive_name(filename, root, rename_json):
    """Return the name *filename* should have inside the zip archive.

    The name is *filename* relative to *root*. When *rename_json* is true and
    the name ends with ``.s2meta``, only that trailing suffix is swapped for
    ``.json``; an occurrence of ``.s2meta`` elsewhere in the path (for example
    in a directory name) is left untouched.
    """
    relpath = os.path.relpath(filename, root)
    if rename_json and relpath.endswith(_META_SUFFIX):
        relpath = relpath[:-len(_META_SUFFIX)] + ".json"
    return relpath


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Export all s2meta into a zip file")
    parser.add_argument("input_file", nargs=1, help="Directory root")
    parser.add_argument('--json', dest="json", help="Rename .s2meta to .json", action="store_true")
    args = parser.parse_args()

    INPUT_FILE = args.input_file[0]
    RENAME_JSON = args.json

    # The context manager guarantees the archive is finalized and closed even
    # if reading one of the input files fails part-way through.
    with zipfile.ZipFile(
            "metadata.zip", "w",
            compression=zipfile.ZIP_DEFLATED, compresslevel=9
    ) as archive:
        for filename in ftw(INPUT_FILE):
            if filename.endswith(_META_SUFFIX):
                archive.write(filename, archive_name(filename, INPUT_FILE, RENAME_JSON))

View File

@ -1,2 +1,5 @@
boto3
requests
requests
ffmpeg-python
transformers
torch

View File

@ -1,58 +1,155 @@
import argparse
import glob
import json
import os
from time import sleep, time
import boto3
import re
import time
import wave
from io import BytesIO
from time import sleep
import ffmpeg
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, HubertForCTC
from denoiser import enhance
def _get_audio_bytes(audio_filename):
    """Load *audio_filename* with librosa and return its samples.

    The file is loaded as mono float32 audio resampled to 16000 Hz. Despite
    the function name, the return value is the sample array produced by
    ``librosa.load``, not raw bytes; the sample rate is discarded.
    """
    # An earlier variant (kept out of the code path) decoded the output of
    # normalize_audio() through the ``wave`` module into int16 samples.
    samples, _sample_rate = librosa.load(
        audio_filename,
        sr=16000,
        mono=True,
        dtype=np.float32,
    )
    return samples
def np_chunks(lst: np.array, chunk_len: int):
    """Yield consecutive slices of *lst*, each at most *chunk_len* items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    *chunk_len*; an empty *lst* yields nothing.
    """
    offsets = range(0, len(lst), chunk_len)
    yield from (lst[begin:begin + chunk_len] for begin in offsets)
def normalize_audio(input_file):
    """Convert *input_file* to mono 16 kHz signed 16-bit PCM WAV via ffmpeg.

    ffmpeg writes the converted audio to its stdout ("pipe:1"); that output
    is captured and returned.

    Returns:
        The WAV data captured from ffmpeg's stdout.

    Raises:
        ffmpeg.Error: if the ffmpeg process fails. The error and ffmpeg's
            captured stderr are printed before the exception propagates, so
            the caller never receives ``None`` in place of audio data.
    """
    output_options = {
        "f": "WAV",
        "acodec": "pcm_s16le",
        "ac": 1,
        "ar": "16k",
        "loglevel": "error",
        # None value: presumably emitted by ffmpeg-python as a bare
        # "-hide_banner" flag with no argument -- confirm against its docs.
        "hide_banner": None,
    }
    stream = ffmpeg.input(input_file).output("pipe:1", **output_options)

    try:
        out, _err = stream.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        # Only ffmpeg.Error carries .stderr; other exception types would have
        # raised AttributeError here, so they are left to propagate untouched.
        print(e)
        if e.stderr:
            print(e.stderr.decode(errors="replace"))
        raise
    return out
class SpeechToTextEngine:
    """Speech-to-text engine built on a pretrained CTC model from transformers.

    A ``HubertForCTC`` model is loaded when *model_name* contains "hubert",
    otherwise a ``Wav2Vec2ForCTC`` model; both use a ``Wav2Vec2Processor``.
    """

    def __init__(self, model_name, device="cuda:0"):
        """Load the processor and model for *model_name* and move the model to *device*.

        Args:
            model_name: Name passed to ``from_pretrained``.
            device: Anything accepted by ``torch.device``. Defaults to
                "cuda:0", the device that was previously hard-coded.
        """
        self.device = torch.device(device)
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        if "hubert" in model_name:
            self.model = HubertForCTC.from_pretrained(model_name)
        else:
            self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        self.model.to(self.device)

    def run(self, audio_filename, chunk_len=100_000):
        """Transcribe *audio_filename* and return the text in lower case.

        The audio is processed in chunks of *chunk_len* samples; the decoded
        chunks are concatenated and every run of whitespace is collapsed to a
        single space.

        Args:
            audio_filename: Path of the audio file to transcribe.
            chunk_len: Samples per model invocation. Lower this number to
                avoid running out of GPU memory.
        """
        audio_bytes = _get_audio_bytes(audio_filename)

        pieces = []
        # Inference only: without no_grad() every forward pass keeps autograd
        # state alive, which needlessly increases memory use per chunk.
        with torch.no_grad():
            # NOTE(review): a very short trailing chunk may be rejected by the
            # model's convolutional front end -- confirm and pad if needed.
            for chunk in np_chunks(audio_bytes, chunk_len):
                input_values = self.processor(
                    chunk, return_tensors="pt", sampling_rate=16000
                ).input_values
                input_values = input_values.float().to(self.device)

                predicted_ids = torch.argmax(self.model(input_values).logits, dim=-1)
                pieces.append(self.processor.batch_decode(predicted_ids)[0])

        # Each chunk is terminated by a newline, which the regex below turns
        # into the single space separating (and trailing) the chunks.
        text = "".join(piece + "\n" for piece in pieces)
        return re.sub(r"\s+", " ", text).lower()
class DenoiseArgs:
    """Plain attribute container handed to ``enhance.enhance`` as its arguments.

    Every instance carries the same fixed values. The attribute names
    presumably mirror the denoiser package's command-line options -- confirm
    against that package before changing them.
    """

    def __init__(self):
        settings = {
            "device": "cuda",
            "dry": 0,
            "sample_rate": 16000,
            "num_workers": 32,
            "streaming": True,
            # Same directory that the de-noising step writes its input into.
            "noisy_dir": "/tmp/transcribe-denoise-input/",
            "noisy_json": None,
            "dns64": False,
            "dns48": False,
            "master64": True,
            "model_path": None,
        }
        for attribute, value in settings.items():
            setattr(self, attribute, value)
# Fixed working directories for the de-noising step: the normalized input is
# written into DENOISE_INPUT and enhance() is told to write into DENOISE_OUTPUT.
DENOISE_INPUT = "/tmp/transcribe-denoise-input/"
DENOISE_OUTPUT = "/tmp/transcribe-denoise-output/"


def denoise(filepath):
    """Normalize *filepath* to WAV and run the denoiser over it.

    The normalized audio is stored as ``file.wav`` inside DENOISE_INPUT, then
    ``enhance.enhance`` is invoked with DenoiseArgs() and DENOISE_OUTPUT as
    its local output directory. Nothing is returned; the caller reads the
    result from DENOISE_OUTPUT.
    """
    for directory in (DENOISE_INPUT, DENOISE_OUTPUT):
        os.makedirs(directory, exist_ok=True)

    # Convert before opening the target so a failed conversion does not
    # truncate a previously written file.
    wav_data = normalize_audio(filepath)
    with open(f"{DENOISE_INPUT}/file.wav", "wb") as wav_file:
        wav_file.write(wav_data)

    enhance.enhance(DenoiseArgs(), local_out_dir=DENOISE_OUTPUT)
    print("denoise done")
import requests
# Script entry point: parses the command line and transcribes one audio file.
# NOTE(review): this region appears to interleave two variants of the script --
# an AWS Transcribe flow (S3 upload, transcription job, polling, .s2meta write)
# and a local-model flow (SpeechToTextEngine with optional de-noising).
# Confirm which lines belong to the current version before relying on it.
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Transcribe audio files")
parser.add_argument("input_file", nargs=1, help="Audio file to transcribe")
# NOTE(review): --bucket is only consumed by the S3 / AWS Transcribe calls below.
parser.add_argument("--bucket", dest="bucket", action='store', help="S3 bucket name", required=True)
parser.add_argument('--skip-denoise', dest="skip_denoise", help="Skip de-noising", action="store_true")
args = parser.parse_args()
# nargs=1 makes input_file a one-element list.
INPUT_FILE = args.input_file[0]
BUCKET = args.bucket
# Skip files that already have a .s2meta sidecar next to them.
if os.path.exists(INPUT_FILE + ".s2meta"):
exit(0)
MODEL = "facebook/wav2vec2-large-960h"
# MODEL = "facebook/hubert-large-ls960-ft"
# Only referenced by the commented-out .s2meta writer at the end of the script.
DENOISE_MODEL = "master64"
transcribe = boto3.client("transcribe")
s3 = boto3.client("s3")
DENOISE = not args.skip_denoise
# DENOISE = False
# NOTE(review): the file's imports bind `time` both as a function
# (from time import sleep, time) and as a module (import time); time() here
# and time.time() below cannot both work -- confirm which import is current.
job_name = "sist2-transcribe-%d" % int(time())
# if os.path.exists(filepath + ".s2meta"):
# exit(0)
engine = SpeechToTextEngine(MODEL)
# The job name doubles as the S3 object key for the uploaded audio.
s3.upload_file(INPUT_FILE, BUCKET, job_name)
# Start of the wall-clock measurement reported by the "Took ..." print below.
start = time.time()
job_uri = "s3://%s/%s" % (BUCKET, job_name)
if DENOISE:
denoise(INPUT_FILE)
# Reads the file that the de-noising step is expected to leave in
# DENOISE_OUTPUT, then removes it once transcribed.
result = engine.run(f"{DENOISE_OUTPUT}/file_enhanced.wav")
os.remove(f"{DENOISE_OUTPUT}/file_enhanced.wav")
else:
result = engine.run(INPUT_FILE)
transcribe.start_transcription_job(
TranscriptionJobName=job_name,
Media={"MediaFileUri": job_uri},
# Media format is taken from the file extension without the leading dot.
MediaFormat=os.path.splitext(INPUT_FILE)[1][1:],
LanguageCode="en-US",
Settings={
"VocabularyFilterMethod": "tag"
}
)
# Poll every 5 seconds until the job reaches a terminal status.
while True:
status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
if status["TranscriptionJob"]["TranscriptionJobStatus"] in ["COMPLETED", "FAILED"]:
break
sleep(5)
# NOTE(review): FAILED also ends the loop; the "Transcript" key read here is
# presumably absent in that case -- confirm.
transcript_url = status["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]
r = requests.get(transcript_url)
transcript = r.json()["results"]["transcripts"][0]["transcript"]
# Remove the uploaded audio and the finished job.
s3.delete_object(Bucket=BUCKET, Key=job_name)
transcribe.delete_transcription_job(TranscriptionJobName=job_name)
# Write the transcript as a JSON sidecar next to the input file.
with open(INPUT_FILE + ".s2meta", "w") as f:
f.write(json.dumps({
"content": transcript,
"_transcribed_by": "AWS/Transcribe"
}))
print(f"Took {time.time() - start:.2f}s")
print(result)
# with open(INPUT_FILE + ".s2meta", "w") as f:
# f.write(json.dumps({
# "content": result,
# "_transcribed_by": MODEL,
# "_denoised_by": DENOISE_MODEL,
# }))

58
transcribe_aws.py Normal file
View File

@ -0,0 +1,58 @@
import argparse
import json
import os
from time import sleep, time
import boto3
import requests
def media_format(path):
    """Return the lower-cased extension of *path* without the leading dot.

    Used as the ``MediaFormat`` of the transcription job, so that a file named
    ``song.MP3`` is submitted as ``mp3``. Returns "" when *path* has no
    extension.
    """
    return os.path.splitext(path)[1][1:].lower()


# Transcribe a single audio file with AWS Transcribe and store the result in a
# ".s2meta" JSON sidecar next to the input file.
if __name__ == '__main__':
    import sys
    import uuid

    parser = argparse.ArgumentParser(description="Transcribe audio files")
    parser.add_argument("input_file", nargs=1, help="Audio file to transcribe")
    parser.add_argument("--bucket", dest="bucket", action='store', help="S3 bucket name", required=True)
    args = parser.parse_args()

    INPUT_FILE = args.input_file[0]
    BUCKET = args.bucket

    # Nothing to do when the sidecar already exists.
    if os.path.exists(INPUT_FILE + ".s2meta"):
        sys.exit(0)

    transcribe = boto3.client("transcribe")
    s3 = boto3.client("s3")

    # The name doubles as the S3 key. A timestamp alone collides when several
    # instances start within the same second (e.g. under `parallel -j8`), so
    # a random suffix is appended.
    job_name = "sist2-transcribe-%d-%s" % (int(time()), uuid.uuid4().hex)

    s3.upload_file(INPUT_FILE, BUCKET, job_name)
    try:
        job_uri = "s3://%s/%s" % (BUCKET, job_name)

        transcribe.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={"MediaFileUri": job_uri},
            MediaFormat=media_format(INPUT_FILE),
            LanguageCode="en-US",
            Settings={
                "VocabularyFilterMethod": "tag"
            }
        )
        try:
            # Poll every 5 seconds until the job reaches a terminal status.
            while True:
                status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
                job = status["TranscriptionJob"]
                if job["TranscriptionJobStatus"] in ("COMPLETED", "FAILED"):
                    break
                sleep(5)

            # A failed job has no transcript to download; report why instead
            # of failing on a missing key.
            if job["TranscriptionJobStatus"] == "FAILED":
                sys.exit("Transcription of %s failed: %s" % (
                    INPUT_FILE, job.get("FailureReason", "unknown reason")
                ))

            transcript_url = job["Transcript"]["TranscriptFileUri"]
            r = requests.get(transcript_url, timeout=60)
            r.raise_for_status()
            transcript = r.json()["results"]["transcripts"][0]["transcript"]
        finally:
            # Runs on success, failure and sys.exit alike, so no job is left behind.
            transcribe.delete_transcription_job(TranscriptionJobName=job_name)
    finally:
        # Always remove the uploaded audio, even if the job could not be started.
        s3.delete_object(Bucket=BUCKET, Key=job_name)

    with open(INPUT_FILE + ".s2meta", "w") as f:
        f.write(json.dumps({
            "content": transcript,
            "_transcribed_by": "AWS/Transcribe"
        }))