mirror of
https://github.com/simon987/sist2-scripts.git
synced 2025-10-23 22:06:52 +00:00
updates
This commit is contained in:
parent
bc17fc4b98
commit
f87e947e02
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,2 @@
|
||||
.idea/
|
||||
.idea/
|
||||
*.zip
|
34
README.md
34
README.md
@ -1,8 +1,38 @@
|
||||
Create conda env with:
|
||||
|
||||
```
|
||||
conda create -y -n sist2-scripts -c conda-forge python=3.7 cudnn=8.1 cudatoolkit=11.2
|
||||
conda clean --force-pkgs-dirs -y && conda clean --all -y
|
||||
conda activate sist2-scripts
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## transcribe.py
|
||||
|
||||
Transcribe audio files locally using a Hugging Face `transformers` speech-to-text model
|
||||
|
||||
Example usage (process files one at a time; do not run several instances in parallel):
|
||||
|
||||
```
|
||||
find /path/to/audio/files/ -name "*.mp3" -exec python transcribe.py {} \;
|
||||
```
|
||||
|
||||
## transcribe_aws.py
|
||||
|
||||
Transcribe audio files using AWS Transcribe
|
||||
|
||||
Example usage:
|
||||
|
||||
```
|
||||
find /path/to/audio/files/ -name "*.mp3" | parallel -j8 python transcribe_aws.py --bucket my-s3-bucket-name {}
|
||||
```
|
||||
|
||||
## export_meta.py
|
||||
|
||||
Save all .s2meta files to a zip archive for easy sharing
|
||||
|
||||
Example usage:
|
||||
|
||||
```
|
||||
python export_meta.py [--json] /path/to/dataset/
|
||||
```
|
||||
find /path/to/audio/files/ -name "*.mp3" | parallel -j8 python transcribe.py --bucket my-s3-bucket-name {}
|
||||
```
|
22
data/lq.rnnn
Normal file
22
data/lq.rnnn
Normal file
File diff suppressed because one or more lines are too long
36
export_meta.py
Normal file
36
export_meta.py
Normal file
@ -0,0 +1,36 @@
|
||||
import os
|
||||
import argparse
|
||||
import zipfile
|
||||
|
||||
|
||||
def ftw(path):
    """Yield the path of every file found below ``path``.

    The tree rooted at ``path`` is traversed with ``os.walk``; each entry that
    ``os.walk`` reports as a file is yielded joined to the directory it was
    found in. Directories themselves are never yielded.

    :param path: Root directory of the walk.
    :return: Generator of file paths, each prefixed with ``path``.
    """
    for cur, _dirs, files in os.walk(path):
        for file in files:
            yield os.path.join(cur, file)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: collect every ".s2meta" file below the given
    # directory into "metadata.zip" (created in the current working
    # directory), keeping paths relative to that directory.

    parser = argparse.ArgumentParser(description="Export all s2meta into a zip file")
    parser.add_argument("input_file", nargs=1, help="Directory root")
    parser.add_argument('--json', dest="json", help="Rename .s2meta to .json", action="store_true")
    args = parser.parse_args()

    INPUT_FILE = args.input_file[0]
    RENAME_JSON = args.json
    META_SUFFIX = ".s2meta"

    # The context manager finalizes the archive even when adding a file fails,
    # so a partial run does not leave an unterminated zip behind.
    with zipfile.ZipFile("metadata.zip", "w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as archive:
        for filename in ftw(INPUT_FILE):
            if not filename.endswith(META_SUFFIX):
                continue

            relpath = os.path.relpath(filename, INPUT_FILE)

            if RENAME_JSON:
                # Swap only the trailing extension; str.replace would also
                # rewrite ".s2meta" occurring inside a directory name.
                relpath = relpath[:-len(META_SUFFIX)] + ".json"

            archive.write(filename, relpath)
|
||||
|
||||
|
@ -1,2 +1,5 @@
|
||||
boto3
|
||||
requests
|
||||
requests
|
||||
ffmpeg-python
|
||||
transformers
|
||||
torch
|
179
transcribe.py
179
transcribe.py
@ -1,58 +1,155 @@
|
||||
import argparse
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
from time import sleep, time
|
||||
import boto3
|
||||
import re
|
||||
import time
|
||||
import wave
|
||||
from io import BytesIO
|
||||
from time import sleep
|
||||
|
||||
import ffmpeg
|
||||
import librosa
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, HubertForCTC
|
||||
from denoiser import enhance
|
||||
|
||||
|
||||
def _get_audio_bytes(audio_filename):
    """Load ``audio_filename`` as mono samples resampled to 16 kHz.

    Despite the name, the value returned is the sample array produced by
    ``librosa.load`` (requested as ``float32``), not raw bytes.

    :param audio_filename: Path of the audio file to load.
    :return: The first element of the ``librosa.load`` result (the samples).
    """
    samples, _sample_rate = librosa.load(audio_filename, sr=16000, mono=True, dtype=np.float32)
    return samples
|
||||
|
||||
|
||||
def np_chunks(lst: np.ndarray, chunk_len: int):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_len`` items.

    Every slice has exactly ``chunk_len`` items except possibly the last one,
    which holds whatever remains. An empty ``lst`` yields nothing.

    :param lst: Array (or any sliceable sequence) to split.
    :param chunk_len: Maximum number of items per slice.
    :raises ValueError: If ``chunk_len`` is zero (rejected by ``range``).
    """
    for i in range(0, len(lst), chunk_len):
        yield lst[i:i + chunk_len]
|
||||
|
||||
|
||||
def normalize_audio(input_file):
    """Convert ``input_file`` to a mono, 16 kHz, 16-bit PCM WAV via ffmpeg.

    The conversion is written to ffmpeg's stdout ("pipe:1") and captured in
    memory rather than written to disk.

    :param input_file: Path of the audio file to convert.
    :return: The captured WAV data, or ``None`` when the conversion fails
        (the error, and ffmpeg's stderr when available, are printed).
    """
    kwargs = {
        "f": "WAV",
        "acodec": "pcm_s16le",
        "ac": 1,
        "ar": "16k",
        "loglevel": "error",
        # NOTE(review): a None value is presumably emitted by ffmpeg-python as
        # a bare "-hide_banner" flag -- confirm against the library.
        "hide_banner": None,
    }

    try:
        out, _err = ffmpeg.input(input_file).output(
            "pipe:1",
            **kwargs
        ).run(capture_stdout=True, capture_stderr=True)
    except Exception as e:
        print(e)
        # Only some exceptions carry ffmpeg's stderr; reading the attribute
        # unconditionally would raise AttributeError and hide the real error.
        stderr = getattr(e, "stderr", None)
        if stderr is not None:
            print(stderr)
        return None

    return out
|
||||
|
||||
|
||||
class SpeechToTextEngine:
    """Speech-to-text wrapper around a pretrained CTC model from transformers.

    A ``HubertForCTC`` model is loaded when the model name contains "hubert",
    otherwise a ``Wav2Vec2ForCTC`` model; a ``Wav2Vec2Processor`` is used in
    both cases to prepare the input and decode the output.
    """

    def __init__(self, model_name, device=None):
        """Load the processor and model and move the model to ``device``.

        :param model_name: Name passed to ``from_pretrained``.
        :param device: Torch device (name or object) to run on. Defaults to
            "cuda:0" when CUDA is available, otherwise "cpu".
        """
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        if "hubert" in model_name:
            self.model = HubertForCTC.from_pretrained(model_name)
        else:
            self.model = Wav2Vec2ForCTC.from_pretrained(model_name)

        self.model.to(self.device)

    def run(self, audio_filename, chunk_len=100_000):
        """Transcribe ``audio_filename`` and return the lower-cased text.

        The audio is processed in slices of ``chunk_len`` samples, one forward
        pass per slice; the decoded pieces are concatenated and every run of
        whitespace is collapsed to a single space.

        :param audio_filename: Path of the audio file to transcribe.
        :param chunk_len: Samples per forward pass. Tweak this number to
            avoid running out of memory.
        :return: The transcript as a single lower-case string.
        """
        audio = _get_audio_bytes(audio_filename)

        pieces = []

        # Inference only: without no_grad every forward pass would keep
        # autograd state alive, inflating memory use for no benefit.
        with torch.no_grad():
            for chunk in np_chunks(audio, chunk_len):
                input_values = self.processor(
                    chunk, return_tensors="pt", sampling_rate=16000
                ).input_values

                input_values = input_values.float().to(self.device)

                predicted_ids = torch.argmax(self.model(input_values).logits, dim=-1)

                pieces.append(self.processor.batch_decode(predicted_ids)[0])

        # Each piece is terminated by a newline, which the whitespace collapse
        # below turns into a single separating space.
        text = "".join(piece + "\n" for piece in pieces)

        return re.sub(r"\s+", " ", text).lower()
|
||||
|
||||
|
||||
class DenoiseArgs:
    """Plain attribute holder passed to ``enhance.enhance`` by ``denoise``.

    NOTE(review): the attribute names presumably mirror the command-line
    options of the denoiser package's enhance tool -- confirm against that
    package before changing or removing any of them.
    """

    def __init__(self):
        self.device = "cuda"
        self.dry = 0
        self.sample_rate = 16000
        self.num_workers = 32
        self.streaming = True
        # Same directory that denoise() writes its input WAV file into.
        self.noisy_dir = "/tmp/transcribe-denoise-input/"
        self.noisy_json = None
        # Of the three model flags only master64 is enabled; model_path is
        # left unset.
        self.dns64 = False
        self.dns48 = False
        self.master64 = True
        self.model_path = None
|
||||
|
||||
|
||||
# Scratch directories used by denoise(): the normalized WAV file is written to
# DENOISE_INPUT and DENOISE_OUTPUT is handed to the enhancer as its output
# directory.
DENOISE_INPUT = "/tmp/transcribe-denoise-input/"
DENOISE_OUTPUT = "/tmp/transcribe-denoise-output/"


def denoise(filepath):
    """Normalize ``filepath`` to WAV and run the denoiser over it.

    The normalized audio is stored as "file.wav" inside ``DENOISE_INPUT`` and
    ``enhance.enhance`` is then invoked with ``DENOISE_OUTPUT`` as its output
    directory. Both directories are created when missing.

    :param filepath: Path of the audio file to de-noise.
    :raises RuntimeError: If the audio could not be converted to WAV.
    """
    os.makedirs(DENOISE_INPUT, exist_ok=True)
    os.makedirs(DENOISE_OUTPUT, exist_ok=True)

    audio_bytes = normalize_audio(filepath)
    if audio_bytes is None:
        # normalize_audio reports failures by returning None; fail here with a
        # clear message instead of a TypeError from writing None to the file.
        raise RuntimeError(f"Could not convert {filepath!r} to WAV for de-noising")

    with open(os.path.join(DENOISE_INPUT, "file.wav"), "wb") as f:
        f.write(audio_bytes)

    enhance.enhance(DenoiseArgs(), local_out_dir=DENOISE_OUTPUT)

    print("denoise done")
|
||||
|
||||
import requests
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: transcribe one audio file with the local
    # speech-to-text model, optionally de-noising it first, then print the
    # elapsed time and the transcript.
    #
    # NOTE(review): this view of the script interleaved the AWS Transcribe
    # code path with the local-model one. The AWS path is kept in
    # transcribe_aws.py; only the local-model path is retained here.

    parser = argparse.ArgumentParser(description="Transcribe audio files")
    parser.add_argument("input_file", nargs=1, help="Audio file to transcribe")
    # Still accepted so existing invocations keep working, but no longer
    # required: local transcription does not upload anything.
    parser.add_argument("--bucket", dest="bucket", action='store',
                        help="S3 bucket name (unused here; see transcribe_aws.py)")
    parser.add_argument('--skip-denoise', dest="skip_denoise", help="Skip de-noising", action="store_true")
    args = parser.parse_args()

    INPUT_FILE = args.input_file[0]

    # Skip files that already have a metadata file next to them.
    if os.path.exists(INPUT_FILE + ".s2meta"):
        raise SystemExit(0)

    MODEL = "facebook/wav2vec2-large-960h"
    # MODEL = "facebook/hubert-large-ls960-ft"
    DENOISE_MODEL = "master64"

    DENOISE = not args.skip_denoise

    engine = SpeechToTextEngine(MODEL)

    start = time.time()

    if DENOISE:
        denoise(INPUT_FILE)
        enhanced_file = f"{DENOISE_OUTPUT}/file_enhanced.wav"
        try:
            result = engine.run(enhanced_file)
        finally:
            # Remove the enhanced file even when transcription fails so it is
            # not left behind for a later run.
            os.remove(enhanced_file)
    else:
        result = engine.run(INPUT_FILE)

    print(f"Took {time.time() - start:.2f}s")
    print(result)

    # NOTE(review): writing the metadata file is disabled in this revision;
    # the intended output is kept below for when it is re-enabled.
    # with open(INPUT_FILE + ".s2meta", "w") as f:
    #     f.write(json.dumps({
    #         "content": result,
    #         "_transcribed_by": MODEL,
    #         "_denoised_by": DENOISE_MODEL,
    #     }))
|
||||
|
58
transcribe_aws.py
Normal file
58
transcribe_aws.py
Normal file
@ -0,0 +1,58 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from time import sleep, time
|
||||
import boto3
|
||||
|
||||
import requests
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: upload one audio file to S3, transcribe it
    # with AWS Transcribe, store the transcript next to the input file as
    # "<input>.s2meta", and remove the temporary S3 object and the job.
    import uuid

    parser = argparse.ArgumentParser(description="Transcribe audio files")
    parser.add_argument("input_file", nargs=1, help="Audio file to transcribe")
    parser.add_argument("--bucket", dest="bucket", action='store', help="S3 bucket name", required=True)
    args = parser.parse_args()

    INPUT_FILE = args.input_file[0]
    BUCKET = args.bucket

    # Skip files that already have a metadata file next to them.
    if os.path.exists(INPUT_FILE + ".s2meta"):
        raise SystemExit(0)

    transcribe = boto3.client("transcribe")
    s3 = boto3.client("s3")

    # The timestamp alone has one-second resolution, so instances started in
    # parallel could pick the same job name / S3 key; the random suffix keeps
    # them apart.
    job_name = "sist2-transcribe-%d-%s" % (int(time()), uuid.uuid4().hex)

    s3.upload_file(INPUT_FILE, BUCKET, job_name)

    job_uri = "s3://%s/%s" % (BUCKET, job_name)

    job_started = False
    try:
        transcribe.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={"MediaFileUri": job_uri},
            # Lower-cased so that e.g. "SONG.MP3" is submitted as "mp3".
            MediaFormat=os.path.splitext(INPUT_FILE)[1][1:].lower(),
            LanguageCode="en-US",
            Settings={
                "VocabularyFilterMethod": "tag"
            }
        )
        job_started = True

        # Poll every five seconds until the job reaches a final state.
        while True:
            status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
            job = status["TranscriptionJob"]
            if job["TranscriptionJobStatus"] in ["COMPLETED", "FAILED"]:
                break
            sleep(5)

        if job["TranscriptionJobStatus"] == "FAILED":
            # A failed job has no transcript to download; report the reason
            # instead of failing on the missing key.
            raise RuntimeError("Transcription of %s failed: %s" % (
                INPUT_FILE, job.get("FailureReason", "unknown reason")
            ))

        transcript_url = job["Transcript"]["TranscriptFileUri"]
        r = requests.get(transcript_url, timeout=60)
        r.raise_for_status()
        transcript = r.json()["results"]["transcripts"][0]["transcript"]
    finally:
        # Always clean up the remote resources, including after a failure.
        s3.delete_object(Bucket=BUCKET, Key=job_name)
        if job_started:
            transcribe.delete_transcription_job(TranscriptionJobName=job_name)

    with open(INPUT_FILE + ".s2meta", "w") as f:
        f.write(json.dumps({
            "content": transcript,
            "_transcribed_by": "AWS/Transcribe"
        }))
|
Loading…
x
Reference in New Issue
Block a user