import gradio as gr
import torch
import torchaudio
from huggingface_hub import snapshot_download
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

model_id = "wbbbbb/wav2vec2-large-chinese-zh-cn" |
|
|
|
model = snapshot_download(repo_id=model_id, cache_dir='cache') |
|
processor = Wav2Vec2Processor.from_pretrained(model_id) |
|
|
|
def transcribe(audio):
    # Gradio's default audio component yields a (sample_rate, int16 numpy array) tuple.
    sample_rate, data = audio
    # Scale int16 PCM to float32 and resample to the 16 kHz the model expects.
    waveform = torch.from_numpy(data).float() / 32768.0
    if sample_rate != 16_000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16_000)
    inputs = processor(waveform.numpy(), sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # Greedy CTC decoding: pick the most likely token per frame; the tokenizer
    # collapses repeats and blanks into the final transcript.
    prediction = processor.batch_decode(torch.argmax(logits, dim=-1))
    return prediction[0]


iface = gr.Interface(fn=transcribe, inputs="audio", outputs="text")
iface.launch()