"""Gradio demo that transcribes Mandarin speech with a wav2vec2 CTC model."""

import sys

import gradio as gr
import numpy as np
import torch
import transformers
from huggingface_hub import snapshot_download
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Debug aid kept from the original script: shows where packages resolve from.
print(sys.path)

model_id = "wbbbbb/wav2vec2-large-chinese-zh-cn"

# Sampling rate passed to the processor on every call.
TARGET_SAMPLE_RATE = 16_000

# snapshot_download returns the local directory holding the repository files,
# not a model object, so the weights still have to be loaded from it.
model_dir = snapshot_download(repo_id=model_id, cache_dir='cache')
processor = Wav2Vec2Processor.from_pretrained(model_dir)
model = Wav2Vec2ForCTC.from_pretrained(model_dir)
model.eval()


def _prepare_waveform(audio):
    """Convert Gradio audio into a mono float32 waveform at 16 kHz.

    Args:
        audio: Either a ``(sample_rate, samples)`` tuple as produced by
            Gradio's numpy audio input, or a bare array of samples that is
            assumed to already be sampled at ``TARGET_SAMPLE_RATE``.

    Returns:
        A 1-D ``numpy.float32`` array sampled at ``TARGET_SAMPLE_RATE``.
    """
    if isinstance(audio, tuple):
        sample_rate, samples = audio
    else:
        sample_rate, samples = TARGET_SAMPLE_RATE, audio

    samples = np.asarray(samples)

    # Integer PCM (e.g. int16 from the browser) is scaled into [-1.0, 1.0].
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    # Multi-channel input is laid out as (samples, channels); mix down to mono.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if sample_rate != TARGET_SAMPLE_RATE and samples.size:
        # Linear interpolation keeps this dependency-free. It applies no
        # anti-aliasing filter, so a dedicated resampler would give better
        # quality when downsampling from much higher rates.
        target_len = max(
            1, int(round(samples.size * TARGET_SAMPLE_RATE / sample_rate))
        )
        source_times = np.arange(samples.size) / sample_rate
        target_times = np.arange(target_len) / TARGET_SAMPLE_RATE
        samples = np.interp(target_times, source_times, samples).astype(
            np.float32
        )

    return samples


def transcribe(audio):
    """Speech recognition interface: return the transcript of ``audio``.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the Gradio audio input,
            a bare 16 kHz sample array, or ``None`` when nothing was
            recorded or uploaded.

    Returns:
        The decoded transcript, or an empty string when there is no audio.
    """
    if audio is None:
        return ""

    waveform = _prepare_waveform(audio)
    if waveform.size == 0:
        return ""

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # Greedy CTC decoding: most likely token per frame, collapsed by the
    # processor's tokenizer.
    prediction = processor.batch_decode(torch.argmax(logits, dim=-1))
    return prediction[0]


iface = gr.Interface(fn=transcribe, inputs="audio", outputs="text")

if __name__ == "__main__":
    iface.launch()