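# Spaces: Paused / Paused  (page-status text that appears to have been captured together with this file; not program code)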
import os
import re
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from cached_path import cached_path
from num2words import num2words
from transformers import AutoModelForCausalLM, AutoTokenizer
# `spaces` is an optional dependency: remember whether it could be imported
# so that `gpu_decorator` knows if `spaces.GPU` is available.
try:
    import spaces
except ImportError:
    USING_SPACES = False
else:
    USING_SPACES = True


def gpu_decorator(func):
    """Wrap ``func`` with ``spaces.GPU`` when ``spaces`` is importable.

    Args:
        func: The callable to (optionally) wrap.

    Returns:
        ``spaces.GPU(func)`` if the ``spaces`` import succeeded, otherwise
        ``func`` itself, unchanged.
    """
    return spaces.GPU(func) if USING_SPACES else func
from f5_tts.model import DiT, UNetT | |
from f5_tts.infer.utils_infer import ( | |
load_vocoder, | |
load_model, | |
preprocess_ref_audio_text, | |
infer_process, | |
remove_silence_for_generated_wav, | |
save_spectrogram, | |
) | |
# The vocoder is created once at import time and passed to `infer_process`
# on every synthesis request.
vocoder = load_vocoder()

# Load models
F5TTS_model_cfg = {
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
}
F5TTS_ema_model = load_model(
    DiT,
    F5TTS_model_cfg,
    str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors")),
)

# Placeholders for a chat model/tokenizer; nothing in the visible code assigns
# them a non-None value.
chat_model_state = chat_tokenizer_state = None
def generate_response(messages, model, tokenizer):
    """Generate a response using Qwen.

    Args:
        messages: Chat messages, passed as-is to
            ``tokenizer.apply_chat_template``.
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.

    Returns:
        The first decoded completion returned by ``tokenizer.batch_decode``.
    """
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)
    sequences = model.generate(
        **encoded,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )

    # Drop the first len(input_ids) tokens of every output sequence (the part
    # treated as the echoed prompt) so that only the remainder is decoded.
    completions = []
    for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
        completions.append(full_ids[len(prompt_ids):])

    decoded = tokenizer.batch_decode(completions, skip_special_tokens=True)
    return decoded[0]
def traducir_numero_a_texto(texto):
    """Replace every standalone integer in ``texto`` with its Spanish words.

    A space is first inserted wherever a letter touches a digit (on either
    side), so numbers glued to words become standalone tokens. Each run of
    digits delimited by word boundaries is then converted with
    ``num2words(..., lang="es")``.

    Args:
        texto: Input text.

    Returns:
        The text with letter/digit boundaries spaced out and integers spelled
        out in Spanish.
    """
    # Include the accented letters used in Spanish: with a plain `A-Za-z`
    # class, a number glued to "ñ", "á", etc. is never separated, so `\b\d+\b`
    # below cannot match it and the digits are left unconverted.
    letra = r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]"
    texto_separado = re.sub(rf"({letra})(\d)", r"\1 \2", texto)
    texto_separado = re.sub(rf"(\d)({letra})", r"\1 \2", texto_separado)

    def reemplazar_numero(match):
        # `match.group()` is a run of digits, so `int` cannot fail here.
        return num2words(int(match.group()), lang="es")

    return re.sub(r"\b\d+\b", reemplazar_numero, texto_separado)
# `gpu_decorator` is a no-op unless the optional `spaces` package is installed;
# when it is, the synthesis entry point is wrapped with `spaces.GPU`.
@gpu_decorator
def infer(
    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
):
    """Synthesize ``gen_text`` in the voice of the reference audio.

    Args:
        ref_audio_orig: Reference audio; the UI passes a file path.
        ref_text: Transcript of the reference audio, handed to
            ``preprocess_ref_audio_text`` together with the audio.
        gen_text: Text to synthesize. It is normalised to start with a space
            and end with ". ", lower-cased, and has its integers spelled out
            in Spanish before synthesis.
        model: Model name selected in the UI. Currently unused: the module-level
            ``F5TTS_ema_model`` is always used.
        remove_silence: When truthy, the generated audio is round-tripped
            through a temporary WAV file and ``remove_silence_for_generated_wav``.
        cross_fade_duration: Forwarded to ``infer_process``.
        speed: Forwarded to ``infer_process``.
        show_info: Callable forwarded to the preprocessing and inference helpers.

    Returns:
        A tuple ``((sample_rate, waveform), spectrogram_path)`` where
        ``spectrogram_path`` is the path of a PNG written by ``save_spectrogram``.
    """
    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

    # Ensure the text ends with exactly ". ". Trailing whitespace is stripped
    # first so that text already ending in "." does not get a second period.
    gen_text = gen_text.rstrip()
    if not gen_text.endswith("."):
        gen_text += "."
    gen_text += " "
    if not gen_text.startswith(" "):
        gen_text = " " + gen_text

    gen_text = gen_text.lower()
    gen_text = traducir_numero_a_texto(gen_text)

    final_wave, final_sample_rate, combined_spectrogram = infer_process(
        ref_audio,
        ref_text,
        gen_text,
        F5TTS_ema_model,
        vocoder,
        cross_fade_duration=cross_fade_duration,
        speed=speed,
        show_info=show_info,
        progress=gr.Progress(),
    )

    # Remove silences
    if remove_silence:
        # Reserve a path, then close the handle before other libraries write
        # to and read from it by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            wav_path = f.name
        try:
            sf.write(wav_path, final_wave, final_sample_rate)
            remove_silence_for_generated_wav(wav_path)
            final_wave, _ = torchaudio.load(wav_path)
        finally:
            # The file was created with delete=False, so it must be removed
            # explicitly or one WAV would be leaked per request.
            os.remove(wav_path)
        final_wave = final_wave.squeeze().cpu().numpy()

    # Save spectrogram. The PNG is intentionally not deleted here: its path is
    # returned to the caller, which still needs to read the file.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
    save_spectrogram(combined_spectrogram, spectrogram_path)

    return (final_sample_rate, final_wave), spectrogram_path
# TTS tab: reference audio + text in, synthesized audio + spectrogram out.
# NOTE(review): indentation was lost in the captured source; the nesting below
# (four advanced controls inside the accordion, outputs outside it) is a
# reconstruction — confirm against the deployed layout.
with gr.Blocks() as app_tts:
    gr.Markdown("# TTS por Lotes")
    ref_audio_input = gr.Audio(type="filepath", label="Audio de Referencia")
    gen_text_input = gr.Textbox(lines=10, label="Texto para Generar")
    model_choice = gr.Radio(label="Seleccionar Modelo TTS", choices=["F5-TTS"], value="F5-TTS")
    generate_btn = gr.Button("Sintetizar", variant="primary")

    with gr.Accordion("Configuraciones Avanzadas", open=False):
        ref_text_input = gr.Textbox(
            lines=2,
            label="Texto de Referencia",
            info="Deja en blanco para transcribir automáticamente el audio de referencia. Si ingresas texto, sobrescribirá la transcripción automática.",
        )
        remove_silence = gr.Checkbox(
            value=False,
            label="Eliminar Silencios",
            info="El modelo tiende a producir silencios, especialmente en audios más largos. Podemos eliminar manualmente los silencios si es necesario. Ten en cuenta que esta es una característica experimental y puede producir resultados extraños. Esto también aumentará el tiempo de generación.",
        )
        speed_slider = gr.Slider(
            minimum=0.3,
            maximum=2.0,
            step=0.1,
            value=1.0,
            label="Velocidad",
            info="Ajusta la velocidad del audio.",
        )
        cross_fade_duration_slider = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            step=0.01,
            value=0.15,
            label="Duración del Cross-Fade (s)",
            info="Establece la duración del cross-fade entre clips de audio.",
        )

    audio_output = gr.Audio(label="Audio Sintetizado")
    spectrogram_output = gr.Image(label="Espectrograma")

    # The inputs are listed in the order of `infer`'s positional parameters:
    # ref_audio_orig, ref_text, gen_text, model, remove_silence,
    # cross_fade_duration, speed.
    generate_btn.click(
        infer,
        outputs=[audio_output, spectrogram_output],
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            model_choice,
            remove_silence,
            cross_fade_duration_slider,
            speed_slider,
        ],
    )
# Top-level application: a heading followed by a tabbed container whose only
# tab is the TTS interface defined in `app_tts`.
with gr.Blocks() as app:
    gr.Markdown(
        """
# Spanish-F5
Esta es una interfaz web para F5 TTS, con un finetuning para poder hablar en castellano.
"""
    )
    gr.TabbedInterface([app_tts], ["TTS"])


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    queued_app = app.queue()
    queued_app.launch()