Spaces:
Paused
Paused
Update chat_ai.py
Browse files- chat_ai.py +62 -84
chat_ai.py
CHANGED
@@ -1,13 +1,9 @@
|
|
1 |
-
import re
|
2 |
-
import tempfile
|
3 |
-
|
4 |
import gradio as gr
|
|
|
5 |
import numpy as np
|
6 |
-
import soundfile as sf
|
7 |
import torchaudio
|
8 |
-
|
9 |
-
from transformers import
|
10 |
-
|
11 |
from f5_tts.model import DiT
|
12 |
from f5_tts.infer.utils_infer import (
|
13 |
load_vocoder,
|
@@ -17,27 +13,21 @@ from f5_tts.infer.utils_infer import (
|
|
17 |
remove_silence_for_generated_wav,
|
18 |
save_spectrogram,
|
19 |
)
|
|
|
|
|
20 |
|
21 |
-
#
|
22 |
-
from cached_path import cached_path
|
23 |
-
|
24 |
-
# Decorador GPU para Spaces o local
|
25 |
-
def gpu_decorator(func):
|
26 |
-
return func # Simplemente devuelve la función, ajusta según tu entorno si usas Hugging Face Spaces
|
27 |
-
|
28 |
-
# Cargar el vocoder
|
29 |
vocoder = load_vocoder()
|
30 |
-
|
31 |
-
# Configuración y carga del modelo F5TTS
|
32 |
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
33 |
F5TTS_ema_model = load_model(
|
34 |
-
DiT, F5TTS_model_cfg,
|
35 |
)
|
36 |
|
37 |
def traducir_numero_a_texto(texto):
|
|
|
38 |
texto_separado = re.sub(r'([A-Za-z])(\d)', r'\1 \2', texto)
|
39 |
texto_separado = re.sub(r'(\d)([A-Za-z])', r'\1 \2', texto_separado)
|
40 |
-
|
41 |
def reemplazar_numero(match):
|
42 |
numero = match.group()
|
43 |
return num2words(int(numero), lang='es')
|
@@ -46,35 +36,24 @@ def traducir_numero_a_texto(texto):
|
|
46 |
|
47 |
return texto_traducido
|
48 |
|
49 |
-
@gpu_decorator
|
50 |
def infer(
|
51 |
-
ref_audio_orig, ref_text, gen_text,
|
52 |
):
|
53 |
-
|
|
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
if not gen_text.startswith(" "):
|
58 |
-
gen_text = " " + gen_text
|
59 |
-
if not gen_text.endswith(". "):
|
60 |
-
gen_text += ". "
|
61 |
-
|
62 |
-
gen_text = gen_text.lower()
|
63 |
-
gen_text = traducir_numero_a_texto(gen_text)
|
64 |
|
65 |
final_wave, final_sample_rate, combined_spectrogram = infer_process(
|
66 |
ref_audio,
|
67 |
ref_text,
|
68 |
gen_text,
|
69 |
-
|
70 |
vocoder,
|
71 |
cross_fade_duration=cross_fade_duration,
|
72 |
speed=speed,
|
73 |
-
show_info=show_info,
|
74 |
-
progress=gr.Progress(),
|
75 |
)
|
76 |
|
77 |
-
# Eliminar silencios
|
78 |
if remove_silence:
|
79 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
80 |
sf.write(f.name, final_wave, final_sample_rate)
|
@@ -82,65 +61,64 @@ def infer(
|
|
82 |
final_wave, _ = torchaudio.load(f.name)
|
83 |
final_wave = final_wave.squeeze().cpu().numpy()
|
84 |
|
85 |
-
# Guardar
|
86 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
|
87 |
spectrogram_path = tmp_spectrogram.name
|
88 |
save_spectrogram(combined_spectrogram, spectrogram_path)
|
89 |
|
90 |
return (final_sample_rate, final_wave), spectrogram_path
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
ref_text_input = gr.Textbox(
|
101 |
-
label="Texto de Referencia",
|
102 |
-
info="Deja en blanco para transcribir automáticamente el audio de referencia. Si ingresas texto, sobrescribirá la transcripción automática.",
|
103 |
-
lines=2,
|
104 |
-
)
|
105 |
-
remove_silence = gr.Checkbox(
|
106 |
-
label="Eliminar Silencios",
|
107 |
-
info="El modelo tiende a producir silencios, especialmente en audios más largos. Podemos eliminar manualmente los silencios si es necesario. Ten en cuenta que esta es una característica experimental y puede producir resultados extraños. Esto también aumentará el tiempo de generación.",
|
108 |
-
value=False,
|
109 |
-
)
|
110 |
-
speed_slider = gr.Slider(
|
111 |
-
label="Velocidad",
|
112 |
-
minimum=0.3,
|
113 |
-
maximum=2.0,
|
114 |
-
value=1.0,
|
115 |
-
step=0.1,
|
116 |
-
info="Ajusta la velocidad del audio.",
|
117 |
-
)
|
118 |
-
cross_fade_duration_slider = gr.Slider(
|
119 |
-
label="Duración del Cross-Fade (s)",
|
120 |
-
minimum=0.0,
|
121 |
-
maximum=1.0,
|
122 |
-
value=0.15,
|
123 |
-
step=0.01,
|
124 |
-
info="Establece la duración del cross-fade entre clips de audio.",
|
125 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
-
|
128 |
-
|
|
|
129 |
|
130 |
generate_btn.click(
|
131 |
-
|
132 |
-
inputs=[
|
133 |
-
ref_audio_input,
|
134 |
-
ref_text_input,
|
135 |
-
gen_text_input,
|
136 |
-
model_choice,
|
137 |
-
remove_silence,
|
138 |
-
cross_fade_duration_slider,
|
139 |
-
speed_slider,
|
140 |
-
],
|
141 |
outputs=[audio_output, spectrogram_output],
|
142 |
)
|
143 |
|
144 |
-
# Ejecutar la aplicación
|
145 |
-
|
146 |
-
app_tts.launch()
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
import tempfile
|
3 |
import numpy as np
|
|
|
4 |
import torchaudio
|
5 |
+
import soundfile as sf
|
6 |
+
from transformers import AutoTokenizer
|
|
|
7 |
from f5_tts.model import DiT
|
8 |
from f5_tts.infer.utils_infer import (
|
9 |
load_vocoder,
|
|
|
13 |
remove_silence_for_generated_wav,
|
14 |
save_spectrogram,
|
15 |
)
|
16 |
+
from num2words import num2words
|
17 |
+
import re
|
18 |
|
19 |
+
# Cargar vocoder y modelo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
# Load the vocoder and the Spanish F5-TTS model once, at import time.
# `cached_path` resolves the hf:// URI to a locally cached file.
from cached_path import cached_path

vocoder = load_vocoder()

# DiT hyper-parameters for the checkpoint loaded below.
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)

# NOTE(review): load_model is expected to receive a local checkpoint path,
# not an hf:// URI, so the URI is downloaded/resolved first — confirm against
# the installed f5_tts version.
F5TTS_ema_model = load_model(
    DiT,
    F5TTS_model_cfg,
    str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors")),
)
|
25 |
|
26 |
def traducir_numero_a_texto(texto):
|
27 |
+
"""Convierte números a palabras en el texto."""
|
28 |
texto_separado = re.sub(r'([A-Za-z])(\d)', r'\1 \2', texto)
|
29 |
texto_separado = re.sub(r'(\d)([A-Za-z])', r'\1 \2', texto_separado)
|
30 |
+
|
31 |
def reemplazar_numero(match):
|
32 |
numero = match.group()
|
33 |
return num2words(int(numero), lang='es')
|
|
|
36 |
|
37 |
return texto_traducido
|
38 |
|
|
|
39 |
def infer(
|
40 |
+
ref_audio_orig, ref_text, gen_text, remove_silence=False, cross_fade_duration=0.15, speed=1.0
|
41 |
):
|
42 |
+
"""Realiza la inferencia para convertir texto en voz."""
|
43 |
+
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
|
44 |
|
45 |
+
gen_text = traducir_numero_a_texto(gen_text.lower())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
final_wave, final_sample_rate, combined_spectrogram = infer_process(
|
48 |
ref_audio,
|
49 |
ref_text,
|
50 |
gen_text,
|
51 |
+
F5TTS_ema_model,
|
52 |
vocoder,
|
53 |
cross_fade_duration=cross_fade_duration,
|
54 |
speed=speed,
|
|
|
|
|
55 |
)
|
56 |
|
|
|
57 |
if remove_silence:
|
58 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
59 |
sf.write(f.name, final_wave, final_sample_rate)
|
|
|
61 |
final_wave, _ = torchaudio.load(f.name)
|
62 |
final_wave = final_wave.squeeze().cpu().numpy()
|
63 |
|
64 |
+
# Guardar espectrograma
|
65 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
|
66 |
spectrogram_path = tmp_spectrogram.name
|
67 |
save_spectrogram(combined_spectrogram, spectrogram_path)
|
68 |
|
69 |
return (final_sample_rate, final_wave), spectrogram_path
|
70 |
|
71 |
+
def tts_pipeline(ref_audio, ref_text, gen_text, remove_silence, speed):
    """Run text-to-speech for the Gradio interface.

    Args:
        ref_audio: Reference audio value from the UI; a falsy value means
            nothing was uploaded.
        ref_text: Optional transcript of the reference audio.
        gen_text: Text to synthesize.
        remove_silence: Forwarded to ``infer``.
        speed: Forwarded to ``infer``.

    Returns:
        A ``(audio_path, spectrogram_path)`` tuple: the path of a temporary
        WAV file holding the generated audio, and the spectrogram image path
        returned by ``infer``.

    Raises:
        gr.Error: If no reference audio was provided or generation failed.
            The second output of this function is wired to an image
            component, so a message string must not be returned in its place.
    """
    if not ref_audio:
        raise gr.Error("Por favor sube un audio de referencia.")

    try:
        (sample_rate, audio), spectrogram_path = infer(
            ref_audio, ref_text, gen_text, remove_silence=remove_silence, speed=speed
        )
        # Only reserve a file name here; close the handle before soundfile
        # reopens the path (an open handle blocks the write on some platforms).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
            audio_path = tmp_audio.name
        sf.write(audio_path, audio, sample_rate)
    except Exception as e:
        # UI boundary: surface the failure to the user and keep the cause chained.
        raise gr.Error(f"Error al generar audio: {e}") from e

    return audio_path, spectrogram_path
|
85 |
+
|
86 |
+
# Crear interfaz con Gradio
|
87 |
+
# Introductory Markdown shown at the top of the page.
_INTRO_MARKDOWN = """# Conversión de Texto a Voz (TTS) en Español

Convierte texto en audio en español usando un modelo de TTS. Proporciona un audio de referencia y el texto a convertir.

**Instrucciones:**
1. Sube un audio de referencia (formato WAV o MP3, de 11 a 14 segundos).
2. Opcionalmente, ingresa el texto correspondiente al audio de referencia.
3. Escribe el texto que deseas convertir a voz.
4. Haz clic en "Generar Audio".

*Nota: Los números en el texto serán convertidos automáticamente a palabras.*
"""

# Build the Gradio interface. Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Reference voice: audio clip plus its optional transcript.
    with gr.Row():
        ref_audio = gr.Audio(
            label="Audio de Referencia",
            type="filepath",
        )
        ref_text = gr.Textbox(
            label="Texto de Referencia (Opcional)",
            placeholder="Transcripción del audio de referencia",
        )

    # Text to synthesize.
    gen_text = gr.Textbox(
        label="Texto para Convertir a Voz",
        lines=4,
        placeholder="Escribe aquí el texto a convertir",
    )

    # Generation options.
    with gr.Row():
        remove_silence = gr.Checkbox(
            label="Eliminar Silencios",
            value=False,
        )
        speed = gr.Slider(
            label="Velocidad",
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
        )

    generate_btn = gr.Button("Generar Audio")

    # Results: generated audio and its spectrogram.
    with gr.Row():
        audio_output = gr.Audio(
            label="Audio Generado",
            type="filepath",
        )
        spectrogram_output = gr.Image(label="Espectrograma")

    # Wire the button to the pipeline; input order matches tts_pipeline's parameters.
    generate_btn.click(
        fn=tts_pipeline,
        inputs=[ref_audio, ref_text, gen_text, remove_silence, speed],
        outputs=[audio_output, spectrogram_output],
    )

# Launch the application (this is what runs on Spaces).
demo.launch()
|
|