ayush2607 committed on
Commit
7d73628
·
verified ·
1 Parent(s): 76aa730

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -0
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

import gradio as gr
import numpy as np
import torch
from datasets import Audio, load_dataset
from speechbrain.inference import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
9
+ # Load models and processor
10
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
11
+ model = SpeechT5ForTextToSpeech.from_pretrained("ayush2607/speecht5_tts_technical_data")
12
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
+
14
+ # Load speaker encoder
15
+ device = "cuda" if torch.cuda.is_available() else "cpu"
16
+ speaker_model = EncoderClassifier.from_hparams(
17
+ source="speechbrain/spkrec-xvect-voxceleb",
18
+ run_opts={"device": device},
19
+ savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb")
20
+ )
21
+
22
+ # Load a sample from the dataset for speaker embedding
23
+ dataset = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
24
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
25
+ sample = dataset[0]
26
+
27
+ def create_speaker_embedding(waveform):
28
+ with torch.no_grad():
29
+ speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
30
+ speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
31
+ speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
32
+ return speaker_embeddings
33
+
34
+ # Create a speaker embedding from the sample
35
+ speaker_embedding = create_speaker_embedding(sample['audio']['array'])
36
+
37
+ def text_to_speech(text):
38
+ # Clean up text
39
+ replacements = [
40
+ ('$', 'dollar'), ('%', 'percent'), ('&', 'and'), ('*', 'asterick'),
41
+ ('+', 'plus'), ('1', 'one'), ('2', 'two'), ('3', 'three'), ('4', 'four'),
42
+ ('5', 'five'), ('6', 'six'), ('7', 'seven'), ('8', 'eight'), ('9', 'nine'),
43
+ ('0', 'zero'), ('@', 'at'), ('\n', ' '), ('\xa0', ' '), (',', ' '),
44
+ ('"', '"'), ('"', '"'),
45
+ ]
46
+ for src, dst in replacements:
47
+ text = text.replace(src, dst)
48
+
49
+ inputs = processor(text=text, return_tensors="pt")
50
+ speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
51
+ return (16000, speech.numpy())
52
+
53
+ iface = gr.Interface(
54
+ fn=text_to_speech,
55
+ inputs="text",
56
+ outputs="audio",
57
+ title="Technical Text-to-Speech",
58
+ description="Enter technical text to convert to speech. The model has been fine-tuned on technical data."
59
+ )
60
+
61
+ iface.launch()