1+ from transformers import SpeechT5Processor , SpeechT5ForTextToSpeech , SpeechT5HifiGan
2+ from datasets import load_dataset
3+ import torch
4+ import random
5+ import string
6+ import soundfile as sf
7+
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Processor that turns raw text into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, moved to the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# HiFi-GAN vocoder, handed to model.generate_speech() when synthesizing.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Dataset whose rows carry the "xvector" speaker embeddings.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
17+
# Maps each speaker's short name to its id (row index) in the embeddings dataset.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)
28+
def _build_output_filename(text, speaker=None):
    """Build the output file name for a piece of synthesized speech.

    The name has the form ``<prefix>-<words>.mp3``. The prefix is the speaker
    id, or five random alphanumeric characters when ``speaker`` is None. The
    words are the first six whitespace-separated words of ``text``, joined by
    ``-``. Characters other than letters, digits, ``-`` and ``_`` are dropped
    so the name can never contain a path separator or stray punctuation.
    """
    if speaker is not None:
        prefix = str(speaker)
    else:
        prefix = "".join(random.sample(string.ascii_letters + string.digits, k=5))
    words = []
    for word in text.split()[:6]:
        cleaned = "".join(ch for ch in word if ch.isalnum() or ch in "-_")
        if cleaned:
            words.append(cleaned)
    slug = "-".join(words)
    return f"{prefix}-{slug}.mp3"


def save_text_to_speech(text, speaker=None):
    """Synthesize ``text`` to speech and write it to an audio file.

    Args:
        text: The text to speak.
        speaker: Index into ``embeddings_dataset`` of the x-vector to use as
            the voice. When None, a random vector of shape (1, 512) is used,
            giving a random voice.

    Returns:
        The name of the file that was written.
    """
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # x-vector holding the chosen speaker's voice characteristics
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512))
    speaker_embeddings = speaker_embeddings.to(device)
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    output_filename = _build_output_filename(text, speaker)
    # NOTE(review): the ".mp3" extension is kept from the original; whether
    # soundfile can write MP3 presumably depends on the installed libsndfile
    # version - confirm, or switch to ".wav" if it fails.
    # save the generated speech to a file with 16KHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename
51+
# Short demo sentence: first with the "slt" (US female) voice, then with a random voice.
save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
save_text_to_speech("Python is my favorite programming language")

# A more challenging, multi-line text.
text = (
    "In his miracle year, he published four groundbreaking papers.\n"
    "These outlined the theory of the photoelectric effect, explained Brownian motion,\n"
    "introduced special relativity, and demonstrated mass-energy equivalence."
)

# Synthesize it once per known speaker, then once more with a random voice
# (None is the default value of the speaker argument).
for speaker in (*speakers.values(), None):
    output_filename = save_text_to_speech(text, speaker)
    print("Saved {} ".format(output_filename))
0 commit comments