import random
import re
import string

import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Processor used to turn input text into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, moved to the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)

# HiFi-GAN vocoder handed to `generate_speech`, moved to the selected device.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Dataset of x-vectors; its rows supply the speaker embeddings.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# Speaker name -> row index in the embeddings dataset.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)

# Path separators, characters rejected by common filesystems, and control
# characters; each occurrence is replaced so the name stays a single valid file.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def _build_output_filename(text, speaker=None):
    """Return an ``.mp3`` filename built from the speaker and the first words of *text*."""
    if speaker is not None:
        # a known speaker: its ID identifies the file
        prefix = str(speaker)
    else:
        # a random voice: use a random 5-character tag instead
        prefix = "".join(random.sample(string.ascii_letters + string.digits, k=5))
    # at most the first six whitespace-separated words, joined by dashes
    slug = "-".join(text.split()[:6])
    # words come straight from user text, so neutralise anything path-like
    slug = _UNSAFE_FILENAME_CHARS.sub("_", slug)
    return f"{prefix}-{slug}.mp3"


def save_text_to_speech(text, speaker=None):
    """Synthesize speech for *text* and write it to an ``.mp3`` file.

    Args:
        text: The text to speak. Its first six words also become part of
            the output filename.
        speaker: Index into ``embeddings_dataset`` whose ``"xvector"`` entry
            is used as the speaker embedding. When ``None``, a random
            ``(1, 512)`` vector is drawn instead, giving a random voice.

    Returns:
        The name of the file that was written (a relative path, so it is
        created in the current working directory).
    """
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load the x-vector for this speaker and add a leading batch dimension
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    output_filename = _build_output_filename(text, speaker)
    # save the generated speech to a file with a 16 kHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename

# generate speech with a US female voice
save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
# generate speech with a random voice
save_text_to_speech("Python is my favorite programming language")

# a challenging text with all speakers
text = (
    "In his miracle year, he published four groundbreaking papers.  \n"
    "These outlined the theory of the photoelectric effect, explained Brownian motion,  \n"
    "introduced special relativity, and demonstrated mass-energy equivalence."
)

# every known speaker in turn, then None for a random speaker
for speaker in [*speakers.values(), None]:
    output_filename = save_text_to_speech(text, speaker)
    print(f"Saved {output_filename}  ")