aswcenter
diff --git a/‎machine-learning/text-to-speech/6799-In-his-miracle-year,-he-published.mp3‎
72 KB b/‎machine-learning/text-to-speech/6799-In-his-miracle-year,-he-published.mp3‎
72 KB
diff --git a/‎machine-learning/text-to-speech/requirements‎
Lines changed: 0 additions & 3 deletions b/‎machine-learning/text-to-speech/requirements‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎machine-learning/text-to-speech/requirements.txt‎
Lines changed: 7 additions & 0 deletions b/‎machine-learning/text-to-speech/requirements.txt‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎machine-learning/text-to-speech/tts_transformers.py‎
Lines changed: 67 additions & 0 deletions b/‎machine-learning/text-to-speech/tts_transformers.py‎
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,7 @@
+pyttsx3
+gTTS
+playsound
+soundfile
+transformers
+datasets
+sentencepiece
@@ -0,0 +1,67 @@
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import torch
+import random
+import string
+import soundfile as sf
+
+# run on the GPU when torch can see one, otherwise fall back to the CPU
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+# the processor turns raw text into model inputs
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+# the text-to-speech model itself, moved to the selected device
+model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+model = model.to(device)
+# the HiFi-GAN vocoder, kept on the same device as the model
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+vocoder = vocoder.to(device)
+# this dataset is only needed for its speaker embeddings (x-vectors)
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+
+# speaker name -> index into embeddings_dataset of an x-vector entry for that speaker
+speakers = {
+    'awb': 0,     # Scottish male
+    'bdl': 1138,  # US male
+    'clb': 2271,  # US female
+    'jmk': 3403,  # Canadian male
+    'ksp': 4535,  # Indian male
+    'rms': 5667,  # US male
+    'slt': 6799   # US female
+}
+
+def save_text_to_speech(text, speaker=None):
+    """Synthesize `text` to speech, write it to an audio file and return the filename.
+
+    Args:
+        text: the text to synthesize.
+        speaker: index into `embeddings_dataset` of the x-vector that defines
+            the voice; when None, a random 512-dimensional vector is used,
+            which gives a random voice.
+
+    Returns:
+        The name of the file that was written, relative to the current working
+        directory: the speaker index (or a random 5-character string when no
+        speaker is given), followed by the first six words of `text` joined
+        with dashes, with the `.mp3` extension.
+    """
+    # preprocess text
+    inputs = processor(text=text, return_tensors="pt").to(device)
+    if speaker is not None:
+        # load xvector containing speaker's voice characteristics from a dataset
+        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
+    else:
+        # random vector, meaning a random voice
+        speaker_embeddings = torch.randn((1, 512)).to(device)
+    # generate speech with the models
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+    if speaker is not None:
+        # if we have a speaker, we use the speaker's ID in the filename
+        prefix = speaker
+    else:
+        # if we don't have a speaker, we use a random string in the filename
+        prefix = ''.join(random.sample(string.ascii_letters+string.digits, k=5))
+    # the first six words of the text make the file easy to recognize
+    words = '-'.join(text.split()[:6])
+    # path separators, characters not allowed in Windows file names and control
+    # characters are replaced, so that text such as "1/2" cannot point the
+    # output at a directory that does not exist; other punctuation is kept
+    safe_words = ''.join(
+        '_' if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch
+        for ch in words
+    )
+    output_filename = f"{prefix}-{safe_words}.mp3"
+    # save the generated speech to a file with 16KHz sampling rate
+    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
+    # return the filename for reference
+    return output_filename
+
+# one short sentence, first with the US female voice ...
+save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
+# ... and then with a random voice
+save_text_to_speech("Python is my favorite programming language")
+
+# a longer, more challenging passage, synthesized once per known speaker
+text = """In his miracle year, he published four groundbreaking papers. 
+These outlined the theory of the photoelectric effect, explained Brownian motion, 
+introduced special relativity, and demonstrated mass-energy equivalence."""
+
+for speaker in speakers.values():
+    output_filename = save_text_to_speech(text, speaker)
+    print(f"Saved {output_filename}")
+# and one more time with a random speaker
+output_filename = save_text_to_speech(text)
+print(f"Saved {output_filename}")