Skip to content

Make MeloTTS logs English & add G2P debug toggle and SOLA algorithm #12

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 45 additions & 22 deletions projects/llm_framework/main_melotts/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "Lexicon.hpp"
#include <ax_sys_api.h>
#include "AudioFile.h"
#include "SolaProcessor.h"
#include "Lexicon.hpp"

#include <signal.h>
Expand Down Expand Up @@ -263,49 +264,71 @@ class llm_task {
auto encoder_output =
encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w,
mode_config_.get_length_scale(), mode_config_.sdp_ratio);
float *zp_data = encoder_output.at(0).GetTensorMutableData<float>();
int audio_len = encoder_output.at(2).GetTensorMutableData<int>()[0];
auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo();
auto zp_shape = zp_info.GetShape();
int zp_size = decoder_->GetInputSize(0) / sizeof(float);
int dec_len = zp_size / zp_shape[1];
int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float);
std::vector<float> decoder_output(audio_slice_len);
int dec_slice_num = int(std::ceil(zp_shape[2] * 1.0 / dec_len));
float *zp_data = encoder_output.at(0).GetTensorMutableData<float>();
int audio_len = encoder_output.at(2).GetTensorMutableData<int>()[0];
auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo();
auto zp_shape = zp_info.GetShape();

// Decoder parameters setup
int zp_size = decoder_->GetInputSize(0) / sizeof(float);
int dec_len = zp_size / zp_shape[1];
int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float);
const int pad_frames = 16;
const int samples_per_frame = 512;
const int effective_frames = dec_len - 2 * pad_frames;
int dec_slice_num =
static_cast<int>(std::ceil(static_cast<double>(zp_shape[2]) / static_cast<double>(effective_frames)));
SolaProcessor sola(pad_frames, samples_per_frame);
std::vector<float> pcmlist;

for (int i = 0; i < dec_slice_num; i++) {
int input_start = i * effective_frames;
if (i > 0) {
input_start -= pad_frames;
}
input_start = std::max(0, input_start);
int actual_len = std::min(dec_len, static_cast<int>(zp_shape[2] - input_start));
std::vector<float> zp(zp_size, 0);
int actual_size = (i + 1) * dec_len < zp_shape[2] ? dec_len : zp_shape[2] - i * dec_len;

for (int n = 0; n < zp_shape[1]; n++) {
memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + i * dec_len,
sizeof(float) * actual_size);
int copy_size = std::min(actual_len, static_cast<int>(zp_shape[2] - input_start));
if (copy_size > 0) {
memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + input_start,
sizeof(float) * copy_size);
}
}
// Run decoder
std::vector<float> decoder_output(audio_slice_len);
decoder_->SetInput(zp.data(), 0);
decoder_->SetInput(g_matrix.data(), 1);
if (0 != decoder_->Run()) {
printf("Run decoder model failed!\n");
throw std::string("decoder_ RunSync error");
}
decoder_->GetOutput(decoder_output.data(), 0);
actual_size = (i + 1) * audio_slice_len < audio_len ? audio_slice_len : audio_len - i * audio_slice_len;
if (decoder_output.size() > actual_size) {
pcmlist.reserve(pcmlist.size() + actual_size);
std::copy(decoder_output.begin(), decoder_output.begin() + actual_size,
std::back_inserter(pcmlist));
} else {
pcmlist.reserve(pcmlist.size() + decoder_output.size());
std::copy(decoder_output.begin(), decoder_output.end(), std::back_inserter(pcmlist));
}
std::vector<float> processed_output = sola.ProcessFrame(decoder_output, i, dec_slice_num, actual_len);

pcmlist.insert(pcmlist.end(), processed_output.begin(), processed_output.end());
}

double src_ratio = (mode_config_.audio_rate * 1.0f) / (mode_config_.mode_rate * 1.0f);
std::vector<float> tmp_pcm((pcmlist.size() * src_ratio + 1));
int len;
resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio);

// Convert to 16-bit PCM
wav_pcm_data.reserve(len);
std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data),
[](const auto val) { return (int16_t)(val * INT16_MAX); });

// Call callback function with output
if (out_callback_)
out_callback_(std::string((char *)wav_pcm_data.data(), wav_pcm_data.size() * sizeof(int16_t)), finish);

} catch (const std::exception &e) {
SLOGI("TTS processing exception: %s", e.what());
return true;
} catch (...) {
SLOGI("TTS processing encountered unknown exception");
return true;
}
return false;
Expand Down
103 changes: 52 additions & 51 deletions projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#pragma once

#include <string>
#include <vector>
#include <fstream>
Expand All @@ -9,7 +8,15 @@
#include <cassert>
#include <iostream>
#include "../../../../../SDK/components/utilities/include/sample_log.h"

// Debug logging switch - set to true to enable debug logs.
// NOTE(review): `inline` (C++17) gives one shared flag across every
// translation unit that includes this header. A plain `static` here would
// give each TU its own private copy, so toggling the flag from one .cpp
// would have no effect on logging performed in another.
inline bool DEBUG_LOGGING = false;
// Macro for debug logging: expands to a runtime-guarded SLOGI call.
// ##__VA_ARGS__ swallows the trailing comma when no variadic arguments
// are passed (GNU extension, also accepted by clang and MSVC).
#define DEBUG_LOG(fmt, ...)            \
    do {                               \
        if (DEBUG_LOGGING) {           \
            SLOGI(fmt, ##__VA_ARGS__); \
        }                              \
    } while (0)
std::vector<std::string> split(const std::string& s, char delim)
{
std::vector<std::string> result;
Expand All @@ -30,8 +37,16 @@ class Lexicon {
std::unordered_map<int, std::string> reverse_tokens;

public:
// Enable or disable Lexicon debug logging at runtime.
// Controls the file-level DEBUG_LOGGING flag consumed by the DEBUG_LOG macro.
static void setDebugLogging(bool enable)
{
DEBUG_LOGGING = enable;
}
Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0)
{
DEBUG_LOG("Dictionary loading: %s Pronunciation table loading: %s", tokens_filename.c_str(),
lexicon_filename.c_str());

std::unordered_map<std::string, int> tokens;
std::ifstream ifs(tokens_filename);
assert(ifs.is_open());
Expand Down Expand Up @@ -82,8 +97,10 @@ class Lexicon {
lexicon["。"] = lexicon["."];
lexicon["!"] = lexicon["!"];
lexicon["?"] = lexicon["?"];
SLOGI("词典加载完成,包含 %zu 个条目,最长词组长度: %zu", lexicon.size(), max_phrase_length);
DEBUG_LOG("Dictionary loading complete, containing %zu entries, longest phrase length: %zu", lexicon.size(),
max_phrase_length);
}

std::vector<std::string> splitEachChar(const std::string& text)
{
std::vector<std::string> words;
Expand All @@ -94,93 +111,77 @@ class Lexicon {
if ((text[i] & 0x80) == 0x00) {
// ASCII
} else if ((text[i] & 0xE0) == 0xC0) {
next = 2; // 2字节UTF-8
next = 2; // 2-byte UTF-8
} else if ((text[i] & 0xF0) == 0xE0) {
next = 3; // 3字节UTF-8
next = 3; // 3-byte UTF-8
} else if ((text[i] & 0xF8) == 0xF0) {
next = 4; // 4字节UTF-8
next = 4; // 4-byte UTF-8
}
words.push_back(text.substr(i, next));
i += next;
}
return words;
}

// True iff `s` is exactly one ASCII letter (A-Z or a-z).
bool is_english(const std::string& s)
{
    if (s.size() != 1) {
        return false;
    }
    const char ch = s[0];
    return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
}

// True iff `s` is a single character permitted inside an English token:
// an ASCII letter, a decimal digit, '-' or '_'.
bool is_english_token_char(const std::string& s)
{
    if (s.size() != 1) {
        return false;
    }
    const char ch = s[0];
    if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
        return true;
    }
    if ('0' <= ch && ch <= '9') {
        return true;
    }
    return ch == '-' || ch == '_';
}

void process_unknown_english(const std::string& word, std::vector<int>& phones, std::vector<int>& tones)
{
SLOGI("Processing unknown term: %s", word.c_str());

DEBUG_LOG("Processing unknown term: %s", word.c_str());
std::string orig_word = word;
std::vector<std::string> parts;
std::vector<std::string> phonetic_parts;

size_t start = 0;
while (start < word.size()) {
bool matched = false;

for (size_t len = std::min(word.size() - start, (size_t)10); len > 0 && !matched; --len) {
std::string sub_word = word.substr(start, len);
std::string lower_sub_word = sub_word;
std::transform(lower_sub_word.begin(), lower_sub_word.end(), lower_sub_word.begin(),
[](unsigned char c) { return std::tolower(c); });

if (lexicon.find(lower_sub_word) != lexicon.end()) {
// Substring found in lexicon
auto& [sub_phones, sub_tones] = lexicon[lower_sub_word];
phones.insert(phones.end(), sub_phones.begin(), sub_phones.end());
tones.insert(tones.end(), sub_tones.begin(), sub_tones.end());

parts.push_back(sub_word);
phonetic_parts.push_back(phonesToString(sub_phones));

SLOGI(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());

DEBUG_LOG(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
start += len;
matched = true;
break;
}
}

if (!matched) {
std::string single_char = word.substr(start, 1);
std::string lower_char = single_char;
std::transform(lower_char.begin(), lower_char.end(), lower_char.begin(),
[](unsigned char c) { return std::tolower(c); });

if (lexicon.find(lower_char) != lexicon.end()) {
auto& [char_phones, char_tones] = lexicon[lower_char];
phones.insert(phones.end(), char_phones.begin(), char_phones.end());
tones.insert(tones.end(), char_tones.begin(), char_tones.end());

parts.push_back(single_char);
phonetic_parts.push_back(phonesToString(char_phones));

SLOGI(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
DEBUG_LOG(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
} else {
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());

parts.push_back(single_char);
phonetic_parts.push_back("_unknown_");

SLOGI(" Unknown: '%s'", single_char.c_str());
DEBUG_LOG(" Unknown: '%s'", single_char.c_str());
}

start++;
}
}

std::string parts_str, phonetic_str;
for (size_t i = 0; i < parts.size(); i++) {
if (i > 0) {
Expand All @@ -190,20 +191,20 @@ class Lexicon {
parts_str += parts[i];
phonetic_str += phonetic_parts[i];
}

SLOGI("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), phonetic_str.c_str());
DEBUG_LOG("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(),
phonetic_str.c_str());
}

void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
{
SLOGI("\n开始处理文本: \"%s\"", text.c_str());
SLOGI("=======匹配结果=======");
SLOGI("单元\t|\t音素\t|\t声调");
SLOGI("-----------------------------");
DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str());
DEBUG_LOG("=======Matching Results=======");
DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones");
DEBUG_LOG("-----------------------------");
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());

SLOGI("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
tonesToString(unknown_token.second).c_str());
DEBUG_LOG("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
tonesToString(unknown_token.second).c_str());
auto chars = splitEachChar(text);
int i = 0;
while (i < chars.size()) {
Expand All @@ -220,8 +221,8 @@ class Lexicon {
auto& [eng_phones, eng_tones] = lexicon[eng_word];
phones.insert(phones.end(), eng_phones.begin(), eng_phones.end());
tones.insert(tones.end(), eng_tones.begin(), eng_tones.end());
SLOGI("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
tonesToString(eng_tones).c_str());
DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
tonesToString(eng_tones).c_str());
} else {
process_unknown_english(orig_word, phones, tones);
}
Expand All @@ -240,8 +241,8 @@ class Lexicon {
auto& [phrase_phones, phrase_tones] = lexicon[phrase];
phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end());
tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end());
SLOGI("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
tonesToString(phrase_tones).c_str());
DEBUG_LOG("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
tonesToString(phrase_tones).c_str());
i += len;
matched = true;
break;
Expand All @@ -263,25 +264,25 @@ class Lexicon {
auto& [char_phones, char_tones] = lexicon[s];
phones.insert(phones.end(), char_phones.begin(), char_phones.end());
tones.insert(tones.end(), char_tones.begin(), char_tones.end());
SLOGI("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
tonesToString(char_tones).c_str());
DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
tonesToString(char_tones).c_str());
} else {
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_char.c_str(), phonesToString(unknown_token.first).c_str(),
tonesToString(unknown_token.second).c_str());
DEBUG_LOG("%s\t|\t%s (Not matched)\t|\t%s", orig_char.c_str(),
phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str());
}
}
}
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
SLOGI("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
tonesToString(unknown_token.second).c_str());
SLOGI("\n处理结果汇总:");
SLOGI("原文: %s", text.c_str());
SLOGI("音素: %s", phonesToString(phones).c_str());
SLOGI("声调: %s", tonesToString(tones).c_str());
SLOGI("====================");
DEBUG_LOG("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
tonesToString(unknown_token.second).c_str());
DEBUG_LOG("\nProcessing Summary:");
DEBUG_LOG("Original text: %s", text.c_str());
DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str());
DEBUG_LOG("Tones: %s", tonesToString(tones).c_str());
DEBUG_LOG("====================");
}

private:
Expand Down
Loading