From e3c70bc1273fb9ac668650ba80e34f27a6cfc9a2 Mon Sep 17 00:00:00 2001
From: yuyun2000 <15515722313yxw@gmail.com>
Date: Tue, 6 May 2025 16:48:11 +0800
Subject: [PATCH 1/2] Implement Sola algorithm for smoother audio transitions

Apply the Synchronized Overlap-Add (SOLA) algorithm to smooth the connection between audio segments output by the decoder, resulting in more natural-sounding transitions between segments.
---
 .../llm_framework/main_melotts/src/main.cpp   |  67 +++--
 .../main_melotts/src/runner/Lexicon.hpp       |   1 +
 .../main_melotts/src/runner/SolaProcessor.h   | 269 ++++++++++++++++++
 3 files changed, 315 insertions(+), 22 deletions(-)
 create mode 100644 projects/llm_framework/main_melotts/src/runner/SolaProcessor.h
diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp
index b5a27cb..610362a 100644
--- a/projects/llm_framework/main_melotts/src/main.cpp
+++ b/projects/llm_framework/main_melotts/src/main.cpp
@@ -9,6 +9,7 @@
 #include "Lexicon.hpp"
 #include <ax_sys_api.h>
 #include "AudioFile.h"
+#include "SolaProcessor.h"
 #include "Lexicon.hpp"
 
 #include <signal.h>
@@ -263,49 +264,71 @@ class llm_task {
             auto encoder_output =
                 encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w,
                               mode_config_.get_length_scale(), mode_config_.sdp_ratio);
-            float *zp_data      = encoder_output.at(0).GetTensorMutableData<float>();
-            int audio_len       = encoder_output.at(2).GetTensorMutableData<int>()[0];
-            auto zp_info        = encoder_output.at(0).GetTensorTypeAndShapeInfo();
-            auto zp_shape       = zp_info.GetShape();
-            int zp_size         = decoder_->GetInputSize(0) / sizeof(float);
-            int dec_len         = zp_size / zp_shape[1];
-            int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float);
-            std::vector<float> decoder_output(audio_slice_len);
-            int dec_slice_num = int(std::ceil(zp_shape[2] * 1.0 / dec_len));
+            float *zp_data = encoder_output.at(0).GetTensorMutableData<float>();
+            int audio_len  = encoder_output.at(2).GetTensorMutableData<int>()[0];
+            auto zp_info   = encoder_output.at(0).GetTensorTypeAndShapeInfo();
+            auto zp_shape  = zp_info.GetShape();
+
+            // Decoder parameters setup
+            int zp_size                 = decoder_->GetInputSize(0) / sizeof(float);
+            int dec_len                 = zp_size / zp_shape[1];
+            int audio_slice_len         = decoder_->GetOutputSize(0) / sizeof(float);
+            const int pad_frames        = 16;
+            const int samples_per_frame = 512;
+            const int effective_frames  = dec_len - 2 * pad_frames;
+            int dec_slice_num =
+                static_cast<int>(std::ceil(static_cast<double>(zp_shape[2]) / static_cast<double>(effective_frames)));
+            SolaProcessor sola(pad_frames, samples_per_frame);
             std::vector<float> pcmlist;
+
             for (int i = 0; i < dec_slice_num; i++) {
+                int input_start = i * effective_frames;
+                if (i > 0) {
+                    input_start -= pad_frames;
+                }
+                input_start    = std::max(0, input_start);
+                int actual_len = std::min(dec_len, static_cast<int>(zp_shape[2] - input_start));
                 std::vector<float> zp(zp_size, 0);
-                int actual_size = (i + 1) * dec_len < zp_shape[2] ? dec_len : zp_shape[2] - i * dec_len;
+
                 for (int n = 0; n < zp_shape[1]; n++) {
-                    memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + i * dec_len,
-                           sizeof(float) * actual_size);
+                    int copy_size = std::min(actual_len, static_cast<int>(zp_shape[2] - input_start));
+                    if (copy_size > 0) {
+                        memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + input_start,
+                               sizeof(float) * copy_size);
+                    }
                 }
+                // Run decoder
+                std::vector<float> decoder_output(audio_slice_len);
                 decoder_->SetInput(zp.data(), 0);
                 decoder_->SetInput(g_matrix.data(), 1);
                 if (0 != decoder_->Run()) {
-                    printf("Run decoder model failed!\n");
                     throw std::string("decoder_ RunSync error");
                 }
                 decoder_->GetOutput(decoder_output.data(), 0);
-                actual_size = (i + 1) * audio_slice_len < audio_len ? audio_slice_len : audio_len - i * audio_slice_len;
-                if (decoder_output.size() > actual_size) {
-                    pcmlist.reserve(pcmlist.size() + actual_size);
-                    std::copy(decoder_output.begin(), decoder_output.begin() + actual_size,
-                              std::back_inserter(pcmlist));
-                } else {
-                    pcmlist.reserve(pcmlist.size() + decoder_output.size());
-                    std::copy(decoder_output.begin(), decoder_output.end(), std::back_inserter(pcmlist));
-                }
+                std::vector<float> processed_output = sola.ProcessFrame(decoder_output, i, dec_slice_num, actual_len);
+
+                pcmlist.insert(pcmlist.end(), processed_output.begin(), processed_output.end());
             }
+
             double src_ratio = (mode_config_.audio_rate * 1.0f) / (mode_config_.mode_rate * 1.0f);
             std::vector<float> tmp_pcm((pcmlist.size() * src_ratio + 1));
             int len;
             resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio);
+
+            // Convert to 16-bit PCM
+            wav_pcm_data.reserve(len);
             std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data),
                            [](const auto val) { return (int16_t)(val * INT16_MAX); });
+
+            // Call callback function with output
             if (out_callback_)
                 out_callback_(std::string((char *)wav_pcm_data.data(), wav_pcm_data.size() * sizeof(int16_t)), finish);
+
+        } catch (const std::exception &e) {
+            SLOGI("TTS processing exception: %s", e.what());
+            return true;
         } catch (...) {
+            SLOGI("TTS processing encountered unknown exception");
             return true;
         }
         return false;
diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
index 242fb15..242e9e0 100644
--- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
+++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
@@ -32,6 +32,7 @@ class Lexicon {
 public:
     Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0)
     {
+        SLOGI("词典加载: %zu 发音表加载: %zu", tokens_filename, lexicon_filename);
         std::unordered_map<std::string, int> tokens;
         std::ifstream ifs(tokens_filename);
         assert(ifs.is_open());
diff --git a/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h b/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h
new file mode 100644
index 0000000..a2286bb
--- /dev/null
+++ b/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h
@@ -0,0 +1,269 @@
+#ifndef SOLA_PROCESSOR_H
+#define SOLA_PROCESSOR_H
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <string>
+#include <vector>
+
+/**
+ * SolaProcessor - Synchronous Overlap-Add method for audio frame processing
+ *
+ * This class provides functionality for smoothly concatenating audio frames
+ * using the SOLA algorithm, which finds optimal alignment points between
+ * consecutive frames and applies crossfading for smooth transitions.
+ */
+class SolaProcessor {
+public:
+    /**
+     * Build a processor and precompute its crossfade windows and overlap buffer
+     *
+     * @param padFrames Padding frames on each side of a slice; also the overlap length in frames
+     * @param samplesPerFrame Number of audio samples in each frame
+     */
+    SolaProcessor(int padFrames, int samplesPerFrame)
+        : pad_frames_{padFrames}, samples_per_frame_{samplesPerFrame}, first_frame_{true}
+    {
+        Initialize();
+    }
+
+    /**
+     * Return to the pre-first-frame state: zero the overlap buffer and re-arm first-frame handling
+     */
+    void Reset()
+    {
+        sola_buffer_.assign(sola_buffer_.size(), 0.0f);
+        first_frame_ = true;
+    }
+
+    /**
+     * Turn one decoder slice into output samples, dispatching on whether it is the first slice
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param frameIndex Index of this slice; only forwarded to the subsequent-frame path
+     * @param totalFrames Total number of slices; only forwarded to the subsequent-frame path
+     * @param actualFrameLen Frame count of this slice, padding included
+     * @return Samples to append to the output stream for this slice
+     */
+    std::vector<float> ProcessFrame(const std::vector<float>& decoder_output, int frameIndex, int totalFrames,
+                                    int actualFrameLen)
+    {
+        std::vector<float> samples;
+
+        if (!first_frame_) {
+            // Every slice after the first is aligned against the saved tail and crossfaded
+            ProcessSubsequentFrame(decoder_output, samples, frameIndex, totalFrames, actualFrameLen);
+            return samples;
+        }
+
+        // The first slice has nothing to align against: emit it and seed the buffer
+        ProcessFirstFrame(decoder_output, samples, actualFrameLen);
+        first_frame_ = false;
+        return samples;
+    }
+
+private:
+    /**
+     * Derive the overlap/search lengths and allocate the fade windows and the overlap buffer
+     */
+    void Initialize()
+    {
+        // Overlap length and search range are both one padding region, measured in samples
+        const int overlap  = pad_frames_ * samples_per_frame_;
+        sola_buffer_frame_ = overlap;
+        sola_search_frame_ = overlap;
+        effective_frames_  = 0;  // Not read or updated anywhere else in this class
+        // Zeroed overlap buffer; frame processing overwrites it with the tail of each slice
+        sola_buffer_.resize(overlap, 0.0f);
+
+        // Linear ramps: fade-in rises from 0 towards 1, fade-out is its complement
+        fade_in_window_.resize(overlap);
+        fade_out_window_.resize(overlap);
+        for (int k = 0; k < overlap; ++k) {
+            const float rising  = static_cast<float>(k) / overlap;
+            fade_in_window_[k]  = rising;
+            fade_out_window_[k] = 1.0f - rising;
+        }
+    }
+
+    /**
+     * Emit the first slice (after its leading padding) and seed the SOLA buffer with the samples that follow
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param processed_output Receives the samples to emit; nothing is appended when the slice is too short
+     * @param actualFrameLen Frame count for this slice, including the 2 * pad_frames_ padding frames
+     */
+    void ProcessFirstFrame(const std::vector<float>& decoder_output, std::vector<float>& processed_output,
+                           int actualFrameLen)
+    {
+        const int total       = static_cast<int>(decoder_output.size());
+        const int audio_start = std::min(pad_frames_ * samples_per_frame_, total);
+        int audio_len         = (actualFrameLen - 2 * pad_frames_) * samples_per_frame_;
+        // Clamp to [0, available]; a negative length would reverse the iterator range below
+        audio_len = std::max(0, std::min(audio_len, total - audio_start));
+
+        // Add first frame data to output
+        processed_output.insert(processed_output.end(), decoder_output.begin() + audio_start,
+                                decoder_output.begin() + audio_start + audio_len);
+
+        // Save the end part to SOLA buffer for next frame alignment
+        const int buffer_start = audio_start + audio_len;
+        if (buffer_start + sola_buffer_frame_ <= total) {
+            std::copy(decoder_output.begin() + buffer_start, decoder_output.begin() + buffer_start + sola_buffer_frame_,
+                      sola_buffer_.begin());
+        }
+    }
+
+    /**
+     * Align a non-first slice against the saved tail, crossfade into it, then emit the rest of the slice
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param processed_output Receives the crossfade region followed by the remaining audio
+     * @param frameIndex Current frame index (forwarded to AddRemainingAudio)
+     * @param totalFrames Total number of frames (forwarded to AddRemainingAudio)
+     * @param actualFrameLen Frame count of this slice, padding included
+     */
+    void ProcessSubsequentFrame(const std::vector<float>& decoder_output, std::vector<float>& processed_output,
+                                int frameIndex, int totalFrames, int actualFrameLen)
+    {
+        const int audio_start = pad_frames_ * samples_per_frame_;
+        const int available   = static_cast<int>(decoder_output.size()) - audio_start;
+        // 1. Prepare search window (left zero-padded when the decoder output is too short to fill it)
+        std::vector<float> search_window(sola_buffer_frame_ + sola_search_frame_);
+        const int copy_len = std::min(available, static_cast<int>(search_window.size()));
+        if (copy_len > 0) std::copy_n(decoder_output.begin() + audio_start, copy_len, search_window.begin());
+
+        // 2. Find best alignment point (compute cross-correlation)
+        int best_offset = FindBestOffset(search_window);
+
+        // 3. Apply alignment offset
+        int aligned_start = audio_start + best_offset;
+
+        // 4. Create smooth transition
+        std::vector<float> crossfade_region = CreateCrossfade(decoder_output, aligned_start);
+
+        // 5. Add crossfade region to output
+        processed_output.insert(processed_output.end(), crossfade_region.begin(), crossfade_region.end());
+
+        // 6. Add remaining valid audio data
+        AddRemainingAudio(decoder_output, processed_output, aligned_start, frameIndex, totalFrames, actualFrameLen);
+    }
+
+    /**
+     * Pick the shift whose window correlates best with the saved tail (score = dot / sqrt(window energy))
+     *
+     * @param search_window Candidate samples; must hold sola_buffer_frame_ + sola_search_frame_ values
+     * @return Shift in [0, sola_search_frame_] with the highest score; the earliest one on ties
+     */
+    int FindBestOffset(const std::vector<float>& search_window)
+    {
+        float top_score = -1.0f;
+        int top_shift   = 0;
+
+        for (int shift = 0; shift <= sola_search_frame_; ++shift) {
+            const float* candidate = search_window.data() + shift;
+            float dot              = 0.0f;
+            float power            = 0.0f;
+            for (int k = 0; k < sola_buffer_frame_; ++k) {
+                const float sample = candidate[k];
+                dot += sola_buffer_[k] * sample;
+                power += sample * sample;
+            }
+            // Scale by the candidate's energy; near-silent candidates score 0
+            float score = 0.0f;
+            if (power > 1e-8) score = dot / std::sqrt(power);
+            // Strict '>' keeps the earliest shift when scores tie
+            if (score > top_score) {
+                top_score = score;
+                top_shift = shift;
+            }
+        }
+        return top_shift;
+    }
+
+    /**
+     * Blend the saved tail (fading out) with the new slice (fading in) over sola_buffer_frame_ samples
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param aligned_start Index in decoder_output where the aligned new audio begins
+     * @return Crossfaded audio samples (always sola_buffer_frame_ long)
+     */
+    std::vector<float> CreateCrossfade(const std::vector<float>& decoder_output, int aligned_start)
+    {
+        std::vector<float> crossfade_region(sola_buffer_frame_);
+        const int total = static_cast<int>(decoder_output.size());
+        for (int j = 0; j < sola_buffer_frame_; j++) {
+            // Samples outside the decoder output are treated as silence instead of being read out of bounds
+            const int src        = aligned_start + j;
+            const float incoming = (src >= 0 && src < total) ? decoder_output[src] : 0.0f;
+            crossfade_region[j]  = incoming * fade_in_window_[j] + sola_buffer_[j] * fade_out_window_[j];
+        }
+        return crossfade_region;
+    }
+
+    /**
+     * Append the audio that follows the crossfade region, then save the next tail into the SOLA buffer
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param processed_output Output buffer for processed audio
+     * @param aligned_start Starting point after alignment
+     * @param frameIndex Current frame index (currently unused)
+     * @param totalFrames Total number of frames (currently unused)
+     * @param actualFrameLen Frame count of this slice, padding included
+     */
+    void AddRemainingAudio(const std::vector<float>& decoder_output, std::vector<float>& processed_output,
+                           int aligned_start, int frameIndex, int totalFrames, int actualFrameLen)
+    {
+        const int total           = static_cast<int>(decoder_output.size());
+        const int remaining_start = aligned_start + sola_buffer_frame_;
+        int remaining_len         = (actualFrameLen - 2 * pad_frames_) * samples_per_frame_ - sola_buffer_frame_;
+        // Clamp to [0, samples left]; signed math avoids size_t wrap-around when remaining_start > total
+        remaining_len = std::max(0, std::min(remaining_len, total - remaining_start));
+
+        if (remaining_len > 0) {
+            processed_output.insert(processed_output.end(), decoder_output.begin() + remaining_start,
+                                    decoder_output.begin() + remaining_start + remaining_len);
+        }
+
+        // Update SOLA buffer; the start is capped at the end of the data so it is never out of range
+        UpdateSolaBuffer(decoder_output, std::min(remaining_start + remaining_len, total));
+    }
+
+    /**
+     * Refill the SOLA buffer from decoder_output, zero-padding when fewer samples remain
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param buffer_start Index of the first sample to save; clamped into [0, decoder_output.size()]
+     */
+    void UpdateSolaBuffer(const std::vector<float>& decoder_output, int buffer_start)
+    {
+        const int total = static_cast<int>(decoder_output.size());
+        // Clamp the start so every iterator offset below is valid, even for out-of-range input
+        const int start = std::max(0, std::min(buffer_start, total));
+        const int avail = std::min(sola_buffer_frame_, total - start);
+
+        // Copy what is available, at most one buffer's worth
+        if (avail > 0) {
+            std::copy_n(decoder_output.begin() + start, avail, sola_buffer_.begin());
+        }
+
+        // Zero-fill the part of the buffer that could not be provided (no-op when avail is a full buffer)
+        std::fill(sola_buffer_.begin() + avail, sola_buffer_.end(), 0.0f);
+    }
+
+private:
+    int pad_frames_;         // Padding frames on each side of a slice (constructor argument)
+    int samples_per_frame_;  // Audio samples per frame (constructor argument)
+    int effective_frames_;   // Set to 0 in Initialize(); not used anywhere else in this class
+    int sola_buffer_frame_;  // Overlap/crossfade length in samples (pad_frames_ * samples_per_frame_)
+    int sola_search_frame_;  // Largest alignment shift searched, in samples
+
+    std::vector<float> fade_in_window_;   // Linear ramp applied to the incoming slice
+    std::vector<float> fade_out_window_;  // Complementary ramp (1 - fade-in) applied to the saved tail
+    std::vector<float> sola_buffer_;      // Tail of the previous slice kept for alignment and crossfade
+
+    bool first_frame_;  // True until the first ProcessFrame() call completes (or after Reset())
+};
+
+#endif  // SOLA_PROCESSOR_H

From a151affa1d2891da0e8f65ce4f176a249df97b3f Mon Sep 17 00:00:00 2001
From: yuyun2000 <15515722313yxw@gmail.com>
Date: Tue, 6 May 2025 16:50:35 +0800
Subject: [PATCH 2/2] Translate logs in Lexicon.hpp to English and add debug
 switch

- Convert all Chinese log messages in Lexicon.hpp to English for better international compatibility
- Add a debug flag to control whether g2p process logs are displayed
- Improve code readability and the debugging experience
---
 .../main_melotts/src/runner/Lexicon.hpp       | 104 +++++++++---------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
index 242e9e0..d1bcbe9 100644
--- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
+++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
@@ -1,5 +1,4 @@
 #pragma once
-
 #include <string>
 #include <vector>
 #include <fstream>
@@ -9,7 +8,15 @@
 #include <cassert>
 #include <iostream>
 #include "../../../../../SDK/components/utilities/include/sample_log.h"
-
+// Debug logging switch shared by every translation unit ('inline', not a per-file 'static' copy)
+inline bool DEBUG_LOGGING = false;
+// DEBUG_LOG(fmt, ...) forwards to SLOGI only while DEBUG_LOGGING is true
+#define DEBUG_LOG(fmt, ...)            \
+    do {                               \
+        if (DEBUG_LOGGING) {           \
+            SLOGI(fmt, ##__VA_ARGS__); \
+        }                              \
+    } while (0)
 std::vector<std::string> split(const std::string& s, char delim)
 {
     std::vector<std::string> result;
@@ -30,9 +37,16 @@ class Lexicon {
     std::unordered_map<int, std::string> reverse_tokens;
 
 public:
+    // Turn the DEBUG_LOG output used in this header on (true) or off (false)
+    static void setDebugLogging(bool enable)
+    {
+        DEBUG_LOGGING = enable;
+    }
     Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0)
     {
-        SLOGI("词典加载: %zu 发音表加载: %zu", tokens_filename, lexicon_filename);
+        DEBUG_LOG("Dictionary loading: %s Pronunciation table loading: %s", tokens_filename.c_str(),
+                  lexicon_filename.c_str());
+
         std::unordered_map<std::string, int> tokens;
         std::ifstream ifs(tokens_filename);
         assert(ifs.is_open());
@@ -83,8 +97,10 @@ class Lexicon {
         lexicon["。"] = lexicon["."];
         lexicon["！"] = lexicon["!"];
         lexicon["？"] = lexicon["?"];
-        SLOGI("词典加载完成，包含 %zu 个条目，最长词组长度: %zu", lexicon.size(), max_phrase_length);
+        DEBUG_LOG("Dictionary loading complete, containing %zu entries, longest phrase length: %zu", lexicon.size(),
+                  max_phrase_length);
     }
+
     std::vector<std::string> splitEachChar(const std::string& text)
     {
         std::vector<std::string> words;
@@ -95,93 +111,77 @@ class Lexicon {
             if ((text[i] & 0x80) == 0x00) {
                 // ASCII
             } else if ((text[i] & 0xE0) == 0xC0) {
-                next = 2;  // 2字节UTF-8
+                next = 2;  // 2-byte UTF-8
             } else if ((text[i] & 0xF0) == 0xE0) {
-                next = 3;  // 3字节UTF-8
+                next = 3;  // 3-byte UTF-8
             } else if ((text[i] & 0xF8) == 0xF0) {
-                next = 4;  // 4字节UTF-8
+                next = 4;  // 4-byte UTF-8
             }
             words.push_back(text.substr(i, next));
             i += next;
         }
         return words;
     }
+
     bool is_english(const std::string& s)
     {
         return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z'));
     }
-
     bool is_english_token_char(const std::string& s)
     {
         if (s.size() != 1) return false;
         char c = s[0];
         return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_';
     }
-
     void process_unknown_english(const std::string& word, std::vector<int>& phones, std::vector<int>& tones)
     {
-        SLOGI("Processing unknown term: %s", word.c_str());
-
+        DEBUG_LOG("Processing unknown term: %s", word.c_str());
         std::string orig_word = word;
         std::vector<std::string> parts;
         std::vector<std::string> phonetic_parts;
-
         size_t start = 0;
         while (start < word.size()) {
             bool matched = false;
-
             for (size_t len = std::min(word.size() - start, (size_t)10); len > 0 && !matched; --len) {
                 std::string sub_word       = word.substr(start, len);
                 std::string lower_sub_word = sub_word;
                 std::transform(lower_sub_word.begin(), lower_sub_word.end(), lower_sub_word.begin(),
                                [](unsigned char c) { return std::tolower(c); });
-
                 if (lexicon.find(lower_sub_word) != lexicon.end()) {
                     // Substring found in lexicon
                     auto& [sub_phones, sub_tones] = lexicon[lower_sub_word];
                     phones.insert(phones.end(), sub_phones.begin(), sub_phones.end());
                     tones.insert(tones.end(), sub_tones.begin(), sub_tones.end());
-
                     parts.push_back(sub_word);
                     phonetic_parts.push_back(phonesToString(sub_phones));
-
-                    SLOGI("  Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
-
+                    DEBUG_LOG("  Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
                     start += len;
                     matched = true;
                     break;
                 }
             }
-
             if (!matched) {
                 std::string single_char = word.substr(start, 1);
                 std::string lower_char  = single_char;
                 std::transform(lower_char.begin(), lower_char.end(), lower_char.begin(),
                                [](unsigned char c) { return std::tolower(c); });
-
                 if (lexicon.find(lower_char) != lexicon.end()) {
                     auto& [char_phones, char_tones] = lexicon[lower_char];
                     phones.insert(phones.end(), char_phones.begin(), char_phones.end());
                     tones.insert(tones.end(), char_tones.begin(), char_tones.end());
-
                     parts.push_back(single_char);
                     phonetic_parts.push_back(phonesToString(char_phones));
-
-                    SLOGI("  Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
+                    DEBUG_LOG("  Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
                 } else {
                     phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
                     tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
-
                     parts.push_back(single_char);
                     phonetic_parts.push_back("_unknown_");
-
-                    SLOGI("  Unknown: '%s'", single_char.c_str());
+                    DEBUG_LOG("  Unknown: '%s'", single_char.c_str());
                 }
-
                 start++;
             }
         }
-
         std::string parts_str, phonetic_str;
         for (size_t i = 0; i < parts.size(); i++) {
             if (i > 0) {
@@ -191,20 +191,20 @@ class Lexicon {
             parts_str += parts[i];
             phonetic_str += phonetic_parts[i];
         }
-
-        SLOGI("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), phonetic_str.c_str());
+        DEBUG_LOG("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(),
+                  phonetic_str.c_str());
     }
+
     void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
     {
-        SLOGI("\n开始处理文本: \"%s\"", text.c_str());
-        SLOGI("=======匹配结果=======");
-        SLOGI("单元\t|\t音素\t|\t声调");
-        SLOGI("-----------------------------");
+        DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str());
+        DEBUG_LOG("=======Matching Results=======");
+        DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones");
+        DEBUG_LOG("-----------------------------");
         phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
         tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
-
-        SLOGI("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
-              tonesToString(unknown_token.second).c_str());
+        DEBUG_LOG("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
+                  tonesToString(unknown_token.second).c_str());
         auto chars = splitEachChar(text);
         int i      = 0;
         while (i < chars.size()) {
@@ -221,8 +221,8 @@ class Lexicon {
                     auto& [eng_phones, eng_tones] = lexicon[eng_word];
                     phones.insert(phones.end(), eng_phones.begin(), eng_phones.end());
                     tones.insert(tones.end(), eng_tones.begin(), eng_tones.end());
-                    SLOGI("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
-                          tonesToString(eng_tones).c_str());
+                    DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
+                              tonesToString(eng_tones).c_str());
                 } else {
                     process_unknown_english(orig_word, phones, tones);
                 }
@@ -241,8 +241,8 @@ class Lexicon {
                     auto& [phrase_phones, phrase_tones] = lexicon[phrase];
                     phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end());
                     tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end());
-                    SLOGI("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
-                          tonesToString(phrase_tones).c_str());
+                    DEBUG_LOG("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
+                              tonesToString(phrase_tones).c_str());
                     i += len;
                     matched = true;
                     break;
@@ -264,25 +264,25 @@ class Lexicon {
                     auto& [char_phones, char_tones] = lexicon[s];
                     phones.insert(phones.end(), char_phones.begin(), char_phones.end());
                     tones.insert(tones.end(), char_tones.begin(), char_tones.end());
-                    SLOGI("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
-                          tonesToString(char_tones).c_str());
+                    DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
+                              tonesToString(char_tones).c_str());
                 } else {
                     phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
                     tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
-                    SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_char.c_str(), phonesToString(unknown_token.first).c_str(),
-                          tonesToString(unknown_token.second).c_str());
+                    DEBUG_LOG("%s\t|\t%s (Not matched)\t|\t%s", orig_char.c_str(),
+                              phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str());
                 }
             }
         }
         phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
         tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
-        SLOGI("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
-              tonesToString(unknown_token.second).c_str());
-        SLOGI("\n处理结果汇总:");
-        SLOGI("原文: %s", text.c_str());
-        SLOGI("音素: %s", phonesToString(phones).c_str());
-        SLOGI("声调: %s", tonesToString(tones).c_str());
-        SLOGI("====================");
+        DEBUG_LOG("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
+                  tonesToString(unknown_token.second).c_str());
+        DEBUG_LOG("\nProcessing Summary:");
+        DEBUG_LOG("Original text: %s", text.c_str());
+        DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str());
+        DEBUG_LOG("Tones: %s", tonesToString(tones).c_str());
+        DEBUG_LOG("====================");
     }
 
 private: