feat: Add reusable Emotion Vector normalization helper

- The WebUI was silently squashing all emotion vectors and re-scaling them. That behavior is good for user friendliness, but it makes it harder to learn which values will work in Python when using the WebUI for testing.

- Instead, let's move the normalization code into IndexTTS2 as a helper function which is used by Gradio and can be used from other people's code too.

- The emotion bias (which reduces the influence of certain emotions) has also been converted into an optional feature, which can be turned off if such biasing isn't wanted. And all biasing values have been re-scaled to use 1.0 as the reference, to avoid scaling relative to 0.8 (which previously meant that it applied double scaling).
This commit is contained in:
Arcitec
2025-09-14 00:22:41 +02:00
parent 1520d0689b
commit 8aa8064a53
2 changed files with 17 additions and 11 deletions

View File

@@ -305,6 +305,22 @@ class IndexTTS2:
print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples")
audio = audio[:, :max_audio_samples]
return audio, sr
def normalize_emo_vec(self, emo_vector, apply_bias=True, max_sum=0.8):
    """Normalize an 8-dimensional emotion vector for inference.

    Optionally applies per-emotion bias factors that de-emphasize emotions
    known to produce strange results, then rescales the whole vector so its
    total never exceeds ``max_sum``.

    Args:
        emo_vector: Sequence of 8 floats in the order
            [happy, angry, sad, afraid, disgusted, melancholic, surprised, calm].
        apply_bias: If True, multiply each component by its bias factor
            before capping the total. Set False to use the raw values.
        max_sum: Upper bound for the sum of all components (default 0.8,
            the limit the model expects). Kept as a parameter so callers
            can experiment with other caps without re-implementing this.

    Returns:
        A new list of 8 floats whose sum is at most ``max_sum``.
    """
    if apply_bias:
        # De-emphasize emotions that can cause strange results; 1.0 means
        # "no damping" so the bias does not double-scale against the cap.
        # [happy, angry, sad, afraid, disgusted, melancholic, surprised, calm]
        emo_bias = [0.9375, 0.875, 1.0, 1.0, 0.9375, 0.9375, 0.6875, 0.5625]
        emo_vector = [vec * bias for vec, bias in zip(emo_vector, emo_bias)]

    # Rescale proportionally only when the total exceeds the allowed cap;
    # an all-zero or small vector passes through unchanged.
    emo_sum = sum(emo_vector)
    if emo_sum > max_sum:
        scale_factor = max_sum / emo_sum
        emo_vector = [vec * scale_factor for vec in emo_vector]

    return emo_vector
# 原始推理模式
def infer(self, spk_audio_prompt, text, output_path,

View File

@@ -6,8 +6,6 @@ import time
import warnings
import numpy as np
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
@@ -104,14 +102,6 @@ with open("examples/cases.jsonl", "r", encoding="utf-8") as f:
example.get("emo_text") is not None]
)
def normalize_emo_vec(emo_vec):
    """Damp each emotion component, then cap the vector's total at 0.8.

    Returns a plain Python list; the input is not modified.
    """
    # Per-emotion damping factors chosen for a better user experience.
    damping = np.array([0.75, 0.70, 0.80, 0.80, 0.75, 0.75, 0.55, 0.45])
    scaled = damping * np.array(emo_vec)

    total = np.sum(scaled)
    if total > 0.8:
        # Rescale proportionally so the components sum to exactly 0.8.
        scaled = scaled * 0.8 / total

    return scaled.tolist()
def gen_single(emo_control_method,prompt, text,
emo_ref_path, emo_weight,
vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
@@ -145,7 +135,7 @@ def gen_single(emo_control_method,prompt, text,
pass
if emo_control_method == 2: # emotion from custom vectors
vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]
vec = normalize_emo_vec(vec)
vec = tts.normalize_emo_vec(vec, apply_bias=True)
else:
# don't use the emotion vector inputs for the other modes
vec = None