feat: Add reusable Emotion Vector normalization helper

- The WebUI was silently squashing all emotion vectors and re-scaling them. That behavior is good for user friendliness, but it makes it harder to learn which values will work in Python when using the WebUI for testing.

- Instead, let's move the normalization code into IndexTTS2 as a helper function which is used by Gradio and can be used from other people's code too.

- The emotion bias (which reduces the influence of certain emotions) has also been converted into an optional feature, which can be turned off if such biasing isn't wanted. And all biasing values have been re-scaled to use 1.0 as the reference, to avoid scaling relative to 0.8 (which previously meant that it applied double scaling).
This commit is contained in:
Arcitec
2025-09-14 00:22:41 +02:00
parent 1520d0689b
commit 8aa8064a53
2 changed files with 17 additions and 11 deletions

View File

@@ -305,6 +305,22 @@ class IndexTTS2:
print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples")
audio = audio[:, :max_audio_samples]
return audio, sr
def normalize_emo_vec(self, emo_vector, apply_bias=True, max_sum=0.8):
    """Normalize an 8-dimensional emotion vector for inference.

    Optionally applies per-emotion bias factors that de-emphasize emotions
    known to produce strange results, then rescales the whole vector so its
    total never exceeds ``max_sum``.

    Args:
        emo_vector: Sequence of 8 floats in the order
            [happy, angry, sad, afraid, disgusted, melancholic, surprised, calm].
        apply_bias: If True, multiply each component by its bias factor
            before capping the total. Set False to use the raw values.
        max_sum: Upper bound for the sum of all components (default 0.8,
            the limit the model expects). Kept as a parameter so callers
            can experiment with other caps without re-implementing this.

    Returns:
        A new list of 8 floats whose sum is at most ``max_sum``.
    """
    if apply_bias:
        # De-emphasize emotions that can cause strange results; 1.0 means
        # "no damping" so the bias does not double-scale against the cap.
        # [happy, angry, sad, afraid, disgusted, melancholic, surprised, calm]
        emo_bias = [0.9375, 0.875, 1.0, 1.0, 0.9375, 0.9375, 0.6875, 0.5625]
        emo_vector = [vec * bias for vec, bias in zip(emo_vector, emo_bias)]

    # Rescale proportionally only when the total exceeds the allowed cap;
    # an all-zero or small vector passes through unchanged.
    emo_sum = sum(emo_vector)
    if emo_sum > max_sum:
        scale_factor = max_sum / emo_sum
        emo_vector = [vec * scale_factor for vec in emo_vector]

    return emo_vector
# 原始推理模式
def infer(self, spk_audio_prompt, text, output_path,

View File

@@ -6,8 +6,6 @@ import time
import warnings
import numpy as np
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
@@ -104,14 +102,6 @@ with open("examples/cases.jsonl", "r", encoding="utf-8") as f:
example.get("emo_text") is not None]
)
def normalize_emo_vec(emo_vec):
    """Damp each emotion component, then cap the vector's total at 0.8.

    Returns a plain Python list; the input is not modified.
    """
    # Per-emotion damping factors chosen for a better user experience.
    damping = np.array([0.75, 0.70, 0.80, 0.80, 0.75, 0.75, 0.55, 0.45])
    scaled = damping * np.array(emo_vec)

    total = np.sum(scaled)
    if total > 0.8:
        # Rescale proportionally so the components sum to exactly 0.8.
        scaled = scaled * 0.8 / total

    return scaled.tolist()
def gen_single(emo_control_method,prompt, text,
emo_ref_path, emo_weight,
vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
@@ -145,7 +135,7 @@ def gen_single(emo_control_method,prompt, text,
pass
if emo_control_method == 2: # emotion from custom vectors
vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]
vec = normalize_emo_vec(vec)
vec = tts.normalize_emo_vec(vec, apply_bias=True)
else:
# don't use the emotion vector inputs for the other modes
vec = None