From 0828dcb098247760ba17ba5ad8a5b6b5cb460f95 Mon Sep 17 00:00:00 2001
From: nanaoto
Date: Fri, 12 Sep 2025 16:45:37 +0800
Subject: [PATCH] feat: truncate over-long input audio to 15s to reduce RAM and VRAM usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 indextts/infer_v2.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/indextts/infer_v2.py b/indextts/infer_v2.py
index 71c36e1..972339c 100644
--- a/indextts/infer_v2.py
+++ b/indextts/infer_v2.py
@@ -292,6 +292,20 @@ class IndexTTS2:
         if self.gr_progress is not None:
             self.gr_progress(value, desc=desc)
 
+    def _load_and_cut_audio(self, audio_path, max_audio_length_seconds, verbose=False, sr=None):
+        if sr is None:
+            audio, sr = librosa.load(audio_path)
+        else:
+            audio, _ = librosa.load(audio_path, sr=sr)
+        audio = torch.tensor(audio).unsqueeze(0)
+        max_audio_samples = int(max_audio_length_seconds * sr)
+
+        if audio.shape[1] > max_audio_samples:
+            if verbose:
+                print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples")
+            audio = audio[:, :max_audio_samples]
+        return audio, sr
+
     # Original inference mode
     def infer(self, spk_audio_prompt, text, output_path,
               emo_audio_prompt=None, emo_alpha=1.0,
@@ -340,8 +354,7 @@ class IndexTTS2:
 
         # Regenerate only when the reference audio changes, to improve speed
         if self.cache_spk_cond is None or self.cache_spk_audio_prompt != spk_audio_prompt:
-            audio, sr = librosa.load(spk_audio_prompt)
-            audio = torch.tensor(audio).unsqueeze(0)
+            audio, sr = self._load_and_cut_audio(spk_audio_prompt, 15, verbose)
             audio_22k = torchaudio.transforms.Resample(sr, 22050)(audio)
             audio_16k = torchaudio.transforms.Resample(sr, 16000)(audio)
 
@@ -392,7 +405,7 @@ class IndexTTS2:
             emovec_mat = emovec_mat.unsqueeze(0)
 
         if self.cache_emo_cond is None or self.cache_emo_audio_prompt != emo_audio_prompt:
-            emo_audio, _ = librosa.load(emo_audio_prompt, sr=16000)
+            emo_audio, _ = self._load_and_cut_audio(emo_audio_prompt, 15, verbose, sr=16000)
             emo_inputs = self.extract_features(emo_audio, sampling_rate=16000, return_tensors="pt")
             emo_input_features = emo_inputs["input_features"]
             emo_attention_mask = emo_inputs["attention_mask"]
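
Note: for reference, below is a minimal standalone sketch of the truncation helper this patch introduces, usable outside the IndexTTS2 class. It mirrors the diff's behavior (load with librosa, cap the waveform at max_audio_length_seconds, return a (1, N) tensor plus the sample rate); the helper name load_and_cut_audio and the file name prompt.wav in the usage lines are hypothetical.

import librosa
import torch
import torchaudio

def load_and_cut_audio(audio_path, max_audio_length_seconds, verbose=False, sr=None):
    # Load at librosa's default rate (22050 Hz) unless a target rate is given.
    if sr is None:
        audio, sr = librosa.load(audio_path)
    else:
        audio, _ = librosa.load(audio_path, sr=sr)
    audio = torch.tensor(audio).unsqueeze(0)  # shape (1, num_samples)
    max_audio_samples = int(max_audio_length_seconds * sr)
    # Keep only the first max_audio_length_seconds of the waveform.
    if audio.shape[1] > max_audio_samples:
        if verbose:
            print(f"Audio too long ({audio.shape[1]} samples), "
                  f"truncating to {max_audio_samples} samples")
        audio = audio[:, :max_audio_samples]
    return audio, sr

# Hypothetical usage, mirroring the call sites in infer():
audio, sr = load_and_cut_audio("prompt.wav", 15, verbose=True)
audio_16k = torchaudio.transforms.Resample(sr, 16000)(audio)

Capping the prompt at 15 seconds bounds the input to the downstream resampling and feature-extraction steps, which is the RAM/VRAM saving the commit subject refers to.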