From 2cfc76ad9c49b4376e3d10a5bfeafbad9e5b5cd3 Mon Sep 17 00:00:00 2001
From: LGZwr
Date: Tue, 19 Aug 2025 16:53:29 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=A0=B7=E6=9C=AC?=
 =?UTF-8?q?=E9=9F=B3=E9=A2=91=E5=A4=AA=E9=95=BF=E6=8A=A5=E9=94=99=E7=9A=84?=
 =?UTF-8?q?=E9=97=AE=E9=A2=98=EF=BC=8C=E5=AF=B9=E9=9F=B3=E9=A2=91=E8=BF=9B?=
 =?UTF-8?q?=E8=A1=8C=E8=A3=81=E5=88=87=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 indextts/infer.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/indextts/infer.py b/indextts/infer.py
index f137f25..2cdd9ec 100644
--- a/indextts/infer.py
+++ b/indextts/infer.py
@@ -305,6 +305,15 @@ class IndexTTS:
             if audio.shape[0] > 1:
                 audio = audio[0].unsqueeze(0)
             audio = torchaudio.transforms.Resample(sr, 24000)(audio)
+
+            max_audio_length_seconds = 50
+            max_audio_samples = int(max_audio_length_seconds * 24000)
+
+            if audio.shape[1] > max_audio_samples:
+                if verbose:
+                    print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples")
+                audio = audio[:, :max_audio_samples]
+
             cond_mel = MelSpectrogramFeatures()(audio).to(self.device)
             cond_mel_frame = cond_mel.shape[-1]
             if verbose: