feat: general subtitle generator interface

This commit is contained in:
Xinrea
2025-03-27 09:24:39 +08:00
parent dc44327d4c
commit 54ba265510
8 changed files with 294 additions and 3 deletions

4
.gitignore vendored
View File

@@ -22,3 +22,7 @@ dist-ssr
*.njsproj
*.sln
*.sw?
# test files
src-tauri/tests/audio/*.srt
src-tauri/tests/model/*.bin

117
src-tauri/Cargo.lock generated
View File

@@ -415,6 +415,7 @@ dependencies = [
"felgens",
"ffmpeg-sidecar",
"futures",
"hound",
"hyper 0.14.25",
"log",
"m3u8-rs",
@@ -446,6 +447,27 @@ dependencies = [
"tokio",
"toml 0.7.3",
"urlencoding",
"whisper-rs",
]
[[package]]
name = "bindgen"
version = "0.71.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
dependencies = [
"bitflags 2.6.0",
"cexpr",
"clang-sys",
"itertools",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn 2.0.87",
]
[[package]]
@@ -652,6 +674,15 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
[[package]]
name = "cexpr"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
]
[[package]]
name = "cfb"
version = "0.7.3"
@@ -700,6 +731,26 @@ dependencies = [
"winapi",
]
[[package]]
name = "clang-sys"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
dependencies = [
"glob",
"libc",
"libloading 0.8.6",
]
[[package]]
name = "cmake"
version = "0.1.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0"
dependencies = [
"cc",
]
[[package]]
name = "cocoa"
version = "0.26.0"
@@ -1205,7 +1256,7 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "330c60081dcc4c72131f8eb70510f1ac07223e5d4163db481a04a0befcffa412"
dependencies = [
"libloading",
"libloading 0.8.6",
]
[[package]]
@@ -1550,6 +1601,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "futf"
version = "0.1.5"
@@ -2135,6 +2192,12 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "hound"
version = "3.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
[[package]]
name = "html5ever"
version = "0.26.0"
@@ -2470,6 +2533,15 @@ dependencies = [
"once_cell",
]
[[package]]
name = "itertools"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "0.4.8"
@@ -2621,7 +2693,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e9ec52138abedcc58dc17a7c6c0c00a2bdb4f3427c7f63fa97fd0d859155caf"
dependencies = [
"gtk-sys",
"libloading",
"libloading 0.7.4",
"once_cell",
]
@@ -2641,6 +2713,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "libloading"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
dependencies = [
"cfg-if",
"windows-targets 0.52.6",
]
[[package]]
name = "libm"
version = "0.2.8"
@@ -3711,6 +3793,16 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "prettyplease"
version = "0.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033"
dependencies = [
"proc-macro2",
"syn 2.0.87",
]
[[package]]
name = "proc-macro-crate"
version = "1.3.1"
@@ -6441,6 +6533,27 @@ dependencies = [
"windows-core 0.58.0",
]
[[package]]
name = "whisper-rs"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b6dca51a101d32fa551d66d34fef899a39d5c8b68b6ea5adf4080b9ec37bb58"
dependencies = [
"whisper-rs-sys",
]
[[package]]
name = "whisper-rs-sys"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7c6b8157262ff9e4239549db921ed40ba758e03f565893d4e700380286c643b"
dependencies = [
"bindgen",
"cfg-if",
"cmake",
"fs_extra",
]
[[package]]
name = "whoami"
version = "1.5.2"

View File

@@ -28,7 +28,7 @@ toml = "0.7.3"
custom_error = "1.9.2"
felgens = { git = "https://github.com/Xinrea/felgens.git", tag = "v0.4.1" }
regex = "1.7.3"
tokio = "1.27.0"
tokio = { version = "1.27.0", features = ["process"] }
platform-dirs = "0.3.0"
pct-str = "1.2.0"
md5 = "0.7.0"
@@ -51,6 +51,8 @@ rand = "0.8.5"
base64 = "0.21"
mime_guess = "2.0"
async-trait = "0.1.87"
whisper-rs = "0.14.2"
hound = "3.5.1"
[features]
# this feature is used for production builds or when `devPath` points to the filesystem
@@ -59,3 +61,11 @@ custom-protocol = ["tauri/custom-protocol"]
[target.'cfg(not(any(target_os = "android", target_os = "ios")))'.dependencies]
tauri-plugin-single-instance = "2"
[target.'cfg(windows)'.dependencies.whisper-rs]
version = "0.14.2"
features = ["cuda"]
# `darwin` is not a cfg name that Cargo/rustc defines, so `cfg(darwin)` never
# matches any target; macOS is selected with `target_os = "macos"`.
[target.'cfg(target_os = "macos")'.dependencies.whisper-rs]
version = "0.14.2"
features = ["metal"]

View File

@@ -9,6 +9,7 @@ mod progress_event;
mod recorder;
mod recorder_manager;
mod state;
mod subtitle_generator;
mod tray;
use config::Config;

View File

@@ -0,0 +1,29 @@
use async_std::path::{Path, PathBuf};
use async_trait::async_trait;
pub mod whisper;
// subtitle_generator types
/// Identifies which subtitle-generation backend to use.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SubtitleGeneratorType {
    /// Backend implemented in the `whisper` submodule.
    Whisper,
}

impl SubtitleGeneratorType {
    /// Returns the lowercase identifier of this backend (`"whisper"`).
    ///
    /// This is the inverse of [`SubtitleGeneratorType::from_str`].
    pub fn as_str(&self) -> &'static str {
        match self {
            SubtitleGeneratorType::Whisper => "whisper",
        }
    }

    /// Parses an identifier as produced by [`SubtitleGeneratorType::as_str`].
    ///
    /// Matching is exact and case-sensitive; any other input yields `None`.
    // Kept as an inherent method returning `Option` so existing callers keep
    // working; `std::str::FromStr` would force a `Result` return type.
    #[allow(clippy::should_implement_trait)]
    pub fn from_str(s: &str) -> Option<Self> {
        match s {
            "whisper" => Some(SubtitleGeneratorType::Whisper),
            _ => None,
        }
    }
}
/// Common asynchronous interface for subtitle-generation backends.
///
/// The `whisper` submodule provides an implementation (`WhisperCPP`).
#[async_trait]
pub trait SubtitleGenerator {
/// Generates subtitles for the input at `video_path` and stores them at `output_path`.
///
/// Returns `Ok(())` on success, or a human-readable error message on failure.
///
/// NOTE(review): the `whisper` implementation names this parameter `audio_path`
/// and opens it as a WAV file — confirm whether implementors are expected to
/// accept video containers or pre-extracted audio.
async fn generate_subtitle(&self, video_path: &Path, output_path: &Path) -> Result<(), String>;
}

View File

@@ -0,0 +1,134 @@
use async_trait::async_trait;
use async_std::path::{Path, PathBuf};
use async_std::sync::{Arc, RwLock};
use tokio::io::AsyncWriteExt;
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
use super::SubtitleGenerator;
/// Subtitle generator backed by a whisper-rs `WhisperContext`.
///
/// Both fields are `Arc` handles, so cloning this value yields another handle
/// to the same context and model path rather than loading the model again.
#[derive(Clone)]
pub struct WhisperCPP {
/// Loaded model context; `generate_subtitle` takes a read lock on it to create
/// a decoding state.
ctx: Arc<RwLock<WhisperContext>>,
/// Path the model was loaded from, as recorded by `new`. It is not read by any
/// code visible in this module.
model_path: Arc<RwLock<PathBuf>>,
}
/// Loads a Whisper model from `model` and wraps it in a [`WhisperCPP`] generator.
///
/// # Errors
///
/// Returns a message describing the failure when `model` is not valid UTF-8
/// (the whisper-rs loader takes the path as `&str`) or when the model cannot
/// be loaded. Earlier versions panicked in both cases.
pub async fn new(model: &Path) -> Result<WhisperCPP, String> {
    let model_str = model
        .to_str()
        .ok_or_else(|| format!("model path is not valid UTF-8: {}", model.display()))?;
    let ctx = WhisperContext::new_with_params(model_str, WhisperContextParameters::default())
        .map_err(|e| format!("failed to load model {}: {}", model_str, e))?;
    Ok(WhisperCPP {
        ctx: Arc::new(RwLock::new(ctx)),
        model_path: Arc::new(RwLock::new(model.to_path_buf())),
    })
}
#[async_trait]
impl SubtitleGenerator for WhisperCPP {
    /// Transcribes the WAV file at `audio_path` with the loaded model and writes
    /// the recognised segments to `output_path` in SubRip (`.srt`) format.
    ///
    /// Mono input is used as-is; stereo input is down-mixed to mono first.
    ///
    /// # Errors
    ///
    /// Returns a message describing the failure when the audio file cannot be
    /// opened or read as `i16` samples, when it has a channel count other than
    /// one or two, when the model fails to run, or when the output file cannot
    /// be created or written.
    async fn generate_subtitle(&self, audio_path: &Path, output_path: &Path) -> Result<(), String> {
        // NOTE(review): decoding and inference below run synchronously inside this
        // async fn; consider `spawn_blocking` if this stalls the runtime.
        let samples = load_mono_samples(audio_path)?;

        // The read guard is a temporary of this statement, so the lock is released
        // as soon as the state has been created.
        let mut state = self
            .ctx
            .read()
            .await
            .create_state()
            .map_err(|e| format!("failed to create state: {}", e))?;

        let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
        // `None` leaves the language on auto.
        params.set_language(None);
        // Explicitly disable anything that prints to stdout.
        params.set_print_special(false);
        params.set_print_progress(false);
        params.set_print_realtime(false);
        params.set_print_timestamps(false);
        params.set_token_timestamps(true);

        state
            .full(params, &samples)
            .map_err(|e| format!("failed to run model: {}", e))?;

        // Render the whole document in memory so the file is written in one go.
        let num_segments = state
            .full_n_segments()
            .map_err(|e| format!("failed to get number of segments: {}", e))?;
        let mut srt = String::new();
        for i in 0..num_segments {
            let text = state
                .full_get_segment_text(i)
                .map_err(|e| format!("failed to get segment: {}", e))?;
            let start = state
                .full_get_segment_t0(i)
                .map_err(|e| format!("failed to get segment start timestamp: {}", e))?;
            let end = state
                .full_get_segment_t1(i)
                .map_err(|e| format!("failed to get segment end timestamp: {}", e))?;
            srt.push_str(&format!(
                "{}\n{} --> {}\n{}\n\n",
                i + 1,
                format_srt_timestamp(start),
                format_srt_timestamp(end),
                text,
            ));
        }

        let mut output_file = tokio::fs::File::create(output_path)
            .await
            .map_err(|e| format!("failed to create output file: {}", e))?;
        output_file
            .write_all(srt.as_bytes())
            .await
            .map_err(|e| format!("failed to write to output file: {}", e))?;
        // Flush before the handle is dropped so a late write error is reported
        // instead of being lost.
        output_file
            .flush()
            .await
            .map_err(|e| format!("failed to flush output file: {}", e))?;
        Ok(())
    }
}

/// Reads the WAV file at `audio_path` and returns its samples as mono `f32` audio.
///
/// NOTE(review): the sample rate is not checked here; whisper presumably expects
/// 16 kHz input — confirm that callers resample beforehand.
fn load_mono_samples(audio_path: &Path) -> Result<Vec<f32>, String> {
    let reader = hound::WavReader::open(audio_path)
        .map_err(|e| format!("failed to open audio file: {}", e))?;
    let channels = reader.spec().channels;
    let pcm = reader
        .into_samples::<i16>()
        .collect::<Result<Vec<i16>, _>>()
        .map_err(|e| format!("failed to read audio samples: {}", e))?;

    let mut float_samples = vec![0.0_f32; pcm.len()];
    whisper_rs::convert_integer_to_float_audio(&pcm, &mut float_samples)
        .map_err(|e| format!("failed to convert audio data: {}", e))?;

    // Only interleaved stereo needs down-mixing; running the stereo conversion on
    // mono data would merge neighbouring samples and corrupt the audio.
    match channels {
        1 => Ok(float_samples),
        2 => whisper_rs::convert_stereo_to_mono_audio(&float_samples)
            .map_err(|e| format!("failed to convert audio data: {}", e)),
        other => Err(format!("unsupported channel count: {}", other)),
    }
}

/// Formats a timestamp given in hundredths of a second as an SRT timecode
/// (`HH:MM:SS,mmm`). Negative inputs are clamped to zero.
fn format_srt_timestamp(centiseconds: i64) -> String {
    // Integer arithmetic avoids float rounding; SRT separates milliseconds with a comma.
    let total_ms = centiseconds.max(0).saturating_mul(10);
    let hours = total_ms / 3_600_000;
    let minutes = total_ms / 60_000 % 60;
    let seconds = total_ms / 1_000 % 60;
    let millis = total_ms % 1_000;
    format!("{:02}:{:02}:{:02},{:03}", hours, minutes, seconds, millis)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Loading the tiny model from the local test fixtures should succeed.
    // Ignored by default: the model binary is not checked into the repository.
    #[tokio::test]
    #[ignore = "need tiny model file"]
    async fn create_whisper_cpp() {
        let result = new(Path::new("tests/model/ggml-model-whisper-tiny.bin")).await;
        assert!(result.is_ok(), "failed to load model: {:?}", result.err());
    }

    /// End-to-end run: transcribes the sample WAV and writes an SRT file next to it.
    #[tokio::test]
    #[ignore = "need large model"]
    async fn process_by_whisper_cpp() {
        let whisper = new(Path::new("tests/model/ggml-model-whisper-large-q5_0.bin"))
            .await
            .unwrap();
        let audio_path = Path::new("tests/audio/test.wav");
        let output_path = Path::new("tests/audio/test.srt");
        let result = whisper.generate_subtitle(audio_path, output_path).await;
        assert!(result.is_ok(), "failed to generate subtitle: {:?}", result.err());
    }
}

Binary file not shown.

View File