Files

37 lines
1.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Long-audio chunking (§4.1, §13.4).
Podcasts run 1-3 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut
long audio into ~2-3 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is
diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py).
Requires ffmpeg/ffprobe.
"""
from __future__ import annotations
import subprocess
from pathlib import Path
# Default chunk length in seconds, used by chunk_audio() when the caller does not pass one.
CHUNK_SECONDS_DEFAULT = 150 # 2.5 min, within the ~2-3 min guidance
def duration_seconds(src: str | Path) -> float:
    """Return the duration of *src* in seconds, as reported by ffprobe.

    ffprobe is asked for the container-level ``format=duration`` entry only,
    printed as a bare value, so its stdout is just the number.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
        ValueError: if ffprobe's output cannot be parsed as a float.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        str(src),
    ]
    probe = subprocess.run(probe_cmd, check=True, capture_output=True, text=True)
    seconds_text = probe.stdout.strip()
    return float(seconds_text)
def chunk_audio(src: str | Path, out_dir: str | Path, *, chunk_seconds: int = CHUNK_SECONDS_DEFAULT) -> list[Path]:
    """Split *src* into fixed-length 16 kHz mono WAV chunks with ffmpeg's segment muxer.

    Chunks are written to ``out_dir`` as ``chunk_NNNN.wav`` and returned in
    index order. Order matters: the queue sends them sequentially.

    Numbered ``chunk_<digits>.wav`` files already present in ``out_dir`` are
    deleted before ffmpeg runs. ffmpeg (``-y``) would overwrite the low indices
    anyway, but leftovers from an earlier, longer run would otherwise survive
    and be returned as if they belonged to this source.

    Args:
        src: Input audio file, in any format ffmpeg can read.
        out_dir: Directory for the chunks; created (with parents) if missing.
        chunk_seconds: Target length of each chunk in seconds; must be positive.

    Returns:
        Paths of the chunks written by this call, sorted by chunk index.

    Raises:
        ValueError: if ``chunk_seconds`` is not positive.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    # Fail fast, before touching the filesystem or spawning ffmpeg.
    if chunk_seconds <= 0:
        raise ValueError(f"chunk_seconds must be positive, got {chunk_seconds!r}")

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    def numbered_chunks() -> list[tuple[int, Path]]:
        """Return (index, path) for each chunk_<digits>.wav file in out_dir, sorted by index."""
        prefix = "chunk_"
        found = []
        for path in out_dir.glob("chunk_*.wav"):
            digits = path.stem[len(prefix):]
            if digits.isdecimal() and path.is_file():
                found.append((int(digits), path))
        # Numeric sort: a plain path sort would put chunk_10000.wav before
        # chunk_1001.wav once the index outgrows the %04d padding.
        return sorted(found)

    # Drop stale output so the result reflects only this run.
    for _, stale in numbered_chunks():
        stale.unlink()

    pattern = str(out_dir / "chunk_%04d.wav")
    subprocess.run(
        [
            "ffmpeg", "-y", "-i", str(src),
            "-f", "segment", "-segment_time", str(chunk_seconds),
            "-ar", "16000", "-ac", "1",
            "-reset_timestamps", "1",
            pattern,
        ],
        check=True,
        capture_output=True,
        # ffmpeg reads stdin for interactive commands; keep it off the caller's stdin.
        stdin=subprocess.DEVNULL,
    )
    return [path for _, path in numbered_chunks()]