Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
"""Long-audio chunking (§4.1, §13.4).
|
||||
|
||||
Podcasts run 1–3 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut
|
||||
long audio into ~2–3 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is
|
||||
diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py).
|
||||
Requires ffmpeg/ffprobe.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
# Default chunk length in seconds: 150 s = 2.5 min, inside the ~2–3 min guidance.
CHUNK_SECONDS_DEFAULT = 150
|
||||
|
||||
|
||||
def duration_seconds(src: str | Path) -> float:
|
||||
out = subprocess.run(
|
||||
["ffprobe", "-v", "error", "-show_entries", "format=duration",
|
||||
"-of", "default=noprint_wrappers=1:nokey=1", str(src)],
|
||||
check=True, capture_output=True, text=True,
|
||||
)
|
||||
return float(out.stdout.strip())
|
||||
|
||||
|
||||
def chunk_audio(
    src: str | Path,
    out_dir: str | Path,
    *,
    chunk_seconds: int = CHUNK_SECONDS_DEFAULT,
) -> list[Path]:
    """Split ``src`` into fixed-length WAV chunks using ffmpeg's segment muxer.

    Chunks are written to ``out_dir`` as ``chunk_0000.wav``, ``chunk_0001.wav``, ...
    resampled to 16 kHz mono, with timestamps reset at the start of every chunk.
    Any ``chunk_*.wav`` files already present in ``out_dir`` are removed first so
    that the returned list contains only chunks cut from ``src``.

    Args:
        src: Input audio file.
        out_dir: Directory to write chunks into; created if it does not exist.
        chunk_seconds: Target length of each chunk in seconds; must be positive.

    Returns:
        Chunk paths in playback order. Order matters: the queue sends them
        sequentially.

    Raises:
        ValueError: If ``chunk_seconds`` is not positive.
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if chunk_seconds <= 0:
        raise ValueError(f"chunk_seconds must be positive, got {chunk_seconds!r}")

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # "-y" only overwrites indices this run reaches. Leftovers from an earlier,
    # longer input would otherwise be globbed below and returned as if they
    # belonged to ``src``.
    for stale in out_dir.glob("chunk_*.wav"):
        if stale.is_file():
            stale.unlink()

    pattern = str(out_dir / "chunk_%04d.wav")
    subprocess.run(
        [
            # -nostdin: never read the parent's stdin (ffmpeg is interactive by
            # default, which can swallow input or hang in a background worker).
            "ffmpeg", "-nostdin", "-y",
            "-i", str(src),
            "-f", "segment", "-segment_time", str(chunk_seconds),
            "-ar", "16000", "-ac", "1",
            "-reset_timestamps", "1",
            pattern,
        ],
        check=True,
        capture_output=True,
    )

    # %04d widens past 9999, so sort by (length, name): identical to a plain sort
    # for 4-digit indices, and still numeric order once the index grows a digit.
    return sorted(out_dir.glob("chunk_*.wav"), key=lambda p: (len(p.name), p.name))
|
||||
Reference in New Issue
Block a user