"""Audio acquisition (§4.1).

Spark Control transcribes audio you fetch — this fetches it.

- Podcast enclosures: a plain streaming download that follows the
  Podtrac/Megaphone redirects to the final signed CDN object (download
  immediately; resolved URLs carry short-lived params).
- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces
  PO Tokens broadly — run the `bgutil-ytdlp-pot-provider` sidecar or pulls
  will 403. yt-dlp is treated as a LAST resort; prefer the RSS enclosure where
  a show publishes both (ToS: downloading YT audio violates YouTube ToS).
"""
from __future__ import annotations

import subprocess
from pathlib import Path

import requests

DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"


def download_enclosure(
    url: str,
    dest: str | Path,
    *,
    user_agent: str = DEFAULT_UA,
    timeout: int = 120,
) -> Path:
    """Stream a podcast enclosure to *dest*, following redirects.

    The body is written to a sibling ``<dest>.part`` file and moved onto
    *dest* only after the whole response has been consumed, so a failed or
    interrupted transfer never leaves a truncated file at the final path.

    Args:
        url: Enclosure URL (redirects are followed).
        dest: Final file path; parent directories are created as needed.
        user_agent: Value sent as the ``User-Agent`` request header.
        timeout: Passed straight to ``requests.get(timeout=...)``.

    Returns:
        *dest* as a ``Path``.

    Raises:
        requests.HTTPError: On a 4xx/5xx final response.
        requests.RequestException: On connection/stream failures.
        OSError: If the file cannot be written or moved into place.
    """
    dest = Path(dest)
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.parent / (dest.name + ".part")
    try:
        with requests.get(
            url,
            stream=True,
            allow_redirects=True,
            headers={"User-Agent": user_agent},
            timeout=timeout,
        ) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 16):
                    f.write(chunk)
        # Same-directory rename: replaces any existing dest in one step.
        partial.replace(dest)
    except BaseException:
        # Best-effort cleanup of the partial file; never mask the real error.
        try:
            partial.unlink()
        except OSError:
            pass
        raise
    return dest


def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
    """Normalize any audio to 16 kHz mono PCM WAV (what the ASR endpoint wants).

    Requires ``ffmpeg`` on PATH. An existing *dst* is overwritten (``-y``).

    Args:
        src: Input audio file in any format ffmpeg can decode.
        dst: Output WAV path; parent directories are created as needed.

    Returns:
        *dst* as a ``Path``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero (its captured
            stderr is available on the exception).
        FileNotFoundError: If the ``ffmpeg`` executable is not found.
    """
    dst = Path(dst)
    dst.parent.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["ffmpeg", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(dst)],
        check=True,
        capture_output=True,
    )
    return dst


def download_youtube_audio(
    url: str,
    out_dir: str | Path,
    *,
    archive_file: str | Path | None = None,
) -> Path:
    """Audio-only via yt-dlp → 16 kHz mono WAV.

    `archive_file` (yt-dlp --download-archive) is the canonical 'only-new'
    dedup for channel/playlist back-catalog pulls.

    The returned path is the one yt-dlp itself reports for the finished file
    (``--print after_move:filepath``), so unrelated WAVs already sitting in
    *out_dir* and yt-dlp's mtime handling cannot cause the wrong file to be
    picked. If several items are downloaded, the last reported one is
    returned. If yt-dlp reports nothing (e.g. everything was skipped via the
    archive), the function falls back to the newest ``*.wav`` in *out_dir* by
    mtime — note that this may be a file from an earlier run.

    Args:
        url: Video/playlist/channel URL handed to yt-dlp.
        out_dir: Directory for output files (created if missing); files are
            named ``<video id>.wav``.
        archive_file: Optional yt-dlp download-archive file.

    Returns:
        Path of a WAV file inside *out_dir*.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits non-zero.
        FileNotFoundError: If the ``yt-dlp`` executable is not found.
        RuntimeError: If no WAV can be found after a successful run.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    cmd = [
        "yt-dlp",
        "-f", "bestaudio/best",
        "-x", "--audio-format", "wav",
        "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
        "-o", str(out_dir / "%(id)s.%(ext)s"),
        "--no-progress",
        # Have yt-dlp print the final, post-processed path on stdout.
        "--print", "after_move:filepath",
    ]
    if archive_file:
        cmd += ["--download-archive", str(archive_file)]
    # "--" ends option parsing so a URL/ID starting with "-" is not read as a flag.
    cmd += ["--", url]
    proc = subprocess.run(cmd, check=True, capture_output=True, text=True, errors="replace")

    # Prefer what yt-dlp says it wrote; the last line is the most recent item.
    for line in reversed(proc.stdout.splitlines()):
        name = Path(line.strip()).name
        if not name:
            continue
        candidate = out_dir / name
        if candidate.is_file():
            return candidate

    # Nothing reported: fall back to the newest wav in the directory.
    newest = max(out_dir.glob("*.wav"), key=lambda p: p.stat().st_mtime, default=None)
    if newest is None:
        raise RuntimeError("yt-dlp produced no wav (PO-token/cookies issue? see module docstring)")
    return newest