Files
ten31-signal-engine/signal_engine/ingest/download.py
T

62 lines
2.6 KiB
Python

"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
final signed CDN object (download immediately; resolved URLs carry short-lived params).
- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
"""
from __future__ import annotations
import subprocess
from pathlib import Path
import requests
# Default User-Agent header value for enclosure downloads; callers can override it per call
# through the `user_agent` keyword of `download_enclosure`.
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
    """Stream a podcast enclosure to ``dest`` and return the destination path.

    Redirects are followed, so tracking-prefix URLs resolve to the final object.
    The body is first written to a sibling ``<name>.part`` file and only moved
    onto ``dest`` once the whole response has been received. An interrupted
    transfer therefore never leaves a truncated file at ``dest`` and never
    clobbers a previously complete download.

    Args:
        url: Enclosure URL to fetch.
        dest: Target file path; missing parent directories are created.
        user_agent: Value sent as the ``User-Agent`` request header.
        timeout: Passed to ``requests.get`` as its ``timeout`` (seconds).

    Returns:
        ``dest`` as a ``Path``.

    Raises:
        requests.HTTPError: The final response carried a 4xx/5xx status.
        requests.RequestException: Connection or read failure while fetching.
        OSError: The file could not be written or moved into place.
    """
    dest = Path(dest)
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as dest, so the final rename stays on one filesystem.
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, allow_redirects=True,
                          headers={"User-Agent": user_agent}, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 16):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Only a fully received body is ever published under the final name.
        partial.replace(dest)
    except BaseException:
        # Covers HTTP errors, network drops and Ctrl-C alike: drop the partial, then re-raise.
        partial.unlink(missing_ok=True)
        raise
    return dest
def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
    """Transcode ``src`` to a 16 kHz, single-channel WAV at ``dst`` via ffmpeg.

    This is the format the ASR endpoint expects. ``ffmpeg`` must be on PATH.

    Args:
        src: Input audio file (any format ffmpeg can read).
        dst: Output WAV path; missing parent directories are created and an
            existing file is overwritten (``-y``).

    Returns:
        ``dst`` as a ``Path``.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited non-zero; its captured
            stdout/stderr are attached to the exception.
        FileNotFoundError: The ``ffmpeg`` executable could not be found.
    """
    out_path = Path(dst)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    ffmpeg_argv = ["ffmpeg", "-y", "-i", str(src)]
    ffmpeg_argv += ["-ar", "16000", "-ac", "1"]  # resample to 16 kHz, downmix to mono
    ffmpeg_argv += ["-f", "wav", str(out_path)]

    # Output is captured rather than streamed so ffmpeg's banner/log stays off the console.
    subprocess.run(ffmpeg_argv, capture_output=True, check=True)
    return out_path
def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
    """Fetch audio via yt-dlp as a 16 kHz mono WAV and return the file this run wrote.

    Files are named ``<video id>.wav`` inside ``out_dir``. Only a wav that was
    created or rewritten by this invocation is ever returned; a file left over
    from an earlier pull is never reported as the result of this one.

    Args:
        url: Video (or channel/playlist) URL handed to yt-dlp.
        out_dir: Directory for the output; created if missing.
        archive_file: Optional yt-dlp ``--download-archive`` file, the canonical
            'only-new' dedup for channel/playlist back-catalog pulls.

    Returns:
        Path of the most recently written wav produced by this run.

    Raises:
        subprocess.CalledProcessError: yt-dlp exited non-zero.
        FileNotFoundError: The ``yt-dlp`` executable could not be found.
        RuntimeError: yt-dlp exited cleanly but wrote no wav (e.g. every entry
            was skipped via the archive, or a PO-token/cookies problem).
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    # Snapshot what is already there so stale files can be told apart from new output.
    before = {p: p.stat().st_mtime_ns for p in out_dir.glob("*.wav")}
    cmd = [
        "yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
        "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
        "-o", str(out_dir / "%(id)s.%(ext)s"),
        "--no-progress",
        # With --mtime (the default in older yt-dlp releases) files are stamped with the
        # server's Last-Modified time, which would defeat the mtime comparison below.
        "--no-mtime",
    ]
    if archive_file:
        cmd += ["--download-archive", str(archive_file)]
    # "--" ends option parsing so a URL/ID starting with "-" cannot be read as a flag.
    cmd += ["--", url]
    subprocess.run(cmd, check=True, capture_output=True)
    # yt-dlp names the file by video id; keep only wavs that are new or were rewritten.
    fresh = [
        p for p in out_dir.glob("*.wav")
        if before.get(p) != p.stat().st_mtime_ns
    ]
    if not fresh:
        if archive_file and before:
            raise RuntimeError(
                "yt-dlp wrote no new wav (already recorded in the download archive?)"
            )
        raise RuntimeError("yt-dlp produced no wav (PO-token/cookies issue? see module docstring)")
    return sorted(fresh, key=lambda p: p.stat().st_mtime)[-1]