62 lines
2.6 KiB
Python
62 lines
2.6 KiB
Python
"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
|
|
|
|
- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
|
|
final signed CDN object (download immediately; resolved URLs carry short-lived params).
|
|
- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
|
|
the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
|
|
the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
# Default value of the User-Agent request header sent by `download_enclosure`.
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
|
|
|
|
|
|
def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
    """Stream a podcast enclosure to ``dest``, following redirects.

    The response body is written to a sibling ``<name>.part`` file and moved
    onto ``dest`` only once the whole transfer has finished. A failed or
    interrupted download therefore never leaves a truncated file at ``dest``
    and never clobbers a previously downloaded copy.

    Args:
        url: Enclosure URL; HTTP redirects are followed.
        dest: Target file path. Missing parent directories are created.
        user_agent: Value sent in the ``User-Agent`` request header.
        timeout: Timeout value handed to ``requests.get``.

    Returns:
        ``dest`` as a ``Path``.

    Raises:
        requests.HTTPError: The final response carried a 4xx/5xx status.
        requests.RequestException: Any other failure raised by ``requests``.
        OSError: The partial file could not be written or moved into place.
    """
    dest = Path(dest)
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as dest so the final move stays on one filesystem.
    partial = dest.parent / f"{dest.name}.part"
    try:
        with requests.get(url, stream=True, allow_redirects=True,
                          headers={"User-Agent": user_agent}, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 16):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Publish only after the whole body has been written and closed.
        partial.replace(dest)
    finally:
        # No-op after a successful move; removes the leftover on any failure.
        partial.unlink(missing_ok=True)
    return dest
|
|
|
|
|
|
def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
    """Normalize any audio to 16 kHz mono PCM WAV (what the ASR endpoint wants). Requires ffmpeg.

    ffmpeg is run non-interactively: ``-nostdin`` plus a null stdin keep it
    from reading the parent's standard input, which could otherwise swallow
    piped data or stall when this runs inside a background/batch job.

    Args:
        src: Path of the input audio file handed to ``ffmpeg -i``.
        dst: Path of the WAV file to write; an existing file is overwritten
            (``-y``). Missing parent directories are created.

    Returns:
        ``dst`` as a ``Path``.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status;
            its captured output is available on the exception.
        FileNotFoundError: The ``ffmpeg`` executable could not be found.
    """
    dst = Path(dst)
    dst.parent.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["ffmpeg", "-nostdin", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(dst)],
        check=True, capture_output=True, stdin=subprocess.DEVNULL,
    )
    return dst
|
|
|
|
|
|
def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
    """Audio-only via yt-dlp → 16 kHz mono WAV. `archive_file` (yt-dlp --download-archive) is the
    canonical 'only-new' dedup for channel/playlist back-catalog pulls.

    The returned path is the one yt-dlp itself reports for this run
    (``--print after_move:filepath``) rather than "the newest ``*.wav`` in
    ``out_dir``": file mtimes may reflect the upload date instead of the
    download time, and when the archive skips the video nothing new is written
    at all, so an mtime scan can hand back an unrelated, stale file.

    Args:
        url: Video, playlist or channel URL passed to yt-dlp.
        out_dir: Directory that receives ``<video id>.wav``; created if missing.
        archive_file: Optional yt-dlp download-archive file; entries already
            listed there are skipped by yt-dlp.

    Returns:
        Path (under ``out_dir``) of the last WAV yt-dlp finished in this run.

    Raises:
        subprocess.CalledProcessError: yt-dlp exited with a non-zero status;
            its captured output is available on the exception.
        RuntimeError: yt-dlp succeeded but produced no WAV in this run (for
            example, every entry was already in the archive).
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    cmd = [
        "yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
        "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
        "-o", str(out_dir / "%(id)s.%(ext)s"),
        "--no-progress",
        # Emit the final post-processed path of every finished item on stdout.
        # --no-simulate is explicit because --print can otherwise imply --simulate.
        "--no-simulate", "--print", "after_move:filepath",
    ]
    if archive_file:
        cmd += ["--download-archive", str(archive_file)]
    # "--" ends option parsing so a URL/ID starting with "-" is never read as a flag.
    cmd += ["--", url]
    proc = subprocess.run(cmd, check=True, capture_output=True)

    produced = []
    for line in proc.stdout.decode("utf-8", errors="replace").splitlines():
        name = Path(line.strip()).name
        if not name:
            continue
        # Re-anchor on out_dir: only the file name is taken from yt-dlp's output,
        # so the result keeps the caller's spelling of the directory.
        candidate = out_dir / name
        if candidate.suffix == ".wav" and candidate.is_file():
            produced.append(candidate)
    if not produced:
        raise RuntimeError(
            "yt-dlp produced no wav (already in archive, or PO-token/cookies issue? see module docstring)"
        )
    return produced[-1]
|