Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
|
||||
|
||||
- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
|
||||
final signed CDN object (download immediately; resolved URLs carry short-lived params).
|
||||
- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
|
||||
the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
|
||||
the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Default User-Agent header value; used as the default for download_enclosure's `user_agent` parameter.
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
|
||||
|
||||
|
||||
def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
    """Stream the resource at ``url`` to ``dest`` and return the destination path.

    Redirects are followed, so a tracking-prefix URL resolves to whatever object it finally points
    at. The body is streamed in 64 KiB chunks into a sibling ``<name>.part`` file, which is renamed
    over ``dest`` only after the whole response has been written. A failed or interrupted transfer
    therefore never leaves a truncated file at ``dest`` and never clobbers an existing good one.

    Args:
        url: Enclosure URL to fetch.
        dest: Target file path; missing parent directories are created.
        user_agent: Value sent as the ``User-Agent`` request header.
        timeout: Forwarded to ``requests.get`` as its ``timeout`` argument (seconds).

    Returns:
        ``dest`` as a ``Path``.

    Raises:
        requests.HTTPError: The final response has a 4xx/5xx status.
        requests.RequestException: Connection, redirect, timeout or read failure.
        OSError: The file could not be written or renamed.
    """
    dest = Path(dest)
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, allow_redirects=True,
                          headers={"User-Agent": user_agent}, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 16):
                    f.write(chunk)
        # Publish only a complete download; the rename replaces any previous file at ``dest``.
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover fragment on any failure.
        partial.unlink(missing_ok=True)
    return dest
|
||||
|
||||
|
||||
def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
    """Transcode ``src`` with ffmpeg into a 16 kHz, single-channel WAV at ``dst`` (the ASR input format).

    The parent directory of ``dst`` is created when missing and an existing ``dst`` is overwritten
    (ffmpeg ``-y``). ffmpeg must be on ``PATH``; its output is captured rather than echoed.

    Args:
        src: Input audio file, in any format ffmpeg can read.
        dst: Output WAV path.

    Returns:
        ``dst`` as a ``Path``.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        FileNotFoundError: The ``ffmpeg`` executable could not be found.
    """
    target = Path(dst)
    target.parent.mkdir(parents=True, exist_ok=True)

    resample_opts = ["-ar", "16000", "-ac", "1"]  # 16 kHz sample rate, one channel
    argv = ["ffmpeg", "-y", "-i", str(src), *resample_opts, "-f", "wav", str(target)]
    subprocess.run(argv, check=True, capture_output=True)
    return target
|
||||
|
||||
|
||||
def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
    """Audio-only via yt-dlp → 16 kHz mono WAV, returning the file yt-dlp wrote during this call.

    `archive_file` (yt-dlp --download-archive) is the canonical 'only-new' dedup for
    channel/playlist back-catalog pulls.

    The result is taken from yt-dlp's own report of the final, post-processed path
    (``--print after_move:filepath``) rather than by guessing the newest ``*.wav`` in ``out_dir``.
    File mtimes are not a reliable "just downloaded" signal, and when the archive makes yt-dlp skip
    the video nothing new is written at all, so a directory scan would return an unrelated file.

    Args:
        url: Video (or channel/playlist) URL handed to yt-dlp.
        out_dir: Directory that receives ``<video id>.wav``; created when missing.
        archive_file: Optional yt-dlp download-archive file.

    Returns:
        Path of the WAV written by this call; for a multi-entry URL, the last one yt-dlp reported.

    Raises:
        subprocess.CalledProcessError: yt-dlp exited with a non-zero status.
        FileNotFoundError: The ``yt-dlp`` executable could not be found.
        RuntimeError: yt-dlp succeeded but wrote no WAV during this call.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    cmd = [
        "yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
        "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
        "-o", str(out_dir / "%(id)s.%(ext)s"),
        "--no-progress",
        # Print the final path of every file produced; --no-simulate keeps the download real
        # even though --print is in use.
        "--no-simulate", "--print", "after_move:filepath",
    ]
    if archive_file:
        cmd += ["--download-archive", str(archive_file)]
    # "--" ends option parsing so a URL beginning with "-" cannot be read as a yt-dlp flag.
    cmd += ["--", url]
    proc = subprocess.run(cmd, check=True, capture_output=True)

    reported = proc.stdout.decode("utf-8", errors="replace").splitlines()
    # The output template puts every file directly in out_dir, so only the file name is needed.
    candidates = [out_dir / Path(line.strip()).name for line in reported if line.strip()]
    wavs = [p for p in candidates if p.suffix == ".wav" and p.is_file()]
    if not wavs:
        raise RuntimeError(
            "yt-dlp produced no wav (PO-token/cookies issue? see module docstring); "
            "with --download-archive this also happens when the video was already downloaded"
        )
    return wavs[-1]
|
||||
Reference in New Issue
Block a user