"""Load the source registry (companies + podcasts, §7.3/§7.4) into SQLite. Idempotent upsert.""" from __future__ import annotations import sqlite3 from pathlib import Path from typing import Any import yaml _COLS = ("source_id", "name", "kind", "source_cluster", "role", "rss_url", "channel_url", "ticker", "cluster_capped_low", "own_network", "backtest_2022_2023", "notes") def _row(s: dict[str, Any]) -> dict[str, Any]: return { "source_id": s["id"], "name": s["name"], "kind": s["kind"], "source_cluster": s.get("cluster"), "role": s.get("role", "none"), "rss_url": s.get("rss_url"), "channel_url": s.get("channel_url"), "ticker": s.get("ticker"), "cluster_capped_low": 1 if s.get("cluster_capped_low") else 0, "own_network": 1 if s.get("own_network") else 0, "backtest_2022_2023": s.get("backtest_2022_2023"), "notes": s.get("notes"), } def update_feeds(conn: sqlite3.Connection, path: Path) -> int: """Apply resolved/verified podcast feed URLs + backtest-reach to existing source rows.""" try: conn.execute("ALTER TABLE sources ADD COLUMN backtest_2022_2023 TEXT") conn.commit() except sqlite3.OperationalError: pass # column already exists data = yaml.safe_load(Path(path).read_text()) or {} rows = data.get("feeds", []) for f in rows: conn.execute( """UPDATE sources SET rss_url=:rss_url, channel_url=:youtube_channel_url, backtest_2022_2023=:backtest_2022_2023, notes=COALESCE(:note, notes) WHERE source_id=:id""", { "id": f["id"], "rss_url": f.get("rss_url"), "youtube_channel_url": f.get("youtube_channel_url"), "backtest_2022_2023": f.get("backtest_2022_2023"), "note": f.get("note"), }, ) conn.commit() return len(rows) def load_source_edges(conn: sqlite3.Connection, path: Path) -> int: """Seed EISC connectedness edges (priors) idempotently. Stores src_a,src_b in sorted order to match the transcribe_worker's convention (sorted([a,b]) + ON CONFLICT weight+=1) so real detections accumulate on the same PK instead of creating a reversed duplicate. DO NOTHING on conflict → a re-run won't inflate, and won't clobber a stronger auto-detected weight.""" data = yaml.safe_load(Path(path).read_text()) or {} rows = data.get("edges", []) applied = 0 for e in rows: a, b = sorted([e["a"], e["b"]]) cur = conn.execute( """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence) VALUES (?,?,?,?,?) ON CONFLICT(src_a, src_b, edge_type) DO NOTHING""", (a, b, e["type"], float(e.get("weight", 1.0)), e.get("evidence")), ) applied += cur.rowcount conn.commit() return applied def load_sources(conn: sqlite3.Connection, path: Path) -> int: data = yaml.safe_load(Path(path).read_text()) or {} rows = data.get("sources", []) cols = ", ".join(_COLS) placeholders = ", ".join(f":{c}" for c in _COLS) updates = ", ".join(f"{c}=excluded.{c}" for c in _COLS if c != "source_id") sql = ( f"INSERT INTO sources ({cols}, created_at) VALUES ({placeholders}, datetime('now')) " f"ON CONFLICT(source_id) DO UPDATE SET {updates}" ) for s in rows: conn.execute(sql, _row(s)) conn.commit() return len(rows)