5deffddb17
chunk_text split only on "\n\n", but ASR transcripts have none (speaker turns are joined by a single "\n"), so whole 2-3h episodes (~250K chars) went to the extractor in one call and 400'd on context overflow. Fall through paragraph -> line -> sentence -> word -> hard char-slice so no chunk exceeds the cap regardless of punctuation; guard max_chars < 1. Default extraction to recall-first full coverage (chunk_chars 12K, max_chunks 999) and expose both as run-extract --chunk-chars / --max-chunks.
144 lines
5.7 KiB
Python
144 lines
5.7 KiB
Python
"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import sqlite3
|
|
from typing import Any
|
|
|
|
from .prompt import SEED_TOPICS, build_messages
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Closed vocabularies for the enumerated claim fields, keyed by field name. `_enum` looks a
# claim's value up here and substitutes a caller-supplied default when it is not a member.
_ENUMS = {
    "claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
    "time_horizon": {"near", "medium", "long", "unspecified"},
    "confidence": {"low", "med", "high"},
    "thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
    "salience": {"central", "secondary", "aside"},
}
|
|
|
|
|
|
def register_seed_topics(conn: sqlite3.Connection) -> None:
    """Upsert every seed topic as 'controlled' — the fixed half of the hybrid vocabulary (§4.2).

    A topic that already exists in `topics` has its status switched to 'controlled'. All rows
    are written first and committed once at the end.
    """
    upsert_sql = (
        "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
        "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'"
    )
    # One single-element parameter tuple per seed topic.
    conn.executemany(upsert_sql, ((topic,) for topic in SEED_TOPICS))
    conn.commit()
|
|
|
|
|
|
# Coarse→fine split boundaries, tried in this order: paragraph ("\n\n"), line ("\n"),
# sentence (". "), word (" "), single character (""). Transcripts arrive as `Speaker: turn`
# lines joined by a SINGLE newline (ASR output has no blank-line paragraphs), filings as
# paragraph text — so splitting on "\n\n" alone never fires on a transcript and the whole
# episode would go in one call. "" is the per-character hard cap that guarantees termination
# regardless of punctuation.
_SEPARATORS = ["\n\n", "\n", ". ", " ", ""]
|
|
|
|
|
|
def chunk_text(text: str, max_chars: int) -> list[str]:
    """Split `text` into pieces small enough to sit next to the prompt in the model context.

    Boundaries are tried coarsest first (paragraph, line, sentence, word) with a plain
    character slice as the last resort, so every returned piece is at most `max_chars` long
    however the source is punctuated, and a speaker turn stays whole whenever it fits.

    Surrounding whitespace is stripped first; text that is empty afterwards yields `[]`.

    Raises:
        ValueError: if `max_chars` is below 1.
    """
    if max_chars < 1:  # a cap below one character can never be met, even by a single char
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    stripped = text.strip()
    return _pack(stripped, max_chars, _SEPARATORS) if stripped else []
|
|
|
|
|
|
def _pack(text: str, max_chars: int, seps: list[str]) -> list[str]:
    """Greedily pack `text` into chunks of at most `max_chars` characters.

    `text` is split on the first (coarsest) separator in `seps`; consecutive parts are
    re-joined with that separator for as long as the result still fits. A single part that is
    too big on its own is packed recursively with the remaining, finer separators. The empty
    separator "" — or an exhausted separator list — means a plain fixed-width slice, which is
    what guarantees that no chunk can exceed the cap.

    The separator that falls between two adjacent chunks is dropped, and empty parts at the
    start of a chunk are skipped.

    Args:
        text: the text to split.
        max_chars: upper bound on the length of every returned chunk.
        seps: separators ordered coarse → fine.

    Returns:
        The chunks in source order; `[text]` when `text` already fits.

    Raises:
        ValueError: if `text` needs splitting but `max_chars` is below 1.
    """
    if len(text) <= max_chars:
        return [text]
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if not seps or seps[0] == "":
        # Hard cap: slice directly rather than re-assembling the text one character at a time.
        return [text[i:i + max_chars] for i in range(0, len(text), max_chars)]

    sep, finer = seps[0], seps[1:]
    sep_len = len(sep)
    out: list[str] = []
    # Parts of the chunk being built, plus the length they will have once joined. Tracking
    # the length avoids rebuilding the whole chunk string for every candidate part.
    held: list[str] = []
    held_len = 0
    for part in text.split(sep):
        part_len = len(part)
        joined_len = held_len + sep_len + part_len if held_len else part_len
        if joined_len <= max_chars:
            if held_len:
                held.append(part)
            else:
                held = [part]  # an empty chunk-so-far (only empty parts) is discarded
            held_len = joined_len
            continue
        if held_len:
            out.append(sep.join(held))
        if part_len <= max_chars:
            held, held_len = [part], part_len
        else:  # a single part still too big → split it on the next-finer boundary
            out.extend(_pack(part, max_chars, finer))
            held, held_len = [], 0
    if held_len:
        out.append(sep.join(held))
    return out
|
|
|
|
|
|
def _parse_claims(content: str) -> list[dict]:
    """Best-effort parse of a model reply into a list of claim dicts.

    The reply is decoded as JSON; if that fails, the span from the first "{" to the last "}"
    is tried instead, which recovers an object wrapped in prose or a code fence. Only a JSON
    object whose "claims" value is a list is accepted, and only list items that are dicts
    with a truthy "proposition" are kept.

    Returns:
        The surviving claim dicts, or `[]` for anything that cannot be parsed or does not
        have the expected shape. This function never raises on malformed replies.
    """
    try:
        obj = json.loads(content)
    except (ValueError, TypeError, RecursionError):
        if not isinstance(content, str):  # e.g. None: nothing to salvage
            return []
        start, end = content.find("{"), content.rfind("}")
        if start < 0 or end < start:
            return []
        try:
            obj = json.loads(content[start:end + 1])
        except (ValueError, RecursionError):
            return []
    if not isinstance(obj, dict):
        return []
    claims = obj.get("claims")
    if not isinstance(claims, list):  # null / number / object: not iterable as a claim list
        return []
    return [c for c in claims if isinstance(c, dict) and c.get("proposition")]
|
|
|
|
|
|
def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None,
                             date: str | None, kind: str) -> list[dict]:
    """Run one extraction call over `text` and return the claim dicts parsed from the reply.

    `backend` is duck-typed: any object with .complete_json(messages, max_tokens) -> str
    (see extract.backends: LocalQwenBackend | GeminiBackend). The source metadata is handed
    to build_messages unchanged, and the completion is requested with max_tokens=4000.
    """
    prompt_messages = build_messages(
        text,
        source_name=source_name,
        source_cluster=source_cluster,
        date=date,
        kind=kind,
    )
    reply = backend.complete_json(prompt_messages, max_tokens=4000)
    return _parse_claims(reply)
|
|
|
|
|
|
def _enum(c: dict, field: str, default: str) -> str:
    """Return `c[field]` when it is an allowed value for `field`, otherwise `default`.

    Args:
        c: a claim dict.
        field: name of an enumerated field; must be a key of `_ENUMS`.
        default: value to use when the field is missing or not an allowed value.

    Raises:
        KeyError: if `field` is not a key of `_ENUMS`.
    """
    allowed = _ENUMS[field]
    v = c.get(field)
    # Type check first: a claim dict decoded from JSON can carry a list or dict here, and
    # testing an unhashable value for set membership raises TypeError. Every allowed value is
    # a string, so this does not change the result for any hashable input.
    return v if isinstance(v, str) and v in allowed else default
|
|
|
|
|
|
def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
                   claims: list[dict], chunk_idx: int) -> int:
    """Store the claims extracted from one chunk of `doc`, then commit.

    Each claim is written under the id "<doc_id>:<chunk_idx>:<position in claims>". Both the
    topic and the claim are written with INSERT OR IGNORE, so a row that conflicts with an
    existing one is left untouched. Enumerated fields go through `_enum` and fall back to a
    default, the proposition is stringified and cut to 1000 characters, rel_polarity is
    always written as "none", and the claimant falls back to the source's name when the
    claim names none.

    Returns:
        The number of claims processed. Every claim in `claims` is counted, including any
        whose insert was ignored.
    """
    topic_sql = (
        "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) "
        "VALUES (?, 'emergent', ?)"
    )
    claim_sql = """INSERT OR IGNORE INTO claims
               (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant,
                source_cluster, date, claim_type, time_horizon, confidence, rel_polarity,
                engages_consensus, counters_position, thesis_seam, salience)
               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    source_cluster = source["source_cluster"] if source else None
    processed = 0
    for position, claim in enumerate(claims):
        thesis_seam = _enum(claim, "thesis_seam", "none")
        topic = claim.get("topic_canonical") or None
        if topic:
            # The topic row has to exist BEFORE the claim (claims.topic_canonical is a
            # FK → topics); emergent topics are registered on first sight.
            conn.execute(topic_sql, (topic, thesis_seam))
        doc_id = doc["doc_id"]
        row = (
            f"{doc_id}:{chunk_idx}:{position}",
            doc_id,
            doc["source_id"],
            str(claim["proposition"])[:1000],
            topic,
            claim.get("topic_raw"),
            claim.get("claimant") or (source["name"] if source else None),
            source_cluster,
            doc["date"],
            _enum(claim, "claim_type", "descriptive"),
            _enum(claim, "time_horizon", "unspecified"),
            _enum(claim, "confidence", "med"),
            "none",  # rel_polarity is not derived here
            1 if claim.get("engages_consensus") else 0,
            claim.get("counters_position"),
            thesis_seam,
            _enum(claim, "salience", "secondary"),
        )
        conn.execute(claim_sql, row)
        processed += 1
    conn.commit()
    return processed
|