Fix transcript chunker context overflow; full-coverage extraction defaults

chunk_text split only on "\n\n", but ASR transcripts contain no blank lines (speaker turns are joined by a single "\n"), so whole 2-3h episodes (~250K chars) went to the extractor in one call and were rejected with a 400 on context overflow. chunk_text now falls through paragraph -> line -> sentence -> word -> hard char-slice so that no chunk exceeds the cap regardless of punctuation, and raises ValueError when max_chars < 1. Default extraction to recall-first full coverage (chunk_chars 18K -> 12K, max_chunks_per_doc 4 -> 999) and expose both as run-extract --chunk-chars / --max-chunks.
2026-06-15 22:28:12 -05:00
parent cabb8a3d6c
commit 5deffddb17
4 changed files with 50 additions and 16 deletions
@@ -30,25 +30,51 @@ def register_seed_topics(conn: sqlite3.Connection) -> None:
    conn.commit()


+# Coarse→fine split boundaries. Transcripts arrive as `Speaker: turn` lines joined by a SINGLE
+# newline (ASR output has no blank-line paragraphs), filings as paragraph text — so splitting on
+# "\n\n" alone never fires on a transcript and the whole episode would go in one call. The final
+# "" means "split per character": a last-resort hard slice that fits the cap for any punctuation.
+_SEPARATORS = ["\n\n", "\n", ". ", " ", ""]
+
+
 def chunk_text(text: str, max_chars: int) -> list[str]:
-    """Split on paragraph boundaries into windows that fit the model context alongside the prompt."""
+    """Pack text into windows that each fit the model context alongside the prompt.
+
+    Falls through paragraph → line → sentence → word → hard char-slice, so NO chunk ever exceeds
+    max_chars however the source is punctuated, while keeping speaker turns intact when they fit.
+    """
+    if max_chars < 1:  # else _pack recurses past the last separator → IndexError
+        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    text = text.strip()
    if not text:
        return []
+    return _pack(text, max_chars, _SEPARATORS)
+
+
+def _pack(text: str, max_chars: int, seps: list[str]) -> list[str]:
+    """Recursively pack `text` on the coarsest separator in `seps` that keeps chunks within
+    max_chars, descending to a finer one only for a part that is itself still too big."""
    if len(text) <= max_chars:
        return [text]
-    chunks: list[str] = []
-    cur: list[str] = []
-    size = 0
-    for para in text.split("\n\n"):
-        if size + len(para) > max_chars and cur:
-            chunks.append("\n\n".join(cur))
-            cur, size = [], 0
-        cur.append(para)
-        size += len(para) + 2
+    sep, rest = seps[0], seps[1:]
+    parts = list(text) if sep == "" else text.split(sep)
+    out: list[str] = []
+    cur = ""
+    for p in parts:
+        candidate = p if not cur else cur + sep + p
+        if len(candidate) <= max_chars:
+            cur = candidate
+            continue
+        if cur:
+            out.append(cur)
+        if len(p) <= max_chars:
+            cur = p
+        else:  # a single part still too big → split it on the next-finer boundary
+            out.extend(_pack(p, max_chars, rest))
+            cur = ""
    if cur:
-        chunks.append("\n\n".join(cur))
-    return chunks
+        out.append(cur)
+    return out


 def _parse_claims(content: str) -> list[dict]:
@@ -28,8 +28,8 @@ def _document_text(doc, *, user_agent: str) -> str:
    raise ValueError(f"no text source for {doc['doc_id']} (kind={doc['kind']}, url={doc['url']})")


-def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 4,
-                chunk_chars: int = 18_000, lease_seconds: int = 900,
+def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 999,
+                chunk_chars: int = 12_000, lease_seconds: int = 900,
                worker_id: str = "extract-1") -> dict:
    from .backends import from_config as backend_from_config
    backend = backend_from_config(cfg, sc)