Fix transcript chunker context overflow; full-coverage extraction defaults
chunk_text split only on "\n\n", but ASR transcripts contain no blank lines (speaker turns are joined by a single "\n"), so whole 2-3h episodes (~250K chars) were sent to the extractor in a single call and failed with a 400 error on context overflow. The chunker now falls through paragraph -> line -> sentence -> word -> hard char-slice, so no chunk exceeds the cap regardless of punctuation, and it guards against max_chars < 1. Extraction now defaults to recall-first full coverage (chunk_chars 12K, max_chunks 999), and both settings are exposed as run-extract --chunk-chars / --max-chunks.
This commit is contained in:
@@ -254,7 +254,8 @@ def cmd_run_extract(args: argparse.Namespace) -> int:
|
||||
conn = db.connect(cfg.db_path)
|
||||
db.init_db(conn)
|
||||
sc = from_config(cfg)
|
||||
result = run_extract(conn, sc, cfg, limit=args.limit, max_chunks_per_doc=args.max_chunks)
|
||||
result = run_extract(conn, sc, cfg, limit=args.limit, max_chunks_per_doc=args.max_chunks,
|
||||
chunk_chars=args.chunk_chars)
|
||||
print(f"extraction: {result['jobs_processed']} jobs, {result['claims_written']} claims written")
|
||||
return 0
|
||||
|
||||
@@ -581,7 +582,10 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
|
||||
re = sub.add_parser("run-extract", help="Drain 'extract' jobs → claims via the local LLM (§4.2)")
|
||||
re.add_argument("--limit", type=int, default=5, help="max jobs to process this run")
|
||||
re.add_argument("--max-chunks", type=int, default=4, help="max chunks per document")
|
||||
re.add_argument("--max-chunks", type=int, default=999,
|
||||
help="max chunks per document (default: full coverage (999))")
|
||||
re.add_argument("--chunk-chars", type=int, default=12_000,
|
||||
help="chars per extraction chunk; smaller = better recall, more LLM calls")
|
||||
re.set_defaults(func=cmd_run_extract)
|
||||
|
||||
sub.add_parser("queue-status", help="Backfill queue counts by type/state").set_defaults(func=cmd_queue_status)
|
||||
|
||||
Reference in New Issue
Block a user