# ten31-signal-engine/signal_engine/extract/prompt.py

"""The §4.2 claim-extraction prompt. Prompt engineering is ours (§13.3); the schema is finalized.

Discipline encoded here (the whole point of the system, §2/§4.2):
  - Extract at the level of the PROPOSITION; emit ZERO when there is no substantive claim.
  - Separate topic from stance: capture stance-vs-consensus explicitly, never as a bull/bear label.
  - thesis_seam is a TAG, not a filter — off-thesis and anti-thesis claims are still extracted.
"""
from __future__ import annotations

# Seeded controlled vocabulary for the hybrid topic scheme (§4.2). The extraction model is
# asked to reuse one of these labels when it fits and to propose a new concise snake_case
# label otherwise. Order is preserved when the list is rendered into the prompt.
SEED_TOPICS = [
    # --- energy <-> compute ---
    "ai_compute_demand",
    "ai_power_constraint",
    "datacenter_buildout",
    "grid_interconnect",
    "transformers_equipment",
    "nuclear_power",
    "natural_gas_power",
    "uranium_supply",
    "cooling_infrastructure",
    "miner_flexible_load",
    "mining_ai_pivot",
    # --- debasement <-> bitcoin ---
    "bitcoin_reserve_asset",
    "bitcoin_collateral_credit",
    "bitcoin_treasury_strategy",
    "btc_custody_regulation",
    "sovereign_bitcoin_adoption",
    # --- ai <-> data ownership ---
    "ai_data_ownership",
    "confidential_inference",
    "ai_commoditization",
    # --- macro ---
    "fed_policy",
    "fiscal_debasement",
    "stablecoins_cbdc",
]

# System-role prompt text used by build_messages(). This literal is runtime behavior: every
# character is sent to the model, so edit it deliberately.
# A trailing backslash inside the literal is a line continuation — it joins the next physical
# line WITHOUT inserting a newline, so long sentences are wrapped here for readability only.
# The prompt defines what a claim unit is, insists that an empty {"claims": []} is a valid
# answer, lists the per-claim fields with their allowed values, and demands JSON-only output.
_SYSTEM = """You are the claim-extraction component of an investment signal engine. You read a passage \
(an SEC filing excerpt or a podcast/earnings-call transcript) and extract structured CLAIM UNITS.

A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \
an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \
enough to later be checked against the world.

CRITICAL DISCIPLINE — be willing to extract NOTHING:
- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \
generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims.
- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \
substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output.
- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \
single rich passage may yield several; a long dull one yields none.

For EACH claim unit, output these fields:
- "proposition": one normalized sentence (subject-assertion-object), self-contained.
- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \
when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy).
- "topic_raw": the topic as actually phrased in the passage.
- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear.
- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \
insight; descriptive/reactive = news echo — extract those only if clearly salient.)
- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially).
- "confidence": the claimant's apparent conviction — one of low | med | high.
- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view.
- "counters_position": the mainstream position it argues against, or null.
- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \
for relevance only — tag off-thesis claims "none" and STILL extract them.
- "salience": central | secondary | aside (how central the claim is to the passage).

Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown."""


def build_messages(text: str, *, source_name: str, source_cluster: str | None,
                   date: str | None, kind: str) -> list[dict[str, str]]:
    """Assemble the system/user message pair for one claim-extraction request.

    Args:
        text: The passage to extract claims from; placed verbatim after ``PASSAGE:``.
        source_name: Name of the source; a falsy value is rendered as ``unknown``.
        source_cluster: Cluster label for the source; a falsy value is rendered as ``n/a``.
        date: Date label for the source; a falsy value is rendered as ``n/a``.
        kind: Source type label, rendered as-is after ``type:``.

    Returns:
        A two-element list: a ``system`` message carrying ``_SYSTEM`` and a ``user``
        message containing the source metadata line, the comma-separated seed topics,
        a blank line, and the passage.
    """
    # Fall back to placeholder labels for missing metadata.
    name_label = source_name if source_name else "unknown"
    cluster_label = source_cluster if source_cluster else "n/a"
    date_label = date if date else "n/a"

    source_line = (
        f"Source: {name_label} (cluster: {cluster_label}, type: {kind}, date: {date_label})."
    )
    topics_line = f"Seed topics to reuse when they fit: {', '.join(SEED_TOPICS)}."

    # The empty element yields the blank line separating the header from the passage.
    user_content = "\n".join((source_line, topics_line, "", f"PASSAGE:\n{text}"))

    system_message = {"role": "system", "content": _SYSTEM}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]