"""The §4.2 claim-extraction prompt. Prompt engineering is ours (§13.3); the schema is finalized.

Discipline encoded here (the whole point of the system, §2/§4.2):
- Extract at the level of the PROPOSITION; emit ZERO when there is no substantive claim.
- Separate topic from stance: capture stance-vs-consensus explicitly, never as a bull/bear label.
- thesis_seam is a TAG, not a filter — off-thesis and anti-thesis claims are still extracted.
"""
|
|
from __future__ import annotations
|
|
|
|
# Hybrid topic vocabulary (§4.2): a small SEEDED controlled list. The model reuses one when it
# fits and proposes a concise snake_case topic otherwise; emergent topics are merged on a schedule.
#
# Order matters for prompt stability: build_messages() joins the entries in list order into the
# user message, so reordering changes the prompt text.
SEED_TOPICS: list[str] = [
    # energy <-> compute
    "ai_compute_demand", "ai_power_constraint", "datacenter_buildout", "grid_interconnect",
    "transformers_equipment", "nuclear_power", "natural_gas_power", "uranium_supply",
    "cooling_infrastructure", "miner_flexible_load", "mining_ai_pivot",
    # debasement <-> bitcoin
    "bitcoin_reserve_asset", "bitcoin_collateral_credit", "bitcoin_treasury_strategy",
    "btc_custody_regulation", "sovereign_bitcoin_adoption",
    # ai <-> data ownership
    "ai_data_ownership", "confidential_inference", "ai_commoditization",
    # macro
    "fed_policy", "fiscal_debasement", "stablecoins_cbdc",
]
|
|
|
|
# System prompt for the claim extractor. This text is sent to the model verbatim, so every
# character is behavior. A trailing backslash joins a source line to the next one inside the
# literal (no newline is emitted there); an empty source line is a real paragraph break.
_SYSTEM = """You are the claim-extraction component of an investment signal engine. You read a passage \
(an SEC filing excerpt or a podcast/earnings-call transcript) and extract structured CLAIM UNITS.

A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \
an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \
enough to later be checked against the world.

CRITICAL DISCIPLINE — be willing to extract NOTHING:
- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \
generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims.
- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \
substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output.
- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \
single rich passage may yield several; a long dull one yields none.

For EACH claim unit, output these fields:
- "proposition": one normalized sentence (subject-assertion-object), self-contained.
- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \
when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy).
- "topic_raw": the topic as actually phrased in the passage.
- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear.
- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \
insight; descriptive/reactive = news echo — extract those only if clearly salient.)
- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially).
- "confidence": the claimant's apparent conviction — one of low | med | high.
- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view.
- "counters_position": the mainstream position it argues against, or null.
- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \
for relevance only — tag off-thesis claims "none" and STILL extract them.
- "salience": central | secondary | aside (how central the claim is to the passage).

Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown."""
|
|
|
|
|
|
def build_messages(text: str, *, source_name: str, source_cluster: str | None,
                   date: str | None, kind: str) -> list[dict[str, str]]:
    """Build the system + user message pair for one claim-extraction request.

    The system message is the fixed ``_SYSTEM`` prompt. The user message carries a one-line
    provenance header, the seed-topic list, and then the passage itself.

    Args:
        text: The passage to extract claims from; inserted verbatim after ``PASSAGE:``.
        source_name: Name of the source; a falsy value is rendered as ``unknown``.
        source_cluster: Cluster label for the source; a falsy value is rendered as ``n/a``.
        date: Date string for the passage; a falsy value is rendered as ``n/a``.
        kind: Source type label; interpolated as-is (no fallback is applied).

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the system message
        first, then the user message.
    """
    # The model is asked to reuse these labels, so it sees the full seed list on every call.
    seed_listing = ", ".join(SEED_TOPICS)

    provenance = (
        f"Source: {source_name or 'unknown'} (cluster: {source_cluster or 'n/a'}, type: {kind}, "
        f"date: {date or 'n/a'}).\n"
    )
    user_content = (
        provenance
        + f"Seed topics to reuse when they fit: {seed_listing}.\n\n"
        + f"PASSAGE:\n{text}"
    )

    system_message = {"role": "system", "content": _SYSTEM}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]
|