Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

This commit is contained in:
Keysat
2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
+47
View File
@@ -0,0 +1,47 @@
"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
"""
from __future__ import annotations
import re
from html.parser import HTMLParser
# Elements whose text content is discarded: every start tag in this set raises the
# stripper's skip depth and the matching end tag lowers it again.
_SKIP_TAGS = {"script", "style", "head"}
# Tag-name prefixes (tested with str.startswith) that are skipped the same way.
_SKIP_PREFIXES = ("ix:hidden",) # inline-XBRL hidden fact dump
# Elements that emit a "\n" at both their start tag and their end tag, so block
# boundaries show up as line breaks in the extracted text.
_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}
class _Stripper(HTMLParser):
    """Collect the visible text of an HTML document as a list of fragments.

    Text inside any element listed in ``_SKIP_TAGS``, or whose tag name starts
    with one of ``_SKIP_PREFIXES``, is discarded.  Every start and end tag in
    ``_BLOCK_TAGS`` contributes a ``"\\n"`` so block boundaries become line
    breaks.  Fragments accumulate in ``_parts`` in document order; joining and
    whitespace normalisation are left to the caller.
    """

    def __init__(self) -> None:
        # convert_charrefs=True makes the parser hand over character references
        # (&amp;, &#160;, ...) already decoded as part of the text data.
        super().__init__(convert_charrefs=True)
        # Number of currently open skipped elements; text is kept only at 0.
        self._skip_depth = 0
        # Extracted text fragments, in document order.
        self._parts: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Enter a skipped element, or emit a line break for a block element."""
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Leave a skipped element, or emit a line break for a block element."""
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            # Clamp at zero so a stray closing tag cannot drive the depth negative.
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        """Record a text node unless it lies inside a skipped element.

        Whitespace-only nodes are kept as a single space: they are the only
        separator between adjacent inline elements such as
        ``<b>Hello</b> <i>World</i>``, and dropping them glues the words
        together.  The space is omitted when nothing has been collected yet or
        when the previous fragment already ends in whitespace, because it would
        add no information there.
        """
        if self._skip_depth or not data:
            return
        if data.strip():
            self._parts.append(data)
        elif self._parts and not self._parts[-1][-1].isspace():
            self._parts.append(" ")
def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Convert filing HTML to whitespace-normalised plain text.

    The markup is run through ``_Stripper`` and the collected fragments are
    joined, after which horizontal whitespace is collapsed, runs of blank lines
    are reduced to a single blank line, and every line is trimmed.

    Args:
        html: The HTML document as text.
        max_chars: Upper bound on the length of the returned string; the
            normalised text is cut to this many characters.

    Returns:
        The extracted text, stripped of leading and trailing whitespace and
        truncated to ``max_chars`` characters.
    """
    parser = _Stripper()
    parser.feed(html)
    # feed() keeps incomplete trailing input buffered (for example an
    # unterminated tag or character reference at the very end of the document);
    # close() forces it to be processed so the tail of the text is not lost.
    parser.close()

    text = "".join(parser._parts)
    # Collapse runs of spaces, tabs and non-breaking spaces to one space.  The
    # NBSP is written as an escape so it stays visible in the source.
    text = re.sub(r"[ \t\xa0]+", " ", text)
    # Three or more line breaks (ignoring whitespace between them) become
    # exactly one blank line.
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    return text.strip()[:max_chars]