Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
|
||||
|
||||
Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
|
||||
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
|
||||
# Elements whose text content is discarded entirely by the stripper.
_SKIP_TAGS = {
    "script",
    "style",
    "head",
}

# Tag-name prefixes handled like _SKIP_TAGS: the inline-XBRL hidden fact dump.
_SKIP_PREFIXES = ("ix:hidden",)

# Elements that produce a line break when they open and when they close.
_BLOCK_TAGS = {
    "p", "div", "br", "tr", "li", "table",
    "h1", "h2", "h3", "h4", "h5", "h6",
}
|
||||
|
||||
|
||||
class _Stripper(HTMLParser):
    """Collect the visible text of an HTML document as a list of fragments.

    Text inside any ``_SKIP_TAGS`` element, or any element whose tag name starts
    with one of ``_SKIP_PREFIXES``, is discarded. Each opening and closing
    ``_BLOCK_TAGS`` tag emits a newline so block structure survives as line
    breaks. Fragments accumulate in ``self._parts``; joining them and tidying
    whitespace is left to the caller.
    """

    def __init__(self) -> None:
        # convert_charrefs=True: character references reach handle_data()
        # already decoded, so no separate entity handlers are needed.
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0  # > 0 while inside at least one skipped element
        self._parts: list[str] = []  # text fragments and "\n" separators, in order

    def handle_starttag(self, tag: str, attrs) -> None:
        """Enter a skipped subtree, or emit a line break for a block element."""
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Leave a skipped subtree, or emit a line break for a block element."""
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            # Clamp at zero so a stray closing tag cannot drive the depth negative.
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        """Record a text node unless it lies inside a skipped subtree."""
        if self._skip_depth or not data:
            return
        # A whitespace-only node between inline elements is a word separator
        # (``<b>Risk</b> <b>Factors</b>``); dropping it would glue the words
        # together. Keep it as one space: redundant spaces are collapsed or
        # stripped by the caller's whitespace clean-up.
        self._parts.append(data if data.strip() else " ")
|
||||
|
||||
|
||||
def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Convert filing HTML into whitespace-normalised plain text.

    The markup is run through ``_Stripper`` and the collected fragments are
    joined. Runs of spaces, tabs and non-breaking spaces then become a single
    space, three or more consecutive line breaks collapse to one blank line,
    and every line is stripped.

    Args:
        html: The HTML source to convert.
        max_chars: Upper bound on the length of the returned string; the
            cleaned text is truncated to this many characters.

    Returns:
        The extracted text, at most ``max_chars`` characters long.
    """
    p = _Stripper()
    p.feed(html)
    # feed() can hold back trailing input while waiting for more data (for
    # example text ending in a bare "&", as in "AT&T"); close() forces the
    # parser to process whatever is still buffered.
    p.close()

    text = "".join(p._parts)
    # \xa0 is the non-breaking space (&nbsp; after charref conversion); it is
    # spelled as an escape so the character class stays visible in editors.
    text = re.sub(r"[ \t\xa0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.strip()
    return text[:max_chars]
|
||||
Reference in New Issue
Block a user