# ten31-signal-engine/signal_engine/extract/html_text.py

"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).

Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
"""
from __future__ import annotations

import re
from html.parser import HTMLParser

# Elements whose text content _Stripper discards entirely.
_SKIP_TAGS = {"script", "style", "head"}
# Tag-name prefixes (matched with str.startswith) that are skipped the same way as _SKIP_TAGS.
_SKIP_PREFIXES = ("ix:hidden",)          # inline-XBRL hidden fact dump
# Elements for which _Stripper emits a newline so their text lands on separate lines.
_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}


class _Stripper(HTMLParser):
    """Accumulate the visible text of an HTML document as string fragments.

    Text inside a ``_SKIP_TAGS`` element, or inside an element whose name starts with one of
    ``_SKIP_PREFIXES``, is discarded. ``_BLOCK_TAGS`` elements contribute a newline, and table
    cells contribute a trailing space so neighbouring cells do not run together. Fragments
    are collected in ``_parts``; joining and whitespace normalisation are left to the caller.
    """

    # Cell boundaries separate values without breaking the line (the enclosing row does that).
    _CELL_TAGS = frozenset({"td", "th"})

    def __init__(self) -> None:
        # convert_charrefs=True: character references reach handle_data already decoded.
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0  # > 0 while inside at least one skipped element
        self._parts: list[str] = []

    @staticmethod
    def _is_skipped(tag: str) -> bool:
        """Return True when *tag* names an element whose text must be dropped."""
        return tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES)

    def handle_starttag(self, tag: str, attrs) -> None:
        """Enter a skipped subtree, or start a new line for a block element."""
        if self._is_skipped(tag):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Leave a skipped subtree, end a block's line, or separate a table cell."""
        if self._is_skipped(tag):
            # Clamp at zero so a stray closing tag cannot push the depth negative.
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")
        elif tag in self._CELL_TAGS:
            self._parts.append(" ")

    def handle_startendtag(self, tag: str, attrs) -> None:
        """Handle XHTML-style self-closing tags such as ``<br/>``.

        The base implementation calls handle_starttag and then handle_endtag, which would
        emit two newlines for ``<br/>`` where ``<br>`` emits one. Emit exactly one instead.
        """
        if self._is_skipped(tag):
            return  # opens and closes in one token: the skip depth is unchanged
        if tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        """Record text that is outside every skipped subtree."""
        if self._skip_depth or not data:
            return
        # A whitespace-only node between inline elements ("<b>a</b> <i>b</i>") is a word
        # separator; dropping it would glue the words together. Keep it as a single space.
        self._parts.append(data if data.strip() else " ")


def html_to_text(html: str, *, max_chars: int | None = 300_000) -> str:
    """Convert an HTML document to whitespace-normalised plain text.

    The markup is parsed with ``_Stripper``; the collected fragments are joined, runs of
    spaces/tabs/no-break spaces are collapsed to one space, three or more consecutive line
    breaks are collapsed to one blank line, and every line is stripped.

    Args:
        html: The HTML source.
        max_chars: Maximum length of the returned string. ``None`` disables truncation;
            negative values are treated as 0.

    Returns:
        The extracted text, truncated to at most ``max_chars`` characters.
    """
    parser = _Stripper()
    parser.feed(html)
    # feed() can hold back a trailing fragment (e.g. text ending in an unterminated "&")
    # while waiting for more input; close() forces that buffered data to be processed.
    parser.close()

    text = "".join(parser._parts)
    # \xa0 is written as an escape so the no-break space is visible in the pattern.
    text = re.sub(r"[ \t\xa0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.strip()

    if max_chars is None:
        return text
    # Clamp so a negative limit yields "" rather than slicing characters off the end.
    return text[: max(max_chars, 0)]