"""SEC filing HTML → plain text.

Stdlib only (boring, inspectable). Drops script/style/head and inline-XBRL
hidden blocks (10-Ks embed a huge section of numeric facts that would
otherwise swamp the extractor), and collapses whitespace.
"""

from __future__ import annotations

import re
from html.parser import HTMLParser

# Elements whose entire content is discarded.
_SKIP_TAGS = {"script", "style", "head"}
_SKIP_PREFIXES = ("ix:hidden",)  # inline-XBRL hidden fact dump
# Elements that introduce a line break in the extracted text.
_BLOCK_TAGS = {
    "p",
    "div",
    "br",
    "tr",
    "li",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "table",
}

# Runs of horizontal whitespace. The no-break space is spelled as an escape
# so it stays visible in the source: with convert_charrefs=True every
# ``&nbsp;`` reaches us as U+00A0.
_HSPACE_RE = re.compile(r"[ \t\u00a0]+")
# Three or more newlines (optionally separated by whitespace), i.e. more than
# one blank line in a row.
_BLANK_RUN_RE = re.compile(r"\n\s*\n\s*\n+")


class _Stripper(HTMLParser):
    """Collect the visible text of an HTML document.

    Text inside ``_SKIP_TAGS`` elements, or inside elements whose name starts
    with one of ``_SKIP_PREFIXES``, is discarded. Block-level tags
    (``_BLOCK_TAGS``) contribute a newline so paragraphs, rows and headings
    end up on separate lines.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0  # > 0 while inside at least one skipped element
        self._parts: list[str] = []

    @staticmethod
    def _is_skipped(tag: str) -> bool:
        """Return True if *tag* opens/closes a region whose text is dropped."""
        return tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES)

    def handle_starttag(self, tag: str, attrs) -> None:
        if self._is_skipped(tag):
            self._skip_depth += 1
        elif self._skip_depth == 0 and tag in _BLOCK_TAGS:
            # Block tags inside a skipped region are invisible too, so they
            # must not leak line breaks into the output.
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        if self._is_skipped(tag):
            # Clamp at zero so a stray closing tag cannot push the counter
            # negative and desynchronise later skip regions.
            self._skip_depth = max(0, self._skip_depth - 1)
        elif self._skip_depth == 0 and tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        # Whitespace-only nodes are kept on purpose: they are the only
        # separator between adjacent inline elements
        # (``<span>Total</span> <span>Revenue</span>``). html_to_text()
        # collapses the surplus afterwards.
        if self._skip_depth == 0 and data:
            self._parts.append(data)

    def get_text(self) -> str:
        """Return everything collected so far as one un-normalised string."""
        return "".join(self._parts)


def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Convert an HTML document to whitespace-normalised plain text.

    Args:
        html: The HTML markup to convert.
        max_chars: Upper bound on the length of the returned string; the
            cleaned text is cut to this many characters.

    Returns:
        The visible text with runs of spaces/tabs/no-break spaces collapsed
        to one space, consecutive blank lines collapsed to a single blank
        line, every line stripped, and the result truncated to *max_chars*.
    """
    parser = _Stripper()
    parser.feed(html)
    # close() flushes text the parser is still buffering. Without it, input
    # that ends in text containing an unterminated "&" (e.g. "AT&T") is
    # silently dropped.
    parser.close()

    text = _HSPACE_RE.sub(" ", parser.get_text())
    text = _BLANK_RUN_RE.sub("\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    return text.strip()[:max_chars]