"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).

Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
"""
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from html.parser import HTMLParser
|
||
|
||
# Elements whose entire content is discarded (never visible text).
_SKIP_TAGS = {
    "head",
    "script",
    "style",
}

# Tag-name prefixes that are discarded as well. Kept as a tuple because it is
# passed straight to ``str.startswith``.
_SKIP_PREFIXES = (
    "ix:hidden",  # inline-XBRL hidden fact dump
)

# Elements that start and end on their own line in the extracted text.
_BLOCK_TAGS = {
    "br",
    "div",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "li",
    "p",
    "table",
    "tr",
}
|
||
|
||
|
||
class _Stripper(HTMLParser):
    """Collect the visible text of an HTML document into ``self._parts``.

    Text inside elements listed in ``_SKIP_TAGS``, or whose tag name starts
    with one of ``_SKIP_PREFIXES``, is dropped. Every tag in ``_BLOCK_TAGS``
    contributes a newline when it opens and when it closes, so block
    boundaries survive as line breaks in the joined output.
    """

    def __init__(self) -> None:
        # convert_charrefs=True: character references reach handle_data
        # already decoded, so no separate entity handlers are needed.
        super().__init__(convert_charrefs=True)
        # Number of skipped elements currently open; text is kept only at 0.
        self._skip_depth = 0
        # Output fragments in document order; the caller joins them.
        self._parts: list[str] = []

    @staticmethod
    def _is_skipped(tag: str) -> bool:
        """Return True when *tag* opens or closes a discarded element."""
        return tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES)

    def handle_starttag(self, tag: str, attrs) -> None:
        """Enter a skipped element, or emit a line break for a block tag."""
        if self._is_skipped(tag):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Leave a skipped element, or emit a line break for a block tag."""
        if self._is_skipped(tag):
            # Clamp at zero so a stray closing tag cannot drive the counter
            # negative and leave the parser stuck in skip mode.
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        """Record a text node unless it lies inside a skipped element."""
        if self._skip_depth:
            return
        if data.strip():
            self._parts.append(data)
        elif data:
            # Whitespace-only node: keep one space so inline neighbours such
            # as ``<b>Net</b> <i>income</i>`` do not fuse into "Netincome".
            # A plain space (not the original run) avoids importing the
            # source's formatting newlines into the output.
            self._parts.append(" ")
|
||
|
||
|
||
def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Return the visible text of *html*, whitespace-normalized and truncated.

    The markup is run through ``_Stripper``; the collected fragments are then
    joined and cleaned up:

    * runs of spaces, tabs and non-breaking spaces become a single space;
    * three or more line breaks (ignoring whitespace between them) become one
      blank line;
    * every line, and the text as a whole, is stripped.

    Args:
        html: The HTML document as a string.
        max_chars: Maximum length of the returned text; anything beyond it is
            cut off after normalization.

    Returns:
        The extracted text, at most ``max_chars`` characters long.
    """
    p = _Stripper()
    p.feed(html)
    # feed() holds back trailing input that could be an unfinished construct
    # (for example text ending in "R&D" might be a split character reference).
    # close() forces it through so the tail of the document is not lost.
    p.close()
    text = "".join(p._parts)
    # \xa0 is spelled out so the non-breaking space is visible in the pattern.
    text = re.sub(r"[ \t\xa0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.strip()
    return text[:max_chars]
|