Files

48 lines
1.6 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
"""
from __future__ import annotations
import re
from html.parser import HTMLParser
# Elements whose whole subtree is dropped: _Stripper keeps no text while one of these is open.
_SKIP_TAGS = {"script", "style", "head"}
# Tag-name prefixes dropped the same way (matched with str.startswith);
# "ix:hidden" is the inline-XBRL hidden fact dump.
_SKIP_PREFIXES = ("ix:hidden",)
# Tags whose start tag and end tag each add a newline to the extracted text.
_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}
class _Stripper(HTMLParser):
    """Collect the visible text of an HTML document as a list of fragments.

    Text nested inside an element named in ``_SKIP_TAGS`` (or whose name
    starts with one of ``_SKIP_PREFIXES``) is discarded. Every start or end
    tag named in ``_BLOCK_TAGS`` contributes a newline. Fragments accumulate
    in ``_parts``; joining and whitespace normalisation are left to the caller.
    """

    def __init__(self) -> None:
        # convert_charrefs=True makes the parser hand over entities such as
        # &amp; / &nbsp; already decoded to their characters.
        super().__init__(convert_charrefs=True)
        # Number of currently open skipped elements; text is kept only at 0.
        self._skip_depth = 0
        self._parts: list[str] = []

    @staticmethod
    def _is_skipped(tag: str) -> bool:
        """Return True when *tag* opens or closes a subtree to discard."""
        return tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES)

    def handle_starttag(self, tag: str, attrs) -> None:
        """Enter a skipped subtree, or emit a line break for a block tag."""
        if self._is_skipped(tag):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Leave a skipped subtree, or emit a line break for a block tag."""
        if self._is_skipped(tag):
            # Clamp at zero so a stray closing tag cannot drive the depth negative.
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        """Record *data* unless it lies inside a skipped subtree.

        A whitespace-only run is reduced to a single space, and only when the
        previous fragment does not already end in whitespace. Dropping such
        runs outright would fuse the words of adjacent inline elements
        (``<b>Net</b> <i>income</i>`` -> ``Netincome``).
        """
        if self._skip_depth:
            return
        if data.strip():
            self._parts.append(data)
        elif data and self._parts and not self._parts[-1][-1:].isspace():
            self._parts.append(" ")
def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Convert filing HTML to whitespace-normalised plain text.

    Args:
        html: The HTML markup to convert.
        max_chars: Upper bound on the length of the returned string; the
            cleaned text is cut to its first ``max_chars`` characters.

    Returns:
        The text collected by ``_Stripper`` with runs of spaces, tabs and
        no-break spaces collapsed to one space, runs of blank lines collapsed
        to a single blank line, every line stripped, and the whole result
        stripped and truncated.
    """
    parser = _Stripper()
    parser.feed(html)
    # feed() alone can leave trailing input buffered (e.g. text ending in an
    # unterminated "&" such as "R&D"); close() forces it to be processed.
    parser.close()
    text = "".join(parser._parts)
    # NOTE(review): the original character class held a literal invisible
    # character, assumed to be NBSP (U+00A0) - confirm. It is written as an
    # escape here so it stays visible and survives editors and copy/paste.
    text = re.sub(r"[ \t\xa0]+", " ", text)
    # Three or more line breaks (with optional whitespace between) -> one blank line.
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    return text.strip()[:max_chars]