smaug/ingestion/form4_parser.py
Dominik Roth b5268f063e feat(ingestion): bulk historical ingest, form4 tx_code, parser fixes
- sec_bulk_ingest.py: new module — downloads quarterly form.idx from SEC EDGAR,
  filters Form 4/4A, fetches each filing's SGML/XML, parses and stores.
  Adaptive token-bucket rate limiter (backs off on 429/5xx, ramps on success).
  Uses filter_new_accessions for fast quarter-level dedup before any HTTP.
  Marks derivative-only filings as seen so they're skipped on resume.
- form4_parser: extract tx_code (transactionCode) from each transaction row;
  fix role extraction (Director/10%owner/Officer fallback); fix _text() to
  handle <value> sub-elements; fix footnote text extraction
- edgar_poller: filter feed entries to Form 4/4A only; skip XSLT stylesheet URLs
  when resolving XML filing links

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 17:48:51 +02:00

116 lines
3.3 KiB
Python

import re
from lxml import etree
from typing import Optional
# Regular-expression fragments that, when found in lower-cased footnote text,
# are taken as a reference to a Rule 10b5-1 trading plan.
_10B51_PATTERNS = [
    r"10b5-1",
    r"rule 10b5",
    r"adopted a plan",
    r"10b5\(1\)",
]


def _is_10b51(text: str) -> bool:
    """Return True when *text* matches any entry of ``_10B51_PATTERNS``.

    The text is lower-cased before matching, so the check is effectively
    case-insensitive for the (lower-case) patterns above.
    """
    haystack = text.lower()
    for pattern in _10B51_PATTERNS:
        if re.search(pattern, haystack) is not None:
            return True
    return False
def _text(el, tag: str) -> Optional[str]:
    """Return the stripped text of the first ``tag`` descendant of *el*.

    The matched element's own text is preferred; when that is missing or
    blank, the text of its direct ``<value>`` child is used instead.
    Returns ``None`` when no such descendant exists or when neither
    candidate holds non-blank text.
    """
    target = el.find(f".//{tag}")
    if target is None:
        return None

    # Candidates in priority order: the element itself, then its <value> child.
    for candidate in (target, target.find("value")):
        if candidate is None:
            continue
        stripped = (candidate.text or "").strip()
        if stripped:
            return stripped
    return None
def _float(el, tag: str) -> Optional[float]:
    """Parse the text that ``_text`` finds for ``tag`` under *el* as a float.

    Commas are removed before conversion so values such as ``"1,234.5"``
    parse. Returns ``None`` when no text is found or when the text is not
    a valid float literal.
    """
    raw = _text(el, tag)
    if raw is None:
        return None

    cleaned = raw.replace(",", "")
    try:
        number = float(cleaned)
    except ValueError:
        return None
    return number
def parse_form4(xml_bytes: bytes, accession_number: str, filed_date: str) -> list[dict]:
    """Parse a Form 4 XML document into one dict per non-derivative transaction.

    Only ``nonDerivativeTransaction`` elements are read; a transaction
    without a ``transactionAcquiredDisposedCode`` is skipped.

    Args:
        xml_bytes: Raw XML of the filing.
        accession_number: Copied verbatim into every returned row.
        filed_date: Copied into every returned row and used as the
            transaction date when the transaction carries none.

    Returns:
        A list of dicts with the keys ``accession_number``, ``ticker``,
        ``cik``, ``insider_name``, ``role``, ``transaction_date``,
        ``filed_date``, ``shares``, ``price``, ``total_value``, ``flag``,
        ``tx_code``, ``is_10b51`` and ``post_tx_shares``. The list is empty
        when the XML cannot be parsed or no transaction qualifies.
    """
    # The document comes from outside this process, so entity expansion and
    # network access are disabled while parsing.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    try:
        root = etree.fromstring(xml_bytes, parser)
    except etree.XMLSyntaxError:
        return []

    def _flag_set(tag: str) -> bool:
        # XML Schema booleans may be spelled "1" or "true"; accept both.
        return (_text(root, tag) or "").lower() in ("1", "true")

    ticker = _text(root, "issuerTradingSymbol") or ""
    cik = _text(root, "issuerCik") or ""
    insider_name = _text(root, "rptOwnerName") or ""

    # An explicit officer title wins; otherwise fall back to the first
    # relationship flag that is set, in this fixed order.
    role = _text(root, "officerTitle") or ""
    if not role:
        if _flag_set("isDirector"):
            role = "Director"
        elif _flag_set("isTenPercentOwner"):
            role = "10% owner"
        elif _flag_set("isOfficer"):
            role = "Officer"

    footnote_nodes = root.findall(".//footnote")
    footnotes_text = " ".join((node.text or "") for node in footnote_nodes)
    # A 10b5-1 mention in any footnote flags every transaction in the filing.
    global_10b51 = _is_10b51(footnotes_text)

    # Index footnotes by id once instead of running an XPath query per
    # reference. setdefault keeps the first footnote for a duplicated id,
    # which is the one a document-order search would return.
    footnote_by_id: dict[str, str] = {}
    for node in footnote_nodes:
        footnote_id = node.get("id")
        if footnote_id is not None:
            footnote_by_id.setdefault(footnote_id, node.text or "")

    results = []
    for tx in root.findall(".//nonDerivativeTransaction"):
        flag = _text(tx, "transactionAcquiredDisposedCode")
        if not flag:
            continue

        tx_code = _text(tx, "transactionCode") or ""
        shares = _float(tx, "transactionShares")
        price = _float(tx, "transactionPricePerShare")
        total_value = _float(tx, "transactionTotalValue")
        # Derive the total only when it is absent and both factors are known.
        if total_value is None and shares is not None and price is not None:
            total_value = shares * price
        post_tx_shares = _float(tx, "sharesOwnedFollowingTransaction")
        tx_date = _text(tx, "transactionDate") or filed_date

        # Text of the footnotes this transaction references; unknown ids
        # contribute an empty string.
        tx_footnote_text = " ".join(
            footnote_by_id.get(ref.get("id", ""), "")
            for ref in tx.findall(".//footnoteId")
        )
        is_10b51 = int(global_10b51 or _is_10b51(tx_footnote_text))

        results.append(
            {
                "accession_number": accession_number,
                "ticker": ticker.upper(),
                "cik": cik,
                "insider_name": insider_name,
                "role": role,
                "transaction_date": tx_date,
                "filed_date": filed_date,
                "shares": shares,
                "price": price,
                "total_value": total_value,
                "flag": flag.upper(),
                "tx_code": tx_code.upper(),
                "is_10b51": is_10b51,
                "post_tx_shares": post_tx_shares,
            }
        )
    return results