- sec_bulk_ingest.py: new module — downloads quarterly form.idx from SEC EDGAR, filters Form 4/4A, fetches each filing's SGML/XML, parses and stores. Adaptive token-bucket rate limiter (backs off on 429/5xx, ramps on success). Uses filter_new_accessions for fast quarter-level dedup before any HTTP. Marks derivative-only filings as seen so they're skipped on resume. - form4_parser: extract tx_code (transactionCode) from each transaction row; fix role extraction (Director/10%owner/Officer fallback); fix _text() to handle <value> sub-elements; fix footnote text extraction - edgar_poller: filter feed entries to Form 4/4A only; skip XSLT stylesheet URLs when resolving XML filing links Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
116 lines
3.3 KiB
Python
116 lines
3.3 KiB
Python
import re
|
|
from lxml import etree
|
|
from typing import Optional
|
|
|
|
|
|
# Lower-case regular expressions that `_is_10b51` searches for in text.
_10B51_PATTERNS = [
    r"10b5-1",
    r"rule 10b5",
    r"adopted a plan",
    r"10b5\(1\)",
]


def _is_10b51(text: str) -> bool:
    """Return True when *text* matches at least one entry of _10B51_PATTERNS.

    The text is lower-cased before searching and the patterns are written in
    lower case, so the match is effectively case-insensitive.
    """
    haystack = text.lower()
    for pattern in _10B51_PATTERNS:
        if re.search(pattern, haystack) is not None:
            return True
    return False
|
|
|
|
|
|
def _text(el, tag: str) -> Optional[str]:
    """Return the stripped text of the first descendant of *el* named *tag*.

    The element's own text is preferred. When that is missing or only
    whitespace, the text of its direct ``<value>`` child is used instead.

    Returns None when the element does not exist or when neither location
    holds non-blank text.
    """
    found = el.find(f".//{tag}")
    if found is None:
        return None

    own = (found.text or "").strip()
    if own:
        return own

    # Fall back to a nested <value> element (direct child only).
    wrapped = found.find("value")
    if wrapped is None:
        return None
    nested = (wrapped.text or "").strip()
    return nested if nested else None
|
|
|
|
|
|
def _float(el, tag: str) -> Optional[float]:
    """Read element *tag* under *el* via `_text` and convert it to a float.

    Commas are removed before conversion, so a value such as "1,234.5" is
    accepted. Returns None when the element has no text or when the text
    cannot be converted by ``float``.
    """
    raw = _text(el, tag)
    if raw is None:
        return None

    cleaned = raw.replace(",", "")
    try:
        number = float(cleaned)
    except ValueError:
        return None
    return number
|
|
|
|
|
|
def _is_set(root, tag: str) -> bool:
    """Return True when boolean element *tag* holds "1" or "true" (any case)."""
    value = _text(root, tag)
    return value is not None and value.lower() in ("1", "true")


def _owner_role(root) -> str:
    """Pick a role label: officer title first, then the relationship flags."""
    officer_title = _text(root, "officerTitle") or ""
    if officer_title:
        return officer_title
    # Checked in this order; the first flag that is set wins.
    for tag, label in (
        ("isDirector", "Director"),
        ("isTenPercentOwner", "10% owner"),
        ("isOfficer", "Officer"),
    ):
        if _is_set(root, tag):
            return label
    return ""


def parse_form4(xml_bytes: bytes, accession_number: str, filed_date: str) -> list[dict]:
    """Parse a Form 4 XML document into one dict per non-derivative transaction.

    Args:
        xml_bytes: Raw XML of the filing.
        accession_number: Copied unchanged into every result row.
        filed_date: Copied into every row; also used as the transaction date
            when a transaction carries no ``transactionDate``.

    Returns:
        A list of dicts with the keys ``accession_number``, ``ticker``,
        ``cik``, ``insider_name``, ``role``, ``transaction_date``,
        ``filed_date``, ``shares``, ``price``, ``total_value``, ``flag``,
        ``tx_code``, ``is_10b51`` and ``post_tx_shares``. Transactions
        without a ``transactionAcquiredDisposedCode`` are skipped. An empty
        list is returned when the XML cannot be parsed.
    """
    try:
        root = etree.fromstring(xml_bytes)
    except etree.XMLSyntaxError:
        return []

    ticker = (_text(root, "issuerTradingSymbol") or "").upper()
    cik = _text(root, "issuerCik") or ""
    insider_name = _text(root, "rptOwnerName") or ""
    role = _owner_role(root)

    # Index footnotes once instead of searching the tree for every reference.
    # setdefault keeps the first footnote in document order for a repeated id,
    # which is what a find() on the id would have returned.
    footnote_nodes = root.findall(".//footnote")
    footnote_by_id: dict[str, str] = {}
    for node in footnote_nodes:
        fid = node.get("id")
        if fid is not None:
            footnote_by_id.setdefault(fid, node.text or "")

    # A match anywhere in the filing's footnotes flags every transaction.
    global_10b51 = _is_10b51(" ".join(node.text or "" for node in footnote_nodes))

    results = []
    for tx in root.findall(".//nonDerivativeTransaction"):
        flag = _text(tx, "transactionAcquiredDisposedCode")
        if not flag:
            continue

        tx_code = _text(tx, "transactionCode") or ""

        shares = _float(tx, "transactionShares")
        price = _float(tx, "transactionPricePerShare")
        total_value = _float(tx, "transactionTotalValue")
        if total_value is None and shares is not None and price is not None:
            # No explicit total in the document: derive it.
            total_value = shares * price
        post_tx_shares = _float(tx, "sharesOwnedFollowingTransaction")
        tx_date = _text(tx, "transactionDate") or filed_date

        if global_10b51:
            is_10b51 = 1
        else:
            # Only the footnotes this transaction references are examined.
            tx_footnote_text = " ".join(
                footnote_by_id.get(ref.get("id", ""), "")
                for ref in tx.findall(".//footnoteId")
            )
            is_10b51 = int(_is_10b51(tx_footnote_text))

        results.append(
            {
                "accession_number": accession_number,
                "ticker": ticker,
                "cik": cik,
                "insider_name": insider_name,
                "role": role,
                "transaction_date": tx_date,
                "filed_date": filed_date,
                "shares": shares,
                "price": price,
                "total_value": total_value,
                "flag": flag.upper(),
                "tx_code": tx_code.upper(),
                "is_10b51": is_10b51,
                "post_tx_shares": post_tx_shares,
            }
        )

    return results
|