- sec_bulk_ingest.py: new module — downloads quarterly form.idx from SEC EDGAR, filters Form 4/4A, fetches each filing's SGML/XML, parses and stores. Adaptive token-bucket rate limiter (backs off on 429/5xx, ramps on success). Uses filter_new_accessions for fast quarter-level dedup before any HTTP. Marks derivative-only filings as seen so they're skipped on resume. - form4_parser: extract tx_code (transactionCode) from each transaction row; fix role extraction (Director/10%owner/Officer fallback); fix _text() to handle <value> sub-elements; fix footnote text extraction - edgar_poller: filter feed entries to Form 4/4A only; skip XSLT stylesheet URLs when resolving XML filing links Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
226 lines
6.9 KiB
Python
226 lines
6.9 KiB
Python
"""
|
|
Proper SEC EDGAR bulk ingest using quarterly form.idx files.

Flow:
  1. Download form.idx for a quarter (one request, ~50 MB uncompressed)
  2. Filter to Form 4 and 4/A entries
  3. For each entry, the index gives us the direct submission .txt URL
  4. Fetch .txt → parse SGML → extract embedded XML → parse Form 4
  5. No index-page roundtrip needed; one HTTP request per filing.

Rate: starts at 9 req/s (just under the 10 req/s SEC allows) and adapts between
1 and 9 req/s, using a persistent requests.Session for connection reuse.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
import threading
|
|
import time
|
|
from typing import Optional
|
|
|
|
import requests
|
|
|
|
from db.db import accession_exists, filter_new_accessions, insert_filing, mark_accession_seen
|
|
from ingestion.form4_parser import parse_form4
|
|
|
|
logger = logging.getLogger(__name__)

# Root of the EDGAR archive; index-relative paths ("edgar/data/...") are
# appended to it to form filing URLs.
EDGAR_BASE = "https://www.sec.gov/Archives"
# Directory under which the per-quarter form.idx files are requested.
FULL_INDEX_BASE = f"{EDGAR_BASE}/edgar/full-index"

# Default headers installed on every session built by _make_session().
HEADERS = {
    "User-Agent": "smaug-insider-monitor mail@dominik-roth.eu",
    "Accept-Encoding": "gzip, deflate",
}

# Bounds for the adaptive rate limiter, in requests per second.
_RATE_MIN = 1.0
_RATE_MAX = 9.0
_RATE_INIT = 9.0  # starting req/s (SEC allows 10)
|
|
|
|
|
|
# ---------- adaptive token-bucket rate limiter ----------
|
|
|
|
class _AdaptiveRateLimiter:
    """Token bucket that backs off on server errors and slowly recovers.

    The bucket holds at most one token, so requests are paced at least
    ``1 / rate`` seconds apart.  A larger bucket would let a burst go out
    on top of the steady refill and push the short-term request rate well
    above ``rate``.

    Args:
        rate: Initial rate in requests per second; must be positive.
        min_rate: Floor applied by ``on_throttle``.  Defaults to ``_RATE_MIN``.
        max_rate: Ceiling applied by ``on_success``.  Defaults to ``_RATE_MAX``.

    Raises:
        ValueError: If ``rate`` is not positive.
    """

    # Maximum number of tokens the bucket can accumulate while idle.
    _CAPACITY = 1.0

    def __init__(
        self,
        rate: float,
        min_rate: Optional[float] = None,
        max_rate: Optional[float] = None,
    ):
        if rate <= 0:
            raise ValueError(f"rate must be positive, got {rate}")
        self._rate = rate
        self._min_rate = _RATE_MIN if min_rate is None else min_rate
        self._max_rate = _RATE_MAX if max_rate is None else max_rate
        self._tokens = self._CAPACITY
        self._last = time.monotonic()
        self._lock = threading.Lock()

    def acquire(self):
        """Block until the caller may send one request."""
        with self._lock:
            now = time.monotonic()
            # Refill for the time elapsed since the previous call.
            refilled = self._tokens + (now - self._last) * self._rate
            self._tokens = min(self._CAPACITY, refilled)
            self._last = now
            wait = max(0.0, (1 - self._tokens) / self._rate)
            # Reserve the token now; a negative balance makes later callers
            # wait correspondingly longer.
            self._tokens -= 1
        # Sleep outside the lock so other threads can still reserve or
        # adjust the rate in the meantime.
        if wait:
            time.sleep(wait)

    def on_success(self):
        """Raise the rate by 2%, capped at the configured maximum."""
        with self._lock:
            self._rate = min(self._max_rate, self._rate * 1.02)  # slow ramp-up

    def on_throttle(self):
        """Halve the rate, bounded below by the configured minimum."""
        with self._lock:
            self._rate = max(self._min_rate, self._rate * 0.5)
            current = self._rate
        logger.debug("Rate backed off to %.1f req/s", current)
|
|
|
|
|
|
# Module-wide limiter instance; _get() calls acquire() on it before every
# request attempt and reports successes and throttling back to it.
_limiter = _AdaptiveRateLimiter(_RATE_INIT)
|
|
|
|
|
|
# ---------- low-level ----------
|
|
|
|
def _make_session() -> requests.Session:
    """Return a new ``requests.Session`` with ``HEADERS`` applied as defaults."""
    session = requests.Session()
    session.headers.update(HEADERS)
    return session
|
|
|
|
|
|
def _get(session: requests.Session, url: str, retries: int = 4) -> requests.Response:
    """GET ``url`` through the shared rate limiter, retrying transient failures.

    Every attempt first waits on ``_limiter`` and uses a 30 second timeout.
    HTTP 429 and 5xx responses back the limiter off and are retried, as are
    errors raised by ``session.get`` itself.  Before retry number ``k``
    (starting at 0) the function sleeps ``2 ** k`` seconds.  Any other 4xx
    response is raised immediately: repeating the same request cannot change
    the outcome and would only spend more of the request budget.

    Args:
        session: Session used to perform the request.
        url: Absolute URL to fetch.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The first response that is neither 429 nor an error status.

    Raises:
        ValueError: If ``retries`` is less than 1.
        requests.HTTPError: For a non-retryable 4xx response, or when the
            last attempt still returned 429/5xx.
        requests.RequestException: When the last attempt failed before a
            response was received.
    """
    if retries < 1:
        # Without this guard the loop would not run and the function would
        # silently return None.
        raise ValueError(f"retries must be >= 1, got {retries}")

    delay = 1.0
    for attempt in range(retries):
        is_last = attempt == retries - 1
        _limiter.acquire()
        try:
            resp = session.get(url, timeout=30)
        except requests.RequestException as e:
            if is_last:
                raise
            reason = str(e)
        else:
            status = resp.status_code
            if status != 429 and status < 500:
                # Raises for the remaining (permanent) 4xx codes.
                resp.raise_for_status()
                _limiter.on_success()
                return resp
            _limiter.on_throttle()
            if is_last:
                raise requests.HTTPError(f"HTTP {status}", response=resp)
            reason = f"HTTP {status}"

        wait = delay * (2 ** attempt)
        logger.debug("Retry %d in %.0fs (%s)", attempt + 1, wait, reason)
        time.sleep(wait)

    # The final attempt always returns or raises above.
    raise AssertionError("unreachable")
|
|
|
|
|
|
# ---------- form.idx parsing ----------
|
|
|
|
def _download_form_idx(session: requests.Session, year: int, quarter: int) -> str:
    """Fetch the form.idx file for ``year``/``quarter`` and return its text.

    The URL is ``FULL_INDEX_BASE/<year>/QTR<quarter>/form.idx``; the request
    goes through ``_get``, so its exceptions propagate to the caller.
    """
    idx_url = f"{FULL_INDEX_BASE}/{year}/QTR{quarter}/form.idx"
    logger.info(f"Downloading {idx_url}")
    return _get(session, idx_url).text
|
|
|
|
|
|
# One form.idx row: form type, company name, CIK, filing date, file path.
# Only rows whose form type is exactly "4" or "4/A" match; the groups
# captured are the form type, the filing date and the .txt path.
_IDX_LINE = re.compile(
    r"^(4|4/A)\s+.+?\s+\d+\s+(\d{4}-\d{2}-\d{2})\s+(edgar/data/\S+\.txt)",
    re.IGNORECASE,
)

# Row separators accepted in the index.  str.splitlines() is deliberately
# not used: it also breaks on characters such as U+0085 or form feed, which
# would cut a row in two if one occurred inside a company name.
_LINE_BREAK = re.compile(r"\r\n|\r|\n")


def _parse_form_idx(text: str) -> list[tuple[str, str, str]]:
    """Return (accession, filed_date, txt_path) for all Form 4/4A entries.

    Args:
        text: Full contents of a form.idx file.

    Returns:
        One tuple per matching row, in file order.  ``accession`` is the
        file name of ``txt_path`` without its ``.txt`` suffix.  Rows that
        do not match ``_IDX_LINE`` (headers, other form types) are skipped.
    """
    results = []
    for line in _LINE_BREAK.split(text):
        m = _IDX_LINE.match(line)
        if not m:
            continue
        filed_date = m.group(2)
        txt_path = m.group(3)
        # The pattern guarantees the path ends in ".txt"; strip only that.
        accession = txt_path.rsplit("/", 1)[-1].removesuffix(".txt")
        results.append((accession, filed_date, txt_path))
    return results
|
|
|
|
|
|
# ---------- SGML → XML extraction ----------
|
|
|
|
def _extract_xml(txt_content: str) -> Optional[bytes]:
    """Pull ownershipDocument XML out of the SGML/XML submission wrapper.

    The returned slice runs from the first ``<?xml`` declaration or
    ``<ownershipDocument>`` tag, whichever comes first, through the first
    ``</ownershipDocument>`` closing tag.

    Args:
        txt_content: Full text of the submission file.

    Returns:
        The extracted document encoded as UTF-8 (unencodable characters
        replaced), or ``None`` when there is no closing tag or no start
        marker ahead of it.
    """
    end_tag = "</ownershipDocument>"
    close_at = txt_content.find(end_tag)
    if close_at == -1:
        return None

    # Only accept start markers located before the closing tag; a marker
    # found after it would otherwise yield an empty slice instead of None.
    starts = [
        pos
        for pos in (
            txt_content.find("<?xml", 0, close_at),
            txt_content.find("<ownershipDocument>", 0, close_at),
        )
        if pos != -1
    ]
    if not starts:
        return None

    end = close_at + len(end_tag)
    return txt_content[min(starts):end].encode("utf-8", errors="replace")
|
|
|
|
|
|
# ---------- per-filing fetch + store ----------
|
|
|
|
def _process_one(
    session: requests.Session,
    accession: str,
    filed_date: str,
    txt_path: str,
) -> int:
    """Fetch, parse and store a single filing; return the number of rows stored.

    Returns 0 without fetching when ``accession_exists`` reports the
    accession, and 0 when the download fails (the failure is logged at debug
    level and the accession is left unmarked).  When no XML can be extracted
    or ``parse_form4`` returns nothing, the accession is recorded through
    ``mark_accession_seen`` and 0 is returned.  Otherwise the result is the
    count of parsed filings for which ``insert_filing`` returned a truthy
    value.
    """
    if accession_exists(accession):
        return 0

    try:
        resp = _get(session, f"{EDGAR_BASE}/{txt_path}")
    except Exception as e:
        logger.debug(f"Skip {accession}: {e}")
        return 0

    xml_bytes = _extract_xml(resp.text)
    # Parsing is attempted only when an XML payload was found.
    parsed = parse_form4(xml_bytes, accession, filed_date) if xml_bytes else None
    if not parsed:
        mark_accession_seen(accession)
        return 0

    stored = 0
    for filing in parsed:
        if insert_filing(filing):
            stored += 1
    return stored
|
|
|
|
|
|
# ---------- public API ----------
|
|
|
|
def ingest_quarter(
    year: int,
    quarter: int,
    session: Optional[requests.Session] = None,
) -> int:
    """Ingest all Form 4 filings for one calendar quarter. Returns rows stored.

    Args:
        year: Year of the index to ingest.
        quarter: Quarter number used in the index URL (``QTR<quarter>``).
        session: Session to reuse.  When omitted, a session is created for
            this call and closed before returning, also if an error occurs.

    Returns:
        Total number of rows stored, summed over ``_process_one`` results.

    Raises:
        Whatever ``_download_form_idx`` raises if the index cannot be fetched.
    """
    own_session = session is None
    if own_session:
        session = _make_session()

    try:
        # The raw index text is not bound to a name, so it can be freed as
        # soon as parsing is done rather than living for the whole run.
        all_entries = _parse_form_idx(_download_form_idx(session, year, quarter))
        logger.info(f" {len(all_entries)} Form 4 entries in {year}/Q{quarter}")

        # Build a set once so the membership test below is O(1) per entry
        # regardless of the container type filter_new_accessions returns.
        new_accessions = set(filter_new_accessions([a for a, _, _ in all_entries]))
        entries = [entry for entry in all_entries if entry[0] in new_accessions]
        logger.info(f" {len(entries)} not yet in DB")

        stored = 0
        for i, (accession, filed_date, txt_path) in enumerate(entries):
            stored += _process_one(session, accession, filed_date, txt_path)
            if (i + 1) % 1000 == 0:
                logger.info(f" {i+1}/{len(entries)} processed, {stored} rows stored")

        logger.info(f" Quarter done: {stored} rows stored")
        return stored
    finally:
        # Only close a session this call created; a caller-supplied session
        # stays open for reuse.
        if own_session:
            session.close()
|
|
|
|
|
|
def ingest_years(start_year: int, end_year: int) -> int:
    """Ingest all Form 4 filings for start_year through end_year inclusive.

    Quarters 1 to 4 of every year are processed in order over one shared
    session, which is closed when the function exits, including when a
    quarter raises.

    Args:
        start_year: First year to ingest.
        end_year: Last year to ingest (inclusive).

    Returns:
        Total number of rows stored across all quarters; 0 when
        ``start_year`` is greater than ``end_year``.
    """
    session = _make_session()
    total = 0
    try:
        for year in range(start_year, end_year + 1):
            for quarter in range(1, 5):
                total += ingest_quarter(year, quarter, session=session)
                logger.info(f"Cumulative after {year}/Q{quarter}: {total} rows")
    finally:
        session.close()
    return total
|