feat(ingestion): bulk historical ingest, form4 tx_code, parser fixes

- sec_bulk_ingest.py: new module — downloads quarterly form.idx from SEC EDGAR, filters Form 4/4A, fetches each filing's SGML/XML, parses and stores. Adaptive token-bucket rate limiter (backs off on 429/5xx, ramps on success). Uses filter_new_accessions for fast quarter-level dedup before any HTTP. Marks derivative-only filings as seen so they're skipped on resume.
- form4_parser: extract tx_code (transactionCode) from each transaction row; fix role extraction (Director/10%owner/Officer fallback); fix _text() to handle <value> sub-elements; fix footnote text extraction
- edgar_poller: filter feed entries to Form 4/4A only; skip XSLT stylesheet URLs when resolving XML filing links

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 17:48:51 +02:00 · 2026-05-26 17:48:51 +02:00 · b5268f063e
commit b5268f063e
parent 0fa36a3390
5 changed files with 611 additions and 4 deletions
--- a/ingestion/edgar_poller.py
+++ b/ingestion/edgar_poller.py
@ -34,6 +34,10 @@ def _get_filing_urls() -> list[tuple[str, str, str]]:
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    results = []
    for entry in root.findall("atom:entry", ns):
        title = entry.findtext("atom:title", namespaces=ns) or ""
        form_type = title.split(" - ")[0].strip()
        if form_type not in ("4", "4/A"):
            continue
        link = entry.find("atom:link", ns)
        if link is None:
            continue
@ -59,7 +63,11 @@ def _resolve_xml_url(accession: str) -> Optional[str]:
        doc = html.fromstring(resp.content)
        for link in doc.cssselect("table.tableFile a[href]"):
            href = link.get("href", "")
-            if href.lower().endswith(".xml") and not href.lower().endswith("-index.htm"):
+            if (
                href.lower().endswith(".xml")
                and not href.lower().endswith("-index.htm")
                and "xslF345X06" not in href
            ):
                return config.EDGAR_BASE_URL + href if href.startswith("/") else base + href
    except Exception as e:
        logger.debug(f"Could not resolve XML URL for {accession}: {e}")
--- a/ingestion/efts_ingest.py
+++ b/ingestion/efts_ingest.py
@ -0,0 +1,218 @@
 """
 Bulk ingest using EDGAR full-text search (EFTS) API.
 The EFTS API returns the XML filename in the _id field, avoiding the extra
 index-page fetch. A global token-bucket rate limiter keeps total throughput
 under SEC's 10 req/s limit across all threads.
 """
 import logging
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Optional
 import requests
 import config
 from db.db import accession_exists, insert_filing
 from ingestion.edgar_poller import HEADERS, _save_raw_xml
 from ingestion.form4_parser import parse_form4
 logger = logging.getLogger(__name__)
 # Search endpoint queried by _efts_page() and _collect_metadata().
 EFTS_URL = "https://efts.sec.gov/LATEST/search-index"
 # Page size sent as the "size" query parameter; also the pagination step.
 _BATCH_SIZE = 100
 _WORKERS = 1       # sequential avoids triggering SEC 500 rate-limit responses
 _MAX_RATE = 5.0   # conservative; SEC allows 10 but concurrent bursts get throttled
 # Largest "from" offset _collect_metadata() will request.
 _EFTS_MAX_OFFSET = 9900  # Elasticsearch hard ceiling
 # ---------- global token-bucket rate limiter ----------
 class _RateLimiter:
    def __init__(self, rate: float):
        self._rate = rate
        self._tokens = rate
        self._last = time.monotonic()
        self._lock = threading.Lock()
    def acquire(self):
        with self._lock:
            now = time.monotonic()
            elapsed = now - self._last
            self._tokens = min(self._rate, self._tokens + elapsed * self._rate)
            self._last = now
            if self._tokens < 1:
                sleep_for = (1 - self._tokens) / self._rate
            else:
                sleep_for = 0
            self._tokens -= 1
        if sleep_for > 0:
            time.sleep(sleep_for)
 # Module-wide limiter instance; _get() acquires a token from it before each request.
 _limiter = _RateLimiter(_MAX_RATE)
 def _get(url: str, params: Optional[dict] = None, retries: int = 4) -> requests.Response:
    """Rate-limited GET with exponential backoff on transient failures.

    Each attempt first takes a token from the module limiter. Network errors,
    5xx responses and the 4xx codes 403/408/429 are retried after
    2s, 4s, 8s, ...; any other 4xx (e.g. 404) is raised immediately because a
    retry cannot succeed and would only burn time and rate-limit budget.

    Args:
        url: Absolute URL to fetch.
        params: Optional query-string parameters.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The successful response.

    Raises:
        ValueError: if ``retries`` is below 1.
        requests.HTTPError: for a non-retryable status, or when the final
            attempt still returns an error status.
        Exception: whatever the final attempt raised (e.g. a connection error).
    """
    if retries < 1:
        # Without this the loop would not run and the function would return None.
        raise ValueError("retries must be >= 1")
    delay = 2.0
    for attempt in range(retries):
        _limiter.acquire()
        try:
            resp = requests.get(url, params=params, headers=HEADERS, timeout=25)
            # raise_for_status covers every 4xx/5xx, including 429/500/502/503.
            resp.raise_for_status()
            return resp
        except Exception as e:
            status = getattr(getattr(e, "response", None), "status_code", None)
            # NOTE(review): 403 is kept retryable on the assumption that the
            # server may use it for throttling -- confirm against SEC behaviour.
            permanent = (
                status is not None
                and 400 <= status < 500
                and status not in (403, 408, 429)
            )
            if permanent or attempt == retries - 1:
                raise
            wait = delay * (2 ** attempt)
            logger.debug(f"Retry {attempt+1} after {wait:.1f}s: {e}")
            time.sleep(wait)
    # Not reachable: the final attempt either returns or re-raises.
 # ---------- EFTS pagination ----------
 def _efts_page(start_date: str, end_date: str, offset: int) -> list[dict]:
    """Fetch one page of Form 4 search hits, starting at result ``offset``.

    Returns the raw hit dictionaries found under ``hits.hits`` in the JSON
    response. Errors from ``_get`` propagate to the caller.
    """
    # Keys are added in the same order as before so the query string is unchanged.
    query = {"q": "", "forms": "4", "dateRange": "custom"}
    query.update(startdt=start_date, enddt=end_date)
    query["from"] = offset
    query["size"] = _BATCH_SIZE
    payload = _get(EFTS_URL, params=query).json()
    return payload["hits"]["hits"]
 def _collect_metadata(start_date: str, end_date: str) -> list[tuple[str, str, str]]:
    """Return (accession, filename, filed_date) for all Form 4s in range.

    Pages through the search results ``_BATCH_SIZE`` hits at a time, up to
    offset ``_EFTS_MAX_OFFSET``. A page that fails is skipped (best effort) and
    pagination stops after three consecutive failures; both situations, and a
    result set larger than the pagination ceiling, are reported with a warning
    because they mean filings are missing from the returned list.

    Hits are de-duplicated by accession number (first occurrence wins). Hits
    with no accession, or whose ``_id`` has no ``:`` separated filename part,
    are dropped.

    Raises:
        Exception: whatever ``_get`` raises if the first page cannot be fetched.
    """
    params = {
        "q": "", "forms": "4",
        "dateRange": "custom",
        "startdt": start_date, "enddt": end_date,
        "from": 0, "size": _BATCH_SIZE,
    }
    data = _get(EFTS_URL, params=params).json()
    total_info = data["hits"]["total"]
    total = total_info["value"]
    cap = min(total, _EFTS_MAX_OFFSET + _BATCH_SIZE)
    logger.info(f"  EFTS reports {total} filings; fetching up to {cap}")
    # "gte" presumably marks a lower-bound count (Elasticsearch convention) -- confirm.
    if total > cap or total_info.get("relation") == "gte":
        logger.warning(
            f"  EFTS result set for {start_date}..{end_date} exceeds the "
            f"pagination ceiling; only the first {cap} hits will be fetched"
        )
    hits = list(data["hits"]["hits"])
    offset = _BATCH_SIZE
    consecutive_fails = 0
    failed_offsets: list[int] = []
    while len(hits) < cap and offset <= _EFTS_MAX_OFFSET:
        try:
            batch = _efts_page(start_date, end_date, offset)
        except Exception as e:
            consecutive_fails += 1
            failed_offsets.append(offset)
            logger.debug(f"EFTS offset {offset} failed ({consecutive_fails}): {e}")
            if consecutive_fails >= 3:
                break
        else:
            if not batch:
                break
            hits.extend(batch)
            consecutive_fails = 0
        offset += _BATCH_SIZE
    if failed_offsets:
        logger.warning(
            f"  {len(failed_offsets)} EFTS page(s) could not be fetched "
            f"(offsets {failed_offsets}); their filings are missing from this run"
        )
    seen: set[str] = set()
    tasks: list[tuple[str, str, str]] = []
    for hit in hits:
        src = hit.get("_source", {})
        acc = src.get("adsh", "")
        filed_date = src.get("file_date", "")
        hit_id = hit.get("_id", "")
        # Everything after the first ":" in _id is used as the document filename.
        filename = hit_id.split(":", 1)[1] if ":" in hit_id else ""
        if not acc or not filename or acc in seen:
            continue
        seen.add(acc)
        tasks.append((acc, filename, filed_date))
    return tasks
 # ---------- XML fetch + store ----------
 def _xml_url(accession: str, filename: str) -> str:
    """Build the Archives URL of ``filename`` inside the filing's folder.

    The folder path uses the accession number without dashes; the CIK segment
    is taken from its first ten digits with leading zeros removed.
    NOTE(review): this assumes the accession prefix is a CIK under which the
    filing is published -- confirm.
    """
    digits = accession.replace("-", "")
    cik_segment = digits[:10].lstrip("0")
    folder = f"{config.EDGAR_BASE_URL}/Archives/edgar/data/{cik_segment}/{digits}/"
    return folder + filename
 def _fetch_and_store(accession: str, filename: str, filed_date: str) -> int:
    """Download, archive and store one filing; return the number of rows inserted.

    Returns 0 without any HTTP request when the accession is already known or
    when ``filename`` is not a raw ``.xml`` document (names containing
    ``xslF345`` are skipped). A failed download is logged at debug level and
    also yields 0.
    """
    if accession_exists(accession):
        return 0
    wanted = filename.lower().endswith(".xml") and "xslF345" not in filename
    if not wanted:
        return 0
    url = _xml_url(accession, filename)
    try:
        payload = _get(url).content
    except Exception as e:
        logger.debug(f"Skip {accession}: {e}")
        return 0
    _save_raw_xml(accession, payload)
    inserted = 0
    for row in parse_form4(payload, accession, filed_date):
        # insert_filing's truthy result is counted as a newly stored row.
        if insert_filing(row):
            inserted += 1
    return inserted
 # ---------- public API ----------
 def ingest_date_range(
    start_date: str,
    end_date: str,
    limit: Optional[int] = None,
 ) -> int:
    """Ingest every Form 4 found by the search API between two dates.

    Args:
        start_date: First filing date, passed to the search as ``startdt``.
        end_date: Last filing date, passed to the search as ``enddt``.
        limit: If truthy, only the first ``limit`` filings are fetched.

    Returns:
        Total number of rows stored. A worker that raises is logged at debug
        level and contributes nothing to the total.
    """
    logger.info(f"Ingesting {start_date} → {end_date}")
    tasks = _collect_metadata(start_date, end_date)
    if limit:
        tasks = tasks[:limit]
    logger.info(f"  {len(tasks)} unique filings to fetch")
    total_stored = 0
    with ThreadPoolExecutor(max_workers=_WORKERS) as pool:
        pending = [pool.submit(_fetch_and_store, *task) for task in tasks]
        for done, future in enumerate(as_completed(pending), start=1):
            try:
                total_stored += future.result()
            except Exception as e:
                logger.debug(f"Worker error: {e}")
            # Progress line every 500 completed filings.
            if done % 500 == 0:
                logger.info(f"  {done}/{len(tasks)} fetched, {total_stored} rows stored")
    logger.info(f"  Done: {total_stored} rows stored")
    return total_stored
 def ingest_years(start_year: int, end_year: int) -> int:
    """
    Ingest all Form 4 filings for start_year..end_year (inclusive).
    Runs one search per calendar day, from 1 January of start_year to
    31 December of end_year, so that each query returns a small result set.
    Pagination stops at offset _EFTS_MAX_OFFSET, so a day with more hits than
    that would be truncated.
    Returns the cumulative number of rows stored.
    """
    from datetime import date, timedelta
    first = date(start_year, 1, 1)
    last = date(end_year, 12, 31)
    total = 0
    # An end year before the start year gives a negative span and an empty range.
    for step in range((last - first).days + 1):
        ds = (first + timedelta(days=step)).strftime("%Y-%m-%d")
        total += ingest_date_range(ds, ds)
        logger.info(f"{ds} done — cumulative: {total} rows")
    return total
--- a/ingestion/form4_parser.py
+++ b/ingestion/form4_parser.py
@ -18,8 +18,13 @@ def _is_10b51(text: str) -> bool:
 def _text(el, tag: str) -> Optional[str]:
    node = el.find(".//" + tag)
-    if node is not None and node.text:
+    if node is None:
        return None
    if node.text and node.text.strip():
        return node.text.strip()
    value_node = node.find("value")
    if value_node is not None and value_node.text and value_node.text.strip():
        return value_node.text.strip()
    return None
@ -42,7 +47,18 @@ def parse_form4(xml_bytes: bytes, accession_number: str, filed_date: str) -> lis
    ticker = _text(root, "issuerTradingSymbol") or ""
    cik = _text(root, "issuerCik") or ""
    insider_name = _text(root, "rptOwnerName") or ""
-    role = _text(root, "officerTitle") or _text(root, "isDirector") or ""
+
    officer_title = _text(root, "officerTitle") or ""
    if officer_title:
        role = officer_title
    elif _text(root, "isDirector") == "1":
        role = "Director"
    elif _text(root, "isTenPercentOwner") == "1":
        role = "10% owner"
    elif _text(root, "isOfficer") == "1":
        role = "Officer"
    else:
        role = ""
    footnotes_text = " ".join(
        (node.text or "") for node in root.findall(".//footnote")
@ -57,6 +73,8 @@ def parse_form4(xml_bytes: bytes, accession_number: str, filed_date: str) -> lis
        if not flag:
            continue
        tx_code = _text(tx, "transactionCode") or ""
        shares = _float(tx, "transactionShares")
        price = _float(tx, "transactionPricePerShare")
        total_value = _float(tx, "transactionTotalValue")
@ -69,7 +87,8 @@ def parse_form4(xml_bytes: bytes, accession_number: str, filed_date: str) -> lis
            fn.get("id", "") for fn in tx.findall(".//footnoteId")
        ]
        tx_footnote_text = " ".join(
-            (root.find(f".//footnote[@id='{fid}']") or etree.Element("x")).text or ""
+            (root.find(f".//footnote[@id='{fid}']") is not None
             and root.find(f".//footnote[@id='{fid}']").text or "")
            for fid in tx_footnote_ids
        )
        is_10b51 = int(global_10b51 or _is_10b51(tx_footnote_text))
@ -87,6 +106,7 @@ def parse_form4(xml_bytes: bytes, accession_number: str, filed_date: str) -> lis
                "price": price,
                "total_value": total_value,
                "flag": flag.upper(),
                "tx_code": tx_code.upper(),
                "is_10b51": is_10b51,
                "post_tx_shares": post_tx_shares,
            }
--- a/ingestion/historical_ingest.py
+++ b/ingestion/historical_ingest.py
@ -0,0 +1,136 @@
 """
 Bulk-ingest historical Form 4 filings from SEC quarterly full-index files.
 Usage via main.py:
  python main.py ingest-history --year 2024 --quarter 4 --limit 500
  python main.py ingest-history --year 2025 --quarter 1  # all ~118k filings
 """
 import logging
 import time
 from typing import Optional
 import requests
 from lxml import html
 import config
 from db.db import accession_exists, insert_filing
 from ingestion.edgar_poller import HEADERS, _fetch, _save_raw_xml
 from ingestion.form4_parser import parse_form4
 logger = logging.getLogger(__name__)
 # URL template for a quarter's form.idx; ingest_quarter() fills in {year} and {quarter}.
 FULL_INDEX_URL = "https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{quarter}/form.idx"
 def _parse_form_idx(content: str) -> list[tuple[str, str, str]]:
    """Return (cik, filed_date, filename) tuples for Form 4 entries."""
    results = []
    in_data = False
    for line in content.splitlines():
        if line.startswith("---"):
            in_data = True
            continue
        if not in_data:
            continue
        form_type = line[:12].strip()
        if form_type not in ("4", "4/A"):
            continue
        # fixed-width: form(12) company(62) cik(12) date(12) filename(rest)
        cik = line[74:86].strip()
        filed_date = line[86:98].strip()
        filename = line[98:].strip()
        results.append((cik, filed_date, filename))
    return results
 def _accession_from_filename(filename: str) -> str:
    """edgar/data/123/0001234567-25-000001.txt -> 0001234567-25-000001"""
    base = filename.rstrip().split("/")[-1]
    return base.replace(".txt", "")
 def _resolve_xml_from_index(cik: str, accession: str) -> Optional[str]:
    """Find the filing's XML document URL by reading its ``-index.htm`` page.

    Scans the links in ``table.tableFile`` and returns the first ``.xml`` href
    that is not served through the xslF345X05 / xslF345X06 stylesheet paths.
    Hrefs starting with ``/`` are resolved against the EDGAR base URL, others
    against the filing folder. Returns None when nothing matches or when the
    index page cannot be fetched or parsed (logged at debug level).
    """
    folder = accession.replace("-", "")
    base = f"{config.EDGAR_BASE_URL}/Archives/edgar/data/{cik}/{folder}/"
    index_url = f"{base}{accession}-index.htm"
    stylesheet_markers = ("xslF345X06", "xslF345X05")
    try:
        page = html.fromstring(_fetch(index_url).content)
        for anchor in page.cssselect("table.tableFile a[href]"):
            href = anchor.get("href", "")
            lowered = href.lower()
            if not lowered.endswith(".xml") or lowered.endswith("-index.htm"):
                continue
            if any(marker in href for marker in stylesheet_markers):
                continue
            if href.startswith("/"):
                return config.EDGAR_BASE_URL + href
            return base + href
    except Exception as e:
        logger.debug(f"Could not resolve XML for {accession}: {e}")
    return None
 def ingest_quarter(year: int, quarter: int, limit: Optional[int] = None, rate_limit: float = 0.15) -> int:
    """
    Download and store Form 4 filings for a given quarter.

    year / quarter: select the quarterly form.idx to download.
    limit: if truthy, only the first ``limit`` index entries are processed.
    rate_limit: seconds slept after each filing that triggered HTTP requests
    (SEC allows ~10 req/s; 0.15 is safe).

    Returns the number of rows for which insert_filing reported success, or 0
    if the index itself could not be downloaded (logged as an error).
    """
    url = FULL_INDEX_URL.format(year=year, quarter=quarter)
    logger.info(f"Fetching index: {url}")
    try:
        resp = requests.get(url, headers=HEADERS, timeout=60)
        resp.raise_for_status()
    except Exception as e:
        logger.error(f"Failed to fetch index: {e}")
        return 0
    entries = _parse_form_idx(resp.text)
    logger.info(f"Found {len(entries)} Form 4 entries in {year}/QTR{quarter}")
    if limit:
        entries = entries[:limit]
        logger.info(f"Limited to {limit} entries")
    stored = 0
    seen_accessions: set[str] = set()
    for i, (cik, filed_date, filename) in enumerate(entries):
        accession = _accession_from_filename(filename)
        # Skip repeats within this index and filings already in the database;
        # neither case costs an HTTP request, so neither sleeps.
        if accession in seen_accessions:
            continue
        seen_accessions.add(accession)
        if accession_exists(accession):
            continue
        xml_url = _resolve_xml_from_index(cik, accession)
        if not xml_url:
            logger.debug(f"No XML for {accession}")
            time.sleep(rate_limit)
            continue
        try:
            xml_bytes = _fetch(xml_url).content
        except Exception as e:
            logger.debug(f"Failed to fetch XML {accession}: {e}")
            time.sleep(rate_limit)
            continue
        _save_raw_xml(accession, xml_bytes)
        stored += sum(
            1
            for filing in parse_form4(xml_bytes, accession, filed_date)
            if insert_filing(filing)
        )
        # Progress is keyed on the index position, so skipped entries can hide a tick.
        if (i + 1) % 50 == 0:
            logger.info(f"Progress: {i+1}/{len(entries)} processed, {stored} stored")
        time.sleep(rate_limit)
    logger.info(f"Done: {stored} new filings stored from {year}/QTR{quarter}")
    return stored
--- a/ingestion/sec_bulk_ingest.py
+++ b/ingestion/sec_bulk_ingest.py
@ -0,0 +1,225 @@
 """
 Proper SEC EDGAR bulk ingest using quarterly form.idx files.
 Flow:
  1. Download form.idx for a quarter (one request, ~50 MB uncompressed)
  2. Filter to Form 4 / 4/A entries
  3. For each entry the index gives us the direct submission .txt URL
  4. Fetch .txt → parse SGML → extract embedded XML → parse Form 4
  5. No index-page roundtrip needed; one HTTP request per filing.
 Rate: an adaptive token bucket caps requests at 9 req/s (SEC allows 10); a persistent requests.Session provides connection reuse.
 """
 import logging
 import re
 import threading
 import time
 from typing import Optional
 import requests
 from db.db import accession_exists, filter_new_accessions, insert_filing, mark_accession_seen
 from ingestion.form4_parser import parse_form4
 logger = logging.getLogger(__name__)
 # Root of the quarterly index tree; _download_form_idx() appends /{year}/QTR{n}/form.idx.
 FULL_INDEX_BASE = "https://www.sec.gov/Archives/edgar/full-index"
 # Prefix joined with the "edgar/data/..." paths taken from form.idx.
 EDGAR_BASE = "https://www.sec.gov/Archives"
 # Default headers installed on every session created by _make_session().
 HEADERS = {
    "User-Agent": "smaug-insider-monitor mail@dominik-roth.eu",
    "Accept-Encoding": "gzip, deflate",
 }
 # Bounds for the adaptive limiter: it starts at _RATE_INIT, halves on throttling
 # down to _RATE_MIN, and climbs back by 2% per success up to _RATE_MAX.
 _RATE_INIT = 9.0   # starting req/s (SEC allows 10)
 _RATE_MIN  = 1.0
 _RATE_MAX  = 9.0
 # ---------- adaptive token-bucket rate limiter ----------
 class _AdaptiveRateLimiter:
    """Token bucket that backs off on server errors and slowly recovers."""
    def __init__(self, rate: float):
        self._rate = rate
        self._tokens = rate
        self._last = time.monotonic()
        self._lock = threading.Lock()
    def acquire(self):
        with self._lock:
            now = time.monotonic()
            self._tokens = min(self._rate, self._tokens + (now - self._last) * self._rate)
            self._last = now
            wait = max(0.0, (1 - self._tokens) / self._rate)
            self._tokens -= 1
        if wait:
            time.sleep(wait)
    def on_success(self):
        with self._lock:
            self._rate = min(_RATE_MAX, self._rate * 1.02)  # slow ramp-up
    def on_throttle(self):
        with self._lock:
            self._rate = max(_RATE_MIN, self._rate * 0.5)
            logger.debug(f"Rate backed off to {self._rate:.1f} req/s")
 # Module-wide limiter instance; _get() acquires from it and reports success/throttling to it.
 _limiter = _AdaptiveRateLimiter(_RATE_INIT)
 # ---------- low-level ----------
 def _make_session() -> requests.Session:
    """Create a requests.Session that sends the module HEADERS by default."""
    session = requests.Session()
    session.headers.update(HEADERS)
    return session
 def _get(session: requests.Session, url: str, retries: int = 4) -> requests.Response:
    """Rate-limited GET with exponential backoff and adaptive-rate feedback.

    Each attempt first takes a token from the module limiter. A 429 or 5xx
    response halves the limiter's rate and is retried; network errors and the
    4xx codes 403/408 are retried as well (after 1s, 2s, 4s, ...). Any other
    4xx (e.g. 404) is raised immediately because a retry cannot succeed and
    would only burn time and rate-limit budget. A successful response nudges
    the limiter's rate back up.

    Args:
        session: Session used to send the request.
        url: Absolute URL to fetch.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The successful response.

    Raises:
        ValueError: if ``retries`` is below 1.
        requests.HTTPError: for a non-retryable status, or when the final
            attempt still returns an error status.
        Exception: whatever the final attempt raised (e.g. a connection error).
    """
    if retries < 1:
        # Without this the loop would not run and the function would return None.
        raise ValueError("retries must be >= 1")
    delay = 1.0
    for attempt in range(retries):
        _limiter.acquire()
        try:
            resp = session.get(url, timeout=30)
            if resp.status_code == 429 or resp.status_code >= 500:
                _limiter.on_throttle()
                # Attach the response so handlers can inspect the status code.
                raise requests.HTTPError(f"HTTP {resp.status_code}", response=resp)
            resp.raise_for_status()
            _limiter.on_success()
            return resp
        except Exception as e:
            status = getattr(getattr(e, "response", None), "status_code", None)
            # NOTE(review): 403 is kept retryable on the assumption that the
            # server may use it for throttling -- confirm against SEC behaviour.
            permanent = (
                status is not None
                and 400 <= status < 500
                and status not in (403, 408, 429)
            )
            if permanent or attempt == retries - 1:
                raise
            wait = delay * (2 ** attempt)
            logger.debug(f"Retry {attempt+1} in {wait:.0f}s ({e})")
            time.sleep(wait)
    # Not reachable: the final attempt either returns or re-raises.
 # ---------- form.idx parsing ----------
 def _download_form_idx(session: requests.Session, year: int, quarter: int) -> str:
    """Download the form.idx of one quarter and return it as decoded text.

    Errors from ``_get`` propagate to the caller.
    """
    url = f"{FULL_INDEX_BASE}/{year}/QTR{quarter}/form.idx"
    logger.info(f"Downloading {url}")
    return _get(session, url).text
 _IDX_LINE = re.compile(
    r"^(4|4/A)\s+.+?\s+\d+\s+(\d{4}-\d{2}-\d{2})\s+(edgar/data/\S+\.txt)",
    re.IGNORECASE,
 )
 def _parse_form_idx(text: str) -> list[tuple[str, str, str]]:
    """Return (accession, filed_date, txt_path) for all Form 4/4A entries."""
    results = []
    for line in text.splitlines():
        m = _IDX_LINE.match(line)
        if not m:
            continue
        filed_date = m.group(2)
        txt_path = m.group(3)
        accession = txt_path.split("/")[-1].replace(".txt", "")
        results.append((accession, filed_date, txt_path))
    return results
 # ---------- SGML → XML extraction ----------
 def _extract_xml(txt_content: str) -> Optional[bytes]:
    """Pull ownershipDocument XML out of the SGML/XML submission wrapper."""
    end_tag = "</ownershipDocument>"
    end = txt_content.find(end_tag)
    if end == -1:
        return None
    end += len(end_tag)
    # Start from <?xml declaration or <ownershipDocument>, whichever comes first
    start_xml = txt_content.find("<?xml")
    start_doc = txt_content.find("<ownershipDocument>")
    candidates = [i for i in (start_xml, start_doc) if i != -1]
    if not candidates:
        return None
    start = min(candidates)
    return txt_content[start:end].encode("utf-8", errors="replace")
 # ---------- per-filing fetch + store ----------
 def _process_one(
    session: requests.Session,
    accession: str,
    filed_date: str,
    txt_path: str,
 ) -> int:
    """Fetch one submission, parse it and store its rows; return rows inserted.

    Returns 0 when the accession is already known, when the download fails
    (logged at debug level, accession left unmarked so it can be retried), or
    when the submission yields no XML or no parsed rows. In the last two cases
    the accession is marked as seen.
    """
    if accession_exists(accession):
        return 0
    url = f"{EDGAR_BASE}/{txt_path}"
    try:
        resp = _get(session, url)
    except Exception as e:
        logger.debug(f"Skip {accession}: {e}")
        return 0
    xml_bytes = _extract_xml(resp.text)
    # Parsing is only attempted when an XML payload was actually found.
    parsed = parse_form4(xml_bytes, accession, filed_date) if xml_bytes else None
    if not parsed:
        mark_accession_seen(accession)
        return 0
    inserted = 0
    for row in parsed:
        if insert_filing(row):
            inserted += 1
    return inserted
 # ---------- public API ----------
 def ingest_quarter(year: int, quarter: int, session: Optional[requests.Session] = None) -> int:
    """Ingest all Form 4 filings for one calendar quarter. Returns rows stored.

    Args:
        year: Calendar year of the index to ingest.
        quarter: Quarter number used in the index path (QTR{quarter}).
        session: Session to reuse. When omitted, a session is created here and
            closed before returning, including when an error is raised.

    Raises:
        Exception: whatever ``_get`` raises if form.idx cannot be downloaded.
    """
    own_session = session is None
    if own_session:
        session = _make_session()
    try:
        idx_text = _download_form_idx(session, year, quarter)
        all_entries = _parse_form_idx(idx_text)
        logger.info(f"  {len(all_entries)} Form 4 entries in {year}/Q{quarter}")
        accessions = [a for a, _, _ in all_entries]
        # Copy into a set so the membership test below is O(1) whatever
        # container filter_new_accessions returns.
        remaining = set(filter_new_accessions(accessions))
        entries = []
        for entry in all_entries:
            accession = entry[0]
            if accession in remaining:
                # Keep only the first row per accession so an accession that
                # appears on several index rows is fetched once.
                remaining.discard(accession)
                entries.append(entry)
        logger.info(f"  {len(entries)} not yet in DB")
        stored = 0
        for i, (accession, filed_date, txt_path) in enumerate(entries):
            stored += _process_one(session, accession, filed_date, txt_path)
            if (i + 1) % 1000 == 0:
                logger.info(f"  {i+1}/{len(entries)} processed, {stored} rows stored")
        logger.info(f"  Quarter done: {stored} rows stored")
        return stored
    finally:
        # Only close sessions created here; a caller-supplied one stays open.
        if own_session:
            session.close()
 def ingest_years(start_year: int, end_year: int) -> int:
    """Ingest all Form 4 filings for start_year through end_year inclusive.

    Quarters 1-4 of every year are processed in order over one shared session,
    which is closed even if a quarter raises. Returns the total rows stored.

    Raises:
        Exception: whatever ``ingest_quarter`` raises; quarters already
            processed stay stored.
    """
    session = _make_session()
    total = 0
    try:
        for year in range(start_year, end_year + 1):
            for quarter in range(1, 5):
                stored = ingest_quarter(year, quarter, session=session)
                total += stored
                logger.info(f"Cumulative after {year}/Q{quarter}: {total} rows")
    finally:
        session.close()
    return total