# smaug/ingestion/historical_ingest.py

"""
Bulk-ingest historical Form 4 filings from SEC quarterly full-index files.

Usage via main.py:
  python main.py ingest-history --year 2024 --quarter 4 --limit 500
  python main.py ingest-history --year 2025 --quarter 1  # all ~118k filings
"""

import logging
import time
from typing import Optional

import requests
from lxml import html

import config
from db.db import accession_exists, insert_filing
from ingestion.edgar_poller import HEADERS, _fetch, _save_raw_xml
from ingestion.form4_parser import parse_form4

logger = logging.getLogger(__name__)

FULL_INDEX_URL = "https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{quarter}/form.idx"


def _parse_form_idx(content: str) -> list[tuple[str, str, str]]:
    """Extract Form 4 / 4/A rows from the text of a quarterly form.idx file.

    Everything up to and including the first line that starts with ``---``
    is treated as header and ignored. Each remaining line is read as a
    fixed-width record: form(12) company(62) cik(12) date(12) filename(rest).

    Returns a list of (cik, filed_date, filename) tuples in file order.
    """
    # Column windows of the fixed-width layout described above.
    form_col = slice(0, 12)
    cik_col = slice(74, 86)
    date_col = slice(86, 98)
    file_col = slice(98, None)
    wanted_forms = ("4", "4/A")

    rows = iter(content.splitlines())
    # Consume the header; if no separator exists the iterator is exhausted
    # and nothing is returned.
    for header_line in rows:
        if header_line.startswith("---"):
            break

    return [
        (row[cik_col].strip(), row[date_col].strip(), row[file_col].strip())
        for row in rows
        if row[form_col].strip() in wanted_forms
    ]


def _accession_from_filename(filename: str) -> str:
    """Derive the dashed accession number from a form.idx file path.

    Example: edgar/data/123/0001234567-25-000001.txt -> 0001234567-25-000001
    """
    # Keep only the last path component, ignoring trailing whitespace.
    _, _, leaf = filename.rstrip().rpartition("/")
    # Drop every ".txt" occurrence from that component.
    return "".join(leaf.split(".txt"))


def _resolve_xml_from_index(cik: str, accession: str) -> Optional[str]:
    """Find the raw XML document of a filing by scraping its index page.

    Fetches ``<accession>-index.htm`` from the filing's archive folder and
    returns the URL of the first link inside ``table.tableFile`` whose href
    ends in ``.xml`` and does not point at a stylesheet-rendered view.

    Args:
        cik: CIK used as the folder name in the archive path.
        accession: Dashed accession number, e.g. ``0001234567-25-000001``.

    Returns:
        The absolute URL of the XML document, or None when the index page
        cannot be fetched or parsed, or when it lists no suitable link.
    """
    folder = accession.replace("-", "")
    base = f"{config.EDGAR_BASE_URL}/Archives/edgar/data/{cik}/{folder}/"
    index_url = f"{base}{accession}-index.htm"
    try:
        resp = _fetch(index_url)
        doc = html.fromstring(resp.content)
        for link in doc.cssselect("table.tableFile a[href]"):
            href = link.get("href", "")
            lowered = href.lower()
            if not lowered.endswith(".xml"):
                continue
            # Links under an xslF345X<NN>/ path are stylesheet-rendered
            # views of the document rather than the raw XML. The version
            # suffix is not fixed (X05 and X06 are only two of them), so
            # match on the prefix instead of enumerating versions.
            if "xslf345x" in lowered:
                continue
            # Site-absolute hrefs hang off the EDGAR host; anything else is
            # taken as relative to the filing folder.
            return config.EDGAR_BASE_URL + href if href.startswith("/") else base + href
    except Exception as e:
        # Deliberately best-effort: a failure here just means "no XML found".
        logger.debug(f"Could not resolve XML for {accession}: {e}")
    return None


def _ingest_filing(cik: str, accession: str, filed_date: str) -> int:
    """Fetch, archive, parse and store one filing; return the rows stored.

    Returns 0 when the XML document cannot be located or downloaded.
    """
    xml_url = _resolve_xml_from_index(cik, accession)
    if not xml_url:
        logger.debug(f"No XML for {accession}")
        return 0

    try:
        xml_bytes = _fetch(xml_url).content
    except Exception as e:
        logger.debug(f"Failed to fetch XML {accession}: {e}")
        return 0

    # Keep the raw document before parsing it.
    _save_raw_xml(accession, xml_bytes)

    stored = 0
    for filing in parse_form4(xml_bytes, accession, filed_date):
        if insert_filing(filing):
            stored += 1
    return stored


def ingest_quarter(year: int, quarter: int, limit: Optional[int] = None, rate_limit: float = 0.15) -> int:
    """
    Download and store Form 4 filings for a given quarter.

    Args:
        year: Year of the quarterly full-index file to read.
        quarter: Quarter number, substituted into the ``QTR{quarter}`` part
            of the index URL.
        limit: If truthy, only the first ``limit`` index entries are
            considered. The cut is applied before duplicate accessions and
            already-stored filings are skipped.
        rate_limit: Seconds to sleep after each filing that triggered
            network requests (SEC allows ~10 req/s; 0.15 is safe).

    Returns:
        Count of new filings stored, i.e. how many ``insert_filing`` calls
        returned a truthy value. Returns 0 if the index cannot be fetched.
    """
    url = FULL_INDEX_URL.format(year=year, quarter=quarter)
    logger.info(f"Fetching index: {url}")
    try:
        resp = requests.get(url, headers=HEADERS, timeout=60)
        resp.raise_for_status()
    except Exception as e:
        logger.error(f"Failed to fetch index: {e}")
        return 0

    entries = _parse_form_idx(resp.text)
    logger.info(f"Found {len(entries)} Form 4 entries in {year}/QTR{quarter}")

    if limit:
        entries = entries[:limit]
        logger.info(f"Limited to {limit} entries")

    total = len(entries)
    seen_accessions: set[str] = set()
    stored = 0

    for position, (cik, filed_date, filename) in enumerate(entries, start=1):
        accession = _accession_from_filename(filename)

        # The same accession can appear on several index rows; handle it
        # once, and skip anything the database already has. Neither skip
        # costs a network request, so neither sleeps.
        if accession not in seen_accessions:
            seen_accessions.add(accession)
            if not accession_exists(accession):
                stored += _ingest_filing(cik, accession, filed_date)
                time.sleep(rate_limit)

        # Report on every 50th index entry regardless of whether that entry
        # was skipped, so long runs of known filings still show progress.
        if position % 50 == 0:
            logger.info(f"Progress: {position}/{total} processed, {stored} stored")

    logger.info(f"Done: {stored} new filings stored from {year}/QTR{quarter}")
    return stored