smaug/ingestion/edgar_poller.py
claude b119b9abae feat: SQLAlchemy ORM models, filing cache incremental fetch, yfinance price cache
- Replace db/schema.sql + raw sqlite3 with SQLAlchemy ORM (db/models.py)
  - Filing, Signal, PriceCache models with proper indexes
  - db/db.py uses SQLAlchemy sessions throughout; no raw SQL strings
- Add PriceCache table: stores daily close prices per ticker
  - backtest._fetch_prices checks DB first; skips yfinance for completed ranges
  - New data persisted via upsert_prices()
  - get_cached_prices() / upsert_prices() added to db.py
- EDGAR poller incremental fetch: get_latest_filed_date() returns newest
  filed_date in DB; fetch_and_store_new_filings skips entries older than
  that cutoff before even checking accession_exists
- Add get_signals_for_backtest() to db.py; backtest no longer opens its
  own sqlite3 connection
- requirements.txt: add sqlalchemy>=2.0.0

Co-authored-by: dodox <dodox@users.noreply.local>
2026-05-04 17:21:23 +00:00

130 lines
4.2 KiB
Python

import time
import os
import logging
from typing import Optional
import requests
from lxml import etree, html
import config
from ingestion.form4_parser import parse_form4
from db.db import accession_exists, get_latest_filed_date, insert_filing
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Request headers sent with every call made through _fetch().
# NOTE(review): the contact address in the User-Agent looks like a
# placeholder — confirm it is replaced with a real one before deployment.
HEADERS = {
    "User-Agent": "smaug-insider-monitor contact@example.com",
    "Accept-Encoding": "gzip, deflate",
}

# "Current filings" Atom feed, filtered to form type 4, 40 entries per request.
EDGAR_ATOM_URL = (
    "https://www.sec.gov/cgi-bin/browse-edgar"
    "?action=getcurrent"
    "&type=4"
    "&dateb="
    "&owner=include"
    "&count=40"
    "&output=atom"
)
def _fetch(url: str, timeout: int = 30) -> requests.Response:
    """GET ``url`` using the module-wide ``HEADERS`` and return the response.

    Args:
        url: Address to request.
        timeout: Passed to ``requests.get`` as its ``timeout`` (default 30).

    Returns:
        The ``requests.Response`` object of the request.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` on an error
            status. Exceptions raised by ``requests.get`` itself propagate
            unchanged.
    """
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    response.raise_for_status()
    return response
def _get_filing_urls() -> list[tuple[str, str, str]]:
    """Download the Atom feed at ``EDGAR_ATOM_URL`` and describe its entries.

    Returns:
        One ``(url, accession, updated)`` tuple per feed entry that has an
        ``atom:link`` element, in feed order:

        * ``url`` - the link's ``href`` attribute ("" when absent);
        * ``accession`` - the last path segment of ``url`` with
          ``-index.htm`` and all dashes removed, re-dashed as
          ``XXXXXXXXXX-XX-XXXXXX`` when exactly 18 characters remain,
          otherwise the stripped segment as-is;
        * ``updated`` - the first 10 characters of the entry's
          ``atom:updated`` text (presumably ``YYYY-MM-DD``), or "" when the
          element is missing or empty.

    Raises:
        Whatever ``_fetch`` or ``etree.fromstring`` raise; nothing is caught
        here.
    """
    atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
    feed = etree.fromstring(_fetch(EDGAR_ATOM_URL).content)

    filings = []
    for entry in feed.findall("atom:entry", atom_ns):
        link_el = entry.find("atom:link", atom_ns)
        if link_el is None:
            continue

        href = link_el.get("href", "")
        updated_text = entry.findtext("atom:updated", namespaces=atom_ns)
        filed_date = updated_text[:10] if updated_text else ""

        # Reduce ".../<accession>-index.htm" to the bare accession characters.
        last_segment = href.rstrip("/").rsplit("/", 1)[-1]
        compact = last_segment.replace("-index.htm", "").replace("-", "")

        if len(compact) == 18:
            accession = "-".join((compact[:10], compact[10:12], compact[12:]))
        else:
            accession = compact

        filings.append((href, accession, filed_date))
    return filings
def _resolve_xml_url(accession: str) -> Optional[str]:
    """Find the URL of a filing's XML document from its EDGAR index page.

    The index page URL is built from ``config.EDGAR_BASE_URL`` and the
    accession number, then the links inside ``table.tableFile`` are scanned
    for hrefs ending in ``.xml``.

    Links whose path contains a directory starting with ``xsl`` are treated
    as stylesheet-rendered views of the document and are only used when no
    other ``.xml`` link is present, so the raw XML is preferred.

    Args:
        accession: Dashed accession number, e.g. ``0001234567-24-000001``.

    Returns:
        The absolute URL of the chosen XML document, or ``None`` when no
        ``.xml`` link is found or the index page cannot be fetched/parsed.
        Failures are logged at debug level; no exception is raised.
    """
    accession_path = accession.replace("-", "")
    # NOTE(review): the CIK is derived from the accession prefix, which
    # identifies the submitter of the filing — possibly a filing agent rather
    # than the issuer or owner. Confirm this path resolves for agent-submitted
    # filings; the index URL from the feed would be an alternative source.
    cik = accession_path[:10].lstrip("0")
    base = f"{config.EDGAR_BASE_URL}/Archives/edgar/data/{cik}/{accession_path}/"
    index_url = f"{base}{accession}-index.htm"

    def _absolute(href: str) -> str:
        # Site-rooted hrefs hang off the EDGAR host; others off the filing dir.
        return config.EDGAR_BASE_URL + href if href.startswith("/") else base + href

    try:
        resp = _fetch(index_url)
        doc = html.fromstring(resp.content)
        rendered_fallback = None
        for link in doc.cssselect("table.tableFile a[href]"):
            href = link.get("href", "")
            lowered = href.lower()
            if not lowered.endswith(".xml"):
                continue
            # Only directory segments are inspected, so a file merely named
            # "xsl....xml" is not mistaken for a rendered view.
            directories = lowered.split("/")[:-1]
            if any(part.startswith("xsl") for part in directories):
                if rendered_fallback is None:
                    rendered_fallback = href
                continue
            return _absolute(href)
        if rendered_fallback is not None:
            # No raw document listed: keep the previous behaviour of returning
            # the first .xml link rather than giving up.
            return _absolute(rendered_fallback)
    except Exception as e:
        # Best effort: the caller treats None as "no XML available".
        logger.debug(f"Could not resolve XML URL for {accession}: {e}")
    return None
def _save_raw_xml(accession: str, xml_bytes: bytes):
    """Write ``xml_bytes`` to ``<config.DATA_DIR>/<accession>.xml`` once.

    The data directory is created when missing. If the target path already
    exists it is left untouched and nothing is written.

    Args:
        accession: Accession number used as the file's base name.
        xml_bytes: Raw document bytes to store.
    """
    os.makedirs(config.DATA_DIR, exist_ok=True)
    destination = os.path.join(config.DATA_DIR, f"{accession}.xml")
    if os.path.exists(destination):
        # An earlier copy is kept as-is.
        return
    with open(destination, "wb") as out:
        out.write(xml_bytes)
def fetch_and_store_new_filings() -> list[dict]:
    """Poll the EDGAR feed once and store filings not yet in the database.

    For every feed entry returned by ``_get_filing_urls()`` the function:

    1. skips accessions already handled earlier in this poll;
    2. skips entries whose date is older than ``get_latest_filed_date()``;
    3. skips accessions for which ``accession_exists()`` is true;
    4. resolves and downloads the filing's XML document;
    5. saves the raw XML, parses it with ``parse_form4`` and passes each
       parsed filing to ``insert_filing``.

    A failure in any per-filing step is logged and that filing is skipped, so
    one bad document does not abort the rest of the batch.

    Returns:
        The parsed filings for which ``insert_filing`` returned a truthy
        value. An empty list is returned when the feed cannot be fetched.
    """
    new_filings = []
    try:
        entries = _get_filing_urls()
    except Exception as e:
        logger.error(f"Failed to fetch EDGAR index: {e}")
        return new_filings

    # NOTE(review): assumes get_latest_filed_date() returns a value comparable
    # with the feed's 10-character date string (e.g. ISO "YYYY-MM-DD") —
    # TODO confirm against db.db.
    latest_in_db = get_latest_filed_date()

    # Handle each accession at most once per poll, even if the feed repeats
    # it; otherwise a failed first attempt is immediately retried.
    attempted = set()
    for _index_url, accession, filed_date in entries:
        if accession in attempted:
            continue
        attempted.add(accession)

        # Cheap date cutoff before touching the database or the network.
        if latest_in_db and filed_date and filed_date < latest_in_db:
            logger.debug(f"Skipping {accession}: filed_date {filed_date} older than latest in DB {latest_in_db}")
            continue
        if accession_exists(accession):
            continue

        xml_url = _resolve_xml_url(accession)
        if not xml_url:
            logger.warning(f"No XML found for {accession}")
            continue

        try:
            xml_bytes = _fetch(xml_url).content
        except Exception as e:
            logger.error(f"Failed to fetch XML for {accession}: {e}")
            continue

        # Same log-and-continue policy as the download above: a malformed
        # document, a disk error or a failed insert only costs this filing.
        try:
            _save_raw_xml(accession, xml_bytes)
            for filing in parse_form4(xml_bytes, accession, filed_date):
                if insert_filing(filing):
                    new_filings.append(filing)
        except Exception:
            logger.exception("Failed to process filing %s", accession)
            continue
    return new_filings
def run_poller(on_new_filing=None):
    """Poll EDGAR forever, sleeping ``config.EDGAR_POLL_INTERVAL`` between polls.

    Each cycle calls ``fetch_and_store_new_filings()`` and, when a callback is
    supplied, invokes it once per newly stored filing.

    Errors never terminate the loop: an exception escaping the poll itself is
    logged with its traceback and the cycle is treated as having found
    nothing; an exception raised by the callback is logged and the remaining
    filings are still processed.

    Args:
        on_new_filing: Optional callable taking one filing. Filings are
            expected to support ``.get('accession_number')`` for error
            reporting.

    This function does not return.
    """
    logger.info("EDGAR poller started")
    while True:
        logger.info("Polling EDGAR for new Form 4 filings...")
        try:
            new = fetch_and_store_new_filings()
        except Exception:
            # Top-level boundary of a long-running loop: keep polling rather
            # than dying on a transient database or parsing error.
            logger.exception("EDGAR poll cycle failed; retrying after the poll interval")
            new = []
        else:
            logger.info(f"Found {len(new)} new filings")

        if on_new_filing:
            for filing in new:
                try:
                    on_new_filing(filing)
                except Exception as e:
                    logger.error(f"Error processing filing {filing.get('accession_number')}: {e}")

        time.sleep(config.EDGAR_POLL_INTERVAL)