- Replace db/schema.sql + raw sqlite3 with SQLAlchemy ORM (db/models.py)
- Filing, Signal, PriceCache models with proper indexes
- db/db.py uses SQLAlchemy sessions throughout; no raw SQL strings
- Add PriceCache table: stores daily close prices per ticker
- backtest._fetch_prices checks DB first; skips yfinance for completed ranges
- New data persisted via upsert_prices()
- get_cached_prices() / upsert_prices() added to db.py
- EDGAR poller incremental fetch: get_latest_filed_date() returns newest filed_date in DB; fetch_and_store_new_filings skips entries older than that cutoff before even checking accession_exists
- Add get_signals_for_backtest() to db.py; backtest no longer opens its own sqlite3 connection
- requirements.txt: add sqlalchemy>=2.0.0

Co-authored-by: dodox <dodox@users.noreply.local>
130 lines
4.2 KiB
Python
130 lines
4.2 KiB
Python
import time
|
|
import os
|
|
import logging
|
|
from typing import Optional
|
|
import requests
|
|
from lxml import etree, html
|
|
|
|
import config
|
|
from ingestion.form4_parser import parse_form4
|
|
from db.db import accession_exists, get_latest_filed_date, insert_filing
|
|
|
|
# Logger for this module, named after the module itself.
logger = logging.getLogger(__name__)

# Headers sent with every GET issued through _fetch.
HEADERS = {
    "User-Agent": "smaug-insider-monitor contact@example.com",
    "Accept-Encoding": "gzip, deflate",
}

# Atom feed URL: EDGAR "getcurrent" listing, type=4, 40 entries per request.
EDGAR_ATOM_URL = "https://www.sec.gov/cgi-bin/browse-edgar?" + "&".join(
    (
        "action=getcurrent",
        "type=4",
        "dateb=",
        "owner=include",
        "count=40",
        "output=atom",
    )
)
|
|
|
|
|
|
def _fetch(url: str, timeout: int = 30) -> requests.Response:
    """GET *url* with the module-level HEADERS and return the response.

    Args:
        url: Address to request.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Returns:
        The response object, after ``raise_for_status()`` has accepted it.

    Raises:
        Whatever ``requests.get`` or ``Response.raise_for_status`` raise
        (connection problems, timeouts, HTTP error statuses).
    """
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    response.raise_for_status()
    return response
|
|
|
|
|
|
def _get_filing_urls() -> list[tuple[str, str, str]]:
    """Download the EDGAR Atom feed and list its entries.

    Returns:
        One ``(url, accession, updated)`` tuple per ``atom:entry`` that has an
        ``atom:link`` child, in feed order:

        * ``url`` - the link's ``href`` attribute (``""`` if absent).
        * ``accession`` - derived from the last path segment of ``url`` with
          ``-index.htm`` and all dashes removed; when exactly 18 characters
          remain they are re-dashed as 10-2-6, otherwise the stripped text is
          used unchanged.
        * ``updated`` - the first ten characters of the entry's
          ``atom:updated`` text (``""`` if the element is missing).

    Raises:
        Anything raised by ``_fetch`` or by ``etree.fromstring``.
    """
    atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
    feed = etree.fromstring(_fetch(EDGAR_ATOM_URL).content)

    filings = []
    for entry in feed.findall("atom:entry", atom_ns):
        link_el = entry.find("atom:link", atom_ns)
        if link_el is None:
            # Without a link there is nothing to derive an accession from.
            continue

        href = link_el.get("href", "")
        timestamp = entry.findtext("atom:updated", namespaces=atom_ns) or ""

        last_segment = href.rstrip("/").split("/")[-1]
        compact = last_segment.replace("-index.htm", "").replace("-", "")
        accession = (
            f"{compact[:10]}-{compact[10:12]}-{compact[12:]}"
            if len(compact) == 18
            else compact
        )

        filings.append((href, accession, timestamp[:10]))
    return filings
|
|
|
|
|
|
def _resolve_xml_url(accession: str) -> Optional[str]:
    """Find the URL of a filing's XML document from its EDGAR index page.

    The index page address is built from ``config.EDGAR_BASE_URL`` and the
    accession number; the page's ``table.tableFile`` links are then scanned
    for hrefs ending in ``.xml``.

    A link whose path has no ``xsl*`` directory segment is preferred.
    NOTE(review): on EDGAR index pages the ``xslF345.../`` link is, as far as
    I know, the stylesheet-rendered (HTML) view and is usually listed before
    the raw XML - confirm against a live index page. If only ``xsl*`` links
    exist, the first ``.xml`` link is returned, as before.

    Args:
        accession: Accession number, normally in dashed 10-2-6 form.

    Returns:
        Absolute URL of the chosen XML document, or ``None`` when the index
        page cannot be fetched/parsed or lists no ``.xml`` link.
    """
    accession_path = accession.replace("-", "")
    # NOTE(review): the first ten digits of an accession number are used as
    # the CIK directory here; verify this holds for agent-submitted filings.
    cik = accession_path[:10].lstrip("0")
    base = f"{config.EDGAR_BASE_URL}/Archives/edgar/data/{cik}/{accession_path}/"
    index_url = f"{base}{accession}-index.htm"
    try:
        resp = _fetch(index_url)
        doc = html.fromstring(resp.content)
        fallback = None
        for link in doc.cssselect("table.tableFile a[href]"):
            href = link.get("href", "")
            if not href.lower().endswith(".xml"):
                continue
            # Site-absolute hrefs hang off the host; others off the filing dir.
            resolved = (
                config.EDGAR_BASE_URL + href if href.startswith("/") else base + href
            )
            directories = href.split("/")[:-1]
            if not any(part.lower().startswith("xsl") for part in directories):
                return resolved
            if fallback is None:
                fallback = resolved
        return fallback
    except Exception as e:
        # Best effort: the caller treats None as "no XML found" and moves on.
        logger.debug(f"Could not resolve XML URL for {accession}: {e}")
    return None
|
|
|
|
|
|
def _save_raw_xml(accession: str, xml_bytes: bytes):
    """Write *xml_bytes* to ``<config.DATA_DIR>/<accession>.xml`` once.

    The data directory is created when missing. If the target file already
    exists it is left untouched.

    Args:
        accession: Accession number used as the file's base name.
        xml_bytes: Raw document bytes to store.
    """
    os.makedirs(config.DATA_DIR, exist_ok=True)
    target = os.path.join(config.DATA_DIR, f"{accession}.xml")
    if os.path.exists(target):
        # Already archived by an earlier run; keep the existing copy.
        return
    with open(target, "wb") as out:
        out.write(xml_bytes)
|
|
|
|
|
|
def fetch_and_store_new_filings() -> list[dict]:
    """Run one poll: fetch the feed, then download, parse and store new filings.

    For each feed entry the function, in order:

    1. skips it when its date is older than ``get_latest_filed_date()``;
    2. skips it when ``accession_exists(accession)`` is true;
    3. resolves and downloads the filing's XML document;
    4. archives the raw XML, parses it with ``parse_form4`` and passes every
       parsed item to ``insert_filing``.

    A failure in steps 3-4 is logged and only that accession is skipped, so
    one bad filing cannot abort the rest of the batch.

    Returns:
        The parsed filings for which ``insert_filing`` returned a truthy
        value. Empty when the feed itself could not be fetched.
    """
    new_filings = []
    try:
        entries = _get_filing_urls()
    except Exception as e:
        logger.error(f"Failed to fetch EDGAR index: {e}")
        return new_filings

    latest_in_db = get_latest_filed_date()

    for _index_url, accession, filed_date in entries:
        # Cheap cutoff before touching the DB per accession. Entries dated the
        # same day as the newest stored filing still go through the check.
        # NOTE(review): this compares filed_date (a "YYYY-MM-DD"-style string
        # prefix) with latest_in_db directly - confirm get_latest_filed_date()
        # returns a string in the same format.
        if latest_in_db and filed_date and filed_date < latest_in_db:
            logger.debug(f"Skipping {accession}: filed_date {filed_date} older than latest in DB {latest_in_db}")
            continue

        if accession_exists(accession):
            continue

        xml_url = _resolve_xml_url(accession)
        if not xml_url:
            logger.warning(f"No XML found for {accession}")
            continue

        try:
            xml_bytes = _fetch(xml_url).content
        except Exception as e:
            logger.error(f"Failed to fetch XML for {accession}: {e}")
            continue

        # Archiving, parsing and inserting were previously unguarded: one
        # exception here ended the whole poll and propagated to the caller.
        try:
            _save_raw_xml(accession, xml_bytes)
            for filing in parse_form4(xml_bytes, accession, filed_date):
                if insert_filing(filing):
                    new_filings.append(filing)
        except Exception:
            logger.exception(f"Failed to process filing {accession}")
            continue

    return new_filings
|
|
|
|
|
|
def run_poller(on_new_filing=None):
    """Poll EDGAR forever, sleeping ``config.EDGAR_POLL_INTERVAL`` between polls.

    Each iteration calls ``fetch_and_store_new_filings()`` and, when a callback
    is supplied, invokes it once per newly stored filing. A callback error is
    logged and does not stop the remaining callbacks. A failed poll is logged
    with its traceback and retried after the normal interval, so the loop is
    not ended by one error.

    Args:
        on_new_filing: Optional callable taking one filing; skipped when falsy.

    This function does not return; only exceptions that are not ``Exception``
    subclasses (e.g. ``KeyboardInterrupt``) propagate out of the poll step.
    """
    logger.info("EDGAR poller started")
    while True:
        logger.info("Polling EDGAR for new Form 4 filings...")
        try:
            new = fetch_and_store_new_filings()
        except Exception:
            # Previously any error here (e.g. from the DB helpers) ended the
            # loop for good.
            logger.exception("EDGAR poll cycle failed; retrying after the poll interval")
        else:
            logger.info(f"Found {len(new)} new filings")
            if on_new_filing:
                for filing in new:
                    try:
                        on_new_filing(filing)
                    except Exception as e:
                        logger.error(f"Error processing filing {filing.get('accession_number')}: {e}")
        time.sleep(config.EDGAR_POLL_INTERVAL)
|