# smaug/backtest/simulate.py

"""
Portfolio simulator for the insider-copytrade strategy.

Usage:
    python backtest/simulate.py [options]

Strategy params:
    --holding-days      Calendar days to hold each position (default: 7)
    --buy-delay         Days after signal trigger to enter (default: 1)
    --position-size     Fraction of available cash per trade (default: 0.10)
    --min-score         Minimum signal score filter (default: 0.0)
    --min-cluster       Minimum cluster size filter (default: 1)
    --cap-tier          Market cap tier filter: large | mid | small | micro (default: no filter)
    --capital           Initial capital in USD (default: 100000)

Transaction cost params:
    --spread            One-way bid-ask half-spread, e.g. 0.003 = 0.3% (default: 0.003)
    --slippage          Entry slippage / market impact (default: 0.002)
    --commission        Per-trade commission as a fraction of notional (default: 0.001)

Round-trip cost = spread*2 + slippage + commission*2
(entry pays spread + slippage + commission; exit pays spread + commission)
"""

import argparse
import logging
import math
import os
import sys
from collections import defaultdict
from datetime import datetime, timedelta

# Allow running as script from repo root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import config
from db.db import get_signals_for_backtest, get_cached_market_caps, upsert_market_caps

CAP_TIERS = {
    "large": (10_000_000_000, None),
    "mid":   (2_000_000_000, 10_000_000_000),
    "small": (300_000_000,   2_000_000_000),
    "micro": (0,             300_000_000),
}


def _fetch_market_caps(tickers: list[str]) -> dict[str, float]:
    """
    Resolve market caps for ``tickers``: DB cache first, yfinance for the rest.

    Tickers the cache layer reports as already attempted, and tickers that
    contain a "/", are never sent to yfinance. Every ticker that is looked up
    is written back through ``upsert_market_caps`` (with None when no cap was
    obtained) so it is not queried again on the next run.

    Returns the cached mapping extended with the newly found caps; tickers
    without a known cap are absent from the result.
    """
    import yfinance as yf
    from concurrent.futures import ThreadPoolExecutor, as_completed

    cached, already_fetched = get_cached_market_caps(tickers)
    to_fetch = [
        symbol for symbol in tickers
        if symbol not in already_fetched and "/" not in symbol
    ]

    # Nothing new to look up: the cache is the answer.
    if not to_fetch:
        return cached

    logger.info(f"Fetching market caps for {len(to_fetch)} tickers via yfinance (parallel)...")

    def _lookup(symbol):
        # Best-effort: any failure is reported as "no cap" for this symbol.
        try:
            raw_cap = getattr(yf.Ticker(symbol).fast_info, "market_cap", None)
            return symbol, float(raw_cap) if raw_cap else None
        except Exception:
            return symbol, None

    hits = {}
    total = len(to_fetch)
    with ThreadPoolExecutor(max_workers=20) as pool:
        pending = [pool.submit(_lookup, symbol) for symbol in to_fetch]
        for completed, future in enumerate(as_completed(pending), start=1):
            symbol, cap = future.result()
            if cap:
                hits[symbol] = cap
            # Progress line every 50 completed lookups.
            if completed % 50 == 0:
                print(f"  market caps: {completed}/{total} fetched, {len(hits)} hits", flush=True)

    # Persist misses (None) as well as hits so they are not re-queried.
    results = {symbol: hits.get(symbol) for symbol in to_fetch}
    upsert_market_caps(results)
    cached.update({symbol: cap for symbol, cap in results.items() if cap is not None})

    return cached

# Module-level logger, named after this module; used by the helpers in this file.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Price loading
# ---------------------------------------------------------------------------

def _load_all_prices() -> dict[str, tuple[list, list]]:
    """
    Load the whole ``price_cache`` table from the SQLite DB into memory.

    Returns:
        ``{ticker: (dates, closes)}`` where both lists are parallel and
        ordered by date (the query sorts by ticker, then date), so callers
        can bisect ``dates`` and index ``closes`` with the same position.
    """
    from sqlalchemy import create_engine, text

    engine = create_engine(
        f"sqlite:///{config.DB_PATH}",
        connect_args={"check_same_thread": False},
    )
    try:
        with engine.connect() as conn:
            rows = conn.execute(text(
                "SELECT ticker, date, close FROM price_cache ORDER BY ticker, date"
            )).fetchall()
    finally:
        # The engine is private to this call; release its connection pool so
        # repeated calls (e.g. parameter sweeps) do not accumulate open handles.
        engine.dispose()

    # Group rows per ticker; row order (and therefore date order) is preserved.
    raw: dict[str, list] = defaultdict(list)
    for ticker, date, close in rows:
        raw[ticker].append((date, close))

    prices = {
        ticker: ([d for d, _ in pairs], [c for _, c in pairs])
        for ticker, pairs in raw.items()
    }
    logger.info(f"Loaded prices for {len(prices)} tickers ({sum(len(v[0]) for v in prices.values())} rows)")
    return prices


def _closest_price_on_or_after(prices: tuple, date_str: str) -> float | None:
    import bisect
    dates, closes = prices
    i = bisect.bisect_left(dates, date_str)
    return closes[i] if i < len(closes) else None


def _closest_price_on_or_before(prices: tuple, date_str: str) -> float | None:
    import bisect
    dates, closes = prices
    i = bisect.bisect_right(dates, date_str) - 1
    return closes[i] if i >= 0 else None


# ---------------------------------------------------------------------------
# Core simulation
# ---------------------------------------------------------------------------

class Strategy:
    """
    Parameter bundle for one simulation run.

    Holds the trade-timing, sizing and signal-filter settings together with
    the transaction-cost assumptions, and derives the per-side and round-trip
    cost fractions from them.
    """

    def __init__(
        self,
        holding_days: int = 7,
        buy_delay: int = 1,
        position_size: float = 0.10,
        min_score: float = 0.0,
        min_cluster: int = 1,
        capital: float = 100_000.0,
        spread: float = 0.003,
        slippage: float = 0.002,
        commission: float = 0.001,
        cap_tier: str = None,
    ):
        # Timing and sizing
        self.holding_days = holding_days
        self.buy_delay = buy_delay
        self.position_size = position_size
        self.capital = capital

        # Signal filters
        self.min_score = min_score
        self.min_cluster = min_cluster
        self.cap_tier = cap_tier  # "large" | "mid" | "small" | "micro" | None

        # Transaction costs, each a fraction of notional
        self.spread = spread
        self.slippage = slippage
        self.commission = commission

    @property
    def entry_cost(self) -> float:
        """Cost fraction paid when entering: half-spread + slippage + commission."""
        return self.spread + self.slippage + self.commission

    @property
    def exit_cost(self) -> float:
        """Cost fraction paid when exiting: half-spread + commission."""
        return self.spread + self.commission

    @property
    def roundtrip_cost(self) -> float:
        """Sum of the entry and exit cost fractions."""
        return self.entry_cost + self.exit_cost


def simulate(strategy: "Strategy", prices: dict = None,
             _signals: list = None, _market_caps: dict = None) -> dict:
    """
    Run the portfolio simulation for one strategy configuration.

    Every signal becomes one trade: entry ``buy_delay`` calendar days after
    the trigger date, exit ``holding_days`` calendar days after entry. Each
    trade is sized at ``position_size`` of the cash available at entry.
    Positions are only inspected on dates where at least one entry is
    scheduled; anything still open after the last such date is closed at the
    end of the run.

    Args:
        strategy: Strategy parameters and cost assumptions.
        prices: ``{ticker: (dates, closes)}``; loaded from the DB when None.
        _signals: Pre-fetched signals (mappings with at least ``ticker`` and
            ``trigger_date``); queried from the DB when None.
        _market_caps: ``{ticker: cap}`` for the cap-tier filter; fetched when
            None and a tier is requested.

    Returns:
        A dict with ``strategy``, ``period``, ``performance``, ``trades`` and
        ``equity_curve`` entries, or ``{"error": ...}`` when no signal
        survives filtering.

    Raises:
        ValueError: if ``strategy.cap_tier`` is not a key of ``CAP_TIERS``.
    """
    if _signals is None:
        _signals = get_signals_for_backtest(strategy.min_score, strategy.min_cluster)

    # Keep only signals whose trigger date is a real calendar date from 2020
    # onwards. Parsing here (not just reading the year) guarantees the later
    # strptime on the same string cannot fail.
    signals = []
    for s in _signals:
        try:
            date_str = s["trigger_date"][:10]
            trigger_year = datetime.strptime(date_str, "%Y-%m-%d").year
            if trigger_year >= 2020:
                signals.append(dict(s, trigger_date=date_str))
        except (KeyError, IndexError, TypeError, ValueError):
            # Missing, non-string or unparseable date: drop the signal.
            continue

    if not signals:
        return {"error": "No signals after filtering"}

    if strategy.cap_tier:
        tier = CAP_TIERS.get(strategy.cap_tier)
        if tier is None:
            raise ValueError(f"Unknown cap_tier {strategy.cap_tier!r}. Use: {list(CAP_TIERS)}")
        cap_min, cap_max = tier
        if _market_caps is None:
            tickers = list({s["ticker"] for s in signals})
            _market_caps = _fetch_market_caps(tickers)

        def _in_tier(ticker) -> bool:
            # A ticker with no known cap belongs to no tier; defaulting it to
            # 0 would silently file every failed lookup under "micro".
            cap = _market_caps.get(ticker)
            if cap is None:
                return False
            return cap >= cap_min and (cap_max is None or cap < cap_max)

        signals = [s for s in signals if _in_tier(s["ticker"])]
        logger.info(f"Cap tier '{strategy.cap_tier}': {len(signals)} signals after filtering")
        if not signals:
            return {"error": f"No signals after cap_tier={strategy.cap_tier} filter"}

    if prices is None:
        prices = _load_all_prices()

    # Build trade list: {entry_date_str: [(ticker, exit_date_str, signal)]}
    trades_by_entry: dict[str, list] = defaultdict(list)
    for sig in signals:
        trigger_dt = datetime.strptime(sig["trigger_date"], "%Y-%m-%d")
        entry_dt = trigger_dt + timedelta(days=strategy.buy_delay)
        exit_dt = entry_dt + timedelta(days=strategy.holding_days)
        entry_str = entry_dt.strftime("%Y-%m-%d")
        exit_str = exit_dt.strftime("%Y-%m-%d")
        trades_by_entry[entry_str].append((sig["ticker"], exit_str, sig))

    # The simulation clock only ticks on dates that have at least one entry.
    all_dates = sorted(trades_by_entry)

    # State
    cash = strategy.capital
    # open positions: (exit_date_str, entry_date_str, ticker, cost_basis, notional_invested)
    open_positions: list[tuple[str, str, str, float, float]] = []

    equity_curve: list[tuple[str, float]] = []  # (date, portfolio_value)
    trade_log: list[dict] = []
    trades_executed = 0
    trades_skipped_no_price = 0

    spy_prices = prices.get("SPY", ([], []))

    for date_str in all_dates:
        # 1. Close any positions whose exit_date <= today
        still_open = []
        for pos in open_positions:
            exit_dt_str, entry_dt_str, ticker, cost_basis, notional = pos
            if exit_dt_str > date_str:
                still_open.append(pos)
                continue

            px = prices.get(ticker, ([], []))
            exit_price = _closest_price_on_or_before(px, exit_dt_str)
            if exit_price is None:
                exit_price = _closest_price_on_or_before(px, date_str)
            if exit_price is None:
                # can't find exit price — recover notional (no gain/loss)
                cash += notional
                trade_log.append({
                    "ticker": ticker,
                    "entry_date": entry_dt_str,
                    "exit_date": exit_dt_str,
                    "gross_return": 0.0,
                    "net_return": 0.0,
                    "pnl": 0.0,
                    "note": "no_exit_price",
                })
                continue

            gross_return = (exit_price - cost_basis) / cost_basis
            net_return = gross_return - strategy.exit_cost
            exit_proceeds = notional * (1 + net_return)
            cash += exit_proceeds

            trade_log.append({
                "ticker": ticker,
                "entry_date": entry_dt_str,
                "exit_date": exit_dt_str,
                "gross_return": round(gross_return, 5),
                "net_return": round(net_return, 5),
                "pnl": round(exit_proceeds - notional, 2),
                "notional": round(notional, 2),
            })
        open_positions = still_open

        # 2. Open new positions for today's signals
        for ticker, exit_date_str, sig in trades_by_entry[date_str]:
            px = prices.get(ticker, ([], []))
            entry_price = _closest_price_on_or_after(px, date_str)
            # A missing or non-positive price cannot serve as a cost basis
            # (it would divide by zero when the return is computed).
            if entry_price is None or entry_price <= 0:
                trades_skipped_no_price += 1
                continue

            notional = cash * strategy.position_size
            if notional < 1.0:
                continue

            # Entry costs are folded into the cost basis (effective entry price is higher)
            effective_entry = entry_price * (1 + strategy.entry_cost)
            cash -= notional
            open_positions.append((exit_date_str, date_str, ticker, effective_entry, notional))
            trades_executed += 1

        # Track equity (cash + open positions carried at invested notional — conservative)
        open_value = sum(pos[4] for pos in open_positions)
        equity_curve.append((date_str, cash + open_value))

    # Close all remaining open positions at the last price on or before their
    # exit date (falling back to the cost basis). They are logged like every
    # other trade so win rate, average return and Sharpe cover all executed
    # trades, matching the trades_executed count used for annualisation.
    for exit_dt_str, entry_dt_str, ticker, cost_basis, notional in open_positions:
        px = prices.get(ticker, ([], []))
        exit_price = _closest_price_on_or_before(px, exit_dt_str) or cost_basis
        gross_return = (exit_price - cost_basis) / cost_basis
        net_return = gross_return - strategy.exit_cost
        exit_proceeds = notional * (1 + net_return)
        cash += exit_proceeds
        trade_log.append({
            "ticker": ticker,
            "entry_date": entry_dt_str,
            "exit_date": exit_dt_str,
            "gross_return": round(gross_return, 5),
            "net_return": round(net_return, 5),
            "pnl": round(exit_proceeds - notional, 2),
            "notional": round(notional, 2),
        })

    final_value = cash

    # SPY benchmark
    if equity_curve and spy_prices[0]:
        start_str = equity_curve[0][0]
        end_str = equity_curve[-1][0]
        spy_start = _closest_price_on_or_after(spy_prices, start_str)
        spy_end = _closest_price_on_or_before(spy_prices, end_str)
        spy_total = (spy_end - spy_start) / spy_start if (spy_start and spy_end) else 0.0
    else:
        spy_total = 0.0

    # Annualized metrics
    if equity_curve:
        start_dt = datetime.strptime(equity_curve[0][0], "%Y-%m-%d")
        end_dt = datetime.strptime(equity_curve[-1][0], "%Y-%m-%d")
        years = max((end_dt - start_dt).days / 365.25, 0.001)
    else:
        years = 1.0

    def _annualize(total: float) -> float:
        # A non-positive growth factor has no real fractional power (Python
        # would return a complex number); report it as a total loss instead.
        growth = 1 + total
        if growth <= 0:
            return -1.0
        try:
            return growth ** (1 / years) - 1
        except OverflowError:
            # Very short periods give a huge exponent; saturate instead of crashing.
            return float("inf")

    total_return = (final_value - strategy.capital) / strategy.capital
    ann_return = _annualize(total_return)
    spy_ann = _annualize(spy_total)

    # Max drawdown from equity curve
    peak = strategy.capital
    max_dd = 0.0
    for _, val in equity_curve:
        if val > peak:
            peak = val
        dd = (peak - val) / peak
        if dd > max_dd:
            max_dd = dd

    # Per-trade Sharpe from trade log
    net_returns = [t["net_return"] for t in trade_log if "net_return" in t]
    if net_returns:
        avg_r = sum(net_returns) / len(net_returns)
        std_r = math.sqrt(sum((r - avg_r) ** 2 for r in net_returns) / len(net_returns))
        trades_per_year = trades_executed / years
        sharpe = (avg_r / std_r * math.sqrt(trades_per_year)) if std_r > 0 else 0.0
        win_rate = sum(1 for r in net_returns if r > 0) / len(net_returns)
        avg_net_return = avg_r
    else:
        sharpe = win_rate = avg_net_return = 0.0

    return {
        "strategy": {
            "holding_days": strategy.holding_days,
            "buy_delay": strategy.buy_delay,
            "position_size": strategy.position_size,
            "min_score": strategy.min_score,
            "min_cluster": strategy.min_cluster,
            "roundtrip_cost_pct": round(strategy.roundtrip_cost * 100, 3),
            "cap_tier": strategy.cap_tier or "all",
        },
        "period": {
            "start": equity_curve[0][0] if equity_curve else "n/a",
            "end": equity_curve[-1][0] if equity_curve else "n/a",
            "years": round(years, 2),
        },
        "performance": {
            "initial_capital": strategy.capital,
            "final_value": round(final_value, 2),
            "total_return_pct": round(total_return * 100, 2),
            "annualized_return_pct": round(ann_return * 100, 2),
            "spy_annualized_pct": round(spy_ann * 100, 2),
            "excess_return_pct": round((ann_return - spy_ann) * 100, 2),
            "max_drawdown_pct": round(max_dd * 100, 2),
            "sharpe": round(sharpe, 3),
        },
        "trades": {
            "signals_total": len(signals),
            "executed": trades_executed,
            "skipped_no_price": trades_skipped_no_price,
            "win_rate_pct": round(win_rate * 100, 2),
            "avg_net_return_pct": round(avg_net_return * 100, 3),
        },
        "equity_curve": equity_curve,
    }


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def _print_results(r: dict):
    if "error" in r:
        print(f"Error: {r['error']}")
        return

    s = r["strategy"]
    p = r["performance"]
    t = r["trades"]
    period = r["period"]

    w = 48
    print(f"\n{'=' * w}")
    print(f"  Portfolio Simulation Results")
    print(f"{'=' * w}")
    print(f"  Strategy")
    print(f"    Hold: {s['holding_days']}d  |  Delay: {s['buy_delay']}d  |  Size: {s['position_size']*100:.0f}% of cash")
    print(f"    Score ≥ {s['min_score']}  |  Cluster ≥ {s['min_cluster']}  |  Cap: {s['cap_tier']}")
    print(f"    Round-trip cost: {s['roundtrip_cost_pct']:.2f}%")
    print(f"  Period: {period['start']} → {period['end']}  ({period['years']}y)")
    print(f"{'─' * w}")
    print(f"  Capital:    ${p['initial_capital']:>12,.0f}  →  ${p['final_value']:>12,.2f}")
    print(f"  Total ret:  {p['total_return_pct']:>+8.1f}%")
    print(f"  Ann. ret:   {p['annualized_return_pct']:>+8.1f}%  (SPY: {p['spy_annualized_pct']:+.1f}%)")
    print(f"  Excess:     {p['excess_return_pct']:>+8.1f}%")
    print(f"  Max DD:     {p['max_drawdown_pct']:>8.1f}%")
    print(f"  Sharpe:     {p['sharpe']:>8.3f}")
    print(f"{'─' * w}")
    print(f"  Trades executed:  {t['executed']:>6}  /  {t['signals_total']} signals")
    print(f"  Skipped (no px):  {t['skipped_no_price']:>6}")
    print(f"  Win rate:         {t['win_rate_pct']:>5.1f}%")
    print(f"  Avg net return:   {t['avg_net_return_pct']:>+6.3f}%  per trade")
    print(f"{'=' * w}\n")


def main():
    """
    CLI entry point: parse options, run one simulation and print the report.

    Configures INFO-level logging, builds a ``Strategy`` from the command-line
    options, initialises the DB, then calls ``simulate`` and ``_print_results``.
    """
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Simulate insider-copytrade portfolio with realistic costs",
    )

    # Strategy options
    cli.add_argument("--holding-days", type=int, default=7)
    cli.add_argument("--buy-delay", type=int, default=1)
    cli.add_argument(
        "--position-size", type=float, default=0.10,
        help="Fraction of available cash per trade (0.10 = 10%%)",
    )
    cli.add_argument("--min-score", type=float, default=0.0)
    cli.add_argument("--min-cluster", type=int, default=1)
    cli.add_argument(
        "--cap-tier", choices=["large", "mid", "small", "micro"],
        default=None, help="Filter by market cap tier",
    )
    cli.add_argument("--capital", type=float, default=100_000.0)

    # Transaction-cost options
    cli.add_argument(
        "--spread", type=float, default=0.003,
        help="Half bid-ask spread paid on entry AND exit (0.003 = 0.3%%)",
    )
    cli.add_argument(
        "--slippage", type=float, default=0.002,
        help="Entry slippage / market impact (0.002 = 0.2%%)",
    )
    cli.add_argument(
        "--commission", type=float, default=0.001,
        help="Per-trade commission as fraction of notional",
    )

    # When invoked via `python main.py simulate ...`, argv[1] is 'simulate' -- skip it
    cli_args = sys.argv[1:]
    if cli_args[:1] == ["simulate"]:
        cli_args = cli_args[1:]
    args = cli.parse_args(cli_args)

    from db.db import init_db
    init_db()

    # Every Strategy parameter has a same-named parsed option.
    strategy_fields = (
        "holding_days", "buy_delay", "position_size", "min_score",
        "min_cluster", "capital", "spread", "slippage", "commission",
        "cap_tier",
    )
    strategy = Strategy(**{name: getattr(args, name) for name in strategy_fields})

    _print_results(simulate(strategy))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()