%load_ext autoreload
%autoreload 2

import gc
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf
from config.paths import DATA_INTER, DATA_PROC
from matplotlib import gridspec
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

import volatility_trading.rv_forecasting.features as rvfeat
import volatility_trading.rv_forecasting.plotting as ph
import volatility_trading.rv_forecasting.vol_estimators as rvvol
from volatility_trading.iv_surface.ssvi_model import SSVI
from volatility_trading.rv_forecasting.data_loading import load_intraday_prices
from volatility_trading.rv_forecasting.modelling import (
    DataProcessor,
    PurgedKFold,
    WalkForwardOOS,
    compute_metrics,
    compute_subperiod_metrics,
    eval_ensembles,
    eval_model_cv,
    in_sample_stability,
    oos_perm_importance,
    single_feature_importance,
)

# Seed both global RNGs (stdlib and NumPy) so random draws further down the
# notebook are repeatable from a fresh kernel.
random.seed(42)
np.random.seed(42)

%matplotlib inline
# Notebook-wide matplotlib style.
plt.style.use('seaborn-v0_8-darkgrid')

# Disable pandas' chained-assignment warning: columns are added to `spx`
# after it has been filtered with dropna() further down.
pd.options.mode.chained_assignment = None

# Research sample window; both dates are meant to be included.
start = "2010-01-01"
end = "2020-12-31"

# yfinance treats `end` as exclusive, so ask for one extra calendar day.
# Passing `end` unchanged silently drops the 2020-12-31 session.
download_end = (pd.Timestamp(end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")

spx = yf.download("^GSPC", start=start, end=download_end, auto_adjust=True)
# Drop the "Ticker" column level so the columns are plain field names.
spx.columns = spx.columns.droplevel("Ticker")
spx.columns.name = None
spx

[*********************100%***********************]  1 of 1 completed

# Daily close-to-close log returns. Rows containing any missing value are
# removed afterwards (at minimum the first row, which has no previous close).
spx = spx.assign(
    returns=lambda df: np.log(df["Close"] / df["Close"].shift(1))
).dropna()
spx

# Quick look at the return series; the title is set on the axes pandas drew on.
ax = spx["returns"].plot(figsize=(12, 6))
ax.set_title("Log Returns")

Text(0.5, 1.0, 'Log Returns')

# Every estimator is called with h=21, stored as an `rv_*` column on `spx`,
# and drawn in its own figure. Each entry is:
# (output column, estimator function, input columns in call order).
ohlc_cols = ("Open", "High", "Low", "Close")
rv_estimators = (
    ("rv_close", rvvol.rv_close_to_close, ("returns",)),
    ("rv_parkinson", rvvol.rv_parkinson, ("High", "Low")),
    ("rv_gk", rvvol.rv_garman_klass, ohlc_cols),
    ("rv_rs", rvvol.rv_rogers_satchell, ohlc_cols),
    ("rv_yz", rvvol.rv_yang_zhang, ohlc_cols),
)

for rv_col, rv_func, input_cols in rv_estimators:
    spx[rv_col] = rv_func(*(spx[c] for c in input_cols), h=21)
    spx[rv_col].plot(figsize=(12, 6))
    plt.show()

# 21-day rolling standard deviation of the overnight (previous close -> open)
# and intraday (open -> close) log returns. The stored columns stay on a
# daily scale.
spx["overnight_volatility"] = np.log(spx["Open"] / spx["Close"].shift(1)).rolling(21).std()
spx["intraday_volatility"] = np.log(spx["Close"] / spx["Open"]).rolling(21).std()

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

# --- top panel: volatility estimators ---
spx.loc[:, spx.columns.str.startswith("rv")].plot(ax=ax1)
ax1.set_title("Volatility Estimators")
ax1.set_ylabel("Annualized Volatility")
ax1.grid(alpha=0.3)
ax1.legend()

# --- bottom panel: overnight vs intraday volatility ---
# The columns above are daily standard deviations, so scale by sqrt(252) for
# plotting; otherwise the "Annualized Volatility" axis label is wrong.
annualization = np.sqrt(252)
(spx[["overnight_volatility", "intraday_volatility"]] * annualization).plot(ax=ax2)
ax2.set_title("21D Volatility: Overnight vs Intraday")
ax2.set_ylabel("Annualized Volatility")
ax2.grid(alpha=0.3)
ax2.legend()

plt.tight_layout()
plt.show()

# Intraday ES bars for the research window.
es_5min = load_intraday_prices(DATA_INTER / "es-5m.csv", start=start, end=end)
es_5min

# Draw four distinct calendar days without replacement (uses the globally
# seeded NumPy RNG) and show each day's close path in a 2x2 grid.
all_days = es_5min.index.normalize().unique().date
days = np.random.choice(all_days, size=4, replace=False)

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for ax, day in zip(axes.flat, days):
    session_close = es_5min.loc[str(day), "close"]
    session_close.plot(ax=ax)
    ax.set(title=day.strftime("%Y-%m-%d"), xlabel="Time", ylabel="Close")

plt.tight_layout()
plt.show()

# Per-day realized variance from the intraday closes (presumably the sum of
# squared 5-minute returns -- confirm in rvvol.rv_intraday). The 21-day mean
# is scaled by 252 and square-rooted into a volatility.
daily_rv = rvvol.rv_intraday(es_5min["close"])
mean_var_21 = daily_rv.rolling(21).mean()
es_rv_21 = np.sqrt(mean_var_21 * 252)

fig, ax = plt.subplots(figsize=(12, 6))

# Thick line for the intraday-based measure, thin lines for the SPX estimators.
rv_cols = spx.columns[spx.columns.str.startswith("rv")]
es_rv_21.plot(ax=ax, label="ES 5-min RV", lw=2)
spx.loc[:, rv_cols].plot(ax=ax, lw=1, alpha=0.8)

ax.set(title="ES vs SPX Realized Volatility", ylabel="Annualized Volatility")
ax.grid(alpha=0.3)
ax.legend()

plt.show()

# Squared daily close-to-close log returns, used as a one-observation-per-day
# variance proxy to compare against the intraday realized variance.
daily_c2c = spx["returns"].pow(2)

plt.figure(figsize=(12, 6))
# Labels now match the series: `daily_rv` is the intraday realized variance
# and `daily_c2c` is the close-to-close proxy (they were swapped before).
daily_rv.loc["2018"].plot(alpha=1, label="Daily Realized Variance")
daily_c2c.loc["2018"].plot(alpha=0.4, label="Daily Close-To-Close")
plt.legend()
plt.show()

# Autocorrelation structure of both variance proxies.
ph.plot_acf_pacf(daily_c2c, lags=40, title="Daily Squared returns")
ph.plot_acf_pacf(daily_rv, lags=40, title="Daily Realized Variance")

from statsmodels.tsa.ar_model import AutoReg

# Fit an AR(1) model to the daily realized variance to quantify persistence.
# `old_names` is deprecated in statsmodels and already defaults to False, so
# it is no longer passed explicitly.
model = AutoReg(daily_rv.to_numpy(), lags=1)
res = model.fit()
print(res.summary())

                            AutoReg Model Results                             
==============================================================================
Dep. Variable:                      y   No. Observations:                 2835
Model:                     AutoReg(1)   Log Likelihood               19576.101
Method:               Conditional MLE   S.D. of innovations              0.000
Date:                Sat, 22 Nov 2025   AIC                         -39146.202
Time:                        15:48:56   BIC                         -39128.354
Sample:                             1   HQIC                        -39139.764
                                 2835                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.869e-05   4.81e-06      8.045      0.000    2.93e-05    4.81e-05
y.L1           0.6461      0.014     45.068      0.000       0.618       0.674
                                    Roots                                    
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            1.5477           +0.0000j            1.5477            0.0000
-----------------------------------------------------------------------------

# Distribution of daily realized variance under a log transform.
ph.plot_hist_transform(daily_rv, use_log=True, figsize=(10,4))

# Two stacked panels (3:1 height ratio) sharing the date axis.
fig = plt.figure(figsize=(12,6))
gs = gridspec.GridSpec(2, 1, height_ratios=[3, 1])

# Upper panel: SPX closing price.
ax1 = fig.add_subplot(gs[0])
ax1.plot(spx.index, spx["Close"], color="steelblue")
ax1.set(title="SPX Closing Price", ylabel="Price")

# Lower panel: daily realized variance.
ax2 = fig.add_subplot(gs[1], sharex=ax1)
ax2.plot(daily_rv.index, daily_rv, color="darkorange", linestyle="--")
ax2.set(title="Daily Realized Variance", ylabel="Variance", xlabel="Date")

plt.tight_layout()
plt.show()

# Forecast horizon in trading days, and the forward-looking target built on it.
# The target is exponentiated for the left panel, so it is presumably on a
# log scale -- confirm in rvfeat.create_forward_target.
H = 21
y = rvfeat.create_forward_target(daily_rv, horizon=H)

fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# (series, legend label, colour, title) for the level and log histograms.
hist_panels = (
    (np.exp(y), "RV", "steelblue", "Distribution of 21-D Realized Variance"),
    (y, "Log(RV)", "darkorange", "Distribution of Logarithm of 21-D Realized Variance"),
)
for ax, (series, hist_label, hist_color, hist_title) in zip(axes, hist_panels):
    series.hist(bins=30, ax=ax, label=hist_label, color=hist_color, alpha=0.7)
    ax.set_title(hist_title)
    ax.legend()

plt.tight_layout()
plt.show()

# Descriptive statistics reported for every feature block below.
summary_stats = ["count", "mean", "std", "min", "max", "skew", "kurtosis"]

# HAR components (columns RV_D / RV_W / RV_M) built from the daily series.
X_har = rvfeat.create_har_lags(daily_rv)
X_har.agg(summary_stats)

# All three components on one log-scaled axis; the daily one is faded.
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_yscale("log")
for har_col, har_alpha in (("RV_D", 0.4), ("RV_W", 1), ("RV_M", 1)):
    X_har[har_col].plot(ax=ax, alpha=har_alpha, label=f"Log ({har_col})")
plt.legend()
plt.title("HAR-RV Log components")
plt.show()

ph.plot_feature_histograms(X_har, figsize=(10, 3))

ph.plot_features_vs_target(
    X_har,
    y,
    log_features=list(X_har.columns),
    figsize=(10, 3),
    nrows=1,
    ncols=3,
)

# Option chain with two derived columns:
#   T = dte / 252
#   k = log(strike / underlying_last)
# NOTE(review): dividing by 252 treats `dte` as trading days -- confirm this
# matches the convention used when the SSVI parameters were calibrated.
spx_options = pd.read_parquet(DATA_INTER / "full_spx_options_2010_2020.parquet")
strike_ratio = spx_options["strike"] / spx_options["underlying_last"]
spx_options["T"] = spx_options["dte"] / 252
spx_options["k"] = np.log(strike_ratio)
spx_options.head()

# Pre-computed SSVI parameter tables, combined into the dict the model expects.
ssvi = SSVI()
df_g = pd.read_parquet(DATA_PROC / "ssvi_globals_2010_2020.parquet")
df_k = pd.read_parquet(DATA_PROC / "ssvi_knots_2010_2020.parquet")
params_ssvi = ssvi.build_params_dict(df_g, df_k)

# Surface-based predictors, aligned to the target dates and forward-filled.
X_iv_surface = (
    rvfeat.create_iv_surface_predictors(spx_options, ssvi, params=params_ssvi)
    .reindex(y.index)
    .ffill()
)

X_iv_surface.agg(summary_stats)

# Blank out two date ranges and fill them by linear interpolation.
# NOTE(review): presumably these dates had bad surface fits -- confirm.
# NOTE(review): linear interpolation uses the next valid observation, i.e.
# information from after the blanked dates; a forward fill would avoid that.
for bad_span in (slice("2019-05-02", "2019-05-08"), "2019-05-17"):
    X_iv_surface.loc[bad_span, :] = np.nan
X_iv_surface = X_iv_surface.interpolate(method="linear")

X_iv_surface.agg(summary_stats)

colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]  # blue, orange, green
fig, axes = plt.subplots(3, 1, figsize=(10, 5), sharex=True)

# One stacked panel per surface feature, each with its own colour.
for ax, col, c in zip(axes, X_iv_surface.columns, colors):
    X_iv_surface[col].plot(ax=ax, lw=1.5, color=c)
    ax.set_title(col, fontsize=11)
    ax.grid(alpha=0.3)
    ax.set(xlabel="Date", ylabel="IV")

plt.tight_layout()
plt.show()

ph.plot_feature_histograms(X_iv_surface, figsize=(10, 3))

ph.plot_features_vs_target(
    X_iv_surface,
    y,
    log_features=["atm_iv_30d", "iv_skew"],
    figsize=(10, 3),
    nrows=1,
    ncols=3,
)

# Return-based predictors, aligned to the target dates and forward-filled.
X_returns = (
    rvfeat.create_return_predictors(spx["returns"], es_5min, h=H)
    .reindex(y.index)
    .ffill()
)
X_returns.agg(summary_stats)

ph.plot_feature_histograms(X_returns, figsize=(10, 4))

# Transform applied to each feature in the scatter plots against the target.
log_features_ret = ["abs_r", "r2", "down_var", "up_var"]
sqrt_features_ret = ["neg_r2"]
ph.plot_features_vs_target(
    X_returns,
    y,
    log_features=log_features_ret,
    sqrt_features=sqrt_features_ret,
    figsize=(12, 5),
    nrows=2,
    ncols=4,
)

# Macro block, aligned to the target dates and forward-filled.
X_macro = rvfeat.create_macro_features(start=start, end=end).reindex(y.index).ffill()
X_macro.agg(summary_stats)

ph.plot_macro_block(X_macro)

ph.plot_feature_histograms(X_macro, figsize=(8, 4), nrows=2, ncols=3)

ph.plot_features_vs_target(
    X_macro,
    y,
    log_features=["HY_OAS", "IG_OAS"],
    figsize=(10, 5),
    nrows=2,
    ncols=3,
)

# The 3-month yield is removed from the macro block after inspection.
X_macro = X_macro.drop(columns="DGS3MO")

# Market block, with the same alignment as above.
X_market = rvfeat.create_market_features(start=start, end=end).reindex(y.index).ffill()

ph.plot_feature_histograms(X_market, figsize=(8, 3))

ph.plot_features_vs_target(
    X_market,
    y,
    log_features=["VIX", "VVIX"],
    figsize=(10, 3),
    nrows=1,
    ncols=3,
)

# Core blocks side by side, plus the features engineered from them; any row
# with a missing value in the combined matrix is discarded.
X_core = pd.concat([X_har, X_iv_surface, X_macro, X_returns, X_market], axis=1)
X_eng = rvfeat.feature_engineering(X_core)
X = pd.concat([X_core, X_eng], axis=1).dropna(axis=0)

# Column names of each group, kept for later selection.
core_features = list(X_core.columns)
eng_features = list(X_eng.columns)

X_eng.agg(summary_stats)

ph.plot_feature_histograms(X_eng, figsize=(10, 5))

# Engineered features that are NOT log-transformed in the scatter plots.
raw_scale_eng = {"dVIX_5d", "dSkew_5d", "vvix_over_vix"}

# Filter in column order rather than through a set difference. Set iteration
# order for strings changes between interpreter sessions, so the old form gave
# a different feature order on each kernel restart.
log_features_eng = [feat for feat in eng_features if feat not in raw_scale_eng]
ph.plot_features_vs_target(X_eng, y, log_features=log_features_eng, figsize=(10, 6), nrows=3, ncols=4)

# The per-block frames are no longer needed once `X` exists; free them.
del X_har, X_iv_surface, X_macro, X_returns, X_market, X_core, X_eng
gc.collect()

# Visual check of the candidate transform for three differently shaped features.
ph.plot_hist_transform(X["iv_skew"], use_log=True)
ph.plot_hist_transform(X["iv_ts"], winsorize=(0.005, 0.995))
ph.plot_hist_transform(X["neg_r2"], use_sqrt=True)

# Features passed to DataProcessor as `log_features`.
log_features = [
    # realized-variance levels
    "RV_D", "RV_W", "RV_M", "RV_D_ewma",
    # return magnitudes
    "abs_r", "r2", "down_var", "up_var",
    # implied-volatility indices and their smoothers
    "VIX", "VVIX", "VIX_rm5", "VIX_rm21", "VIX_ewma",
    # surface-derived levels
    "atm_iv_30d", "iv_skew", "iv_minus_realized",
    # interaction / dispersion terms
    "VIX_time_HY_OAS", "RV_D_rollvol5", "RV_D_rollvol21",
]

# Each entry is ([lower quantile, upper quantile], [feature names]).
winsor_sqrt_features = [
    ([0.0, 0.995], ["neg_r2"]),
]

winsor_features = [
    ([0.01, 1], ["vix_ts"]),
    ([0.005, 0.995], ["iv_ts", "overnight_ret"]),
    ([0.001, 0.995], ["dVIX_5d", "dSkew_5d"]),
]

dp = DataProcessor(
    log_features=log_features,
    winsor_sqrt_features=winsor_sqrt_features,
    winsor_features=winsor_features,
    scale=True,
)

# NOTE(review): the processor is fitted on the full sample here; this frame is
# used for the exploratory screening below.
X_proc = pd.DataFrame(
    dp.fit_transform(X),
    index=X.index,
    columns=dp.get_feature_names_out(),
)

# Spot-check a reproducible random subset of the processed columns.
rng = np.random.default_rng(42)
cols_sample = rng.choice(X_proc.columns, size=8, replace=False)
X_proc[cols_sample[:4]].agg(summary_stats)

ph.plot_feature_histograms(X_proc[cols_sample], figsize=(10, 4))

# Rebuild the target with the notebook-wide horizon `H` instead of a second
# hard-coded 21, so features and target cannot drift apart if H is changed.
y = rvfeat.create_forward_target(daily_rv, horizon=H)

# Keep only dates where every processed feature and the target are present.
data = pd.concat([X_proc, y], axis=1).dropna()

X_clean = data[X_proc.columns]
y_clean = data[y.name]

# Pairwise correlation of the processed features, reused below to list
# near-duplicate pairs.
corr = X_clean.corr()

fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    cmap="coolwarm",
    center=0,  # keep zero correlation at the neutral colour
    cbar_kws={"shrink": .8},
    ax=ax_corr,
)
ax_corr.set_title("Feature Correlation Heatmap", fontsize=14, pad=12)
plt.tight_layout()
plt.show()

# List feature pairs whose correlation magnitude exceeds the threshold.
corr_threshold = 0.95

# Strict upper triangle only: each unordered pair once, diagonal excluded.
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

# Filter and sort on |corr|. A pair correlated at -0.97 is just as redundant
# as one at +0.97, and the previous signed comparison missed it. The signed
# value is still what gets printed.
high_pairs = (
    upper.stack()
         .loc[lambda s: s.abs() > corr_threshold]
         .sort_values(key=lambda s: s.abs(), ascending=False)
)
print(high_pairs)

atm_iv_30d         iv_minus_realized    0.999999
VIX_rm5            VIX_ewma             0.994107
HY_OAS             HY_OAS_ewma          0.990606
abs_r              r2                   0.987160
VIX                atm_iv_30d           0.984047
                   iv_minus_realized    0.983999
                   VIX_rm5              0.972769
RV_W               RV_D_ewma            0.970479
VIX                VIX_ewma             0.966937
VIX_rm21           VIX_ewma             0.965546
RV_M               RV_D_rollvol21       0.964809
VIX_rm5            atm_iv_30d           0.960073
                   iv_minus_realized    0.959925
VIX_ewma           atm_iv_30d           0.955040
                   iv_minus_realized    0.954847
atm_iv_30d         VIX_time_HY_OAS      0.953364
iv_minus_realized  VIX_time_HY_OAS      0.953324
dtype: float64

# Features removed after inspecting the high-correlation pairs.
corr_features_to_drop = [
    "atm_iv_30d",
    "iv_minus_realized",
    "r2",
    "VIX_rm5",
    "RV_D_ewma",
    "HY_OAS",
    "VIX_ewma",
    "RV_D_rollvol21",
]
X_clean = X_clean.drop(columns=corr_features_to_drop)

# Purged K-fold for the screening stage.
purged_cv = PurgedKFold(
    n_splits=5,    # ≈ 2-year validation blocks
    purge_gap=21,  # 21-day RV horizon
    embargo=0.01,  # 1% embargo after validation fold
)

ph.plot_purged_kfold_splits(purged_cv, X_clean, y_clean)

# Two reference learners: a plain OLS and a random forest.
lin_model = LinearRegression()

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    max_features="sqrt",
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=42,
)

# Single-feature importance: the linear model is scored on one feature at a time.
sfi_df, scores = single_feature_importance(X_clean, y_clean, lin_model, purged_cv)
sfi_df.head(10)

# Iteration cap shared by the alpha search and the refit estimator. The refit
# Lasso used to fall back to scikit-learn's much smaller default, so it could
# stop before converging at an alpha the search had handled with 100000
# iterations.
lasso_max_iter = 100000

# Choose the L1 penalty on a log-spaced grid using the purged CV splitter.
lasso_cv = LassoCV(
        alphas=np.logspace(-4, 0, 30),
        cv=purged_cv,
        max_iter=lasso_max_iter,
        n_jobs=-1
)
lasso_cv.fit(X_clean, y_clean)

# Unfitted Lasso at the selected alpha, for the fold-wise stability analysis.
lasso = Lasso(alpha=lasso_cv.alpha_, max_iter=lasso_max_iter)

# Coefficient stability of the Lasso across the purged folds.
lasso_coefs, lasso_summary = in_sample_stability(X_clean, y_clean, lasso, purged_cv)

ph.plot_mean_std_importance(
    lasso_summary,
    title="Lasso coefficients (mean ± std)",
    top_n=20,
    abs_values=True,
)

ph.plot_lasso_coef_paths(lasso_coefs, lasso_summary["feature"])

# Keep features whose mean coefficient is not exactly zero, then remove the
# VIX x HY_OAS interaction from the linear feature set.
nonzero_mean = lasso_summary["mean"].abs() > 0.0
lasso_features = lasso_summary.loc[nonzero_mean, "feature"]
X_clean_lin = X_clean[lasso_features].drop(columns=["VIX_time_HY_OAS"])

# In-sample importance of the random forest across the purged folds.
rf_fi, rf_summary = in_sample_stability(X_clean, y_clean, rf, purged_cv)

ph.plot_mean_std_importance(
    rf_summary,
    title="RF Mean Decrease Impurity (mean ± std)",
    top_n=len(rf_summary),
    abs_values=True,
)

# Out-of-sample permutation importance, linear model on the Lasso-selected set.
lasso_pi, lasso_pi_summary = oos_perm_importance(
    X_clean_lin,
    y_clean,
    lin_model,
    purged_cv,
    random_state=42,
)

ph.plot_mean_std_importance(
    lasso_pi_summary,
    title="Linear OOS Permutation Importance (mean ± std)",
    top_n=20,
    abs_values=False,
)

# Same diagnostic for the random forest on the full screened set.
rf_pi, rf_pi_summary = oos_perm_importance(
    X_clean,
    y_clean,
    rf,
    purged_cv,
    random_state=42,
)

ph.plot_mean_std_importance(
    rf_pi_summary,
    title="RF OOS Permutation Importance (mean ± std)",
    top_n=len(rf_pi_summary),
    abs_values=False,
)

# Features excluded from the random-forest input set.
features_to_drop = [
    "rku", "rsk",
    "term_spread_10y_3m", "VIX_rm21", "DGS2",
    "up_var", "RV_D_rollvol5",
    "dSkew_5d", "overnight_ret",
]

X_clean_rf = X_clean.drop(columns=features_to_drop)
features_rf = X_clean_rf.columns
print(features_rf)

Index(['RV_D', 'RV_W', 'RV_M', 'abs_r', 'down_var', 'VIX', 'VVIX', 'iv_skew',
       'VIX_time_HY_OAS', 'vix_ts', 'iv_ts', 'dVIX_5d', 'neg_r2', 'DGS10',
       'IG_OAS', 'HY_OAS_ewma', 'vvix_over_vix'],
      dtype='object')

# Modelling frame built from the RAW features: preprocessing is applied
# through DataProcessor in the evaluation calls from here on.
data = pd.concat([X, y], axis=1).dropna()

X_train = data[X.columns]
y_train = data[y.name]

# Finer split for model evaluation.
purged_cv = PurgedKFold(
    n_splits=10,   # ≈ 1-year validation blocks
    purge_gap=21,  # 21-day RV horizon
    embargo=0.01,  # 1% embargo after validation fold
)

# Common data preprocessing config
dp_kwargs = {
    "log_features": log_features,
    "winsor_sqrt_features": winsor_sqrt_features,
    "winsor_features": winsor_features,
}

# Naive forecasts built from monthly RV and from 30-day ATM implied vol.
y_naive_rv, y_naive_iv = rvfeat.build_naive_targets(
    X_train["RV_M"],
    X_train["atm_iv_30d"],
)

# The IV benchmark is additionally given the RV benchmark as third argument.
metrics_rv = compute_metrics(y_train, y_naive_rv)
metrics_iv = compute_metrics(y_train, y_naive_iv, y_naive_rv)

row_rv = {"model": "Naive-RV", **metrics_rv}
row_iv = {"model": "Naive-IV", **metrics_iv}

metrics_bench = pd.DataFrame([row_rv, row_iv]).set_index("model")
display(metrics_bench)

# HAR specifications: the three RV lags plus optional extra regressors.
har_base = ["RV_D", "RV_W", "RV_M"]
har_specs = {
    "HAR-RV":                 [*har_base],
    "HAR-RV-VIX":             [*har_base, "VIX"],
    "HAR-RV-VIX-VVIX":        [*har_base, "VIX", "VVIX"],
    "HAR-RV-VIX-L":           [*har_base, "VIX", "neg_r2"],
    "HAR-RV-VIX-IVTS":        [*har_base, "VIX", "iv_ts"],
    "HAR-RV-VIX-VVIX-L-IVTS": [*har_base, "VIX", "VVIX", "neg_r2", "iv_ts"],
}

dp_kwargs["scale"] = True  # Need to scale

metrics_lin = []  # one metrics dict per specification
y_preds_lin = {}  # specification name -> CV predictions (Series)

# Cross-validate each specification with the same folds and the same benchmark.
for name, feats in har_specs.items():
    metrics, y_pred = eval_model_cv(
        name=name,
        base_estimator=lin_model,
        features=feats,
        X=X_train,
        y=y_train,
        cv=purged_cv,
        dp_kwargs=dp_kwargs,
        y_pred_bench=y_naive_rv,
    )
    y_preds_lin[name] = y_pred
    metrics_lin.append(metrics)

metrics_lin = pd.DataFrame(metrics_lin).set_index("model")
display(pd.concat([metrics_bench, metrics_lin], axis=0))

# NOTE: this rebinds `lin_model` (a bare LinearRegression until here) to a
# preprocessing + OLS pipeline, so preprocessing is refit on each training fold.
lin_model = Pipeline([
    ("dp", DataProcessor(**dp_kwargs)),
    ("lr", LinearRegression()),
])

# Same splitter and scoring for both specifications.
cv_score_kwargs = {"cv": purged_cv, "scoring": "neg_mean_squared_error"}

# HAR-RV-VIX
scores_vix = cross_val_score(
    lin_model,
    X_train[har_specs["HAR-RV-VIX"]],
    y_train,
    **cv_score_kwargs,
)

# HAR-RV-VIX-IVTS
scores_vix_ivts = cross_val_score(
    lin_model,
    X_train[har_specs["HAR-RV-VIX-IVTS"]],
    y_train,
    **cv_score_kwargs,
)

ph.plot_cv_mse_comparison(
    scores_vix,
    scores_vix_ivts,
    label_a="HAR-RV-VIX",
    label_b="HAR-RV-VIX-IVTS",
)

Folds where HAR-RV-VIX-IVTS better: 6 / 10
Mean ΔMSE (B - A): -0.0018831446138953337

# Hyper-parameter grid for the forest; keys address the "rf" pipeline step.
param_grid = {
    "rf__max_depth": [3, 5, 7],
    "rf__min_samples_leaf": [5, 10, 20],
    "rf__max_features": ["sqrt", 0.5, 1.0],
}

dp_kwargs["scale"] = False  # No need to scale

dp_step = DataProcessor(**dp_kwargs)
rf_step = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
pipe_rf = Pipeline([("dp", dp_step), ("rf", rf_step)])

# Exhaustive search scored by negative MSE on the purged folds.
gscv = GridSearchCV(
    pipe_rf,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=purged_cv,
    n_jobs=-1,
)

X_train_rf = X_train[features_rf]
gscv.fit(X_train_rf, y_train)
print("Best params:", gscv.best_params_)

Best params: {'rf__max_depth': 7, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 20}

# Take the tuned forest out of the winning pipeline and evaluate it through
# the same helper as the linear models.
# NOTE(review): tuning and evaluation use the same folds, so these scores are
# likely optimistic.
best_rf_pipe = gscv.best_estimator_
best_rf = best_rf_pipe.named_steps["rf"]

metrics_rf, y_pred_rf = eval_model_cv(
    name="Random Forest",
    base_estimator=best_rf,
    features=features_rf,
    X=X_train,
    y=y_train,
    cv=purged_cv,
    dp_kwargs=dp_kwargs,
    y_pred_bench=y_naive_rv,
)
metrics_rf = pd.DataFrame([metrics_rf]).set_index("model")

display(pd.concat([metrics_bench, metrics_rf], axis=0))

# Time-series view: HAR-RV-VIX against the tuned forest.
ph.plot_model_comparison_ts(
    y_train,
    y_pred_1=y_preds_lin["HAR-RV-VIX"],
    y_pred_2=y_pred_rf,
    label_1="HAR-RV-VIX",
    label_2="RF",
)

# Scatter view of the same two models.
ph.plot_model_comparison_scatter(
    y_train,
    y_preds_lin["HAR-RV-VIX"],
    y_pred_rf,
    label_1="HAR_RV_VIX",
    label_2="RF",
)

# Blend weights tried for the two-model ensemble.
weights = [0.5, 0.7, 0.8]

ens_metrics, ens_preds = eval_ensembles(
    y_train,
    y_preds_lin["HAR-RV-VIX"],
    y_pred_rf,
    weights,
    y_pred_bench=y_naive_rv,
)

comparison_rows = [metrics_lin.loc[["HAR-RV-VIX"]], metrics_rf, ens_metrics]
display(pd.concat(comparison_rows, axis=0))

# Truth, the HAR-RV-VIX CV forecast and both naive forecasts in one frame,
# restricted to dates where all four are available.
perf = pd.concat(
    {
        "y_true": y_train,
        "har_vix": y_preds_lin["HAR-RV-VIX"],
        "naive_rv": y_naive_rv,
        "naive_iv": y_naive_iv,
    },
    axis=1,
).dropna()

# (start, end, label) of each evaluation window.
subperiods = [
    ("2010-01-01", "2012-12-31", "2010–2012"),
    ("2013-01-01", "2015-12-31", "2013–2015"),
    ("2016-01-01", "2018-12-31", "2016–2018"),
    ("2019-01-01", "2020-12-31", "2019–2020"),
]

ph.plot_subperiod_comparison(perf, subperiods)

metrics_subperiod = compute_subperiod_metrics(perf, subperiods)
display(metrics_subperiod)

# Out-of-sample backtest window. The intraday history loaded below still
# starts at the research start date, so the first walk-forward fit has
# training data.
start_backtest = "2021-01-01"
end_backtest = "2025-12-31"

es_5min_full = load_intraday_prices(
    DATA_INTER / "es-5m.csv",
    start=start,
    end=end_backtest,
)

# 30-day ATM implied vol, cut to the backtest window.
iv_atm_30d = pd.read_csv(
    DATA_PROC / "spx_atm_iv_30d_2016_2023.csv",
    index_col=0,
    parse_dates=True,
).loc[start_backtest:end_backtest]

# Features and target for the HAR-RV-VIX model over the whole history.
X_full, y_full = rvfeat.build_har_vix_dataset(es_5min_full, h=21)

X_full

# Preprocessing config for the backtest pipeline (scaling on).
dp_kwargs = {
    "log_features": log_features,
    "winsor_sqrt_features": winsor_sqrt_features,
    "winsor_features": winsor_features,
    "scale": True,
}

lin_pipe = Pipeline([
    ("dp", DataProcessor(**dp_kwargs)),
    ("lr", LinearRegression()),
])

# Walk-forward evaluator over the backtest window.
wf = WalkForwardOOS(
    estimator=lin_pipe,
    start_backtest=start_backtest,
    end_backtest=end_backtest,
    expanding=False,   # rolling window
    window_years=3,
    rebal_freq="ME",   # month-end
    purge_horizon=21,
)

# 1) Run WF for HAR-RV-VIX (final model) and HAR-RV (benchmark)
preds_rv_vix = wf.run(X_full, y_full)
X_full_no_vix = X_full.drop(columns=["VIX"])
preds_rv = wf.run(X_full_no_vix, y_full)

# No intersection with the IV series here: use the full OOS span.
y_true_full = preds_rv_vix["y_true"]
y_pred_vix_full = preds_rv_vix["y_pred"]
y_pred_har_full = preds_rv["y_pred"]

# Only the RV-based naive forecast is needed; the IV output is discarded.
y_naive_rv_full, _ = rvfeat.build_naive_targets(
    X_full.loc[y_true_full.index, "RV_M"],
    iv_atm=None,
)

metrics_naive_rv = compute_metrics(y_true_full, y_naive_rv_full)
metrics_har = compute_metrics(y_true_full, y_pred_har_full, y_pred_bench=y_naive_rv_full)
metrics_vix = compute_metrics(y_true_full, y_pred_vix_full, y_pred_bench=y_naive_rv_full)

# Tag each metrics dict with its row label before tabulating.
labelled_metrics = (
    ("Naive_RV", metrics_naive_rv),
    ("HAR-RV", metrics_har),
    ("HAR-RV-VIX", metrics_vix),
)
for model_label, model_metrics in labelled_metrics:
    model_metrics["model"] = model_label

# Table for the backtest window (Naive_RV, HAR-RV, HAR-RV-VIX).
metrics_df = pd.DataFrame(
    [model_metrics for _label, model_metrics in labelled_metrics]
).set_index("model")

display(metrics_df)

plt.figure(figsize=(12, 4))

# (series, legend label, line width); truth and the final model are thicker.
oos_lines = (
    (y_true_full, "True 21D RV", 1.5),
    (y_naive_rv_full, "Naive RV", 1),
    (y_pred_har_full, "HAR-RV", 1),
    (y_pred_vix_full, "HAR-RV-VIX", 1.5),
)
for oos_series, oos_label, oos_lw in oos_lines:
    oos_series.plot(label=oos_label, lw=oos_lw)

plt.legend(loc="upper left")
plt.title("OOS forecasts: True vs Naive, HAR-RV, HAR-RV-VIX")
plt.ylabel("log 21D RV")
plt.tight_layout()
plt.show()

models = {
    "Naive-RV": y_naive_rv_full,
    "HAR-RV": y_pred_har_full,
    "HAR-RV-VIX": y_pred_vix_full,
}

fig, axes = plt.subplots(1, 3, figsize=(14, 4), sharex=True, sharey=True)

# Common extent over truth and all forecasts, for the 45-degree reference line.
plotted_series = [y_true_full, *models.values()]
xy_min = min(s.min() for s in plotted_series)
xy_max = max(s.max() for s in plotted_series)
diagonal = [xy_min, xy_max]

for ax, (name, preds) in zip(axes, models.items()):
    ax.scatter(y_true_full, preds, s=8, alpha=0.6)
    ax.plot(diagonal, diagonal)
    ax.set(title=name, xlabel="True log 21D RV")
axes[0].set_ylabel("Predicted log 21D RV")

plt.tight_layout()
plt.show()

# Forecast errors of the final model and their 126-day rolling dispersion.
res_vix = y_true_full - y_pred_vix_full
res_vix_std = res_vix.rolling(126).std()

plt.figure(figsize=(12, 4))
res_vix.plot(lw=0.8, label="Residuals")
res_vix_std.plot(lw=1.5, alpha=0.8, label="6M rolling std")
plt.axhline(0.0, color="k", lw=1)
plt.title("HAR-RV-VIX residuals over time (walk-forward OOS)")
plt.legend()
plt.tight_layout()
plt.show()

# Restrict the comparison to dates where the implied-vol series is available.
idx_iv = preds_rv_vix.index.intersection(iv_atm_30d.index)

y_true_iv = preds_rv_vix.loc[idx_iv, "y_true"]
y_pred_vix_iv = preds_rv_vix.loc[idx_iv, "y_pred"]
y_pred_har_iv = preds_rv.loc[idx_iv, "y_pred"]
iv_slice = iv_atm_30d.loc[idx_iv]

# NOTE: this rebinds `y_naive_iv` (previously the in-sample naive-IV forecast).
y_naive_rv_iv, y_naive_iv = rvfeat.build_naive_targets(
    X_full.loc[y_true_iv.index, "RV_M"],
    iv_slice,
)

metrics_naive_rv_iv = compute_metrics(y_true_iv, y_naive_rv_iv)
metrics_naive_iv = compute_metrics(y_true_iv, y_naive_iv, y_pred_bench=y_naive_rv_iv)
metrics_har_iv = compute_metrics(y_true_iv, y_pred_har_iv, y_pred_bench=y_naive_rv_iv)
metrics_vix_iv = compute_metrics(y_true_iv, y_pred_vix_iv, y_pred_bench=y_naive_rv_iv)

# Tag each metrics dict with its row label before tabulating.
labelled_metrics_iv = (
    ("Naive_RV", metrics_naive_rv_iv),
    ("Naive-IV", metrics_naive_iv),
    ("HAR-RV", metrics_har_iv),
    ("HAR-RV-VIX", metrics_vix_iv),
)
for model_label, model_metrics in labelled_metrics_iv:
    model_metrics["model"] = model_label

# Table for the IV-overlap dates (Naive_RV, Naive-IV, HAR-RV, HAR-RV-VIX).
metrics_df = pd.DataFrame(
    [model_metrics for _label, model_metrics in labelled_metrics_iv]
).set_index("model")

display(metrics_df)

plt.figure(figsize=(12, 4))

# Truth vs the implied-vol benchmark and the final model on the IV-overlap dates.
iv_lines = (
    (y_true_iv, "True 21D RV", 1.5),
    (y_naive_iv, "Naive IV", 1),
    (y_pred_vix_iv, "HAR-RV-VIX", 1.5),
)
for iv_series, iv_label, iv_lw in iv_lines:
    iv_series.plot(label=iv_label, lw=iv_lw)

plt.legend(loc="upper left")
plt.title("OOS forecasts: True vs Naive-IV, HAR-RV-VIX")
plt.ylabel("log 21D RV")
plt.tight_layout()
plt.show()

	Close	High	Low	Open	Volume
Date
2010-01-04	1132.989990	1133.869995	1116.560059	1116.560059	3991400000
2010-01-05	1136.520020	1136.630005	1129.660034	1132.660034	2491020000
2010-01-06	1137.140015	1139.189941	1133.949951	1135.709961	4972660000
2010-01-07	1141.689941	1142.459961	1131.319946	1136.270020	5270680000
2010-01-08	1144.979980	1145.390015	1136.219971	1140.520020	4389590000
...	...	...	...	...	...
2020-12-23	3690.010010	3711.239990	3689.280029	3693.419922	3779160000
2020-12-24	3703.060059	3703.820068	3689.320068	3694.030029	1883780000
2020-12-28	3735.360107	3740.510010	3723.030029	3723.030029	3535460000
2020-12-29	3727.040039	3756.120117	3723.310059	3750.010010	3393290000
2020-12-30	3732.040039	3744.629883	3730.209961	3736.189941	3154850000

	Close	High	Low	Open	Volume	returns
Date
2010-01-05	1136.520020	1136.630005	1129.660034	1132.660034	2491020000	0.003111
2010-01-06	1137.140015	1139.189941	1133.949951	1135.709961	4972660000	0.000545
2010-01-07	1141.689941	1142.459961	1131.319946	1136.270020	5270680000	0.003993
2010-01-08	1144.979980	1145.390015	1136.219971	1140.520020	4389590000	0.002878
2010-01-11	1146.979980	1149.739990	1142.020020	1145.959961	4255780000	0.001745
...	...	...	...	...	...	...
2020-12-23	3690.010010	3711.239990	3689.280029	3693.419922	3779160000	0.000746
2020-12-24	3703.060059	3703.820068	3689.320068	3694.030029	1883780000	0.003530
2020-12-28	3735.360107	3740.510010	3723.030029	3723.030029	3535460000	0.008685
2020-12-29	3727.040039	3756.120117	3723.310059	3750.010010	3393290000	-0.002230
2020-12-30	3732.040039	3744.629883	3730.209961	3736.189941	3154850000	0.001341

	open	high	low	close	volume
datetime
2010-01-03 17:00:00	1113.75	1115.25	1113.25	1114.25	4700
2010-01-03 17:05:00	1114.25	1115.50	1114.25	1114.75	1804
2010-01-03 17:10:00	1114.50	1115.25	1114.50	1115.00	785
2010-01-03 17:15:00	1115.25	1116.00	1114.75	1115.75	1143
2010-01-03 17:20:00	1115.75	1116.25	1115.50	1116.00	643
...	...	...	...	...	...
2020-12-31 15:35:00	3746.75	3747.25	3746.00	3746.75	902
2020-12-31 15:40:00	3746.75	3747.25	3746.50	3747.00	582
2020-12-31 15:45:00	3747.00	3747.50	3745.75	3746.75	879
2020-12-31 15:50:00	3746.50	3748.25	3746.25	3748.00	615
2020-12-31 15:55:00	3748.00	3749.25	3747.50	3748.75	1335

	RV_D	RV_W	RV_M
count	2.835000e+03	2831.000000	2815.000000
mean	1.092941e-04	0.000109	0.000110
std	3.170964e-04	0.000263	0.000219
min	3.257165e-07	0.000004	0.000006
max	7.191973e-03	0.004678	0.002556
skew	1.255384e+01	10.254128	7.915824
kurtosis	2.195748e+02	135.858842	74.452912

	underlying_last	expiry	dte	strike	strike_distance	strike_distance_pct	c_delta	p_delta	c_gamma	p_gamma	...	c_bid	p_bid	c_ask	p_ask	c_size_bid	p_size_bid	c_size_ask	p_size_ask	T	k
date
2010-01-04	1132.99	2010-01-07	3.0	925.0	208.0	0.184	1.0	-0.00077	0.0	0.00004	...	205.99	0.00	208.99	0.06	101.0	0.0	101.0	420.0	0.011905	-0.202822
2010-01-04	1132.99	2010-01-07	3.0	950.0	183.0	0.162	1.0	-0.00203	0.0	0.00006	...	181.00	0.00	184.00	0.10	101.0	0.0	101.0	455.0	0.011905	-0.176153
2010-01-04	1132.99	2010-01-07	3.0	975.0	158.0	0.139	1.0	-0.00369	0.0	0.00022	...	156.00	0.05	159.00	0.10	101.0	717.0	101.0	177.0	0.011905	-0.150178
2010-01-04	1132.99	2010-01-07	3.0	1000.0	133.0	0.117	1.0	-0.00894	0.0	0.00038	...	131.10	0.15	134.10	0.21	101.0	470.0	101.0	98.0	0.011905	-0.124860
2010-01-04	1132.99	2010-01-07	3.0	1025.0	108.0	0.095	1.0	-0.01251	0.0	0.00066	...	106.21	0.20	109.20	0.25	101.0	428.0	101.0	63.0	0.011905	-0.100168

	atm_iv_30d	iv_skew	iv_ts
count	2835.000000	2835.000000	2835.000000
mean	0.149578	0.066827	0.007513
std	0.066943	0.035850	0.011356
min	0.062058	-0.465572	-0.087146
max	0.776023	0.429290	0.080241
skew	2.599984	-2.281396	-1.635627
kurtosis	12.596851	66.977281	12.256910

	overnight_ret	abs_r	r2	neg_r2	down_var	up_var	rsk	rku
count	2830.000000	2830.000000	2.830000e+03	2830.000000	2.813000e+03	2813.000000	2815.000000	2815.000000
mean	0.000316	0.007043	1.217114e-04	0.000065	6.564926e-05	0.000057	0.429764	18.354739
std	0.003314	0.006107	3.871663e-04	0.000269	1.870573e-04	0.000124	0.608587	4.168998
min	-0.030006	0.000581	3.711225e-07	0.000000	4.745243e-07	0.000003	-1.527082	5.903105
max	0.023852	0.085571	8.078781e-03	0.005823	2.249933e-03	0.001509	2.371407	34.007878
skew	-0.424568	4.938071	1.351947e+01	15.069714	9.039184e+00	8.188091	-0.108629	0.295547
kurtosis	9.912796	43.773244	2.319493e+02	277.032016	9.238763e+01	77.965242	0.073100	0.495762

	feature	mean_R2	std_R2
0	VIX	0.294929	0.086601
1	VIX_time_HY_OAS	0.209846	0.087117
2	VIX_rm21	0.097467	0.093864
3	iv_skew	0.086281	0.207520
4	RV_W	0.081386	0.160824
5	RV_M	0.074438	0.107090
6	vvix_over_vix	-0.020182	0.413011
7	down_var	-0.100095	0.222663
8	RV_D_rollvol5	-0.102031	0.256368
9	RV_D	-0.126413	0.248300

	R2	MSE	QLIKE	Var_res	R2_oos
model
Naive-RV	0.254411	0.633659	0.325268	0.633860	0.000000
Naive-IV	0.471264	0.449361	0.231955	0.414992	0.290848

	R2	MSE	QLIKE	Var_res	R2_oos
model
HAR-RV-VIX	0.508466	0.417744	0.183540	0.417825	0.340744
Random Forest	0.454920	0.463251	0.206699	0.463101	0.268927
0.50 * Model_A + 0.50 * Model_B	0.493576	0.430398	0.189751	0.430383	0.320774
0.70 * Model_A + 0.30 * Model_B	0.502384	0.422913	0.185976	0.422941	0.332587
0.80 * Model_A + 0.20 * Model_B	0.505362	0.420382	0.184731	0.420429	0.336581

		R2	MSE	QLIKE	Var_res	R2_oos
period	model
2010–2012	HAR-RV-VIX	0.431265	0.313219	0.139706	0.313630	0.355441
2010–2012	Naive_IV	0.258411	0.408415	0.237751	0.325782	0.159541
2013–2015	HAR-RV-VIX	0.147567	0.328351	0.149250	0.328433	0.347760
2013–2015	Naive_IV	0.077852	0.355205	0.196782	0.321251	0.294417
2016–2018	HAR-RV-VIX	0.476514	0.417189	0.221914	0.398821	0.243616
2016–2018	Naive_IV	0.501642	0.397163	0.212957	0.375742	0.279925
2019–2020	HAR-RV-VIX	0.423882	0.716155	0.243604	0.699486	0.397198
2019–2020	Naive_IV	0.405144	0.739447	0.307647	0.732765	0.377592

	RV_D	RV_W	RV_M	VIX
2010-02-01	0.000210	0.000161	0.000085	22.590000
2010-02-02	0.000089	0.000165	0.000089	21.480000
2010-02-03	0.000069	0.000159	0.000091	21.600000
2010-02-04	0.000625	0.000244	0.000120	26.080000
2010-02-05	0.000204	0.000239	0.000129	26.110001
...	...	...	...	...
2025-09-16	0.000004	0.000054	0.000047	16.360001
2025-09-17	0.000098	0.000069	0.000051	15.720000
2025-09-18	0.000063	0.000071	0.000049	15.700000
2025-09-19	0.000009	0.000071	0.000048	15.450000
2025-09-22	0.000004	0.000036	0.000038	16.100000

	R2	MSE	QLIKE	Var_res	R2_oos
model
Naive_RV	0.094304	0.507837	0.279067	0.508041	0.000000
HAR-RV	0.291954	0.397012	0.208633	0.392031	0.218230
HAR-RV-VIX	0.367568	0.354613	0.178821	0.354909	0.301718

	R2	MSE	QLIKE	Var_res	R2_oos
model
Naive_RV	0.259605	0.397882	0.215882	0.397322	0.000000
Naive-IV	0.534942	0.249918	0.147596	0.230769	0.371879
HAR-RV	0.376741	0.334934	0.185149	0.328940	0.158208
HAR-RV-VIX	0.459459	0.290482	0.157580	0.290875	0.269929

	DGS10	DGS2	DGS3MO	term_spread_10y_3m	HY_OAS	IG_OAS
count	2835.000000	2835.000000	2835.000000	2835.000000	2835.000000	2835.000000
mean	2.265898	0.908762	0.556808	1.709090	5.045884	1.507993
std	0.692320	0.747118	0.776442	0.958803	1.278209	0.386707
min	0.520000	0.110000	0.000000	-0.520000	3.160000	0.900000
max	4.010000	2.980000	2.490000	3.830000	10.870000	4.010000
skew	-0.243451	1.162005	1.339041	-0.086261	0.777947	1.348852
kurtosis	0.188206	0.250079	0.240950	-0.607675	0.104021	2.877455

	VIX_rm5	VIX_rm21	VIX_ewma	RV_D_ewma	HY_OAS_ewma	dVIX_5d	dSkew_5d	iv_minus_realized	vvix_over_vix	VIX_time_HY_OAS	RV_D_rollvol5	RV_D_rollvol21
count	2835.000000	2835.000000	2835.000000	2835.000000	2835.000000	2830.000000	2830.000000	2815.000000	2835.000000	2835.000000	2831.000000	2815.000000
mean	17.971053	17.960360	17.969347	0.000109	5.049248	0.005693	0.000061	0.149081	5.655410	97.091306	0.000081	0.000109
std	7.259670	6.853001	7.046994	0.000240	1.263522	3.593055	0.018515	0.066756	1.435551	68.567904	0.000181	0.000209
min	9.376000	9.800000	9.614762	0.000004	3.233705	-21.100002	-0.225512	0.062051	2.054459	30.979201	0.000001	0.000003
max	74.618001	61.258572	65.811602	0.003825	9.377609	35.850002	0.234780	0.774351	10.316940	707.040000	0.002710	0.002145
skew	2.536454	2.301621	2.402397	9.136190	0.713000	1.683577	1.018086	2.616658	0.373912	3.348497	8.036214	6.563821
kurtosis	10.793452	8.074845	9.234084	104.384005	-0.209626	14.954005	37.015940	12.701030	0.257500	18.603871	86.449242	52.444094

	HY_OAS_ewma	dSkew_5d	term_spread_10y_3m	RV_D_rollvol21
count	2.813000e+03	2.813000e+03	2.813000e+03	2.813000e+03
mean	-1.010370e-16	4.104628e-18	-8.082960e-17	1.414518e-15
std	1.000178e+00	1.000178e+00	1.000178e+00	1.000178e+00
min	-1.429195e+00	-6.151124e+00	-2.338689e+00	-2.970176e+00
max	3.431921e+00	4.908140e+00	2.256667e+00	3.613552e+00
skew	7.329708e-01	3.144251e-01	-1.109238e-01	6.171295e-01
kurtosis	-1.781129e-01	7.017764e+00	-6.160553e-01	6.743350e-01

	R2	MSE	QLIKE	Var_res	R2_oos
model
Naive-RV	0.254411	0.633659	0.325268	0.633860	0.000000
Naive-IV	0.471264	0.449361	0.231955	0.414992	0.290848
HAR-RV	0.428283	0.485890	0.227163	0.486040	0.233201
HAR-RV-VIX	0.508466	0.417744	0.183540	0.417825	0.340744
HAR-RV-VIX-VVIX	0.496790	0.427667	0.193307	0.427322	0.325084
HAR-RV-VIX-L	0.506065	0.419785	0.185642	0.419805	0.337523
HAR-RV-VIX-IVTS	0.510692	0.415852	0.184177	0.415904	0.343730
HAR-RV-VIX-VVIX-L-IVTS	0.497268	0.427260	0.194387	0.426773	0.325725

Forecasting Realized Volatility¶

Why Forecast Realized Volatility?¶

Research Questions¶

Summary¶

1. Read & Prepare Data¶

2. Volatility Estimators¶

2.1 Historical / Close-to-Close¶

2.2 Range-Based Proxies (OHLC)¶

2.2.1 Parkinson Estimator¶

2.2.2 Garman–Klass Estimator¶

2.2.3 Rogers–Satchell Estimator¶

2.2.4 Yang–Zhang Estimator¶

Which Estimator to Choose?¶

2.3 High-Frequency Estimators: Realized Variance¶

3. Stylized Facts of Daily Volatility¶

3.1 Volatility clustering¶

3.2 Long Memory / Slow Decay¶

3.3 Mean-reverting behaviour¶

3.4 Volatility distributions are often log-normal¶

3.5 Leverage Effect / Asymmetric Return–Volatility Relationship¶

4. Problem Formulation¶

4.1 Target Variable: 21-Day Realized Variance¶

4.2 Predictor Families (Information Available at Time $t$)¶

4.2.1 Lagged Volatility Measures: HAR-RV Lags (1D, 5D, 22D)¶

Transformation Decisions¶

4.2.2 Implied Volatility Surface Predictors¶

SPX option chain¶

SSVI-based IV surface predictors¶

Correct data error from IV puts¶

Transformation Decisions¶

4.2.3 Return-Based Predictors¶

Transformation Decisions¶

4.2.4 Macro & Market Predictors¶

Macro Fundamentals¶

Transformation Decisions¶

Market / Sentiment¶

Transformation Decisions¶

5. Feature Engineering¶

6. Data Preprocessing¶

Apply the appropriate transformation¶

Data preprocessing pipeline¶

Correlation / redundancy check¶

7. Modelling Framework¶

7.1 Cross-Validation Scheme (Purged K-Fold)¶

7.2 Model Specifications¶

7.2.1 Linear Models (HAR-RV and HAR-RV-X)¶

7.2.2 Non-Linear Model (Random Forest)¶

7.3 Evaluation Metrics¶

8. Feature Importance and Stability Analysis¶

8.1 Single-Feature Importance (SFI)¶

8.2 In-model stability analysis (with substitution effects)¶

Lasso¶

Random Forest¶

8.3 Out-of-Sample Permutation Importance (Mean Decrease Accuracy)¶

Linear Regression¶

Random Forest¶

9. Model Selection¶

9.1 Benchmarks (Naive RV and Naive IV)¶

9.2 Linear Models¶

9.3 Random Forest¶

9.4 Model Comparison and Selection¶

Conclusion¶

9.5 Time-variation in benchmark strength (IV vs HAR-RV-VIX)¶

10. Out-of-Sample Walk-Forward Evaluation¶

10.1 Backtest Horizon & Data Construction¶

10.2 Walk-Forward Configuration¶

10.3 Walk-Forward Results & Performance Metrics¶

10.3 Walk-Forward Results (RV Benchmarks, 2021–2025)¶

10.4 Walk-Forward Results vs Implied Volatility (2021–2023)¶

11. Conclusion¶