# Source code for finance_datagen._risk_models

"""Synthetic factor-risk-model data generators."""

from __future__ import annotations

from datetime import date, timedelta
from typing import Annotated, Sequence

import numpy as np
import polars as pl
from finance_enums import Sector
from pydantic import Field, model_validator

from ._base import DataGenerator, NonNegativeFloat, PositiveFloat, PositiveInt

# Default sector universe: the ``.value`` of each member yielded by iterating
# ``Sector``, frozen into a tuple so it is safe to use as a default argument.
_DEFAULT_SECTORS = tuple(sector.value for sector in Sector)


def _date_range(n_dates: int, start: date | None) -> list[date]:
    first = date(2020, 1, 1) if start is None else start
    return [first + timedelta(days=i) for i in range(n_dates)]


def _symbols(n_assets: int, symbols: Sequence[str] | None) -> list[str]:
    values = [f"A{i:04d}" for i in range(n_assets)] if symbols is None else list(symbols)
    if len(values) != n_assets:
        raise ValueError(f"symbols length {len(values)} != n_assets {n_assets}")
    return values


def _standardize(values: np.ndarray) -> np.ndarray:
    """Return ``values`` as z-scores (zero mean, unit population std).

    An input with exactly zero spread is only de-meaned, which avoids a
    division by zero.
    """
    demeaned = values - values.mean()
    scale = demeaned.std(ddof=0)
    return demeaned if scale == 0.0 else demeaned / scale


class SpecificVarianceGenerator(DataGenerator[pl.DataFrame]):
    """Generate a positive idiosyncratic variance vector.

    Variances are drawn from a log-normal distribution whose log-space
    standard deviation is ``dispersion`` and whose location is shifted so
    that the expected variance equals ``target_vol ** 2``.
    """

    # Number of rows (assets) in the generated frame.
    n_assets: PositiveInt = 50
    # Volatility whose square is the mean of the drawn variances.
    target_vol: PositiveFloat = 0.25
    # Sigma of the underlying normal; 0 makes every variance identical.
    dispersion: NonNegativeFloat = 0.35
    # Seed forwarded to ``np.random.default_rng``.
    seed: int | None = None
    # Optional explicit identifiers; must have exactly ``n_assets`` entries.
    symbols: tuple[str, ...] | None = None

    @model_validator(mode="after")
    def _validate_symbols(self):
        """Reject a ``symbols`` tuple whose length differs from ``n_assets``."""
        # Called purely for its length check; the resolved list is discarded.
        _symbols(self.n_assets, self.symbols)
        return self

    def generate(self) -> pl.DataFrame:
        """Return ``[symbol, specific_variance]``."""
        sigma = self.dispersion
        # E[lognormal(mu, sigma)] = exp(mu + sigma**2 / 2), so subtracting
        # sigma**2 / 2 pins the mean variance at target_vol**2.
        mu = np.log(self.target_vol * self.target_vol) - 0.5 * sigma * sigma
        generator = np.random.default_rng(self.seed)
        draws = generator.lognormal(mean=mu, sigma=sigma, size=self.n_assets)
        labels = _symbols(self.n_assets, self.symbols)
        return pl.DataFrame({"symbol": labels, "specific_variance": draws})
class FactorCovarianceGenerator(DataGenerator[pl.DataFrame]):
    """Generate a symmetric positive semidefinite factor covariance matrix.

    Factor ``i`` is assigned volatility ``factor_vol * eigen_decay ** (i / 2)``
    and the correlation between factors ``i`` and ``j`` is
    ``base_corr ** abs(i - j)``. The matrix is fully determined by these
    parameters; ``seed`` does not influence the numbers.
    """

    # Factor names, used both as row labels and as column names.
    factors: tuple[str, ...] = ("market", "sector", "value", "momentum", "size", "quality", "low_vol", "growth")
    # Volatility of the first factor.
    factor_vol: PositiveFloat = 0.16
    # Geometric decay applied to variance from one factor to the next.
    eigen_decay: Annotated[float, Field(gt=0.0, le=1.0)] = 0.75
    # Correlation between adjacent factors.
    base_corr: Annotated[float, Field(gt=-1.0, lt=1.0)] = 0.25
    # Accepted for interface parity with the other generators.
    seed: int | None = None

    @model_validator(mode="after")
    def _validate_factors(self):
        """Ensure ``factors`` can label a square matrix unambiguously.

        Raises:
            ValueError: If ``factors`` is empty, contains duplicates, or
                contains the reserved column name ``"factor"``.
        """
        if not self.factors:
            raise ValueError("factors must not be empty")
        # Factor names become dict keys / column names in ``generate``:
        # duplicates would silently collapse into one column and leave the
        # frame with fewer columns than rows.
        if len(set(self.factors)) != len(self.factors):
            raise ValueError("factors must be unique")
        # A factor called "factor" would overwrite the leading label column.
        if "factor" in self.factors:
            raise ValueError('factors must not contain the reserved name "factor"')
        return self

    def generate(self) -> pl.DataFrame:
        """Return a wide covariance matrix with a leading ``factor`` column.

        Row ``i`` / column ``factors[j]`` holds
        ``vol[i] * vol[j] * base_corr ** abs(i - j)``.
        """
        # No random numbers are drawn. The generator is still constructed so
        # that an invalid seed fails here exactly as it did before.
        np.random.default_rng(self.seed)

        n_factors = len(self.factors)
        indices = np.arange(n_factors)
        # Correlation decays geometrically with the distance between factors.
        lag = np.abs(indices[:, None] - indices[None, :])
        corr = self.base_corr ** lag
        # Halving the exponent makes the *variance* decay by ``eigen_decay``
        # per factor.
        vol = self.factor_vol * self.eigen_decay ** (indices / 2.0)
        covariance = np.outer(vol, vol) * corr

        columns = {name: covariance[:, position] for position, name in enumerate(self.factors)}
        return pl.DataFrame({"factor": self.factors, **columns})
class FundamentalRiskModelGenerator(DataGenerator[pl.DataFrame]):
    """Generate Barra-style sector and style-factor exposure data.

    Each asset receives a randomly chosen sector label, a unit ``market``
    exposure, one standardised exposure per style factor, and a specific
    variance produced by ``SpecificVarianceGenerator`` with the same seed.
    """

    # Number of assets (rows); must be greater than 1.
    n_assets: Annotated[int, Field(gt=1)] = 50
    # Pool of sector labels to sample from.
    sectors: tuple[str, ...] = _DEFAULT_SECTORS
    # Names of the style-factor exposure columns.
    style_factors: tuple[str, ...] = ("value", "momentum", "size", "quality", "low_vol", "growth")
    # Seed forwarded to ``np.random.default_rng``.
    seed: int | None = None
    # Optional explicit identifiers; must have exactly ``n_assets`` entries.
    symbols: tuple[str, ...] | None = None

    @model_validator(mode="after")
    def _validate_inputs(self):
        """Require non-empty ``sectors``/``style_factors`` and matching ``symbols``."""
        if not (self.sectors and self.style_factors):
            raise ValueError("sectors and style_factors must not be empty")
        # Called purely for its length check; the resolved list is discarded.
        _symbols(self.n_assets, self.symbols)
        return self

    def generate(self) -> pl.DataFrame:
        """Return wide factor loadings with a positive specific variance."""
        count = self.n_assets
        names = _symbols(count, self.symbols)
        rng = np.random.default_rng(self.seed)

        # Draw order is part of the seeded output: sector labels first, then
        # one normal vector per style factor in declaration order.
        sector_labels = rng.choice(np.array(self.sectors), size=count)
        columns: dict[str, Sequence[float] | Sequence[str] | np.ndarray] = {
            "symbol": names,
            "sector": sector_labels,
            "market": np.ones(count),
        }
        columns.update(
            {name: _standardize(rng.normal(0.0, 1.0, count)) for name in self.style_factors}
        )

        variance_frame = SpecificVarianceGenerator(
            n_assets=count,
            seed=self.seed,
            symbols=tuple(names),
        ).generate()
        columns["specific_variance"] = variance_frame["specific_variance"].to_numpy()
        return pl.DataFrame(columns)
class StatisticalRiskModelGenerator(DataGenerator[dict[str, pl.DataFrame]]):
    """Generate PCA-style statistical factor model components.

    Synthetic returns are simulated from a latent linear factor model, then
    decomposed with an SVD of the column-centred return matrix. The leading
    ``n_factors`` components become the estimated factors.
    """

    # Number of return observations (rows of the simulated panel).
    n_dates: Annotated[int, Field(gt=1)] = 252
    # Number of assets (columns of the simulated panel).
    n_assets: Annotated[int, Field(gt=1)] = 50
    # Number of principal components to keep.
    n_factors: PositiveInt = 5
    # Standard deviation of the simulated latent factor returns.
    factor_vol: PositiveFloat = 0.01
    # Standard deviation of the simulated asset-specific noise.
    idiosyncratic_vol: PositiveFloat = 0.015
    # Seed forwarded to ``np.random.default_rng``.
    seed: int | None = None
    # First date of the ``factor_returns`` frame; ``None`` uses the helper default.
    start: date | None = None
    # Optional explicit identifiers; must have exactly ``n_assets`` entries.
    symbols: tuple[str, ...] | None = None

    @model_validator(mode="after")
    def _validate_inputs(self):
        """Check that the requested factor count is extractable.

        Raises:
            ValueError: If ``n_factors`` is not smaller than ``n_assets``,
                if ``n_factors`` exceeds ``n_dates``, or if ``symbols`` has
                the wrong length.
        """
        if self.n_factors >= self.n_assets:
            raise ValueError("n_factors must be smaller than n_assets")
        # The thin SVD of an (n_dates x n_assets) matrix yields only
        # min(n_dates, n_assets) components, so asking for more than n_dates
        # previously failed inside ``generate`` with an IndexError.
        # NOTE: n_factors == n_dates is still accepted for backward
        # compatibility, although column centring leaves at most
        # n_dates - 1 non-trivial components.
        if self.n_factors > self.n_dates:
            raise ValueError("n_factors must not exceed n_dates")
        _symbols(self.n_assets, self.symbols)
        return self

    def generate(self) -> dict[str, pl.DataFrame]:
        """Return factor loadings, factor returns, and specific variance.

        Returns:
            A dict with three frames:

            * ``"factor_loadings"`` - ``symbol`` plus one ``factor_k`` column
              per component (rows of ``V^T`` transposed).
            * ``"factor_returns"`` - ``date`` plus one ``factor_k`` column per
              component (``U * S`` scores).
            * ``"specific_variance"`` - ``symbol`` and the population variance
              of each asset's residual after removing the kept components.
        """
        rng = np.random.default_rng(self.seed)
        k = self.n_factors

        # Simulate the panel. Draw order is part of the seeded output:
        # factor returns, then loadings, then idiosyncratic noise.
        true_factor_returns = rng.normal(0.0, self.factor_vol, size=(self.n_dates, k))
        true_loadings = rng.normal(0.0, 1.0, size=(self.n_assets, k))
        idiosyncratic = rng.normal(0.0, self.idiosyncratic_vol, size=(self.n_dates, self.n_assets))
        returns = true_factor_returns @ true_loadings.T + idiosyncratic

        # PCA via SVD of the column-centred returns.
        centered = returns - returns.mean(axis=0, keepdims=True)
        u_matrix, singular_values, vt_matrix = np.linalg.svd(centered, full_matrices=False)
        components = vt_matrix[:k]
        factor_scores = u_matrix[:, :k] * singular_values[:k]
        loadings = components.T

        # Whatever the kept components do not explain is asset-specific.
        residual = centered - factor_scores @ components
        specific_variance = residual.var(axis=0, ddof=0)

        factors = [f"factor_{i + 1}" for i in range(k)]
        symbols = _symbols(self.n_assets, self.symbols)
        factor_loadings = pl.DataFrame(
            {"symbol": symbols, **{factor: loadings[:, i] for i, factor in enumerate(factors)}}
        )
        factor_returns = pl.DataFrame(
            {
                "date": pl.Series(_date_range(self.n_dates, self.start)).cast(pl.Date),
                **{factor: factor_scores[:, i] for i, factor in enumerate(factors)},
            }
        )
        specific = pl.DataFrame({"symbol": symbols, "specific_variance": specific_variance})
        return {
            "factor_loadings": factor_loadings,
            "factor_returns": factor_returns,
            "specific_variance": specific,
        }
def generate_specific_variance(
    n_assets: int = 50,
    target_vol: float = 0.25,
    dispersion: float = 0.35,
    seed: int | None = None,
    symbols: Sequence[str] | None = None,
) -> pl.DataFrame:
    """Generate a positive idiosyncratic variance vector.

    Functional shortcut for ``SpecificVarianceGenerator(...).generate()``.
    ``symbols`` may be any sequence; it is converted to a tuple before being
    handed to the generator.
    """
    symbol_tuple = tuple(symbols) if symbols is not None else None
    generator = SpecificVarianceGenerator(
        n_assets=n_assets,
        target_vol=target_vol,
        dispersion=dispersion,
        seed=seed,
        symbols=symbol_tuple,
    )
    return generator.generate()
def generate_factor_covariance(
    factors: Sequence[str] = ("market", "sector", "value", "momentum", "size", "quality", "low_vol", "growth"),
    factor_vol: float = 0.16,
    eigen_decay: float = 0.75,
    base_corr: float = 0.25,
    seed: int | None = None,
) -> pl.DataFrame:
    """Generate a symmetric positive semidefinite factor covariance matrix.

    Functional shortcut for ``FactorCovarianceGenerator(...).generate()``.
    ``factors`` may be any sequence; it is converted to a tuple before being
    handed to the generator.
    """
    factor_names = tuple(factors)
    generator = FactorCovarianceGenerator(
        factors=factor_names,
        factor_vol=factor_vol,
        eigen_decay=eigen_decay,
        base_corr=base_corr,
        seed=seed,
    )
    return generator.generate()
def generate_fundamental_risk_model(
    n_assets: int = 50,
    sectors: Sequence[str] = _DEFAULT_SECTORS,
    style_factors: Sequence[str] = ("value", "momentum", "size", "quality", "low_vol", "growth"),
    seed: int | None = None,
    symbols: Sequence[str] | None = None,
) -> pl.DataFrame:
    """Generate Barra-style sector and style-factor exposure data.

    Functional shortcut for ``FundamentalRiskModelGenerator(...).generate()``.
    Sequence arguments are converted to tuples before being handed to the
    generator.
    """
    symbol_tuple = tuple(symbols) if symbols is not None else None
    generator = FundamentalRiskModelGenerator(
        n_assets=n_assets,
        sectors=tuple(sectors),
        style_factors=tuple(style_factors),
        seed=seed,
        symbols=symbol_tuple,
    )
    return generator.generate()
def generate_statistical_risk_model(
    n_dates: int = 252,
    n_assets: int = 50,
    n_factors: int = 5,
    factor_vol: float = 0.01,
    idiosyncratic_vol: float = 0.015,
    seed: int | None = None,
    start: date | None = None,
    symbols: Sequence[str] | None = None,
) -> dict[str, pl.DataFrame]:
    """Generate PCA-style statistical factor model components.

    Functional shortcut for ``StatisticalRiskModelGenerator(...).generate()``.
    ``symbols`` may be any sequence; it is converted to a tuple before being
    handed to the generator.
    """
    symbol_tuple = tuple(symbols) if symbols is not None else None
    generator = StatisticalRiskModelGenerator(
        n_dates=n_dates,
        n_assets=n_assets,
        n_factors=n_factors,
        factor_vol=factor_vol,
        idiosyncratic_vol=idiosyncratic_vol,
        seed=seed,
        start=start,
        symbols=symbol_tuple,
    )
    return generator.generate()