import copy
import warnings
import numpy as np
import pandas as pd
import patsy
from statsmodels.stats.outliers_influence import (
variance_inflation_factor as vif,
)
from tqdm import tqdm
from statsmodels.regression.linear_model import RegressionResultsWrapper
from fitgrid.fitgrid import FitGrid
# ------------------------------------------------------------
# fitgrid's database of statsmodels OLSInfluence diagnostics:
#
# * what is available, how it is calculated, and its value dtype, like so
#
# * attr : (calc_type, value_dtype, df.index.names)
#
# df.index.names are as returned by LMFitGrid attribute getter
# nobs = number of observations
# nobs_k = number of observations x model regressors
# nobs_loop = nobs re-fitting ... slow
# TPU 03/19
# ------------------------------------------------------------
# '_TIME' and '_EPOCH_ID' are placeholders used to compare the indexes
# returned by the diagnostics with those in the grid.
_FLOAT_TYPE = np.float64
_INT_TYPE = np.int64
_OLS_INFLUENCE_ATTRS = {
'cooks_distance': ('nobs', _FLOAT_TYPE, ['_TIME', None, '_EPOCH_ID']),
'cov_ratio': ('nobs_loop', _FLOAT_TYPE, ['_TIME', '_EPOCH_ID']),
'dfbetas': ('nobs_loop', _FLOAT_TYPE, ['_TIME', '_EPOCH_ID', None]),
'dffits': ('nobs_loop', _FLOAT_TYPE, ['_TIME', None, '_EPOCH_ID']),
'dffits_internal': ('nobs', _FLOAT_TYPE, ['_TIME', None, '_EPOCH_ID']),
'ess_press': ('nobs', _FLOAT_TYPE, ['_TIME']),
'hat_matrix_diag': ('nobs', _FLOAT_TYPE, ['_TIME', '_EPOCH_ID']),
'influence': ('nobs', _FLOAT_TYPE, ['_TIME', '_EPOCH_ID']),
'k_vars': ('nobs', _INT_TYPE, ['_TIME']),
'nobs': ('nobs', _INT_TYPE, ['_TIME']),
'resid_press': ('nobs', _FLOAT_TYPE, ['_TIME', '_EPOCH_ID']),
'resid_std': ('nobs', _FLOAT_TYPE, ['_TIME', '_EPOCH_ID']),
'resid_studentized_external': (
'nobs_loop',
_FLOAT_TYPE,
['_TIME', '_EPOCH_ID'],
),
'resid_studentized_internal': (
'nobs',
_FLOAT_TYPE,
['_TIME', '_EPOCH_ID'],
),
'resid_var': ('nobs', _FLOAT_TYPE, ['_TIME', '_EPOCH_ID']),
}
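# A minimal sketch (not part of the original module) of how an entry above is
# read: the calc type records whether the diagnostic comes straight from the
# original fit ('nobs') or needs the slow leave-one-observation-out refit
# loop ('nobs_loop'), the dtype is what the gridded values should be, and the
# index names are placeholders swapped for the grid's actual index names in
# _get_diagnostic() below.
#
#     calc_type, value_dtype, index_names = _OLS_INFLUENCE_ATTRS['cooks_distance']
#     # calc_type == 'nobs'                          no refitting needed
#     # value_dtype is _FLOAT_TYPE                   np.float64 values
#     # index_names == ['_TIME', None, '_EPOCH_ID']  None marks the 2-tuple slot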
def get_vifs(epochs, RHS, quiet=False):
    """Compute variance inflation factors for the RHS regressors at each time point.

    Returns a pandas.DataFrame with time points in rows and design matrix
    columns in columns; set quiet=True to suppress the progress bar.
    """
def get_single_vif(group, RHS):
dmatrix = patsy.dmatrix(formula_like=RHS, data=group)
vifs = {
name: vif(dmatrix, index)
for name, index in dmatrix.design_info.column_name_indexes.items()
}
return pd.Series(vifs)
    tqdm.pandas(desc="Time", disable=quiet)
    # progress_apply is the tqdm-wrapped apply registered just above
    return epochs._snapshots.progress_apply(get_single_vif, RHS=RHS)
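# Example usage, a sketch only (not part of the original module); the epochs
# object and RHS formula follow the fake-data example in the get_diagnostic()
# docstring below.
#
#     epochs_fg = fitgrid.generate()
#     vifs_df = get_vifs(epochs_fg, RHS='continuous + categorical')
#     # vifs_df: time points in rows, design matrix columns in columns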
# ------------------------------------------------------------
# OLSInfluence diagnostic helpers TPU 03/19
# ------------------------------------------------------------
def _check_get_diagnostic_args(lm_grid, diagnostic, do_nobs_loop):
    # type and value checks only, nothing is computed; for the args see get_diagnostic()

    # types ------------------------------------------------------
    msg = None
    if not isinstance(lm_grid, FitGrid):
        msg = f"lm_grid must be a FitGrid, not {type(lm_grid)}"
    elif not isinstance(lm_grid.tester, RegressionResultsWrapper):
        msg = "lm_grid must be fit with fitgrid.lm()"
    if not isinstance(diagnostic, str):
        msg = f"diagnostic must be a string, not {type(diagnostic)}"
    if not isinstance(do_nobs_loop, bool):
        msg = "do_nobs_loop must be True or False"
    if msg is not None:
        raise TypeError(msg)

    # values ------------------------------------------------------
    if diagnostic not in _OLS_INFLUENCE_ATTRS:
        raise ValueError(f"unknown OLSInfluence attribute {diagnostic}")

    infl_calc, _, _ = _OLS_INFLUENCE_ATTRS[diagnostic]
    if infl_calc is None:
        raise NotImplementedError(
            f"get_diagnostic({diagnostic}) is not implemented, try"
            f" lm_grid.get_influence().{diagnostic}"
        )
    if infl_calc == "nobs_loop" and not do_nobs_loop:
        raise ValueError(
            f"{diagnostic} is slow, set do_nobs_loop=True to calculate"
        )
def _get_diagnostic(lm_grid, diag, do_nobs_loop):
    """scrape a diagnostic grid from statsmodels, with a modicum of validation"""

    _check_get_diagnostic_args(
        lm_grid=lm_grid, diagnostic=diag, do_nobs_loop=do_nobs_loop
    )
    infl_calc, infl_dtype, index_names = _OLS_INFLUENCE_ATTRS[diag]

    attr_df = getattr(lm_grid.get_influence(), diag).copy()
    if not isinstance(attr_df, pd.DataFrame):
        raise TypeError(f"{diag} grid is not a pandas DataFrame")

    actual_type = type(attr_df.iloc[0, 0])
    if actual_type is not infl_dtype:
        raise TypeError(f"gridded {diag} dtype should be {infl_dtype}")

    # swap the grid's actual time and epoch index names in for the
    # '_TIME' and '_EPOCH_ID' placeholders, then check them
    _index_names = copy.copy(index_names)  # else _OLS_INFLUENCE_ATTRS is modified
    for idx, index_name in enumerate(_index_names):
        if index_name == "_TIME":
            _index_names[idx] = lm_grid.time
        if index_name == "_EPOCH_ID":
            _index_names[idx] = lm_grid.epoch_index.name
    if _index_names != list(attr_df.index.names):
        raise TypeError(
            f"_OLS_INFLUENCE_ATTRS thinks {diag} index names"
            f" should be {_index_names} not {list(attr_df.index.names)}"
        )
    return attr_df
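# For example (an illustration only, with hypothetical index names): for a
# grid whose time index is named 'Time' and whose epoch index is named
# 'Epoch_idx', the 'cooks_distance' placeholders are rewritten like so before
# being checked against the statsmodels result:
#
#     _OLS_INFLUENCE_ATTRS['cooks_distance'][2]  # ['_TIME', None, '_EPOCH_ID']
#     # after substitution: ['Time', None, 'Epoch_idx'] == attr_df.index.names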
# ------------------------------------------------------------
# UI wrappers
# ------------------------------------------------------------
def list_diagnostics():
"""Display `statsmodels` diagnostics implemented in fitgrid.utils.lm"""
fast = [
f" get_diagnostic(lm_grid, '{attr}')"
for attr, spec in _OLS_INFLUENCE_ATTRS.items()
if spec[0] not in [None, 'nobs_loop']
]
slow = [
f" get_diagnostic(lm_grid, '{attr}', do_nobs_loop=True)"
for attr, spec in _OLS_INFLUENCE_ATTRS.items()
if spec[0] == 'nobs_loop'
]
not_implemented = [
(
f" {attr}: not implemented for get_diagnostic(), "
"try querying the grid directly"
)
for attr, spec in _OLS_INFLUENCE_ATTRS.items()
if spec[0] is None
]
    print(
        "Fast: These are calculated quickly with get_diagnostic()"
        " and are usable for large data sets\n"
    )
for usage in fast:
print(usage)
    print(
        "\nSlow: These are available with get_diagnostic() but"
        " refit the model once for each left-out data point. They are"
        " disabled by default but can be forced like so\n"
    )
for usage in slow:
print(usage)
    print(
        "\nNot implemented:\nThese are not available for get_diagnostic() but"
        " may be available in the fitted grid.\n"
    )
for usage in not_implemented:
print(usage)
def get_diagnostic(lm_grid, diagnostic, do_nobs_loop=False):
"""Fetch `statsmodels` diagnostic as a Time x Channel dataframe
`statsmodels` implements a variety of data and model diagnostic
measures. For some, it also computes a version of a recommended
critical value or :math:`p`-value. Use these at your own risk
after careful study of the `statsmodels` source code. For details
visit :sm_docs:`statsmodels.stats.outliers_influence.OLSInfluence.html`
For a catalog of the measures available for `fitgrid.lm()` run
this in Python
.. code-block:: python
>>>fitgrid.utils.lm.list_diagnostics()
.. Warning:: Data diagnostics can be very large and very slow, see
Notes for details.
* By default **all** values of the diagnostics are computed,
this dataframe can be pruned with
:meth:`fitgrid.utils.lm.filter_diagnostic` function.
* By default slow diagnostics are **not** computed, this can be
forced by setting `do_nobs_loop=True`.
Parameters
----------
lm_grid : fitgrid.LMFitGrid
As returned by :meth:`fitgrid.lm`.
diagnostic : string
As implemented in `statsmodels`, e.g., "cooks_distance",
"dffits_internal", "est_std", "dfbetas".
do_nobs_loop : bool
`True` forces slow leave-one-observation-out model refitting.
Returns
-------
diagnostic_df : pandas.DataFrame
Channels are in columns. Model measures are row indexed by
time; data measures add an epoch row index; parameter measures add
a parameter row index.
sm_1_df : pandas.DataFrame
The supplemenatary values `statsmodels` returns, or `None`,
same shape as diagnostic_df.
Notes
-----
* **Size:** `diagnostic_df` values for data measures like
`cooks_distance` and `hat_matrix_diagonal` are the size of the
original data plus a row index and for some data measures like
`dfbetas`, they are the size of the data multiplied by the
number of regressors in the model.
* **Speed:** Leave-one-observation-out (LOOO) model refitting takes
as long as it takes to fit one model multiplied by the number of
observations. This can be intractable for large
datasets. Diagnostic measures calculated from the original fit
like `cooks_distance` and `dffits_internal` are tractable even
for large data sets.
Examples
--------
.. code-block:: python
# fake data
epochs_fg = fitgrid.generate()
lm_grid = fitgrid.lm(
epochs_fg,
LHS=epochs_fg.channels,
RHS='continuous + categorical',
parallel=True,
n_cores=4,
)
# data diagnostic, one dataframe with the values
ess_press, _ = fitgrid.utils.lm.get_diagnostic(
lm_grid,
'ess_press'
)
# Cook's D dataframe AND the p-values statsmodels computes
cooks_Ds, sm_pvals = fitgrid.utils.lm.get_diagnostic(
lm_grid,
'cooks_distance'
)
# this fails because it requires LOOO loop
dfbetas_df, _ = fitgrid.utils.lm.get_diagnostic(
lm_grid,
'dfbetas'
)
# this succeeds by forcing LOOO loop calculation
dfbetas_df, _ = fitgrid.utils.lm.get_diagnostic(
lm_grid,
'dfbetas',
do_nobs_loop=True
)
"""
diag_df = _get_diagnostic(lm_grid, diagnostic, do_nobs_loop)
    # special-case handling is unavoidable because some OLSInfluence
    # methods return 2-tuples and most don't. Extract both parts for
    # those that do.
sm_1_df = None
if diagnostic in ["cooks_distance", "dffits_internal", "dffits"]:
assert len(diag_df.index.names) == 3
assert diag_df.index.names[1] is None # diagnostic index
# values returned as 2nd item in diagnostic 2-ple
sm_1_df = diag_df.loc[pd.IndexSlice[:, 1, :], :].reset_index(
1, drop=True
)
# label the columns
sm_1_df.columns = pd.MultiIndex.from_product(
[[f"{diagnostic}_sm_1"], diag_df.columns],
names=['diagnostic', 'channel'],
)
# diagnostic measures
diag_df = diag_df.loc[pd.IndexSlice[:, 0, :], :].reset_index(
1, drop=True
)
# name unlabeled index from fitgrid
if len(diag_df.index.names) == 3 and diag_df.index.names[2] is None:
diag_df.index.names = [
name if name is not None else f"{diagnostic}_id"
for name in diag_df.index.names
]
# label the columns
diag_df.columns = pd.MultiIndex.from_product(
[[diagnostic], diag_df.columns], names=['diagnostic', 'channel']
)
# wide columns == channels diagnostic dataframe
return diag_df, sm_1_df
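# A sketch (not part of the original module) of the two return values for a
# 2-tuple diagnostic such as 'cooks_distance': diag_df holds the distances
# and sm_1_df the supplementary statsmodels values (here p-values), both with
# ('diagnostic', 'channel') MultiIndex columns. 'channelA' is a hypothetical
# channel name.
#
#     cooks_Ds, sm_pvals = get_diagnostic(lm_grid, 'cooks_distance')
#     # cooks_Ds.columns:  [('cooks_distance', 'channelA'), ...]
#     # sm_pvals.columns:  [('cooks_distance_sm_1', 'channelA'), ...]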
def filter_diagnostic(
diagnostic_df, how, bound_0, bound_1=None, format='long'
):
"""Select a subset of a fitgrid `statsmodels` diagnostic dataframe by value.
Use this to identify time ponts, epochs, parameters, channels with
outlying or potentially influential data.
Parameters
----------
diagnostic_df : pandas.DataFrame
As returned by :meth:`fitgrid.utils.lm.get_diagnostic`
how : {'above', 'below', 'inside', 'outside'}
slice `diagnostic_df` above or below `bound_0` or inside or
outside the closed interval `(bound_0, bound_1)`.
bound_0 : scalar or array-like
`bound_0` is the mandatory boundary for all `how`. See
`pandas.DataFrame.gt` and `pandas.DataFrame.lt` documents
for binary comparisons with dataframes.
bound_1: scalar or array-like
`bound_1 is the mandatory upper bound for `how="inside"` and
`how="outside"`.
format : {'long', 'wide'}
The `long` format pivots the channel columns into a row index
and returns just those times, (epochs, parameters), channels
that pass the filter. The `wide` format returns `filtered_df`
with the same shape as `diagnostic_df`, those datapoints that
pass the filter in their original row, column location, `nans`
elsewhere.
Returns
-------
selected_df : pandas.DataFrame
"""
if how in ["above", "below"]:
try:
bound_0 > 0
except Exception as fail:
fail.args = ("bound_0", *fail.args)
raise fail
if bound_1 is not None:
msg = "bound_1 is ignored with how=above and how=below"
warnings.warn(msg)
bound_1 = None
elif how in ["inside", "outside"]:
# are bounds comparable, legal
try:
bound_1 < bound_0
except Exception as fail:
fail.args = ("bound_1, bound_0", *fail.args)
raise fail
if np.array(bound_1).__lt__(bound_0):
msg = "upper bound_1 value(s) less than bound_0"
raise ValueError(msg)
else:
msg = f"how must be above, below, inside, outside"
raise ValueError(msg)
# all and only four cases, pandas handles failures
if how == "above":
diagnostic_df = diagnostic_df.where(diagnostic_df.gt(bound_0))
if how == "below":
diagnostic_df = diagnostic_df.where(diagnostic_df.lt(bound_0))
if how == "inside":
diagnostic_df = diagnostic_df.where(
diagnostic_df.gt(bound_0) & diagnostic_df.lt(bound_1)
)
if how == "outside":
diagnostic_df = diagnostic_df.where(
diagnostic_df.lt(bound_0) | diagnostic_df.gt(bound_1)
)
if format == "long":
return diagnostic_df.stack(1, dropna=True).sort_index()
elif format == "wide":
return diagnostic_df
else:
raise ValueError(f"format must be long or wide, not {format}")