Source code for sdmetrics.single_table.efficacy.base

"""Base class for Efficacy metrics for single table datasets."""

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

from sdmetrics.single_table.base import SingleTableMetric
from sdmetrics.utils import HyperTransformer


class MLEfficacyMetric(SingleTableMetric):
    """Base class for Machine Learning Efficacy metrics on single tables.

    These metrics fit a Machine Learning model on the training data and
    then evaluate it making predictions on the test data.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        MODEL:
            Model class to use for the prediction.
        MODEL_KWARGS (dict):
            Keyword arguments to use to create the model instance.
        SCORER (Union[callable, list[callable], tuple[callable]]):
            Default scorer (or scorers) used when ``compute`` is called
            without an explicit ``scorer``.
    """

    name = None
    goal = None
    min_value = None
    max_value = None
    MODEL = None
    MODEL_KWARGS = None
    # Not read anywhere in this class; kept so existing references keep working.
    METRICS = None
    # ``_score`` falls back to this attribute, so it is declared here with a
    # ``None`` placeholder; concrete metrics are expected to override it.
    SCORER = None

    @classmethod
    def _fit_predict(cls, train_data, train_target, test_data, test_target):
        """Fit a model to the training data and make predictions for the test data.

        Args:
            train_data:
                Feature columns the model is fitted on.
            train_target:
                Target values matching ``train_data``.
            test_data:
                Feature columns to make predictions for.
            test_target:
                Unused here; accepted because subclasses use it.

        Returns:
            The predictions made for ``test_data``.
        """
        del test_target  # delete argument which subclasses use but this method does not.

        unique_labels = np.unique(train_target)
        if len(unique_labels) == 1:
            # Only one label was seen during training, so no model is fitted:
            # that label is predicted for every test row.
            return np.full(len(test_data), unique_labels[0])

        # The transformer is fitted on the test data and then applied,
        # unchanged, to the training data.
        ht = HyperTransformer()
        test_data = ht.fit_transform(test_data)
        train_data = ht.transform(train_data)

        # Turn infinite values into missing values (presumably so that the
        # imputer step of the pipeline below can fill them in).
        test_data[np.isin(test_data, [np.inf, -np.inf])] = None
        train_data[np.isin(train_data, [np.inf, -np.inf])] = None

        # Copy the kwargs so the class-level dict is never handed to the model.
        model_kwargs = cls.MODEL_KWARGS.copy() if cls.MODEL_KWARGS else {}
        model = cls.MODEL(**model_kwargs)

        pipeline = Pipeline([
            ('imputer', SimpleImputer()),
            ('scaler', RobustScaler()),
            ('model', model)
        ])
        pipeline.fit(train_data, train_target)

        return pipeline.predict(test_data)

    @classmethod
    def _validate_inputs(cls, test_data, train_data, metadata, target):
        """Validate the inputs and resolve the name of the target column.

        A ``target`` entry inside ``metadata`` takes precedence over the
        ``target`` argument.

        Args:
            test_data:
                The values from the test dataset.
            train_data:
                The values from the training dataset.
            metadata:
                Metadata, forwarded to the parent class validation.
            target (str):
                Name of the target column, or ``None``.

        Returns:
            str:
                Name of the column to use as the target.

        Raises:
            TypeError:
                If no target is given directly nor inside ``metadata``.
        """
        _, _, metadata = super()._validate_inputs(test_data, train_data, metadata)

        # ``compute`` defaults ``metadata`` to None; guard the lookup in case
        # the parent validation hands that None back unchanged.
        if metadata is not None and 'target' in metadata:
            return metadata['target']

        if target is None:
            raise TypeError('`target` must be passed either directly or inside `metadata`')

        return target

    @classmethod
    def _score(cls, scorer, test_target, predictions):
        """Apply the scorer, or each one of the scorers, to the predictions.

        Args:
            scorer (Union[callable, list[callable], tuple[callable], NoneType]):
                Scorer or scorers to apply. If falsy, ``cls.SCORER`` is used.
            test_target:
                The true target values.
            predictions:
                The predicted target values.

        Returns:
            A tuple with one score per scorer if a list or tuple of scorers
            is used, otherwise the value returned by the single scorer.

        Raises:
            TypeError:
                If no scorer is given and the class does not define ``SCORER``.
        """
        scorer = scorer or cls.SCORER
        if scorer is None:
            raise TypeError(
                '`scorer` must be passed either directly or defined as `SCORER` in the metric class'
            )

        if isinstance(scorer, (list, tuple)):
            return tuple(score_function(test_target, predictions) for score_function in scorer)

        return scorer(test_target, predictions)

    @classmethod
    def compute(cls, test_data, train_data, metadata=None, target=None, scorer=None):
        """Compute this metric.

        This fits a Machine Learning model on the training data and
        then evaluates it making predictions on the test data.

        A ``target`` column name must be given, either directly or as a first level
        entry in the ``metadata`` dict, which will be used as the target column for the
        Machine Learning prediction.

        Optionally, a list of ML scorer functions can be given. Otherwise, the default
        one for the type of problem is used.

        Args:
            test_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the test dataset.
            train_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the training dataset.
            metadata (dict):
                Metadata dict. If it has a first level ``target`` entry, that
                entry is used as the target column name. Defaults to ``None``.
            target (str):
                Name of the column to use as the target.
            scorer (Union[callable, list[callable], NoneType]):
                Scorer (or list of scorers) to apply. If not passed, use the default
                one for the type of metric.

        Returns:
            union[float, tuple[float]]:
                Scores obtained by the models when evaluated on the test data.
        """
        target = cls._validate_inputs(test_data, train_data, metadata, target)

        # Work on copies: ``pop`` below removes the target column in place.
        test_data = test_data.copy()
        train_data = train_data.copy()

        test_target = test_data.pop(target)
        train_target = train_data.pop(target)

        predictions = cls._fit_predict(train_data, train_target, test_data, test_target)

        return cls._score(scorer, test_target, predictions)