Source code for sdmetrics.single_table.gaussian_mixture

"""GaussianMixture based metrics for single table."""
import itertools
import logging

import numpy as np
from sklearn.mixture import GaussianMixture

from sdmetrics.errors import IncomputableMetricError
from sdmetrics.goal import Goal
from sdmetrics.single_table.base import SingleTableMetric

LOGGER = logging.getLogger(__name__)


class GMLogLikelihood(SingleTableMetric):
    """GaussianMixture Single Table metric.

    This metric fits GaussianMixture models to the real data and then evaluates
    how likely it is that the synthetic data belongs to the same distribution
    as the real data.

    With the default arguments of ``compute``, the number of components is
    searched in the range 1 to 30 (inclusive) using the ``'diag'`` covariance
    type, keeping the combination with the lowest BIC on the real data. A GMM
    with the selected parameters is then fitted and scored until 3 scores have
    been collected.

    The output is the average log likelihood across all the GMMs evaluated.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'GaussianMixture Log Likelihood'
    goal = Goal.MAXIMIZE
    min_value = -np.inf
    max_value = np.inf

    @classmethod
    def _select_gmm(cls, real_data, n_components, covariance_type):
        """Select the GMM parameters that yield the lowest BIC on the real data.

        Args:
            real_data:
                Data the candidate GMMs are fitted on.
            n_components (Union[int, tuple[int]]):
                Either a single number of components, or a ``(min, max)`` pair
                describing an inclusive range to search.
            covariance_type (Union[str, tuple[str]]):
                Either a single covariance type, or an iterable of covariance
                types to search.

        Returns:
            tuple:
                The selected ``(n_components, covariance_type)`` pair.

        Raises:
            IncomputableMetricError:
                If none of the candidate combinations could be fitted.
        """
        if isinstance(n_components, int):
            min_comp = max_comp = n_components
        else:
            min_comp, max_comp = n_components

        if isinstance(covariance_type, str):
            covariance_type = (covariance_type, )

        combinations = list(itertools.product(range(min_comp, max_comp + 1), covariance_type))

        # Nothing to compare against: return the only candidate without fitting it.
        if len(combinations) == 1:
            return combinations[0]

        lowest_bic = np.inf
        best = None
        for candidate_components, candidate_covariance in combinations:
            gmm = GaussianMixture(
                n_components=candidate_components,
                covariance_type=candidate_covariance,
            )
            try:
                gmm.fit(real_data)
                bic = gmm.bic(real_data)
            except ValueError as error:
                # Best effort: a candidate that cannot be fitted is skipped, but the
                # reason is logged instead of being silently discarded.
                LOGGER.debug(
                    'Skipping GMM with n_components=%s, covariance_type=%s: %s',
                    candidate_components, candidate_covariance, error
                )
                continue

            LOGGER.debug('%s, %s: %s', candidate_components, candidate_covariance, bic)
            if bic < lowest_bic:
                lowest_bic = bic
                best = (candidate_components, candidate_covariance)

        if best is None:
            metric_name = cls.name
            raise IncomputableMetricError(f'{metric_name}: Unable to fit GaussianMixture')

        return best

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, n_components=(1, 30),
                covariance_type='diag', iterations=3, retries=3):
        """Compute this metric.

        This fits multiple GaussianMixture models to the real data and then
        evaluates how likely it is that the synthetic data belongs to the same
        distribution as the real data.

        By default, GaussianMixture models will search for the optimal number of
        components and covariance type using the real data and then evaluate
        the likelihood of the synthetic data using those arguments 3 times.

        Real data and synthetic data must be passed as ``pandas.DataFrame`` instances
        and ``metadata`` as a ``Table`` metadata ``dict`` representation.

        If no ``metadata`` is given, one will be built from the values observed
        in the ``real_data``.

        The output is the average log likelihood across all the GMMs evaluated.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            n_components (Union[int, tuple[int]]):
                Number of components to use for the GMM. If a tuple with
                2 integers is passed, the optimal number of components within
                the range will be searched. Defaults to (1, 30)
            covariance_type (Union[str, tuple[str]]):
                Covariance type to use for the GMM. If multiple values are
                passed, the best one will be searched. Defaults to ``'diag'``.
            iterations (int):
                Number of scores to collect before averaging. Defaults to 3.
            retries (int):
                Multiplier for the attempt budget: at most
                ``iterations * retries`` fit-and-score attempts are made in
                total, and attempts that raise ``ValueError`` are skipped.
                Defaults to 3.

        Returns:
            float:
                Average score returned by the GaussianMixtures.

        Raises:
            IncomputableMetricError:
                If the parameter search cannot fit any GMM, or if every
                fit-and-score attempt fails.
        """
        real_data, synthetic_data, metadata = cls._validate_inputs(
            real_data, synthetic_data, metadata)
        # Helper inherited from the base class; presumably returns the names of the
        # numerical columns - confirm against SingleTableMetric.
        fields = cls._select_fields(metadata, 'numerical')

        real_data = real_data[fields]
        synthetic_data = synthetic_data[fields]

        # Missing values are replaced by the mean of their own column, computed
        # separately for each dataset.
        real_data = real_data.fillna(real_data.mean())
        synthetic_data = synthetic_data.fillna(synthetic_data.mean())

        # A parameter search is only needed when at least one argument is not
        # already a single concrete value.
        if not isinstance(n_components, int) or not isinstance(covariance_type, str):
            LOGGER.debug('Selecting best GMM parameters')
            best_gmm = cls._select_gmm(real_data, n_components, covariance_type)
            # The _select_gmm defined in this class raises instead of returning None;
            # this guard is kept defensively (e.g. for an override that returns None).
            if best_gmm is None:
                return np.nan

            n_components, covariance_type = best_gmm
            LOGGER.debug('n_components=%s and covariance_type=%s selected',
                         n_components, covariance_type)

        scores = []
        for _ in range(iterations * retries):
            try:
                gmm = GaussianMixture(n_components, covariance_type=covariance_type)
                gmm.fit(real_data)
                scores.append(gmm.score(synthetic_data))
            except ValueError as error:
                # Best effort: the failed attempt is skipped, but the reason is logged
                # instead of being silently discarded.
                LOGGER.debug('GaussianMixture attempt failed: %s', error)
                continue

            if len(scores) >= iterations:
                break

        if not scores:
            metric_name = cls.name
            raise IncomputableMetricError(f'{metric_name}: Exhausted retries for GaussianMixture')

        return np.mean(scores)

    @classmethod
    def normalize(cls, raw_score):
        """Normalize the log-likelihood value.

        Notice that this is not the mean likelihood.

        The computation is delegated unchanged to the parent class.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric
        """
        return super().normalize(raw_score)