# Source code for sdmetrics.single_table.multi_column_pairs

"""SingleTable metrics based on applying a ColumnPairsMetrics on all the possible column pairs."""

from itertools import combinations

import numpy as np

from sdmetrics import column_pairs
from sdmetrics.single_table.base import SingleTableMetric
from sdmetrics.utils import nested_attrs_meta


class MultiColumnPairsMetric(
    SingleTableMetric,
    metaclass=nested_attrs_meta('column_pairs_metric')
):
    """SingleTableMetric subclass that applies a ColumnPairsMetric on each possible column pair.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairsMetric to apply.
        column_pairs_metric_kwargs (dict):
            Keyword arguments captured by ``__init__``. They are used as default
            keyword arguments for the ColumnPairsMetric when ``compute`` is called
            on an instance. ``None`` at class level.
        field_types (tuple):
            Field types to which the ColumnPairs metric will be applied.
    """

    column_pairs_metric = None
    column_pairs_metric_kwargs = None
    field_types = None

    def __init__(self, column_pairs_metric, **column_pairs_metric_kwargs):
        """Create an instance bound to a specific ColumnPairsMetric.

        Args:
            column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
                ColumnPairsMetric to apply on each pair of columns.
            **column_pairs_metric_kwargs:
                Default keyword arguments to pass to the ColumnPairsMetric
                every time ``compute`` is called on this instance.
        """
        self.column_pairs_metric = column_pairs_metric
        self.column_pairs_metric_kwargs = column_pairs_metric_kwargs
        # Shadow the ``compute`` classmethod so that calling it on an instance
        # uses the metric and kwargs stored on that instance.
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This is done by grouping all the columns that are compatible with the
        underlying ColumnPairs metric in groups of 2 and then evaluating them
        using the ColumnPairs metric.

        The output is the average of the scores obtained, ignoring NaN scores.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric. They take precedence over the
                keyword arguments given to ``__init__``.

        Returns:
            Union[float, tuple[float]]:
                Metric output. NaN if there are fewer than two compatible columns.
        """
        real_data, synthetic_data, metadata = self._validate_inputs(
            real_data, synthetic_data, metadata)

        fields = self._select_fields(metadata, self.field_types)

        # ``column_pairs_metric_kwargs`` is ``None`` when this runs through the
        # ``compute`` classmethod (``self`` is then the class itself).
        # Call-time kwargs override the ones stored at construction time.
        metric_kwargs = {**(self.column_pairs_metric_kwargs or {}), **kwargs}

        values = []
        for columns in combinations(fields, r=2):
            pair = list(columns)
            values.append(
                self.column_pairs_metric.compute(
                    real_data[pair], synthetic_data[pair], **metric_kwargs
                )
            )

        if not values:
            # No column pair could be formed: return NaN directly instead of
            # letting ``np.nanmean`` warn about the mean of an empty slice.
            return np.nan

        return np.nanmean(values)

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        # The class itself plays the role of ``self`` so that the class-level
        # ``column_pairs_metric`` and ``field_types`` are used.
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)

    @classmethod
    def compute_breakdown(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute the breakdown of this metric.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric

        Returns:
            dict:
                Mapping from each sorted ``(column, column)`` tuple to the
                breakdown returned by the column pairs metric for that pair.
        """
        real_data, synthetic_data, metadata = cls._validate_inputs(
            real_data, synthetic_data, metadata)

        fields = cls._select_fields(metadata, cls.field_types)

        breakdown = {}
        for columns in combinations(fields, r=2):
            # Sort the pair so that the dictionary key does not depend on the
            # order in which the fields were selected.
            sorted_columns = tuple(sorted(columns))
            real = real_data[list(sorted_columns)]
            synthetic = synthetic_data[list(sorted_columns)]
            breakdown[sorted_columns] = cls.column_pairs_metric.compute_breakdown(
                real, synthetic, **kwargs)

        return breakdown

    @classmethod
    def normalize(cls, raw_score):
        """Normalize the ``raw_score`` by delegating to the parent class.

        Before delegating, this checks that the minimum value of the metric
        is ``0.0``.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric

        Raises:
            AssertionError:
                If ``min_value`` is not ``0.0`` (only when assertions are enabled).
        """
        assert cls.min_value == 0.0
        return super().normalize(raw_score)


class ContinuousKLDivergence(MultiColumnPairsMetric):
    """MultiColumnPairsMetric based on ColumnPairs ContinuousKLDivergence.

    This approximates the KL divergence by binning the continuous values
    to turn them into categorical values and then computing the relative
    entropy. Afterwards normalizes the value applying ``1 / (1 + KLD)``.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairs ContinuousKLDivergence.
        field_types (tuple):
            Field types to which the ColumnPairs metric will be applied.
    """

    # Only numerical columns are paired up for this metric.
    field_types = ('numerical', )
    column_pairs_metric = column_pairs.statistical.kl_divergence.ContinuousKLDivergence


class DiscreteKLDivergence(MultiColumnPairsMetric):
    """MultiColumnPairsMetric based on ColumnPairs DiscreteKLDivergence.

    This computes the KL divergence and afterwards normalizes the
    value applying ``1 / (1 + KLD)``.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairs DiscreteKLDivergence.
        field_types (tuple):
            Field types to which the ColumnPairs metric will be applied.
    """

    # Only boolean and categorical columns are paired up for this metric.
    field_types = ('boolean', 'categorical')
    column_pairs_metric = column_pairs.statistical.kl_divergence.DiscreteKLDivergence


class ContingencySimilarity(MultiColumnPairsMetric):
    """MultiColumnPairsMetric based on ColumnPairs ContingencySimilarity.

    For every pair of columns, the underlying metric computes the complement
    of the total variation distance between the contingency tables of the
    real and synthetic data.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairs ContingencySimilarity.
        field_types (tuple):
            Field types to which the ColumnPairs metric will be applied.
    """

    column_pairs_metric = column_pairs.statistical.contingency_similarity.ContingencySimilarity
    # Only boolean and categorical columns are paired up for this metric.
    field_types = ('boolean', 'categorical')


class CorrelationSimilarity(MultiColumnPairsMetric):
    """MultiColumnPairsMetric based on ColumnPairs CorrelationSimilarity.

    For every pair of columns, the underlying metric computes the correlation
    based on the specified coefficient, which defaults to 'Pearson'.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairs CorrelationSimilarity.
        field_types (tuple):
            Field types to which the ColumnPairs metric will be applied.
    """

    column_pairs_metric = column_pairs.statistical.correlation_similarity.CorrelationSimilarity
    # Only numerical and datetime columns are paired up for this metric.
    field_types = ('numerical', 'datetime')