Source code for sdmetrics.single_table.multi_column_pairs

"""SingleTable metrics based on applying a ColumnPairsMetric to every possible column pair."""

from itertools import combinations

import numpy as np

from sdmetrics import column_pairs
from sdmetrics.single_table.base import SingleTableMetric
from sdmetrics.utils import nested_attrs_meta


class MultiColumnPairsMetric(
    SingleTableMetric, metaclass=nested_attrs_meta('column_pairs_metric')
):
    """SingleTableMetric subclass that applies a ColumnPairsMetric on each possible column pair.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairsMetric to apply.
        column_pairs_metric_kwargs (dict or None):
            Keyword arguments given to the constructor. They are used as
            defaults for the underlying metric when ``compute`` is called on
            an instance. ``None`` at class level.
        field_types (tuple[str] or None):
            Field types of the columns that are paired up and handed to the
            ColumnPairs metric. Concrete subclasses set this.
    """

    column_pairs_metric = None
    column_pairs_metric_kwargs = None
    field_types = None

    def __init__(self, column_pairs_metric, **column_pairs_metric_kwargs):
        """Create an instance bound to a specific ColumnPairs metric.

        Args:
            column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
                ColumnPairsMetric to apply to each column pair.
            **column_pairs_metric_kwargs:
                Default keyword arguments for the ColumnPairs metric.
        """
        self.column_pairs_metric = column_pairs_metric
        self.column_pairs_metric_kwargs = column_pairs_metric_kwargs
        # The instance attribute shadows the ``compute`` classmethod so that
        # ``instance.compute(...)`` runs ``_compute`` bound to this instance.
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This is done by grouping all the columns that are compatible with the
        underlying ColumnPairs metric in groups of 2 and then evaluating them
        using the ColumnPairs metric.

        The output is the average of the scores obtained, ignoring NaN scores.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric. They take precedence over the
                keyword arguments given to the constructor.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        real_data, synthetic_data, metadata = self._validate_inputs(
            real_data, synthetic_data, metadata)

        fields = self._select_fields(metadata, self.field_types)

        # ``self`` is the class itself when reached through the ``compute``
        # classmethod; there ``column_pairs_metric_kwargs`` is ``None``.
        pair_kwargs = {**(self.column_pairs_metric_kwargs or {}), **kwargs}

        values = []
        for columns in combinations(fields, r=2):
            pair = list(columns)
            score = self.column_pairs_metric.compute(
                real_data[pair], synthetic_data[pair], **pair_kwargs)
            values.append(score)

        return np.nanmean(values)

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        # Reuse the instance implementation, passing the class in as ``self``.
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)

    @classmethod
    def compute_breakdown(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute the breakdown of this metric.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric

        Returns:
            dict:
                Metric output, keyed by the alphabetically sorted tuple of the
                two column names of each pair.
        """
        real_data, synthetic_data, metadata = cls._validate_inputs(
            real_data, synthetic_data, metadata)

        fields = cls._select_fields(metadata, cls.field_types)

        breakdown = {}
        for columns in combinations(fields, r=2):
            # Sort the names so the key does not depend on the field order.
            sorted_columns = tuple(sorted(columns))
            real = real_data[list(sorted_columns)]
            synthetic = synthetic_data[list(sorted_columns)]
            breakdown[sorted_columns] = cls.column_pairs_metric.compute_breakdown(
                real, synthetic, **kwargs)

        return breakdown

    @classmethod
    def normalize(cls, raw_score):
        """Return the `raw_score` as is, since it is already normalized.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric
        """
        # Sanity check only: the metric is expected to have a lower bound of 0.
        assert cls.min_value == 0.0
        return super().normalize(raw_score)
class ContinuousKLDivergence(MultiColumnPairsMetric):
    """Apply the ColumnPairs ContinuousKLDivergence to every pair of numerical columns.

    The underlying metric approximates the KL divergence by binning the
    continuous values to turn them into categorical values and then computing
    the relative entropy. Afterwards it normalizes the value applying
    ``1 / (1 + KLD)``.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairs ContinuousKLDivergence.
        field_types (tuple[str]):
            Field types to which the ColumnPairs metric will be applied.
    """

    column_pairs_metric = column_pairs.statistical.kl_divergence.ContinuousKLDivergence
    field_types = ('numerical',)
class DiscreteKLDivergence(MultiColumnPairsMetric):
    """Apply the ColumnPairs DiscreteKLDivergence to every pair of discrete columns.

    The underlying metric computes the KL divergence and afterwards
    normalizes the value applying ``1 / (1 + KLD)``.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairs DiscreteKLDivergence.
        field_types (tuple[str]):
            Field types to which the ColumnPairs metric will be applied.
    """

    column_pairs_metric = column_pairs.statistical.kl_divergence.DiscreteKLDivergence
    field_types = ('boolean', 'categorical')
class ContingencySimilarity(MultiColumnPairsMetric):
    """Apply the ColumnPairs ContingencySimilarity to every pair of discrete columns.

    The underlying metric computes the complement of the total variation
    distance between the contingency tables of the real and synthetic data.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairs ContingencySimilarity.
        field_types (tuple[str]):
            Field types to which the ColumnPairs metric will be applied.
    """

    column_pairs_metric = column_pairs.statistical.contingency_similarity.ContingencySimilarity
    field_types = ('boolean', 'categorical')


class CorrelationSimilarity(MultiColumnPairsMetric):
    """Apply the ColumnPairs CorrelationSimilarity to every pair of numerical or datetime columns.

    The underlying metric computes the correlation between column pairs based
    on the specified coefficient, which defaults to 'Pearson'.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairs CorrelationSimilarity.
        field_types (tuple[str]):
            Field types to which the ColumnPairs metric will be applied.
    """

    column_pairs_metric = column_pairs.statistical.correlation_similarity.CorrelationSimilarity
    field_types = ('numerical', 'datetime')