Source code for sdmetrics.single_table.multi_single_column

"""SingleTable metrics based on applying a SingleColumnMetric on all the columns."""

import numpy as np

from sdmetrics import single_column
from sdmetrics.single_table.base import SingleTableMetric
from sdmetrics.utils import get_columns_from_metadata, nested_attrs_meta


class MultiSingleColumnMetric(SingleTableMetric,
                              metaclass=nested_attrs_meta('single_column_metric')):
    """SingleTableMetric subclass that applies a SingleColumnMetric on each column.

    This class can either be used by creating a subclass that inherits from it and
    sets the SingleColumn Metric as the ``single_column_metric`` attribute,
    or by creating an instance of this class passing the underlying SingleColumn
    metric as an argument.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        single_column_metric (sdmetrics.single_column.base.SingleColumnMetric):
            SingleColumn metric to apply.
        single_column_metric_kwargs (dict):
            Keyword arguments forwarded to the SingleColumn metric on every call.
            ``None`` at class level, which is treated as "no extra arguments".
        field_types (dict):
            Field types to which the SingleColumn metric will be applied.
    """

    single_column_metric = None
    single_column_metric_kwargs = None
    field_types = None

    # NOTE(review): ``compute_breakdown`` and ``normalize`` are classmethods, so
    # when they are called on an instance they read the class-level attributes
    # above, not the values stored by ``__init__`` -- confirm this is intended.

    def __init__(self, single_column_metric=None, **single_column_metric_kwargs):
        """Create an instance bound to a specific SingleColumn metric.

        On instances, ``compute`` is rebound to ``_compute``. Calling
        ``instance.compute(...)`` therefore returns the per-column mapping
        produced by ``_compute`` instead of the averaged value returned by the
        ``compute`` classmethod.

        Args:
            single_column_metric (sdmetrics.single_column.base.SingleColumnMetric):
                SingleColumn metric to apply to each column.
            **single_column_metric_kwargs:
                Keyword arguments passed to the SingleColumn metric on every call.
        """
        self.single_column_metric = single_column_metric
        self.single_column_metric_kwargs = single_column_metric_kwargs
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, store_errors=False, **kwargs):
        """Compute this metric for all columns.

        This is done by computing the underlying SingleColumn metric to all the
        columns that are compatible with it.

        The output is a mapping of column name to the breakdown of that column.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            store_errors (bool):
                Whether or not to store any metric computation errors in the results.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single column metric

        Returns:
            Dict[string -> dict]:
                A mapping of column name to the breakdown returned by the single
                column metric. Metadata columns that are not selected for this
                metric map to ``{'score': nan}``. Columns whose computation
                failed map to ``{'error': exception}`` when ``store_errors`` is
                ``True``.

        Raises:
            Exception:
                Whatever the single column metric raised, when ``store_errors``
                is ``False``.
        """
        real_data, synthetic_data, metadata = self._validate_inputs(
            real_data, synthetic_data, metadata)

        fields = self._select_fields(metadata, self.field_types)
        # Build the set once: it is used both to find the columns this metric
        # does not apply to and for the membership test inside the loop.
        selected_fields = set(fields)
        invalid_cols = set(get_columns_from_metadata(metadata).keys()) - selected_fields

        # Non-applicable columns are reported with a NaN score.
        scores = {col: {'score': np.nan} for col in invalid_cols}
        metric_kwargs = self.single_column_metric_kwargs or {}

        for column_name, real_column in real_data.items():
            if column_name not in selected_fields:
                continue

            real_column = real_column.to_numpy()
            synthetic_column = synthetic_data[column_name].to_numpy()

            try:
                scores[column_name] = self.single_column_metric.compute_breakdown(
                    real_column,
                    synthetic_column,
                    **metric_kwargs,
                    **kwargs
                )
            except Exception as error:
                if not store_errors:
                    # Bare ``raise`` re-raises the active exception unchanged.
                    raise

                scores[column_name] = {'error': error}

        return scores

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This is done by computing the underlying SingleColumn metric to all the
        columns that are compatible with it.

        The output is the average of the scores obtained, ignoring NaN scores.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single column metric

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        # The class itself is passed as ``self`` so that class-level attributes
        # (``single_column_metric``, ``field_types``) are the ones used.
        scores = cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)
        return np.nanmean([breakdown['score'] for breakdown in scores.values()])

    @classmethod
    def compute_breakdown(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric broken down by column.

        This is done by computing the underlying SingleColumn metric to all the
        columns that are compatible with it. Errors raised while scoring a column
        are stored in that column's entry instead of being propagated.

        The output is a mapping of column to the column's breakdown.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single column metric

        Returns:
            Dict[string -> dict]:
                A mapping of column name to ``{'score': ...}`` or, when the
                computation failed, ``{'error': exception}``.
        """
        return cls._compute(
            cls, real_data, synthetic_data, metadata, store_errors=True, **kwargs)

    @classmethod
    def normalize(cls, raw_score):
        """Return the normalized version of ``raw_score``.

        The normalization itself is delegated to the parent class. This method
        only asserts beforehand that the metric's ``min_value`` is ``0.0``.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric

        Raises:
            AssertionError:
                If ``min_value`` is not ``0.0`` (only when assertions are enabled).
        """
        assert cls.min_value == 0.0
        return super().normalize(raw_score)
class CSTest(MultiSingleColumnMetric):
    """MultiSingleColumnMetric based on SingleColumn CSTest.

    This class applies the single column ``CSTest`` metric to every boolean
    and categorical column found in the table and then returns the average
    of all the scores obtained.
    """

    single_column_metric = single_column.statistical.CSTest
    field_types = ('boolean', 'categorical')
class KSComplement(MultiSingleColumnMetric):
    """MultiSingleColumnMetric based on SingleColumn KSComplement.

    This class applies the single column ``KSComplement`` metric to every
    numerical and datetime column found in the table and then returns the
    average of all the scores obtained.
    """

    single_column_metric = single_column.statistical.KSComplement
    field_types = ('numerical', 'datetime')
class StatisticSimilarity(MultiSingleColumnMetric):
    """MultiSingleColumnMetric based on SingleColumn StatisticSimilarity.

    Apply the desired statistic to compare the real and synthetic data.
    The metric is applied to the numerical and datetime columns.
    """

    field_types = ('numerical', 'datetime')
    single_column_metric = single_column.statistical.StatisticSimilarity


class BoundaryAdherence(MultiSingleColumnMetric):
    """MultiSingleColumnMetric based on SingleColumn BoundaryAdherence.

    Compute the fraction of rows in the synthetic data that are within the min and max
    bounds of the real data. The metric is applied to the numerical and datetime columns.
    """

    field_types = ('numerical', 'datetime')
    single_column_metric = single_column.statistical.BoundaryAdherence


class MissingValueSimilarity(MultiSingleColumnMetric):
    """MultiSingleColumnMetric based on SingleColumn MissingValueSimilarity.

    Compare the percentage of missing values between the real and synthetic data.
    The metric is applied to the numerical and datetime columns.
    """

    field_types = ('numerical', 'datetime')
    single_column_metric = single_column.statistical.MissingValueSimilarity


class CategoryCoverage(MultiSingleColumnMetric):
    """MultiSingleColumnMetric based on SingleColumn CategoryCoverage.

    Compute the fraction of real data categories that are present in the synthetic data.
    The metric is applied to the categorical and boolean columns.
    """

    field_types = ('categorical', 'boolean')
    single_column_metric = single_column.statistical.CategoryCoverage


class TVComplement(MultiSingleColumnMetric):
    """MultiSingleColumnMetric based on SingleColumn TVComplement.

    Compute the complement of the total variation distance between the real and
    synthetic data. The metric is applied to the categorical and boolean columns.
    """

    field_types = ('categorical', 'boolean')
    single_column_metric = single_column.statistical.TVComplement


class RangeCoverage(MultiSingleColumnMetric):
    """MultiSingleColumnMetric based on SingleColumn RangeCoverage.

    Measure how much of the range of values observed in each real column is
    covered by the corresponding synthetic column. The metric is applied to
    the numerical and datetime columns.
    """

    field_types = ('numerical', 'datetime')
    single_column_metric = single_column.statistical.RangeCoverage