Source code for sdmetrics.multi_table.multi_single_table

"""MultiTable metrics based on applying SingleTable metrics on all the tables."""

from collections import defaultdict

import numpy as np

from sdmetrics import single_table
from sdmetrics.errors import IncomputableMetricError
from sdmetrics.multi_table.base import MultiTableMetric
from sdmetrics.utils import nested_attrs_meta


class MultiSingleTableMetric(MultiTableMetric, metaclass=nested_attrs_meta('single_table_metric')):
    """MultiTableMetric subclass that applies a SingleTableMetric on each table.

    This class can either be used by creating a subclass that inherits from it
    and sets the SingleTable metric as the ``single_table_metric`` attribute,
    or by creating an instance of this class passing the underlying SingleTable
    metric as an argument.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        single_table_metric (sdmetrics.single_table.base.SingleTableMetric):
            SingleTableMetric to apply.
    """

    single_table_metric = None

    def __init__(self, single_table_metric):
        self.single_table_metric = single_table_metric
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This applies the underlying single table metric to all the tables
        found in the dataset and returns the score obtained by each of them.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict. If not passed, it is built based
                on the real_data fields and dtypes.
            **kwargs:
                Any additional keyword arguments will be passed down to the
                single table metric.

        Returns:
            dict[str, Union[float, tuple[float], dict]]:
                The score or score breakdown obtained for each table.
        """
        if set(real_data.keys()) != set(synthetic_data.keys()):
            raise ValueError('`real_data` and `synthetic_data` must have the same tables')

        if metadata is None:
            metadata = {'tables': defaultdict(type(None))}
        elif not isinstance(metadata, dict):
            metadata = metadata.to_dict()

        scores = {}
        errors = {}
        for table_name, real_table in real_data.items():
            synthetic_table = synthetic_data[table_name]
            table_meta = metadata['tables'][table_name]

            try:
                score_breakdown = self.single_table_metric.compute_breakdown(
                    real_table, synthetic_table, table_meta, **kwargs)
                scores[table_name] = score_breakdown
            except AttributeError:
                score = self.single_table_metric.compute(
                    real_table, synthetic_table, table_meta, **kwargs)
                scores[table_name] = score
            except Exception as error:
                errors[table_name] = error

        if not scores:
            raise IncomputableMetricError(f'Encountered the following errors: {errors}')

        return scores

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This applies the underlying single table metric to all the tables
        found in the dataset and then returns the average score obtained.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict. If not passed, it is built based
                on the real_data fields and dtypes.
            **kwargs:
                Any additional keyword arguments will be passed down to the
                single table metric.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        scores = cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)
        scores = list(scores.values())
        if len(scores) > 0 and isinstance(scores[0], dict):
            all_scores = []
            for table_scores in scores:
                if 'score' in table_scores:
                    all_scores.append(table_scores['score'])
                else:
                    all_scores.extend([
                        result['score']
                        for result in table_scores.values()
                        if 'score' in result
                    ])

            scores = all_scores

        return np.nanmean(scores)

    @classmethod
    def compute_breakdown(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric broken down by tables and columns.

        This applies the underlying single table metric to all the tables
        found in the dataset and then returns the breakdown of the obtained scores.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict. If not passed, it is built based
                on the real_data fields and dtypes.
            **kwargs:
                Any additional keyword arguments will be passed down to the
                single table metric.

        Returns:
            dict[string -> dict[string -> Union[float, tuple[float]]]]:
                A mapping of table name to column metric breakdowns.
        """
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)

    @classmethod
    def normalize(cls, raw_score):
        """Return the `raw_score` as is, since it is already normalized.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric.
        """
        assert cls.min_value == 0.0

        return super().normalize(raw_score)

class CSTest(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable CSTest."""

    single_table_metric = single_table.multi_single_column.CSTest

class KSComplement(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable KSComplement."""

    single_table_metric = single_table.multi_single_column.KSComplement

class StatisticSimilarity(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable StatisticSimilarity."""

    single_table_metric = single_table.multi_single_column.StatisticSimilarity


class BoundaryAdherence(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable BoundaryAdherence."""

    single_table_metric = single_table.multi_single_column.BoundaryAdherence


class MissingValueSimilarity(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable MissingValueSimilarity."""

    single_table_metric = single_table.multi_single_column.MissingValueSimilarity


class CategoryCoverage(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable CategoryCoverage."""

    single_table_metric = single_table.multi_single_column.CategoryCoverage


class TVComplement(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable TVComplement."""

    single_table_metric = single_table.multi_single_column.TVComplement


class RangeCoverage(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable RangeCoverage."""

    single_table_metric = single_table.multi_single_column.RangeCoverage


class CorrelationSimilarity(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable CorrelationSimilarity."""

    single_table_metric = single_table.multi_column_pairs.CorrelationSimilarity


class ContingencySimilarity(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable ContingencySimilarity."""

    single_table_metric = single_table.multi_column_pairs.ContingencySimilarity
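
The ``**kwargs`` documented in ``compute`` above are forwarded, table by table, to the underlying single-table metric. A minimal sketch with ``StatisticSimilarity``, assuming the wrapped metric accepts a ``statistic`` keyword as the single-column version does; the table, column, and values are illustrative:

    import pandas as pd

    from sdmetrics.multi_table.multi_single_table import StatisticSimilarity

    real_data = {'transactions': pd.DataFrame({'amount': [10.0, 12.5, 9.9, 11.3]})}
    synthetic_data = {'transactions': pd.DataFrame({'amount': [10.2, 12.0, 10.1, 11.0]})}

    # ``statistic`` is not consumed here; it is passed down to the wrapped
    # single-table StatisticSimilarity metric for every table.
    score = StatisticSimilarity.compute(real_data, synthetic_data, statistic='mean')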

class LogisticDetection(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable LogisticDetection."""

    single_table_metric = single_table.detection.LogisticDetection

class SVCDetection(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable SVCDetection."""

    single_table_metric = single_table.detection.SVCDetection

class BNLikelihood(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable BNLikelihood."""

    single_table_metric = single_table.bayesian_network.BNLikelihood

class NewRowSynthesis(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable NewRowSynthesis."""

    single_table_metric = single_table.new_row_synthesis.NewRowSynthesis

class BNLogLikelihood(MultiSingleTableMetric):
    """MultiSingleTableMetric based on SingleTable BNLogLikelihood."""

    single_table_metric = single_table.bayesian_network.BNLogLikelihood

    @classmethod
    def normalize(cls, raw_score):
        """Normalize the log-likelihood value.

        Note that this is not the mean likelihood but rather the exponentiation
        of the mean log-likelihood.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric.
        """
        return super().normalize(raw_score)
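
A minimal end-to-end sketch of the averaged score versus the per-table breakdown, using the KSComplement subclass defined above; the tables, columns, and values are made up for illustration:

    import pandas as pd

    from sdmetrics.multi_table.multi_single_table import KSComplement

    real_data = {
        'users': pd.DataFrame({'age': [25, 30, 35, 40]}),
        'sessions': pd.DataFrame({'duration': [1.2, 3.4, 2.2, 5.1]}),
    }
    synthetic_data = {
        'users': pd.DataFrame({'age': [26, 29, 36, 41]}),
        'sessions': pd.DataFrame({'duration': [1.0, 3.0, 2.5, 5.0]}),
    }

    # ``compute`` averages the per-column scores across all tables into one float.
    score = KSComplement.compute(real_data, synthetic_data)

    # ``compute_breakdown`` keeps the table -> column -> score structure instead.
    breakdown = KSComplement.compute_breakdown(real_data, synthetic_data)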