Source code for sdmetrics.single_table.bayesian_network

"""BayesianNetwork based metrics for single table."""

import json
import logging

import numpy as np

from sdmetrics.goal import Goal
from sdmetrics.single_table.base import SingleTableMetric

LOGGER = logging.getLogger(__name__)


class BNLikelihoodBase(SingleTableMetric):
    """BayesianNetwork Likelihood Single Table base metric."""

    @classmethod
    def _likelihoods(cls, real_data, synthetic_data, metadata=None, structure=None):
        try:
            from pomegranate import BayesianNetwork
        except ImportError:
            raise ImportError('Please install pomegranate with `pip install pomegranate`')

        real_data, synthetic_data, metadata = cls._validate_inputs(
            real_data, synthetic_data, metadata)
        structure = metadata.get('structure', structure)
        fields = cls._select_fields(metadata, ('categorical', 'boolean'))

        if not fields:
            return np.full(len(real_data), np.nan)

        LOGGER.debug('Fitting the BayesianNetwork to the real data')
        if structure:
            if isinstance(structure, dict):
                structure = BayesianNetwork.from_json(json.dumps(structure)).structure

            bn = BayesianNetwork.from_structure(real_data[fields].to_numpy(), structure)
        else:
            bn = BayesianNetwork.from_samples(real_data[fields].to_numpy(), algorithm='chow-liu')

        LOGGER.debug('Evaluating likelihood of the synthetic data')
        probabilities = []
        for _, row in synthetic_data[fields].iterrows():
            try:
                probabilities.append(bn.probability([row.to_numpy()]))
            except ValueError:
                probabilities.append(0)

        return np.asarray(probabilities)


[docs]class BNLikelihood(BNLikelihoodBase): """BayesianNetwork Likelihood Single Table metric. This metric fits a BayesianNetwork to the real data and then evaluates how likely it is that the synthetic data belongs to the same distribution. The output is the average probability across all the synthetic rows. Attributes: name (str): Name to use when reports about this metric are printed. goal (sdmetrics.goal.Goal): The goal of this metric. min_value (Union[float, tuple[float]]): Minimum value or values that this metric can take. max_value (Union[float, tuple[float]]): Maximum value or values that this metric can take. """ name = 'BayesianNetwork Likelihood' goal = Goal.MAXIMIZE min_value = 0.0 max_value = 1.0
[docs] @classmethod def compute(cls, real_data, synthetic_data, metadata=None, structure=None): """Compute this metric. This fits a BayesianNetwork to the real data and then evaluates how likely it is that the synthetic data belongs to the same distribution. Real data and synthetic data must be passed as ``pandas.DataFrame`` instances and ``metadata`` as a ``Table`` metadata ``dict`` representation. If no ``metadata`` is given, one will be built from the values observed in the ``real_data``. If a ``structure`` is given, either directly or as a ``structure`` first level entry within the ``metadata`` dict, it is passed to the underlying BayesianNetwork for fitting. Otherwise, the structure is learned from the data using the ``chow-liu`` algorithm. ``structure`` can be passed as either a tuple of tuples representing only the network structure or as a ``dict`` representing a full serialization of a previously fitted ``BayesianNetwork``. In the later scenario, only the ``structure`` will be extracted from the ``BayesianNetwork`` instance, and then a new one will be fitted to the given data. The output is the average probability across all the synthetic rows. Args: real_data (Union[numpy.ndarray, pandas.DataFrame]): The values from the real dataset. synthetic_data (Union[numpy.ndarray, pandas.DataFrame]): The values from the synthetic dataset. metadata (dict): Table metadata dict. If not passed, it is build based on the real_data fields and dtypes. Optionally, the metadata can include a ``structure`` entry with the structure of the Bayesian Network. structure (dict): Optional. BayesianNetwork structure to use when fitting to the real data. If not passed, learn it from the data using the ``chow-liu`` algorith. This is ignored if ``metadata`` is passed and it contains a ``structure`` entry in it. Returns: float: Mean of the probabilities returned by the Bayesian Network. """ return np.mean(cls._likelihoods(real_data, synthetic_data, metadata, structure))
[docs]class BNLogLikelihood(BNLikelihoodBase): """BayesianNetwork Log Likelihood Single Table metric. This metric fits a BayesianNetwork to the real data and then evaluates how likely it is that the synthetic data belongs to the same distribution. The output is the average log probability across all the synthetic rows. Attributes: name (str): Name to use when reports about this metric are printed. goal (sdmetrics.goal.Goal): The goal of this metric. min_value (Union[float, tuple[float]]): Minimum value or values that this metric can take. max_value (Union[float, tuple[float]]): Maximum value or values that this metric can take. """ name = 'BayesianNetwork Log Likelihood' goal = Goal.MAXIMIZE min_value = -np.inf max_value = 0
[docs] @classmethod def compute(cls, real_data, synthetic_data, metadata=None, structure=None): """Compute this metric. This fits a BayesianNetwork to the real data and then evaluates how likely it is that the synthetic data belongs to the same distribution. Real data and synthetic data must be passed as ``pandas.DataFrame`` instances and ``metadata`` as a ``Table`` metadata ``dict`` representation. If no ``metadata`` is given, one will be built from the values observed in the ``real_data``. If a ``structure`` is given, either directly or as a ``structure`` first level entry within the ``metadata`` dict, it is passed to the underlying BayesianNetwork for fitting. Otherwise, the structure is learned from the data using the ``chow-liu`` algorithm. ``structure`` can be passed as either a tuple of tuples representing only the network structure or as a ``dict`` representing a full serialization of a previously fitted ``BayesianNetwork``. In the later scenario, only the ``structure`` will be extracted from the ``BayesianNetwork`` instance, and then a new one will be fitted to the given data. The output is the average log probability across all the synthetic rows. Args: real_data (Union[numpy.ndarray, pandas.DataFrame]): The values from the real dataset. synthetic_data (Union[numpy.ndarray, pandas.DataFrame]): The values from the synthetic dataset. metadata (dict): Table metadata dict. If not passed, it is build based on the real_data fields and dtypes. Optionally, the metadata can include a ``structure`` entry with the structure of the Bayesian Network. structure (dict): Optional. BayesianNetwork structure to use when fitting to the real data. If not passed, learn it from the data using the ``chow-liu`` algorith. This is ignored if ``metadata`` is passed and it contains a ``structure`` entry in it. Returns: float: Mean of the log probabilities returned by the Bayesian Network. """ likelihoods = cls._likelihoods(real_data, synthetic_data, metadata, structure) likelihoods[np.where(likelihoods == 0)] = 1e-8 return np.mean(np.log(likelihoods))
@classmethod def normalize(cls, raw_score): """Normalize the log-likelihood value. Note that this is not the mean likelihood but rather the exponentiation of the mean log-likelihood. Args: raw_score (float): The value of the metric from `compute`. Returns: float: The normalized value of the metric """ return super().normalize(raw_score)