# Source code for sdmetrics.single_table.privacy.ensemble

"""CategoricalEnsemble module and its attacker."""

from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, PrivacyAttackerModel
from sdmetrics.single_table.privacy.util import majority


class CategoricalEnsembleAttacker(PrivacyAttackerModel):
    """The Categorical ENS (ensemble 'majority vote' classifier) privacy attacker.

    It predicts the majority of the specified sub-attackers' predictions, and the privacy
    score is calculated based on the accuracy of that prediction.
    """

    def __init__(self, attackers=None):
        """Instantiate every sub-attacker that takes part in the vote.

        Args:
            attackers(list):
                Callables (in practice attacker classes) that are each called with no
                arguments to build one sub-attacker. Defaults to no sub-attackers.
        """
        # ``None`` replaces the former ``[]`` default so that no mutable object is shared
        # between calls; omitting the argument still yields an empty ensemble.
        if attackers is None:
            attackers = []

        self.attackers = [attacker() for attacker in attackers]

    def fit(self, synthetic_data, key_fields, sensitive_fields):
        """Fit the CategoricalEnsembleAttacker on the synthetic data.

        Every sub-attacker is fitted with the same arguments.

        Args:
            synthetic_data(pandas.DataFrame):
                The synthetic data table used for adversarial learning.
            key_fields(list[str]):
                The names of the key columns.
            sensitive_fields(list[str]):
                The names of the sensitive columns.
        """
        for attacker in self.attackers:
            attacker.fit(synthetic_data, key_fields, sensitive_fields)

    def predict(self, key_data):
        """Make a prediction of the sensitive data given keys.

        Each sub-attacker predicts independently and the individual predictions are
        combined by ``majority``.

        Args:
            key_data(tuple):
                The key data.

        Returns:
            tuple:
                The predicted sensitive data.
        """
        predictions = [attacker.predict(key_data) for attacker in self.attackers]
        return majority(predictions)


class CategoricalEnsemble(CategoricalPrivacyMetric):
    """The Categorical Ensemble privacy metric. Scored based on the CategoricalEnsembleAttacker.

    When calling `cls.compute`, please make sure to pass in the argument
    `model_kwargs (dict): {attackers: list[PrivacyAttackerModel]}`.
    """

    name = 'Ensemble'
    MODEL = CategoricalEnsembleAttacker
    ACCURACY_BASE = True

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, key_fields=None,
                sensitive_fields=None, model_kwargs=None):
        """Compute this metric.

        This fits the CategoricalEnsembleAttacker on the synthetic data and
        then evaluates it making predictions on the real data.

        A ``key_fields`` column(s) name must be given, either directly or as a first level
        entry in the ``metadata`` dict, which will be used as the key column(s) for the
        attack.

        A ``sensitive_fields`` column(s) name must be given, either directly or as a first level
        entry in the ``metadata`` dict, which will be used as the sensitive_fields column(s)
        for the attack.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict. If not passed, it is built based on the
                real_data fields and dtypes.
            key_fields (list(str)):
                Name of the column(s) to use as the key attributes.
            sensitive_fields (list(str)):
                Name of the column(s) to use as the sensitive attributes.
            model_kwargs (dict):
                Key word arguments of the attacker model. cls.MODEL_KWARGS will be used
                if none is provided. Must contain an ``attackers`` entry holding a
                non-empty list.

        Returns:
            union[float, tuple[float]]:
                Score obtained by the CategoricalEnsembleAttacker when evaluated on the real data.

        Raises:
            ValueError:
                If ``model_kwargs`` has no ``attackers`` entry, or if that entry is not a
                non-empty list.
        """
        if model_kwargs is None:
            model_kwargs = cls.MODEL_KWARGS

        # The errors are raised rather than returned: a returned exception instance
        # would reach the caller as if it were a valid score.
        if 'attackers' not in model_kwargs:  # no attackers specified
            raise ValueError('No attackers specified.')

        attackers = model_kwargs['attackers']
        if not isinstance(attackers, list) or not attackers:  # zero attackers specified
            raise ValueError('Zero attackers specified')

        return super().compute(
            real_data,
            synthetic_data,
            metadata,
            key_fields,
            sensitive_fields,
            model_kwargs
        )