Source code for sdmetrics.single_table.privacy.radius_nearest_neighbor

"""RadiusNearestNeighbor module and its attacker/utilities."""
import numpy as np

from sdmetrics.single_table.privacy.base import NumericalPrivacyMetric, PrivacyAttackerModel
from sdmetrics.single_table.privacy.loss import InverseCDFDistance


class NumericalRadiusNearestNeighborAttacker(PrivacyAttackerModel):
    """The Radius Nearest Neighbor Attacker.

    It will predict the sensitive value to be a weighted mean of the entries in the
    synthetic table. Where this weight is given by a separate function, and typically
    describes the closeness between the given key and the corresponding entry in the table.

    Args:
        weight_func (Class):
            The weight function to use.
        weight_func_kwargs (dict):
            Parameters of the weight function.
    """

    def __init__(self, weight_func=None, weight_func_kwargs=None):
        if weight_func_kwargs is None:
            weight_func_kwargs = {}

        self.weight_func = weight_func(**weight_func_kwargs)
        self.synthetic_data = None
        self.key = None
        self.sensitive_fields = None
        self.key_fields = None

    def fit(self, synthetic_data, key_fields, sensitive_fields):
        """Fit the NumericalRadiusNearestNeighborAttacker on the synthetic data.

        Args:
            synthetic_data(pandas.DataFrame):
                The synthetic data table used for adverserial learning.
            key_fields(list[str]):
                The names of the key columns.
            sensitive_fields(list[str]):
                The names of the sensitive columns.
        """
        self.weight_func.fit(synthetic_data, key_fields)
        self.synthetic_data = synthetic_data
        self.key_fields = key_fields
        self.sensitive_fields = sensitive_fields

    def predict(self, key_data):
        """Make a prediction of the sensitive data given keys.

        Args:
            key_data(tuple):
                The key data.

        Returns:
            tuple:
                The predicted sensitive data.
        """
        weights = 0
        summ = None
        modified = False
        for idx in range(len(self.synthetic_data)):
            ref_key = tuple(self.synthetic_data[self.key_fields].iloc[idx])
            sensitive_data = np.array(self.synthetic_data[self.sensitive_fields].iloc[idx])
            weight = self.weight_func.measure(key_data, ref_key)
            weights += weight
            if not modified:
                summ = sensitive_data.copy()
                modified = True
            else:
                summ += sensitive_data

        if weights == 0:
            return (0,) * len(self.sensitive_fields)
        else:
            return tuple(summ / weights)


class InverseCDFCutoff(InverseCDFDistance):
    """Gives weight = 1 if the Lp averaged distance between the entries is below a given cutoff.

    Formally, suppose given key = (k1,..,kn), while the reference key is (k1',...,kn').
    Suppose the cdfs of each entry are c1,...,cn, resp.
    Then weight = 1 if and only if (sum |c_i(ki) - c_i(ki')|**p) / n <= cutoff**p.
    """

    def __init__(self, p=2, cutoff=0.1):
        self.p = p
        self.cdfs = []
        self.cutoff = cutoff**p

    def fit(self, data, cols):
        """Fits univariate distributions (automatically selected).

        Args:
            data (DataFrame):
                Data, where each column in `cols` is a continuous column.
            cols (list[str]):
                Column names.
        """
        InverseCDFDistance.fit(self, data, cols)
        self.cutoff *= len(cols)

    def measure(self, pred, real):
        """Compute the distance (L_p norm) between the pred and real values.

        This uses the probability integral transform to map the pred/real values
        to a CDF value (between 0.0 and 1.0). Then, it computes the L_p norm
        between the CDF(pred) and CDF(real).

        Args:
            pred (tuple):
                Predicted value(s) corresponding to the columns specified in fit.
            real (tuple):
                Real value(s) corresponding to the columns specified in fit.

        Returns:
            float:
                The L_p norm of the CDF value.
        """
        dist = InverseCDFDistance.measure(self, pred, real)
        return 1 if dist < self.cutoff else 0


[docs]class NumericalRadiusNearestNeighbor(NumericalPrivacyMetric):
    """The Radius Nearest Neighbor privacy metric.

    Scored based on the NumericalRadiusNearestNeighborAttacker.
    """

    name = 'Numerical Radius Nearest Neighbor'
    MODEL = NumericalRadiusNearestNeighborAttacker
    MODEL_KWARGS = {
        'weight_func': InverseCDFCutoff, 'weight_func_kwargs': {'p': 2, 'cutoff': 0.3}
    }