Source code for sdv.tabular.copulagan

"""Combination of GaussianCopula transformation and GANs."""

from rdt import HyperTransformer
from rdt.transformers import GaussianNormalizer

from sdv.tabular.ctgan import CTGAN
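

# --- Illustrative sketch (not part of the original module) -------------------
# ``CopulaGAN`` relies on RDT's ``GaussianNormalizer`` to map each
# non-categorical column into a standard normal space before handing the
# table to CTGAN, and to reverse that mapping after sampling. A minimal,
# self-contained version of that round trip, assuming scipy and an
# already-fitted ``scipy.stats`` distribution (the helper names below are
# hypothetical):

import numpy as np
from scipy import stats


def _to_standard_normal(values, dist):
    """Apply ``dist``'s CDF, then the standard normal inverse CDF."""
    u = np.clip(dist.cdf(values), 1e-9, 1 - 1e-9)  # map data into (0, 1)
    return stats.norm.ppf(u)  # map the uniforms to N(0, 1)


def _from_standard_normal(values, dist):
    """Reverse the mapping: standard normal CDF, then ``dist``'s inverse CDF."""
    return dist.ppf(stats.norm.cdf(values))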


class CopulaGAN(CTGAN):
    """Combination of GaussianCopula transformation and GANs.

    This model extends the ``CTGAN`` model to add the flexibility of the
    GaussianCopula transformations provided by the ``GaussianNormalizer``
    from ``RDT``.

    Overall, the fitting process consists of the following steps:

    1. Transform each non-categorical variable from the input data using a
       ``GaussianNormalizer``:

       i. If not specified, find out the distribution of each of the
          variables from the input dataset.
       ii. Transform each variable to a standard normal space by applying
           the CDF of the corresponding distribution and then applying the
           inverse CDF of a standard normal distribution.

    2. Fit CTGAN with the transformed table.

    And the process of sampling is:

    1. Sample using CTGAN.
    2. Reverse the previous transformation by applying the CDF of a standard
       normal distribution and then inverting the CDF of the distribution
       that corresponds to each variable.

    The arguments of this model are the same as for CTGAN except for two
    additional arguments, ``field_distributions`` and ``default_distribution``,
    which make it possible to define specific transformations for individual
    fields as well as which distribution to use by default when no specific
    distribution has been selected.

    Distributions can be passed as a ``copulas`` univariate instance or as one
    of the following string values:

    * ``gaussian``: Use a Gaussian distribution.
    * ``gamma``: Use a Gamma distribution.
    * ``beta``: Use a Beta distribution.
    * ``student_t``: Use a Student T distribution.
    * ``gaussian_kde``: Use a GaussianKDE distribution. This model is
      non-parametric, so using this will make ``get_parameters`` unusable.
    * ``truncated_gaussian``: Use a Truncated Gaussian distribution.

    Args:
        field_names (list[str]):
            List of names of the fields that need to be modeled and included
            in the generated output data. Any additional fields found in the
            data will be ignored and will not be included in the generated
            output. If ``None``, all the fields found in the data are used.
        field_types (dict[str, dict]):
            Dictionary specifying the data types and subtypes of the fields
            that will be modeled. Field types and subtypes combinations must
            be compatible with the SDV Metadata Schema.
        field_transformers (dict[str, str]):
            Dictionary specifying which transformers to use for each field.
            Available transformers are:

            * ``FloatFormatter``: Uses a ``FloatFormatter`` for numerical data.
            * ``FrequencyEncoder``: Uses a ``FrequencyEncoder`` without
              gaussian noise.
            * ``FrequencyEncoder_noised``: Uses a ``FrequencyEncoder`` adding
              gaussian noise.
            * ``OneHotEncoder``: Uses a ``OneHotEncoder``.
            * ``LabelEncoder``: Uses a ``LabelEncoder`` without gaussian noise.
            * ``LabelEncoder_noised``: Uses a ``LabelEncoder`` adding gaussian
              noise.
            * ``BinaryEncoder``: Uses a ``BinaryEncoder``.
            * ``UnixTimestampEncoder``: Uses a ``UnixTimestampEncoder``.
        anonymize_fields (dict[str, str]):
            Dict specifying which fields to anonymize and what faker category
            they belong to.
        primary_key (str):
            Name of the field which is the primary key of the table.
        constraints (list[Constraint, dict]):
            List of Constraint objects or dicts.
        table_metadata (dict or metadata.Table):
            Table metadata instance or dict representation. If given alongside
            any other metadata-related arguments, an exception will be raised.
            If not given at all, it will be built using the other arguments or
            learned from the data.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        generator_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A
            Residual Layer will be created for each one of the values
            provided. Defaults to (256, 256).
        discriminator_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator
            Layers. A Linear Layer will be created for each one of the values
            provided. Defaults to (256, 256).
        generator_lr (float):
            Learning rate for the generator. Defaults to 2e-4.
        generator_decay (float):
            Generator weight decay for the Adam Optimizer. Defaults to 1e-6.
        discriminator_lr (float):
            Learning rate for the discriminator. Defaults to 2e-4.
        discriminator_decay (float):
            Discriminator weight decay for the Adam Optimizer. Defaults
            to 1e-6.
        batch_size (int):
            Number of data samples to process in each step. Defaults to 500.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            Defaults to 1.
        verbose (bool):
            Whether to print fit progress on stdout. Defaults to ``False``.
        epochs (int):
            Number of training epochs. Defaults to 300.
        cuda (bool or str):
            If ``True``, use CUDA. If a ``str``, use the indicated device.
            If ``False``, do not use cuda at all.
        field_distributions (dict):
            Optionally specify a dictionary that maps the name of each field
            to the distribution that must be used in it. Fields that are not
            specified in the input ``dict`` will be modeled using the default
            distribution. Defaults to ``None``.
        default_distribution (copulas.univariate.Univariate or str):
            Distribution to use on the fields for which no specific
            distribution has been given. Defaults to ``truncated_gaussian``.
        learn_rounding_scheme (bool):
            Define rounding scheme for ``FloatFormatter``. If ``True``, the
            data returned by ``reverse_transform`` will be rounded to that
            place. Defaults to ``True``.
        enforce_min_max_values (bool):
            Specify whether or not to clip the data returned by
            ``reverse_transform`` of the numerical transformer,
            ``FloatFormatter``, to the min and max values seen during ``fit``.
            Defaults to ``True``.
    """

    DEFAULT_DISTRIBUTION = 'truncated_gaussian'

    _field_distributions = None
    _default_distribution = None
    _hyper_transformer = None

    def __init__(self, field_names=None, field_types=None, field_transformers=None,
                 anonymize_fields=None, primary_key=None, constraints=None,
                 table_metadata=None, embedding_dim=128, generator_dim=(256, 256),
                 discriminator_dim=(256, 256), generator_lr=2e-4, generator_decay=1e-6,
                 discriminator_lr=2e-4, discriminator_decay=1e-6, batch_size=500,
                 discriminator_steps=1, log_frequency=True, verbose=False, epochs=300,
                 cuda=True, field_distributions=None, default_distribution=None,
                 learn_rounding_scheme=True, enforce_min_max_values=True):
        super().__init__(
            field_names=field_names,
            primary_key=primary_key,
            field_types=field_types,
            field_transformers=field_transformers,
            anonymize_fields=anonymize_fields,
            constraints=constraints,
            table_metadata=table_metadata,
            embedding_dim=embedding_dim,
            generator_dim=generator_dim,
            discriminator_dim=discriminator_dim,
            generator_lr=generator_lr,
            generator_decay=generator_decay,
            discriminator_lr=discriminator_lr,
            discriminator_decay=discriminator_decay,
            batch_size=batch_size,
            discriminator_steps=discriminator_steps,
            log_frequency=log_frequency,
            verbose=verbose,
            epochs=epochs,
            cuda=cuda,
            learn_rounding_scheme=learn_rounding_scheme,
            enforce_min_max_values=enforce_min_max_values,
        )

        self._field_distributions = field_distributions or dict()
        self._default_distribution = default_distribution or self.DEFAULT_DISTRIBUTION
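
    # ``field_distributions`` also accepts ``copulas`` univariate instances in
    # place of the string names, e.g. (hypothetical column name):
    #
    #     from copulas.univariate import GammaUnivariate
    #
    #     model = CopulaGAN(field_distributions={'salary': GammaUnivariate()})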

    def get_distributions(self):
        """Get the marginal distributions used by this CopulaGAN.

        Returns:
            dict:
                Dictionary containing the distributions used or detected
                for each column.
        """
        return {
            transformer.column_prefix: transformer._univariate.to_dict()['type']
            for transformer in self._hyper_transformer._transformers_sequence
            if isinstance(transformer, GaussianNormalizer)
        }

    def _fit(self, table_data):
        """Fit the model to the table.

        Args:
            table_data (pandas.DataFrame):
                Data to be learned.
        """
        distributions = self._field_distributions
        fields = self._metadata.get_fields()
        sdtypes = {}
        transformers = {}
        for field in table_data:
            field_sdtype = fields.get(field, {}).get('type')
            if field in fields and field_sdtype != 'categorical':
                sdtypes[field] = 'numerical'
                transformers[field] = GaussianNormalizer(
                    model_missing_values=True,
                    distribution=distributions.get(field, self._default_distribution)
                )
            else:
                sdtypes[field] = field_sdtype or 'categorical'
                transformers[field] = None

        self._hyper_transformer = HyperTransformer()
        self._hyper_transformer.set_config({'transformers': transformers, 'sdtypes': sdtypes})
        table_data = self._hyper_transformer.fit_transform(table_data[list(transformers)])
        super()._fit(table_data)

    def _sample(self, num_rows, conditions=None):
        """Sample the indicated number of rows from the model.

        Args:
            num_rows (int):
                Number of rows to sample.
            conditions (dict):
                If specified, this dictionary maps column names to the column
                value. Then, this method generates ``num_rows`` samples, all
                of which are conditioned on the given variables.

        Returns:
            pandas.DataFrame:
                Sampled data.
        """
        sampled = super()._sample(num_rows, conditions)
        return self._hyper_transformer.reverse_transform(sampled)
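

# A minimal end-to-end usage sketch (not part of the original module). It
# assumes a pandas DataFrame with made-up column names; ``epochs`` is kept
# low only so the demo runs quickly.
if __name__ == '__main__':
    import pandas as pd

    demo_data = pd.DataFrame({
        'age': np.random.randint(18, 90, size=500),
        'salary': np.random.gamma(shape=2.0, scale=20000.0, size=500),
        'department': np.random.choice(['hr', 'eng', 'sales'], size=500),
    })

    model = CopulaGAN(
        epochs=10,  # the default is 300
        field_distributions={'salary': 'gamma'},
        default_distribution='truncated_gaussian',
    )
    model.fit(demo_data)

    # Maps each non-categorical column to the distribution used to model it.
    print(model.get_distributions())

    # Sample 100 synthetic rows with the same schema as ``demo_data``.
    print(model.sample(100).head())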