"""Functions to load demo datasets."""

import io
import logging
import os
import urllib.request
from datetime import datetime, timedelta
from zipfile import ZipFile

import numpy as np
import pandas as pd
import scipy as sp
from faker import Faker

from sdv.metadata import Metadata, Table

LOGGER = logging.getLogger(__name__)

# Metadata describing the three-table relational demo (users -> sessions -> transactions).
DEMO_METADATA = {
    'tables': {
        'users': {
            'primary_key': 'user_id',
            'fields': {
                'user_id': {
                    'type': 'id',
                    'subtype': 'integer'
                },
                'country': {
                    'type': 'categorical'
                },
                'gender': {
                    'type': 'categorical'
                },
                'age': {
                    'type': 'numerical',
                    'subtype': 'integer'
                }
            }
        },
        'sessions': {
            'primary_key': 'session_id',
            'fields': {
                'session_id': {
                    'type': 'id',
                    'subtype': 'integer'
                },
                'user_id': {
                    'ref': {
                        'field': 'user_id',
                        'table': 'users'
                    },
                    'type': 'id',
                    'subtype': 'integer'
                },
                'device': {
                    'type': 'categorical'
                },
                'os': {
                    'type': 'categorical'
                },
                'minutes': {
                    'type': 'numerical',
                    'subtype': 'integer'
                }
            }
        },
        'transactions': {
            'primary_key': 'transaction_id',
            'fields': {
                'transaction_id': {
                    'type': 'id',
                    'subtype': 'integer'
                },
                'session_id': {
                    'ref': {
                        'field': 'session_id',
                        'table': 'sessions'
                    },
                    'type': 'id',
                    'subtype': 'integer'
                },
                'timestamp': {
                    'type': 'datetime',
                    'format': '%Y-%m-%dT%H:%M'
                },
                'amount': {
                    'type': 'numerical',
                    'subtype': 'float'
                },
                'cancelled': {
                    'type': 'boolean'
                }
            }
        }
    }
}

DATA_PATH = os.path.join(os.path.dirname(__file__), 'data')
DATA_URL = 'https://sdv-datasets.s3.amazonaws.com/{}.zip'
DATASETS_URL = 'https://sdv-datasets.s3.amazonaws.com/datasets.csv'

# Column layouts used when the sampler produces no rows for a child table.
_SESSION_COLUMNS = ['user_id', 'device', 'os', 'minutes', 'num_transactions']
_TRANSACTION_COLUMNS = ['session_id', 'timestamp', 'amount', 'cancelled']


def _dtypes64(table):
    """Upcast ``int32``/``float32`` columns of ``table`` to 64 bits.

    The table is modified in place and also returned for convenience.
    """
    for name, column in table.items():
        if column.dtype == np.int32:
            table[name] = column.astype('int64')
        elif column.dtype == np.float32:
            table[name] = column.astype('float64')

    return table


def _download(dataset_name, data_path):
    """Download the zip of ``dataset_name`` and extract it into ``data_path``."""
    url = DATA_URL.format(dataset_name)

    LOGGER.info('Downloading dataset %s from %s', dataset_name, url)
    # Close the HTTP response as soon as the payload is in memory.
    with urllib.request.urlopen(url) as response:
        bytes_io = io.BytesIO(response.read())

    LOGGER.info('Extracting dataset into %s', data_path)
    with ZipFile(bytes_io) as zf:
        zf.extractall(data_path)


def _get_dataset_path(dataset_name, data_path):
    """Return the local folder of ``dataset_name``, downloading it if missing."""
    # ``exist_ok`` avoids a failure if the folder appears between check and creation.
    os.makedirs(data_path, exist_ok=True)

    dataset_path = os.path.join(data_path, dataset_name)
    if not os.path.exists(dataset_path):
        _download(dataset_name, data_path)

    return dataset_path


def _load_relational_dummy():
    """Build the fixed 10-row-per-table relational toy dataset.

    Returns:
        tuple:
            ``Metadata`` built from ``DEMO_METADATA`` and a dict with the
            ``users``, ``sessions`` and ``transactions`` DataFrames.
    """
    users = pd.DataFrame({
        'user_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        'country': ['US', 'UK', 'ES', 'UK', 'US', 'DE', 'BG', 'ES', 'FR', 'UK'],
        'gender': ['M', 'F', None, 'M', 'F', 'M', 'F', None, 'F', None],
        'age': [34, 23, 44, 22, 54, 57, 45, 41, 23, 30]
    })
    sessions = pd.DataFrame({
        'session_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        'user_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        'device': ['mobile', 'tablet', 'tablet', 'mobile', 'mobile',
                   'mobile', 'mobile', 'tablet', 'mobile', 'tablet'],
        'os': ['android', 'ios', 'android', 'android', 'ios',
               'android', 'ios', 'ios', 'ios', 'ios'],
        'minutes': [23, 12, 8, 13, 9, 32, 7, 21, 29, 34],
    })
    transactions = pd.DataFrame({
        'transaction_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        'session_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        'timestamp': ['2019-01-01T12:34:32', '2019-01-01T12:42:21', '2019-01-07T17:23:11',
                      '2019-01-10T11:08:57', '2019-01-10T21:54:08', '2019-01-11T11:21:20',
                      '2019-01-22T14:44:10', '2019-01-23T10:14:09', '2019-01-27T16:09:17',
                      '2019-01-29T12:10:48'],
        'amount': [100.0, 55.3, 79.5, 112.1, 110.0, 76.3, 89.5, 132.1, 68.0, 99.9],
        'cancelled': [False, False, False, True, True, False, False, True, False, False],
    })
    transactions['timestamp'] = pd.to_datetime(transactions['timestamp'])

    tables = {
        'users': _dtypes64(users),
        'sessions': _dtypes64(sessions),
        'transactions': _dtypes64(transactions),
    }

    return Metadata(DEMO_METADATA), tables


def sample_relational_demo(size=30):
    """Sample demo data with the indicated number of rows in the parent table.

    Users are sampled first, then a random number of sessions per user and a
    random number of transactions per session. Sampling draws from the global
    ``numpy.random`` state.

    Args:
        size (int):
            Number of rows of the ``users`` table. Defaults to ``30``.

    Returns:
        tuple:
            ``Metadata`` built from ``DEMO_METADATA`` and a dict with the
            ``users``, ``sessions`` and ``transactions`` DataFrames.
    """
    # Users
    faker = Faker()
    countries = [faker.country_code() for _ in range(5)]
    country = np.random.choice(countries, size=size)
    gender = np.random.choice(['F', 'M', None], p=[0.5, 0.4, 0.1], size=size)
    age = (
        sp.stats.truncnorm.rvs(-1.2, 1.5, loc=30, scale=10, size=size).astype(int)
        + 3 * (gender == 'M')
        + 3 * (country == countries[0]).astype(int)
    )
    num_sessions = (
        sp.stats.gamma.rvs(1, loc=0, scale=2, size=size)
        * (0.8 + 0.2 * (gender == 'F'))
    ).round().astype(int)

    users = pd.DataFrame({
        'country': country,
        'gender': gender,
        'age': age,
        'num_sessions': num_sessions
    })
    users.index.name = 'user_id'

    # Sessions
    # Frames are collected and concatenated once: ``DataFrame.append`` was removed
    # in pandas 2.0 and repeated appends copy the accumulated frame every time.
    session_frames = []
    for user_id, user in users.iterrows():
        device_weights = [0.1, 0.4, 0.5] if user.gender == 'M' else [0.3, 0.4, 0.3]
        devices = np.random.choice(
            ['mobile', 'tablet', 'pc'],
            size=user.num_sessions,
            p=device_weights
        )
        pc_weights = [0.6, 0.3, 0.1] if user.age > 30 else [0.2, 0.4, 0.4]
        pc_os = np.random.choice(['windows', 'macos', 'linux'], p=pc_weights)
        phone_weights = [0.7, 0.3] if user.age > 30 else [0.9, 0.1]
        phone_os = np.random.choice(['android', 'ios'], p=phone_weights)
        # Each user owns one pc OS and one phone OS, reused across their sessions.
        session_os = [pc_os if device == 'pc' else phone_os for device in devices]

        minutes = (
            sp.stats.truncnorm.rvs(-3, 3, loc=30, scale=10, size=user.num_sessions)
            * (1 + 0.1 * (user.gender == 'M'))
            * (1 + user.age / 100)
            * (1 + 0.1 * (devices == 'pc'))
        )
        num_transactions = (minutes / 10) * (0.5 + (user.gender == 'F'))

        user_sessions = pd.DataFrame({
            'user_id': np.full(user.num_sessions, int(user_id)),
            'device': devices,
            'os': session_os,
            'minutes': minutes.round().astype(int),
            'num_transactions': num_transactions.round().astype(int),
        })
        if not user_sessions.empty:
            session_frames.append(user_sessions)

    if session_frames:
        sessions = pd.concat(session_frames, ignore_index=True)
    else:
        # Keep the expected columns so the clean-up below works with no sessions.
        sessions = pd.DataFrame(columns=_SESSION_COLUMNS)

    sessions.index.name = 'session_id'
    del users['num_sessions']

    # Transactions
    transaction_frames = []
    for session_id, session in sessions.iterrows():
        count = session.num_transactions
        if not count:
            continue

        amount_base = sp.stats.truncnorm.rvs(-2, 4, loc=100, scale=50, size=count)
        is_apple = session['os'] in ('ios', 'macos')
        amount_modif = np.random.random(count) * 100 * is_apple
        amount = amount_base / np.random.randint(1, count + 1) + amount_modif

        # Random session start within 2019, then sorted offsets inside the session.
        start_offset = np.random.randint(3600 * 24 * 365)
        start = datetime(2019, 1, 1) + timedelta(seconds=start_offset)
        timestamp = sorted([
            start + timedelta(seconds=int(offset))
            for offset in np.random.randint(60 * session.minutes, size=count)
        ])
        cancelled = np.random.random(size=count) < (1 / (count * 2))

        transaction_frames.append(pd.DataFrame({
            'session_id': np.full(count, int(session_id)),
            'timestamp': timestamp,
            'amount': amount.round(2),
            'cancelled': cancelled,
        }))

    if transaction_frames:
        transactions = pd.concat(transaction_frames, ignore_index=True)
    else:
        transactions = pd.DataFrame(columns=_TRANSACTION_COLUMNS)

    transactions.index.name = 'transaction_id'
    del sessions['num_transactions']

    tables = {
        'users': _dtypes64(users.reset_index()),
        'sessions': _dtypes64(sessions.reset_index()),
        'transactions': _dtypes64(transactions.reset_index()),
    }

    return Metadata(DEMO_METADATA), tables


def _load_demo_dataset(dataset_name, data_path):
    """Load the metadata and tables of a downloadable dataset.

    The dataset is downloaded into ``data_path`` first if it is not there yet.
    """
    dataset_path = _get_dataset_path(dataset_name, data_path)
    meta = Metadata(metadata=os.path.join(dataset_path, 'metadata.json'))
    tables = {
        name: _dtypes64(table)
        for name, table in meta.load_tables().items()
    }
    return meta, tables


def load_demo(dataset_name='demo_multi_table', data_path=DATA_PATH, metadata=False):
    """Load relational demo data.

    If ``dataset_name`` is ``'demo_multi_table'`` (the default), a toy dataset
    with three simple tables is loaded:

    * users: user data including country, gender and age.
    * sessions: sessions data with a foreign key to user.
    * transactions: transactions data with a foreign key to sessions.

    Any other dataset name is downloaded from the sdv-datasets S3 bucket.

    If ``metadata`` is ``True``, the output will be a tuple with a ``Metadata``
    instance for the dataset and a ``tables`` dict that contains the tables loaded
    as ``pandas.DataFrames``.

    If ``metadata`` is ``False``, only the ``tables`` are returned.

    Args:
        dataset_name (str):
            Dataset name to be downloaded. Defaults to ``'demo_multi_table'``,
            which is the name of the multi table demo dataset.
        data_path (str):
            Data path to save the dataset files, only used if a dataset is
            downloaded. Defaults to ``DATA_PATH``.
        metadata (bool):
            If ``True`` return Metadata object. Defaults to ``False``.

    Returns:
        dict or tuple:
            If ``metadata`` is ``False`` return a ``dict`` with the tables data.
            If ``metadata`` is ``True`` return a ``tuple`` with Metadata and tables data.

    Raises:
        ValueError:
            If ``dataset_name`` is ``None``.
    """
    if dataset_name is None:
        raise ValueError("'dataset_name' cannot be None.")

    if dataset_name == 'demo_multi_table':
        meta, tables = _load_relational_dummy()
    else:
        meta, tables = _load_demo_dataset(dataset_name, data_path)

    if metadata:
        return meta, tables

    return tables


def _load_tabular_dummy():
    """Load a dummy tabular demo dataframe.

    The 12 rows describe fake employees; numeric columns are drawn from the
    global ``numpy.random`` state on every call.
    """
    age = np.random.randint(30, 50, 12)
    age_when_joined = age - np.random.randint(1, 10, 12)
    years_exp = np.random.randint(1, 6, 12)
    contractor = [0.0, 1.0, 0.0, 1.0, 0.0, 0.0] * 2
    is_contractor = np.array(contractor).astype(bool)
    salary = np.random.randint(60, 320, 12) * 500.
    bonus = np.random.randint(10, 50, 12) * 500.
    # The four contractor rows get non-rounded amounts instead of multiples of 500.
    salary[is_contractor] = np.random.uniform(30000, 160000, 4).round(2)
    bonus[is_contractor] = np.random.uniform(5000, 25000, 4).round(2)

    return pd.DataFrame({
        'company': ['Pear', 'Pear', 'Glasses', 'Glasses', 'Cheerper', 'Cheerper'] * 2,
        'department': ['Sales', 'Design', 'AI', 'Search Engine', 'BigData', 'Support'] * 2,
        'employee_id': [1, 5, 1, 7, 6, 11, 28, 75, 33, 56, 42, 80],
        'age': age,
        'age_when_joined': age_when_joined,
        'years_in_the_company': age - age_when_joined,
        'salary': salary,
        'annual_bonus': bonus,
        'prior_years_experience': years_exp,
        'full_time': [1.0, 0.0, 1.0, 0.0, 0.0, 0.0] * 2,
        'part_time': [0.0, 0.0, 0.0, 0.0, 1.0, 1.0] * 2,
        'contractor': contractor
    })


def load_tabular_demo(dataset_name='demo_single_table', table_name=None,
                      data_path=DATA_PATH, metadata=False):
    """Load a tabular demo.

    If ``dataset_name`` is ``'demo_single_table'`` (the default), a toy dataset
    with a single table that contains data from a short fake collection of
    employees is generated. Any other dataset name is downloaded from the
    sdv-datasets S3 bucket.

    If ``metadata`` is ``True``, the output will be a tuple with a ``Table``
    instance for the table and a ``pandas.DataFrame`` with the data from
    the table.
    If ``metadata`` is ``False``, only the ``pandas.DataFrame`` is returned.

    Args:
        dataset_name (str):
            Dataset name to be downloaded. Defaults to ``'demo_single_table'``,
            which is the name of the single table demo dataset.
        table_name (str):
            If a table name is given, return this table from the indicated dataset.
            Otherwise, return the first one.
        data_path (str):
            Data path to save the dataset files, only used if a dataset is
            downloaded. Defaults to ``DATA_PATH``.
        metadata (bool):
            If ``True`` also return a Table object. Defaults to ``False``.

    Returns:
        pandas.DataFrame or tuple:
            If ``metadata`` is ``False`` return a ``pandas.DataFrame`` with the tables data.
            If ``metadata`` is ``True`` return a ``tuple`` with a Table and the data.

    Raises:
        ValueError:
            If ``dataset_name`` is ``None``.
    """
    if dataset_name is None:
        raise ValueError("'dataset_name' cannot be None.")

    if dataset_name != 'demo_single_table':
        meta, tables = _load_demo_dataset(dataset_name, data_path)
        if table_name is None:
            table_name = meta.get_tables()[0]

        table = _dtypes64(tables[table_name])

        if metadata:
            return Table.from_dict(meta.get_table_meta(table_name)), table

        return table

    table = _dtypes64(_load_tabular_dummy())
    if metadata:
        table_meta = Table.from_dict({
            'fields': {
                'company': {'type': 'categorical'},
                'department': {'type': 'categorical'},
                'employee_id': {'type': 'numerical', 'subtype': 'integer'},
                'age': {'type': 'numerical', 'subtype': 'integer'},
                'age_when_joined': {'type': 'numerical', 'subtype': 'integer'},
                'years_in_the_company': {'type': 'numerical', 'subtype': 'integer'},
                'salary': {'type': 'numerical', 'subtype': 'float'},
                'annual_bonus': {'type': 'numerical', 'subtype': 'float'},
                'prior_years_experience': {'type': 'numerical', 'subtype': 'integer'}
            },
            'constraints': [
                {
                    'constraint': 'FixedCombinations',
                    'column_names': ['company', 'department'],
                },
                {
                    'constraint': 'Inequality',
                    'low_column_name': 'age_when_joined',
                    'high_column_name': 'age'
                },
                {
                    'constraint': 'ScalarInequality',
                    'column_name': 'salary',
                    'relation': '>',
                    'value': 30000
                },
                {
                    'constraint': 'Positive',
                    'column_name': 'prior_years_experience'
                }
            ],
            'model_kwargs': {}
        })
        return table_meta, table

    return table


def load_timeseries_demo(dataset_name=None, table_name=None, metadata=False,
                         data_path=DATA_PATH):
    """Load a timeseries demo.

    If a dataset name is given, it is downloaded from the sdv-datasets S3 bucket.
    Otherwise, the NASDAQ100_2019 dataset is loaded.

    If ``metadata`` is ``True``, the output will be a tuple with a ``Table``
    instance for the table and a ``pandas.DataFrame`` with the data from
    the table.
    If ``metadata`` is ``False``, only the ``pandas.DataFrame`` is returned.

    Args:
        dataset_name (str):
            Dataset name to be downloaded, if ``None`` use default dataset.
            Defaults to ``None``.
        table_name (str):
            If a table name is given, return this table from the indicated dataset.
            Otherwise, return the first one.
        metadata (bool):
            If ``True`` also return a Table object. Defaults to ``False``.
        data_path (str):
            Data path to save the dataset files. Defaults to ``DATA_PATH``.

    Returns:
        pandas.DataFrame or tuple:
            If ``metadata`` is ``False`` return a ``pandas.DataFrame`` with the tables data.
            If ``metadata`` is ``True`` return a ``tuple`` with a Table and the data.
    """
    dataset_name = dataset_name or 'nasdaq100_2019'
    return load_tabular_demo(dataset_name, table_name, data_path=data_path, metadata=metadata)


def get_available_demos():
    """Get available demos and information about them.

    The listing is read from ``DATASETS_URL``, so network access is required.

    Returns:
        pandas.DataFrame:
            Table with the available demos.
    """
    return pd.read_csv(DATASETS_URL)