API Reference

NoGANSynth Class and Methods

The NoGAN tabular data synthesizer generates synthetic data by applying a multivariate binning technique to the training (real) dataset.
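
A minimal end-to-end sketch of the class and methods documented below, assuming the package-root import works and using illustrative column names and values:

    import numpy as np
    import pandas as pd
    from nogan_synthesizer import NoGANSynth  # assumed import path; adjust to your install

    # Illustrative numeric training data
    real_df = pd.DataFrame({
        "age": np.random.randint(20, 60, size=1000),
        "income": np.random.normal(50000, 12000, size=1000),
    })

    synth = NoGANSynth(real_df, random_seed=42)
    synth.fit()                                      # random bin counts per feature
    synth_df = synth.generate_synthetic_data(no_of_rows=500)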

The main NoGAN Synthesizer Class

Source code in nogan_synthesizer\nogan_synthesizer.py

# Imports assumed by this excerpt
import re
from typing import List

import numpy as np
import pandas as pd


class NoGANSynth:
    """
    The main NoGAN Synthesizer Class
    """
    def __init__(self, data: pd.DataFrame, random_seed: int = None) -> None:
        """
        Initialize the data, number of observations, features, number of features, and epsilon

        Args:
            data (pd.DataFrame): Input Pandas DataFrame to be trained on
            random_seed (int, optional): Random seed applied via 
                                        `np.random.seed(random_seed)` before operations. Defaults to None

        Raises:
            TypeError: Throws error if Input Dataset is not a Pandas DataFrame
            ValueError: Throws error if Input Dataset is empty
            TypeError: Throws error if non numerical columns are present in the 
                        Input Dataset
            ValueError: Throws error if there are special characters or space in 
                        column names of Input Dataset

        Returns:
            None   
        """

        if not isinstance(data, pd.DataFrame):
            raise TypeError("Input Dataset should be a Pandas DataFrame!!")

        if data.empty:
            raise ValueError("Input Dataset should not be empty!!") 

        if len(data.select_dtypes(exclude=['number']).columns) != 0:
            raise TypeError("There are non numeric columns present in the Input Dataset. Please process them using wrap_category_columns function")

        if re.search(r'[^a-zA-Z0-9_]', "".join(data.columns)):
            raise ValueError("There are special characters or space in the Column Names of Input Dataset. Please clean them before processing.")

        self.data = np.array(data.copy())
        self.features = data.columns
        self.int_columns = data.select_dtypes(include=['int']).columns  
        self.nobs = len(self.data)
        self.median = np.median(self.data, axis = 0)

        self.n_features = len(self.features)
        self.eps = 1e-10  # tiny offset so the quantile grid in fit() includes 1.0
        self.random_seed = random_seed
        self.random_seed = random_seed

    def fit(self, bins: List = None) -> None:
        """
        Creates the bins for each data column.

        Args:
            bins (List, optional): List of bin counts, one per feature. Defaults to None, in which case a random bin count between 50 and 100 is assigned to each feature. Passing a tuned hyperparameter bins list is recommended
        """

        if self.random_seed:
            np.random.seed(self.random_seed)

        # Get bin indices for each row in the data
        if bins is None:
            self.bins_per_feature = [np.random.randint(50, 100) 
                                     for _ in range(self.n_features)]
        else:
            self.bins_per_feature = bins

        # create quantile table bin_edges, one row for each feature
        self.bin_edges = [np.quantile(self.data[:, k],
                                      np.arange(0, 1 + self.eps,
                                                1/self.bins_per_feature[k]),
                                      axis=0
                                      ) for k in range(self.n_features)]

        bin_keys = {}
        for obs in self.data:   
            # For each observation column get the respective bin index based 
            # on the quantile table bin_edges
            bin_indices = [np.clip(np.searchsorted(self.bin_edges[k], 
                                                   obs[k], side='right')-1,
                                   0,
                                   len(self.bin_edges[k])-2
                                   )
                           for k in range(self.n_features)]

            # Convert the bin_indices into a string of comma separated values
            # They are the multivariate keys used in bin_keys dictionary
            key_str = ', '.join(map(str, bin_indices))              

            # Calculate lower & upper bounds
            lower_val = [self.bin_edges[k][bin_indices[k]]
                         for k in range(self.n_features)]
            upper_val = [self.bin_edges[k][1 + bin_indices[k]]
                         for k in range(self.n_features)]

            # frequency & sum_obs are the running counts & sum of observations
            if key_str in bin_keys:
                bin_keys[key_str]["frequency"] += 1
                bin_keys[key_str]["sum_obs"] += obs
            else:
                bin_keys[key_str] = {"sum_obs": obs,
                                     "frequency": 1,
                                     "value": bin_indices,
                                     "lower_val": lower_val,
                                     "upper_val": upper_val}

        self.bin_keys = bin_keys


    def _random_bin_counts(self, no_of_rows: int) -> np.array:
        """
        Args:
            no_of_rows (int): Row Count

        Returns:
            np.array: Random Bin Count Array
        """
        pvals = []
        for key in self.bin_keys:
            pvals.append(self.bin_keys[key]["frequency"]/self.nobs)
        return np.random.multinomial(no_of_rows, pvals)

    def generate_synthetic_data(self, no_of_rows: int = 100, 
                                stretch_type: List = None,
                                stretch: List = None,
                                gen_random_seed: int = None,
                                debug: bool = False
                                ) -> pd.DataFrame:
        """
        The main function that generates the synthetic data.
        It draws multinomial bin counts via `_random_bin_counts`.
        Then, for each bin key, it takes the lower and upper bounds and generates observations between (or around) those bounds.
        Once the list of new observations is generated, it is converted into a synthetic Pandas DataFrame and returned.

        Args:
            no_of_rows (int): Number of rows to generate
            stretch_type (List): List of values {"Gaussian", "Uniform"}. Specifies 
                                the sampling type for each column. Any value that is not `Uniform` is treated as `Gaussian`. Defaults to `Uniform` for all columns.
            stretch (List): Stretching factor (scale) for each column. With the 
                            `Uniform` stretch type, values between 0 and 1 keep generated observations inside each
                            bin's hyperrectangle; a negative value falls back to uniform sampling between the bin bounds. Defaults to 1.0 for all columns.
            gen_random_seed (int, optional): Random seed applied via 
                                        `np.random.seed(gen_random_seed)` before generation. Defaults to None, in which case the seed set at instantiation (if any) is used
            debug (bool): Flag to activate debug printing. Defaults to False

        Returns:
            pd.DataFrame: Generated synthetic Pandas DataFrame
        """
        if gen_random_seed:
            np.random.seed(gen_random_seed)
        elif self.random_seed:
            np.random.seed(self.random_seed)

        if not stretch_type:
            stretch_type = ["Uniform" for _ in range(self.n_features)]
        if not stretch:
            stretch = [1.0 for _ in range(self.n_features)]
        stretch = np.array(stretch, dtype = np.float32)

        if debug:
            print(f"List `stretch_type`: {stretch_type}")
            print(f"List `stretch`: {stretch}")

        bin_count_random = self._random_bin_counts(no_of_rows)
        data_synth = []

        for i, key in enumerate(self.bin_keys):
            lower_val = self.bin_keys[key]["lower_val"]
            upper_val = self.bin_keys[key]["upper_val"]
            mean_val = self.bin_keys[key]["sum_obs"] / self.bin_keys[key]["frequency"]
            count = bin_count_random[i]
            for j in range(count):
                new_obs = np.empty(self.n_features)  # synthesized obs
                for k in range(self.n_features):
                    if stretch[k] < 0:
                        new_obs[k] = np.random.uniform(lower_val[k], 
                                                       upper_val[k])
                    else:
                        if stretch_type[k] == 'Uniform':
                            deviate = np.random.uniform(-1, 1) 
                        else:
                            deviate = np.random.normal(0, 1)
                        dist_to_edge = min(mean_val[k] - lower_val[k],
                                           upper_val[k] - mean_val[k])
                        new_obs[k] = (mean_val[k]
                                      + dist_to_edge * stretch[k] * deviate)

                data_synth.append(new_obs)
        data_synth = pd.DataFrame(data_synth, columns=self.features)

        for col in self.int_columns:
            data_synth[col] = data_synth[col].astype(int)

        return data_synth

__init__(data, random_seed=None)

Initialize the data, number of observations, features, number of features, and epsilon.

Parameters:

    data (pd.DataFrame): Input Pandas DataFrame to train on. Required.
    random_seed (int, optional): Random seed applied via np.random.seed(random_seed) before operations. Defaults to None.

Raises:

    TypeError: If the input dataset is not a Pandas DataFrame.
    ValueError: If the input dataset is empty.
    TypeError: If non-numeric columns are present in the input dataset (wrap them first with wrap_category_columns).
    ValueError: If column names contain special characters or spaces.

Returns:

    None
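
Since the constructor rejects column names containing anything other than letters, digits, and underscores, here is a minimal sketch of one way to sanitize names before instantiation (the regex mirrors the check in the source above; the frame is illustrative):

    import re
    import pandas as pd

    df = pd.DataFrame({"unit price": [1.5, 2.0], "qty": [3, 4]})

    # Replace every disallowed character with an underscore
    df.columns = [re.sub(r'[^a-zA-Z0-9_]', '_', c) for c in df.columns]
    # df.columns is now ['unit_price', 'qty']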


fit(bins=None)

Creates the bins for each data column.

Parameters:

    bins (List, optional): List of bin counts, one per feature. Defaults to None, in which case a random bin count between 50 and 100 is assigned to each feature. Passing a tuned hyperparameter bins list is recommended (see the sketch below).
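
A short sketch of passing an explicit bins list, continuing from the quickstart instance above (the counts are illustrative, not tuned values):

    synth.fit(bins=[75, 60])   # one bin count per numeric feature

    # Or let fit() assign a random bin count between 50 and 100 per feature
    synth.fit()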

generate_synthetic_data(no_of_rows=100, stretch_type=None, stretch=None, gen_random_seed=None, debug=False)

The main function that generates the synthetic data. It draws multinomial bin counts via the internal random bin sampler. Then, for each bin key, it takes the lower and upper bounds and generates observations between (or around) those bounds. Once the list of new observations is generated, it is converted into a synthetic Pandas DataFrame and returned.

Parameters:

    no_of_rows (int): Number of rows to generate. Defaults to 100.
    stretch_type (List, optional): Sampling type per column, drawn from {"Uniform", "Gaussian"}. Any value that is not "Uniform" is treated as "Gaussian". Defaults to "Uniform" for all columns.
    stretch (List, optional): Stretching factor (scale) per column. With the "Uniform" stretch type, values between 0 and 1 keep generated observations inside each bin's hyperrectangle; a negative value falls back to uniform sampling between the bin bounds. Defaults to 1.0 for all columns.
    gen_random_seed (int, optional): Random seed applied via np.random.seed(gen_random_seed) before generation. Defaults to None, in which case the seed set at instantiation (if any) is used.
    debug (bool): Flag to activate debug printing. Defaults to False.

Returns:

    pd.DataFrame: Generated synthetic Pandas DataFrame.
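
A sketch combining the stretch controls, again using the two-feature synthesizer from the quickstart (the per-column choices are illustrative):

    synth_df = synth.generate_synthetic_data(
        no_of_rows=500,
        stretch_type=["Uniform", "Gaussian"],  # sampling type per column
        stretch=[1.0, 0.5],                    # scale factor per column
        gen_random_seed=7,                     # overrides the seed set at init
    )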


Preprocessing

wrap_category_columns(data: pd.DataFrame, cat_cols: List[str])

Categorical columns can be preprocessed by building key-value pairs (called a flag vector) over all categorical columns and collapsing those columns into a single feature with integer values. wrap_category_columns implements this concept, as illustrated in the sketch below.
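
A sketch of the wrapping step on a toy frame (column names and values are illustrative; the import path assumes the module location shown in the source listing below):

    import pandas as pd
    from nogan_synthesizer.preprocessing import wrap_category_columns

    df = pd.DataFrame({
        "amount": [10.0, 12.5, 9.9],
        "color": ["red", "blue", "red"],
        "size": ["S", "M", "S"],
    })

    # Collapse the two categorical columns into one integer 'cat_label' column
    wrapped_df, idx_to_key, key_to_idx = wrap_category_columns(
        df, cat_cols=["color", "size"])
    # wrapped_df now has columns ['amount', 'cat_label'];
    # idx_to_key maps each integer label back to a ('color', 'size') tuple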

Parameters:

    data (pd.DataFrame): Pandas DataFrame. Required.
    cat_cols (List[str]): List of all categorical columns. Required.

Raises:

    TypeError: If the input dataset is not a Pandas DataFrame.
    ValueError: If the input dataset is empty.
    ValueError: If column names contain special characters or spaces.
    TypeError: If 'cat_cols' is not a list.
    ValueError: If 'cat_cols' is empty.

Returns:

    Tuple: A Pandas DataFrame with all category columns wrapped, plus the dictionaries 'idx_to_key' (index -> key) and 'key_to_idx' (key -> index) for the flag vector.

Source code in nogan_synthesizer\preprocessing.py

# Imports assumed by this excerpt
import re
from typing import List, Tuple

import pandas as pd


def wrap_category_columns(data: pd.DataFrame,
                          cat_cols: List[str]) -> Tuple:
    """
    Args:
        data (pd.DataFrame): Pandas DataFrame
        cat_cols (List[str]): List of all categorical columns

    Raises:
        TypeError: Throws error if Input Dataset is not a Pandas DataFrame
        ValueError: Throws error if Input Dataset is empty
        ValueError: Throws error if there are special characters or space in column 
                    names of Input Dataset
        TypeError: Throws error if 'cat_cols' is not a list
        ValueError: Throws error if 'cat_cols' is empty 

    Returns:
        Tuple: A Pandas DataFrame with all category columns wrapped, plus dictionaries 'idx_to_key' (index -> key) and 'key_to_idx' (key -> index) for the flag vector
    """

    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input Dataset should be a Pandas DataFrame!!")

    if data.empty:
        raise ValueError("Input Dataset should not be empty!!")

    if re.search(r'[^a-zA-Z0-9_]', "".join(data.columns)):
        raise ValueError("There are special characters or space in the Column Names of Input Dataset. Please clean them before processing.")

    if not isinstance(cat_cols, list):
        raise TypeError("Input 'cat_cols' should a List!!")

    if not cat_cols:
        raise ValueError("'cat_cols' should not be Empty!!")

    df = data.copy()

    cat_data = df[cat_cols]
    num_cols = [f for f in data.columns if f not in cat_cols]

    # Unique combinations of the categorical columns form the flag vector
    flag_vector = [list(row) for row in 
                   cat_data.drop_duplicates().to_records(index=False)]

    # Map each combination (as a string key) to an integer index, and back
    key_to_idx = {str(v).strip("[").strip("]"): i 
                  for i, v in enumerate(flag_vector, 1)}
    idx_to_key = {i: tuple(v) for i, v in enumerate(flag_vector, 1)}

    # Label every row with the index of its category combination
    df["cat_label"] = \
        [key_to_idx[str(tuple(row)).strip("(").strip(")").strip(",")]
         for row in cat_data.to_records(index=False)]

    df = df[num_cols + ["cat_label"]]

    return df, idx_to_key, key_to_idx

unwrap_category_columns(data: pd.DataFrame, idx_to_key: dict, cat_cols: List[str])

All collapsed categorical columns can be expanded back using the same flag vector created during the wrapping process.

Parameters:

    data (pd.DataFrame): Pandas DataFrame. Required.
    idx_to_key (dict): Dictionary holding the index-to-key pairs of the flag vector. Required.
    cat_cols (List[str]): List of all categorical columns. Required.

Raises:

    TypeError: If the input dataset is not a Pandas DataFrame.
    ValueError: If the input dataset is empty.
    ValueError: If column names contain special characters or spaces.
    TypeError: If the 'cat_label' column is not present in the input dataset.
    TypeError: If 'idx_to_key' is not a dictionary.
    ValueError: If 'idx_to_key' is empty.
    TypeError: If 'cat_cols' is not a list.
    ValueError: If 'cat_cols' is empty.

Returns:

    pd.DataFrame: Pandas DataFrame with the categorical columns expanded.

Source code in nogan_synthesizer\preprocessing.py
def unwrap_category_columns(data: pd.DataFrame, idx_to_key: dict,
                            cat_cols: List[str]) -> pd.DataFrame:
    """
    Args:
        data (pd.DataFrame): Pandas DataFrame
        idx_to_key (dict): Dictionary that holds the key-index pairs of the flag 
                            vector
        cat_cols (List[str]): List of all categorical columns

    Raises:
        TypeError: Throws error if Input Dataset is not a Pandas DataFrame
        ValueError: Throws error if Input Dataset is empty
        ValueError: Throws error if there are special characters or space in column 
                    names of Input Dataset
        TypeError: Throws error if 'cat_label' column is not present in the 
                    Input Dataset
        TypeError: Throws error if 'idx_to_key' is not a Dictionary
        ValueError: Throws error if 'idx_to_key' is empty
        TypeError: Throws error if 'cat_cols' is not a List
        ValueError: Throws error if 'cat_cols' is empty

    Returns:
        pd.DataFrame: Pandas DataFrame with expanded Categorical Columns
    """

    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input Dataset should be a Pandas DataFrame!!")

    if data.empty:
        raise ValueError("Input Dataset should not be empty!!")

    if re.search(r'[^a-zA-Z0-9_]', "".join(data.columns)):
        raise ValueError("There are special characters or space in the Column Names of Input Dataset. Please clean them before processing.")

    if "cat_label" not in data.columns:
        raise TypeError("Column named 'cat_label' is expected and not present!!")

    if not isinstance(idx_to_key, dict):
        raise TypeError("'idx_to_key' should a Dictionary!!")

    if not idx_to_key:
        raise ValueError("'idx_to_key' should not be empty!!")

    if not isinstance(cat_cols, list):
        raise TypeError("'cat_cols' should a List!!")

    if not cat_cols:
        raise ValueError("'cat_cols' should not be empty!!")

    df = data.copy()

    # Rebuild the original categorical columns from each row's integer label
    df_cat = pd.DataFrame([idx_to_key[idx] for idx in df.cat_label], 
                          columns=cat_cols)

    data_unwrapped = pd.concat([df, df_cat], axis=1)

    return data_unwrapped.drop(["cat_label"], axis=1)
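
A round-trip sketch continuing the wrap example above, restoring the categorical columns from the wrapped (or synthesized) frame:

    from nogan_synthesizer.preprocessing import unwrap_category_columns

    # Expand 'cat_label' back into the original categorical columns
    restored_df = unwrap_category_columns(wrapped_df, idx_to_key,
                                          cat_cols=["color", "size"])
    # restored_df has columns ['amount', 'color', 'size']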