The library leverages Polars, which generally makes its computations faster than those of other common data manipulation libraries, and it accepts both Polars and pandas DataFrames as input.
To run an evaluation, the user must provide both the original real dataset and the corresponding synthetic dataset, which the library's modules use to perform the necessary computations.
Below is a code snippet showing a typical usage of the library:
# Import the necessary modules from the SURE library
from sure import Preprocessor, report
from sure.utility import (compute_statistical_metrics, compute_mutual_info,
                          compute_utility_metrics_class)
from sure.privacy import (distance_to_closest_record, dcr_stats, number_of_dcr_equal_to_zero,
                          validation_dcr_test, adversary_dataset, membership_inference_test)
# Assuming real_data, valid_data and synth_data are three pandas DataFrames
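# For illustration only, they could be loaded as follows (file names are hypothetical):
#   import pandas as pd
#   real_data  = pd.read_csv("real_train.csv")
#   valid_data = pd.read_csv("real_validation.csv")
#   synth_data = pd.read_csv("synthetic.csv")
# Polars DataFrames (e.g. loaded with pl.read_csv) are supported as well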
# Preprocessor initialization and transformation of the real, synthetic and validation datasets
preprocessor = Preprocessor(real_data, get_discarded_info=False)
real_data_preprocessed = preprocessor.transform(real_data, num_fill_null='forward', scaling='standardize')
valid_data_preprocessed = preprocessor.transform(valid_data, num_fill_null='forward', scaling='standardize')
synth_data_preprocessed = preprocessor.transform(synth_data, num_fill_null='forward', scaling='standardize')
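# (Optional) sanity-check the preprocessed output before computing any metric;
# head() works on both pandas and Polars DataFrames
print(real_data_preprocessed.head())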
# Statistical properties and mutual information
num_features_stats, cat_features_stats, temporal_feat_stats = compute_statistical_metrics(real_data_preprocessed, synth_data_preprocessed)
corr_real, corr_synth, corr_difference = compute_mutual_info(real_data_preprocessed, synth_data_preprocessed)
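# corr_real and corr_synth hold the pairwise association (mutual information) matrices of the
# real and synthetic features; corr_difference holds the gap between the two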
# ML utility: TSTR - Train on Synthetic, Test on Real
X_train = real_data_preprocessed.drop("label", axis=1) # Assuming the datasets have a "label" column for the machine learning task they are intended for
y_train = real_data_preprocessed["label"]
X_synth = synth_data_preprocessed.drop("label", axis=1)
y_synth = synth_data_preprocessed["label"]
X_test = valid_data_preprocessed.drop("label", axis=1).head(10000) # Test the trained models on a held-out portion of the real data (first 10k rows of the validation set)
y_test = valid_data_preprocessed["label"].head(10000)
TSTR_real, TSTR_synth, TSTR_delta = compute_utility_metrics_class(X_train, X_synth, X_test, y_train, y_synth, y_test)
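# The three outputs compare classifiers trained on the real training set with classifiers
# trained on the synthetic one, both evaluated on the same real test set;
# TSTR_delta is the performance gap between the two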
# Distance to closest record
dcr_synth_train = distance_to_closest_record("synth_train", synth_data_preprocessed, real_data_preprocessed)
dcr_synth_valid = distance_to_closest_record("synth_val", synth_data_preprocessed, valid_data_preprocessed)
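# Each DCR array holds, for every synthetic record, the distance to its closest
# record in the given reference set (training or validation)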
dcr_stats_synth_train = dcr_stats("synth_train", dcr_synth_train)
dcr_stats_synth_valid = dcr_stats("synth_val", dcr_synth_valid)
dcr_zero_synth_train = number_of_dcr_equal_to_zero("synth_train", dcr_synth_train)
dcr_zero_synth_valid = number_of_dcr_equal_to_zero("synth_val", dcr_synth_valid)
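# A DCR equal to zero means that a synthetic record is an identical copy of a real record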
share = validation_dcr_test(dcr_synth_train, dcr_synth_valid)
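# share: the portion of synthetic records whose closest real record belongs to the training
# set rather than to the validation set; values well above 50% suggest memorization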
# ML privacy attack sandbox initialization and simulation
adv_data = adversary_dataset(real_data_preprocessed, valid_data_preprocessed)
# The function adversary_dataset adds a column "privacy_test_is_training" to the adversary dataset, indicating whether the record was part of the training set or not
adv_guesses_ground_truth = adv_data["privacy_test_is_training"]
MIA = membership_inference_test(adv_data, synth_data_preprocessed, adv_guesses_ground_truth)
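# The membership inference test simulates an attacker who tries to tell, from the synthetic
# data, whether a given record was part of the generative model's training set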
# Report generation as an HTML page
report(real_data, synth_data)
Please refer to the Modules section to learn how to further customize your synthetic data assessment pipeline.