# SageMaker Clarify E2E Test
A simple end-to-end test for the Clarify utils implementation.
import sys
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
import boto3
# Add the clarify utils to path
# sys.path.insert(0, '/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-staging-molly/sagemaker_utils/src')
from sagemaker.core.clarify import (
SageMakerClarifyProcessor,
DataConfig,
BiasConfig,
ModelConfig,
SHAPConfig
)
from sagemaker.core.helper.session_helper import Session,get_execution_role
# Resolve the IAM execution role that the SageMaker processing jobs below run under.
role = get_execution_role()
## 1. Create Sample Data
# Create a synthetic binary-classification dataset (seeded for reproducibility).
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    random_state=42,
)

# Add a sensitive feature (simulating gender: 0=female, 1=male).
# Fix: draw from a seeded generator — the original used the unseeded global
# np.random, so the "gender" column (and therefore every downstream bias
# metric) changed on every run even though the rest of the pipeline is seeded.
rng = np.random.default_rng(42)
sensitive_feature = rng.binomial(1, 0.4, size=X.shape[0])
X = np.column_stack([X, sensitive_feature])

# Assemble a labelled DataFrame: feature_0..feature_9, gender, target.
feature_names = [f'feature_{i}' for i in range(10)] + ['gender']
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

print(f"Dataset shape: {df.shape}")
print(f"Target distribution: {df['target'].value_counts()}")
print(f"Gender distribution: {df['gender'].value_counts()}")
## 2. Train Simple Model
# Hold out 20% of rows for evaluation; the split is seeded so the same rows
# land in the test set on every run.
features = df.drop('target', axis=1)
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# A tiny forest keeps this smoke test fast; accuracy just needs to be sane.
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.3f}")
## 3. Upload Data to S3
# Stage the dataset and model in the session's default S3 bucket.
session = Session()
bucket = session.default_bucket()
prefix = 'clarify-test'

# Clarify's bias analysis needs the label column, so re-attach the target
# to the held-out features before writing the CSV.
test_data = X_test.copy()
test_data['target'] = y_test
test_data.to_csv('/tmp/test_data.csv', index=False)

# Persist the trained model next to the data.
joblib.dump(model, '/tmp/model.joblib')

# Push both artifacts under a common prefix.
s3 = boto3.client('s3')
for local_path, key in [
    ('/tmp/test_data.csv', f'{prefix}/data/test_data.csv'),
    ('/tmp/model.joblib', f'{prefix}/model/model.joblib'),
]:
    s3.upload_file(local_path, bucket, key)

data_uri = f's3://{bucket}/{prefix}/data/test_data.csv'
output_uri = f's3://{bucket}/{prefix}/output'
print(f"Data uploaded to: {data_uri}")
print(f"Output will be saved to: {output_uri}")
## 4. Configure Clarify
# Data configuration: where Clarify reads the dataset and writes its results.
data_config = DataConfig(
    s3_data_input_path=data_uri,      # CSV staged above (features + target)
    s3_output_path=output_uri,        # analysis output lands here
    label='target',                   # name of the label column in the CSV
    headers=list(test_data.columns),  # column names, in file order
    dataset_type='text/csv'
)
# Bias configuration: which outcome and which facet to measure bias against.
bias_config = BiasConfig(
    label_values_or_threshold=[1],  # Positive class
    facet_name='gender',            # sensitive column added in step 1
    facet_values_or_threshold=[1]   # Male as sensitive group
)
# SHAP configuration for explainability (not used by the pre-training run below).
shap_config = SHAPConfig(
    baseline=None,       # Auto-generate baseline
    num_samples=10,      # Small number for quick test
    agg_method='mean_abs'
)
print("Configurations created successfully")
## 5. Create Clarify Processor
# Build the Clarify processor — one small instance is plenty for a smoke test.
processor_kwargs = dict(
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    sagemaker_session=session,
)
clarify_processor = SageMakerClarifyProcessor(**processor_kwargs)
print(f"Clarify processor created with role: {role}")
## 6. Run Pre-training Bias Analysis
# Run pre-training bias analysis (no model needed).
submitted = False
try:
    clarify_processor.run_pre_training_bias(
        data_config=data_config,
        data_bias_config=bias_config,
        methods=['CI', 'DPL'],  # Class Imbalance and Difference in Positive Proportions
        wait=False,  # Don't wait for completion in test
        logs=False
    )
    submitted = True
    print("✅ Pre-training bias analysis job submitted successfully")
except Exception as e:
    print(f"❌ Pre-training bias analysis failed: {str(e)}")

# You can go to SageMaker AI console -> Processing jobs and check the job status,
# or describe the job directly (it takes ~5min to complete).
# Fix: describe the job we just started instead of a hard-coded, timestamped
# job name that only existed for one historical run — and only if submission
# actually succeeded.
if submitted:
    # NOTE(review): assumes the processor exposes `latest_job.job_name` like the
    # classic sagemaker Processor API — confirm against sagemaker.core.clarify.
    job_name = clarify_processor.latest_job.job_name
    response = session.sagemaker_client.describe_processing_job(
        ProcessingJobName=job_name
    )
    print(f"Status: {response['ProcessingJobStatus']}")
## 7. Test Configuration Generation
# Exercise the internal analysis-config generator directly.
from sagemaker.core.clarify import _AnalysisConfigGenerator

try:
    # Build the pre-training bias analysis config from the public configs.
    bias_analysis_config = _AnalysisConfigGenerator.bias_pre_training(
        data_config=data_config,
        bias_config=bias_config,
        methods=['CI', 'DPL']
    )
    print("✅ Bias analysis config generated successfully")
    print(f"Config keys: {list(bias_analysis_config.keys())}")

    # Sanity-check that the expected top-level entries made it in.
    required_keys = ['dataset_type', 'label_values_or_threshold', 'facet', 'methods']
    missing_keys = [key_name for key_name in required_keys
                    if key_name not in bias_analysis_config]
    if not missing_keys:
        print("✅ All required keys present in config")
    else:
        print(f"❌ Missing required keys: {missing_keys}")
except Exception as e:
    print(f"❌ Config generation failed: {str(e)}")
## 8. Test Schema Validation
# Validate the generated config against the v1.0 analysis-config schema.
from sagemaker.core.clarify import ANALYSIS_CONFIG_SCHEMA_V1_0

try:
    ANALYSIS_CONFIG_SCHEMA_V1_0.validate(bias_analysis_config)
except Exception as err:
    print(f"❌ Schema validation failed: {str(err)}")
else:
    print("✅ Schema validation passed")