# SageMaker Clarify E2E Test
#
# Simple end-to-end test for the Clarify utils implementation.

import sys
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
import boto3

# Add the clarify utils to path
# sys.path.insert(0, '/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-staging-molly/sagemaker_utils/src')

from sagemaker.core.clarify import (
    SageMakerClarifyProcessor,
    DataConfig,
    BiasConfig,
    ModelConfig,
    SHAPConfig
)
from sagemaker.core.helper.session_helper import Session,get_execution_role
role = get_execution_role()

# 1. Create Sample Data

# Create a synthetic binary-classification dataset. random_state=42 makes
# the sklearn-generated features deterministic.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    random_state=42
)

# Add a sensitive feature (simulating gender: 0=female, 1=male).
# FIX: this previously drew from the unseeded global NumPy RNG, so every
# run produced a different dataset despite random_state=42 everywhere
# else. A seeded Generator keeps the whole test reproducible.
rng = np.random.default_rng(42)
sensitive_feature = rng.binomial(1, 0.4, size=X.shape[0])
X = np.column_stack([X, sensitive_feature])

# Assemble a DataFrame with named feature columns plus the label.
feature_names = [f'feature_{i}' for i in range(10)] + ['gender']
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

print(f"Dataset shape: {df.shape}")
print(f"Target distribution: {df['target'].value_counts()}")
print(f"Gender distribution: {df['gender'].value_counts()}")

# 2. Train Simple Model

# Hold out 20% of the rows for evaluation, then fit a small random
# forest; the fixed random_state keeps both the split and the model
# deterministic across runs.
features = df.drop('target', axis=1)
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_train, y_train)

print(f"Model accuracy: {model.score(X_test, y_test):.3f}")

# 3. Upload Data to S3

# Resolve the S3 locations used by this test run.
session = Session()
bucket = session.default_bucket()
prefix = 'clarify-test'

# Persist the held-out rows locally. NOTE: the label column IS included
# here — Clarify's pre-training bias analysis needs it in the dataset.
test_data = X_test.copy()
test_data['target'] = y_test
test_data.to_csv('/tmp/test_data.csv', index=False)

# Persist the trained model alongside it.
joblib.dump(model, '/tmp/model.joblib')

# Push both artifacts to S3 under the test prefix.
s3_client = boto3.client('s3')
for local_path, key in (
    ('/tmp/test_data.csv', f'{prefix}/data/test_data.csv'),
    ('/tmp/model.joblib', f'{prefix}/model/model.joblib'),
):
    s3_client.upload_file(local_path, bucket, key)

data_uri = f's3://{bucket}/{prefix}/data/test_data.csv'
output_uri = f's3://{bucket}/{prefix}/output'

print(f"Data uploaded to: {data_uri}")
print(f"Output will be saved to: {output_uri}")

# 4. Configure Clarify

# Describe the input dataset for Clarify: where it lives in S3, where
# results should be written, which column holds the label, and the
# CSV layout (headers taken from the uploaded frame).
data_config = DataConfig(
    s3_data_input_path=data_uri,
    s3_output_path=output_uri,
    label='target',
    headers=list(test_data.columns),
    dataset_type='text/csv'
)

# Bias settings: label value 1 is the favorable outcome, and rows with
# gender == 1 (male) form the sensitive facet group.
bias_config = BiasConfig(
    label_values_or_threshold=[1],
    facet_name='gender',
    facet_values_or_threshold=[1]
)

# SHAP settings: let Clarify derive the baseline automatically, and keep
# the sample count tiny so this smoke test stays quick.
shap_config = SHAPConfig(
    baseline=None,
    num_samples=10,
    agg_method='mean_abs'
)

print("Configurations created successfully")

# 5. Create Clarify Processor

# One modest processing node is plenty for this smoke test; the
# processor reuses the session and execution role created above.
clarify_processor = SageMakerClarifyProcessor(
    sagemaker_session=session,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
)

print(f"Clarify processor created with role: {role}")

# 6. Run Pre-training Bias Analysis

# Submit an asynchronous pre-training bias analysis job. No trained
# model is needed — pre-training metrics are computed on the data alone.
job_submitted = False
try:
    clarify_processor.run_pre_training_bias(
        data_config=data_config,
        data_bias_config=bias_config,
        methods=['CI', 'DPL'],  # Class Imbalance and Difference in Proportions of Labels
        wait=False,  # Don't wait for completion in test
        logs=False
    )
    job_submitted = True
    print("✅ Pre-training bias analysis job submitted successfully")
except Exception as e:
    print(f"❌ Pre-training bias analysis failed: {str(e)}")

# Check the status of the job that was just submitted.
# FIX: the original described a hard-coded historical job name
# ('Clarify-Pretraining-Bias-2025-11-09-...'), which fails in any other
# account or run, and it ran even when submission above had failed.
# Instead ask the processor for its most recent job (assumes the base
# Processor exposes latest_job after an async run — confirm in SDK).
# You can also check SageMaker AI console -> Processing jobs; the job
# takes ~5min to complete.
if job_submitted and getattr(clarify_processor, 'latest_job', None) is not None:
    response = session.sagemaker_client.describe_processing_job(
        ProcessingJobName=clarify_processor.latest_job.job_name
    )
    print(f"Status: {response['ProcessingJobStatus']}")

# 7. Test Configuration Generation

# Exercise the internal analysis-config generator directly and sanity
# check the structure of the dictionary it emits.
from sagemaker.core.clarify import _AnalysisConfigGenerator

try:
    bias_analysis_config = _AnalysisConfigGenerator.bias_pre_training(
        data_config=data_config,
        bias_config=bias_config,
        methods=['CI', 'DPL']
    )

    print("✅ Bias analysis config generated successfully")
    print(f"Config keys: {list(bias_analysis_config.keys())}")

    # Every pre-training bias config must carry these top-level keys.
    expected_keys = ['dataset_type', 'label_values_or_threshold', 'facet', 'methods']
    missing_keys = [k for k in expected_keys if k not in bias_analysis_config]

    if missing_keys:
        print(f"❌ Missing required keys: {missing_keys}")
    else:
        print("✅ All required keys present in config")

except Exception as e:
    print(f"❌ Config generation failed: {str(e)}")

# 8. Test Schema Validation

# Validate the generated analysis config against the published v1.0
# schema; any violation surfaces as an exception from validate().
from sagemaker.core.clarify import ANALYSIS_CONFIG_SCHEMA_V1_0

try:
    ANALYSIS_CONFIG_SCHEMA_V1_0.validate(bias_analysis_config)
except Exception as e:
    print(f"❌ Schema validation failed: {str(e)}")
else:
    print("✅ Schema validation passed")