SageMaker V3 Train-to-Inference E2E Example
This notebook demonstrates the complete end-to-end workflow from training a custom PyTorch model to deploying it for inference using SageMaker V3.
Prerequisites#
Note: Ensure you have sagemaker and ipywidgets installed in your environment. The ipywidgets package is required to monitor endpoint deployment progress in Jupyter notebooks.
# Import required libraries
import json
import uuid
import tempfile
import os
import boto3
from sagemaker.serve.model_builder import ModelBuilder
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.utils.types import ModelServer
from sagemaker.serve.spec.inference_spec import InferenceSpec
from sagemaker.train.model_trainer import ModelTrainer
from sagemaker.train.configs import SourceCode
from sagemaker.core.resources import EndpointConfig
from sagemaker.core.helper.session_helper import Session
Step 1: Configure Training Job#
Set up a custom PyTorch model for training. We’ll create a simple neural network for demonstration.
# Configuration for training
MODEL_NAME_PREFIX = "train-inf-v3-example-model"
ENDPOINT_NAME_PREFIX = "train-inf-v3-example-endpoint"
TRAINING_JOB_PREFIX = "e2e-v3-pytorch"

# AWS Configuration
# BUG FIX: boto_region_name is a property on Session instances; reading it off
# the class returns the property object itself (not a region string), which
# would render as "<property object ...>" inside the ECR image URI below.
AWS_REGION = Session().boto_region_name
PYTORCH_TRAINING_IMAGE = f"763104351884.dkr.ecr.{AWS_REGION}.amazonaws.com/pytorch-training:1.13.1-cpu-py39"

# Generate unique identifiers so repeated notebook runs don't collide
unique_id = str(uuid.uuid4())[:8]
training_job_name = f"{TRAINING_JOB_PREFIX}-{unique_id}"
model_name = f"{MODEL_NAME_PREFIX}-{unique_id}"
endpoint_name = f"{ENDPOINT_NAME_PREFIX}-{unique_id}"

print(f"Training job name: {training_job_name}")
print(f"Model name: {model_name}")
print(f"Endpoint name: {endpoint_name}")
Step 2: Create Training Code#
Create a simple PyTorch training script and requirements file.
def create_pytorch_training_code():
    """Write a minimal PyTorch training script plus requirements into a temp dir.

    Returns:
        str: Path of the temporary directory containing ``train.py`` and
        ``requirements.txt``, suitable for use as a SourceCode source_dir.
    """
    code_dir = tempfile.mkdtemp()

    script_body = '''import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import os

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 2)

    def forward(self, x):
        return torch.softmax(self.linear(x), dim=1)

def train():
    model = SimpleModel()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()

    # Synthetic data
    X = torch.randn(100, 4)
    y = torch.randint(0, 2, (100,))
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=32)

    # Train for 1 epoch
    model.train()
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # Save model for TorchServe
    model.eval()
    traced_model = torch.jit.trace(model, torch.randn(1, 4))
    model_dir = os.environ.get('SM_MODEL_DIR', '/opt/ml/model')
    os.makedirs(model_dir, exist_ok=True)
    torch.jit.save(traced_model, os.path.join(model_dir, 'model.pth'))
    print("Training completed and model saved!")

if __name__ == "__main__":
    train()
'''

    # Lay down both files the training job needs.
    files_to_write = {
        'train.py': script_body,
        'requirements.txt': 'torch>=1.13.0,<2.0.0\n',
    }
    for filename, contents in files_to_write.items():
        with open(os.path.join(code_dir, filename), 'w') as fh:
            fh.write(contents)

    return code_dir


# Create training code
training_code_dir = create_pytorch_training_code()
print(f"Training code created in: {training_code_dir}")
Step 3: Create ModelTrainer and Start Training#
Set up the ModelTrainer with custom PyTorch code and launch the training job.
# Create SageMaker session
boto_session = boto3.Session(region_name=AWS_REGION)
sagemaker_session = Session(boto_session=boto_session)

# Bundle the generated script and requirements so they ship with the job.
training_source = SourceCode(
    source_dir=training_code_dir,
    entry_script="train.py",
    requirements="requirements.txt",
)

# Create ModelTrainer with custom code
model_trainer = ModelTrainer(
    sagemaker_session=sagemaker_session,
    training_image=PYTORCH_TRAINING_IMAGE,
    source_code=training_source,
    base_job_name=training_job_name,
)

# Start training job
print(f"Starting training job: {training_job_name}")
print("Note: This will take a few minutes to complete.")
model_trainer.train()
print("Model Training Completed!")
Step 4: Create Schema Builder and Inference Spec#
Set up the schema and inference specification for the trained model.
# Create schema builder for tensor-based models
def create_schema_builder():
    """Build a SchemaBuilder from one example input/output pair.

    The samples are a single 4-feature row and the matching 2-class
    probability row shaped like the model's softmax output.
    """
    example_input = [[0.1, 0.2, 0.3, 0.4]]
    example_output = [[0.8, 0.2]]
    return SchemaBuilder(example_input, example_output)
# Create inference specification
class SimpleInferenceSpec(InferenceSpec):
    """Minimal inference spec for the TorchScript model saved by train.py."""

    def load(self, model_dir):
        # Deserialize the traced model that training wrote as model.pth.
        import torch
        return torch.jit.load(model_dir + "/model.pth")

    def invoke(self, input_object, model):
        # Convert the JSON-decoded nested list to a tensor, run the model,
        # and return plain Python lists so the result is JSON-serializable.
        import torch
        return model(torch.tensor(input_object)).tolist()
# Instantiate the schema builder; the inference spec class is instantiated
# later when the ModelBuilder is constructed.
schema_builder = create_schema_builder()
print("Schema builder and inference spec created successfully!")
Step 5: Create ModelBuilder and Build Model#
Create the ModelBuilder with the trained model and build it for deployment.
# Derive the matching inference image from the training image URI.
inference_image = PYTORCH_TRAINING_IMAGE.replace("training", "inference")

# Create ModelBuilder with trained model
model_builder = ModelBuilder(
    model=model_trainer,
    schema_builder=schema_builder,
    model_server=ModelServer.TORCHSERVE,
    inference_spec=SimpleInferenceSpec(),
    image_uri=inference_image,
    dependencies={"auto": False},
)

# Build the trained model
core_model = model_builder.build(model_name=model_name, region=AWS_REGION)
print(f"Model Successfully Created: {core_model.model_name}")
Step 6: Deploy the Trained Model#
Deploy the trained model to a SageMaker endpoint.
# Deploy the trained model to a real-time endpoint on a single instance.
# NOTE(review): this call appears to block until the endpoint is in service,
# which typically takes several minutes — confirm against the V3 SDK docs.
core_endpoint = model_builder.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1
)
print(f"Endpoint Successfully Created: {core_endpoint.endpoint_name}")
Step 7: Test the Trained Model#
Test the deployed trained model with sample tensor inputs.
def _invoke_endpoint(payload):
    """POST a nested-list payload as JSON and return the decoded prediction."""
    response = core_endpoint.invoke(
        body=json.dumps(payload),
        content_type="application/json",
    )
    return json.loads(response.body.read().decode('utf-8'))


# Test the trained model with tensor input
test_data = [[0.1, 0.2, 0.3, 0.4]]
prediction = _invoke_endpoint(test_data)
print(f"Result of invoking endpoint: {prediction}")

# Test with different tensor inputs
test_inputs = [
    [[0.5, 0.3, 0.2, 0.1]],
    [[0.9, 0.1, 0.8, 0.2]],
    [[0.2, 0.7, 0.4, 0.6]],
]
for i, test_input in enumerate(test_inputs, 1):
    prediction = _invoke_endpoint(test_input)
    print(f"Test {i} - Input {test_input}: {prediction}")
    print("-" * 50)
Step 8: Clean Up Resources#
Clean up all created resources including the trained model and endpoint.
# Clean up resources: fetch the endpoint config before deleting the endpoint.
# NOTE(review): this assumes the endpoint config name equals the endpoint
# name, as the V3 deploy flow appears to do here — confirm for other flows.
core_endpoint_config = EndpointConfig.get(endpoint_config_name=core_endpoint.endpoint_name)

# Delete in the correct order
core_model.delete()
core_endpoint.delete()
core_endpoint_config.delete()

print("Model and Endpoint Successfully Deleted!")
# FIX: dropped stray f-prefix — the string has no placeholders (lint F541).
print("Note: Training job artifacts remain in S3 for reference.")
Summary#
This notebook demonstrated the complete E2E workflow:
Creating custom PyTorch training code
Configuring a ModelTrainer with custom source code
Running a training job on SageMaker
Building a ModelBuilder from training artifacts
Deploying the trained model to an endpoint
Testing the trained model with tensor inputs
Proper cleanup of inference resources
Key Benefits of E2E Training
Custom training: Full control over PyTorch training process
Seamless workflow: Train → Build → Deploy in one pipeline
Artifact management: Automatic handling of training outputs
TorchServe integration: Easy deployment with TorchServe
Production ready: Trained models ready for immediate deployment
The V3 ModelBuilder makes it easy to go from custom training to production inference with minimal code!