from pprint import pprint
from sagemaker.core.resources import TrainingJob, HubContent, InferenceComponent, ModelPackage
from sagemaker.core.utils.utils import Unassigned
! ada credentials update --provider=isengard --account=<> --role=Admin --profile=default --once
! aws configure set region us-west-2
from sagemaker.core.resources import Endpoint
# Clean-up pass: walk every endpoint and remove the ones whose name carries
# the 'e2e-' prefix; all other endpoints are left untouched.
for endpoint in Endpoint.get_all():
    if not endpoint.endpoint_name.startswith('e2e-'):
        continue
    endpoint.delete()
from sagemaker.core.resources import TrainingJob, HubContent, InferenceComponent, ModelPackage
from sagemaker.core.utils.utils import Unassigned
# Scan the training jobs in us-west-2 and, for each one that reports an output
# model package, print the job ARN followed by the image of the first
# inference container (when that image is set).
for training_job in TrainingJob.get_all(region="us-west-2"):
    if isinstance(training_job.output_model_package_arn, Unassigned):
        continue
    try:
        model_package = ModelPackage.get(training_job.output_model_package_arn)
        image = model_package.inference_specification.containers[0].image
    except Exception as exc:
        # Best-effort scan: a package that cannot be fetched, or that has no
        # inference containers, is reported and skipped. Catching Exception
        # (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print(f"Skipping {training_job.training_job_arn}: {exc!r}")
        continue
    if image is not None and not isinstance(image, Unassigned):
        print(training_job.training_job_arn)
        print(image)
from sagemaker.core.resources import TrainingJob
import random
# Fetch the training job by name and show the model package ARN it reports.
training_job = TrainingJob.get(training_job_name="meta-textgeneration-llama-3-2-1b-instruct-sft-20251123162832")
print(training_job.output_model_package_arn)
# Random "e2e-" prefixed name; the cells below pass it as both the model name
# and the endpoint name. The prefix matches what the endpoint clean-up loop
# at the top of the notebook deletes.
name = f"e2e-{random.randint(100, 10000)}"
from sagemaker.serve import ModelBuilder
# Wrap the training job in a ModelBuilder, build a model under the generated
# name, and print the ARN of the resulting model.
model_builder = ModelBuilder(model=training_job)
model = model_builder.build(model_name=name)
print(model.model_arn)
import random
# Deploy the built model to an endpoint that uses the generated name; the
# returned object is kept in `endpoint` for the cells below.
endpoint = model_builder.deploy(endpoint_name=name)
from sagemaker.core.resources import InferenceComponent, Tag
from pprint import pprint
# List the inference components attached to the endpoint named `name` (the
# endpoint deployed earlier in this notebook) and dump the tags of each one.
# The endpoint name comes from `name` rather than a literal left over from a
# previous run, so the lookup follows whatever was just deployed.
for inference_component in InferenceComponent.get_all(endpoint_name_equals=name):
    component_arn = inference_component.inference_component_arn
    print(component_arn)
    for tag in Tag.get_all(resource_arn=component_arn):
        pprint(tag)
import json
# NOTE: this call is expected to fail, because Endpoint invoke is only
# available to authorized users. The invoke used here is the sagemaker-core
# Endpoint.invoke call.
print(endpoint.endpoint_arn)
endpoint.invoke(body=json.dumps({"inputs": "What is the capital of France?", "parameters": {"max_new_tokens": 50}}))
from sagemaker.core.resources import TrainingJob
from sagemaker.serve import ModelBuilder
# Create a new ModelBuilder straight from the training job (looked up by name).
model_builder = ModelBuilder(model=TrainingJob.get(training_job_name="meta-textgeneration-llama-3-2-1b-instruct-sft-20251123162832"))
# The return value is not stored; presumably it was meant to be displayed as
# cell output - confirm.
model_builder.fetch_endpoint_names_for_base_model()
# Fresh "e2e-" name. `random` is not imported next to this code; it relies on
# the `import random` executed earlier in the notebook.
name = f"e2e-{random.randint(100, 10000)}"
model_builder.name = name
# Deploy to an endpoint with the generated name, passing "<name>-adapter" as
# the inference component name.
endpoint = model_builder.deploy(endpoint_name=name, inference_component_name=f"{name}-adapter")
sda  # NOTE(review): undefined name, so this raises NameError when run; presumably a leftover scratch cell or a deliberate "Run All" stopper - confirm, then remove or replace with an explicit stop
Part 2: Deploy from ModelPackage
This section demonstrates an alternative deployment workflow using SageMaker Model Registry. This approach is well suited to production environments, for the reasons listed below.
Model Registry Benefits:
Version Control: Track multiple versions of your models
Governance: Implement approval workflows before deployment
Reproducibility: Deploy the exact same model version across environments
Metadata Management: Store model metrics, lineage, and documentation
CI/CD Integration: Automate deployment pipelines with versioned artifacts
When to Use ModelPackages:
Production deployments requiring approval gates
Multi-environment deployments (dev, staging, prod)
Models shared across teams or accounts
Compliance and audit requirements
ModelPackages are automatically created when training jobs complete, or can be registered manually.
Create ModelPackage Resource
Instantiate a ModelPackage resource from the SageMaker Model Registry. This represents a versioned, registered model with:
ModelPackage Metadata:
Group: ‘test-finetuned-models’ (collection of related model versions)
Version: 3 (specific iteration of the fine-tuned model)
Status: Completed (ready for deployment)
Inference Specification:
Model artifacts location in S3
Base model reference (Llama 3.2 1B Instruct v0.0.3)
Recipe name for fine-tuning configuration
Container and runtime requirements
This ModelPackage was automatically created by the training job in Part 1, demonstrating the integration between training and model registry.
Build Model from ModelPackage
Use ModelBuilder with a ModelPackage resource instead of a TrainingJob. The process is similar but with key differences:
ModelPackage vs TrainingJob Deployment:
ModelPackage: Uses versioned, approved artifacts from Model Registry
TrainingJob: Uses artifacts directly from training output
Advantages of ModelPackage Approach:
Deploy any approved version, not just the latest training run
Rollback to previous versions easily
Deploy the same version across multiple environments
Leverage approval workflows and governance policies
ModelBuilder automatically resolves all necessary metadata from the ModelPackage, including model artifacts, base model references, and inference configurations.
import random
from sagemaker.serve import ModelBuilder
from sagemaker.core.resources import ModelPackage
# Generate a fresh "e2e-" name; it is used by the deploy call further down.
name = f"e2e-{random.randint(100, 1000000)}"
# Fetch the registered model package. The ARN is passed through the
# model_package_name argument; the "<>" in it looks like a redacted account
# id and presumably must be filled in before running - confirm.
model_package = ModelPackage.get(model_package_name="arn:aws:sagemaker:us-west-2:<>:model-package/test-finetuned-models-gamma/68")
# Same ModelBuilder flow as with a training job, but fed a ModelPackage.
model_builder = ModelBuilder(model=model_package)
# build() is called without a model name here and its result is not stored.
model_builder.build()
Deploy ModelPackage to Endpoint
Deploy the versioned ModelPackage to a new SageMaker real-time endpoint.
Deployment Characteristics:
Uses the exact model version specified in the ModelPackage
Maintains full traceability to the original training job
Can be deployed to multiple endpoints simultaneously
Supports the same deployment patterns (standalone or multi-adapter)
Production Best Practices:
Use ModelPackages for all production deployments
Implement approval workflows before deployment
Tag endpoints with model version for tracking
Monitor model performance and drift
The deployment process is identical to Part 1, but with the confidence that you’re deploying a versioned, approved model artifact.
# Stand up the endpoint for the model-package-backed builder, reusing the
# generated name as the endpoint name.
endpoint = model_builder.deploy(
    endpoint_name=name,
)
Bedrock Model Builder
import boto3
import json
# Create config.json for the Llama 3.2 1B model and upload it next to the
# training job's model output in S3.
config = {
    "architectures": ["LlamaForCausalLM"],
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 128000,
    "eos_token_id": 128001,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "intermediate_size": 8192,
    "max_position_embeddings": 131072,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 16,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": None,
    "rope_theta": 500000.0,
    "tie_word_embeddings": True,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.45.0",
    "use_cache": True,
    "vocab_size": 128256,
}

# Upload to S3. The region is pinned explicitly, like every other boto3
# client in this notebook, so the upload does not depend on the ambient
# AWS CLI configuration.
s3 = boto3.client('s3', region_name='us-west-2')
s3.put_object(
    Bucket='open-models-testing-pdx',
    Key='output/meta-textgeneration-llama-3-2-1b-instruct-sft-20251114104310/output/model/config.json',
    Body=json.dumps(config, indent=2),
    ContentType='application/json',
)
print("config.json uploaded successfully")
import boto3
import json
# Upload tokenizer_config.json alongside config.json in the same S3 prefix.
s3 = boto3.client('s3', region_name='us-west-2')
# NOTE: this rebinds `config`, replacing the model config dict defined earlier
# in the notebook.
config = {
    "add_bos_token": True,
    "add_eos_token": False,
    "bos_token": "<|begin_of_text|>",
    "eos_token": "<|end_of_text|>",
    "pad_token": "<|end_of_text|>",
    "model_max_length": 131072,
    "tokenizer_class": "LlamaTokenizer",
}
s3.put_object(
    Bucket="open-models-testing-pdx",
    Key="output/meta-textgeneration-llama-3-2-1b-instruct-sft-20251114104310/output/model/tokenizer_config.json",
    Body=json.dumps(config),
    # Same content type as the config.json upload, so both objects are
    # stored as JSON rather than with the default binary content type.
    ContentType="application/json",
)
! ada credentials update --provider=isengard --account=<> --role=Admin --profile=default --once
from sagemaker.core.resources import TrainingJob
import random
# Fetch a training job by name (region pinned to us-west-2) and generate an
# "e2e-" name that is then used as the Bedrock modelId below.
# NOTE(review): in the visible source `training_job` is only referenced by the
# commented-out BedrockModelBuilder lines below.
training_job = TrainingJob.get(training_job_name="11-21-llama33-70b-bbh-v1-2025-11-21-18-47-09-200", region="us-west-2")
name = f"e2e-{random.randint(100, 10000)}"
# bedrock_builder = BedrockModelBuilder(model=training_job)
# bedrock_builder.deploy(job_name=name, imported_model_name=name, role_arn="arn:aws:iam::<>:role/Admin")
# Assuming you previously did something like:
# bedrock_builder = BedrockModelBuilder(model_trainer)
# import_response = bedrock_builder.deploy(imported_model_name="my-custom-model-name", ...)
# Use the imported_model_name as the modelId for Bedrock inference
# NOTE(review): with the deploy call above commented out, `name` is a freshly
# generated random name; presumably invoke_model fails unless a model was
# imported under exactly that name - confirm.
# NOTE(review): the request body uses "inputText"/"textGenerationConfig" keys;
# confirm this is the schema the imported model expects.
# NOTE(review): `boto3` and `json` are not imported next to this code; it
# relies on the imports executed earlier in the notebook.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-west-2')
# The result is stored in `response` but not read anywhere in the visible source.
response = bedrock_runtime.invoke_model(
modelId=name, # This is the imported_model_name from your deploy call
body=json.dumps({
"inputText": "What is the capital of France?",
"textGenerationConfig": {
"maxTokenCount": 50
}
})
)
Summary
This notebook provided a comprehensive guide to deploying fine-tuned LLMs on Amazon SageMaker using two distinct workflows: deployment directly from a TrainingJob and deployment from a versioned ModelPackage.
Key Takeaways
Deployment Approaches:
TrainingJob → Endpoint: Direct deployment for rapid iteration and testing
ModelPackage → Endpoint: Versioned deployment for production governance
Deployment Patterns:
Standalone Endpoints: Dedicated resources, full isolation, simple management
Multi-Adapter Endpoints: Shared base model, cost-efficient, dynamic routing
Best Practices:
Use TrainingJob deployment for development and experimentation
Use ModelPackage deployment for production with approval workflows
Leverage multi-adapter deployment to reduce costs when serving multiple variants
Always test endpoints with sample requests before production traffic
Next Steps:
Implement monitoring and logging for production endpoints
Set up auto-scaling policies based on traffic patterns
Create CI/CD pipelines for automated model deployment
Explore model monitoring for drift detection and performance tracking
import boto3
# Clean up Bedrock model-import resources in us-west-2: stop import jobs that
# are still running, then delete every imported model. Both list calls are
# followed page by page via `nextToken`, so resources beyond the first page
# are handled too.
bedrock = boto3.client('bedrock', region_name='us-west-2')

# Stop in-progress model import jobs. Finished jobs are only reported: this
# loop never deletes a job, so the message says what actually happens.
import_jobs = bedrock.list_model_import_jobs()
while True:
    for job in import_jobs['modelImportJobSummaries']:
        job_arn = job['jobArn']
        if job['status'] in ['InProgress', 'Submitted']:
            print(f"Stopping import job: {job_arn}")
            # NOTE(review): confirm the installed boto3 Bedrock client exposes
            # stop_model_import_job; the call is kept exactly as it was.
            bedrock.stop_model_import_job(jobIdentifier=job_arn)
        else:
            print(f"Leaving import job ({job['status']}): {job_arn}")
    next_token = import_jobs.get('nextToken')
    if not next_token:
        break
    import_jobs = bedrock.list_model_import_jobs(nextToken=next_token)

# List and delete imported models
imported_models = bedrock.list_imported_models()
while True:
    for model in imported_models['modelSummaries']:
        model_arn = model['modelArn']
        print(f"Deleting imported model: {model_arn}")
        bedrock.delete_imported_model(modelIdentifier=model_arn)
    next_token = imported_models.get('nextToken')
    if not next_token:
        break
    imported_models = bedrock.list_imported_models(nextToken=next_token)