from pprint import pprint

from sagemaker.core.resources import TrainingJob, HubContent, InferenceComponent, ModelPackage
from sagemaker.core.utils.utils import Unassigned
! ada credentials update --provider=isengard --account=<> --role=Admin --profile=default --once
! aws configure set region  us-west-2
from sagemaker.core.resources import Endpoint

# Clean-up pass: walk every endpoint returned by Endpoint.get_all() and
# delete the ones whose name begins with the 'e2e-' prefix.
for endpoint in Endpoint.get_all():
    if not endpoint.endpoint_name.startswith('e2e-'):
        continue  # name does not carry the prefix; leave it untouched
    endpoint.delete()
from sagemaker.core.resources import TrainingJob, HubContent, InferenceComponent, ModelPackage
from sagemaker.core.utils.utils import Unassigned

# Scan the training jobs in us-west-2 and, for each one that references a
# model package, print the job ARN and the container image declared by the
# first container of the package's inference specification.
for training_job in TrainingJob.get_all(region="us-west-2"):
    package_arn = training_job.output_model_package_arn
    if isinstance(package_arn, Unassigned):
        continue  # no model package ARN is set on this job
    try:
        model_package = ModelPackage.get(package_arn)
        image = model_package.inference_specification.containers[0].image
    except Exception as err:
        # The scan stays best-effort (one bad job must not abort the loop),
        # but the failure is reported instead of silently discarded, and
        # KeyboardInterrupt/SystemExit are no longer swallowed as they were
        # by the previous bare `except:`.
        print(f"Skipping {training_job.training_job_arn}: {err!r}")
        continue
    if not isinstance(image, Unassigned) and image is not None:
        print(training_job.training_job_arn)
        print(image)
from sagemaker.core.resources import TrainingJob
import random
# Fetch the fine-tuning job that will be deployed and print the ARN of its
# output model package.
training_job = TrainingJob.get(
    training_job_name="meta-textgeneration-llama-3-2-1b-instruct-sft-20251123162832",
)
print(training_job.output_model_package_arn)

# Resource name with a random numeric suffix, so that name collisions
# between runs are unlikely.
name = "e2e-" + str(random.randint(100, 10000))
from sagemaker.serve import ModelBuilder
# Wrap the training job in a ModelBuilder, build a model under the generated
# name, and print the ARN of the resulting model.
model_builder = ModelBuilder(model=training_job)
model = model_builder.build(model_name=name)
print(model.model_arn)
import random
# Deploy the built model to an endpoint that uses the same generated name.
endpoint = model_builder.deploy(endpoint_name=name)
from sagemaker.core.resources import InferenceComponent, Tag
from pprint import pprint

# Print the ARN and tags of every inference component on the endpoint.
# The endpoint is looked up through the generated `name` rather than a
# literal endpoint name copied from an earlier run, which would not match a
# freshly generated random name.
for inference_component in InferenceComponent.get_all(endpoint_name_equals=name):
    component_arn = inference_component.inference_component_arn
    print(component_arn)
    for tag in Tag.get_all(resource_arn=component_arn):
        pprint(tag)
import json
# NOTE: this call is expected to fail, because Endpoint invoke is only
# available for authorized users. The invoke used here is the sagemaker-core
# Endpoint.invoke call.
print(endpoint.endpoint_arn)
request_payload = {
    "inputs": "What is the capital of France?",
    "parameters": {"max_new_tokens": 50},
}
endpoint.invoke(body=json.dumps(request_payload))
from sagemaker.core.resources import TrainingJob
from sagemaker.serve import ModelBuilder

# Second deployment of the same fine-tuned training job, this time passing an
# explicit inference component name ("<name>-adapter") to deploy().
model_builder = ModelBuilder(
    model=TrainingJob.get(
        training_job_name="meta-textgeneration-llama-3-2-1b-instruct-sft-20251123162832"
    )
)
# NOTE(review): the return value is discarded; presumably this call is made
# for its effect on the builder's state -- confirm against ModelBuilder.
model_builder.fetch_endpoint_names_for_base_model()
name = f"e2e-{random.randint(100, 10000)}"
model_builder.name = name
endpoint = model_builder.deploy(endpoint_name=name, inference_component_name=f"{name}-adapter")

Part 2: Deploy from ModelPackage

This section demonstrates an alternative deployment workflow using SageMaker Model Registry. This approach is well suited to production environments, where the registry provides the benefits listed below.

Model Registry Benefits:

  • Version Control: Track multiple versions of your models

  • Governance: Implement approval workflows before deployment

  • Reproducibility: Deploy the exact same model version across environments

  • Metadata Management: Store model metrics, lineage, and documentation

  • CI/CD Integration: Automate deployment pipelines with versioned artifacts

When to Use ModelPackages:

  • Production deployments requiring approval gates

  • Multi-environment deployments (dev, staging, prod)

  • Models shared across teams or accounts

  • Compliance and audit requirements

ModelPackages are automatically created when training jobs complete, or can be registered manually.

Create ModelPackage Resource

Instantiate a ModelPackage resource from the SageMaker Model Registry. This represents a versioned, registered model with:

ModelPackage Metadata:

  • Group: ‘test-finetuned-models-gamma’ (collection of related model versions)

  • Version: 68 (specific iteration of the fine-tuned model)

  • Status: Completed (ready for deployment)

Inference Specification:

  • Model artifacts location in S3

  • Base model reference (Llama 3.2 1B Instruct v0.0.3)

  • Recipe name for fine-tuning configuration

  • Container and runtime requirements

This ModelPackage was automatically created by the training job in Part 1, demonstrating the integration between training and model registry.

Build Model from ModelPackage

Use ModelBuilder with a ModelPackage resource instead of a TrainingJob. The process is similar but with key differences:

ModelPackage vs TrainingJob Deployment:

  • ModelPackage: Uses versioned, approved artifacts from Model Registry

  • TrainingJob: Uses artifacts directly from training output

Advantages of ModelPackage Approach:

  • Deploy any approved version, not just the latest training run

  • Rollback to previous versions easily

  • Deploy the same version across multiple environments

  • Leverage approval workflows and governance policies

ModelBuilder automatically resolves all necessary metadata from the ModelPackage, including model artifacts, base model references, and inference configurations.

import random
from sagemaker.serve import ModelBuilder

from sagemaker.core.resources import ModelPackage

# Pick a fresh resource name, load the registered model package by its ARN,
# and build a model from it with ModelBuilder.
name = "e2e-" + str(random.randint(100, 1000000))
model_package = ModelPackage.get(
    model_package_name="arn:aws:sagemaker:us-west-2:<>:model-package/test-finetuned-models-gamma/68",
)
model_builder = ModelBuilder(model=model_package)
model_builder.build()

Deploy ModelPackage to Endpoint

Deploy the versioned ModelPackage to a new SageMaker real-time endpoint. This deployment:

Deployment Characteristics:

  • Uses the exact model version specified in the ModelPackage

  • Maintains full traceability to the original training job

  • Can be deployed to multiple endpoints simultaneously

  • Supports the same deployment patterns (standalone or multi-adapter)

Production Best Practices:

  • Use ModelPackages for all production deployments

  • Implement approval workflows before deployment

  • Tag endpoints with model version for tracking

  • Monitor model performance and drift

The deployment process is identical to Part 1, but with the confidence that you’re deploying a versioned, approved model artifact.

# Deploy the model built from the ModelPackage to an endpoint called `name`.
endpoint = model_builder.deploy(endpoint_name=name)

Bedrock Model Builder

import boto3
import json

# Contents of the config.json file for the Llama 3.2 1B model. Key order is
# kept as-is because json.dumps writes keys in insertion order.
config = {
    "architectures": ["LlamaForCausalLM"],
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 128000,
    "eos_token_id": 128001,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "intermediate_size": 8192,
    "max_position_embeddings": 131072,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 16,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": None,
    "rope_theta": 500000.0,
    "tie_word_embeddings": True,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.45.0",
    "use_cache": True,
    "vocab_size": 128256
}

# Serialize once, then write the document into the training job's model
# output prefix in S3 as a JSON object.
config_json = json.dumps(config, indent=2)
s3 = boto3.client('s3')
s3.put_object(
    Bucket='open-models-testing-pdx',
    Key='output/meta-textgeneration-llama-3-2-1b-instruct-sft-20251114104310/output/model/config.json',
    Body=config_json,
    ContentType='application/json'
)

print("config.json uploaded successfully")
import boto3
import json

# Upload a tokenizer_config.json into the same S3 model output prefix that
# holds config.json.
s3 = boto3.client('s3', region_name='us-west-2')
config = {
    "add_bos_token": True,
    "add_eos_token": False,
    "bos_token": "<|begin_of_text|>",
    "eos_token": "<|end_of_text|>",
    "pad_token": "<|end_of_text|>",
    "model_max_length": 131072,
    "tokenizer_class": "LlamaTokenizer",
}
s3.put_object(
    Bucket="open-models-testing-pdx",
    Key="output/meta-textgeneration-llama-3-2-1b-instruct-sft-20251114104310/output/model/tokenizer_config.json",
    Body=json.dumps(config),
    # Declare the payload as JSON, as the config.json upload does; without
    # an explicit ContentType S3 stores the object as binary/octet-stream.
    ContentType="application/json",
)
! ada credentials update --provider=isengard --account=<> --role=Admin --profile=default --once
from sagemaker.core.resources import TrainingJob
import random


# Training job whose model is meant to be served through Bedrock, plus a
# randomly suffixed name for the imported model.
training_job = TrainingJob.get(
    training_job_name="11-21-llama33-70b-bbh-v1-2025-11-21-18-47-09-200",
    region="us-west-2",
)
name = "e2e-" + str(random.randint(100, 10000))

# Prerequisite, left disabled here: the model has to be imported into Bedrock
# before it can be invoked, along the lines of
#   bedrock_builder = BedrockModelBuilder(model=training_job)
#   bedrock_builder.deploy(job_name=name, imported_model_name=name, role_arn="arn:aws:iam::<>:role/Admin")
# The imported_model_name given to deploy() is the value used as modelId below.

bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-west-2')

# NOTE(review): the request schema depends on the model that was imported --
# confirm that this inputText/textGenerationConfig shape suits it.
request_body = json.dumps({
    "inputText": "What is the capital of France?",
    "textGenerationConfig": {
        "maxTokenCount": 50
    }
})
response = bedrock_runtime.invoke_model(
    modelId=name,  # the imported_model_name from the deploy call
    body=request_body,
)

Summary

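This notebook provided a guide to deploying fine-tuned LLMs on Amazon SageMaker using two distinct workflows, TrainingJob → Endpoint and ModelPackage → Endpoint, and also touched on serving a fine-tuned model through Amazon Bedrock.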

Key Takeaways

Deployment Approaches:

  1. TrainingJob → Endpoint: Direct deployment for rapid iteration and testing

  2. ModelPackage → Endpoint: Versioned deployment for production governance

Deployment Patterns:

  • Standalone Endpoints: Dedicated resources, full isolation, simple management

  • Multi-Adapter Endpoints: Shared base model, cost-efficient, dynamic routing

Best Practices:

  • Use TrainingJob deployment for development and experimentation

  • Use ModelPackage deployment for production with approval workflows

  • Leverage multi-adapter deployment to reduce costs when serving multiple variants

  • Always test endpoints with sample requests before production traffic

Next Steps:

  • Implement monitoring and logging for production endpoints

  • Set up auto-scaling policies based on traffic patterns

  • Create CI/CD pipelines for automated model deployment

  • Explore model monitoring for drift detection and performance tracking

import boto3

bedrock = boto3.client('bedrock', region_name='us-west-2')

# Stop model import jobs that are still running. Jobs in any other state are
# left alone (the original note here says import jobs clean up on their own).
# The listing is paginated so that jobs beyond the first page of results are
# not missed.
active_statuses = ('InProgress', 'Submitted')
for page in bedrock.get_paginator('list_model_import_jobs').paginate():
    for job in page['modelImportJobSummaries']:
        if job['status'] not in active_statuses:
            continue
        job_arn = job['jobArn']
        print(f"Stopping import job: {job_arn}")
        bedrock.stop_model_import_job(jobIdentifier=job_arn)

# Delete every imported model. All ARNs are collected before the first
# delete so the listing is never read while it is being modified.
model_arns = []
for page in bedrock.get_paginator('list_imported_models').paginate():
    model_arns.extend(summary['modelArn'] for summary in page['modelSummaries'])

for model_arn in model_arns:
    print(f"Deleting imported model: {model_arn}")
    bedrock.delete_imported_model(modelIdentifier=model_arn)