SageMaker V3 HuggingFace Model Example#

This notebook demonstrates how to deploy HuggingFace models using SageMaker V3 ModelBuilder for text generation tasks.

Prerequisites#

Note: Ensure you have sagemaker and ipywidgets installed in your environment. The ipywidgets package is required to monitor endpoint deployment progress in Jupyter notebooks.

# Import required libraries
import json
import uuid

from sagemaker.serve.model_builder import ModelBuilder
from sagemaker.serve.spec.inference_spec import InferenceSpec
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.utils.types import ModelServer
from sagemaker.core.resources import EndpointConfig

Step 1: Define HuggingFace InferenceSpec#

Create a custom InferenceSpec for HuggingFace text generation models.

class HuggingFaceInferenceSpec(InferenceSpec):
    """Custom InferenceSpec for HuggingFace causal-LM text generation.

    Loads a tokenizer/model pair from the HuggingFace Hub and generates a
    short continuation for each request. If ``transformers`` is not
    installed, the spec falls back to a mock implementation so the demo
    remains runnable end to end.
    """

    def __init__(self, model_name: str = "microsoft/DialoGPT-small"):
        """
        Args:
            model_name: HuggingFace Hub model ID to load. Defaults to the
                small DialoGPT checkpoint used by this example, so existing
                callers are unaffected.
        """
        self.model_name = model_name

    def get_model(self):
        """Return the HuggingFace model ID for auto-detection."""
        return self.model_name

    def load(self, model_dir: str):
        """Load the HuggingFace model and tokenizer.

        Args:
            model_dir: Model directory supplied by the serving container
                (unused here; the model is pulled from the Hub by name).

        Returns:
            dict: ``{"model": ..., "tokenizer": ...}`` on success, or
            ``{"model_type": "huggingface_mock"}`` when ``transformers``
            is unavailable.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM

            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_pretrained(self.model_name)

            # GPT-style checkpoints ship without a pad token; reuse EOS so
            # generate() can pad without errors.
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            return {"model": model, "tokenizer": tokenizer}
        except ImportError:
            # Deliberate best-effort fallback: keep the demo deployable
            # without the transformers dependency installed.
            return {"model_type": "huggingface_mock"}

    def invoke(self, input_object, model):
        """Generate text for ``input_object`` with the loaded model.

        Args:
            input_object: Request payload; either a dict carrying an
                ``"inputs"`` key or any object convertible via ``str()``.
            model: The dict returned by :meth:`load`.

        Returns:
            list[dict]: ``[{"generated_text": ...}]``.
        """
        if isinstance(model, dict) and "model_type" in model:
            # Mock behavior for demo
            if isinstance(input_object, dict) and "inputs" in input_object:
                text = input_object["inputs"]
                return [{"generated_text": f"Mock response for: {text}"}]
            return [{"generated_text": "Mock response"}]

        # Real HuggingFace inference
        if isinstance(input_object, dict) and "inputs" in input_object:
            text = input_object["inputs"]
        else:
            text = str(input_object)

        tokenizer = model["tokenizer"]
        hf_model = model["model"]

        inputs = tokenizer.encode(text, return_tensors="pt")

        import torch
        # Inference only: disable autograd to avoid tracking gradients.
        with torch.no_grad():
            outputs = hf_model.generate(
                inputs,
                # Allow up to 20 new tokens beyond the prompt length.
                max_length=inputs.shape[1] + 20,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.7
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": response}]

# Confirm the spec class was defined without raising.
print("HuggingFace InferenceSpec defined successfully!")

Step 2: Create Schema Builder#

Define the input/output schema for the model.

# Schema builder: one example request and one example response pin down
# the endpoint's serialization format for text generation payloads.
sample_input = {"inputs": "Hello, how are you?"}
sample_output = [{"generated_text": "Hello, how are you? I'm doing well!"}]

schema_builder = SchemaBuilder(sample_input, sample_output)
print("Schema builder created successfully!")

Step 3: Configure ModelBuilder#

Set up the ModelBuilder with HuggingFace configuration.

# Configuration
MODEL_NAME_PREFIX = "hf-v3-example-model"
ENDPOINT_NAME_PREFIX = "hf-v3-example-endpoint"

# An 8-character hex suffix keeps resource names unique across notebook runs
# (the first 8 characters of a UUID4's canonical form are its hex digits).
unique_id = uuid.uuid4().hex[:8]
model_name = f"{MODEL_NAME_PREFIX}-{unique_id}"
endpoint_name = f"{ENDPOINT_NAME_PREFIX}-{unique_id}"

# Wire the custom inference spec and schema into a ModelBuilder backed by
# Multi Model Server, the server this example uses for HuggingFace models.
inference_spec = HuggingFaceInferenceSpec()
model_builder = ModelBuilder(
    schema_builder=schema_builder,
    inference_spec=inference_spec,
    model_server=ModelServer.MMS,
)

print(f"ModelBuilder configured for model: {model_name}")
print(f"Target endpoint: {endpoint_name}")

Step 4: Build the Model#

Build the model artifacts.

# Build the model
# Packages the inference spec and schema into a deployable SageMaker model
# under the unique name generated above.
core_model = model_builder.build(model_name=model_name)
print(f"Model Successfully Created: {core_model.model_name}")

Step 5: Deploy the Model#

Deploy to a SageMaker endpoint.

# Deploy the model
# Creates a live SageMaker endpoint for the model built above; this can take
# several minutes while the instance is provisioned.
core_endpoint = model_builder.deploy(endpoint_name=endpoint_name)
print(f"Endpoint Successfully Created: {core_endpoint.endpoint_name}")

Step 6: Test Text Generation#

Test the deployed model with various text generation tasks.

def _generation_test(label: str, payload: dict) -> None:
    """Send one JSON payload to the endpoint and print the decoded reply."""
    raw = core_endpoint.invoke(
        body=json.dumps(payload),
        content_type="application/json",
    )
    print(f"{label}: {json.loads(raw.body.read().decode('utf-8'))}")


# Test 1: Simple conversation
_generation_test("Conversation Test", {"inputs": "Hello, how are you today?"})

# Test 2: Creative writing
_generation_test("Creative Writing Test", {"inputs": "Once upon a time in a magical forest"})

Step 7: Clean Up Resources#

Clean up all created resources.

# Clean up resources
# NOTE(review): this fetches the EndpointConfig using the *endpoint* name —
# assumes ModelBuilder.deploy named the config after the endpoint; confirm
# against the V3 deploy behavior before relying on this in production.
core_endpoint_config = EndpointConfig.get(endpoint_config_name=core_endpoint.endpoint_name)

# Delete the model, endpoint, and endpoint config created above so no
# billable resources are left running after the demo.
core_model.delete()
core_endpoint.delete()
core_endpoint_config.delete()

print("All resources successfully deleted!")

Summary#

This notebook demonstrated:

  1. Creating a custom InferenceSpec for HuggingFace models

  2. Setting up schema builders for text generation

  3. Configuring ModelBuilder with Multi Model Server

  4. Deploying HuggingFace models to SageMaker endpoints

  5. Testing text generation capabilities

  6. Proper resource cleanup

The V3 ModelBuilder provides a flexible way to deploy any HuggingFace model with custom inference logic!