RLVR Example - Finetuning with Sagemaker#

This notebook demonstrates the basic user flow for RLVR fine-tuning starting from a model available in SageMaker JumpStart. Information on available JumpStart models: https://docs.aws.amazon.com/sagemaker/latest/dg/jumpstart-foundation-models-latest.html

Setup and Configuration#

Initialize the environment by importing necessary libraries and configuring AWS credentials

# Configure AWS credentials and region
#! ada credentials update --provider=isengard --account=<> --role=Admin --profile=default --once
#! aws configure set region us-west-2

from sagemaker.train.rlvr_trainer import RLVRTrainer
from sagemaker.train.configs import InputData
from rich import print as rprint
from rich.pretty import pprint
from sagemaker.core.resources import ModelPackage
from sagemaker.train.common import TrainingType


import boto3
import os
from sagemaker.core.helper.session_helper import Session

# For MLflow native metrics during Trainer wait, run the line below with the appropriate region
os.environ["SAGEMAKER_MLFLOW_CUSTOM_ENDPOINT"] = "https://mlflow.sagemaker.us-west-2.app.aws"

Prepare and Register Dataset#

Prepare and Register Dataset for Finetuning

from sagemaker.ai_registry.dataset import DataSet
from sagemaker.ai_registry.dataset_utils import CustomizationTechnique



# Register dataset in SageMaker AI Registry
# This creates a versioned dataset that can be referenced by ARN
# Provide a source (it can be local file path or S3 URL)
dataset = DataSet.create(
    name="demo-2",
    source="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl"  # JSONL training data in S3
)

# `dataset` / `dataset.arn` are reused by the trainer cells further down.
print(f"Dataset ARN: {dataset.arn}")

Create a Model Package group (if not already exists)#

from sagemaker.core.resources import ModelPackage, ModelPackageGroup

model_package_group=ModelPackageGroup.create(model_package_group_name="test-model-package-group")

Create RLVRTrainer#

Required Parameters

  • model: base_model id of SageMaker Hub content that is available to fine-tune (or) ModelPackage artifacts

Optional Parameters

  • custom_reward_function: Custom reward function/Evaluator ARN

  • model_package_group: ModelPackage group name or ModelPackageGroup object. This parameter is mandatory when a base model ID is provided, but optional when a model package is provided.

  • mlflow_resource_arn: MLFlow app ARN to track the training job

  • mlflow_experiment_name: MLFlow app experiment name(str)

  • mlflow_run_name: MLFlow app run name(str)

  • training_dataset: Training Dataset - should be a Dataset ARN or Dataset object (Please note training dataset is required for a training job to run, can be either provided via Trainer or .train())

  • validation_dataset: Validation Dataset - should be a Dataset ARN or Dataset object

  • s3_output_path: S3 path for the trained model artifacts

Reference#

Refer this doc for other models that support Model Customization: https://docs.aws.amazon.com/bedrock/latest/userguide/custom-model-supported.html

# For fine-tuning (prod)
# Trainer for a JumpStart base model; the registered dataset ARN is passed in
# here so .train() can be called without arguments later.
rlvr_trainer = RLVRTrainer(
    model="meta-textgeneration-llama-3-2-1b-instruct", # JumpStart base-model id
    model_package_group=model_package_group, # or use an existing model package group arn
    mlflow_experiment_name="test-rlvr-finetuned-models-exp", 
    mlflow_run_name="test-rlvr-finetuned-models-run", 
    training_dataset=dataset.arn, # dataset registered above
    s3_output_path="s3://mc-flows-sdk-testing/output/", # trained artifacts land here
    accept_eula=True # presumably required for gated/licensed base models — confirm in SDK docs
)

Discover and update Finetuning options#

Each technique and model has overridable hyperparameters that the user can fine-tune.

# Inspect the technique/model-specific hyperparameters before training.
print("Default Finetuning Options:")
pprint(rlvr_trainer.hyperparameters.to_dict()) # current hyperparameter values as a plain dict

#set options
# NOTE(review): get_info() presumably lists which hyperparameters are
# overridable (descriptions/allowed values) — confirm against the SDK docs.
rlvr_trainer.hyperparameters.get_info()

Start RLVR training#

training_job = rlvr_trainer.train(wait=True)
import re
from sagemaker.core.utils.utils import Unassigned
import json


def pretty_print(obj):
    """Print *obj* as indented JSON with SageMaker ``Unassigned`` noise removed.

    Recursively walks dicts/lists, dropping ``Unassigned`` sentinel objects and
    any entries that collapse to ``None`` after cleaning. Strings that embed an
    "Unassigned object" repr are best-effort parsed back into key/value dicts.
    Non-serializable leaves are stringified via ``default=str``.

    Args:
        obj: Any object; if it has a ``__dict__`` its attributes are printed,
            otherwise the object itself is cleaned and printed.
    """
    def parse_unassigned(item):
        # Unassigned sentinel -> None, so the enclosing container filters it out.
        if isinstance(item, Unassigned):
            return None
        if isinstance(item, dict):
            # Clean each value exactly once (the original cleaned twice — once in
            # the filter and once for the stored value — which is exponential in
            # nesting depth), then drop entries that collapsed to None.
            cleaned = {k: parse_unassigned(v) for k, v in item.items()}
            return {k: v for k, v in cleaned.items() if v is not None}
        if isinstance(item, list):
            cleaned = [parse_unassigned(x) for x in item]
            return [x for x in cleaned if x is not None]
        if isinstance(item, str) and "Unassigned object" in item:
            # Best-effort recovery of key=value pairs embedded in a repr string;
            # the leading [^<] skips "<...Unassigned object...>" fragments.
            pairs = re.findall(r"(\w+)=([^<][^=]*?)(?=\s+\w+=|$)", item)
            result = {k: v.strip("'\"") for k, v in pairs}
            return result if result else None
        return item

    cleaned = parse_unassigned(obj.__dict__ if hasattr(obj, '__dict__') else obj)
    print(json.dumps(cleaned, indent=2, default=str))

# Usage
pretty_print(training_job)
# NOTE(review): this starts the same training a second time — confirm the
# re-run is intentional and not a leftover from notebook iteration.
training_job = rlvr_trainer.train(wait=True)

View any Training job details#

We can get any training job details and its status with TrainingJob.get(…)

from sagemaker.core.resources import TrainingJob

# Look up an arbitrary job by name (hard-coded example from a prior run —
# replace with your own job name).
response = TrainingJob.get(training_job_name="meta-textgeneration-llama-3-2-3b-instruct-rlvr-20251123033517")
pretty_print(response)
# Refresh the in-memory job object with the latest status before printing.
training_job.refresh()
pretty_print(training_job)

Test RLVR with Custom RewardFunction#

Here we are providing a user-defined reward function ARN

Create a custom reward function#

from rich.pretty import pprint

from sagemaker.ai_registry.air_constants import REWARD_FUNCTION, REWARD_PROMPT
from sagemaker.ai_registry.evaluator import Evaluator

# Method : Lambda
# Register a Lambda-backed reward function as an Evaluator; account id and
# function-name suffix are redacted placeholders (<>).
evaluator = Evaluator.create(
    name = "sdk-new-rf11",
    source="arn:aws:lambda:us-west-2:<>:function:sm-eval-vinayshm-rlvr-llama-321b-instruct-v1-<>8",
    type=REWARD_FUNCTION

)

Use it with RLVR Trainer#

# For fine-tuning 
# Same flow as the first trainer, but scoring rollouts with the custom
# Lambda-backed reward function (the Evaluator created above).
rlvr_trainer = RLVRTrainer(
    model="meta-textgeneration-llama-3-2-1b-instruct", # Union[str, ModelPackage] 
    model_package_group="sdk-test-finetuned-models", # Make it Optional
    mlflow_experiment_name="test-rlvr-finetuned-models-exp", # Optional[str]
    mlflow_run_name="test-rlvr-finetuned-models-run", # Optional[str]
    training_dataset=dataset, # DataSet object also accepted (the first trainer passed the ARN)
    s3_output_path="s3://mc-flows-sdk-testing/output/",
    custom_reward_function=evaluator, # Evaluator object or ARN
    accept_eula=True
    
)
training_job = rlvr_trainer.train(wait=True)
#training_job.refresh()
pretty_print(training_job)

Continued Finetuning (or) Finetuning on Model Artifacts#

Discover a ModelPackage and get its details#

from rich import print as rprint
from rich.pretty import pprint
from sagemaker.core.resources import ModelPackage, ModelPackageGroup

#model_package_iter = ModelPackage.get_all(model_package_group_name="test-finetuned-models-gamma")
# Fetch a previously trained model package by full ARN (account id redacted);
# this becomes the base model for continued fine-tuning below.
model_package = ModelPackage.get(model_package_name="arn:aws:sagemaker:us-west-2:<>:model-package/test-finetuned-models-gamma/61")

pretty_print(model_package)

Create Trainer#

Trainer creation is the same as in the Finetuning section above, except that the model input is a ModelPackage (previously trained artifacts).

# For fine-tuning 
# Continued fine-tuning: the base model is a previously trained ModelPackage,
# trained further with LoRA.
rlvr_trainer = RLVRTrainer(
    model=model_package, # Union[str, ModelPackage] 
    training_type=TrainingType.LORA, 
    model_package_group="test-finetuned-models-gamma", #"test-finetuned-models", # Make it Optional
    mlflow_resource_arn="arn:aws:sagemaker:us-west-2:<>:mlflow-tracking-server/mmlu-eval-experiment",  # Optional[str] - MLflow app ARN (auto-resolved if not provided), can accept name and search in the account
    mlflow_experiment_name="test-rlvr-finetuned-models-exp", # Optional[str]
    mlflow_run_name="test-rlvr-finetuned-models-run", # Optional[str]
    training_dataset=dataset.arn,     #"arn:aws:sagemaker:us-west-2:<>:hub-content/AIRegistry/DataSet/MarketingDemoDataset1/1.0.0", #Optional[]
    s3_output_path="s3://open-models-testing-pdx/output", # note: different bucket than the earlier runs
    accept_eula=True
)

Start the Training#

# Start the continued-finetuning job and block until it completes.
training_job = rlvr_trainer.train(
    wait=True,
)
pretty_print(training_job)

Nova RLVR job#

# Nova models run in us-east-1; override the SDK region for this section.
import os
os.environ['SAGEMAKER_REGION'] = 'us-east-1'

# For fine-tuning 
# Nova variant: datasets are plain S3 URIs here (not registry ARNs), and the
# reward function is referenced by hub-content ARN.
rlvr_trainer = RLVRTrainer(
    model="nova-textgeneration-lite-v2", # Union[str, ModelPackage] 
    model_package_group="sdk-test-finetuned-models", #"test-finetuned-models", # Make it Optional
    #mlflow_resource_arn="arn:aws:sagemaker:us-east-1:<>:mlflow-app/app-UNBKLOAX64PX",  # Optional[str] - MLflow app ARN (auto-resolved if not provided), can accept name and search in the account
    mlflow_experiment_name="test-nova-rlvr-finetuned-models-exp", # Optional[str]
    mlflow_run_name="test-nova-rlvr-finetuned-models-run", # Optional[str]
    training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl",
    validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", # same file reused for validation in this demo
    s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/",
    custom_reward_function="arn:aws:sagemaker:us-east-1:<>:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1",
    accept_eula=True
)
# Hyperparameters can be overridden by attribute assignment before train().
# NOTE(review): the bare to_dict() calls only display output in a notebook
# cell — wrap in print()/pprint() if running as a script.
rlvr_trainer.hyperparameters.to_dict()
rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket'

rlvr_trainer.hyperparameters.reward_lambda_arn = 'arn:aws:lambda:us-east-1:<>:function:rlvr-nova-reward-function'
rlvr_trainer.hyperparameters.to_dict()
training_job = rlvr_trainer.train(wait=True)