# Source code for sagemaker.core.debugger.profiler_config

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Configuration for collecting system and framework metrics in SageMaker training jobs."""
from __future__ import absolute_import

import logging
from typing import Optional, Union

from sagemaker.core.debugger.framework_profile import FrameworkProfile
from sagemaker.core.debugger.profiler import Profiler
from sagemaker.core.helper.pipeline_variable import PipelineVariable
from sagemaker.core.deprecations import deprecation_warn_base

logger = logging.getLogger(__name__)


class ProfilerConfig(object):
    """Configuration for collecting system and framework metrics of SageMaker training jobs.

    SageMaker Debugger collects system and framework profiling
    information of training jobs and identifies performance bottlenecks.

    """

    def __init__(
        self,
        s3_output_path: Optional[Union[str, PipelineVariable]] = None,
        system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
        framework_profile_params: Optional[FrameworkProfile] = None,
        profile_params: Optional[Profiler] = None,
        disable_profiler: Optional[Union[bool, PipelineVariable]] = False,
    ):
        """Initialize a ``ProfilerConfig`` instance.

        Pass the output of this class
        to the ``profiler_config`` parameter of the generic
        :class:`~sagemaker.estimator.Estimator` class and SageMaker Framework estimators.

        Args:
            s3_output_path (str or PipelineVariable): The location in Amazon S3 to store
                the output.
                The default Debugger output path for profiling data is created under the
                default output path of the :class:`~sagemaker.estimator.Estimator` class.
                For example,
                s3://sagemaker-<region>-<12digit_account_id>/<training-job-name>/profiler-output/.
            system_monitor_interval_millis (int or PipelineVariable): The time interval in
                milliseconds to collect system metrics. Available values are 100, 200, 500,
                1000 (1 second), 5000 (5 seconds), and 60000 (1 minute) milliseconds.
                The default is 500 milliseconds.
            framework_profile_params (:class:`~sagemaker.debugger.FrameworkProfile`):
                (Deprecated) A parameter object for framework metrics profiling. Configure it
                using the :class:`~sagemaker.debugger.FrameworkProfile` class.
                To use the default framework profile parameters, pass ``FrameworkProfile()``.
                For more information about the default values,
                see :class:`~sagemaker.debugger.FrameworkProfile`.
            profile_params (an object of :class:`sagemaker.Profiler`): Pass this parameter
                to activate SageMaker Profiler using the :class:`sagemaker.Profiler` class.
                Cannot be combined with ``framework_profile_params``.
            disable_profiler (bool): Switch the basic monitoring on or off using this
                parameter. The default is ``False``.

        Raises:
            AssertionError: If ``framework_profile_params`` is given but is not a
                ``FrameworkProfile``, or ``profile_params`` is given but is not a
                ``Profiler``.
            ValueError: If both ``profile_params`` and ``framework_profile_params``
                are provided.

        **Basic profiling using SageMaker Debugger**

        By default, if you submit training jobs using SageMaker Python SDK's estimator
        classes, SageMaker runs basic profiling automatically.
        The following example shows the basic profiling configuration
        that you can utilize to update the time interval for collecting system resource
        utilization.

        .. code:: python

            import sagemaker
            from sagemaker.pytorch import PyTorch
            from sagemaker.debugger import ProfilerConfig

            profiler_config = ProfilerConfig(
                system_monitor_interval_millis = 500
            )

            estimator = PyTorch(
                framework_version="2.0.0",
                ... # Set up other essential parameters for the estimator class
                profiler_config=profiler_config
            )

        For a complete instruction on activating and using SageMaker Debugger, see
        `Monitor AWS compute resource utilization in Amazon SageMaker Studio
        <https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html>`_.

        **Deep profiling using SageMaker Profiler**

        The following example shows an example configuration for activating
        SageMaker Profiler.

        .. code:: python

            import sagemaker
            from sagemaker.pytorch import PyTorch
            from sagemaker import ProfilerConfig, Profiler

            profiler_config = ProfilerConfig(
                profile_params = Profiler(cpu_profiling_duration=3600)
            )

            estimator = PyTorch(
                framework_version="2.0.0",
                ... # Set up other essential parameters for the estimator class
                profiler_config=profiler_config
            )

        For a complete instruction on activating and using SageMaker Profiler, see
        `Use Amazon SageMaker Profiler to profile activities on AWS compute resources
        <https://docs.aws.amazon.com/sagemaker/latest/dg/train-profile-computational-performance.html>`_.

        """
        # NOTE: these checks use ``assert`` and are therefore skipped when Python
        # runs with ``-O``. They are kept as asserts so that callers relying on
        # ``AssertionError`` keep working.
        assert framework_profile_params is None or isinstance(
            framework_profile_params, FrameworkProfile
        ), "framework_profile_params must be of type FrameworkProfile if specified."

        assert profile_params is None or isinstance(
            profile_params, Profiler
        ), "profile_params must be of type Profiler if specified."

        # Both settings are emitted under the same "ProfilingParameters" request key,
        # so they cannot be enabled together.
        if profile_params and framework_profile_params:
            raise ValueError("Profiler will not work when Framework Profiler is ON")

        self.s3_output_path = s3_output_path
        self.system_monitor_interval_millis = system_monitor_interval_millis
        self.framework_profile_params = framework_profile_params
        self.profile_params = profile_params
        self.disable_profiler = disable_profiler

        if self.framework_profile_params is not None:
            deprecation_warn_base(
                "Framework profiling will be deprecated from tensorflow 2.12 and pytorch 2.0"
            )

    def _to_request_dict(self):
        """Generate a request dictionary using the parameters provided when initializing the object.

        ``DisableProfiler`` is always included. ``S3OutputPath`` is included only when
        an output path was set and ``disable_profiler`` is exactly ``False``. The
        remaining keys are included only when the matching attribute is not ``None``.

        Returns:
            dict: A portion of an API request as a dictionary.

        """
        profiler_config_request = {}

        # ``is False`` is an identity check: ``None``, ``True`` and non-bool values
        # (for example a PipelineVariable) all leave the output path out.
        if self.s3_output_path is not None and self.disable_profiler is False:
            profiler_config_request["S3OutputPath"] = self.s3_output_path

        profiler_config_request["DisableProfiler"] = self.disable_profiler

        if self.system_monitor_interval_millis is not None:
            profiler_config_request["ProfilingIntervalInMilliseconds"] = (
                self.system_monitor_interval_millis
            )

        if self.framework_profile_params is not None:
            profiler_config_request["ProfilingParameters"] = (
                self.framework_profile_params.profiling_parameters
            )

        # Written after the framework parameters, so it takes precedence if both
        # attributes were assigned after construction.
        if self.profile_params is not None:
            profiler_config_request["ProfilingParameters"] = (
                self.profile_params.profiling_parameters
            )

        return profiler_config_request

    @classmethod
    def _to_profiler_disabled_request_dict(cls):
        """Generate a request dictionary for updating the training job to disable profiler.

        Returns:
            dict: A portion of an API request as a dictionary.

        """
        profiler_config_request = {"DisableProfiler": True}
        return profiler_config_request