megatron/checkpointing.py  (+5 −1)

@@ -205,12 +205,16 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
     try:
         state_dict = torch.load(checkpoint_name, map_location='cpu')
     except ModuleNotFoundError:
+        from megatron.fp16_deprecated import loss_scaler
         # For backward compatibility.
         print_rank_0(' > deserializing using the old code structure ...')
         sys.modules['fp16.loss_scaler'] = sys.modules[
-            'megatron.fp16.loss_scaler']
+            'megatron.fp16_deprecated.loss_scaler']
+        sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
+            'megatron.fp16_deprecated.loss_scaler']
         state_dict = torch.load(checkpoint_name, map_location='cpu')
         sys.modules.pop('fp16.loss_scaler', None)
+        sys.modules.pop('megatron.fp16.loss_scaler', None)
     except BaseException:
         print_rank_0('could not load the checkpoint')
         sys.exit()
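Note on why the aliasing works: pickle (and therefore torch.load) records each class as a (module name, class name) pair and looks the module up by name at load time, so pointing the old module names at the relocated module in sys.modules is enough to deserialize old checkpoints. A minimal, standalone sketch of the mechanism is below; the module names 'old_scalers' and 'new_scalers' are made up for illustration and are not part of Megatron or this patch.

import pickle
import sys
import types


class LossScaler:
    def __init__(self, scale=1):
        self.cur_scale = scale


# Phase 1: pretend the class originally lived in a module named 'old_scalers'
# (hypothetical name) and a pickle was written under that layout.
old_mod = types.ModuleType('old_scalers')
LossScaler.__module__ = 'old_scalers'
old_mod.LossScaler = LossScaler
sys.modules['old_scalers'] = old_mod
blob = pickle.dumps(LossScaler(scale=4096))   # records ('old_scalers', 'LossScaler')
del sys.modules['old_scalers']                # the old module path goes away

# Phase 2: the class now lives in 'new_scalers' (hypothetical name). Loading the
# old blob would fail, because pickle resolves the recorded module name as a string.
new_mod = types.ModuleType('new_scalers')
new_mod.LossScaler = LossScaler
sys.modules['new_scalers'] = new_mod

# The trick used by the patch: temporarily alias the old name to the new module,
# load, then clean the alias back out of sys.modules.
sys.modules['old_scalers'] = sys.modules['new_scalers']
restored = pickle.loads(blob)                 # resolved via the alias
sys.modules.pop('old_scalers', None)
print(restored.cur_scale)                     # -> 4096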
megatron/fp16_deprecated/loss_scaler.py 0 → 100755 +39 −0 Original line number Diff line number Diff line # coding=utf-8 # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """For backward compatibility, we need the class definitions to deserialize.""" class LossScaler: def __init__(self, scale=1): self.cur_scale = scale class DynamicLossScaler: def __init__(self, init_scale=2**32, scale_factor=2., scale_window=1000, min_scale=1, delayed_shift=1, consecutive_hysteresis=False): self.cur_scale = init_scale self.cur_iter = 0 self.last_overflow_iter = -1 self.scale_factor = scale_factor self.scale_window = scale_window self.min_scale = min_scale self.delayed_shift = delayed_shift self.cur_hysteresis = delayed_shift self.consecutive_hysteresis = consecutive_hysteresis
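These stubs only need to make the class objects importable: unpickling does not call __init__, it recreates the instance with __new__ and restores its attribute dict directly, so the minimal constructors above are sufficient for deserializing old checkpoints. A small hedged sketch of that behavior, assuming Megatron with this patch is importable:

import pickle
from megatron.fp16_deprecated.loss_scaler import DynamicLossScaler

# Mimic what unpickling does: bypass __init__ and set attributes directly.
scaler = DynamicLossScaler.__new__(DynamicLossScaler)
scaler.__dict__.update({'cur_scale': 65536.0, 'cur_iter': 10})

blob = pickle.dumps(scaler)
print(pickle.loads(blob).cur_scale)   # -> 65536.0, __init__ never runs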