megatron/__init__.py  +1 −0

@@ -17,6 +17,7 @@ import torch
 from .global_vars import get_args
 from .global_vars import get_current_global_batch_size
 from .global_vars import get_num_microbatches
+from .global_vars import get_signal_handler
 from .global_vars import update_num_microbatches
 from .global_vars import get_tokenizer
 from .global_vars import get_tensorboard_writer
megatron/arguments.py  +6 −0

@@ -377,6 +377,9 @@ def _add_logging_args(parser):
     group.add_argument('--log-memory-to-tensorboard',
                        action='store_true',
                        help='Enable memory logging to tensorboard.')
+    group.add_argument('--log-world-size-to-tensorboard',
+                       action='store_true',
+                       help='Enable world size logging to tensorboard.')

     return parser

@@ -472,6 +475,9 @@ def _add_training_args(parser):
                        'by this value.')
     group.add_argument('--exit-duration-in-mins', type=int, default=None,
                        help='Exit the program after this many minutes.')
+    group.add_argument('--exit-signal-handler', action='store_true',
+                       help='Dynamically save the checkpoint and shutdown the '
+                       'training if SIGTERM is received')
     group.add_argument('--tensorboard-dir', type=str, default=None,
                        help='Write TensorBoard logs to this directory.')
     group.add_argument('--no-masked-softmax-fusion',
megatron/dist_signal_handler.py  (new file, mode 100644)  +81 −0

import signal

import torch


def get_world_size():
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        world_size = torch.distributed.get_world_size()
    else:
        world_size = 1
    return world_size


def get_device(local_rank=None):
    backend = torch.distributed.get_backend()
    if backend == 'nccl':
        if local_rank is None:
            device = torch.device('cuda')
        else:
            device = torch.device(f'cuda:{local_rank}')
    elif backend == 'gloo':
        device = torch.device('cpu')
    else:
        raise RuntimeError
    return device


def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None):
    if not torch.distributed.is_available() or \
       not torch.distributed.is_initialized():
        return [item]

    device = get_device(local_rank)

    if group is not None:
        group_size = group.size()
    else:
        group_size = get_world_size()

    tensor = torch.tensor([item], device=device, dtype=dtype)
    output_tensors = [
        torch.zeros(1, dtype=tensor.dtype, device=tensor.device)
        for _ in range(group_size)
    ]
    torch.distributed.all_gather(output_tensors, tensor, group, async_op)
    output = [elem.item() for elem in output_tensors]
    return output


class DistributedSignalHandler:
    def __init__(self, sig=signal.SIGTERM):
        self.sig = sig

    def signals_received(self):
        all_received = all_gather_item(
            self._signal_received, dtype=torch.int32
        )
        return all_received

    def __enter__(self):
        self._signal_received = False
        self.released = False
        self.original_handler = signal.getsignal(self.sig)

        def handler(signum, frame):
            self._signal_received = True

        signal.signal(self.sig, handler)

        return self

    def __exit__(self, type, value, tb):
        self.release()

    def release(self):
        if self.released:
            return False

        signal.signal(self.sig, self.original_handler)
        self.released = True

        return True
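For context, a minimal usage sketch (not part of the diff): each rank records locally whether it caught SIGTERM, and signals_received() all-gathers those flags so every rank reaches the same shutdown decision even when only one rank received the signal. The training-loop structure and the train_step callable below are hypothetical placeholders for illustration only.

import signal

from megatron.dist_signal_handler import DistributedSignalHandler

def train_with_graceful_exit(train_step, num_iterations):
    # Install the SIGTERM handler for the duration of training; the original
    # handler is restored when the context manager exits.
    with DistributedSignalHandler(signal.SIGTERM) as handler:
        for iteration in range(num_iterations):
            train_step(iteration)
            # signals_received() returns one flag per rank; stop as soon as
            # any rank reports that it caught the signal.
            if any(handler.signals_received()):
                print('SIGTERM received on some rank; exiting training loop.')
                break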
megatron/global_vars.py  +13 −0

@@ -21,6 +21,7 @@
 import time
 import torch

+from megatron import dist_signal_handler
 from megatron.tokenizer import build_tokenizer
 from .arguments import parse_args
 from .microbatches import build_num_microbatches_calculator

@@ -31,6 +32,7 @@
 _GLOBAL_TOKENIZER = None
 _GLOBAL_TENSORBOARD_WRITER = None
 _GLOBAL_ADLR_AUTORESUME = None
 _GLOBAL_TIMERS = None
+_GLOBAL_SIGNAL_HANDLER = None

 def get_args():

@@ -75,6 +77,14 @@ def get_timers():
     _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
     return _GLOBAL_TIMERS

+def get_signal_handler():
+    _ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
+    return _GLOBAL_SIGNAL_HANDLER
+
+
+def _set_signal_handler():
+    global _GLOBAL_SIGNAL_HANDLER
+    _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
+    _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__()

 def set_global_variables(extra_args_provider=None, args_defaults={},
                          ignore_unknown_args=False):

@@ -89,6 +99,9 @@ def set_global_variables(extra_args_provider=None, args_defaults={},
     _set_adlr_autoresume(args)
     _set_timers()

+    if args.exit_signal_handler:
+        _set_signal_handler()

 def _parse_args(extra_args_provider=None, defaults={},
                 ignore_unknown_args=False):
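The training-loop side of this wiring is not shown in this excerpt. The intended flow, sketched loosely below, is that passing --exit-signal-handler makes set_global_variables() install the handler, after which the loop can poll it via the exported get_signal_handler() accessor and save a checkpoint before exiting. The maybe_exit_on_sigterm function and the save_checkpoint_and_exit helper are illustrative assumptions, not code from this PR.

from megatron import get_args, get_signal_handler

def maybe_exit_on_sigterm(iteration, model, optimizer, save_checkpoint_and_exit):
    # Hypothetical per-iteration check, called from the training loop.
    args = get_args()
    if not args.exit_signal_handler:
        return
    # The all-gather inside signals_received() keeps the decision consistent
    # across ranks, so every rank saves and exits together.
    if any(get_signal_handler().signals_received()):
        save_checkpoint_and_exit(iteration, model, optimizer)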
megatron/initialize.py  +1 −1

@@ -180,7 +180,7 @@ def _initialize_distributed():
     torch.distributed.init_process_group(
         backend=args.distributed_backend,
         world_size=args.world_size, rank=args.rank,
-        timeout=timedelta(days=7))
+        timeout=timedelta(minutes=10))

     # Set the tensor model-parallel, pipeline model-parallel, and
     # data-parallel communicators.