Commit b462d254 authored by Jared Casper

Merge branch 'slym/allreduce-overlap' into 'main'

use low-priority stream for nccl overlapping all-reduce and gemm

See merge request ADLR/megatron-lm!319
parents 6e1bde1e f3e57f6f
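
For context, the removed code below is what had routed NCCL collectives onto a high-priority CUDA stream whenever the async tensor-model-parallel all-reduce was enabled. A minimal sketch of that mechanism, using the same ProcessGroupNCCL.Options API the diff touches; the single-rank env:// rendezvous values are assumptions for the example only:

    # Sketch: schedule NCCL collectives on a high-priority CUDA stream via
    # ProcessGroupNCCL.Options, as the removed code below did. The
    # single-rank env:// rendezvous is an assumption for this example only.
    import os
    from datetime import timedelta

    import torch
    from torch._C._distributed_c10d import ProcessGroupNCCL

    os.environ.setdefault('MASTER_ADDR', 'localhost')  # assumption for the example
    os.environ.setdefault('MASTER_PORT', '29500')      # assumption for the example

    pg_options = ProcessGroupNCCL.Options()
    pg_options.is_high_priority_stream = True  # NCCL kernels go on a high-priority stream
    pg_options._timeout = timedelta(days=7)

    torch.distributed.init_process_group(
        backend='nccl', world_size=1, rank=0,
        timeout=timedelta(days=7), pg_options=pg_options)

With is_high_priority_stream set, CUDA schedules the NCCL kernels ahead of default-priority compute kernels; dropping the option, as this merge does, leaves the collectives at default priority, which the merge title describes as the low-priority stream.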
+5 −16
@@ -176,22 +176,11 @@ def _initialize_distributed():
             else:
                 args.local_rank = device
             torch.cuda.set_device(device)
-        # Increase cuda stream priority of NCCL ops when overlapping with other ops
-        if (not args.no_async_tensor_model_parallel_allreduce and
-                args.tensor_model_parallel_size > 1):
-            from torch._C._distributed_c10d import ProcessGroupNCCL
-
-            pg_options = ProcessGroupNCCL.Options()
-            pg_options.is_high_priority_stream = True
-            pg_options._timeout = timedelta(days=7)
-        else:
-            pg_options = None
-        # Call the init process
-        torch.distributed.init_process_group(
-            backend=args.distributed_backend,
-            world_size=args.world_size, rank=args.rank,
-            timeout=timedelta(days=7),
-            pg_options=pg_options)
+        # Call the init process
+        torch.distributed.init_process_group(
+            backend=args.distributed_backend,
+            world_size=args.world_size, rank=args.rank,
+            timeout=timedelta(days=7))
 
     # Set the tensor model-parallel, pipeline model-parallel, and
     # data-parallel communicators.
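
The overlap pattern itself is unchanged by this merge: the point of a lower-priority communication stream is that the all-reduce can proceed concurrently without preempting the GEMM it overlaps. A minimal sketch of that pattern with generic torch.distributed and torch.cuda stream calls; this is an illustration under assumed names and shapes, not Megatron-LM's actual implementation:

    # Sketch: overlap an asynchronous all-reduce with an independent GEMM by
    # issuing the collective from its own (default-priority) stream. Assumes
    # torch.distributed.init_process_group('nccl', ...) has already run, e.g.
    # as in the diff above. Shapes and names are illustrative.
    import torch
    import torch.distributed as dist

    device = torch.device('cuda')
    grad = torch.randn(4096, 4096, device=device)
    a = torch.randn(4096, 4096, device=device)
    b = torch.randn(4096, 4096, device=device)

    # priority=0 is the default (lowest) CUDA stream priority; negative
    # values are higher priority.
    comm_stream = torch.cuda.Stream(device=device, priority=0)

    comm_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(comm_stream):
        handle = dist.all_reduce(grad, async_op=True)

    c = a @ b  # GEMM runs on the default stream while NCCL communicates

    handle.wait()  # make the current stream wait for the all-reduce result

Because the collective is launched at default priority, the CUDA scheduler does not stall the concurrently running matmul in favor of the communication kernels.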