Commit ee4fd509, authored by Mohammad Shoeybi and committed by Jared Casper
Browse files

Changed the torch.distributed init method from tcp:// to env:// (the PyTorch default when init_method is omitted)

parent 78a69eaf
Loading
Loading
Loading
Loading
+2 −6
Original line number Diff line number Diff line
@@ -176,14 +176,10 @@ def _initialize_distributed():
                args.local_rank = device
            torch.cuda.set_device(device)
        # Call the init process
        init_method = 'tcp://'
        master_ip = os.getenv('MASTER_ADDR', 'localhost')
        master_port = os.getenv('MASTER_PORT', '6000')
        init_method += master_ip + ':' + master_port
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size, rank=args.rank,
            init_method=init_method)
            world_size=args.world_size, rank=args.rank)


    # Set the tensor model-parallel, pipeline model-parallel, and
    # data-parallel communicators.