Commit 98feae4e authored by mohammad

added allgather and allreduce

parent de6640be
+93 −12
@@ -57,6 +57,73 @@ def model_provider():
    return general_model_provider(False, False)



def get_group_world_size_rank():

    group = mpu.get_data_parallel_group()
    rank = torch.distributed.get_rank(group=group)
    world_size = torch.distributed.get_world_size(group=group)

    return group, rank, world_size


def get_rank_chunk_along_first_dim(tensor):

    group, rank, world_size = get_group_world_size_rank()

    assert tensor.shape[0] % world_size == 0
    dim_size = tensor.shape[0] // world_size
    output_list = torch.split(tensor, dim_size, dim=0)
    
    output = output_list[rank].contiguous()
    return output
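
As a shape-level illustration, here is a minimal local sketch of the chunking this helper performs; rank and world_size are hypothetical values rather than ones read from a process group:

import torch

world_size, rank = 4, 2                  # hypothetical, for illustration only

tensor = torch.arange(8).reshape(8, 1)   # first dim divisible by world_size
dim_size = tensor.shape[0] // world_size
chunk = torch.split(tensor, dim_size, dim=0)[rank].contiguous()

assert torch.equal(chunk, torch.tensor([[4], [5]]))  # rank 2 keeps rows 4-5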


class AllgatherFromDataParallelRegion(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input_):

        assert input_.dim() == 2
        group, rank, world_size = get_group_world_size_rank()

        # Gather the 2-D activation chunk from every data-parallel rank into
        # tensor_list (ordered by rank), then concatenate along the batch dim.
        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
        tensor_list[rank] = input_
        torch.distributed.all_gather(tensor_list, input_, group=group)

        output = torch.cat(tensor_list, dim=0).contiguous()

        return output


    @staticmethod
    def backward(ctx, grad_output):

        return get_rank_chunk_along_first_dim(grad_output)
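
The backward rule follows because the forward is a concatenation: only the rows this rank contributed depend on its input, so the gradient is the matching slice of grad_output. A minimal local sketch of that identity (shapes and names are illustrative, no process group involved):

import torch

rank, world_size, b, h = 1, 4, 2, 8
chunks = [torch.randn(b, h, requires_grad=True) for _ in range(world_size)]

output = torch.cat(chunks, dim=0)        # stand-in for the all_gather output
grad_output = torch.randn_like(output)
output.backward(grad_output)

# The gradient w.r.t. this rank's chunk is its slice of grad_output,
# i.e. exactly what get_rank_chunk_along_first_dim returns.
assert torch.equal(chunks[rank].grad, grad_output[rank * b:(rank + 1) * b])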


class AllReduceFromDataParallelRegion(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input_):

        assert input_.dim() == 2
        group, rank, world_size = get_group_world_size_rank()

        # Zero-pad the local chunk to the global batch size, then sum across
        # ranks; with zero padding the all_reduce reconstructs the full batch.
        tensor_list = [torch.zeros_like(input_) for _ in range(world_size)]
        tensor_list[rank] = input_
        output = torch.cat(tensor_list, dim=0).contiguous()
        torch.distributed.all_reduce(output, group=group)

        return output


    @staticmethod
    def backward(ctx, grad_output):

        return get_rank_chunk_along_first_dim(grad_output)
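
The zero-padded all_reduce above produces the same result as an all_gather: each rank places its chunk at its own offset and zeros elsewhere, so the elementwise sum across ranks reconstructs the concatenation. A local simulation of that equivalence, with the per-rank chunks made up for illustration:

import torch

world_size = 4
chunks = [torch.randn(2, 8) for _ in range(world_size)]  # one chunk per "rank"

padded = []
for rank, chunk in enumerate(chunks):
    buffers = [torch.zeros_like(chunk) for _ in range(world_size)]
    buffers[rank] = chunk
    padded.append(torch.cat(buffers, dim=0))

summed = torch.stack(padded).sum(dim=0)   # what the all_reduce computes
gathered = torch.cat(chunks, dim=0)       # what an all_gather computes
assert torch.equal(summed, gathered)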


def get_batch(data_iterator):
    # Items and their type.
    keys = ['query_tokens', 'query_pad_mask',
@@ -95,6 +162,9 @@ def forward_step(data_iterator, model):
    # Forward model.
    query_logits, block_logits = model(query_tokens, query_pad_mask, block_tokens, block_pad_mask)

    # Select how logits are combined across the data-parallel group.
    IMPLEMENTATION = 'original'

    if IMPLEMENTATION == 'original':
        data_parallel_size = dist.get_world_size() // args.model_parallel_size
        batch_size = query_logits.shape[0]
        global_batch_size = batch_size * data_parallel_size
@@ -111,6 +181,17 @@ def forward_step(data_iterator, model):
        dist.all_reduce(all_query_logits)
        dist.all_reduce(all_block_logits)

    elif IMPLEMENTATION == 'allgather':
        all_query_logits = AllgatherFromDataParallelRegion.apply(query_logits)
        all_block_logits = AllgatherFromDataParallelRegion.apply(block_logits)

    elif IMPLEMENTATION == 'allreduce':
        all_query_logits = AllReduceFromDataParallelRegion.apply(query_logits)
        all_block_logits = AllReduceFromDataParallelRegion.apply(block_logits)

    else:
        raise ValueError('unknown IMPLEMENTATION: {}'.format(IMPLEMENTATION))

    # scores are inner products between query and block embeddings
    retrieval_scores = all_query_logits.float().matmul(torch.transpose(all_block_logits, 0, 1).float())
    softmaxed = F.softmax(retrieval_scores, dim=1)
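
For reference, a shape-level sketch of this score computation: row i of retrieval_scores holds the inner products between query i's embedding and every block embedding in the global batch, and the softmax normalizes over blocks. The sizes below are illustrative:

import torch
import torch.nn.functional as F

global_batch, hidden = 16, 128                       # illustrative sizes
all_query_logits = torch.randn(global_batch, hidden)
all_block_logits = torch.randn(global_batch, hidden)

retrieval_scores = all_query_logits.float().matmul(
    torch.transpose(all_block_logits, 0, 1).float())  # [global_batch, global_batch]
softmaxed = F.softmax(retrieval_scores, dim=1)        # row i: scores over all blocks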