Commit ba385ae2 authored by Matt Arsenault's avatar Matt Arsenault Committed by Matt Arsenault
Browse files

Reapply "RegisterCoalescer: Add implicit-def of super register when coalescing SUBREG_TO_REG"

This reverts commit e0f86ca2.

This was hitting some assertions which have since been relaxed.
parent d2752775
Loading
Loading
Loading
Loading
+42 −9
Original line number Diff line number Diff line
@@ -305,7 +305,11 @@ namespace {
    /// number if it is not zero. If DstReg is a physical register and the
    /// existing subregister number of the def / use being updated is not zero,
    /// make sure to set it to the correct physical subregister.
    void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx);
    ///
    /// If \p IsSubregToReg, we are coalescing a DstReg = SUBREG_TO_REG
    /// SrcReg. This introduces an implicit-def of DstReg on coalesced users.
    void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx,
                           bool IsSubregToReg);

    /// If the given machine operand reads only undefined lanes add an undef
    /// flag.
@@ -1323,8 +1327,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
    if (DstReg.isPhysical()) {
      Register NewDstReg = DstReg;

      unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
                                              DefMI->getOperand(0).getSubReg());
      unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx);
      if (NewDstIdx)
        NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);

@@ -1473,7 +1476,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
    MRI->setRegClass(DstReg, NewRC);

    // Update machine operands and add flags.
    updateRegDefsUses(DstReg, DstReg, DstIdx);
    updateRegDefsUses(DstReg, DstReg, DstIdx, false);
    NewMI.getOperand(0).setSubReg(NewIdx);
    // updateRegDefsUses can add an "undef" flag to the definition, since
    // it will replace DstReg with DstReg.DstIdx. If NewIdx is 0, make
@@ -1788,7 +1791,7 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
}

void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
                                          unsigned SubIdx) {
                                          unsigned SubIdx, bool IsSubregToReg) {
  bool DstIsPhys = DstReg.isPhysical();
  LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);

@@ -1828,6 +1831,8 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
    if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr())
      Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));

    bool FullDef = true;

    // Replace SrcReg with DstReg in all UseMI operands.
    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
      MachineOperand &MO = UseMI->getOperand(Ops[i]);
@@ -1835,9 +1840,13 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
      // Adjust <undef> flags in case of sub-register joins. We don't want to
      // turn a full def into a read-modify-write sub-register def and vice
      // versa.
      if (SubIdx && MO.isDef())
      if (SubIdx && MO.isDef()) {
        MO.setIsUndef(!Reads);

        if (!Reads)
          FullDef = false;
      }

      // A subreg use of a partially undef (super) register may be a complete
      // undef use now and then has to be marked that way.
      if (MO.isUse() && !DstIsPhys) {
@@ -1869,6 +1878,25 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
        MO.substVirtReg(DstReg, SubIdx, *TRI);
    }

    if (IsSubregToReg && !FullDef) {
      // If the coalesced instruction doesn't fully define the register, we need
      // to preserve the original super register liveness for SUBREG_TO_REG.
      //
      // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes,
      // but it introduces liveness for other subregisters. Downstream users may
      // have been relying on those bits, so we need to ensure their liveness is
      // captured with a def of other lanes.

      // FIXME: Need to add new subrange if tracking subranges. We could also
      // skip adding this if we knew the other lanes are dead, and only for
      // other lanes.

      assert(!MRI->shouldTrackSubRegLiveness(DstReg) &&
             "this should update subranges");
      MachineInstrBuilder MIB(*MF, UseMI);
      MIB.addReg(DstReg, RegState::ImplicitDefine);
    }

    LLVM_DEBUG({
      dbgs() << "\t\tupdated: ";
      if (!UseMI->isDebugInstr())
@@ -2068,6 +2096,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
    });
  }

  const bool IsSubregToReg = CopyMI->isSubregToReg();

  ShrinkMask = LaneBitmask::getNone();
  ShrinkMainRange = false;

@@ -2135,9 +2165,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {

  // Rewrite all SrcReg operands to DstReg.
  // Also update DstReg operands to include DstIdx if it is set.
  if (CP.getDstIdx())
    updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
  updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
  if (CP.getDstIdx()) {
    assert(!IsSubregToReg && "can this happen?");
    updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx(), false);
  }
  updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(),
                    IsSubregToReg);

  // Shrink subregister ranges if necessary.
  if (ShrinkMask.any()) {
+46 −46

File changed.

Preview size limit exceeded, changes collapsed.

+185 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=x86_64-grtev4-linux-gnu < %s | FileCheck %s

; Chain of nested wrapper types. Only %struct.wibble and %struct.snork.3 are
; named directly by the getelementptr instructions in @foo; every other type
; here is reached through the nesting of %struct.wibble.
%struct.wibble = type { %struct.wombat }
%struct.wombat = type { %struct.ham, [3 x i8] }
%struct.ham = type { %struct.zot }
%struct.zot = type { %struct.blam }
%struct.blam = type { %struct.ham.0 }
%struct.ham.0 = type { %struct.bar }
%struct.bar = type { %struct.bar.1 }
%struct.bar.1 = type { %struct.baz, i8 }
%struct.baz = type { %struct.snork }
%struct.snork = type <{ %struct.spam, i8, [3 x i8] }>
%struct.spam = type { %struct.snork.2, %struct.snork.2 }
%struct.snork.2 = type { i32 }
%struct.snork.3 = type { %struct.baz, i8, [3 x i8] }

; Loop that carries an i40 value (%phi10 / %phi15) across iterations. When
; the branch in %bb7 goes straight to %bb14 the value is masked to its low
; 32 bits (%and); when it goes through %bb12 the value is replaced by %arg2.
; %bb17 stores all 40 bits of the merged value, so the expected assembly
; contains both a 32-bit self-move of %r12d and a later read of the full
; 64-bit %r12 (movq %r12, %rcx followed by shrq $32, %rcx).
; NOTE(review): presumably a regression test for coalescing a SUBREG_TO_REG
; (the zero-extending 32-bit move) in the register coalescer -- confirm
; against the commit that added this test.
define void @foo(ptr %arg, ptr %arg1, i40 %arg2, ptr %arg3, i32 %arg4) #0 {
; CHECK-LABEL: foo:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $24, %rsp
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    movl %r8d, %r14d
; CHECK-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movq %rsi, %r13
; CHECK-NEXT:    movq %rdi, %r15
; CHECK-NEXT:    incl %r14d
; CHECK-NEXT:    xorl %ebx, %ebx
; CHECK-NEXT:    # implicit-def: $r12
; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    jmp .LBB0_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB0_1: # %bb17
; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    movq %r15, %r13
; CHECK-NEXT:    xorl %r15d, %r15d
; CHECK-NEXT:    testq %rbx, %rbx
; CHECK-NEXT:    sete %r15b
; CHECK-NEXT:    xorl %edi, %edi
; CHECK-NEXT:    callq _Znwm@PLT
; CHECK-NEXT:    shll $4, %r15d
; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
; CHECK-NEXT:    movq %r12, %rcx
; CHECK-NEXT:    shrq $32, %rcx
; CHECK-NEXT:    movb %cl, 12(%rax)
; CHECK-NEXT:    movl %r12d, 8(%rax)
; CHECK-NEXT:    movq %r15, %rbx
; CHECK-NEXT:    movq %r13, %r15
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
; CHECK-NEXT:    decl %r14d
; CHECK-NEXT:    je .LBB0_8
; CHECK-NEXT:  .LBB0_3: # %bb7
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    callq widget@PLT
; CHECK-NEXT:    cmpb $-5, (%r13)
; CHECK-NEXT:    jae .LBB0_5
; CHECK-NEXT:  # %bb.4: # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    movl %r12d, %r12d
; CHECK-NEXT:    cmpq %r15, %rbx
; CHECK-NEXT:    jbe .LBB0_1
; CHECK-NEXT:    jmp .LBB0_7
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB0_5: # %bb12
; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    movq 0, %rax
; CHECK-NEXT:    movq 8, %rax
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
; CHECK-NEXT:    cmpq %r15, %rbx
; CHECK-NEXT:    jbe .LBB0_1
; CHECK-NEXT:  .LBB0_7: # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    xorl %ebx, %ebx
; CHECK-NEXT:    decl %r14d
; CHECK-NEXT:    jne .LBB0_3
; CHECK-NEXT:  .LBB0_8: # %bb21
; CHECK-NEXT:    cmpb $0, 12(%rax)
; CHECK-NEXT:    jne .LBB0_10
; CHECK-NEXT:  # %bb.9: # %bb26
; CHECK-NEXT:    addq $24, %rsp
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
; CHECK-NEXT:    retq
; CHECK-NEXT:  .LBB0_10: # %bb25
; CHECK-NEXT:    .cfi_def_cfa %rbp, 16
; CHECK-NEXT:    movq %r15, %rdi
; CHECK-NEXT:    callq pluto@PLT
bb:
  br label %bb7

bb5:                                              ; preds = %bb17, %bb14
  %phi = phi ptr [ %call19, %bb17 ], [ null, %bb14 ]
  %phi6 = phi ptr [ %getelementptr, %bb17 ], [ null, %bb14 ]
  %add = add i32 %phi9, 1
  %icmp = icmp eq i32 %phi9, %arg4
  br i1 %icmp, label %bb21, label %bb7

bb7:                                              ; preds = %bb5, %bb
  %phi8 = phi ptr [ null, %bb ], [ %phi6, %bb5 ]
  %phi9 = phi i32 [ 0, %bb ], [ %add, %bb5 ]
  ; The loop-carried i40 starts out undef on entry from %bb.
  %phi10 = phi i40 [ undef, %bb ], [ %phi15, %bb5 ]
  %call = call ptr @widget()
  %load = load i8, ptr %arg1, align 8
  %icmp11 = icmp ult i8 %load, -5
  ; Keep only the low 32 bits of the loop-carried value.
  %and = and i40 %phi10, 4294967295
  br i1 %icmp11, label %bb14, label %bb12

bb12:                                             ; preds = %bb7
  %load13 = load volatile { i64, i64 }, ptr null, align 4294967296
  br label %bb14

bb14:                                             ; preds = %bb12, %bb7
  ; Merge the masked value with the full-width %arg2.
  %phi15 = phi i40 [ %and, %bb7 ], [ %arg2, %bb12 ]
  %icmp16 = icmp ugt ptr %phi8, %arg
  br i1 %icmp16, label %bb5, label %bb17

bb17:                                             ; preds = %bb14
  %icmp18 = icmp eq ptr %phi8, null
  %zext = zext i1 %icmp18 to i64
  %call19 = call ptr @_Znwm(i64 0)
  %getelementptr = getelementptr %struct.wibble, ptr %arg3, i64 %zext
  %getelementptr20 = getelementptr i8, ptr %call19, i64 8
  ; All 40 bits are stored, so the bits above bit 31 are observable.
  store i40 %phi15, ptr %getelementptr20, align 4
  br label %bb5

bb21:                                             ; preds = %bb5
  %getelementptr22 = getelementptr %struct.snork.3, ptr %phi, i64 0, i32 1
  %load23 = load i8, ptr %getelementptr22, align 4
  %icmp24 = icmp eq i8 %load23, 0
  br i1 %icmp24, label %bb26, label %bb25

bb25:                                             ; preds = %bb21
  call void @pluto(ptr %arg)
  unreachable

bb26:                                             ; preds = %bb21
  ret void
}

; Calls @foo with its own two pointer arguments in swapped order (%arg1 first,
; then %arg) and with zero / null for the remaining three parameters.
define void @eggs(ptr %arg, ptr %arg1) {
; CHECK-LABEL: eggs:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    movq %rsi, %rdi
; CHECK-NEXT:    movq %rax, %rsi
; CHECK-NEXT:    xorl %edx, %edx
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    xorl %r8d, %r8d
; CHECK-NEXT:    callq foo@PLT
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
bb:
  call void @foo(ptr %arg1, ptr %arg, i40 0, ptr null, i32 0)
  ret void
}

; External functions called from @foo.
declare ptr @widget()

declare void @pluto(ptr)

; Itanium-mangled name of operator new(unsigned long).
declare ptr @_Znwm(i64)

; Attribute group referenced by @foo.
attributes #0 = { noinline "frame-pointer"="all" }
+47 −0
Original line number Diff line number Diff line
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -enable-subreg-liveness -verify-coalescing -o - %s | FileCheck %s


# FIXME: Need to handle subrange updates when coalescing with subreg_to_reg
# This will fail if x86 enables subregister liveness.
#
# The input defines %a with SUBREG_TO_REG in both bb.0 and bb.1, each time
# from a 32-bit source (%init_eax, %imm0). In the expected output those
# sources are gone and their defining instructions write "undef %a.sub_32bit"
# directly.
---
name: requires_new_subrange_coalesce_subreg_to_reg
tracksRegLiveness: true
body:             |
  ; CHECK-LABEL: name: requires_new_subrange_coalesce_subreg_to_reg
  ; CHECK: bb.0:
  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
  ; CHECK-NEXT:   liveins: $eax
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   undef %a.sub_32bit:gr64_with_sub_8bit = COPY $eax
  ; CHECK-NEXT:   %b:gr32 = IMPLICIT_DEF
  ; CHECK-NEXT:   %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit
  ; CHECK-NEXT:   JCC_1 %bb.2, 4, implicit undef $eflags
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.1:
  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   undef %a.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
  ; CHECK-NEXT:   %c.sub_32bit:gr64 = COPY %a
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT:   %c.sub_32bit:gr64 = SUBREG_TO_REG %a, %b, %subreg.sub_32bit
  ; CHECK-NEXT:   RET 0, implicit %c
  bb.0:
    liveins: $eax
    %init_eax:gr32 = COPY $eax
    %a:gr64 = SUBREG_TO_REG 0, %init_eax, %subreg.sub_32bit
    %b:gr32 = IMPLICIT_DEF
    %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit
    JCC_1 %bb.2, 4, implicit undef $eflags

  bb.1:
    %imm0:gr32 = MOV32r0 implicit-def dead $eflags
    %a = SUBREG_TO_REG 0, %imm0, %subreg.sub_32bit
    %c.sub_32bit = COPY %a

  bb.2:
    %c.sub_32bit = SUBREG_TO_REG %a, %b, %subreg.sub_32bit
    RET 0, implicit %c

...
+348 −0

File added.

Preview size limit exceeded, changes collapsed.