Commit 26bf877e authored by Nemanja Ivanovic's avatar Nemanja Ivanovic
Browse files

[PowerPC] Fix spilling of vector registers in PEI of EH aware functions

On little endian targets prior to Power9, we spill vector registers using a
swapping store (i.e. stxvd2x saves the vector with the two doublewords in
big endian order regardless of endianness). This is generally not a problem
since we restore them using the corresponding swapping load (lxvd2x). However
if the restore is done by the unwinder, the vector register contains data in
the incorrect order.

This patch fixes that by using Altivec loads/stores for vector saves and
restores in PEI (which keep the order correct) under these specific conditions:
- EH aware function
- Subtarget requires swaps for VSX memops (Little Endian prior to Power9)

Differential revision: https://reviews.llvm.org/D73692
parent 499ad458
Loading
Loading
Loading
Loading
+19 −3
Original line number Diff line number Diff line
@@ -2241,8 +2241,15 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
        // Use !IsLiveIn for the kill flag.
        // We do not want to kill registers that are live in this function
        // before their use because they will become undefined registers.
        TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
        // Functions without NoUnwind need to preserve the order of elements in
        // saved vector registers.
        if (Subtarget.needsSwapsForVSXMemOps() &&
            !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
          TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
                                       CSI[i].getFrameIdx(), RC, TRI);
        else
          TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, CSI[i].getFrameIdx(),
                                  RC, TRI);
      }
    }
  }
@@ -2394,7 +2401,16 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
      } else {
       // Default behavior for non-CR saves.
        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);

        // Functions without NoUnwind need to preserve the order of elements in
        // saved vector registers.
        if (Subtarget.needsSwapsForVSXMemOps() &&
            !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
          TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC,
                                        TRI);
        else
          TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);

        assert(I != MBB.begin() &&
               "loadRegFromStackSlot didn't insert any code!");
      }
+42 −31
Original line number Diff line number Diff line
@@ -1222,24 +1222,13 @@ void PPCInstrInfo::StoreRegToStackSlot(
    FuncInfo->setHasNonRISpills();
}

void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register SrcReg, bool isKill,
                                       int FrameIdx,
                                       const TargetRegisterClass *RC,
void PPCInstrInfo::storeRegToStackSlotNoUpd(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg,
    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
    const TargetRegisterInfo *TRI) const {
  MachineFunction &MF = *MBB.getParent();
  SmallVector<MachineInstr *, 4> NewMIs;

  // We need to avoid a situation in which the value from a VRRC register is
  // spilled using an Altivec instruction and reloaded into a VSRC register
  // using a VSX instruction. The issue with this is that the VSX
  // load/store instructions swap the doublewords in the vector and the Altivec
  // ones don't. The register classes on the spill/reload may be different if
  // the register is defined using an Altivec instruction and is then used by a
  // VSX instruction.
  RC = updatedRC(RC);

  StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs);

  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
@@ -1253,6 +1242,23 @@ void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
  NewMIs.back()->addMemOperand(MF, MMO);
}

// Spill SrcReg to the stack slot FrameIdx. This is the standard entry point:
// it first promotes the register class via updatedRC() (so VRRC values are
// spilled with VSX instructions on VSX-capable subtargets — see the comment
// below for why), then delegates the actual emission to
// storeRegToStackSlotNoUpd().
void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register SrcReg, bool isKill,
                                       int FrameIdx,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  // We need to avoid a situation in which the value from a VRRC register is
  // spilled using an Altivec instruction and reloaded into a VSRC register
  // using a VSX instruction. The issue with this is that the VSX
  // load/store instructions swap the doublewords in the vector and the Altivec
  // ones don't. The register classes on the spill/reload may be different if
  // the register is defined using an Altivec instruction and is then used by a
  // VSX instruction.
  RC = updatedRC(RC);
  storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC, TRI);
}

void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
                                        unsigned DestReg, int FrameIdx,
                                        const TargetRegisterClass *RC,
@@ -1274,11 +1280,9 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
    FuncInfo->setHasNonRISpills();
}

void
PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   Register DestReg, int FrameIdx,
                                   const TargetRegisterClass *RC,
void PPCInstrInfo::loadRegFromStackSlotNoUpd(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg,
    int FrameIdx, const TargetRegisterClass *RC,
    const TargetRegisterInfo *TRI) const {
  MachineFunction &MF = *MBB.getParent();
  SmallVector<MachineInstr*, 4> NewMIs;
@@ -1288,16 +1292,6 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setHasSpills();

  // We need to avoid a situation in which the value from a VRRC register is
  // spilled using an Altivec instruction and reloaded into a VSRC register
  // using a VSX instruction. The issue with this is that the VSX
  // load/store instructions swap the doublewords in the vector and the Altivec
  // ones don't. The register classes on the spill/reload may be different if
  // the register is defined using an Altivec instruction and is then used by a
  // VSX instruction.
  if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
    RC = &PPC::VSRCRegClass;

  LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);

  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
@@ -1311,6 +1305,23 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
  NewMIs.back()->addMemOperand(MF, MMO);
}

// Reload DestReg from the stack slot FrameIdx. This is the standard entry
// point: it first promotes the register class via updatedRC() (mirroring
// storeRegToStackSlot so spill and reload agree on the instruction family —
// see the comment below for why), then delegates the actual emission to
// loadRegFromStackSlotNoUpd().
void PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MI,
                                        Register DestReg, int FrameIdx,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
  // We need to avoid a situation in which the value from a VRRC register is
  // spilled using an Altivec instruction and reloaded into a VSRC register
  // using a VSX instruction. The issue with this is that the VSX
  // load/store instructions swap the doublewords in the vector and the Altivec
  // ones don't. The register classes on the spill/reload may be different if
  // the register is defined using an Altivec instruction and is then used by a
  // VSX instruction.
  RC = updatedRC(RC);

  loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC, TRI);
}

bool PPCInstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
  assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
+18 −0
Original line number Diff line number Diff line
@@ -296,12 +296,30 @@ public:
                           const TargetRegisterClass *RC,
                           const TargetRegisterInfo *TRI) const override;

  // Emits a register spill without updating the register class for vector
  // registers. This ensures that when we spill a vector register the
  // element order in the register is the same as it was in memory.
  void storeRegToStackSlotNoUpd(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI,
                                unsigned SrcReg, bool isKill, int FrameIndex,
                                const TargetRegisterClass *RC,
                                const TargetRegisterInfo *TRI) const;

  void loadRegFromStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI,
                            Register DestReg, int FrameIndex,
                            const TargetRegisterClass *RC,
                            const TargetRegisterInfo *TRI) const override;

  // Emits a register reload without updating the register class for vector
  // registers. This ensures that when we reload a vector register the
  // element order in the register is the same as it was in memory.
  void loadRegFromStackSlotNoUpd(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MBBI,
                                 unsigned DestReg, int FrameIndex,
                                 const TargetRegisterClass *RC,
                                 const TargetRegisterInfo *TRI) const;

  unsigned getStoreOpcodeForSpill(unsigned Reg,
                                  const TargetRegisterClass *RC = nullptr) const;

+12 −12
Original line number Diff line number Diff line
@@ -126,9 +126,9 @@ define dso_local signext i32 @caller3(i32 signext %a, i32 signext %b) local_unna
; CHECK-PWR8-NEXT:    .cfi_offset v20, -192
; CHECK-PWR8-NEXT:    .cfi_offset v21, -176
; CHECK-PWR8-NEXT:    li r5, 48
; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    li r5, 64
; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    #APP
; CHECK-PWR8-NEXT:    add r3, r3, r4
; CHECK-PWR8-NEXT:    #NO_APP
@@ -136,9 +136,9 @@ define dso_local signext i32 @caller3(i32 signext %a, i32 signext %b) local_unna
; CHECK-PWR8-NEXT:    bl callee
; CHECK-PWR8-NEXT:    nop
; CHECK-PWR8-NEXT:    li r4, 64
; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    li r4, 48
; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    addi r1, r1, 240
; CHECK-PWR8-NEXT:    ld r0, 16(r1)
; CHECK-PWR8-NEXT:    mtlr r0
@@ -184,9 +184,9 @@ define dso_local signext i32 @caller4(i32 signext %a, i32 signext %b) local_unna
; CHECK-PWR8-NEXT:    .cfi_offset v20, -192
; CHECK-PWR8-NEXT:    .cfi_offset v21, -176
; CHECK-PWR8-NEXT:    li r5, 48
; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    li r5, 64
; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    #APP
; CHECK-PWR8-NEXT:    add r3, r3, r4
; CHECK-PWR8-NEXT:    #NO_APP
@@ -194,9 +194,9 @@ define dso_local signext i32 @caller4(i32 signext %a, i32 signext %b) local_unna
; CHECK-PWR8-NEXT:    bl callee
; CHECK-PWR8-NEXT:    nop
; CHECK-PWR8-NEXT:    li r4, 64
; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    li r4, 48
; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    addi r1, r1, 240
; CHECK-PWR8-NEXT:    ld r0, 16(r1)
; CHECK-PWR8-NEXT:    mtlr r0
@@ -246,9 +246,9 @@ define dso_local signext i32 @caller_mixed(i32 signext %a, i32 signext %b) local
; CHECK-PWR8-NEXT:    li r5, 48
; CHECK-PWR8-NEXT:    std r14, 240(r1) # 8-byte Folded Spill
; CHECK-PWR8-NEXT:    stfd f14, 384(r1) # 8-byte Folded Spill
; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    li r5, 64
; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
; CHECK-PWR8-NEXT:    #APP
; CHECK-PWR8-NEXT:    add r3, r3, r4
; CHECK-PWR8-NEXT:    #NO_APP
@@ -258,9 +258,9 @@ define dso_local signext i32 @caller_mixed(i32 signext %a, i32 signext %b) local
; CHECK-PWR8-NEXT:    li r4, 64
; CHECK-PWR8-NEXT:    lfd f14, 384(r1) # 8-byte Folded Reload
; CHECK-PWR8-NEXT:    ld r14, 240(r1) # 8-byte Folded Reload
; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    li r4, 48
; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
; CHECK-PWR8-NEXT:    addi r1, r1, 528
; CHECK-PWR8-NEXT:    ld r0, 16(r1)
; CHECK-PWR8-NEXT:    mtlr r0
+2 −2
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ define dso_local signext i32 @caller(i32 signext %a, i32 signext %b) local_unnam
; CHECK-NEXT:    .cfi_offset lr, 16
; CHECK-NEXT:    .cfi_offset v20, -192
; CHECK-NEXT:    li r5, 48
; CHECK-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
; CHECK-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    add r3, r3, r4
; CHECK-NEXT:    #NO_APP
@@ -20,7 +20,7 @@ define dso_local signext i32 @caller(i32 signext %a, i32 signext %b) local_unnam
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    nop
; CHECK-NEXT:    li r4, 48
; CHECK-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
; CHECK-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
; CHECK-NEXT:    addi r1, r1, 240
; CHECK-NEXT:    ld r0, 16(r1)
; CHECK-NEXT:    mtlr r0
Loading