Commit 68b102b9 authored by Matt Arsenault's avatar Matt Arsenault
Browse files

AMDGPU: Directly select 16-bank LDS case of llvm.amdgcn.interp.p1.f16

Manually select this is as a tablegen workraound. Both SelectionDAG
and GlobalISel end up misplacing the copy to m0 when both instructions
in the output need it. Neither considers that both output instructions
depend on m0. I don't know of any other pattern we need to handle this
case, so it's less effort to just workaround this for now.
parent 0d7bd343
Loading
Loading
Loading
Loading
+62 −0
Original line number Diff line number Diff line
@@ -300,6 +300,7 @@ private:
  void SelectATOMIC_CMP_SWAP(SDNode *N);
  void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
  void SelectDS_GWS(SDNode *N, unsigned IntrID);
  void SelectInterpP1F16(SDNode *N);
  void SelectINTRINSIC_W_CHAIN(SDNode *N);
  void SelectINTRINSIC_WO_CHAIN(SDNode *N);
  void SelectINTRINSIC_VOID(SDNode *N);
@@ -2339,6 +2340,64 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                             (i32 timm:$attrchan), (i32 timm:$attr),
  //                             (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
@@ -2367,6 +2426,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  case Intrinsic::amdgcn_wwm:
    Opcode = AMDGPU::WWM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  default:
    SelectCode(N);
    return;
+0 −3
Original line number Diff line number Diff line
@@ -4323,9 +4323,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  NODE_NAME_CASE(KILL)
  NODE_NAME_CASE(DUMMY_CHAIN)
  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
  NODE_NAME_CASE(INTERP_P1LL_F16)
  NODE_NAME_CASE(INTERP_P1LV_F16)
  NODE_NAME_CASE(INTERP_P2_F16)
  NODE_NAME_CASE(LOAD_D16_HI)
  NODE_NAME_CASE(LOAD_D16_LO)
  NODE_NAME_CASE(LOAD_D16_HI_I8)
+0 −3
Original line number Diff line number Diff line
@@ -474,9 +474,6 @@ enum NodeType : unsigned {
  BUILD_VERTICAL_VECTOR,
  /// Pointer to the start of the shader's constant data.
  CONST_DATA_PTR,
  INTERP_P1LL_F16,
  INTERP_P1LV_F16,
  INTERP_P2_F16,
  PC_ADD_REL_OFFSET,
  LDS,
  KILL,
+0 −12
Original line number Diff line number Diff line
@@ -329,18 +329,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",

def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;

def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
                            SDTypeProfile<1, 7, [SDTCisFP<0>]>,
                            [SDNPInGlue, SDNPOutGlue]>;

def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
                            SDTypeProfile<1, 9, [SDTCisFP<0>]>,
                            [SDNPInGlue, SDNPOutGlue]>;

def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
                          SDTypeProfile<1, 8, [SDTCisFP<0>]>,
                          [SDNPInGlue]>;

def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
  [SDNPHasChain, SDNPSideEffect]>;

+0 −32
Original line number Diff line number Diff line
@@ -5872,38 +5872,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_interp_p1_f16: {
    if (getSubtarget()->getLDSBankCount() == 16) {
      // 16 bank LDS
      SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
                                      Op.getOperand(5), SDValue());

      // FIXME: This implicitly will insert a second CopyToReg to M0.
      SDValue S = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
        DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32),
        DAG.getTargetConstant(2, DL, MVT::i32), // P0
        Op.getOperand(2),  // Attrchan
        Op.getOperand(3),  // Attr
        Op.getOperand(5)); // m0

      SDValue Ops[] = {
        Op.getOperand(1), // Src0
        Op.getOperand(2), // Attrchan
        Op.getOperand(3), // Attr
        DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        S, // Src2 - holds two f16 values selected by high
        DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        Op.getOperand(4), // high
        DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
        DAG.getTargetConstant(0, DL, MVT::i32), // $omod
        ToM0.getValue(1)
      };
      return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
    }

    return SDValue();
  }
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

Loading