Commit 39497411 authored by Anna Welker
Browse files

[ARM][MVE] Restrict allowed types of gather/scatter offsets

The MVE gather instructions smaller than 32 bits zero-extend the values
in the offset register, rather than sign-extending them. We need to
make sure that the code that we select from is suitably extended, which
this patch attempts to fix by tightening up the offset checks.

Differential Revision: https://reviews.llvm.org/D75361
parent 4962a0b2
Loading
Loading
Loading
Loading
+21 −12
Original line number Diff line number Diff line
@@ -145,26 +145,35 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
    return nullptr;
  }
  Offsets = GEP->getOperand(1);
  // SExt offsets inside masked gathers are not permitted by the architecture;
  // we therefore can't fold them
  // Paranoid check whether the number of parallel lanes is the same
  assert(Ty->getVectorNumElements() ==
         Offsets->getType()->getVectorNumElements());
  // Only <N x i32> offsets can be integrated into an arm gather, any smaller
  // type would have to be sign extended by the gep - and arm gathers can only
  // zero extend. Additionally, the offsets do have to originate from a zext of
  // a vector with element types smaller or equal the type of the gather we're
  // looking at
  if (Offsets->getType()->getScalarSizeInBits() != 32)
    return nullptr;
  if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
    Offsets = ZextOffs->getOperand(0);
  Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
  // If the offset we found does not have the type the intrinsic expects,
  // i.e., the same type as the gather (or scatter input) itself, we need to
  // convert it (only i types) or fall back to expanding the gather
  if (OffsType != Offsets->getType()) {
    if (OffsType->getScalarSizeInBits() >
        Offsets->getType()->getScalarSizeInBits()) {
      LLVM_DEBUG(dbgs() << "masked gathers/scatters: extending offsets\n");
      Offsets = Builder.CreateZExt(Offsets, OffsType, "");
    } else {
  else if (!(Offsets->getType()->getVectorNumElements() == 4 &&
             Offsets->getType()->getScalarSizeInBits() == 32))
    return nullptr;

  if (Ty != Offsets->getType()) {
    if ((Ty->getScalarSizeInBits() <
         Offsets->getType()->getScalarSizeInBits())) {
      LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
                        << " Can't create intrinsic.\n");
      return nullptr;
    } else {
      Offsets = Builder.CreateZExt(
          Offsets, VectorType::getInteger(cast<VectorType>(Ty)));
    }
  }
  // If none of the checks failed, return the gep's base pointer
  LLVM_DEBUG(dbgs() << "masked gathers/scatters: found correct offsets\n");
  return GEPPtr;
}

+131 −0
Original line number Diff line number Diff line
@@ -16,6 +16,137 @@ entry:
  ret <8 x i16> %gather.zext
}

; Gather of 8 x i8 (zero-extended to i16 afterwards) where the <8 x i8> offsets
; feed the GEP directly, with no explicit zext or sext on them.
; The expected output loads the offsets sign-extended (vldrb.s32), adds the base
; and performs eight scalar ldrb loads rather than a single gather instruction.
define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16_noext(i8* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: zext_unscaled_i8_i16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrb.s32 q0, [r1]
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmov r5, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r2]
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    ldrb.w lr, [r3]
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[0], r5
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.16 q0[1], lr
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[2], r2
; CHECK-NEXT:    vmov.16 q0[3], r12
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %offs
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.zext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %gather.zext
}

; Gather of 8 x i16 indexed through an i16 GEP (element-scaled) whose offsets
; are <8 x i8> values sign-extended to <8 x i16>.
; The expected output loads the offsets sign-extended (vldrb.s32), shifts them
; left by one, adds the base and performs eight scalar ldrh loads rather than a
; single gather instruction.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: scaled_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrb.s32 q0, [r1]
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vshl.i32 q1, q1, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrh.w lr, [r3]
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r12
; CHECK-NEXT:    vmov.16 q0[3], lr
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
  %offs.sext = sext <8 x i8> %offs to <8 x i16>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs.sext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; Gather of 8 x i16 indexed through an i16 GEP (element-scaled) whose offsets
; are <8 x i8> values zero-extended to <8 x i16>.
; The expected output loads the offsets zero-extended (vldrb.u32), shifts them
; left by one, adds the base and still performs eight scalar ldrh loads rather
; than a single gather instruction.
; NOTE(review): presumably not folded because the GEP offsets are i16 rather
; than i32 - confirm against the offset checks in the lowering pass.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_zext(i16* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: scaled_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrb.u32 q0, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vshl.i32 q1, q1, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrh.w lr, [r3]
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r12
; CHECK-NEXT:    vmov.16 q0[3], lr
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
  %offs.zext = zext <8 x i8> %offs to <8 x i16>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: sext_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry