Commit 9390b85a authored by David Green's avatar David Green
Browse files

[ARM] Use half directly for args/return types in test. NFC

Until fairly recently the calling convention for IR half was not handled
correctly in the ARM backend, meaning we needed to pass pointers that
were loaded/stored. Now that that is fixed we can switch to using the
type directly instead.
parent 93eef7d8
Loading
Loading
Loading
Loading
+85 −108
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vadd.f16 q0, q0, r1
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vadd.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %B = load half, half* %BB
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -48,25 +47,24 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vadd.f16 q0, q0, r1
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vadd.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %B = load half, half* %BB
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -95,25 +93,24 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmul.f16 q0, q0, r1
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vmul.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %B = load half, half* %BB
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -142,25 +139,24 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmul.f16 q0, q0, r1
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vmul.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %B = load half, half* %BB
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -189,25 +185,24 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vsub.f16 q0, q0, r1
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vsub.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %B = load half, half* %BB
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -236,26 +231,25 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vdup.16 q0, r1
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:    vdup.16 q0, r3
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vsub.f16 q1, q0, q1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    bne .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %B = load half, half* %BB
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -285,27 +279,25 @@ for.cond.cleanup: ; preds = %vector.body, %entry
}


define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [sp]
; CHECK-NEXT:    cmp.w r12, #1
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB6_1: @ %vector.ph
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB6_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs.w r12, r12, #8
; CHECK-NEXT:    vfmas.f16 q1, q0, r2
; CHECK-NEXT:    vstrb.8 q1, [r3], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfmas.f16 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %C = load half, half* %CC
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -338,27 +330,25 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [sp]
; CHECK-NEXT:    cmp.w r12, #1
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB7_1: @ %vector.ph
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB7_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs.w r12, r12, #8
; CHECK-NEXT:    vfmas.f16 q1, q0, r2
; CHECK-NEXT:    vstrb.8 q1, [r3], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfmas.f16 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %C = load half, half* %CC
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -391,27 +381,25 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [sp]
; CHECK-NEXT:    cmp.w r12, #1
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB8_1: @ %vector.ph
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB8_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs.w r12, r12, #8
; CHECK-NEXT:    vfma.f16 q1, q0, r2
; CHECK-NEXT:    vstrb.8 q1, [r3], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfma.f16 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %C = load half, half* %CC
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -444,27 +432,25 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fma_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
define arm_aapcs_vfpcc void @test_fma_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [sp]
; CHECK-NEXT:    cmp.w r12, #1
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB9_1: @ %vector.ph
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB9_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs.w r12, r12, #8
; CHECK-NEXT:    vfma.f16 q1, q0, r2
; CHECK-NEXT:    vstrb.8 q1, [r3], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfma.f16 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB9_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %C = load half, half* %CC
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -498,30 +484,28 @@ for.cond.cleanup: ; preds = %vector.body, %entry
}


define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [sp]
; CHECK-NEXT:    cmp.w r12, #1
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB10_1: @ %vector.ph
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vdup.16 q0, r2
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:    vdup.16 q0, r12
; CHECK-NEXT:    vneg.f16 q0, q0
; CHECK-NEXT:  .LBB10_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    subs.w r12, r12, #8
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfma.f16 q3, q2, q1
; CHECK-NEXT:    vstrb.8 q3, [r3], #16
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
; CHECK-NEXT:    bne .LBB10_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %C = load half, half* %CC
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -554,29 +538,27 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [sp]
; CHECK-NEXT:    cmp.w r12, #1
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB11_1: @ %vector.ph
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vdup.16 q0, r2
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:    vdup.16 q0, r12
; CHECK-NEXT:  .LBB11_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    subs.w r12, r12, #8
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfms.f16 q3, q2, q1
; CHECK-NEXT:    vstrb.8 q3, [r3], #16
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
; CHECK-NEXT:    bne .LBB11_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %C = load half, half* %CC
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -609,28 +591,26 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [sp]
; CHECK-NEXT:    cmp.w r12, #1
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB12_1: @ %vector.ph
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB12_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs.w r12, r12, #8
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vneg.f16 q0, q0
; CHECK-NEXT:    vfma.f16 q0, q1, r2
; CHECK-NEXT:    vstrb.8 q0, [r3], #16
; CHECK-NEXT:    vfma.f16 q0, q1, r12
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    bne .LBB12_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %C = load half, half* %CC
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -663,28 +643,26 @@ for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [sp]
; CHECK-NEXT:    cmp.w r12, #1
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB13_1: @ %vector.ph
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB13_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs.w r12, r12, #8
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vneg.f16 q0, q0
; CHECK-NEXT:    vfma.f16 q0, q1, r2
; CHECK-NEXT:    vstrb.8 q0, [r3], #16
; CHECK-NEXT:    vfma.f16 q0, q1, r12
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    bne .LBB13_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %C = load half, half* %CC
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
@@ -718,7 +696,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
}


define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, half *%ina) local_unnamed_addr #0 {
define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: test_nested:
; CHECK:       @ %bb.0: @ %for.body.us.preheader
; CHECK-NEXT:    .save {r4, r5, r6, lr}
@@ -752,7 +730,6 @@ define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias
; CHECK-NEXT:  @ %bb.4: @ %for.end14
; CHECK-NEXT:    pop {r4, r5, r6, pc}
for.body.us.preheader:
  %in = load half, half* %ina
  %cmp = icmp sgt i32 %numRows, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp1 = icmp sgt i32 %numCols, 0
+1 −1
Original line number Diff line number Diff line
@@ -696,7 +696,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
}


define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, float %in) local_unnamed_addr #0 {
define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: test_nested:
; CHECK:       @ %bb.0: @ %for.body.us.preheader
; CHECK-NEXT:    .save {r4, r5, r6, lr}
+2 −4
Original line number Diff line number Diff line
@@ -512,14 +512,12 @@ entry:
  ret <4 x float> %res
}

; TODO: Calling convention needs fixing to pass half types directly to functions
define arm_aapcs_vfpcc <8 x half> @insert_f16(half *%aa) {
define arm_aapcs_vfpcc <8 x half> @insert_f16(half %a) {
; CHECK-LABEL: insert_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr.16 s0, [r0]
; CHECK-NEXT:    @ kill: def $s0 killed $s0 def $q0
; CHECK-NEXT:    bx lr
entry:
  %a = load half, half* %aa
  %res = insertelement <8 x half> undef, half %a, i32 0
  ret <8 x half> %res
}
+8 −12
Original line number Diff line number Diff line
@@ -86,14 +86,13 @@ entry:
  ret <4 x float> %c
}

define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16(<8 x half> %src, half *%src2p, <8 x half> %a, <8 x half> %b) {
define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: vaddqr_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.f16 r0, s4
; CHECK-NEXT:    vadd.f16 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %src2 = load half, half *%src2p, align 2
  %i = insertelement <8 x half> undef, half %src2, i32 0
  %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
  %c = fadd <8 x half> %src, %sp
@@ -113,14 +112,13 @@ entry:
  ret <4 x float> %c
}

define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_2(<8 x half> %src, half *%src2p, <8 x half> %a, <8 x half> %b) {
define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_2(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: vaddqr_v8f16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.f16 r0, s4
; CHECK-NEXT:    vadd.f16 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %src2 = load half, half *%src2p, align 2
  %i = insertelement <8 x half> undef, half %src2, i32 0
  %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
  %c = fadd <8 x half> %sp, %src
@@ -142,14 +140,13 @@ entry:
  ret <4 x float> %c
}

define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_3(<8 x half> %src, half *%src2p, <8 x half> %a, <8 x half> %b) {
define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_3(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: vaddqr_v8f16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.f16 r0, s4
; CHECK-NEXT:    vadd.f16 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %src2 = load half, half *%src2p, align 2
  %src2bc = bitcast half %src2 to i16
  %i = insertelement <8 x i16> undef, i16 %src2bc, i32 0
  %spbc = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -173,14 +170,13 @@ entry:
  ret <4 x float> %c
}

define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_4(<8 x half> %src, half *%src2p, <8 x half> %a, <8 x half> %b) {
define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_4(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: vaddqr_v8f16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.f16 r0, s4
; CHECK-NEXT:    vadd.f16 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %src2 = load half, half *%src2p, align 2
  %src2bc = bitcast half %src2 to i16
  %i = insertelement <8 x i16> undef, i16 %src2bc, i32 0
  %spbc = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer
+2400 −2458

File changed.

Preview size limit exceeded, changes collapsed.

Loading