Unverified Commit cf30e4b5 authored by Jack Styles's avatar Jack Styles Committed by GitHub
Browse files

[AArch64] Enable Spillage Copy Elimination by default (#186093)

Under high register pressure, the greedy register allocator can emit
long eviction chains that consist of many `mov` instructions. The
Spillage Copy Elimination pass finds these chains and shortens them.
Take a mov chain such as the following, where `x8` is used for an
8-byte Folded Reload:
```
mov x7, x6
mov x6, x5
mov x5, x4
mov x4, x3
mov x3, x2
mov x2, x1
mov x1, x30
mov x30, x8
< use x8 >
mov x8, x30
mov x30, x1
mov x1, x2
mov x2, x3
mov x3, x4
mov x4, x5
mov x5, x6
mov x6, x7
```
Becomes:
```
mov x7, x6
mov x6, x8
< use x8 >
mov x8, x6
mov x6, x7
```

This provides performance benefits for long mov chains, because the
values no longer need to be copied through every register in the chain.

This does introduce a compile-time regression, as was noted in the
original review. In my testing, it was around 0.17% on average across
the LLVM Test Suite.

Further information:
Original Review: https://reviews.llvm.org/D122118
Compile Time Regression information from original review:
http://llvm-compile-time-tracker.com/compare.php?from=781eabeb40b8e47e3a46b0b927784e63f0aad9ab&to=0af2744a89bf0ed05e83ac1ed9d21d6d74cdfeca&stat=instructions%3Au

Assisted-by: codex (Generation of new test)
parent c94db1af
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -159,6 +159,7 @@ public:
  // The machine scheduler hook unconditionally returns true.
  bool enableMachineScheduler() const override { return true; }
  // Post-RA scheduling is enabled exactly when usePostRAScheduler() says so.
  bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
  // Sub-register liveness follows the EnableSubregLiveness setting.
  bool enableSubRegLiveness() const override { return EnableSubregLiveness; }
  // Always opt in to spillage copy elimination, which shortens the long
  // `mov` eviction chains the greedy register allocator can emit under high
  // register pressure. This costs a small amount of compile time.
  bool enableSpillageCopyElimination() const override { return true; }

  // Declaration only; the definition is not visible in this hunk.
  bool enableMachinePipeliner() const override;
  // This hook unconditionally returns false.
  bool useDFAforSMS() const override { return false; }
+403 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64-linux-gnu -o - -O3 %s | FileCheck %s

; Under high register pressure, the greedy register allocator may emit long
; chains of spill-related mov instructions on AArch64. The Spillage Copy
; Elimination pass can shorten these chains and improve runtime performance.
; This test checks that the pass runs by default and simplifies such a chain.

; @_test takes 16 read-only pointer arguments and walks them in a three-level
; loop nest. Every inner iteration loads one float through each argument and
; combines them with values read from stack arrays. Presumably it is the number
; of simultaneously live pointers that provides the register pressure this test
; relies on.
; The assertions directly below the signature are autogenerated; regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
define void @_test(ptr readonly nocapture %0, ptr readonly nocapture %1, ptr readonly nocapture %2, ptr readonly nocapture %3, ptr readonly nocapture %4, ptr readonly nocapture %5, ptr readonly nocapture %6, ptr readonly nocapture %7, ptr readonly nocapture %8, ptr readonly nocapture %9, ptr readonly nocapture %10, ptr readonly nocapture %11, ptr readonly nocapture %12, ptr readonly nocapture %13, ptr readonly nocapture %14, ptr readonly nocapture %15) #0 {
; CHECK-LABEL: _test:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
; CHECK-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    sub sp, sp, #3648
; CHECK-NEXT:    ldr x9, [sp, #7896]
; CHECK-NEXT:    ldr x10, [sp, #7888]
; CHECK-NEXT:    add x17, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    ldr x11, [sp, #7880]
; CHECK-NEXT:    ldr x12, [sp, #7872]
; CHECK-NEXT:    add x18, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    ldr x13, [sp, #7864]
; CHECK-NEXT:    ldr x14, [sp, #7856]
; CHECK-NEXT:    add x19, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    ldr x15, [sp, #7848]
; CHECK-NEXT:    ldr x16, [sp, #7840]
; CHECK-NEXT:    add x20, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    add x21, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    add x22, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    add x17, x17, #3148
; CHECK-NEXT:    add x18, x18, #2148
; CHECK-NEXT:    add x19, x19, #1648
; CHECK-NEXT:    add x20, x20, #2648
; CHECK-NEXT:    add x21, x21, #1148
; CHECK-NEXT:    add x22, x22, #148
; CHECK-NEXT:    add x23, sp, #3744
; CHECK-NEXT:    add x24, sp, #3244
; CHECK-NEXT:    add x25, sp, #2244
; CHECK-NEXT:    add x26, sp, #1744
; CHECK-NEXT:    add x27, sp, #244
; CHECK-NEXT:    mov w28, #1 // =0x1
; CHECK-NEXT:  .LBB0_1: // %.preheader167
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB0_2 Depth 2
; CHECK-NEXT:    // Child Loop BB0_3 Depth 3
; CHECK-NEXT:    stp x28, x27, [sp, #8] // 16-byte Folded Spill
; CHECK-NEXT:    mov x30, x0
; CHECK-NEXT:    mov x28, x18
; CHECK-NEXT:    stp x0, x1, [sp, #168] // 16-byte Folded Spill
; CHECK-NEXT:    mov x0, x9
; CHECK-NEXT:    stp x18, x17, [sp, #88] // 16-byte Folded Spill
; CHECK-NEXT:    mov x18, x17
; CHECK-NEXT:    mov w17, #1 // =0x1
; CHECK-NEXT:    stp x2, x3, [sp, #184] // 16-byte Folded Spill
; CHECK-NEXT:    stp x4, x5, [sp, #200] // 16-byte Folded Spill
; CHECK-NEXT:    stp x6, x7, [sp, #216] // 16-byte Folded Spill
; CHECK-NEXT:    stp x16, x15, [sp, #104] // 16-byte Folded Spill
; CHECK-NEXT:    stp x14, x13, [sp, #120] // 16-byte Folded Spill
; CHECK-NEXT:    stp x12, x11, [sp, #136] // 16-byte Folded Spill
; CHECK-NEXT:    stp x10, x9, [sp, #152] // 16-byte Folded Spill
; CHECK-NEXT:    stp x26, x25, [sp, #24] // 16-byte Folded Spill
; CHECK-NEXT:    stp x24, x23, [sp, #40] // 16-byte Folded Spill
; CHECK-NEXT:    stp x22, x21, [sp, #56] // 16-byte Folded Spill
; CHECK-NEXT:    stp x20, x19, [sp, #72] // 16-byte Folded Spill
; CHECK-NEXT:    str x17, [sp, #232] // 8-byte Spill
; CHECK-NEXT:  .LBB0_2: // %.preheader166
; CHECK-NEXT:    // Parent Loop BB0_1 Depth=1
; CHECK-NEXT:    // => This Loop Header: Depth=2
; CHECK-NEXT:    // Child Loop BB0_3 Depth 3
; CHECK-NEXT:    mov x9, #-500 // =0xfffffffffffffe0c
; CHECK-NEXT:    mov x17, x28
; CHECK-NEXT:    str x20, [sp, #7752] // 8-byte Spill
; CHECK-NEXT:  .LBB0_3: // Parent Loop BB0_1 Depth=1
; CHECK-NEXT:    // Parent Loop BB0_2 Depth=2
; CHECK-NEXT:    // => This Inner Loop Header: Depth=3
; CHECK-NEXT:    add x28, x26, x9
; CHECK-NEXT:    ldr s1, [x5, x9]
; CHECK-NEXT:    ldr s3, [x14, x9]
; CHECK-NEXT:    ldr s2, [x28, #500]
; CHECK-NEXT:    ldr s0, [x8]
; CHECK-NEXT:    add x28, x27, x9
; CHECK-NEXT:    ldr s6, [x30, x9]
; CHECK-NEXT:    ldr s7, [x28, #500]
; CHECK-NEXT:    add x28, x25, x9
; CHECK-NEXT:    fmul s4, s1, s2
; CHECK-NEXT:    fmul s5, s3, s2
; CHECK-NEXT:    ldr s16, [x1, x9]
; CHECK-NEXT:    fmul s2, s0, s2
; CHECK-NEXT:    ldr s17, [x3, x9]
; CHECK-NEXT:    ldr s18, [x28, #500]
; CHECK-NEXT:    add x28, x24, x9
; CHECK-NEXT:    ldr s19, [x6, x9]
; CHECK-NEXT:    mov x20, x21
; CHECK-NEXT:    ldr s21, [x28, #500]
; CHECK-NEXT:    add x28, x23, x9
; CHECK-NEXT:    fmadd s4, s6, s7, s4
; CHECK-NEXT:    ldr s6, [x4, x9]
; CHECK-NEXT:    fmadd s5, s16, s7, s5
; CHECK-NEXT:    fmadd s1, s1, s7, s2
; CHECK-NEXT:    fmadd s20, s6, s7, s2
; CHECK-NEXT:    fmadd s2, s17, s7, s2
; CHECK-NEXT:    ldr s7, [x15, x9]
; CHECK-NEXT:    fmadd s4, s16, s18, s4
; CHECK-NEXT:    fmadd s5, s19, s18, s5
; CHECK-NEXT:    ldr s16, [x16, x9]
; CHECK-NEXT:    fmul s19, s0, s21
; CHECK-NEXT:    fmadd s1, s3, s18, s1
; CHECK-NEXT:    fmadd s3, s7, s18, s20
; CHECK-NEXT:    fmadd s2, s16, s18, s2
; CHECK-NEXT:    fmadd s4, s6, s21, s4
; CHECK-NEXT:    fmadd s5, s7, s21, s5
; CHECK-NEXT:    ldr s6, [x0, x9]
; CHECK-NEXT:    fadd s1, s1, s19
; CHECK-NEXT:    ldr s7, [x28, #500]
; CHECK-NEXT:    add x28, x22, x9
; CHECK-NEXT:    fadd s3, s3, s19
; CHECK-NEXT:    fmadd s2, s6, s21, s2
; CHECK-NEXT:    ldr s18, [x28, #500]
; CHECK-NEXT:    add x28, x21, x9
; CHECK-NEXT:    mov x21, x8
; CHECK-NEXT:    ldr x8, [sp, #7752] // 8-byte Reload
; CHECK-NEXT:    fmadd s4, s17, s7, s4
; CHECK-NEXT:    fmadd s5, s16, s7, s5
; CHECK-NEXT:    ldr s16, [x10, x9]
; CHECK-NEXT:    fmadd s0, s0, s7, s1
; CHECK-NEXT:    ldr s17, [x2, x9]
; CHECK-NEXT:    ldr s1, [x7, x9]
; CHECK-NEXT:    fmadd s3, s6, s7, s3
; CHECK-NEXT:    ldr s6, [x11, x9]
; CHECK-NEXT:    fmadd s2, s16, s7, s2
; CHECK-NEXT:    ldr s7, [x12, x9]
; CHECK-NEXT:    add x8, x8, x9
; CHECK-NEXT:    fmadd s4, s17, s18, s4
; CHECK-NEXT:    fmadd s1, s1, s18, s5
; CHECK-NEXT:    ldr s5, [x13, x9]
; CHECK-NEXT:    fmadd s0, s6, s18, s0
; CHECK-NEXT:    fmadd s3, s7, s18, s3
; CHECK-NEXT:    fmadd s2, s5, s18, s2
; CHECK-NEXT:    str s4, [x28, #500]
; CHECK-NEXT:    add x28, x19, x9
; CHECK-NEXT:    str s1, [x8, #500]
; CHECK-NEXT:    add x8, x17, x9
; CHECK-NEXT:    str s0, [x28, #500]
; CHECK-NEXT:    add x28, x18, x9
; CHECK-NEXT:    add x9, x9, #4
; CHECK-NEXT:    str s3, [x8, #500]
; CHECK-NEXT:    mov x8, x21
; CHECK-NEXT:    mov x21, x20
; CHECK-NEXT:    cmn x9, #480
; CHECK-NEXT:    str s2, [x28, #500]
; CHECK-NEXT:    b.ne .LBB0_3
; CHECK-NEXT:  // %bb.4: // in Loop: Header=BB0_2 Depth=2
; CHECK-NEXT:    ldr x9, [sp, #232] // 8-byte Reload
; CHECK-NEXT:    ldr x20, [sp, #7752] // 8-byte Reload
; CHECK-NEXT:    add x18, x18, #20
; CHECK-NEXT:    add x28, x17, #20
; CHECK-NEXT:    add x19, x19, #20
; CHECK-NEXT:    add x21, x21, #20
; CHECK-NEXT:    add x9, x9, #1
; CHECK-NEXT:    add x20, x20, #20
; CHECK-NEXT:    add x22, x22, #20
; CHECK-NEXT:    add x23, x23, #20
; CHECK-NEXT:    add x24, x24, #20
; CHECK-NEXT:    add x25, x25, #20
; CHECK-NEXT:    add x26, x26, #20
; CHECK-NEXT:    add x27, x27, #20
; CHECK-NEXT:    add x0, x0, #20
; CHECK-NEXT:    add x10, x10, #20
; CHECK-NEXT:    add x11, x11, #20
; CHECK-NEXT:    add x12, x12, #20
; CHECK-NEXT:    add x13, x13, #20
; CHECK-NEXT:    add x14, x14, #20
; CHECK-NEXT:    add x15, x15, #20
; CHECK-NEXT:    add x16, x16, #20
; CHECK-NEXT:    add x7, x7, #20
; CHECK-NEXT:    add x6, x6, #20
; CHECK-NEXT:    add x5, x5, #20
; CHECK-NEXT:    add x4, x4, #20
; CHECK-NEXT:    add x3, x3, #20
; CHECK-NEXT:    cmp x9, #6
; CHECK-NEXT:    add x2, x2, #20
; CHECK-NEXT:    add x1, x1, #20
; CHECK-NEXT:    add x30, x30, #20
; CHECK-NEXT:    str x9, [sp, #232] // 8-byte Spill
; CHECK-NEXT:    b.ne .LBB0_2
; CHECK-NEXT:  // %bb.5: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT:    ldp x18, x17, [sp, #88] // 16-byte Folded Reload
; CHECK-NEXT:    ldp x28, x27, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT:    ldp x20, x19, [sp, #72] // 16-byte Folded Reload
; CHECK-NEXT:    add x28, x28, #1
; CHECK-NEXT:    ldp x22, x21, [sp, #56] // 16-byte Folded Reload
; CHECK-NEXT:    add x17, x17, #100
; CHECK-NEXT:    add x18, x18, #100
; CHECK-NEXT:    add x27, x27, #100
; CHECK-NEXT:    ldp x24, x23, [sp, #40] // 16-byte Folded Reload
; CHECK-NEXT:    add x19, x19, #100
; CHECK-NEXT:    ldp x26, x25, [sp, #24] // 16-byte Folded Reload
; CHECK-NEXT:    add x20, x20, #100
; CHECK-NEXT:    ldp x10, x9, [sp, #152] // 16-byte Folded Reload
; CHECK-NEXT:    add x21, x21, #100
; CHECK-NEXT:    ldp x12, x11, [sp, #136] // 16-byte Folded Reload
; CHECK-NEXT:    add x22, x22, #100
; CHECK-NEXT:    ldp x14, x13, [sp, #120] // 16-byte Folded Reload
; CHECK-NEXT:    add x23, x23, #100
; CHECK-NEXT:    ldp x16, x15, [sp, #104] // 16-byte Folded Reload
; CHECK-NEXT:    add x24, x24, #100
; CHECK-NEXT:    ldp x6, x7, [sp, #216] // 16-byte Folded Reload
; CHECK-NEXT:    add x25, x25, #100
; CHECK-NEXT:    ldp x4, x5, [sp, #200] // 16-byte Folded Reload
; CHECK-NEXT:    add x26, x26, #100
; CHECK-NEXT:    ldp x2, x3, [sp, #184] // 16-byte Folded Reload
; CHECK-NEXT:    add x9, x9, #100
; CHECK-NEXT:    ldp x0, x1, [sp, #168] // 16-byte Folded Reload
; CHECK-NEXT:    add x10, x10, #100
; CHECK-NEXT:    add x11, x11, #100
; CHECK-NEXT:    add x12, x12, #100
; CHECK-NEXT:    add x13, x13, #100
; CHECK-NEXT:    add x14, x14, #100
; CHECK-NEXT:    add x15, x15, #100
; CHECK-NEXT:    add x16, x16, #100
; CHECK-NEXT:    add x7, x7, #100
; CHECK-NEXT:    add x6, x6, #100
; CHECK-NEXT:    add x5, x5, #100
; CHECK-NEXT:    add x4, x4, #100
; CHECK-NEXT:    add x3, x3, #100
; CHECK-NEXT:    add x2, x2, #100
; CHECK-NEXT:    add x1, x1, #100
; CHECK-NEXT:    add x0, x0, #100
; CHECK-NEXT:    b .LBB0_1
  ; Entry block: fifteen 5x5x5 float arrays on the stack. Eleven of them are
  ; accessed in the inner loop; %22, %26, %29 and %30 are never referenced.
  %17 = alloca [5 x [5 x [5 x float]]], align 4
  %18 = alloca [5 x [5 x [5 x float]]], align 4
  %19 = alloca [5 x [5 x [5 x float]]], align 4
  %20 = alloca [5 x [5 x [5 x float]]], align 4
  %21 = alloca [5 x [5 x [5 x float]]], align 4
  %22 = alloca [5 x [5 x [5 x float]]], align 4
  %23 = alloca [5 x [5 x [5 x float]]], align 4
  %24 = alloca [5 x [5 x [5 x float]]], align 4
  %25 = alloca [5 x [5 x [5 x float]]], align 4
  %26 = alloca [5 x [5 x [5 x float]]], align 4
  %27 = alloca [5 x [5 x [5 x float]]], align 4
  %28 = alloca [5 x [5 x [5 x float]]], align 4
  %29 = alloca [5 x [5 x [5 x float]]], align 4
  %30 = alloca [5 x [5 x [5 x float]]], align 4
  %31 = alloca [5 x [5 x [5 x float]]], align 4
  ; %32 is a constant zero, so %34 is 0 and %35 is -125. %.idx and %33 have no
  ; users in this function.
  %32 = sext i32 0 to i64
  %.idx = mul nsw i64 %32, 4500
  %33 = getelementptr i8, ptr null, i64 -4500
  %34 = mul nsw i64 %32, 125
  %35 = add nsw i64 %34, -125
  br label %.preheader167

; Outer loop header: %indvars.iv175 starts at 1; %37 = 25 * %indvars.iv175 - 31.
.preheader167:                                    ; preds = %154, %16
  %indvars.iv175 = phi i64 [ 1, %16 ], [ %indvars.iv.next176, %154 ]
  %36 = mul nuw nsw i64 %indvars.iv175, 25
  %37 = add nsw i64 %36, -31
  br label %.preheader166

; Middle loop header: %indvars.iv172 starts at 1; %39 = %37 + 5 * %indvars.iv172.
.preheader166:                                    ; preds = %153, %.preheader167
  %indvars.iv172 = phi i64 [ 1, %.preheader167 ], [ %indvars.iv.next173, %153 ]
  %38 = mul nuw nsw i64 %indvars.iv172, 5
  %39 = add nsw i64 %37, %38
  br label %40

; Inner loop body: %indvars.iv counts up from 1 while %41 counts down from 5
; and decides when the loop exits.
40:                                               ; preds = %40, %.preheader166
  %indvars.iv = phi i64 [ 1, %.preheader166 ], [ %indvars.iv.next, %40 ]
  %41 = phi i64 [ 5, %.preheader166 ], [ %152, %40 ]
  ; %42 indexes the stack arrays; %43 = %35 + %42 indexes the pointer arguments.
  %42 = add nsw i64 %39, %indvars.iv
  %43 = add nsw i64 %35, %42
  ; One float load through each of the 16 pointer arguments, all at index %43.
  %44 = getelementptr float, ptr %0, i64 %43
  %45 = load float, ptr %44, align 4
  %46 = getelementptr float, ptr %1, i64 %43
  %47 = load float, ptr %46, align 4
  %48 = getelementptr float, ptr %2, i64 %43
  %49 = load float, ptr %48, align 4
  %50 = getelementptr float, ptr %3, i64 %43
  %51 = load float, ptr %50, align 4
  %52 = getelementptr float, ptr %4, i64 %43
  %53 = load float, ptr %52, align 4
  %54 = getelementptr float, ptr %5, i64 %43
  %55 = load float, ptr %54, align 4
  %56 = getelementptr float, ptr %6, i64 %43
  %57 = load float, ptr %56, align 4
  %58 = getelementptr float, ptr %7, i64 %43
  %59 = load float, ptr %58, align 4
  %60 = getelementptr float, ptr %8, i64 %43
  %61 = load float, ptr %60, align 4
  %62 = getelementptr float, ptr %9, i64 %43
  %63 = load float, ptr %62, align 4
  %64 = getelementptr float, ptr %10, i64 %43
  %65 = load float, ptr %64, align 4
  %66 = getelementptr float, ptr %11, i64 %43
  %67 = load float, ptr %66, align 4
  %68 = getelementptr float, ptr %12, i64 %43
  %69 = load float, ptr %68, align 4
  %70 = getelementptr float, ptr %13, i64 %43
  %71 = load float, ptr %70, align 4
  %72 = getelementptr float, ptr %14, i64 %43
  %73 = load float, ptr %72, align 4
  %74 = getelementptr float, ptr %15, i64 %43
  %75 = load float, ptr %74, align 4
  ; Four separate loads from the null pointer.
  %76 = load float, ptr null, align 4
  %77 = load float, ptr null, align 4
  %78 = load float, ptr null, align 4
  %79 = load float, ptr null, align 4
  ; First result: a sum of six products, each using a value loaded from a stack
  ; array (%31, %28, %27, %25, %24, %23) at index %42. Stored to %21.
  %80 = getelementptr float, ptr %31, i64 %42
  %81 = load float, ptr %80, align 4
  %82 = fmul contract float %45, %81
  %83 = getelementptr float, ptr %28, i64 %42
  %84 = load float, ptr %83, align 4
  %85 = fmul contract float %55, %84
  %86 = fadd contract float %82, %85
  %87 = getelementptr float, ptr %27, i64 %42
  %88 = load float, ptr %87, align 4
  %89 = fmul contract float %47, %88
  %90 = fadd contract float %86, %89
  %91 = getelementptr float, ptr %25, i64 %42
  %92 = load float, ptr %91, align 4
  %93 = fmul contract float %53, %92
  %94 = fadd contract float %90, %93
  %95 = getelementptr float, ptr %24, i64 %42
  %96 = load float, ptr %95, align 4
  %97 = fmul contract float %51, %96
  %98 = fadd contract float %94, %97
  %99 = getelementptr float, ptr %23, i64 %42
  %100 = load float, ptr %99, align 4
  %101 = fmul contract float %49, %100
  %102 = fadd contract float %98, %101
  %103 = getelementptr float, ptr %21, i64 %42
  store float %102, ptr %103, align 4
  ; Second result: reuses the six stack-array values; stored to %18.
  %104 = fmul contract float %47, %81
  %105 = fmul contract float %65, %84
  %106 = fadd contract float %104, %105
  %107 = fmul contract float %57, %88
  %108 = fadd contract float %106, %107
  %109 = fmul contract float %63, %92
  %110 = fadd contract float %108, %109
  %111 = fmul contract float %61, %96
  %112 = fadd contract float %110, %111
  %113 = fmul contract float %59, %100
  %114 = fadd contract float %112, %113
  %115 = getelementptr float, ptr %18, i64 %42
  store float %114, ptr %115, align 4
  ; Third result: stored to %20.
  %116 = fmul contract float %55, %81
  %117 = fmul contract float %79, %84
  %118 = fadd contract float %116, %117
  %119 = fmul contract float %65, %88
  %120 = fadd contract float %118, %119
  %121 = fmul contract float %78, %92
  %122 = fadd contract float %120, %121
  %123 = fmul contract float %76, %96
  %124 = fadd contract float %122, %123
  %125 = fmul contract float %71, %100
  %126 = fadd contract float %124, %125
  %127 = getelementptr float, ptr %20, i64 %42
  store float %126, ptr %127, align 4
  ; Fourth result: stored to %19.
  %128 = fmul contract float %53, %81
  %129 = fmul contract float %78, %84
  %130 = fadd contract float %128, %129
  %131 = fmul contract float %63, %88
  %132 = fadd contract float %130, %131
  %133 = fmul contract float %77, %92
  %134 = fadd contract float %132, %133
  %135 = fmul contract float %75, %96
  %136 = fadd contract float %134, %135
  %137 = fmul contract float %69, %100
  %138 = fadd contract float %136, %137
  %139 = getelementptr float, ptr %19, i64 %42
  store float %138, ptr %139, align 4
  ; Fifth result: stored to %17.
  %140 = fmul contract float %51, %81
  %141 = fmul contract float %76, %84
  %142 = fadd contract float %140, %141
  %143 = fmul contract float %61, %88
  %144 = fadd contract float %142, %143
  %145 = fmul contract float %75, %92
  %146 = fadd contract float %144, %145
  %147 = fmul contract float %73, %96
  %148 = fadd contract float %146, %147
  %149 = fmul contract float %67, %100
  %150 = fadd contract float %148, %149
  %151 = getelementptr float, ptr %17, i64 %42
  store float %150, ptr %151, align 4
  ; Advance both inner counters; leave the inner loop once %152 reaches zero.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %152 = add nsw i64 %41, -1
  %.not145 = icmp eq i64 %152, 0
  br i1 %.not145, label %153, label %40

; Middle loop latch: exits to %154 once the incremented counter equals 6.
153:                                              ; preds = %40
  %indvars.iv.next173 = add nuw nsw i64 %indvars.iv172, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next173, 6
  br i1 %exitcond.not, label %154, label %.preheader166

; Outer loop latch: %exitcond178.not is computed but unused, and the branch
; back to the outer header is unconditional, so the function has no return.
154:                                              ; preds = %153
  %indvars.iv.next176 = add nuw nsw i64 %indvars.iv175, 1
  %exitcond178.not = icmp eq i64 %indvars.iv.next176, 6
  br label %.preheader167
}

attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "frame-pointer"="non-leaf" "target-cpu"="generic" "target-features"="+outline-atomics,+v8a,+fp-armv8,+neon" }
+20 −30
Original line number Diff line number Diff line
@@ -83,29 +83,24 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT:    mul x0, x18, x18
; CHECK-NEXT:    ldr x5, [x5, #128]
; CHECK-NEXT:    mov v9.16b, v30.16b
; CHECK-NEXT:    mov v30.16b, v25.16b
; CHECK-NEXT:    mov v25.16b, v20.16b
; CHECK-NEXT:    mov v20.16b, v1.16b
; CHECK-NEXT:    mul x17, x16, x18
; CHECK-NEXT:    mov v30.16b, v1.16b
; CHECK-NEXT:    mov v31.16b, v26.16b
; CHECK-NEXT:    mov v26.16b, v21.16b
; CHECK-NEXT:    fmov d14, x1
; CHECK-NEXT:    mov v21.16b, v16.16b
; CHECK-NEXT:    mov v16.16b, v2.16b
; CHECK-NEXT:    mul x4, x2, x18
; CHECK-NEXT:    mov v26.16b, v2.16b
; CHECK-NEXT:    mul x17, x16, x18
; CHECK-NEXT:    mov v6.16b, v10.16b
; CHECK-NEXT:    mov v10.16b, v17.16b
; CHECK-NEXT:    fmov d15, x0
; CHECK-NEXT:    mov v17.16b, v3.16b
; CHECK-NEXT:    mov v10.16b, v3.16b
; CHECK-NEXT:    fmov d14, x1
; CHECK-NEXT:    mov v24.16b, v19.16b
; CHECK-NEXT:    mov v0.16b, v14.16b
; CHECK-NEXT:    mul x3, x14, x18
; CHECK-NEXT:    mov v19.16b, v5.16b
; CHECK-NEXT:    mul x4, x2, x18
; CHECK-NEXT:    add x11, x11, #8
; CHECK-NEXT:    add x12, x12, #1
; CHECK-NEXT:    fmov d15, x0
; CHECK-NEXT:    cmp x11, #64
; CHECK-NEXT:    mov v0.16b, v14.16b
; CHECK-NEXT:    mul x3, x14, x18
; CHECK-NEXT:    mov v15.d[1], x17
; CHECK-NEXT:    mul x6, x15, x15
; CHECK-NEXT:    cmp x11, #64
; CHECK-NEXT:    mov v0.d[1], x1
; CHECK-NEXT:    fmov d1, x4
; CHECK-NEXT:    mul x7, x15, x5
@@ -134,30 +129,26 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT:    add v12.2d, v12.2d, v4.2d
; CHECK-NEXT:    mul x16, x16, x15
; CHECK-NEXT:    add v15.2d, v15.2d, v1.2d
; CHECK-NEXT:    mov v1.16b, v20.16b
; CHECK-NEXT:    mov v1.16b, v30.16b
; CHECK-NEXT:    mov v5.d[1], x0
; CHECK-NEXT:    str q0, [sp, #96] // 16-byte Spill
; CHECK-NEXT:    mov v20.16b, v25.16b
; CHECK-NEXT:    mul x14, x14, x15
; CHECK-NEXT:    mov v25.16b, v30.16b
; CHECK-NEXT:    add v11.2d, v11.2d, v3.2d
; CHECK-NEXT:    fmov d0, x17
; CHECK-NEXT:    mov v3.16b, v17.16b
; CHECK-NEXT:    mov v17.16b, v10.16b
; CHECK-NEXT:    mul x14, x14, x15
; CHECK-NEXT:    mov v3.16b, v10.16b
; CHECK-NEXT:    mov v10.16b, v6.16b
; CHECK-NEXT:    fmov d0, x17
; CHECK-NEXT:    add v8.2d, v8.2d, v2.2d
; CHECK-NEXT:    mov v2.16b, v16.16b
; CHECK-NEXT:    mov v2.16b, v26.16b
; CHECK-NEXT:    add v26.2d, v31.2d, v4.2d
; CHECK-NEXT:    add v1.2d, v1.2d, v4.2d
; CHECK-NEXT:    add v7.2d, v7.2d, v4.2d
; CHECK-NEXT:    mov v14.d[1], x16
; CHECK-NEXT:    mov v16.16b, v21.16b
; CHECK-NEXT:    mov v21.16b, v26.16b
; CHECK-NEXT:    add v30.2d, v9.2d, v5.2d
; CHECK-NEXT:    mov v5.16b, v19.16b
; CHECK-NEXT:    add v26.2d, v31.2d, v4.2d
; CHECK-NEXT:    mov v0.d[1], x14
; CHECK-NEXT:    add v19.2d, v24.2d, v4.2d
; CHECK-NEXT:    add v1.2d, v1.2d, v4.2d
; CHECK-NEXT:    add v7.2d, v7.2d, v4.2d
; CHECK-NEXT:    ldp q4, q6, [sp, #16] // 32-byte Folded Reload
; CHECK-NEXT:    mov v0.d[1], x14
; CHECK-NEXT:    mov x14, x13
; CHECK-NEXT:    add v10.2d, v10.2d, v14.2d
; CHECK-NEXT:    add v29.2d, v29.2d, v14.2d
; CHECK-NEXT:    add v27.2d, v27.2d, v14.2d
@@ -175,7 +166,6 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT:    add v17.2d, v17.2d, v0.2d
; CHECK-NEXT:    add v4.2d, v4.2d, v0.2d
; CHECK-NEXT:    add v0.2d, v18.2d, v0.2d
; CHECK-NEXT:    mov x14, x13
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
; CHECK-NEXT:    ldp q24, q18, [sp, #64] // 32-byte Folded Reload