Commit 7507fd1c authored by Tom Stellard
Browse files

Merging r314251:

------------------------------------------------------------------------
r314251 | gberry | 2017-09-26 14:40:41 -0700 (Tue, 26 Sep 2017) | 16 lines

[AArch64][Falkor] Fix correctness bug in falkor prefetcher fix pass and correct some opcode tag computations.

Summary:
This addresses a correctness bug for LD[1234]*_POST opcodes that have
the prefetcher fix applied to them: the base register was not being
written back from the temp after being incremented, so it would appear
to never be incremented.

Also, fix some opcode tag computations based on some updated HW details
to get better tag avoidance and thus better prefetcher performance.

Reviewers: mcrosier

Subscribers: aemerson, rengolin, javed.absar, kristof.beyls

Differential Revision: https://reviews.llvm.org/D38256
------------------------------------------------------------------------

llvm-svn: 314554
parent 4041da7a
Loading
Loading
Loading
Loading
+52 −49
Original line number Diff line number Diff line
@@ -220,27 +220,27 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  default:
    return None;

  case AArch64::LD1i64:
  case AArch64::LD2i64:
    DestRegIdx = 0;
    BaseRegIdx = 3;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1i8:
  case AArch64::LD1i16:
  case AArch64::LD1i32:
  case AArch64::LD1i64:
  case AArch64::LD2i8:
  case AArch64::LD2i16:
  case AArch64::LD2i32:
  case AArch64::LD2i64:
  case AArch64::LD3i8:
  case AArch64::LD3i16:
  case AArch64::LD3i32:
  case AArch64::LD3i64:
  case AArch64::LD4i8:
  case AArch64::LD4i16:
  case AArch64::LD4i32:
    DestRegIdx = 0;
    BaseRegIdx = 3;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD3i64:
  case AArch64::LD4i64:
    DestRegIdx = -1;
    BaseRegIdx = 3;
@@ -264,23 +264,16 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  case AArch64::LD1Rv4s:
  case AArch64::LD1Rv8h:
  case AArch64::LD1Rv16b:
  case AArch64::LD1Twov1d:
  case AArch64::LD1Twov2s:
  case AArch64::LD1Twov4h:
  case AArch64::LD1Twov8b:
  case AArch64::LD2Twov2s:
  case AArch64::LD2Twov4s:
  case AArch64::LD2Twov8b:
  case AArch64::LD2Rv1d:
  case AArch64::LD2Rv2s:
  case AArch64::LD2Rv4s:
  case AArch64::LD2Rv8b:
    DestRegIdx = 0;
    BaseRegIdx = 1;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1Twov1d:
  case AArch64::LD1Twov2s:
  case AArch64::LD1Twov4h:
  case AArch64::LD1Twov8b:
  case AArch64::LD1Twov2d:
  case AArch64::LD1Twov4s:
  case AArch64::LD1Twov8h:
@@ -301,10 +294,17 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  case AArch64::LD1Fourv4s:
  case AArch64::LD1Fourv8h:
  case AArch64::LD1Fourv16b:
  case AArch64::LD2Twov2s:
  case AArch64::LD2Twov4s:
  case AArch64::LD2Twov8b:
  case AArch64::LD2Twov2d:
  case AArch64::LD2Twov4h:
  case AArch64::LD2Twov8h:
  case AArch64::LD2Twov16b:
  case AArch64::LD2Rv1d:
  case AArch64::LD2Rv2s:
  case AArch64::LD2Rv4s:
  case AArch64::LD2Rv8b:
  case AArch64::LD2Rv2d:
  case AArch64::LD2Rv4h:
  case AArch64::LD2Rv8h:
@@ -345,32 +345,32 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
    IsPrePost = false;
    break;

  case AArch64::LD1i64_POST:
  case AArch64::LD2i64_POST:
    DestRegIdx = 1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = true;
    break;

  case AArch64::LD1i8_POST:
  case AArch64::LD1i16_POST:
  case AArch64::LD1i32_POST:
  case AArch64::LD1i64_POST:
  case AArch64::LD2i8_POST:
  case AArch64::LD2i16_POST:
  case AArch64::LD2i32_POST:
  case AArch64::LD2i64_POST:
  case AArch64::LD3i8_POST:
  case AArch64::LD3i16_POST:
  case AArch64::LD3i32_POST:
  case AArch64::LD3i64_POST:
  case AArch64::LD4i8_POST:
  case AArch64::LD4i16_POST:
  case AArch64::LD4i32_POST:
    DestRegIdx = 1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = false;
    break;

  case AArch64::LD3i64_POST:
  case AArch64::LD4i64_POST:
    DestRegIdx = -1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = false;
    IsPrePost = true;
    break;

  case AArch64::LD1Onev1d_POST:
@@ -389,23 +389,16 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  case AArch64::LD1Rv4s_POST:
  case AArch64::LD1Rv8h_POST:
  case AArch64::LD1Rv16b_POST:
  case AArch64::LD1Twov1d_POST:
  case AArch64::LD1Twov2s_POST:
  case AArch64::LD1Twov4h_POST:
  case AArch64::LD1Twov8b_POST:
  case AArch64::LD2Twov2s_POST:
  case AArch64::LD2Twov4s_POST:
  case AArch64::LD2Twov8b_POST:
  case AArch64::LD2Rv1d_POST:
  case AArch64::LD2Rv2s_POST:
  case AArch64::LD2Rv4s_POST:
  case AArch64::LD2Rv8b_POST:
    DestRegIdx = 1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    IsPrePost = true;
    break;

  case AArch64::LD1Twov1d_POST:
  case AArch64::LD1Twov2s_POST:
  case AArch64::LD1Twov4h_POST:
  case AArch64::LD1Twov8b_POST:
  case AArch64::LD1Twov2d_POST:
  case AArch64::LD1Twov4s_POST:
  case AArch64::LD1Twov8h_POST:
@@ -426,10 +419,17 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  case AArch64::LD1Fourv4s_POST:
  case AArch64::LD1Fourv8h_POST:
  case AArch64::LD1Fourv16b_POST:
  case AArch64::LD2Twov2s_POST:
  case AArch64::LD2Twov4s_POST:
  case AArch64::LD2Twov8b_POST:
  case AArch64::LD2Twov2d_POST:
  case AArch64::LD2Twov4h_POST:
  case AArch64::LD2Twov8h_POST:
  case AArch64::LD2Twov16b_POST:
  case AArch64::LD2Rv1d_POST:
  case AArch64::LD2Rv2s_POST:
  case AArch64::LD2Rv4s_POST:
  case AArch64::LD2Rv8b_POST:
  case AArch64::LD2Rv2d_POST:
  case AArch64::LD2Rv4h_POST:
  case AArch64::LD2Rv8h_POST:
@@ -467,7 +467,7 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
    DestRegIdx = -1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    IsPrePost = true;
    break;

  case AArch64::LDRBBroW:
@@ -572,16 +572,19 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
    IsPrePost = true;
    break;

  case AArch64::LDNPDi:
  case AArch64::LDNPQi:
  case AArch64::LDNPSi:
  case AArch64::LDPQi:
  case AArch64::LDPDi:
  case AArch64::LDPSi:
    DestRegIdx = -1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    break;

  case AArch64::LDPDi:
  case AArch64::LDPSWi:
  case AArch64::LDPSi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
    DestRegIdx = 0;
@@ -592,18 +595,18 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {

  case AArch64::LDPQpost:
  case AArch64::LDPQpre:
  case AArch64::LDPDpost:
  case AArch64::LDPDpre:
  case AArch64::LDPSpost:
  case AArch64::LDPSpre:
    DestRegIdx = -1;
    BaseRegIdx = 3;
    OffsetIdx = 4;
    IsPrePost = true;
    break;

  case AArch64::LDPDpost:
  case AArch64::LDPDpre:
  case AArch64::LDPSWpost:
  case AArch64::LDPSWpre:
  case AArch64::LDPSpost:
  case AArch64::LDPSpre:
  case AArch64::LDPWpost:
  case AArch64::LDPWpre:
  case AArch64::LDPXpost:
+266 −11
Original line number Diff line number Diff line
# RUN: llc -mtriple=aarch64-linux-gnu -mcpu=falkor -run-pass falkor-hwpf-fix-late -o - %s | FileCheck %s
--- |
  @g = external global i32

  define void @hwpf1() { ret void }
  define void @hwpf2() { ret void }
...
---
# Verify that the tag collision between the loads is resolved.
# Verify that the tag collision between the loads is resolved for various load opcodes.

# CHECK-LABEL: name: hwpf1
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LDRWui %[[BASE]], 0
@@ -17,7 +12,7 @@ body: |
  bb.0:
    liveins: %w0, %x1

    %w2 = LDRWui %x1, 0 :: ("aarch64-strided-access" load 4 from @g)
    %w2 = LDRWui %x1, 0 :: ("aarch64-strided-access" load 4)
    %w2 = LDRWui %x1, 1

    %w0 = SUBWri %w0, 1, 0
@@ -28,19 +23,147 @@ body: |
    RET_ReallyLR
...
---
# Verify that the tag collision between the loads is resolved and written back for post increment addressing.
# CHECK-LABEL: name: hwpf2
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LD1i64 %q2, 0, %[[BASE]]
# CHECK: LDRWui %x1, 0
name:            hwpf2
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1, %q2

    %q2 = LD1i64 %q2, 0, %x1 :: ("aarch64-strided-access" load 4)
    %w2 = LDRWui %x1, 0

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpf3
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LD1i8 %q2, 0, %[[BASE]]
# CHECK: LDRWui %x1, 0
name:            hwpf3
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1, %q2

    %q2 = LD1i8 %q2, 0, %x1 :: ("aarch64-strided-access" load 4)
    %w0 = LDRWui %x1, 0

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpf4
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LD1Onev1d %[[BASE]]
# CHECK: LDRWui %x1, 0
name:            hwpf4
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1

    %d2 = LD1Onev1d %x1 :: ("aarch64-strided-access" load 4)
    %w2 = LDRWui %x1, 0

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpf5
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LD1Twov1d %[[BASE]]
# CHECK: LDRWui %x1, 0
name:            hwpf5
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1

    %d2_d3 = LD1Twov1d %x1 :: ("aarch64-strided-access" load 4)
    %w0 = LDRWui %x1, 0

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpf6
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LDPQi %[[BASE]]
# CHECK: LDRWui %x1, 3
name:            hwpf6
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1

    %q2, %q3 = LDPQi %x1, 3 :: ("aarch64-strided-access" load 4)
    %w0 = LDRWui %x1, 3

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpf7
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LDPXi %[[BASE]]
# CHECK: LDRWui %x1, 2
name:            hwpf7
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1

    %x2, %x3 = LDPXi %x1, 3 :: ("aarch64-strided-access" load 4)
    %w2 = LDRWui %x1, 2

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# Verify that the tag collision between the loads is resolved and written back
# for post increment addressing for various load opcodes.

# CHECK-LABEL: name: hwpfinc1
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LDRWpost %[[BASE]], 0
# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0
# CHECK: LDRWui %x1, 1
name:            hwpf2
name:            hwpfinc1
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1

    %x1, %w2 = LDRWpost %x1, 0 :: ("aarch64-strided-access" load 4 from @g)
    %x1, %w2 = LDRWpost %x1, 0 :: ("aarch64-strided-access" load 4)
    %w2 = LDRWui %x1, 1

    %w0 = SUBWri %w0, 1, 0
@@ -50,3 +173,135 @@ body: |
  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpfinc2
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LD1i64_POST %q2, 0, %[[BASE]]
# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0
# CHECK: LDRWui %x1, 132
name:            hwpfinc2
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1, %q2

    %x1, %q2 = LD1i64_POST %q2, 0, %x1, %x1 :: ("aarch64-strided-access" load 4)
    %w2 = LDRWui %x1, 132

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpfinc3
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LD1i8_POST %q2, 0, %[[BASE]]
# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0
# CHECK: LDRWui %x1, 132
name:            hwpfinc3
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1, %q2

    %x1, %q2 = LD1i8_POST %q2, 0, %x1, %x1 :: ("aarch64-strided-access" load 4)
    %w0 = LDRWui %x1, 132

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpfinc4
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LD1Rv1d_POST %[[BASE]]
# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0
# CHECK: LDRWui %x1, 252
name:            hwpfinc4
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1, %q2

    %x1, %d2 = LD1Rv1d_POST %x1, %xzr :: ("aarch64-strided-access" load 4)
    %w2 = LDRWui %x1, 252

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpfinc5
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LD3Threev2s_POST %[[BASE]]
# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0
# CHECK: LDRWroX %x17, %x0
name:            hwpfinc5
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1, %x17, %q2

    %x1, %d2_d3_d4 = LD3Threev2s_POST %x1, %x0 :: ("aarch64-strided-access" load 4)
    %w0 = LDRWroX %x17, %x0, 0, 0

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpfinc6
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LDPDpost %[[BASE]]
# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0
# CHECK: LDRWui %x17, 2
name:            hwpfinc6
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1, %x17, %q2

    %x1, %d2, %d3 = LDPDpost %x1, 3 :: ("aarch64-strided-access" load 4)
    %w16 = LDRWui %x17, 2

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...
---
# CHECK-LABEL: name: hwpfinc7
# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0
# CHECK: LDPXpost %[[BASE]]
# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0
# CHECK: LDRWui %x17, 2
name:            hwpfinc7
tracksRegLiveness: true
body: |
  bb.0:
    liveins: %w0, %x1, %x17, %q2

    %x1, %x2, %x3 = LDPXpost %x1, 3 :: ("aarch64-strided-access" load 4)
    %w18 = LDRWui %x17, 2

    %w0 = SUBWri %w0, 1, 0
    %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
    Bcc 9, %bb.0, implicit %nzcv

  bb.1:
    RET_ReallyLR
...