Commit c6bdd8e7 authored by Craig Topper's avatar Craig Topper
Browse files

[X86] Improve the gather scheduler models for SkylakeClient and SkylakeServer

The load ports need a cycle for each potentially loaded element, just like Haswell and Broadwell. Unlike Haswell and Broadwell, the number of uops does not scale with the number of elements. Instead, the load uops run for multiple cycles.

I've taken the latency numbers from uops.info. The port binding for the non-load uops is taken from the original IACA data I have.

Differential Revision: https://reviews.llvm.org/D74000
parent baafe82b
Loading
Loading
Loading
Loading
+21 −23
Original line number Diff line number Diff line
@@ -1593,33 +1593,31 @@ def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> {
}
def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F(32|64)m")>;

def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
  let Latency = 22;
  let NumMicroOps = 5;
def SKLWriteResGroupVEX2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
  let Latency = 18;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  let ResourceCycles = [1,2,1,1];
}
def: InstRW<[SKLWriteResGroup196_1], (instrs VGATHERDPSrm,
                                             VGATHERDPDrm,
                                             VGATHERQPDrm,
                                             VGATHERQPSrm,
                                             VPGATHERDDrm,
                                             VPGATHERDQrm,
                                             VPGATHERQDrm,
                                             VPGATHERQQrm)>;
def: InstRW<[SKLWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
                                            VGATHERQPDrm, VPGATHERQQrm,
                                            VGATHERQPSrm, VPGATHERQDrm)>;

def SKLWriteResGroup196_2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
  let Latency = 25;
  let NumMicroOps = 5;
  let ResourceCycles = [1,2,1,1];
// Scheduling resources for the VEX gathers bound to this group by the InstRW
// that follows. Per the commit description, the uop count stays fixed and the
// load uops instead run for multiple cycles.
def SKLWriteResGroupVEX4 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
  let Latency = 20;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  // Entries line up with the resource list above; the SKLPort23 (load) entry
  // is 4, one cycle per potentially loaded element.
  let ResourceCycles = [1,4,1,1];
}
def: InstRW<[SKLWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
                                            VGATHERDPSrm,  VPGATHERDDrm,
                                            VGATHERQPDYrm, VPGATHERQQYrm,
                                            VGATHERQPSYrm,  VPGATHERQDYrm)>;

def SKLWriteResGroupVEX8 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
  let Latency = 22;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  let ResourceCycles = [1,8,1,1];
}
def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm,
                                             VGATHERQPDYrm,
                                             VGATHERQPSYrm,
                                             VPGATHERDDYrm,
                                             VPGATHERDQYrm,
                                             VPGATHERQDYrm,
                                             VPGATHERQQYrm,
                                             VGATHERDPDYrm)>;
def: InstRW<[SKLWriteResGroupVEX8], (instrs VGATHERDPSYrm,  VPGATHERDDYrm)>;

def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
  let Latency = 23;
+53 −98
Original line number Diff line number Diff line
@@ -2145,14 +2145,6 @@ def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> {
def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)",
                                              "VPMULLQZrm(b?)")>;

def SKXWriteResGroup214 : SchedWriteRes<[]> {
  let Latency = 20;
  let NumMicroOps = 0;
}
def: InstRW<[SKXWriteResGroup214], (instrs VGATHERDPSZ128rm,
                                           VGATHERQPSZrm,
                                           VPGATHERDDZ128rm)>;

def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> {
  let Latency = 20;
  let NumMicroOps = 1;
@@ -2167,15 +2159,41 @@ def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
}
def : SchedAlias<WriteFDiv64XLd, SKXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair

def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 20;
  let NumMicroOps = 5;
def SKXWriteGatherEVEX2 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 17;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  let ResourceCycles = [1,2,1,1];
}
def: InstRW<[SKXWriteResGroup218], (instrs VGATHERQPSZ128rm,
                                           VGATHERQPSZ256rm,
                                           VPGATHERQDZ128rm,
                                           VPGATHERQDZ256rm)>;
def: InstRW<[SKXWriteGatherEVEX2], (instrs VGATHERQPSZ128rm, VPGATHERQDZ128rm,
                                           VGATHERDPDZ128rm, VPGATHERDQZ128rm,
                                           VGATHERQPDZ128rm, VPGATHERQQZ128rm)>;

def SKXWriteGatherEVEX4 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 19;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  let ResourceCycles = [1,4,1,1];
}
def: InstRW<[SKXWriteGatherEVEX4], (instrs VGATHERQPSZ256rm, VPGATHERQDZ256rm,
                                           VGATHERQPDZ256rm, VPGATHERQQZ256rm,
                                           VGATHERDPSZ128rm, VPGATHERDDZ128rm,
                                           VGATHERDPDZ256rm, VPGATHERDQZ256rm)>;

def SKXWriteGatherEVEX8 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 21;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  let ResourceCycles = [1,8,1,1];
}
def: InstRW<[SKXWriteGatherEVEX8], (instrs VGATHERDPSZ256rm, VPGATHERDDZ256rm,
                                           VGATHERDPDZrm,    VPGATHERDQZrm,
                                           VGATHERQPDZrm,    VPGATHERQQZrm,
                                           VGATHERQPSZrm,    VPGATHERQDZrm)>;

def SKXWriteGatherEVEX16 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 25;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  let ResourceCycles = [1,16,1,1];
}
def: InstRW<[SKXWriteGatherEVEX16], (instrs VGATHERDPSZrm, VPGATHERDDZrm)>;

def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
  let Latency = 20;
@@ -2205,57 +2223,31 @@ def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F(32|64)m")>;

def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 22;
  let NumMicroOps = 5;
def SKXWriteResGroupVEX2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
  let Latency = 18;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  let ResourceCycles = [1,2,1,1];
}
def: InstRW<[SKXWriteResGroup224], (instrs VGATHERDPDZ128rm,
                                           VGATHERQPDZ128rm,
                                           VPGATHERDQZ128rm,
                                           VPGATHERQQZ128rm)>;
def: InstRW<[SKXWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
                                            VGATHERQPDrm, VPGATHERQQrm,
                                            VGATHERQPSrm, VPGATHERQDrm)>;

def SKXWriteResGroup224_2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
  let Latency = 22;
  let NumMicroOps = 5;
  let ResourceCycles = [1,2,1,1];
// Scheduling resources for the VEX gathers bound to this group by the InstRW
// that follows. Per the commit description, the uop count stays fixed and the
// load uops instead run for multiple cycles.
def SKXWriteResGroupVEX4 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
  let Latency = 20;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  // Entries line up with the resource list above; the SKXPort23 (load) entry
  // is 4, one cycle per potentially loaded element.
  let ResourceCycles = [1,4,1,1];
}
def: InstRW<[SKXWriteResGroup224_2], (instrs VGATHERDPSrm,
                                             VGATHERDPDrm,
                                             VGATHERQPDrm,
                                             VGATHERQPSrm,
                                             VPGATHERDDrm,
                                             VPGATHERDQrm,
                                             VPGATHERQDrm,
                                             VPGATHERQQrm,
                                             VPGATHERDDrm,
                                             VPGATHERQDrm,
                                             VPGATHERDQrm,
                                             VPGATHERQQrm,
                                             VGATHERDPSrm,
                                             VGATHERQPSrm,
                                             VGATHERDPDrm,
                                             VGATHERQPDrm)>;

def SKXWriteResGroup224_3 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
  let Latency = 25;
  let NumMicroOps = 5;
  let ResourceCycles = [1,2,1,1];
def: InstRW<[SKXWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
                                            VGATHERDPSrm,  VPGATHERDDrm,
                                            VGATHERQPDYrm, VPGATHERQQYrm,
                                            VGATHERQPSYrm,  VPGATHERQDYrm)>;

def SKXWriteResGroupVEX8 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
  let Latency = 22;
  let NumMicroOps = 5; // 2 uops perform multiple loads
  let ResourceCycles = [1,8,1,1];
}
def: InstRW<[SKXWriteResGroup224_3], (instrs VGATHERDPSYrm,
                                             VGATHERQPDYrm,
                                             VGATHERQPSYrm,
                                             VPGATHERDDYrm,
                                             VPGATHERDQYrm,
                                             VPGATHERQDYrm,
                                             VPGATHERQQYrm,
                                             VPGATHERDDYrm,
                                             VPGATHERQDYrm,
                                             VPGATHERDQYrm,
                                             VPGATHERQQYrm,
                                             VGATHERDPSYrm,
                                             VGATHERQPSYrm,
                                             VGATHERDPDYrm)>;
def: InstRW<[SKXWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;

def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
  let Latency = 22;
@@ -2279,27 +2271,6 @@ def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI(16|32)m")>;

def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 25;
  let NumMicroOps = 5;
  let ResourceCycles = [1,2,1,1];
}
def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm,
                                           VGATHERQPDZ256rm,
                                           VPGATHERDQZ256rm,
                                           VPGATHERQDZrm,
                                           VPGATHERQQZ256rm)>;

def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 26;
  let NumMicroOps = 5;
  let ResourceCycles = [1,2,1,1];
}
def: InstRW<[SKXWriteResGroup238], (instrs VGATHERDPDZrm,
                                           VGATHERQPDZrm,
                                           VPGATHERDQZrm,
                                           VPGATHERQQZrm)>;

def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
  let Latency = 27;
  let NumMicroOps = 2;
@@ -2307,14 +2278,6 @@ def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F(32|64)m")>;

def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 27;
  let NumMicroOps = 5;
  let ResourceCycles = [1,2,1,1];
}
def: InstRW<[SKXWriteResGroup240], (instrs VGATHERDPSZ256rm,
                                           VPGATHERDDZ256rm)>;

def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
  let Latency = 29;
  let NumMicroOps = 15;
@@ -2329,14 +2292,6 @@ def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>;

def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
  let Latency = 30;
  let NumMicroOps = 5;
  let ResourceCycles = [1,2,1,1];
}
def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm,
                                           VPGATHERDDZrm)>;

def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> {
  let Latency = 35;
  let NumMicroOps = 23;
+14 −1
Original line number Diff line number Diff line
@@ -181,6 +181,11 @@ vpaddq %zmm16, %zmm17, %zmm19 {z}{k1}
vpaddq            (%rax), %zmm17, %zmm19 {z}{k1}
vpaddq            (%rax){1to8}, %zmm17, %zmm19 {z}{k1}

vpgatherdq        (%rax,%ymm1,2), %zmm2 {k1}
vpgatherdd        (%rax,%zmm1,2), %zmm2 {k1}
vpgatherqq        (%rax,%zmm1,2), %zmm2 {k1}
vpgatherqd        (%rax,%zmm1,2), %ymm2 {k1}

vpmulld           %zmm16, %zmm17, %zmm19
vpmulld           (%rax), %zmm17, %zmm19
vpmulld           (%rax){1to16}, %zmm17, %zmm19
@@ -686,6 +691,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT:  1      1     0.50                        vpaddq	%zmm16, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT:  2      8     0.50    *                   vpaddq	(%rax), %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT:  2      8     0.50    *                   vpaddq	(%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherdq	(%rax,%ymm1,2), %zmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherdd	(%rax,%zmm1,2), %zmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherqq	(%rax,%zmm1,2), %zmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherqd	(%rax,%zmm1,2), %ymm2 {%k1}
# CHECK-NEXT:  1      5     1.00                        vpmulld	%zmm16, %zmm17, %zmm19
# CHECK-NEXT:  2      12    1.00    *                   vpmulld	(%rax), %zmm17, %zmm19
# CHECK-NEXT:  2      12    1.00    *                   vpmulld	(%rax){1to16}, %zmm17, %zmm19
@@ -999,7 +1008,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}

# CHECK:      Resource pressure per iteration:
# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
# CHECK-NEXT:  -     1506.00 129.00 144.00  -    270.00 153.00 153.00
# CHECK-NEXT:  -     1506.00 129.00 144.00  -    270.00 155.00 155.00

# CHECK:      Resource pressure by instruction:
# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -1165,6 +1174,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddq	%zmm16, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddq	(%rax), %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddq	(%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherdq	(%rax,%ymm1,2), %zmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherdd	(%rax,%zmm1,2), %zmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherqq	(%rax,%zmm1,2), %zmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherqd	(%rax,%zmm1,2), %ymm2 {%k1}
# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulld	%zmm16, %zmm17, %zmm19
# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulld	(%rax), %zmm17, %zmm19
# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulld	(%rax){1to16}, %zmm17, %zmm19
+53 −1
Original line number Diff line number Diff line
@@ -121,6 +121,16 @@ vdivps %ymm16, %ymm17, %ymm19 {z}{k1}
vdivps            (%rax), %ymm17, %ymm19 {z}{k1}
vdivps            (%rax){1to8}, %ymm17, %ymm19 {z}{k1}

vgatherdpd        (%rax,%xmm1,2), %ymm2 {k1}
vgatherdps        (%rax,%ymm1,2), %ymm2 {k1}
vgatherqpd        (%rax,%ymm1,2), %ymm2 {k1}
vgatherqps        (%rax,%ymm1,2), %xmm2 {k1}

vgatherdpd        (%rax,%xmm1,2), %xmm2 {k1}
vgatherdps        (%rax,%xmm1,2), %xmm2 {k1}
vgatherqpd        (%rax,%xmm1,2), %xmm2 {k1}
vgatherqps        (%rax,%xmm1,2), %xmm2 {k1}

vmaxpd            %xmm16, %xmm17, %xmm19
vmaxpd            (%rax), %xmm17, %xmm19
vmaxpd            (%rax){1to2}, %xmm17, %xmm19
@@ -421,6 +431,16 @@ vpermq %ymm16, %ymm17, %ymm19 {z}{k1}
vpermq            (%rax), %ymm17, %ymm19 {z}{k1}
vpermq            (%rax){1to4}, %ymm17, %ymm19 {z}{k1}

vpgatherdq        (%rax,%xmm1,2), %ymm2 {k1}
vpgatherdd        (%rax,%ymm1,2), %ymm2 {k1}
vpgatherqq        (%rax,%ymm1,2), %ymm2 {k1}
vpgatherqd        (%rax,%ymm1,2), %xmm2 {k1}

vpgatherdq        (%rax,%xmm1,2), %xmm2 {k1}
vpgatherdd        (%rax,%xmm1,2), %xmm2 {k1}
vpgatherqq        (%rax,%xmm1,2), %xmm2 {k1}
vpgatherqd        (%rax,%xmm1,2), %xmm2 {k1}

vpmulld           %xmm16, %xmm17, %xmm19
vpmulld           (%rax), %xmm17, %xmm19
vpmulld           (%rax){1to4}, %xmm17, %xmm19
@@ -858,6 +878,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT:  3      29    28.00                       vdivps	%ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  4      36    28.00   *                   vdivps	(%rax), %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  4      36    28.00   *                   vdivps	(%rax){1to8}, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  1      5     0.50    *                   vgatherdpd	(%rax,%xmm1,2), %ymm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vgatherdps	(%rax,%ymm1,2), %ymm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vgatherqpd	(%rax,%ymm1,2), %ymm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vgatherqps	(%rax,%ymm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vgatherdpd	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vgatherdps	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vgatherqpd	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vgatherqps	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      3     1.00                        vmaxpd	%xmm16, %xmm17, %xmm19
# CHECK-NEXT:  2      9     1.00    *                   vmaxpd	(%rax), %xmm17, %xmm19
# CHECK-NEXT:  2      9     1.00    *                   vmaxpd	(%rax){1to2}, %xmm17, %xmm19
@@ -1128,6 +1156,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT:  1      1     1.00                        vpermq	%ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  2      8     1.00    *                   vpermq	(%rax), %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  2      8     1.00    *                   vpermq	(%rax){1to4}, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherdq	(%rax,%xmm1,2), %ymm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherdd	(%rax,%ymm1,2), %ymm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherqq	(%rax,%ymm1,2), %ymm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherqd	(%rax,%ymm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherdq	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherdd	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherqq	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     0.50    *                   vpgatherqd	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  1      5     1.00                        vpmulld	%xmm16, %xmm17, %xmm19
# CHECK-NEXT:  2      11    1.00    *                   vpmulld	(%rax), %xmm17, %xmm19
# CHECK-NEXT:  2      11    1.00    *                   vpmulld	(%rax){1to4}, %xmm17, %xmm19
@@ -1429,7 +1465,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}

# CHECK:      Resource pressure per iteration:
# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
# CHECK-NEXT:  -     1935.00 180.00 229.50  -    346.50 222.00 222.00
# CHECK-NEXT:  -     1935.00 180.00 229.50  -    346.50 230.00 230.00

# CHECK:      Resource pressure by instruction:
# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -1541,6 +1577,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT:  -     28.00  2.50    -      -     0.50    -      -     vdivps	%ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  -     28.00  2.50    -      -     0.50   0.50   0.50   vdivps	(%rax), %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  -     28.00  2.50    -      -     0.50   0.50   0.50   vdivps	(%rax){1to8}, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vgatherdpd	(%rax,%xmm1,2), %ymm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vgatherdps	(%rax,%ymm1,2), %ymm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vgatherqpd	(%rax,%ymm1,2), %ymm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vgatherqps	(%rax,%ymm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vgatherdpd	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vgatherdps	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vgatherqpd	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vgatherqps	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxpd	%xmm16, %xmm17, %xmm19
# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxpd	(%rax), %xmm17, %xmm19
# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxpd	(%rax){1to2}, %xmm17, %xmm19
@@ -1811,6 +1855,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermq	%ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermq	(%rax), %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermq	(%rax){1to4}, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherdq	(%rax,%xmm1,2), %ymm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherdd	(%rax,%ymm1,2), %ymm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherqq	(%rax,%ymm1,2), %ymm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherqd	(%rax,%ymm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherdq	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherdd	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherqq	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vpgatherqd	(%rax,%xmm1,2), %xmm2 {%k1}
# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulld	%xmm16, %xmm17, %xmm19
# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulld	(%rax), %xmm17, %xmm19
# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulld	(%rax){1to4}, %xmm17, %xmm19
+27 −27

File changed.

Preview size limit exceeded, changes collapsed.

Loading