Commit f65493a8 authored by Craig Topper's avatar Craig Topper
Browse files

[X86] Teach X86MCInstLower to swap operands of commutable instructions to...

[X86] Teach X86MCInstLower to swap operands of commutable instructions to enable 2-byte VEX encoding.

Summary:
Commutable instructions with two source operands encode those operands
in the VEX.VVVV field and in the r/m field of the MODRM byte combined
with the VEX.B field.

The VEX.B field is not present in the 2-byte VEX encoding. If the
VEX.VVVV source register is in the range 0-7 and the other register is
in the range 8-15, we can swap them to avoid needing the VEX.B field.
This works as long as the VEX.W, VEX.mmmmm, and VEX.X fields are also
not needed.

Fixes PR36706.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68550
parent abc04ff4
Loading
Loading
Loading
Loading
+46 −0
Original line number Diff line number Diff line
@@ -876,6 +876,52 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
  case X86::MOVSX64rr32:
    SimplifyMOVSX(OutMI);
    break;

  case X86::VCMPPDrri:
  case X86::VCMPPDYrri:
  case X86::VCMPPSrri:
  case X86::VCMPPSYrri:
  case X86::VCMPSDrr:
  case X86::VCMPSSrr: {
    // Swap the operands if it will enable a 2 byte VEX encoding.
    // FIXME: Change the immediate to improve opportunities?
    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
        X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
      unsigned Imm = MI->getOperand(3).getImm() & 0x7;
      switch (Imm) {
      default: break;
      case 0x00: // EQUAL
      case 0x03: // UNORDERED
      case 0x04: // NOT EQUAL
      case 0x07: // ORDERED
        std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
        break;
      }
    }
    break;
  }

  case X86::VMOVHLPSrr:
  case X86::VUNPCKHPDrr:
    // These are not truly commutable so hide them from the default case.
    break;

  default: {
    // If the instruction is a commutable arithmetic instruction we might be
    // able to commute the operands to get a 2 byte VEX prefix.
    uint64_t TSFlags = MI->getDesc().TSFlags;
    if (MI->getDesc().isCommutable() &&
        (TSFlags & X86II::EncodingMask) == X86II::VEX &&
        (TSFlags & X86II::OpMapMask) == X86II::TB &&
        (TSFlags & X86II::FormMask) == X86II::MRMSrcReg &&
        !(TSFlags & X86II::VEX_W) && (TSFlags & X86II::VEX_4V) &&
        OutMI.getNumOperands() == 3) {
      if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
          X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg()))
        std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
    }
    break;
  }
  }
}

+2 −2
Original line number Diff line number Diff line
@@ -51,8 +51,8 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
; preserved ymm8-ymm15
; X64-LABEL: testf16_regs
; X64: call
; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; X64: vaddps  {{%ymm[0-1]}}, {{%ymm[8-9]}}, {{%ymm[0-1]}}
; X64: vaddps  {{%ymm[0-1]}}, {{%ymm[8-9]}}, {{%ymm[0-1]}}
; X64: ret

define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+26 −26
Original line number Diff line number Diff line
@@ -4906,18 +4906,18 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm7
; KNL-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; KNL-NEXT:    vpcmpeqw %ymm8, %ymm0, %ymm0
; KNL-NEXT:    vpcmpeqw %ymm8, %ymm7, %ymm7
; KNL-NEXT:    vpcmpeqw %ymm8, %ymm1, %ymm1
; KNL-NEXT:    vpcmpeqw %ymm0, %ymm8, %ymm0
; KNL-NEXT:    vpcmpeqw %ymm7, %ymm8, %ymm7
; KNL-NEXT:    vpcmpeqw %ymm1, %ymm8, %ymm1
; KNL-NEXT:    vpor %ymm1, %ymm0, %ymm0
; KNL-NEXT:    vpcmpeqw %ymm8, %ymm6, %ymm1
; KNL-NEXT:    vpcmpeqw %ymm6, %ymm8, %ymm1
; KNL-NEXT:    vpor %ymm1, %ymm7, %ymm1
; KNL-NEXT:    vpcmpeqw %ymm8, %ymm2, %ymm2
; KNL-NEXT:    vpcmpeqw %ymm8, %ymm5, %ymm5
; KNL-NEXT:    vpcmpeqw %ymm8, %ymm3, %ymm3
; KNL-NEXT:    vpcmpeqw %ymm2, %ymm8, %ymm2
; KNL-NEXT:    vpcmpeqw %ymm5, %ymm8, %ymm5
; KNL-NEXT:    vpcmpeqw %ymm3, %ymm8, %ymm3
; KNL-NEXT:    vpor %ymm3, %ymm2, %ymm2
; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
; KNL-NEXT:    vpcmpeqw %ymm8, %ymm4, %ymm2
; KNL-NEXT:    vpcmpeqw %ymm4, %ymm8, %ymm2
; KNL-NEXT:    vpor %ymm2, %ymm5, %ymm2
; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
@@ -4992,18 +4992,18 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm7
; AVX512DQ-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; AVX512DQ-NEXT:    vpcmpeqw %ymm8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpcmpeqw %ymm8, %ymm7, %ymm7
; AVX512DQ-NEXT:    vpcmpeqw %ymm8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpcmpeqw %ymm0, %ymm8, %ymm0
; AVX512DQ-NEXT:    vpcmpeqw %ymm7, %ymm8, %ymm7
; AVX512DQ-NEXT:    vpcmpeqw %ymm1, %ymm8, %ymm1
; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpcmpeqw %ymm8, %ymm6, %ymm1
; AVX512DQ-NEXT:    vpcmpeqw %ymm6, %ymm8, %ymm1
; AVX512DQ-NEXT:    vpor %ymm1, %ymm7, %ymm1
; AVX512DQ-NEXT:    vpcmpeqw %ymm8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpcmpeqw %ymm8, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpcmpeqw %ymm8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpcmpeqw %ymm2, %ymm8, %ymm2
; AVX512DQ-NEXT:    vpcmpeqw %ymm5, %ymm8, %ymm5
; AVX512DQ-NEXT:    vpcmpeqw %ymm3, %ymm8, %ymm3
; AVX512DQ-NEXT:    vpor %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpcmpeqw %ymm8, %ymm4, %ymm2
; AVX512DQ-NEXT:    vpcmpeqw %ymm4, %ymm8, %ymm2
; AVX512DQ-NEXT:    vpor %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
@@ -5075,21 +5075,21 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm11
; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm7
; KNL-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; KNL-NEXT:    vpcmpeqb %ymm8, %ymm0, %ymm13
; KNL-NEXT:    vpcmpeqb %ymm0, %ymm8, %ymm13
; KNL-NEXT:    vextracti128 $1, %ymm13, %xmm4
; KNL-NEXT:    vpcmpeqb %ymm8, %ymm7, %ymm7
; KNL-NEXT:    vpcmpeqb %ymm7, %ymm8, %ymm7
; KNL-NEXT:    vextracti128 $1, %ymm7, %xmm5
; KNL-NEXT:    vpcmpeqb %ymm8, %ymm1, %ymm1
; KNL-NEXT:    vpcmpeqb %ymm1, %ymm8, %ymm1
; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm6
; KNL-NEXT:    vpor %xmm6, %xmm4, %xmm12
; KNL-NEXT:    vpcmpeqb %ymm8, %ymm11, %ymm6
; KNL-NEXT:    vextracti128 $1, %ymm6, %xmm4
; KNL-NEXT:    vpor %xmm4, %xmm5, %xmm11
; KNL-NEXT:    vpcmpeqb %ymm8, %ymm2, %ymm2
; KNL-NEXT:    vpcmpeqb %ymm2, %ymm8, %ymm2
; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm5
; KNL-NEXT:    vpcmpeqb %ymm8, %ymm10, %ymm10
; KNL-NEXT:    vextracti128 $1, %ymm10, %xmm4
; KNL-NEXT:    vpcmpeqb %ymm8, %ymm3, %ymm3
; KNL-NEXT:    vpcmpeqb %ymm3, %ymm8, %ymm3
; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm0
; KNL-NEXT:    vpor %xmm0, %xmm5, %xmm0
; KNL-NEXT:    vpand %xmm0, %xmm12, %xmm12
@@ -5185,21 +5185,21 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm11
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm7
; AVX512DQ-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; AVX512DQ-NEXT:    vpcmpeqb %ymm8, %ymm0, %ymm13
; AVX512DQ-NEXT:    vpcmpeqb %ymm0, %ymm8, %ymm13
; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm4
; AVX512DQ-NEXT:    vpcmpeqb %ymm8, %ymm7, %ymm7
; AVX512DQ-NEXT:    vpcmpeqb %ymm7, %ymm8, %ymm7
; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm5
; AVX512DQ-NEXT:    vpcmpeqb %ymm8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpcmpeqb %ymm1, %ymm8, %ymm1
; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm6
; AVX512DQ-NEXT:    vpor %xmm6, %xmm4, %xmm12
; AVX512DQ-NEXT:    vpcmpeqb %ymm8, %ymm11, %ymm6
; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm4
; AVX512DQ-NEXT:    vpor %xmm4, %xmm5, %xmm11
; AVX512DQ-NEXT:    vpcmpeqb %ymm8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpcmpeqb %ymm2, %ymm8, %ymm2
; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm5
; AVX512DQ-NEXT:    vpcmpeqb %ymm8, %ymm10, %ymm10
; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm4
; AVX512DQ-NEXT:    vpcmpeqb %ymm8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpcmpeqb %ymm3, %ymm8, %ymm3
; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm0
; AVX512DQ-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX512DQ-NEXT:    vpand %xmm0, %xmm12, %xmm12
+8 −8
Original line number Diff line number Diff line
@@ -386,9 +386,9 @@ define x86_regcallcc float @test_CallargRetFloat(float %a) {
; WIN64-NEXT:    .seh_savexmm %xmm8, 0
; WIN64-NEXT:    .seh_endprologue
; WIN64-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
; WIN64-NEXT:    vaddss %xmm8, %xmm0, %xmm0
; WIN64-NEXT:    vaddss %xmm0, %xmm8, %xmm0
; WIN64-NEXT:    callq test_argRetFloat
; WIN64-NEXT:    vaddss %xmm8, %xmm0, %xmm0
; WIN64-NEXT:    vaddss %xmm0, %xmm8, %xmm0
; WIN64-NEXT:    vmovaps (%rsp), %xmm8 # 16-byte Reload
; WIN64-NEXT:    addq $16, %rsp
; WIN64-NEXT:    popq %rsp
@@ -407,9 +407,9 @@ define x86_regcallcc float @test_CallargRetFloat(float %a) {
; LINUXOSX64-NEXT:    .cfi_offset %rsp, -16
; LINUXOSX64-NEXT:    .cfi_offset %xmm8, -32
; LINUXOSX64-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
; LINUXOSX64-NEXT:    vaddss %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT:    vaddss %xmm0, %xmm8, %xmm0
; LINUXOSX64-NEXT:    callq test_argRetFloat
; LINUXOSX64-NEXT:    vaddss %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT:    vaddss %xmm0, %xmm8, %xmm0
; LINUXOSX64-NEXT:    vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT:    addq $16, %rsp
; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 16
@@ -468,9 +468,9 @@ define x86_regcallcc double @test_CallargRetDouble(double %a) {
; WIN64-NEXT:    .seh_savexmm %xmm8, 0
; WIN64-NEXT:    .seh_endprologue
; WIN64-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; WIN64-NEXT:    vaddsd %xmm8, %xmm0, %xmm0
; WIN64-NEXT:    vaddsd %xmm0, %xmm8, %xmm0
; WIN64-NEXT:    callq test_argRetDouble
; WIN64-NEXT:    vaddsd %xmm8, %xmm0, %xmm0
; WIN64-NEXT:    vaddsd %xmm0, %xmm8, %xmm0
; WIN64-NEXT:    vmovaps (%rsp), %xmm8 # 16-byte Reload
; WIN64-NEXT:    addq $16, %rsp
; WIN64-NEXT:    popq %rsp
@@ -489,9 +489,9 @@ define x86_regcallcc double @test_CallargRetDouble(double %a) {
; LINUXOSX64-NEXT:    .cfi_offset %rsp, -16
; LINUXOSX64-NEXT:    .cfi_offset %xmm8, -32
; LINUXOSX64-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; LINUXOSX64-NEXT:    vaddsd %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT:    vaddsd %xmm0, %xmm8, %xmm0
; LINUXOSX64-NEXT:    callq test_argRetDouble
; LINUXOSX64-NEXT:    vaddsd %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT:    vaddsd %xmm0, %xmm8, %xmm0
; LINUXOSX64-NEXT:    vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT:    addq $16, %rsp
; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 16
+1 −1
Original line number Diff line number Diff line
@@ -183,7 +183,7 @@ define <64 x i16> @test8(<64 x i8> %x, <64 x i16> %a, <64 x i16> %b) {
; CHECK-KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm8
; CHECK-KNL-NEXT:    vmovdqa 16(%rbp), %ymm9
; CHECK-KNL-NEXT:    vpxor %xmm10, %xmm10, %xmm10
; CHECK-KNL-NEXT:    vpcmpeqb %ymm10, %ymm0, %ymm11
; CHECK-KNL-NEXT:    vpcmpeqb %ymm0, %ymm10, %ymm11
; CHECK-KNL-NEXT:    vpmovsxbw %xmm11, %ymm0
; CHECK-KNL-NEXT:    vpblendvb %ymm0, %ymm1, %ymm5, %ymm0
; CHECK-KNL-NEXT:    vextracti128 $1, %ymm11, %xmm1
Loading