llvm/lib/Target/X86/X86FlagsCopyLowering.cpp  +125 −82

@@ -36,6 +36,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -98,6 +99,7 @@ private:
   const X86InstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const TargetRegisterClass *PromoteRC;
+  MachineDominatorTree *MDT;

   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
                                   MachineInstr &CopyDefI);
@@ -145,6 +147,7 @@ FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
 char X86FlagsCopyLoweringPass::ID = 0;

 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
@@ -342,6 +345,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   TII = Subtarget.getInstrInfo();
   TRI = Subtarget.getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
   PromoteRC = &X86::GR8RegClass;

   if (MF.begin() == MF.end())
@@ -416,7 +420,45 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
     // of these up front instead.
     CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);

-    for (auto MII = std::next(CopyI->getIterator()), MIE = MBB.instr_end();
+    // Collect the basic blocks we need to scan. Typically this will just be
+    // a single basic block but we may have to scan multiple blocks if the
+    // EFLAGS copy lives into successors.
+    SmallVector<MachineBasicBlock *, 2> Blocks;
+    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
+    Blocks.push_back(&MBB);
+    VisitedBlocks.insert(&MBB);
+    do {
+      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
+
+      // We currently don't do any PHI insertion and so we require that the
+      // test basic block dominates all of the use basic blocks.
+      //
+      // We could in theory do PHI insertion here if it becomes useful by just
+      // taking undef values in along every edge that we don't trace this
+      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
+      // but still seems like a great deal of complexity.
+      //
+      // Because it is theoretically possible that some earlier MI pass or
+      // other lowering transformation could induce this to happen, we do
+      // a hard check even in non-debug builds here.
+      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
+        DEBUG({
+          dbgs() << "ERROR: Encountered use that is not dominated by our test "
+                    "basic block! Rewriting this would require inserting PHI "
+                    "nodes to track the flag state across the CFG.\n\nTest "
+                    "block:\n";
+          TestMBB.dump();
+          dbgs() << "Use block:\n";
+          UseMBB.dump();
+        });
+        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
+                           "does not dominate all uses.");
+      }
+
+      for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
+                                      : UseMBB.instr_begin(),
+                MIE = UseMBB.instr_end();
            MII != MIE;) {
         MachineInstr &MI = *MII++;
         MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
@@ -426,9 +468,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
         // scanning here.
         //
         // NB!!! Many instructions only modify some flags. LLVM currently
-        // models this as clobbering all flags, but if that ever changes this
-        // will need to be carefully updated to handle that more complex
-        // logic.
+        // models this as clobbering all flags, but if that ever changes
+        // this will need to be carefully updated to handle that more
+        // complex logic.
         FlagsKilled = true;
         break;
       }
@@ -454,7 +496,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
         do {
           JmpIs.push_back(&*JmpIt);
           ++JmpIt;
-        } while (JmpIt != MBB.instr_end() &&
+        } while (JmpIt != UseMBB.instr_end() &&
                  X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
                      X86::COND_INVALID);
         break;
@@ -463,12 +505,14 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
       // Otherwise we can just rewrite in-place.
       if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
         rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
-      } else if (X86::getCondFromSETOpc(MI.getOpcode()) != X86::COND_INVALID) {
+      } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
+                 X86::COND_INVALID) {
         rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
       } else if (MI.getOpcode() == TargetOpcode::COPY) {
         rewriteCopy(MI, *FlagUse, CopyDefI);
       } else {
-        // We assume that arithmetic instructions that use flags also def them.
+        // We assume that arithmetic instructions that use flags also def
+        // them.
         assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
                "Expected a def of EFLAGS for this instruction!");
@@ -489,30 +533,29 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
         break;
       }

-    // If we didn't find a kill (or equivalent) check that the flags don't
-    // live-out of the basic block. Currently we don't support lowering copies
-    // of flags that live out in this fashion.
-    if (!FlagsKilled &&
-        llvm::any_of(MBB.successors(), [](MachineBasicBlock *SuccMBB) {
-          return SuccMBB->isLiveIn(X86::EFLAGS);
-        })) {
-      DEBUG({
-        dbgs() << "ERROR: Found a copied EFLAGS live-out from basic block:\n"
-               << "----\n";
-        MBB.dump();
-        dbgs() << "----\n"
-               << "ERROR: Cannot lower this EFLAGS copy!\n";
-      });
-      report_fatal_error(
-          "Cannot lower EFLAGS copy that lives out of a basic block!");
-    }
+      // If the flags were killed, we're done with this block.
+      if (FlagsKilled)
+        break;
+
+      // Otherwise we need to scan successors for ones where the flags live-in
+      // and queue those up for processing.
+      for (MachineBasicBlock *SuccMBB : UseMBB.successors())
+        if (SuccMBB->isLiveIn(X86::EFLAGS) &&
+            VisitedBlocks.insert(SuccMBB).second)
+          Blocks.push_back(SuccMBB);
+    } while (!Blocks.empty());

     // Now rewrite the jumps that use the flags. These we handle specially
-    // because if there are multiple jumps we'll have to do surgery on the CFG.
+    // because if there are multiple jumps in a single basic block we'll have
+    // to do surgery on the CFG.
+    MachineBasicBlock *LastJmpMBB = nullptr;
     for (MachineInstr *JmpI : JmpIs) {
-      // Past the first jump we need to split the blocks apart.
-      if (JmpI != JmpIs.front())
+      // Past the first jump within a basic block we need to split the blocks
+      // apart.
+      if (JmpI->getParent() == LastJmpMBB)
         splitBlock(*JmpI->getParent(), *JmpI, *TII);
+      else
+        LastJmpMBB = JmpI->getParent();

       rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
     }

llvm/test/CodeGen/X86/O0-pipeline.ll  +1 −0

@@ -37,6 +37,7 @@
 ; CHECK-NEXT: X86 PIC Global Base Reg Initialization
 ; CHECK-NEXT: Expand ISel Pseudo-instructions
 ; CHECK-NEXT: Local Stack Slot Allocation
+; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: X86 EFLAGS copy lowering
 ; CHECK-NEXT: X86 WinAlloca Expander
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation

llvm/test/CodeGen/X86/copy-eflags.ll  +108 −0

@@ -196,3 +196,111 @@ else:
   tail call void @external_b()
   ret void
 }
+
+; Test a function that gets special select lowering into CFG with copied EFLAGS
+; threaded across the CFG. This requires our EFLAGS copy rewriting to handle
+; cross-block rewrites in at least some narrow cases.
+define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %ptr2) {
+; X32-LABEL: PR37100:
+; X32:       # %bb.0: # %bb
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT:    jmp .LBB3_1
+; X32-NEXT:    .p2align 4, 0x90
+; X32-NEXT:  .LBB3_5: # %bb1
+; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    idivl %ebp
+; X32-NEXT:  .LBB3_1: # %bb1
+; X32-NEXT:    # =>This Inner Loop Header: Depth=1
+; X32-NEXT:    movsbl %cl, %eax
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    sarl $31, %edx
+; X32-NEXT:    cmpl %eax, %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    sbbl %edx, %eax
+; X32-NEXT:    setl %al
+; X32-NEXT:    setl %dl
+; X32-NEXT:    movzbl %dl, %ebp
+; X32-NEXT:    negl %ebp
+; X32-NEXT:    testb $-1, %al
+; X32-NEXT:    jne .LBB3_3
+; X32-NEXT:  # %bb.2: # %bb1
+; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT:    movb %ch, %cl
+; X32-NEXT:  .LBB3_3: # %bb1
+; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT:    movb %cl, (%ebx)
+; X32-NEXT:    movl (%edi), %edx
+; X32-NEXT:    testb $-1, %al
+; X32-NEXT:    jne .LBB3_5
+; X32-NEXT:  # %bb.4: # %bb1
+; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    jmp .LBB3_5
+;
+; X64-LABEL: PR37100:
+; X64:       # %bb.0: # %bb
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    jmp .LBB3_1
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB3_5: # %bb1
+; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    idivl %esi
+; X64-NEXT:  .LBB3_1: # %bb1
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movsbq %dil, %rax
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    cmpq %rax, %r10
+; X64-NEXT:    setl %sil
+; X64-NEXT:    negl %esi
+; X64-NEXT:    cmpq %rax, %r10
+; X64-NEXT:    jl .LBB3_3
+; X64-NEXT:  # %bb.2: # %bb1
+; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    movl %ecx, %edi
+; X64-NEXT:  .LBB3_3: # %bb1
+; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    movb %dil, (%r8)
+; X64-NEXT:    jl .LBB3_5
+; X64-NEXT:  # %bb.4: # %bb1
+; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    movl (%r9), %esi
+; X64-NEXT:    jmp .LBB3_5
+bb:
+  br label %bb1
+
+bb1:
+  %tmp = phi i8 [ %tmp8, %bb1 ], [ %arg1, %bb ]
+  %tmp2 = phi i16 [ %tmp12, %bb1 ], [ %arg2, %bb ]
+  %tmp3 = icmp sgt i16 %tmp2, 7
+  %tmp4 = select i1 %tmp3, i16 %tmp2, i16 7
+  %tmp5 = sext i8 %tmp to i64
+  %tmp6 = icmp slt i64 %arg3, %tmp5
+  %tmp7 = sext i1 %tmp6 to i32
+  %tmp8 = select i1 %tmp6, i8 %tmp, i8 %arg4
+  store volatile i8 %tmp8, i8* %ptr1
+  %tmp9 = load volatile i32, i32* %ptr2
+  %tmp10 = select i1 %tmp6, i32 %tmp7, i32 %tmp9
+  %tmp11 = srem i32 0, %tmp10
+  %tmp12 = trunc i32 %tmp11 to i16
+  br label %bb1
+}
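
Note on the change above: the patch replaces the old single-block scan with a small worklist traversal of the CFG. As a reading aid, here is a minimal, self-contained C++ sketch of that traversal pattern under simplified assumptions. Block, DomOracle, and scanFlagUses are invented stand-ins for this illustration only; they are not LLVM's MachineBasicBlock, MachineDominatorTree, or the pass's real API, and the rewriting of individual flag uses is elided.

    #include <cstdio>
    #include <cstdlib>
    #include <functional>
    #include <set>
    #include <vector>

    // Toy stand-in for a machine basic block; deliberately far simpler than
    // LLVM's MachineBasicBlock.
    struct Block {
      int ID = 0;
      bool FlagsLiveIn = false;  // models SuccMBB->isLiveIn(X86::EFLAGS)
      bool KillsFlags = false;   // models finding a def/clobber of EFLAGS
      std::vector<Block *> Successors;
    };

    // Assumed dominance oracle; the real pass queries MachineDominatorTree.
    using DomOracle = std::function<bool(const Block &, const Block &)>;

    // Mirrors the do/while loop the hunk introduces: seed a worklist with the
    // copy's block, keep a visited set so loops terminate, hard-fail when a
    // use block is not dominated by the test block, and queue successors only
    // when the flags are live-in to them.
    void scanFlagUses(Block &TestBB, Block &CopyBB, const DomOracle &Dominates) {
      std::vector<Block *> Worklist{&CopyBB};
      std::set<Block *> Visited{&CopyBB};

      do {
        Block &UseBB = *Worklist.back();
        Worklist.pop_back();

        // No PHI insertion is done, so every use block must be dominated by
        // the test block; the patch makes this fatal even in release builds.
        if (&TestBB != &UseBB && !Dominates(TestBB, UseBB)) {
          std::fprintf(stderr,
                       "block %d uses flags but is not dominated by %d\n",
                       UseBB.ID, TestBB.ID);
          std::abort();  // stands in for report_fatal_error
        }

        // ... rewrite each instruction in UseBB that reads the flags here ...

        // Once an instruction fully redefines the flags, the copied value is
        // dead and the scan stops, matching the patch's break.
        if (UseBB.KillsFlags)
          break;

        // Otherwise the flags live out of this block: queue successors that
        // list them as live-in, visiting each block at most once.
        for (Block *Succ : UseBB.Successors)
          if (Succ->FlagsLiveIn && Visited.insert(Succ).second)
            Worklist.push_back(Succ);
      } while (!Worklist.empty());
    }

The visited set is what makes this terminate on a test like PR37100 above, where bb1 branches back to itself with the flags live across the backedge, and it is also why each reachable use block is rewritten exactly once.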