diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 11c7007d4d9c8..c872cf1287ae4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53502,6 +53502,26 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Attempt to convert a (vXi1 bitcast(iX Mask)) mask before it might get split
+// by legalization.
+static SDValue canonicalizeBoolMask(unsigned Opcode, EVT VT, SDValue Mask,
+                                    const SDLoc &DL, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget &Subtarget) {
+  if (!DCI.isBeforeLegalizeOps() || Mask.getOpcode() != ISD::BITCAST ||
+      Mask.getScalarValueSizeInBits() != 1 || Subtarget.hasAVX512() ||
+      !DAG.getTargetLoweringInfo().isOperationLegalOrCustom(Opcode, VT))
+    return SDValue();
+
+  EVT MaskVT = Mask.getValueType();
+  EVT ExtMaskVT = VT.changeVectorElementTypeToInteger();
+  assert(ExtMaskVT.bitsGT(MaskVT) && "Unexpected extension type");
+  if (SDValue NewMask = combineToExtendBoolVectorInReg(
+          ISD::SIGN_EXTEND, DL, ExtMaskVT, Mask, DAG, DCI, Subtarget))
+    return DAG.getNode(ISD::TRUNCATE, DL, MaskVT, NewMask);
+  return SDValue();
+}
+
 /// If V is a build vector of boolean constants and exactly one of those
 /// constants is true, return the operand index of that true element.
 /// Otherwise, return -1.
@@ -53678,12 +53698,23 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
     return Blend;
   }
 
+  EVT VT = Mld->getValueType(0);
+  SDValue Mask = Mld->getMask();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc DL(N);
+
+  // Attempt to convert a (vXi1 bitcast(iX Mask)) mask before it might get split
+  // by legalization.
+  if (SDValue NewMask =
+          canonicalizeBoolMask(ISD::MLOAD, VT, Mask, DL, DAG, DCI, Subtarget))
+    return DAG.getMaskedLoad(VT, DL, Mld->getChain(), Mld->getBasePtr(),
+                             Mld->getOffset(), NewMask, Mld->getPassThru(),
+                             Mld->getMemoryVT(), Mld->getMemOperand(),
+                             Mld->getAddressingMode(), Mld->getExtensionType());
+
   // If the mask value has been legalized to a non-boolean vector, try to
   // simplify ops leading up to it. We only demand the MSB of each lane.
-  SDValue Mask = Mld->getMask();
   if (Mask.getScalarValueSizeInBits() != 1) {
-    EVT VT = Mld->getValueType(0);
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
       if (N->getOpcode() != ISD::DELETED_NODE)
@@ -53693,8 +53724,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
     if (SDValue NewMask =
             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
       return DAG.getMaskedLoad(
-          VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
-          NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
+          VT, DL, Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(), NewMask,
+          Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
           Mld->getAddressingMode(), Mld->getExtensionType());
   }
 
@@ -53784,6 +53815,15 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
     return SDValue();
   }
 
+  // Attempt to convert a (vXi1 bitcast(iX Mask)) mask before it might get split
+  // by legalization.
+  if (SDValue NewMask =
+          canonicalizeBoolMask(ISD::MSTORE, VT, Mask, DL, DAG, DCI, Subtarget))
+    return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
+                              Mst->getBasePtr(), Mst->getOffset(), NewMask,
+                              Mst->getMemoryVT(), Mst->getMemOperand(),
+                              Mst->getAddressingMode());
+
   // If the mask value has been legalized to a non-boolean vector, try to
   // simplify ops leading up to it. We only demand the MSB of each lane.
   if (Mask.getScalarValueSizeInBits() != 1) {
@@ -57399,35 +57439,35 @@ static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
 }
 
 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
-                                    SDValue Index, SDValue Base, SDValue Scale,
-                                    SelectionDAG &DAG) {
+                                    SDValue Index, SDValue Base, SDValue Mask,
+                                    SDValue Scale, SelectionDAG &DAG) {
   SDLoc DL(GorS);
   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
-    SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
-                      Gather->getMask(), Base, Index, Scale } ;
-    return DAG.getMaskedGather(Gather->getVTList(),
-                               Gather->getMemoryVT(), DL, Ops,
-                               Gather->getMemOperand(),
+    SDValue Ops[] = {
+        Gather->getChain(), Gather->getPassThru(), Mask, Base, Index, Scale};
+    return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL,
+                               Ops, Gather->getMemOperand(),
                                Gather->getIndexType(),
                                Gather->getExtensionType());
   }
   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
-  SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
-                    Scatter->getMask(), Base, Index, Scale };
-  return DAG.getMaskedScatter(Scatter->getVTList(),
-                              Scatter->getMemoryVT(), DL,
+  SDValue Ops[] = {
+      Scatter->getChain(), Scatter->getValue(), Mask, Base, Index, Scale};
+  return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL,
                               Ops, Scatter->getMemOperand(),
                               Scatter->getIndexType(),
                               Scatter->isTruncatingStore());
 }
 
 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
-                                    TargetLowering::DAGCombinerInfo &DCI) {
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget &Subtarget) {
   SDLoc DL(N);
   auto *GorS = cast<MaskedGatherScatterSDNode>(N);
   SDValue Index = GorS->getIndex();
   SDValue Base = GorS->getBasePtr();
+  SDValue Mask = GorS->getMask();
   SDValue Scale = GorS->getScale();
   EVT IndexVT = Index.getValueType();
   EVT IndexSVT = IndexVT.getVectorElementType();
@@ -57461,7 +57501,8 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                      Index.getOperand(0), NewShAmt);
         SDValue NewScale =
             DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
-        return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
+        return rebuildGatherScatter(GorS, NewIndex, Base, Mask, NewScale,
+                                    DAG);
       }
     }
   }
@@ -57479,7 +57520,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
       // a split.
       if (SDValue TruncIndex =
              DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
-        return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
+        return rebuildGatherScatter(GorS, TruncIndex, Base, Mask, Scale, DAG);
 
       // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
      // there are sufficient sign bits. Only do this before legalize types to
@@ -57488,13 +57529,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
            Index.getOpcode() == ISD::ZERO_EXTEND) &&
           Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
-        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+        return rebuildGatherScatter(GorS, Index, Base, Mask, Scale, DAG);
       }
 
       // Shrink if we remove an illegal type.
       if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
-        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+        return rebuildGatherScatter(GorS, Index, Base, Mask, Scale, DAG);
       }
     }
   }
@@ -57519,13 +57560,15 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
           SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
                                         DAG.getConstant(Adder, DL, PtrVT));
           SDValue NewIndex = Index.getOperand(1 - I);
-          return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
+          return rebuildGatherScatter(GorS, NewIndex, NewBase, Mask, Scale,
+                                      DAG);
         }
         // For non-constant cases, limit this to non-scaled cases.
         if (ScaleAmt == 1) {
           SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
           SDValue NewIndex = Index.getOperand(1 - I);
-          return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
+          return rebuildGatherScatter(GorS, NewIndex, NewBase, Mask, Scale,
+                                      DAG);
         }
       }
     }
@@ -57540,7 +57583,8 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
           SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
                                          Index.getOperand(1 - I), Splat);
           SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
-          return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
+          return rebuildGatherScatter(GorS, NewIndex, NewBase, Mask, Scale,
+                                      DAG);
         }
       }
     }
@@ -57551,12 +57595,18 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
       MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
       IndexVT = IndexVT.changeVectorElementType(*DAG.getContext(), EltVT);
       Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
-      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+      return rebuildGatherScatter(GorS, Index, Base, Mask, Scale, DAG);
     }
+
+    // Attempt to convert a (vXi1 bitcast(iX Mask)) mask before it might get
+    // split by legalization.
+    if (SDValue NewMask =
+            canonicalizeBoolMask(GorS->getOpcode(), N->getValueType(0), Mask,
+                                 DL, DAG, DCI, Subtarget))
+      return rebuildGatherScatter(GorS, Index, Base, NewMask, Scale, DAG);
   }
 
   // With vector masks we only demand the upper bit of the mask.
-  SDValue Mask = GorS->getMask();
   if (Mask.getScalarValueSizeInBits() != 1) {
     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
@@ -61701,7 +61751,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::MGATHER:
   case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
   case ISD::MGATHER:
-  case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
+  case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
   case X86ISD::PCMPEQ:
   case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
   case X86ISD::PMULDQ:
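The IR shape this combine fires on is a scalar integer bitcast to a vXi1 mask feeding a masked memory op. As a minimal sketch, using the masked_gather_v8i32_v8i32 signature from the hunk below (the pointer, alignment and passthru operands here are illustrative, not copied from the test file):

declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)

define <8 x i32> @masked_gather_v8i32_v8i32(i8 %trigger) {
  ; the (vXi1 bitcast(iX Mask)) pattern: one trigger bit per lane
  %mask = bitcast i8 %trigger to <8 x i1>
  ; pointer/alignment/passthru operands are assumed values for illustration
  %res = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> zeroinitializer, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
  ret <8 x i32> %res
}

Without AVX512, type legalization would otherwise scalarize such a mask bit by bit (the shrb/andl/negl/vpinsrd sequences deleted below); canonicalizing it first lets combineToExtendBoolVectorInReg rebuild it as a broadcast + and + compare-with-bit sequence that survives any later vector split.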
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 962ff66b072a6..2913fe13095ca 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -312,27 +312,11 @@ define <4 x float> @masked_gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, i32 %trigger,
 ;
 ; AVX2-GATHER-LABEL: masked_gather_v4f32_ptr_v4i32:
 ; AVX2-GATHER: # %bb.0:
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vmovd %eax, %xmm2
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: shrb %al
-; AVX2-GATHER-NEXT: movzbl %al, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: shrb $2, %al
-; AVX2-GATHER-NEXT: movzbl %al, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-GATHER-NEXT: andb $8, %dil
-; AVX2-GATHER-NEXT: shrb $3, %dil
-; AVX2-GATHER-NEXT: movzbl %dil, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
+; AVX2-GATHER-NEXT: vmovd %edi, %xmm2
+; AVX2-GATHER-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX2-GATHER-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,2,4,8]
+; AVX2-GATHER-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-GATHER-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX2-GATHER-NEXT: vgatherqps %xmm2, (,%ymm0), %xmm1
 ; AVX2-GATHER-NEXT: vmovaps %xmm1, %xmm0
 ; AVX2-GATHER-NEXT: vzeroupper
@@ -2575,51 +2559,11 @@ define <8 x i32> @masked_gather_v8i32_v8i32(i8 %trigger) {
 ;
 ; AVX2-GATHER-LABEL: masked_gather_v8i32_v8i32:
 ; AVX2-GATHER: # %bb.0:
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: shrb $5, %al
-; AVX2-GATHER-NEXT: movzbl %al, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: movl %edi, %ecx
-; AVX2-GATHER-NEXT: shrb $4, %cl
-; AVX2-GATHER-NEXT: movzbl %cl, %ecx
-; AVX2-GATHER-NEXT: andl $1, %ecx
-; AVX2-GATHER-NEXT: negl %ecx
-; AVX2-GATHER-NEXT: vmovd %ecx, %xmm0
-; AVX2-GATHER-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: shrb $6, %al
-; AVX2-GATHER-NEXT: movzbl %al, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: shrb $7, %al
-; AVX2-GATHER-NEXT: movzbl %al, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vmovd %eax, %xmm1
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: shrb %al
-; AVX2-GATHER-NEXT: movzbl %al, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX2-GATHER-NEXT: movl %edi, %eax
-; AVX2-GATHER-NEXT: shrb $2, %al
-; AVX2-GATHER-NEXT: movzbl %al, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX2-GATHER-NEXT: shrb $3, %dil
-; AVX2-GATHER-NEXT: movzbl %dil, %eax
-; AVX2-GATHER-NEXT: andl $1, %eax
-; AVX2-GATHER-NEXT: negl %eax
-; AVX2-GATHER-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX2-GATHER-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-GATHER-NEXT: vmovd %edi, %xmm0
+; AVX2-GATHER-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-GATHER-NEXT: vpmovzxbd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
+; AVX2-GATHER-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-GATHER-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
 ; AVX2-GATHER-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-GATHER-NEXT: vmovdqa %ymm0, %ymm2
 ; AVX2-GATHER-NEXT: vpxor %xmm3, %xmm3, %xmm3
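masked_load.ll exercises the ISD::MLOAD path of the same combine. A minimal sketch of that pattern, using the load_v4f32_i4 signature from the hunk header (the alignment argument is an assumed value for illustration):

declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)

define <4 x float> @load_v4f32_i4(i4 %trigger, ptr %addr, <4 x float> %dst) {
  %mask = bitcast i4 %trigger to <4 x i1>
  ; alignment of 4 is assumed for illustration
  %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
  ret <4 x float> %res
}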
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 672ec4038d235..99a8918fef93f 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -113,21 +113,27 @@ define <2 x double> @load_v2f64_i2(i2 %trigger, ptr %addr, <2 x double> %dst) {
 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE-NEXT: retq
 ;
-; AVX1OR2-LABEL: load_v2f64_i2:
-; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: movl %edi, %eax
-; AVX1OR2-NEXT: andl $1, %eax
-; AVX1OR2-NEXT: negq %rax
-; AVX1OR2-NEXT: vmovq %rax, %xmm1
-; AVX1OR2-NEXT: andb $2, %dil
-; AVX1OR2-NEXT: shrb %dil
-; AVX1OR2-NEXT: movzbl %dil, %eax
-; AVX1OR2-NEXT: negq %rax
-; AVX1OR2-NEXT: vmovq %rax, %xmm2
-; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1OR2-NEXT: vmaskmovpd (%rsi), %xmm1, %xmm2
-; AVX1OR2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: load_v2f64_i2:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,2]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmaskmovpd (%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_v2f64_i2:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,2]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmaskmovpd (%rsi), %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: load_v2f64_i2:
 ; AVX512F: ## %bb.0:
@@ -281,29 +287,14 @@ define <4 x double> @load_v4f64_i4(i4 %trigger, ptr %addr, <4 x double> %dst) {
 ;
 ; AVX1-LABEL: load_v4f64_i4:
 ; AVX1: ## %bb.0:
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb $2, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: andb $8, %dil
-; AVX1-NEXT: shrb $3, %dil
-; AVX1-NEXT: movzbl %dil, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmaskmovpd (%rsi), %ymm1, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
@@ -311,30 +302,11 @@ define <4 x double> @load_v4f64_i4(i4 %trigger, ptr %addr, <4 x double> %dst) {
 ;
 ; AVX2-LABEL: load_v4f64_i4:
 ; AVX2: ## %bb.0:
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: andb $8, %al
-; AVX2-NEXT: shrb $3, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: negq %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrb $2, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negq %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negq %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: shrb %dil
-; AVX2-NEXT: movzbl %dil, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negq %rax
-; AVX2-NEXT: vmovq %rax, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,4,8]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmaskmovpd (%rsi), %ymm1, %ymm2
 ; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
@@ -1552,32 +1524,27 @@ define <4 x float> @load_v4f32_i4(i4 %trigger, ptr %addr, <4 x float> %dst) {
 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; SSE42-NEXT: retq
 ;
-; AVX1OR2-LABEL: load_v4f32_i4:
-; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: movl %edi, %eax
-; AVX1OR2-NEXT: andl $1, %eax
-; AVX1OR2-NEXT: negl %eax
-; AVX1OR2-NEXT: vmovd %eax, %xmm1
-; AVX1OR2-NEXT: movl %edi, %eax
-; AVX1OR2-NEXT: shrb %al
-; AVX1OR2-NEXT: movzbl %al, %eax
-; AVX1OR2-NEXT: andl $1, %eax
-; AVX1OR2-NEXT: negl %eax
-; AVX1OR2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: movl %edi, %eax
-; AVX1OR2-NEXT: shrb $2, %al
-; AVX1OR2-NEXT: movzbl %al, %eax
-; AVX1OR2-NEXT: andl $1, %eax
-; AVX1OR2-NEXT: negl %eax
-; AVX1OR2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: andb $8, %dil
-; AVX1OR2-NEXT: shrb $3, %dil
-; AVX1OR2-NEXT: movzbl %dil, %eax
-; AVX1OR2-NEXT: negl %eax
-; AVX1OR2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: vmaskmovps (%rsi), %xmm1, %xmm2
-; AVX1OR2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: load_v4f32_i4:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,2,4,8]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmaskmovps (%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_v4f32_i4:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,2,4,8]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmaskmovps (%rsi), %xmm1, %xmm2
+; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: load_v4f32_i4:
 ; AVX512F: ## %bb.0:
@@ -1873,101 +1840,22 @@ define <8 x float> @load_v8f32_i8(i8 %trigger, ptr %addr) {
 ;
 ; AVX1-LABEL: load_v8f32_i8:
 ; AVX1: ## %bb.0:
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb $5, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: shrb $4, %cl
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: negl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb $6, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb $7, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb $2, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shrb $3, %dil
-; AVX1-NEXT: movzbl %dil, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT: vmaskmovps (%rsi), %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: load_v8f32_i8:
 ; AVX2: ## %bb.0:
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrb $5, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: shrb $4, %cl
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: negl %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrb $6, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrb $7, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrb %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrb $2, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shrb $3, %dil
-; AVX2-NEXT: movzbl %dil, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmaskmovps (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
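masked_store.ll covers the ISD::MSTORE side; the pattern mirrors the load case. A minimal sketch using the store_v2f64_i2 signature from the hunk header (the alignment argument is an assumed value for illustration):

declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)

define void @store_v2f64_i2(i2 %trigger, ptr %addr, <2 x double> %val) nounwind {
  %mask = bitcast i2 %trigger to <2 x i1>
  ; alignment of 8 is assumed for illustration
  call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %addr, i32 8, <2 x i1> %mask)
  ret void
}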
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 23483c8c0531c..5b6001c9a3605 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -98,20 +98,25 @@ define void @store_v2f64_i2(i2 %trigger, ptr %addr, <2 x double> %val) nounwind
 ; SSE-NEXT: movhps %xmm0, 8(%rsi)
 ; SSE-NEXT: retq
 ;
-; AVX1OR2-LABEL: store_v2f64_i2:
-; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: movl %edi, %eax
-; AVX1OR2-NEXT: andl $1, %eax
-; AVX1OR2-NEXT: negq %rax
-; AVX1OR2-NEXT: vmovq %rax, %xmm1
-; AVX1OR2-NEXT: andb $2, %dil
-; AVX1OR2-NEXT: shrb %dil
-; AVX1OR2-NEXT: movzbl %dil, %eax
-; AVX1OR2-NEXT: negq %rax
-; AVX1OR2-NEXT: vmovq %rax, %xmm2
-; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1OR2-NEXT: vmaskmovpd %xmm0, %xmm1, (%rsi)
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: store_v2f64_i2:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,2]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmaskmovpd %xmm0, %xmm1, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_v2f64_i2:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,2]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmaskmovpd %xmm0, %xmm1, (%rsi)
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: store_v2f64_i2:
 ; AVX512F: ## %bb.0:
@@ -240,29 +245,14 @@ define void @store_v4f64_i4(i4 %trigger, ptr %addr, <4 x double> %val) nounwind
 ;
 ; AVX1-LABEL: store_v4f64_i4:
 ; AVX1: ## %bb.0:
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrb $2, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: andb $8, %dil
-; AVX1-NEXT: shrb $3, %dil
-; AVX1-NEXT: movzbl %dil, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmaskmovpd %ymm0, %ymm1, (%rsi)
 ; AVX1-NEXT: vzeroupper
@@ -270,30 +260,11 @@ define void @store_v4f64_i4(i4 %trigger, ptr %addr, <4 x double> %val) nounwind
 ;
 ; AVX2-LABEL: store_v4f64_i4:
 ; AVX2: ## %bb.0:
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: andb $8, %al
-; AVX2-NEXT: shrb $3, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: negq %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrb $2, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negq %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negq %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: shrb %dil
-; AVX2-NEXT: movzbl %dil, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negq %rax
-; AVX2-NEXT: vmovq %rax, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,4,8]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmaskmovpd %ymm0, %ymm1, (%rsi)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -698,31 +669,25 @@ define void @store_v4f32_i4(<4 x float> %x, ptr %ptr, <4 x float> %y, i4 %trigge
 ; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi)
 ; SSE4-NEXT: retq
 ;
-; AVX1OR2-LABEL: store_v4f32_i4:
-; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: movl %esi, %eax
-; AVX1OR2-NEXT: andl $1, %eax
-; AVX1OR2-NEXT: negl %eax
-; AVX1OR2-NEXT: vmovd %eax, %xmm1
-; AVX1OR2-NEXT: movl %esi, %eax
-; AVX1OR2-NEXT: shrb %al
-; AVX1OR2-NEXT: movzbl %al, %eax
-; AVX1OR2-NEXT: andl $1, %eax
-; AVX1OR2-NEXT: negl %eax
-; AVX1OR2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: movl %esi, %eax
-; AVX1OR2-NEXT: shrb $2, %al
-; AVX1OR2-NEXT: movzbl %al, %eax
-; AVX1OR2-NEXT: andl $1, %eax
-; AVX1OR2-NEXT: negl %eax
-; AVX1OR2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: andb $8, %sil
-; AVX1OR2-NEXT: shrb $3, %sil
-; AVX1OR2-NEXT: movzbl %sil, %eax
-; AVX1OR2-NEXT: negl %eax
-; AVX1OR2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: store_v4f32_i4:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovd %esi, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,2,4,8]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_v4f32_i4:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovd %esi, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,2,4,8]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: store_v4f32_i4:
 ; AVX512F: ## %bb.0:
@@ -999,102 +964,23 @@ define void @store_v8f32_i8(<8 x float> %x, ptr %ptr, <8 x float> %y, i8 %trigge
 ;
 ; AVX1-LABEL: store_v8f32_i8:
 ; AVX1: ## %bb.0:
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: shrb $5, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: movl %esi, %ecx
-; AVX1-NEXT: shrb $4, %cl
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: negl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: shrb $6, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: shrb $7, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: shrb %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: shrb $2, %al
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: shrb $3, %sil
-; AVX1-NEXT: movzbl %sil, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovd %esi, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: store_v8f32_i8:
 ; AVX2: ## %bb.0:
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: shrb $5, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: shrb $4, %cl
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: negl %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: shrb $6, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: shrb $7, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: shrb %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: shrb $2, %al
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: shrb $3, %sil
-; AVX2-NEXT: movzbl %sil, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovd %esi, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq