From 878c6281d32807b4a711251611a21f00ea4b09d4 Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Tue, 5 May 2015 19:27:45 +0000 Subject: [PATCH] [SystemZ] Add CodeGen support for v4f32 The architecture doesn't really have any native v4f32 operations except v4f32->v2f64 and v2f64->v4f32 conversions, with only half of the v4f32 elements being used. Even so, using vector registers for <4 x float> and scalarising individual operations is much better than generating completely scalar code, since there's much less register pressure. It's also more efficient to do v4f32 comparisons by extending to 2 v2f64s, comparing those, then packing the result. This particularly helps with llvmpipe. Based on a patch by Richard Sandiford. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236523 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../MCTargetDesc/SystemZMCTargetDesc.h | 5 + lib/Target/SystemZ/SystemZAsmPrinter.cpp | 15 + lib/Target/SystemZ/SystemZCallingConv.td | 6 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 146 +++++- lib/Target/SystemZ/SystemZISelLowering.h | 8 + lib/Target/SystemZ/SystemZInstrFormats.td | 3 + lib/Target/SystemZ/SystemZInstrVector.td | 49 +- lib/Target/SystemZ/SystemZOperators.td | 19 + test/CodeGen/SystemZ/fp-move-09.ll | 2 +- test/CodeGen/SystemZ/fp-move-10.ll | 61 +++ test/CodeGen/SystemZ/vec-cmp-05.ll | 472 ++++++++++++++++++ test/CodeGen/SystemZ/vec-const-05.ll | 47 ++ test/CodeGen/SystemZ/vec-const-11.ll | 189 +++++++ test/CodeGen/SystemZ/vec-const-17.ll | 95 ++++ test/CodeGen/SystemZ/vec-conv-02.ll | 13 + test/CodeGen/SystemZ/vec-move-01.ll | 8 + test/CodeGen/SystemZ/vec-move-02.ll | 9 + test/CodeGen/SystemZ/vec-move-03.ll | 9 + test/CodeGen/SystemZ/vec-move-04.ll | 30 ++ test/CodeGen/SystemZ/vec-move-05.ll | 53 ++ test/CodeGen/SystemZ/vec-move-07.ll | 11 +- test/CodeGen/SystemZ/vec-move-08.ll | 81 +++ test/CodeGen/SystemZ/vec-move-09.ll | 27 + test/CodeGen/SystemZ/vec-move-10.ll | 92 ++++ test/CodeGen/SystemZ/vec-move-11.ll | 9 + test/CodeGen/SystemZ/vec-move-12.ll | 10 + test/CodeGen/SystemZ/vec-move-13.ll | 12 + test/CodeGen/SystemZ/vec-move-14.ll | 10 + test/CodeGen/SystemZ/vec-perm-01.ll | 31 ++ test/CodeGen/SystemZ/vec-perm-02.ll | 34 ++ test/CodeGen/SystemZ/vec-perm-03.ll | 38 ++ test/CodeGen/SystemZ/vec-perm-04.ll | 20 + test/CodeGen/SystemZ/vec-perm-05.ll | 20 + test/CodeGen/SystemZ/vec-perm-06.ll | 20 + test/CodeGen/SystemZ/vec-perm-07.ll | 20 + test/CodeGen/SystemZ/vec-perm-08.ll | 20 + test/CodeGen/SystemZ/vec-sub-01.ll | 27 + 37 files changed, 1699 insertions(+), 22 deletions(-) create mode 100644 test/CodeGen/SystemZ/fp-move-10.ll create mode 100644 test/CodeGen/SystemZ/vec-cmp-05.ll create mode 100644 test/CodeGen/SystemZ/vec-const-05.ll create mode 100644 test/CodeGen/SystemZ/vec-const-11.ll create mode 100644 test/CodeGen/SystemZ/vec-const-17.ll create mode 100644 test/CodeGen/SystemZ/vec-conv-02.ll diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index 4c0661608be..36ea750ec8d 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -71,6 +71,11 @@ inline unsigned getRegAsGR32(unsigned Reg) { inline unsigned getRegAsGRH32(unsigned Reg) { return GRH32Regs[getFirstReg(Reg)]; } + +// Return the given register as a VR128. 
+inline unsigned getRegAsVR128(unsigned Reg) {
+  return VR128Regs[getFirstReg(Reg)];
+}
 } // end namespace SystemZMC
 
 MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 5f46e6a6313..026a75f2140 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -158,6 +158,21 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
     break;
 
+  case SystemZ::LFER:
+    LoweredMI = MCInstBuilder(SystemZ::VLGVF)
+      .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()))
+      .addReg(0).addImm(0);
+    break;
+
+  case SystemZ::LEFR:
+    LoweredMI = MCInstBuilder(SystemZ::VLVGF)
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(MI->getOperand(1).getReg())
+      .addReg(0).addImm(0);
+    break;
+
 #define LOWER_LOW(NAME)                                                 \
   case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
 
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index 360d348af3a..a2f996e60df 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -44,7 +44,7 @@ def RetCC_SystemZ : CallingConv<[
 
   // Similarly for vectors, with V24 being the ABI-compliant choice.
   CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
      CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
 
  // ABI-compliant code returns long double by reference, but that conversion
@@ -76,13 +76,13 @@ def CC_SystemZ : CallingConv<[
 
   // The first 8 named vector arguments are passed in V24-V31.
   CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
      CCIfFixed<CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>>,
 
   // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
   CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
      CCAssignToStack<16, 8>>>,
 
   // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 5f547439c9a..391cb8c6fc9 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -101,6 +101,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
     addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
     addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
     addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
     addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
   }
 
@@ -275,7 +276,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
     if (isTypeLegal(VT)) {
       // These operations are legal for anything that can be stored in a
       // vector register, even if there is no native support for the format
-      // as such.
+      // as such.  In particular, we can do these for v4f32 even though there
+      // are no specific instructions for that format.
setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); setOperationAction(ISD::VSELECT, VT, Legal); @@ -365,11 +367,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, // Handle floating-point vector types. if (Subtarget.hasVector()) { // Scalar-to-vector conversion is just a subreg. + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); // Some insertions and extractions can be done directly but others // need to go via integers. + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); // These operations have direct equivalents. @@ -407,8 +412,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, // We have 64-bit FPR<->GPR moves, but need special handling for // 32-bit forms. - setOperationAction(ISD::BITCAST, MVT::i32, Custom); - setOperationAction(ISD::BITCAST, MVT::f32, Custom); + if (!Subtarget.hasVector()) { + setOperationAction(ISD::BITCAST, MVT::i32, Custom); + setOperationAction(ISD::BITCAST, MVT::f32, Custom); + } // VASTART and VACOPY need to deal with the SystemZ-specific varargs // structure, but VAEND is a no-op. @@ -420,6 +427,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::FP_ROUND); // Handle intrinsics. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -855,6 +863,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: RC = &SystemZ::VR128BitRegClass; break; @@ -1977,6 +1986,33 @@ static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP, return 0; } +// Return a v2f64 that contains the extended form of elements Start and Start+1 +// of v4f32 value Op. +static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, SDLoc DL, + SDValue Op) { + int Mask[] = { Start, -1, Start + 1, -1 }; + Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask); + return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op); +} + +// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode, +// producing a result of type VT. +static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, SDLoc DL, + EVT VT, SDValue CmpOp0, SDValue CmpOp1) { + // There is no hardware support for v4f32, so extend the vector into + // two v2f64s and compare those. + if (CmpOp0.getValueType() == MVT::v4f32) { + SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0); + SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0); + SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1); + SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1); + SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1); + SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1); + return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes); + } + return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1); +} + // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing // an integer mask of type VT. 
 static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
@@ -1991,8 +2027,8 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
     Invert = true;
   case ISD::SETO: {
     assert(IsFP && "Unexpected integer comparison");
-    SDValue LT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
-    SDValue GE = DAG.getNode(SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
     Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
     break;
   }
@@ -2002,8 +2038,8 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
     Invert = true;
   case ISD::SETONE: {
     assert(IsFP && "Unexpected integer comparison");
-    SDValue LT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
-    SDValue GT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
     Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
     break;
   }
@@ -2013,11 +2049,11 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
   // there are no cases where both work.
   default:
     if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
-      Cmp = DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+      Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
     else {
       CC = ISD::getSetCCSwappedOperands(CC);
       if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
-        Cmp = DAG.getNode(Opcode, DL, VT, CmpOp1, CmpOp0);
+        Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
       else
         llvm_unreachable("Unhandled comparison");
     }
@@ -3621,6 +3657,31 @@ static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
   if (VT == MVT::v2f64)
     return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
 
+  // Build v4f32 values directly from the FPRs:
+  //
+  //   <ABxx> <CDxx>
+  //     V      V         VMRHF
+  //      <ABCD>
+  //        V             VMRHG
+  //      <ABCD>
+  if (VT == MVT::v4f32) {
+    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
+    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
+    // Avoid unnecessary undefs by reusing the other operand.
+    if (Op01.getOpcode() == ISD::UNDEF)
+      Op01 = Op23;
+    else if (Op23.getOpcode() == ISD::UNDEF)
+      Op23 = Op01;
+    // Merging identical replications is a no-op.
+    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+      return Op01;
+    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
+    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
+    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
+                             DL, MVT::v2i64, Op01, Op23);
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+
   // Collect the constant terms.
   SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
   SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
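For illustration only (the following function is invented, not one of the tests added by this patch), the new v4f32 buildVector path above lets a four-scalar build be selected as two VMRHFs followed by a single VMRHG, with no stack traffic:

; Hypothetical input, assuming llc -mtriple=s390x-linux-gnu -mcpu=z13.
define <4 x float> @build(float %a, float %b, float %c, float %d) {
  ; %a..%d play the role of Elems[0..3] in buildVector above.
  %v0 = insertelement <4 x float> undef, float %a, i32 0
  %v1 = insertelement <4 x float> %v0, float %b, i32 1
  %v2 = insertelement <4 x float> %v1, float %c, i32 2
  %v3 = insertelement <4 x float> %v2, float %d, i32 3
  ret <4 x float> %v3
}
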
@@ -3796,10 +3857,11 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
   SDValue Op2 = Op.getOperand(2);
   EVT VT = Op.getValueType();
 
-  // Insertions into constant indices can be done using VPDI.  However,
-  // if the inserted value is a bitcast or a constant then it's better
-  // to use GPRs, as below.
-  if (Op1.getOpcode() != ISD::BITCAST &&
+  // Insertions into constant indices of a v2f64 can be done using VPDI.
+  // However, if the inserted value is a bitcast or a constant then it's
+  // better to use GPRs, as below.
+  if (VT == MVT::v2f64 &&
+      Op1.getOpcode() != ISD::BITCAST &&
       Op1.getOpcode() != ISD::ConstantFP &&
       Op2.getOpcode() == ISD::Constant) {
     uint64_t Index = dyn_cast<ConstantSDNode>(Op2)->getZExtValue();
@@ -4065,6 +4127,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(VFCMPE);
     OPCODE(VFCMPH);
     OPCODE(VFCMPHE);
+    OPCODE(VEXTEND);
+    OPCODE(VROUND);
     OPCODE(ATOMIC_SWAPW);
     OPCODE(ATOMIC_LOADW_ADD);
     OPCODE(ATOMIC_LOADW_SUB);
@@ -4265,6 +4329,19 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
   }
+  // (z_merge_high 0, 0) -> 0.  This is mostly useful for using VLLEZF
+  // for v4f32.
+  if (Opcode == SystemZISD::MERGE_HIGH) {
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    if (Op0 == Op1) {
+      if (Op0.getOpcode() == ISD::BITCAST)
+        Op0 = Op0.getOperand(0);
+      if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
+          cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0)
+        return Op1;
+    }
+  }
   // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
   // for the extraction to be done on a vMiN value, so that we can use VSTE.
   // If X has wider elements then convert it to:
@@ -4299,6 +4376,49 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
       N->getOperand(0) == N->getOperand(1))
     return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N),
                        N->getValueType(0), N->getOperand(0));
+  // (fround (extract_vector_elt X 0))
+  // (fround (extract_vector_elt X 1)) ->
+  // (extract_vector_elt (VROUND X) 0)
+  // (extract_vector_elt (VROUND X) 1)
+  //
+  // This is a special case since the target doesn't really support v2f32s.
+  if (Opcode == ISD::FP_ROUND) {
+    SDValue Op0 = N->getOperand(0);
+    if (N->getValueType(0) == MVT::f32 &&
+        Op0.hasOneUse() &&
+        Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op0.getOperand(0).getValueType() == MVT::v2f64 &&
+        Op0.getOperand(1).getOpcode() == ISD::Constant &&
+        cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+      SDValue Vec = Op0.getOperand(0);
+      for (auto *U : Vec->uses()) {
+        if (U != Op0.getNode() &&
+            U->hasOneUse() &&
+            U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+            U->getOperand(0) == Vec &&
+            U->getOperand(1).getOpcode() == ISD::Constant &&
+            cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
+          SDValue OtherRound = SDValue(*U->use_begin(), 0);
+          if (OtherRound.getOpcode() == ISD::FP_ROUND &&
+              OtherRound.getOperand(0) == SDValue(U, 0) &&
+              OtherRound.getValueType() == MVT::f32) {
+            SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
+                                         MVT::v4f32, Vec);
+            DCI.AddToWorklist(VRound.getNode());
+            SDValue Extract1 =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
+                          VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
+            DCI.AddToWorklist(Extract1.getNode());
+            DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
+            SDValue Extract0 =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
+                          VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
+            return Extract0;
+          }
+        }
+      }
+    }
+  }
   return SDValue();
 }
 
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 8319c01fc5e..24a3f4bb5d4 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -226,6 +226,14 @@ enum {
   VFCMPH,
   VFCMPHE,
 
+  // Extend the even f32 elements of vector operand 0 to produce a vector
+  // of f64 elements.
+  VEXTEND,
+
+  // Round the f64 elements of vector operand 0 to f32s and store them in the
+  // even elements of the result.
+  VROUND,
+
   // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
   // ATOMIC_LOAD_<op>.
   //
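To make VEXTEND concrete (a hedged sketch; the function name is invented and the exact instruction schedule may differ), a v4f32 comparison like the following is expected to be lowered by getVectorCmp above into two VLDEB-widened v2f64 compares whose v2i64 masks are packed back together with VPKG:

; Hypothetical input, assuming llc -mtriple=s390x-linux-gnu -mcpu=z13.
define <4 x i32> @cmp_ogt(<4 x float> %val1, <4 x float> %val2) {
  ; Lowered via expandV4F32ToV2F64 (VMRHF/VMRLF + VLDEB), then VFCHDB and
  ; VPKG, rather than four scalar compares.
  %cmp = fcmp ogt <4 x float> %val1, %val2
  %ret = sext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %ret
}

The vec-cmp-05.ll tests added below check exactly this shape for each predicate.
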
 //
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index d7bfc12b938..dc9dfa801fd 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2398,6 +2398,9 @@ class Alias<int size, dag outs, dag ins, list<dag> pattern>
   let isCodeGenOnly = 1;
 }
 
+class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
+  : Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>;
+
 // An alias of a BinaryRI, but with different register sizes.
 class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index 546974aa5d8..b6c8042b3c8 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -118,6 +118,8 @@ let Predicates = [FeatureVector] in {
   def VLREPH : UnaryVRX<"vlreph", 0xE705, z_replicate_loadi16, v128h, 2, 1>;
   def VLREPF : UnaryVRX<"vlrepf", 0xE705, z_replicate_loadi32, v128f, 4, 2>;
   def VLREPG : UnaryVRX<"vlrepg", 0xE705, z_replicate_loadi64, v128g, 8, 3>;
+  def : Pat<(v4f32 (z_replicate_loadf32 bdxaddr12only:$addr)),
+            (VLREPF bdxaddr12only:$addr)>;
   def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)),
             (VLREPG bdxaddr12only:$addr)>;
 
@@ -126,6 +128,8 @@ let Predicates = [FeatureVector] in {
   def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>;
   def VLLEZF : UnaryVRX<"vllezf", 0xE704, z_vllezi32, v128f, 4, 2>;
   def VLLEZG : UnaryVRX<"vllezg", 0xE704, z_vllezi64, v128g, 8, 3>;
+  def : Pat<(v4f32 (z_vllezf32 bdxaddr12only:$addr)),
+            (VLLEZF bdxaddr12only:$addr)>;
   def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)),
             (VLLEZG bdxaddr12only:$addr)>;
 
@@ -134,6 +138,8 @@ let Predicates = [FeatureVector] in {
   def VLEH : TernaryVRX<"vleh", 0xE701, z_vlei16, v128h, v128h, 2, imm32zx3>;
   def VLEF : TernaryVRX<"vlef", 0xE703, z_vlei32, v128f, v128f, 4, imm32zx2>;
   def VLEG : TernaryVRX<"vleg", 0xE702, z_vlei64, v128g, v128g, 8, imm32zx1>;
+  def : Pat<(z_vlef32 (v4f32 VR128:$val), bdxaddr12only:$addr, imm32zx2:$index),
+            (VLEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>;
   def : Pat<(z_vlef64 (v2f64 VR128:$val), bdxaddr12only:$addr, imm32zx1:$index),
             (VLEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
 
@@ -158,6 +164,7 @@ defm : ReplicatePeephole<VLREPB, v16i8, anyextloadi8, i32>;
 defm : ReplicatePeephole<VLREPH, v8i16, anyextloadi16, i32>;
 defm : ReplicatePeephole<VLREPF, v4i32, load, i32>;
 defm : ReplicatePeephole<VLREPG, v2i64, load, i64>;
+defm : ReplicatePeephole<VLREPF, v4f32, load, f32>;
 defm : ReplicatePeephole<VLREPG, v2f64, load, f64>;
 
 //===----------------------------------------------------------------------===//
@@ -179,6 +186,9 @@ let Predicates = [FeatureVector] in {
   def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, z_vstei16, v128h, 2, imm32zx3>;
   def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, z_vstei32, v128f, 4, imm32zx2>;
   def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, z_vstei64, v128g, 8, imm32zx1>;
+  def : Pat<(z_vstef32 (v4f32 VR128:$val), bdxaddr12only:$addr,
+                       imm32zx2:$index),
+            (VSTEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>;
   def : Pat<(z_vstef64 (v2f64 VR128:$val), bdxaddr12only:$addr,
                        imm32zx1:$index),
             (VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
@@ -198,6 +208,7 @@ let Predicates = [FeatureVector] in {
   def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>;
   def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>;
   def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
+  def : BinaryRRWithType<VMRHF, VR128, z_merge_high, v4f32>;
   def : BinaryRRWithType<VMRHG, VR128, z_merge_high, v2f64>;
 
   // Merge low.
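As a usage sketch (an invented example, not taken from the patch's test suite), the BinaryRRWithType pattern just added lets an interleaving v4f32 shuffle select directly to VMRHF:

; Hypothetical input, assuming llc -mtriple=s390x-linux-gnu -mcpu=z13.
define <4 x float> @merge_high(<4 x float> %a, <4 x float> %b) {
  ; <a0, b0, a1, b1> is exactly the merge-high word pattern.
  %ret = shufflevector <4 x float> %a, <4 x float> %b,
                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %ret
}
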
@@ -205,6 +216,7 @@ let Predicates = [FeatureVector] in {
   def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>;
   def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>;
   def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>;
+  def : BinaryRRWithType<VMRLF, VR128, z_merge_low, v4f32>;
   def : BinaryRRWithType<VMRLG, VR128, z_merge_low, v2f64>;
 
   // Permute.
@@ -218,6 +230,8 @@ let Predicates = [FeatureVector] in {
   def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>;
   def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>;
   def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>;
+  def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16:$index)),
+            (VREPF VR128:$vec, imm32zx16:$index)>;
   def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16:$index)),
             (VREPG VR128:$vec, imm32zx16:$index)>;
 
@@ -301,6 +315,7 @@ defm : GenericVectorOps<v16i8, v16i8>;
 defm : GenericVectorOps<v8i16, v8i16>;
 defm : GenericVectorOps<v4i32, v4i32>;
 defm : GenericVectorOps<v2i64, v2i64>;
+defm : GenericVectorOps<v4f32, v4i32>;
 defm : GenericVectorOps<v2f64, v2i64>;
 
 //===----------------------------------------------------------------------===//
@@ -797,12 +812,13 @@ let Predicates = [FeatureVector] in {
   defm : VectorRounding<VFIDB, v128db>;
 
   // Load lengthened.
-  def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, null_frag, v128db, v128eb, 2, 0>;
+  def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
   def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, null_frag, v64db, v32eb, 2, 8>;
 
   // Load rounded.
   def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
   def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
+  def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
 
   // Multiply.
   def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
@@ -882,27 +898,38 @@ let Predicates = [FeatureVector] in {
 def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
 
 def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
 
 def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
 
 def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
 
+def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+
 def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
 
 //===----------------------------------------------------------------------===//
 // Replicating scalars
 //===----------------------------------------------------------------------===//
@@ -926,6 +953,14 @@ def : Pat<(v2i64 (z_replicate GR64:$scalar)),
 // Floating-point insertion and extraction
 //===----------------------------------------------------------------------===//
 
+// Moving 32-bit values between GPRs and FPRs can be done using VLVGF
+// and VLGVF.
+def LEFR : UnaryAliasVRS<VR32, GR32>;
+def LFER : UnaryAliasVRS<GR64, VR32>;
+def : Pat<(f32 (bitconvert (i32 GR32:$src))), (LEFR GR32:$src)>;
+def : Pat<(i32 (bitconvert (f32 VR32:$src))),
+          (EXTRACT_SUBREG (LFER VR32:$src), subreg_l32)>;
+
 // Floating-point values are stored in element 0 of the corresponding
 // vector register.  Scalar to vector conversion is just a subreg and
 // scalar replication can just replicate element 0 of the vector register.
@@ -937,6 +972,7 @@ multiclass ScalarToVectorFP<Instruction vrep, ValueType vt, RegisterOperand cls,
                                  subreg), 0)>;
 }
+defm : ScalarToVectorFP<VREPF, v4f32, FP32, subreg_r32>;
 defm : ScalarToVectorFP<VREPG, v2f64, FP64, subreg_r64>;
 
 // Match v2f64 insertions.  The AddedComplexity counters the 3 added by
@@ -951,11 +987,16 @@ let AddedComplexity = 4 in {
                                          subreg_r64), 0)>;
 }
 
-// We extract f64 element X by replicating (for elements other than 0)
-// and then taking a high subreg.  The AddedComplexity counters the 3
-// added by TableGen for the base register operand in VLGV-based integer
+// We extract floating-point element X by replicating (for elements other
+// than 0) and then taking a high subreg.  The AddedComplexity counters the
+// 3 added by TableGen for the base register operand in VLGV-based integer
 // extractions and ensures that this version is strictly better.
 let AddedComplexity = 4 in {
+  def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)),
+            (EXTRACT_SUBREG VR128:$vec, subreg_r32)>;
+  def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)),
+            (EXTRACT_SUBREG (VREPF VR128:$vec, imm32zx2:$index), subreg_r32)>;
+
   def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), 0)),
             (EXTRACT_SUBREG VR128:$vec, subreg_r64)>;
   def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), imm32zx1:$index)),
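The LEFR/LFER aliases above target code like the following (a hypothetical example; with them an i32 to f32 bitcast becomes a single VLVGF, and the reverse a single VLGVF, rather than a store/load pair):

; Hypothetical input, assuming llc -mtriple=s390x-linux-gnu -mcpu=z13.
define float @int_to_float(i32 %val) {
  ; Expected to select to LEFR, i.e. vlvgf %v0, %r2, 0.
  %res = bitcast i32 %val to float
  ret float %res
}

The fp-move-10.ll test below covers this path together with the surrounding memory operations.
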
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 7cf7d862ffe..63c217413ac 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -91,6 +91,9 @@ def SDT_ZExtractVectorElt : SDTypeProfile<1, 2,
                                             SDTCisVT<2, i32>]>;
 def SDT_ZReplicate          : SDTypeProfile<1, 1,
                                             [SDTCisVec<0>]>;
+def SDT_ZVecUnaryConv       : SDTypeProfile<1, 1,
+                                            [SDTCisVec<0>,
+                                             SDTCisVec<1>]>;
 def SDT_ZVecBinary          : SDTypeProfile<1, 2,
                                             [SDTCisVec<0>,
                                              SDTCisSameAs<0, 1>,
@@ -203,6 +206,8 @@ def z_vicmphl           : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>;
 def z_vfcmpe            : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>;
 def z_vfcmph            : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>;
 def z_vfcmphe           : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>;
+def z_vextend           : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>;
+def z_vround            : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>;
 
 class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
   : SDNode<"SystemZISD::"##name, profile,
@@ -508,6 +513,7 @@ def z_replicate_loadi8  : z_replicate_load<i32, anyextloadi8>;
 def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>;
 def z_replicate_loadi32 : z_replicate_load<i32, load>;
 def z_replicate_loadi64 : z_replicate_load<i64, load>;
+def z_replicate_loadf32 : z_replicate_load<f32, load>;
 def z_replicate_loadf64 : z_replicate_load<f64, load>;
 
 // Load a scalar and insert it into a single element of a vector.
@@ -519,6 +525,7 @@ def z_vlei8  : z_vle<i32, anyextloadi8>;
 def z_vlei16 : z_vle<i32, anyextloadi16>;
 def z_vlei32 : z_vle<i32, load>;
 def z_vlei64 : z_vle<i64, load>;
+def z_vlef32 : z_vle<f32, load>;
 def z_vlef64 : z_vle<f64, load>;
 
 // Load a scalar and insert it into the low element of the high i64 of a
@@ -532,6 +539,17 @@ def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
 def z_vllezi32 : z_vllez<i32, load, 1>;
 def z_vllezi64 : PatFrag<(ops node:$addr),
                          (z_join_dwords (i64 (load node:$addr)), (i64 0))>;
+// We use high merges to form a v4f32 from four f32s.  Propagating zero
+// into all elements but index 1 gives this expression.
+def z_vllezf32 : PatFrag<(ops node:$addr),
+                         (bitconvert
+                          (z_merge_high
+                           (v2i64 (bitconvert
+                                   (z_merge_high
+                                    (v4f32 (z_vzero)),
+                                    (v4f32 (scalar_to_vector
+                                            (f32 (load node:$addr))))))),
+                           (v2i64 (z_vzero))))>;
 def z_vllezf64 : PatFrag<(ops node:$addr),
                          (z_merge_high
                           (scalar_to_vector (f64 (load node:$addr))),
@@ -546,6 +564,7 @@ def z_vstei8  : z_vste<i32, truncstorei8>;
 def z_vstei16 : z_vste<i32, truncstorei16>;
 def z_vstei32 : z_vste<i32, store>;
 def z_vstei64 : z_vste<i64, store>;
+def z_vstef32 : z_vste<f32, store>;
 def z_vstef64 : z_vste<f64, store>;
 
 // Arithmetic negation on vectors.
diff --git a/test/CodeGen/SystemZ/fp-move-09.ll b/test/CodeGen/SystemZ/fp-move-09.ll
index e4a3e9222d5..5e8dce272c2 100644
--- a/test/CodeGen/SystemZ/fp-move-09.ll
+++ b/test/CodeGen/SystemZ/fp-move-09.ll
@@ -1,4 +1,4 @@
-; Test moves between FPRs and GPRs for z196 and above.
+; Test moves between FPRs and GPRs for z196 and zEC12.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
 
diff --git a/test/CodeGen/SystemZ/fp-move-10.ll b/test/CodeGen/SystemZ/fp-move-10.ll
new file mode 100644
index 00000000000..602397d58a8
--- /dev/null
+++ b/test/CodeGen/SystemZ/fp-move-10.ll
@@ -0,0 +1,61 @@
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Check that moves from i32s to floats use a low GR32 and vector operation. +define float @f1(i16 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: llh [[REG:%r[0-5]]], 0(%r2) +; CHECK: oilh [[REG]], 16256 +; CHECK: vlvgf %v0, [[REG]], 0 +; CHECK: br %r14 + %base = load i16, i16 *%ptr + %ext = zext i16 %base to i32 + %full = or i32 %ext, 1065353216 + %res = bitcast i32 %full to float + ret float %res +} + +; Check that moves from floats to i32s use a low GR32 and vector operation. +define void @f2(float %val, i8 *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0 +; CHECK: stc [[REG]], 0(%r2) +; CHECK: br %r14 + %res = bitcast float %val to i32 + %trunc = trunc i32 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; Like f2, but with a conditional store. +define void @f3(float %val, i8 *%ptr, i32 %which) { +; CHECK-LABEL: f3: +; CHECK-DAG: cijlh %r3, 0, +; CHECK-DAG: vlgvf [[REG:%r[0-5]]], %v0, 0 +; CHECK: stc [[REG]], 0(%r2) +; CHECK: br %r14 + %int = bitcast float %val to i32 + %trunc = trunc i32 %int to i8 + %old = load i8, i8 *%ptr + %cmp = icmp eq i32 %which, 0 + %res = select i1 %cmp, i8 %trunc, i8 %old + store i8 %res, i8 *%ptr + ret void +} + +; ...and again with 16-bit memory. +define void @f4(float %val, i16 *%ptr, i32 %which) { +; CHECK-LABEL: f4: +; CHECK-DAG: cijlh %r3, 0, +; CHECK-DAG: vlgvf [[REG:%r[0-5]]], %v0, 0 +; CHECK: sth [[REG]], 0(%r2) +; CHECK: br %r14 + %int = bitcast float %val to i32 + %trunc = trunc i32 %int to i16 + %old = load i16, i16 *%ptr + %cmp = icmp eq i32 %which, 0 + %res = select i1 %cmp, i16 %trunc, i16 %old + store i16 %res, i16 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/vec-cmp-05.ll b/test/CodeGen/SystemZ/vec-cmp-05.ll new file mode 100644 index 00000000000..74e99096097 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-cmp-05.ll @@ -0,0 +1,472 @@ +; Test v4f32 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test oeq. +define <4 x i32> @f1(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfcedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfcedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp oeq <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test one. 
+define <4 x i32> @f2(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchdb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfchdb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] +; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]] +; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]] +; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]] +; CHECK: vo %v24, [[RES1]], [[RES0]] +; CHECK-NEXT: br %r14 + %cmp = fcmp one <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ogt. +define <4 x i32> @f3(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ogt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test oge. +define <4 x i32> @f4(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f4: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp oge <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ole. +define <4 x i32> @f5(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f5: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] +; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]] +; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ole <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test olt. 
+define <4 x i32> @f6(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f6: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] +; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]] +; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp olt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ueq. +define <4 x i32> @f7(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f7: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchdb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfchdb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] +; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]] +; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]] +; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]] +; CHECK: vno %v24, [[RES1]], [[RES0]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ueq <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test une. +define <4 x i32> @f8(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f8: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfcedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfcedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: vno %v24, [[RES]], [[RES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp une <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ugt. 
+define <4 x i32> @f9(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f9: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] +; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]] +; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: vno %v24, [[RES]], [[RES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ugt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test uge. +define <4 x i32> @f10(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f10: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] +; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]] +; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: vno %v24, [[RES]], [[RES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uge <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ule. +define <4 x i32> @f11(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f11: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: vno %v24, [[RES]], [[RES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ule <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ult. +define <4 x i32> @f12(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f12: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] +; CHECK-NEXT: vno %v24, [[RES]], [[RES]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ult <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ord. 
+define <4 x i32> @f13(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f13: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchedb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfchedb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] +; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]] +; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]] +; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]] +; CHECK: vo %v24, [[RES1]], [[RES0]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ord <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test uno. +define <4 x i32> @f14(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f14: +; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 +; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 +; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] +; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] +; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] +; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] +; CHECK-DAG: vfchedb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] +; CHECK-DAG: vfchedb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]] +; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] +; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]] +; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]] +; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]] +; CHECK: vno %v24, [[RES1]], [[RES0]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uno <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test oeq selects. +define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f15: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp oeq <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test one selects. +define <4 x float> @f16(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f16: +; CHECK: vo [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp one <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ogt selects. +define <4 x float> @f17(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f17: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ogt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test oge selects. 
+define <4 x float> @f18(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f18: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp oge <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ole selects. +define <4 x float> @f19(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f19: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ole <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test olt selects. +define <4 x float> @f20(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f20: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp olt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ueq selects. +define <4 x float> @f21(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f21: +; CHECK: vo [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ueq <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test une selects. +define <4 x float> @f22(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f22: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp une <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ugt selects. +define <4 x float> @f23(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f23: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ugt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test uge selects. +define <4 x float> @f24(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f24: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uge <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ule selects. +define <4 x float> @f25(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f25: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ule <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ult selects. +define <4 x float> @f26(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f26: +; CHECK: vpkg [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ult <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ord selects. 
+define <4 x float> @f27(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f27: +; CHECK: vo [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ord <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test uno selects. +define <4 x float> @f28(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f28: +; CHECK: vo [[REG:%v[0-9]+]], +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uno <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} diff --git a/test/CodeGen/SystemZ/vec-const-05.ll b/test/CodeGen/SystemZ/vec-const-05.ll new file mode 100644 index 00000000000..c4828335c4b --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-05.ll @@ -0,0 +1,47 @@ +; Test vector byte masks, v4f32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <4 x float> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <4 x float> zeroinitializer +} + +; Test an all-ones vector. +define <4 x float> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a mixed vector (mask 0xc731). +define <4 x float> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 50993 +; CHECK: br %r14 + ret <4 x float> +} + +; Test that undefs are treated as zero (mask 0xc031). +define <4 x float> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 49201 +; CHECK: br %r14 + ret <4 x float> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <4 x float> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <4 x float> +} diff --git a/test/CodeGen/SystemZ/vec-const-11.ll b/test/CodeGen/SystemZ/vec-const-11.ll new file mode 100644 index 00000000000..0c69b8803b2 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-11.ll @@ -0,0 +1,189 @@ +; Test vector replicates, v4f32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. +define <4 x float> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a byte-granularity replicate with an arbitrary value. +define <4 x float> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a byte-granularity replicate with the highest useful value. +define <4 x float> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <4 x float> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <4 x float> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a halfword-granularity replicate with the highest useful value. +define <4 x float> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate with the lowest useful positive value. 
+define <4 x float> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate with the highest in-range value. +define <4 x float> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. +define <4 x float> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <4 x float> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <4 x float> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate with the highest useful negative value. +define <4 x float> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <4 x float> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <4 x float> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. +define <4 x float> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. +define <4 x float> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <4 x float> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <4 x float> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <4 x float> +} + +; Repeat f14 with undefs optimistically treated as 0, 32767. +define <4 x float> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <4 x float> +} + +; Repeat f18 with undefs optimistically treated as -2, -1. +define <4 x float> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <4 x float> +} diff --git a/test/CodeGen/SystemZ/vec-const-17.ll b/test/CodeGen/SystemZ/vec-const-17.ll new file mode 100644 index 00000000000..1306eab556e --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-17.ll @@ -0,0 +1,95 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v4f32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. +define <4 x float> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate that has the lower 17 bits set. +define <4 x float> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate that has the upper 15 bits set. 
+define <4 x float> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate that has middle bits set. +define <4 x float> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 2, 8 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <4 x float> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 9, 1 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <4 x float> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. +define <4 x float> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. +define <4 x float> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <4 x float> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 34, 41 +; CHECK: br %r14 + ret <4 x float> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. +define <4 x float> @f10() { +; CHECK-LABEL: f10: +; CHECK: vgmg %v24, 32, 0 +; CHECK: br %r14 + ret <4 x float> +} diff --git a/test/CodeGen/SystemZ/vec-conv-02.ll b/test/CodeGen/SystemZ/vec-conv-02.ll new file mode 100644 index 00000000000..ceccfc60b37 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-conv-02.ll @@ -0,0 +1,13 @@ +; Test conversions between different-sized float elements. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test cases where both elements of a v2f64 are converted to f32s. +define void @f1(<2 x double> %val, <2 x float> *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vledb {{%v[0-9]+}}, %v24, 0, 0 +; CHECK: br %r14 + %res = fptrunc <2 x double> %val to <2 x float> + store <2 x float> %res, <2 x float> *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/vec-move-01.ll b/test/CodeGen/SystemZ/vec-move-01.ll index f9ae13b3ba1..896d24a1d20 100644 --- a/test/CodeGen/SystemZ/vec-move-01.ll +++ b/test/CodeGen/SystemZ/vec-move-01.ll @@ -34,6 +34,14 @@ define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { ret <2 x i64> %val2 } +; Test v4f32 moves. +define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f5: +; CHECK: vlr %v24, %v26 +; CHECK: br %r14 + ret <4 x float> %val2 +} + ; Test v2f64 moves. define <2 x double> @f6(<2 x double> %val1, <2 x double> %val2) { ; CHECK-LABEL: f6: diff --git a/test/CodeGen/SystemZ/vec-move-02.ll b/test/CodeGen/SystemZ/vec-move-02.ll index a8c6912f0c7..e43676055fa 100644 --- a/test/CodeGen/SystemZ/vec-move-02.ll +++ b/test/CodeGen/SystemZ/vec-move-02.ll @@ -38,6 +38,15 @@ define <2 x i64> @f4(<2 x i64> *%ptr) { ret <2 x i64> %ret } +; Test v4f32 loads. +define <4 x float> @f5(<4 x float> *%ptr) { +; CHECK-LABEL: f5: +; CHECK: vl %v24, 0(%r2) +; CHECK: br %r14 + %ret = load <4 x float>, <4 x float> *%ptr + ret <4 x float> %ret +} + ; Test v2f64 loads. 
define <2 x double> @f6(<2 x double> *%ptr) { ; CHECK-LABEL: f6: diff --git a/test/CodeGen/SystemZ/vec-move-03.ll b/test/CodeGen/SystemZ/vec-move-03.ll index abd7c939fbe..1b1f96163a0 100644 --- a/test/CodeGen/SystemZ/vec-move-03.ll +++ b/test/CodeGen/SystemZ/vec-move-03.ll @@ -38,6 +38,15 @@ define void @f4(<2 x i64> %val, <2 x i64> *%ptr) { ret void } +; Test v4f32 stores. +define void @f5(<4 x float> %val, <4 x float> *%ptr) { +; CHECK-LABEL: f5: +; CHECK: vst %v24, 0(%r2) +; CHECK: br %r14 + store <4 x float> %val, <4 x float> *%ptr + ret void +} + ; Test v2f64 stores. define void @f6(<2 x double> %val, <2 x double> *%ptr) { ; CHECK-LABEL: f6: diff --git a/test/CodeGen/SystemZ/vec-move-04.ll b/test/CodeGen/SystemZ/vec-move-04.ll index 4e75d21dc96..27c9e5f71f4 100644 --- a/test/CodeGen/SystemZ/vec-move-04.ll +++ b/test/CodeGen/SystemZ/vec-move-04.ll @@ -110,6 +110,36 @@ define <2 x i64> @f12(<2 x i64> %val, i64 %element, i32 %index) { ret <2 x i64> %ret } +; Test v4f32 insertion into the first element. +define <4 x float> @f13(<4 x float> %val, float %element) { +; CHECK-LABEL: f13: +; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0 +; CHECK: vlvgf %v24, [[REG]], 0 +; CHECK: br %r14 + %ret = insertelement <4 x float> %val, float %element, i32 0 + ret <4 x float> %ret +} + +; Test v4f32 insertion into the last element. +define <4 x float> @f14(<4 x float> %val, float %element) { +; CHECK-LABEL: f14: +; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0 +; CHECK: vlvgf %v24, [[REG]], 3 +; CHECK: br %r14 + %ret = insertelement <4 x float> %val, float %element, i32 3 + ret <4 x float> %ret +} + +; Test v4f32 insertion into a variable element. +define <4 x float> @f15(<4 x float> %val, float %element, i32 %index) { +; CHECK-LABEL: f15: +; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0 +; CHECK: vlvgf %v24, [[REG]], 0(%r2) +; CHECK: br %r14 + %ret = insertelement <4 x float> %val, float %element, i32 %index + ret <4 x float> %ret +} + ; Test v2f64 insertion into the first element. define <2 x double> @f16(<2 x double> %val, double %element) { ; CHECK-LABEL: f16: diff --git a/test/CodeGen/SystemZ/vec-move-05.ll b/test/CodeGen/SystemZ/vec-move-05.ll index 234157a0abb..99871196d68 100644 --- a/test/CodeGen/SystemZ/vec-move-05.ll +++ b/test/CodeGen/SystemZ/vec-move-05.ll @@ -150,6 +150,59 @@ define i64 @f16(<2 x i64> %val, i32 %index) { ret i64 %ret } +; Test v4f32 extraction of element 0. +define float @f17(<4 x float> %val) { +; CHECK-LABEL: f17: +; CHECK: vlr %v0, %v24 +; CHECK: br %r14 + %ret = extractelement <4 x float> %val, i32 0 + ret float %ret +} + +; Test v4f32 extraction of element 1. +define float @f18(<4 x float> %val) { +; CHECK-LABEL: f18: +; CHECK: vrepf %v0, %v24, 1 +; CHECK: br %r14 + %ret = extractelement <4 x float> %val, i32 1 + ret float %ret +} + +; Test v4f32 extraction of element 2. +define float @f19(<4 x float> %val) { +; CHECK-LABEL: f19: +; CHECK: vrepf %v0, %v24, 2 +; CHECK: br %r14 + %ret = extractelement <4 x float> %val, i32 2 + ret float %ret +} + +; Test v4f32 extraction of element 3. +define float @f20(<4 x float> %val) { +; CHECK-LABEL: f20: +; CHECK: vrepf %v0, %v24, 3 +; CHECK: br %r14 + %ret = extractelement <4 x float> %val, i32 3 + ret float %ret +} + +; Test v4f32 extractions of an absurd element number. This must compile +; but we don't care what it does. +define float @f21(<4 x float> %val) { + %ret = extractelement <4 x float> %val, i32 100000 + ret float %ret +} + +; Test v4f32 extraction of a variable element. 
+define float @f22(<4 x float> %val, i32 %index) {
+; CHECK-LABEL: f22:
+; CHECK: vlgvf [[REG:%r[0-5]]], %v24, 0(%r2)
+; CHECK: vlvgf %v0, [[REG]], 0
+; CHECK: br %r14
+  %ret = extractelement <4 x float> %val, i32 %index
+  ret float %ret
+}
+
 ; Test v2f64 extraction of the first element.
 define double @f23(<2 x double> %val) {
 ; CHECK-LABEL: f23:
diff --git a/test/CodeGen/SystemZ/vec-move-07.ll b/test/CodeGen/SystemZ/vec-move-07.ll
index 0cb8a0a1dfc..b0d06f782de 100644
--- a/test/CodeGen/SystemZ/vec-move-07.ll
+++ b/test/CodeGen/SystemZ/vec-move-07.ll
@@ -38,7 +38,16 @@ define <2 x i64> @f4(i64 %val) {
   ret <2 x i64> %ret
 }
 
-; Test v2f64, which is just a move.
+; Test v4f32, which is just a move.
+define <4 x float> @f5(float %val) {
+; CHECK-LABEL: f5:
+; CHECK: vlr %v24, %v0
+; CHECK: br %r14
+  %ret = insertelement <4 x float> undef, float %val, i32 0
+  ret <4 x float> %ret
+}
+
+; Likewise v2f64.
 define <2 x double> @f6(double %val) {
 ; CHECK-LABEL: f6:
 ; CHECK: vlr %v24, %v0
diff --git a/test/CodeGen/SystemZ/vec-move-08.ll b/test/CodeGen/SystemZ/vec-move-08.ll
index 6148529c225..5396a1edec6 100644
--- a/test/CodeGen/SystemZ/vec-move-08.ll
+++ b/test/CodeGen/SystemZ/vec-move-08.ll
@@ -214,6 +214,59 @@ define <2 x i64> @f20(<2 x i64> %val, i64 *%ptr, i32 %index) {
   ret <2 x i64> %ret
 }
 
+; Test v4f32 insertion into the first element.
+define <4 x float> @f21(<4 x float> %val, float *%ptr) {
+; CHECK-LABEL: f21:
+; CHECK: vlef %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 0
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion into the last element.
+define <4 x float> @f22(<4 x float> %val, float *%ptr) {
+; CHECK-LABEL: f22:
+; CHECK: vlef %v24, 0(%r2), 3
+; CHECK: br %r14
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 3
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion with the highest in-range offset.
+define <4 x float> @f23(<4 x float> %val, float *%base) {
+; CHECK-LABEL: f23:
+; CHECK: vlef %v24, 4092(%r2), 2
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i32 1023
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 2
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion with the first out-of-range offset.
+define <4 x float> @f24(<4 x float> %val, float *%base) {
+; CHECK-LABEL: f24:
+; CHECK: aghi %r2, 4096
+; CHECK: vlef %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i32 1024
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 1
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion into a variable element.
+define <4 x float> @f25(<4 x float> %val, float *%ptr, i32 %index) {
+; CHECK-LABEL: f25:
+; CHECK-NOT: vlef
+; CHECK: br %r14
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 %index
+  ret <4 x float> %ret
+}
+
 ; Test v2f64 insertion into the first element.
 define <2 x double> @f26(<2 x double> %val, double *%ptr) {
 ; CHECK-LABEL: f26:
@@ -336,6 +389,34 @@ define <2 x i64> @f35(<2 x i64> %val, <2 x i64> %index, i64 %base) {
   ret <2 x i64> %ret
 }
 
+; Test a v4f32 gather of the first element.
+define <4 x float> @f36(<4 x float> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f36: +; CHECK: vgef %v24, 0(%v26,%r2), 0 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 0 + %ext = zext i32 %elem to i64 + %add = add i64 %base, %ext + %ptr = inttoptr i64 %add to float * + %element = load float, float *%ptr + %ret = insertelement <4 x float> %val, float %element, i32 0 + ret <4 x float> %ret +} + +; Test a v4f32 gather of the last element. +define <4 x float> @f37(<4 x float> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f37: +; CHECK: vgef %v24, 0(%v26,%r2), 3 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 3 + %ext = zext i32 %elem to i64 + %add = add i64 %base, %ext + %ptr = inttoptr i64 %add to float * + %element = load float, float *%ptr + %ret = insertelement <4 x float> %val, float %element, i32 3 + ret <4 x float> %ret +} + ; Test a v2f64 gather of the first element. define <2 x double> @f38(<2 x double> %val, <2 x i64> %index, i64 %base) { ; CHECK-LABEL: f38: diff --git a/test/CodeGen/SystemZ/vec-move-09.ll b/test/CodeGen/SystemZ/vec-move-09.ll index 78c5454fb55..5a53a2d6a19 100644 --- a/test/CodeGen/SystemZ/vec-move-09.ll +++ b/test/CodeGen/SystemZ/vec-move-09.ll @@ -236,6 +236,33 @@ define <2 x i64> @f26(<2 x i64> %val, i32 %index) { ret <2 x i64> %ret } +; Test v4f32 insertion of 0 into the first element. +define <4 x float> @f27(<4 x float> %val) { +; CHECK-LABEL: f27: +; CHECK: vleif %v24, 0, 0 +; CHECK: br %r14 + %ret = insertelement <4 x float> %val, float 0.0, i32 0 + ret <4 x float> %ret +} + +; Test v4f32 insertion of 0 into the last element. +define <4 x float> @f28(<4 x float> %val) { +; CHECK-LABEL: f28: +; CHECK: vleif %v24, 0, 3 +; CHECK: br %r14 + %ret = insertelement <4 x float> %val, float 0.0, i32 3 + ret <4 x float> %ret +} + +; Test v4f32 insertion of a nonzero value. +define <4 x float> @f29(<4 x float> %val) { +; CHECK-LABEL: f29: +; CHECK-NOT: vleif +; CHECK: br %r14 + %ret = insertelement <4 x float> %val, float 1.0, i32 1 + ret <4 x float> %ret +} + ; Test v2f64 insertion of 0 into the first element. define <2 x double> @f30(<2 x double> %val) { ; CHECK-LABEL: f30: diff --git a/test/CodeGen/SystemZ/vec-move-10.ll b/test/CodeGen/SystemZ/vec-move-10.ll index bc854214bbd..894d0c2b41f 100644 --- a/test/CodeGen/SystemZ/vec-move-10.ll +++ b/test/CodeGen/SystemZ/vec-move-10.ll @@ -258,6 +258,70 @@ define void @f24(<2 x i64> %val, i64 *%ptr, i32 %index) { ret void } +; Test v4f32 extraction from the first element. +define void @f25(<4 x float> %val, float *%ptr) { +; CHECK-LABEL: f25: +; CHECK: vstef %v24, 0(%r2), 0 +; CHECK: br %r14 + %element = extractelement <4 x float> %val, i32 0 + store float %element, float *%ptr + ret void +} + +; Test v4f32 extraction from the last element. +define void @f26(<4 x float> %val, float *%ptr) { +; CHECK-LABEL: f26: +; CHECK: vstef %v24, 0(%r2), 3 +; CHECK: br %r14 + %element = extractelement <4 x float> %val, i32 3 + store float %element, float *%ptr + ret void +} + +; Test v4f32 extraction of an invalid element. This must compile, +; but we don't care what it does. +define void @f27(<4 x float> %val, float *%ptr) { +; CHECK-LABEL: f27: +; CHECK-NOT: vstef %v24, 0(%r2), 4 +; CHECK: br %r14 + %element = extractelement <4 x float> %val, i32 4 + store float %element, float *%ptr + ret void +} + +; Test v4f32 extraction with the highest in-range offset. 
+define void @f28(<4 x float> %val, float *%base) {
+; CHECK-LABEL: f28:
+; CHECK: vstef %v24, 4092(%r2), 2
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i32 1023
+  %element = extractelement <4 x float> %val, i32 2
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test v4f32 extraction with the first out-of-range offset.
+define void @f29(<4 x float> %val, float *%base) {
+; CHECK-LABEL: f29:
+; CHECK: aghi %r2, 4096
+; CHECK: vstef %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i32 1024
+  %element = extractelement <4 x float> %val, i32 1
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test v4f32 extraction from a variable element.
+define void @f30(<4 x float> %val, float *%ptr, i32 %index) {
+; CHECK-LABEL: f30:
+; CHECK-NOT: vstef
+; CHECK: br %r14
+  %element = extractelement <4 x float> %val, i32 %index
+  store float %element, float *%ptr
+  ret void
+}
+
 ; Test v2f64 extraction from the first element.
 define void @f32(<2 x double> %val, double *%ptr) {
 ; CHECK-LABEL: f32:
@@ -380,6 +444,34 @@ define void @f41(<2 x i64> %val, <2 x i64> %index, i64 %base) {
   ret void
 }
 
+; Test a v4f32 scatter of the first element.
+define void @f42(<4 x float> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f42:
+; CHECK: vscef %v24, 0(%v26,%r2), 0
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 0
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to float *
+  %element = extractelement <4 x float> %val, i32 0
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test a v4f32 scatter of the last element.
+define void @f43(<4 x float> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f43:
+; CHECK: vscef %v24, 0(%v26,%r2), 3
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 3
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to float *
+  %element = extractelement <4 x float> %val, i32 3
+  store float %element, float *%ptr
+  ret void
+}
+
 ; Test a v2f64 scatter of the first element.
 define void @f44(<2 x double> %val, <2 x i64> %index, i64 %base) {
 ; CHECK-LABEL: f44:
diff --git a/test/CodeGen/SystemZ/vec-move-11.ll b/test/CodeGen/SystemZ/vec-move-11.ll
index 07a037ccdf2..fd9c3d3559f 100644
--- a/test/CodeGen/SystemZ/vec-move-11.ll
+++ b/test/CodeGen/SystemZ/vec-move-11.ll
@@ -92,6 +92,15 @@ define <2 x i64> @f10(i64 %val) {
   ret <2 x i64> %ret
 }
 
+; Test v4f32 insertion into an undef.
+define <4 x float> @f11(float %val) {
+; CHECK-LABEL: f11:
+; CHECK: vrepf %v24, %v0, 0
+; CHECK: br %r14
+  %ret = insertelement <4 x float> undef, float %val, i32 2
+  ret <4 x float> %ret
+}
+
 ; Test v2f64 insertion into an undef.
 define <2 x double> @f12(double %val) {
 ; CHECK-LABEL: f12:
diff --git a/test/CodeGen/SystemZ/vec-move-12.ll b/test/CodeGen/SystemZ/vec-move-12.ll
index 94b186f46e5..bc8ff97f805 100644
--- a/test/CodeGen/SystemZ/vec-move-12.ll
+++ b/test/CodeGen/SystemZ/vec-move-12.ll
@@ -102,6 +102,16 @@ define <2 x i64> @f10(i64 *%ptr) {
   ret <2 x i64> %ret
 }
 
+; Test v4f32 insertion into an undef.
+define <4 x float> @f11(float *%ptr) {
+; CHECK-LABEL: f11:
+; CHECK: vlrepf %v24, 0(%r2)
+; CHECK: br %r14
+  %val = load float, float *%ptr
+  %ret = insertelement <4 x float> undef, float %val, i32 2
+  ret <4 x float> %ret
+}
+
 ; Test v2f64 insertion into an undef.
define <2 x double> @f12(double *%ptr) {
 ; CHECK-LABEL: f12:
diff --git a/test/CodeGen/SystemZ/vec-move-13.ll b/test/CodeGen/SystemZ/vec-move-13.ll
index c50c94afb6c..4ad8e3f5210 100644
--- a/test/CodeGen/SystemZ/vec-move-13.ll
+++ b/test/CodeGen/SystemZ/vec-move-13.ll
@@ -46,6 +46,17 @@ define <2 x i64> @f4(i64 %val) {
   ret <2 x i64> %ret
 }
 
+; Test v4f32 insertion into 0.
+define <4 x float> @f5(float %val) {
+; CHECK-LABEL: f5:
+; CHECK: vgbm [[ZERO:%v[0-9]+]], 0
+; CHECK: vmrhf [[REG:%v[0-9]+]], [[ZERO]], %v0
+; CHECK: vmrhg %v24, [[ZERO]], [[REG]]
+; CHECK: br %r14
+  %ret = insertelement <4 x float> zeroinitializer, float %val, i32 3
+  ret <4 x float> %ret
+}
+
 ; Test v2f64 insertion into 0.
 define <2 x double> @f6(double %val) {
 ; CHECK-LABEL: f6:
@@ -55,3 +66,4 @@ define <2 x double> @f6(double %val) {
   %ret = insertelement <2 x double> zeroinitializer, double %val, i32 1
   ret <2 x double> %ret
 }
+
diff --git a/test/CodeGen/SystemZ/vec-move-14.ll b/test/CodeGen/SystemZ/vec-move-14.ll
index b48f2175ebe..e41eb9da034 100644
--- a/test/CodeGen/SystemZ/vec-move-14.ll
+++ b/test/CodeGen/SystemZ/vec-move-14.ll
@@ -75,6 +75,16 @@ define <2 x i64> @f7(i64 *%ptr) {
   ret <2 x i64> %ret
 }
 
+; Test VLLEZF with a float.
+define <4 x float> @f8(float *%ptr) {
+; CHECK-LABEL: f8:
+; CHECK: vllezf %v24, 0(%r2)
+; CHECK: br %r14
+  %val = load float, float *%ptr
+  %ret = insertelement <4 x float> zeroinitializer, float %val, i32 1
+  ret <4 x float> %ret
+}
+
 ; Test VLLEZG with a double.
 define <2 x double> @f9(double *%ptr) {
 ; CHECK-LABEL: f9:
diff --git a/test/CodeGen/SystemZ/vec-perm-01.ll b/test/CodeGen/SystemZ/vec-perm-01.ll
index c68958a98a2..4beec05eaec 100644
--- a/test/CodeGen/SystemZ/vec-perm-01.ll
+++ b/test/CodeGen/SystemZ/vec-perm-01.ll
@@ -123,6 +123,37 @@ define <2 x i64> @f11(<2 x i64> %val) {
   ret <2 x i64> %ret
 }
 
+; Test v4f32 splat of the first element.
+define <4 x float> @f12(<4 x float> %val) {
+; CHECK-LABEL: f12:
+; CHECK: vrepf %v24, %v24, 0
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
+
+; Test v4f32 splat of the last element.
+define <4 x float> @f13(<4 x float> %val) {
+; CHECK-LABEL: f13:
+; CHECK: vrepf %v24, %v24, 3
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x float> %ret
+}
+
+; Test v4f32 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <4 x float> @f14(<4 x float> %val) {
+; CHECK-LABEL: f14:
+; CHECK: vrepf %v24, %v24, 1
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> undef, <4 x float> %val,
+                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  ret <4 x float> %ret
+}
+
 ; Test v2f64 splat of the first element.
 define <2 x double> @f15(<2 x double> %val) {
 ; CHECK-LABEL: f15:
diff --git a/test/CodeGen/SystemZ/vec-perm-02.ll b/test/CodeGen/SystemZ/vec-perm-02.ll
index 7158990174b..e5c6df8e955 100644
--- a/test/CodeGen/SystemZ/vec-perm-02.ll
+++ b/test/CodeGen/SystemZ/vec-perm-02.ll
@@ -143,6 +143,40 @@ define <2 x i64> @f11(i64 %scalar) {
   ret <2 x i64> %ret
 }
 
+; Test v4f32 splat of the first element.
+define <4 x float> @f12(float %scalar) {
+; CHECK-LABEL: f12:
+; CHECK: vrepf %v24, %v0, 0
+; CHECK: br %r14
+  %val = insertelement <4 x float> undef, float %scalar, i32 0
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
+
+; Test v4f32 splat of the last element.
+define <4 x float> @f13(float %scalar) {
+; CHECK-LABEL: f13:
+; CHECK: vrepf %v24, %v0, 0
+; CHECK: br %r14
+  %val = insertelement <4 x float> undef, float %scalar, i32 3
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x float> %ret
+}
+
+; Test v4f32 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <4 x float> @f14(float %scalar) {
+; CHECK-LABEL: f14:
+; CHECK: vrepf %v24, %v0, 0
+; CHECK: br %r14
+  %val = insertelement <4 x float> undef, float %scalar, i32 1
+  %ret = shufflevector <4 x float> undef, <4 x float> %val,
+                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  ret <4 x float> %ret
+}
+
 ; Test v2f64 splat of the first element.
 define <2 x double> @f15(double %scalar) {
 ; CHECK-LABEL: f15:
diff --git a/test/CodeGen/SystemZ/vec-perm-03.ll b/test/CodeGen/SystemZ/vec-perm-03.ll
index c30a87601a4..663815549c3 100644
--- a/test/CodeGen/SystemZ/vec-perm-03.ll
+++ b/test/CodeGen/SystemZ/vec-perm-03.ll
@@ -158,6 +158,44 @@ define <2 x i64> @f12(i64 *%base) {
   ret <2 x i64> %ret
 }
 
+; Test a v4f32 replicating load with no offset.
+define <4 x float> @f13(float *%ptr) {
+; CHECK-LABEL: f13:
+; CHECK: vlrepf %v24, 0(%r2)
+; CHECK: br %r14
+  %scalar = load float, float *%ptr
+  %val = insertelement <4 x float> undef, float %scalar, i32 0
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
+
+; Test a v4f32 replicating load with the maximum in-range offset.
+define <4 x float> @f14(float *%base) {
+; CHECK-LABEL: f14:
+; CHECK: vlrepf %v24, 4092(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 1023
+  %scalar = load float, float *%ptr
+  %val = insertelement <4 x float> undef, float %scalar, i32 0
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
+
+; Test a v4f32 replicating load with the first out-of-range offset.
+define <4 x float> @f15(float *%base) {
+; CHECK-LABEL: f15:
+; CHECK: aghi %r2, 4096
+; CHECK: vlrepf %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 1024
+  %scalar = load float, float *%ptr
+  %val = insertelement <4 x float> undef, float %scalar, i32 0
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
 
 ; Test a v2f64 replicating load with no offset.
 define <2 x double> @f16(double *%ptr) {
diff --git a/test/CodeGen/SystemZ/vec-perm-04.ll b/test/CodeGen/SystemZ/vec-perm-04.ll
index ca04fdf6913..0df6f4fbb01 100644
--- a/test/CodeGen/SystemZ/vec-perm-04.ll
+++ b/test/CodeGen/SystemZ/vec-perm-04.ll
@@ -159,6 +159,26 @@ define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) {
   ret <2 x i64> %ret
 }
 
+; Test a canonical v4f32 merge high.
+define <4 x float> @f14(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f14:
+; CHECK: vmrhf %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %ret
+}
+
+; Test a reversed v4f32 merge high.
+define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f15:
+; CHECK: vmrhf %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 4, i32 0, i32 5, i32 1>
+  ret <4 x float> %ret
+}
+
 ; Test a canonical v2f64 merge high.
define <2 x double> @f16(<2 x double> %val1, <2 x double> %val2) {
 ; CHECK-LABEL: f16:
diff --git a/test/CodeGen/SystemZ/vec-perm-05.ll b/test/CodeGen/SystemZ/vec-perm-05.ll
index f4a46ff4e27..b585cefbf84 100644
--- a/test/CodeGen/SystemZ/vec-perm-05.ll
+++ b/test/CodeGen/SystemZ/vec-perm-05.ll
@@ -159,6 +159,26 @@ define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) {
   ret <2 x i64> %ret
 }
 
+; Test a canonical v4f32 merge low.
+define <4 x float> @f14(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f14:
+; CHECK: vmrlf %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %ret
+}
+
+; Test a reversed v4f32 merge low.
+define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f15:
+; CHECK: vmrlf %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 6, i32 2, i32 7, i32 3>
+  ret <4 x float> %ret
+}
+
 ; Test a canonical v2f64 merge low.
 define <2 x double> @f16(<2 x double> %val1, <2 x double> %val2) {
 ; CHECK-LABEL: f16:
diff --git a/test/CodeGen/SystemZ/vec-perm-06.ll b/test/CodeGen/SystemZ/vec-perm-06.ll
index 298fc60e851..835276a3672 100644
--- a/test/CodeGen/SystemZ/vec-perm-06.ll
+++ b/test/CodeGen/SystemZ/vec-perm-06.ll
@@ -138,3 +138,23 @@ define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) {
                       <4 x i32> <i32 5, i32 7, i32 1, i32 3>
   ret <4 x i32> %ret
 }
+
+; Test a canonical v4f32 pack.
+define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vpkg %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x float> %ret
+}
+
+; Test a reversed v4f32 pack.
+define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vpkg %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  ret <4 x float> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-07.ll b/test/CodeGen/SystemZ/vec-perm-07.ll
index 40ca3995524..9a370af2c0e 100644
--- a/test/CodeGen/SystemZ/vec-perm-07.ll
+++ b/test/CodeGen/SystemZ/vec-perm-07.ll
@@ -122,4 +122,24 @@ define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) {
   ret <4 x i32> %ret
 }
 
+; Test a v4f32 shift with the lowest useful shift amount.
+define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vsldb %v24, %v24, %v26, 4
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x float> %ret
+}
+
+; Test a v4f32 shift with the highest useful shift amount.
+define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vsldb %v24, %v24, %v26, 12
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x float> %ret
+}
+
 ; We use VPDI for v2i64 shuffles.
diff --git a/test/CodeGen/SystemZ/vec-perm-08.ll b/test/CodeGen/SystemZ/vec-perm-08.ll
index b5220ab6712..a18ca7b7397 100644
--- a/test/CodeGen/SystemZ/vec-perm-08.ll
+++ b/test/CodeGen/SystemZ/vec-perm-08.ll
@@ -129,6 +129,26 @@ define <2 x i64> @f11(<2 x i64> %val1, <2 x i64> %val2) {
   ret <2 x i64> %ret
 }
 
+; Test a high1/low2 permute for v4f32.
+define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vpdi %v24, %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %ret
+}
+
+; Test a low2/high1 permute for v4f32.
+define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vpdi %v24, %v26, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %ret
+}
+
 ; Test a high1/low2 permute for v2f64.
 define <2 x double> @f14(<2 x double> %val1, <2 x double> %val2) {
 ; CHECK-LABEL: f14:
diff --git a/test/CodeGen/SystemZ/vec-sub-01.ll b/test/CodeGen/SystemZ/vec-sub-01.ll
index 24d4ba5a2bd..aabf1c9be4a 100644
--- a/test/CodeGen/SystemZ/vec-sub-01.ll
+++ b/test/CodeGen/SystemZ/vec-sub-01.ll
@@ -38,6 +38,33 @@ define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
   ret <2 x i64> %ret
 }
 
+; Test a v4f32 subtraction, as an example of an operation that needs to be
+; scalarized and reassembled.  At present there's an unnecessary move that
+; could be avoided with smarter ordering.  It also isn't important whether
+; the VREPFs use the result of the VLRs or use %v24 and %v26 directly.
+define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f5:
+; CHECK-DAG: vlr %v[[A1:[0-5]]], %v24
+; CHECK-DAG: vlr %v[[A2:[0-5]]], %v26
+; CHECK-DAG: vrepf %v[[B1:[0-5]]], %v[[A1]], 1
+; CHECK-DAG: vrepf %v[[B2:[0-5]]], %v[[A2]], 1
+; CHECK-DAG: vrepf %v[[C1:[0-5]]], %v[[A1]], 2
+; CHECK-DAG: vrepf %v[[C2:[0-5]]], %v[[A2]], 2
+; CHECK-DAG: vrepf %v[[D1:[0-5]]], %v[[A1]], 3
+; CHECK-DAG: vrepf %v[[D2:[0-5]]], %v[[A2]], 3
+; CHECK-DAG: ler %f[[A1copy:[0-5]]], %f[[A1]]
+; CHECK-DAG: sebr %f[[A1copy]], %f[[A2]]
+; CHECK-DAG: sebr %f[[B1]], %f[[B2]]
+; CHECK-DAG: sebr %f[[C1]], %f[[C2]]
+; CHECK-DAG: sebr %f[[D1]], %f[[D2]]
+; CHECK-DAG: vmrhf [[HIGH:%v[0-9]+]], %v[[A1copy]], %v[[B1]]
+; CHECK-DAG: vmrhf [[LOW:%v[0-9]+]], %v[[C1]], %v[[D1]]
+; CHECK: vmrhg %v24, [[HIGH]], [[LOW]]
+; CHECK: br %r14
+  %ret = fsub <4 x float> %val1, %val2
+  ret <4 x float> %ret
+}
+
 ; Test a v2f64 subtraction.
 define <2 x double> @f6(<2 x double> %dummy, <2 x double> %val1, <2 x double> %val2) {
-- 
2.34.1
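For anyone wanting to try the new v4f32 handling locally, a standalone test in the same style as the files above can be fed to the same llc invocation that the RUN lines use. The sketch below is illustrative and not part of the patch: it assumes a v4f32 fadd is scalarized and reassembled the same way as the v4f32 fsub in vec-sub-01.ll, with AEBR as the scalar f32 add; the function name and the exact CHECK lines are the editor's assumptions, not tested by this commit.

; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s

; A v4f32 addition; assuming it is scalarized like the v4f32 subtraction
; in vec-sub-01.ll, each element pair is added with AEBR and the four
; results are merged back into a vector with VMRHF/VMRHG.
define <4 x float> @add(<4 x float> %val1, <4 x float> %val2) {
; CHECK-LABEL: add:
; CHECK: aebr
; CHECK: vmrhg %v24,
; CHECK: br %r14
  %ret = fadd <4 x float> %val1, %val2
  ret <4 x float> %ret
}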