From aa5c996eda6dbaab398c8502717b37de2d10e4b7 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand
Date: Tue, 5 May 2015 19:25:42 +0000
Subject: [PATCH] [SystemZ] Add CodeGen support for integer vector types

This is the first of a series of patches to add CodeGen support
exploiting the instructions of the z13 vector facility. This patch adds
support for the native integer vector types (v16i8, v8i16, v4i32, v2i64).

When the vector facility is present, we default to the new vector ABI.
This is characterized by two major differences:
- Vector types are passed/returned in vector registers
  (except for unnamed arguments of a variable-argument list function).
- Vector types are at most 8-byte aligned.

The reason for the choice of 8-byte vector alignment is that the
hardware is able to efficiently load vectors at 8-byte alignment, and
the ABI only guarantees 8-byte alignment of the stack pointer, so
requiring any higher alignment for vectors would require dynamic stack
re-alignment code.

However, for compatibility with old code that may use vector types, when
*not* using the vector facility, the old alignment rules (vector types
are naturally aligned) remain in use.

These alignment rules are not only implemented at the C language level
(implemented in clang), but also at the LLVM IR level. This is done by
selecting a different DataLayout string depending on whether the vector
ABI is in effect or not.

Based on a patch by Richard Sandiford.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236521 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/SystemZ.h | 7 +
 lib/Target/SystemZ/SystemZAsmPrinter.cpp | 7 +
 lib/Target/SystemZ/SystemZCallingConv.h | 44 +
 lib/Target/SystemZ/SystemZCallingConv.td | 27 +-
 lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 122 ++
 lib/Target/SystemZ/SystemZISelLowering.cpp | 1269 ++++++++++++++++-
 lib/Target/SystemZ/SystemZISelLowering.h | 79 +
 lib/Target/SystemZ/SystemZInstrFormats.td | 4 +
 lib/Target/SystemZ/SystemZInstrInfo.cpp | 7 +
 lib/Target/SystemZ/SystemZInstrVector.td | 459 ++--
 lib/Target/SystemZ/SystemZOperators.td | 170 +++
 lib/Target/SystemZ/SystemZTargetMachine.cpp | 63 +-
 .../SystemZ/SystemZTargetTransformInfo.cpp | 18 +
 .../SystemZ/SystemZTargetTransformInfo.h | 8 +
 test/CodeGen/SystemZ/frame-19.ll | 314 ++++
 test/CodeGen/SystemZ/vec-abi-align.ll | 49 +
 test/CodeGen/SystemZ/vec-abs-01.ll | 146 ++
 test/CodeGen/SystemZ/vec-abs-02.ll | 142 ++
 test/CodeGen/SystemZ/vec-abs-03.ll | 138 ++
 test/CodeGen/SystemZ/vec-abs-04.ll | 138 ++
 test/CodeGen/SystemZ/vec-add-01.ll | 39 +
 test/CodeGen/SystemZ/vec-and-01.ll | 39 +
 test/CodeGen/SystemZ/vec-and-02.ll | 91 ++
 test/CodeGen/SystemZ/vec-and-03.ll | 113 ++
 test/CodeGen/SystemZ/vec-args-01.ll | 48 +
 test/CodeGen/SystemZ/vec-args-02.ll | 31 +
 test/CodeGen/SystemZ/vec-args-03.ll | 16 +
 test/CodeGen/SystemZ/vec-cmp-01.ll | 228 +++
 test/CodeGen/SystemZ/vec-cmp-02.ll | 228 +++
 test/CodeGen/SystemZ/vec-cmp-03.ll | 228 +++
 test/CodeGen/SystemZ/vec-cmp-04.ll | 228 +++
 test/CodeGen/SystemZ/vec-combine-01.ll | 107 ++
 test/CodeGen/SystemZ/vec-const-01.ll | 55 +
 test/CodeGen/SystemZ/vec-const-02.ll | 47 +
 test/CodeGen/SystemZ/vec-const-03.ll | 43 +
 test/CodeGen/SystemZ/vec-const-04.ll | 43 +
 test/CodeGen/SystemZ/vec-const-07.ll | 229 +++
 test/CodeGen/SystemZ/vec-const-08.ll | 189 +++
 test/CodeGen/SystemZ/vec-const-09.ll | 169 +++
 test/CodeGen/SystemZ/vec-const-10.ll | 169 +++
 test/CodeGen/SystemZ/vec-const-13.ll | 193 +++
 test/CodeGen/SystemZ/vec-const-14.ll | 113 ++
 test/CodeGen/SystemZ/vec-const-15.ll | 85 ++
 test/CodeGen/SystemZ/vec-const-16.ll | 85 ++
 test/CodeGen/SystemZ/vec-ctlz-01.ll | 81 ++
 test/CodeGen/SystemZ/vec-ctpop-01.ll | 53 +
 test/CodeGen/SystemZ/vec-cttz-01.ll | 81 ++
 test/CodeGen/SystemZ/vec-div-01.ll | 62 +
 test/CodeGen/SystemZ/vec-max-01.ll | 83 ++
 test/CodeGen/SystemZ/vec-max-02.ll | 83 ++
 test/CodeGen/SystemZ/vec-max-03.ll | 83 ++
 test/CodeGen/SystemZ/vec-max-04.ll | 83 ++
 test/CodeGen/SystemZ/vec-min-01.ll | 83 ++
 test/CodeGen/SystemZ/vec-min-02.ll | 83 ++
 test/CodeGen/SystemZ/vec-min-03.ll | 83 ++
 test/CodeGen/SystemZ/vec-min-04.ll | 83 ++
 test/CodeGen/SystemZ/vec-move-01.ll | 35 +
 test/CodeGen/SystemZ/vec-move-02.ll | 93 ++
 test/CodeGen/SystemZ/vec-move-03.ll | 93 ++
 test/CodeGen/SystemZ/vec-move-04.ll | 121 ++
 test/CodeGen/SystemZ/vec-move-05.ll | 161 +++
 test/CodeGen/SystemZ/vec-move-06.ll | 13 +
 test/CodeGen/SystemZ/vec-move-07.ll | 39 +
 test/CodeGen/SystemZ/vec-move-08.ll | 284 ++++
 test/CodeGen/SystemZ/vec-move-09.ll | 237 +++
 test/CodeGen/SystemZ/vec-move-10.ll | 328 +++++
 test/CodeGen/SystemZ/vec-move-11.ll | 93 ++
 test/CodeGen/SystemZ/vec-move-12.ll | 103 ++
 test/CodeGen/SystemZ/vec-move-13.ll | 47 +
 test/CodeGen/SystemZ/vec-move-14.ll | 76 +
 test/CodeGen/SystemZ/vec-mul-01.ll | 39 +
 test/CodeGen/SystemZ/vec-mul-02.ll | 36 +
 test/CodeGen/SystemZ/vec-neg-01.ll | 39 +
 test/CodeGen/SystemZ/vec-or-01.ll | 39 +
 test/CodeGen/SystemZ/vec-or-02.ll | 107 ++
 test/CodeGen/SystemZ/vec-perm-01.ll | 124 ++
 test/CodeGen/SystemZ/vec-perm-02.ll | 144 ++
 test/CodeGen/SystemZ/vec-perm-03.ll | 173 +++
 test/CodeGen/SystemZ/vec-perm-04.ll | 160 +++
 test/CodeGen/SystemZ/vec-perm-05.ll | 160 +++
 test/CodeGen/SystemZ/vec-perm-06.ll | 140 ++
 test/CodeGen/SystemZ/vec-perm-07.ll | 125 ++
 test/CodeGen/SystemZ/vec-perm-08.ll | 130 ++
 test/CodeGen/SystemZ/vec-perm-09.ll | 38 +
 test/CodeGen/SystemZ/vec-perm-10.ll | 36 +
 test/CodeGen/SystemZ/vec-perm-11.ll | 35 +
 test/CodeGen/SystemZ/vec-shift-01.ll | 39 +
 test/CodeGen/SystemZ/vec-shift-02.ll | 39 +
 test/CodeGen/SystemZ/vec-shift-03.ll | 39 +
 test/CodeGen/SystemZ/vec-shift-04.ll | 134 ++
 test/CodeGen/SystemZ/vec-shift-05.ll | 134 ++
 test/CodeGen/SystemZ/vec-shift-06.ll | 134 ++
 test/CodeGen/SystemZ/vec-shift-07.ll | 182 +++
 test/CodeGen/SystemZ/vec-sub-01.ll | 39 +
 test/CodeGen/SystemZ/vec-xor-01.ll | 39 +
 95 files changed, 10849 insertions(+), 146 deletions(-)
 create mode 100644 test/CodeGen/SystemZ/frame-19.ll
 create mode 100644 test/CodeGen/SystemZ/vec-abi-align.ll
 create mode 100644 test/CodeGen/SystemZ/vec-abs-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-abs-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-abs-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-abs-04.ll
 create mode 100644 test/CodeGen/SystemZ/vec-add-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-and-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-and-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-and-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-args-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-args-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-args-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-cmp-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-cmp-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-cmp-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-cmp-04.ll
 create mode 100644 test/CodeGen/SystemZ/vec-combine-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-04.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-07.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-08.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-09.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-10.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-13.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-14.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-15.ll
 create mode 100644 test/CodeGen/SystemZ/vec-const-16.ll
 create mode 100644 test/CodeGen/SystemZ/vec-ctlz-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-ctpop-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-cttz-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-div-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-max-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-max-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-max-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-max-04.ll
 create mode 100644 test/CodeGen/SystemZ/vec-min-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-min-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-min-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-min-04.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-04.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-05.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-06.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-07.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-08.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-09.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-10.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-11.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-12.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-13.ll
 create mode 100644 test/CodeGen/SystemZ/vec-move-14.ll
 create mode 100644 test/CodeGen/SystemZ/vec-mul-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-mul-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-neg-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-or-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-or-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-04.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-05.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-06.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-07.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-08.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-09.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-10.ll
 create mode 100644 test/CodeGen/SystemZ/vec-perm-11.ll
 create mode 100644 test/CodeGen/SystemZ/vec-shift-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-shift-02.ll
 create mode 100644 test/CodeGen/SystemZ/vec-shift-03.ll
 create mode 100644 test/CodeGen/SystemZ/vec-shift-04.ll
 create mode 100644 test/CodeGen/SystemZ/vec-shift-05.ll
 create mode 100644 test/CodeGen/SystemZ/vec-shift-06.ll
 create mode 100644 test/CodeGen/SystemZ/vec-shift-07.ll
 create mode 100644 test/CodeGen/SystemZ/vec-sub-01.ll
 create mode 100644 test/CodeGen/SystemZ/vec-xor-01.ll

diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index b3a7310f04b..6834818fc37 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -87,6 +87,13 @@ const unsigned IPM_CC = 28;
 const unsigned PFD_READ = 1;
 const unsigned PFD_WRITE = 2;
 
+// Number of bits in a vector register.
+const unsigned VectorBits = 128;
+
+// Number of bytes in a vector register (and consequently the number of
+// bytes in a general permute vector).
+const unsigned VectorBytes = VectorBits / 8;
+
 // Return true if Val fits an LLILL operand.
 static inline bool isImmLL(uint64_t Val) {
   return (Val & ~0x000000000000ffffULL) == 0;
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index c83a3d0af0d..5f46e6a6313 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -151,6 +151,13 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG);
     break;
 
+  case SystemZ::VLVGP32:
+    LoweredMI = MCInstBuilder(SystemZ::VLVGP)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(SystemZMC::getRegAsGR64(MI->getOperand(1).getReg()))
+      .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
+    break;
+
 #define LOWER_LOW(NAME)                                                 \
   case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h
index 71605ac1126..8b8146762b6 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/lib/Target/SystemZ/SystemZCallingConv.h
@@ -10,6 +10,9 @@
 #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
 
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
 namespace llvm {
 namespace SystemZ {
   const unsigned NumArgGPRs = 5;
@@ -18,6 +21,47 @@ namespace SystemZ {
   const unsigned NumArgFPRs = 4;
   extern const unsigned ArgFPRs[NumArgFPRs];
 } // end namespace SystemZ
+
+class SystemZCCState : public CCState {
+private:
+  /// Records whether the value was a fixed argument.
+  /// See ISD::OutputArg::IsFixed.
+  SmallVector<bool, 4> ArgIsFixed;
+
+public:
+  SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+                 SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
+      : CCState(CC, isVarArg, MF, locs, C) {}
+
+  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                              CCAssignFn Fn) {
+    // Formal arguments are always fixed.
+    ArgIsFixed.clear();
+    for (unsigned i = 0; i < Ins.size(); ++i)
+      ArgIsFixed.push_back(true);
+
+    CCState::AnalyzeFormalArguments(Ins, Fn);
+  }
+
+  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           CCAssignFn Fn) {
+    // Record whether the call operand was a fixed argument.
+    ArgIsFixed.clear();
+    for (unsigned i = 0; i < Outs.size(); ++i)
+      ArgIsFixed.push_back(Outs[i].IsFixed);
+
+    CCState::AnalyzeCallOperands(Outs, Fn);
+  }
+
+  // This version of AnalyzeCallOperands in the base class is not usable
+  // since we must provide a means of accessing ISD::OutputArg::IsFixed.
+  void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
+                           SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                           CCAssignFn Fn) = delete;
+
+  bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
+};
+
 } // end namespace llvm
 
 #endif
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index fb0d1d8a3fe..f5eb32c0a60 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -12,6 +12,15 @@ class CCIfExtend<CCAction A>
   : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
 
+class CCIfSubtarget<string F, CCAction A>
+  : CCIf<!strconcat("static_cast<const SystemZSubtarget&>"
+                    "(State.getMachineFunction().getSubtarget()).", F),
+         A>;
+
+// Match if this specific argument is a fixed (i.e. named) argument.
+class CCIfFixed<CCAction A>
+  : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
+
 //===----------------------------------------------------------------------===//
 // z/Linux return value calling convention
 //===----------------------------------------------------------------------===//
@@ -31,7 +40,12 @@ def RetCC_SystemZ : CallingConv<[
   // doesn't care about the ABI.  All floating-point argument registers
   // are call-clobbered, so we can use all of them here.
   CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
-  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>
+  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+
+  // Similarly for vectors, with V24 being the ABI-compliant choice.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64],
+             CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
 
   // ABI-compliant code returns long double by reference, but that conversion
   // is left to higher-level code.  Perhaps we could add an f128 definition
@@ -60,6 +74,17 @@ def CC_SystemZ : CallingConv<[
   CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
   CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
 
+  // The first 8 named vector arguments are passed in V24-V31.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64],
+      CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
+                               V25, V27, V29, V31]>>>>,
+
+  // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64],
+             CCAssignToStack<16, 8>>>,
+
   // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
   CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
 ]>;
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 80a98772db7..63992936813 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -255,6 +255,13 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
                          Addr, Base, Disp, Index);
   }
 
+  // Try to match Addr as an address with a base, 12-bit displacement
+  // and index, where the index is element Elem of a vector.
+  // Return true on success, storing the base, displacement and vector
+  // in Base, Disp and Index respectively.
+  bool selectBDVAddr12Only(SDValue Addr, SDValue Elem, SDValue &Base,
+                           SDValue &Disp, SDValue &Index) const;
+
   // Check whether (or Op (and X InsertMask)) is effectively an insertion
   // of X into bits InsertMask of some Y != Op.  Return true if so and
   // set Op to that Y.
@@ -292,6 +299,12 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
                               uint64_t UpperVal, uint64_t LowerVal);
 
+  // Try to use gather instruction Opcode to implement vector insertion N.
+  SDNode *tryGather(SDNode *N, unsigned Opcode);
+
+  // Try to use scatter instruction Opcode to implement store Store.
+  SDNode *tryScatter(StoreSDNode *Store, unsigned Opcode);
+
   // Return true if Load and Store are loads and stores of the same size
   // and are guaranteed not to overlap.  Such operations can be implemented
   // using block (SS-format) instructions.
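To make the calling-convention rules above concrete, here is a minimal
IR sketch in the spirit of the vec-args-*.ll tests added by this patch
(the function name is invented).  With the vector facility, the named
argument %a arrives in %v24 and the result is returned in %v24, while a
vector passed as an unnamed vararg would instead use an 8-byte-aligned
16-byte stack slot:

  define <4 x i32> @f1(<4 x i32> %a) {
    %ret = add <4 x i32> %a, %a
    ret <4 x i32> %ret
  }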
@@ -645,6 +658,30 @@ bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
   return true;
 }
 
+bool SystemZDAGToDAGISel::selectBDVAddr12Only(SDValue Addr, SDValue Elem,
+                                              SDValue &Base,
+                                              SDValue &Disp,
+                                              SDValue &Index) const {
+  SDValue Regs[2];
+  if (selectBDXAddr12Only(Addr, Regs[0], Disp, Regs[1]) &&
+      Regs[0].getNode() && Regs[1].getNode()) {
+    for (unsigned int I = 0; I < 2; ++I) {
+      Base = Regs[I];
+      Index = Regs[1 - I];
+      // We can't tell here whether the index vector has the right type
+      // for the access; the caller needs to do that instead.
+      if (Index.getOpcode() == ISD::ZERO_EXTEND)
+        Index = Index.getOperand(0);
+      if (Index.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          Index.getOperand(1) == Elem) {
+        Index = Index.getOperand(0);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
                                                uint64_t InsertMask) const {
   // We're only interested in cases where the insertion is into some operand
@@ -984,6 +1021,71 @@ SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
   return Or.getNode();
 }
 
+SDNode *SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
+  SDValue ElemV = N->getOperand(2);
+  auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
+  if (!ElemN)
+    return 0;
+
+  unsigned Elem = ElemN->getZExtValue();
+  EVT VT = N->getValueType(0);
+  if (Elem >= VT.getVectorNumElements())
+    return 0;
+
+  auto *Load = dyn_cast<LoadSDNode>(N->getOperand(1));
+  if (!Load || !Load->hasOneUse())
+    return 0;
+  if (Load->getMemoryVT().getSizeInBits() !=
+      Load->getValueType(0).getSizeInBits())
+    return 0;
+
+  SDValue Base, Disp, Index;
+  if (!selectBDVAddr12Only(Load->getBasePtr(), ElemV, Base, Disp, Index) ||
+      Index.getValueType() != VT.changeVectorElementTypeToInteger())
+    return 0;
+
+  SDLoc DL(Load);
+  SDValue Ops[] = {
+    N->getOperand(0), Base, Disp, Index,
+    CurDAG->getTargetConstant(Elem, DL, MVT::i32), Load->getChain()
+  };
+  SDNode *Res = CurDAG->getMachineNode(Opcode, DL, VT, MVT::Other, Ops);
+  ReplaceUses(SDValue(Load, 1), SDValue(Res, 1));
+  return Res;
+}
+
+SDNode *SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) {
+  SDValue Value = Store->getValue();
+  if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return 0;
+  if (Store->getMemoryVT().getSizeInBits() !=
+      Value.getValueType().getSizeInBits())
+    return 0;
+
+  SDValue ElemV = Value.getOperand(1);
+  auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
+  if (!ElemN)
+    return 0;
+
+  SDValue Vec = Value.getOperand(0);
+  EVT VT = Vec.getValueType();
+  unsigned Elem = ElemN->getZExtValue();
+  if (Elem >= VT.getVectorNumElements())
+    return 0;
+
+  SDValue Base, Disp, Index;
+  if (!selectBDVAddr12Only(Store->getBasePtr(), ElemV, Base, Disp, Index) ||
+      Index.getValueType() != VT.changeVectorElementTypeToInteger())
+    return 0;
+
+  SDLoc DL(Store);
+  SDValue Ops[] = {
+    Vec, Base, Disp, Index, CurDAG->getTargetConstant(Elem, DL, MVT::i32),
+    Store->getChain()
+  };
+  return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+}
+
 bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
                                                LoadSDNode *Load) const {
   // Check that the two memory operands have the same size.
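For reference, the DAG pattern tryGather matches originates from IR like
the following sketch, modeled on the vec-move-08.ll test added below
(the value names are invented).  The address is formed from element 1 of
%index, so the load plus insertion can be selected as a single VGEF:

  %elt = extractelement <4 x i32> %index, i32 1
  %ext = zext i32 %elt to i64
  %add = add i64 %base, %ext
  %ptr = inttoptr i64 %add to i32 *
  %val = load i32, i32 *%ptr
  %ret = insertelement <4 x i32> %vec, i32 %val, i32 1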
@@ -1120,6 +1222,26 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+
+  case ISD::INSERT_VECTOR_ELT: {
+    EVT VT = Node->getValueType(0);
+    unsigned ElemBitSize = VT.getVectorElementType().getSizeInBits();
+    if (ElemBitSize == 32)
+      ResNode = tryGather(Node, SystemZ::VGEF);
+    else if (ElemBitSize == 64)
+      ResNode = tryGather(Node, SystemZ::VGEG);
+    break;
+  }
+
+  case ISD::STORE: {
+    auto *Store = cast<StoreSDNode>(Node);
+    unsigned ElemBitSize = Store->getValue().getValueType().getSizeInBits();
+    if (ElemBitSize == 32)
+      ResNode = tryScatter(Store, SystemZ::VSCEF);
+    else if (ElemBitSize == 64)
+      ResNode = tryScatter(Store, SystemZ::VSCEG);
+    break;
+  }
   }
 
   // Select the default instruction
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 44bc8acb6d8..ddcb792ee09 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -96,6 +96,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
   addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
   addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
 
+  if (Subtarget.hasVector()) {
+    addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+  }
+
   // Compute derived properties from the register classes
   computeRegisterProperties(Subtarget.getRegisterInfo());
 
@@ -111,7 +118,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
   setSchedulingPreference(Sched::RegPressure);
 
   setBooleanContents(ZeroOrOneBooleanContent);
-  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
   // Instructions are strings of 2-byte aligned 2-byte values.
   setMinFunctionAlignment(2);
@@ -250,6 +257,76 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
   // Handle prefetches with PFD or PFDRL.
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
+  for (MVT VT : MVT::vector_valuetypes()) {
+    // Assume by default that all vector operations need to be expanded.
+    for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
+      if (getOperationAction(Opcode, VT) == Legal)
+        setOperationAction(Opcode, VT, Expand);
+
+    // Likewise all truncating stores and extending loads.
+    for (MVT InnerVT : MVT::vector_valuetypes()) {
+      setTruncStoreAction(VT, InnerVT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+    }
+
+    if (isTypeLegal(VT)) {
+      // These operations are legal for anything that can be stored in a
+      // vector register, even if there is no native support for the format
+      // as such.
+      setOperationAction(ISD::LOAD, VT, Legal);
+      setOperationAction(ISD::STORE, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::BITCAST, VT, Legal);
+      setOperationAction(ISD::UNDEF, VT, Legal);
+
+      // Likewise, except that we need to replace the nodes with something
+      // more specific.
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+    }
+  }
+
+  // Handle integer vector types.
+  for (MVT VT : MVT::integer_vector_valuetypes()) {
+    if (isTypeLegal(VT)) {
+      // These operations have direct equivalents.
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal);
+      setOperationAction(ISD::ADD, VT, Legal);
+      setOperationAction(ISD::SUB, VT, Legal);
+      if (VT != MVT::v2i64)
+        setOperationAction(ISD::MUL, VT, Legal);
+      setOperationAction(ISD::AND, VT, Legal);
+      setOperationAction(ISD::OR, VT, Legal);
+      setOperationAction(ISD::XOR, VT, Legal);
+      setOperationAction(ISD::CTPOP, VT, Custom);
+      setOperationAction(ISD::CTTZ, VT, Legal);
+      setOperationAction(ISD::CTLZ, VT, Legal);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+
+      // Convert a GPR scalar to a vector by inserting it into element 0.
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+
+      // Detect shifts by a scalar amount and convert them into
+      // V*_BY_SCALAR.
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::SRL, VT, Custom);
+
+      // At present ROTL isn't matched by DAGCombiner.  ROTR should be
+      // converted into ROTL.
+      setOperationAction(ISD::ROTL, VT, Expand);
+      setOperationAction(ISD::ROTR, VT, Expand);
+
+      // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
+      // and inverting the result as necessary.
+      setOperationAction(ISD::SETCC, VT, Custom);
+    }
+  }
+
   // Handle floating-point types.
   for (unsigned I = MVT::FIRST_FP_VALUETYPE;
        I <= MVT::LAST_FP_VALUETYPE;
@@ -304,6 +381,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
 
   // Codes for which we want to perform some z-specific combinations.
   setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 
   // Handle intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -703,7 +782,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
 
   unsigned NumFixedGPRs = 0;
@@ -735,6 +814,12 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
         NumFixedFPRs += 1;
         RC = &SystemZ::FP64BitRegClass;
         break;
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+        RC = &SystemZ::VR128BitRegClass;
+        break;
       }
 
       unsigned VReg = MRI.createVirtualRegister(RC);
@@ -842,7 +927,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Analyze the operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
 
   // We don't support GuaranteedTailCallOpt, only automatically-detected
@@ -1809,12 +1894,78 @@ static SDValue emitSETCC(SelectionDAG &DAG, SDLoc DL, SDValue Glue,
   return Result;
 }
 
+// Return the SystemZISD vector comparison operation for CC, or 0 if it cannot
+// be done directly.
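+// For example, SETEQ maps directly to VICMPE; the callers below handle
+// SETNE by inverting the result of SETEQ, and SETLT by swapping the
+// operands of SETGT.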
+static unsigned getVectorComparison(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETEQ:
+    return SystemZISD::VICMPE;
+
+  case ISD::SETGT:
+    return SystemZISD::VICMPH;
+
+  case ISD::SETUGT:
+    return SystemZISD::VICMPHL;
+
+  default:
+    return 0;
+  }
+}
+
+// Return the SystemZISD vector comparison operation for CC or its inverse,
+// or 0 if neither can be done directly.  Indicate in Invert whether the
+// result is for the inverse of CC.
+static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool &Invert) {
+  if (unsigned Opcode = getVectorComparison(CC)) {
+    Invert = false;
+    return Opcode;
+  }
+
+  CC = ISD::getSetCCInverse(CC, true);
+  if (unsigned Opcode = getVectorComparison(CC)) {
+    Invert = true;
+    return Opcode;
+  }
+
+  return 0;
+}
+
+// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
+// an integer mask of type VT.
+static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
+                                ISD::CondCode CC, SDValue CmpOp0,
+                                SDValue CmpOp1) {
+  bool Invert = false;
+  SDValue Cmp;
+  // It doesn't really matter whether we try the inversion or the swap first,
+  // since there are no cases where both work.
+  if (unsigned Opcode = getVectorComparisonOrInvert(CC, Invert))
+    Cmp = DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+  else {
+    CC = ISD::getSetCCSwappedOperands(CC);
+    if (unsigned Opcode = getVectorComparisonOrInvert(CC, Invert))
+      Cmp = DAG.getNode(Opcode, DL, VT, CmpOp1, CmpOp0);
+    else
+      llvm_unreachable("Unhandled comparison");
+  }
+  if (Invert) {
+    SDValue Mask = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+                               DAG.getConstant(65535, DL, MVT::i32));
+    Mask = DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+    Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask);
+  }
+  return Cmp;
+}
+
 SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
                                           SelectionDAG &DAG) const {
   SDValue CmpOp0 = Op.getOperand(0);
   SDValue CmpOp1 = Op.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1);
 
   Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
   SDValue Glue = emitCmp(DAG, DL, C);
@@ -2146,6 +2297,13 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
   EVT InVT = In.getValueType();
   EVT ResVT = Op.getValueType();
 
+  // Convert loads directly.  This is normally done by DAGCombiner,
+  // but we need this case for bitcasts that are created during lowering
+  // and which are then lowered themselves.
+  if (auto *LoadN = dyn_cast<LoadSDNode>(In))
+    return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
+                       LoadN->getMemOperand());
+
   if (InVT == MVT::i32 && ResVT == MVT::f32) {
     SDValue In64;
     if (Subtarget.hasHighWord()) {
@@ -2421,11 +2579,44 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
 SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
                                           SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  int64_t OrigBitSize = VT.getSizeInBits();
   SDLoc DL(Op);
+  Op = Op.getOperand(0);
+
+  // Handle vector types via VPOPCT.
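+  // VPOPCT counts the set bits in each byte.  Wider element counts are
+  // then formed by summing the byte counts: a shift/add pair for i16
+  // elements and VSUM against a zero vector for i32 and i64 elements.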
+  if (VT.isVector()) {
+    Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
+    Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op);
+    switch (VT.getVectorElementType().getSizeInBits()) {
+    case 8:
+      break;
+    case 16: {
+      Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
+      SDValue Shift = DAG.getConstant(8, DL, MVT::i32);
+      SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift);
+      Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
+      Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift);
+      break;
+    }
+    case 32: {
+      SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+                                DAG.getConstant(0, DL, MVT::i32));
+      Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
+      break;
+    }
+    case 64: {
+      SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+                                DAG.getConstant(0, DL, MVT::i32));
+      Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp);
+      Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
+      break;
+    }
+    default:
+      llvm_unreachable("Unexpected type");
+    }
+    return Op;
+  }
 
   // Get the known-zero mask for the operand.
-  Op = Op.getOperand(0);
   APInt KnownZero, KnownOne;
   DAG.computeKnownBits(Op, KnownZero, KnownOne);
   unsigned NumSignificantBits = (~KnownZero).getActiveBits();
@@ -2433,6 +2624,7 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
     return DAG.getConstant(0, DL, VT);
 
   // Skip known-zero high parts of the operand.
+  int64_t OrigBitSize = VT.getSizeInBits();
   int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits);
   BitSize = std::min(BitSize, OrigBitSize);
 
@@ -2698,6 +2890,837 @@ SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
   return SDValue();
 }
 
+namespace {
+// Says that SystemZISD operation Opcode can be used to perform the equivalent
+// of a VPERM with permute vector Bytes.  If Opcode takes three operands,
+// Operand is the constant third operand, otherwise it is the number of
+// bytes in each element of the result.
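+// For example, the VMRHG entry below uses MERGE_HIGH with Operand == 8:
+// the result takes bytes 0-7 of operand 0 followed by bytes 0-7 of
+// operand 1 (written as selectors 16-23).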
+struct Permute {
+  unsigned Opcode;
+  unsigned Operand;
+  unsigned char Bytes[SystemZ::VectorBytes];
+};
+}
+
+static const Permute PermuteForms[] = {
+  // VMRHG
+  { SystemZISD::MERGE_HIGH, 8,
+    { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } },
+  // VMRHF
+  { SystemZISD::MERGE_HIGH, 4,
+    { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } },
+  // VMRHH
+  { SystemZISD::MERGE_HIGH, 2,
+    { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } },
+  // VMRHB
+  { SystemZISD::MERGE_HIGH, 1,
+    { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } },
+  // VMRLG
+  { SystemZISD::MERGE_LOW, 8,
+    { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } },
+  // VMRLF
+  { SystemZISD::MERGE_LOW, 4,
+    { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
+  // VMRLH
+  { SystemZISD::MERGE_LOW, 2,
+    { 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
+  // VMRLB
+  { SystemZISD::MERGE_LOW, 1,
+    { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } },
+  // VPKG
+  { SystemZISD::PACK, 4,
+    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } },
+  // VPKF
+  { SystemZISD::PACK, 2,
+    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
+  // VPKH
+  { SystemZISD::PACK, 1,
+    { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
+  // VPDI V1, V2, 4  (low half of V1, high half of V2)
+  { SystemZISD::PERMUTE_DWORDS, 4,
+    { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } },
+  // VPDI V1, V2, 1  (high half of V1, low half of V2)
+  { SystemZISD::PERMUTE_DWORDS, 1,
+    { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } }
+};
+
+// Called after matching a vector shuffle against a particular pattern.
+// Both the original shuffle and the pattern have two vector operands.
+// OpNos[0] is the operand of the original shuffle that should be used for
+// operand 0 of the pattern, or -1 if operand 0 of the pattern can be anything.
+// OpNos[1] is the same for operand 1 of the pattern.  Resolve these -1s and
+// set OpNo0 and OpNo1 to the shuffle operands that should actually be used
+// for operands 0 and 1 of the pattern.
+static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) {
+  if (OpNos[0] < 0) {
+    if (OpNos[1] < 0)
+      return false;
+    OpNo0 = OpNo1 = OpNos[1];
+  } else if (OpNos[1] < 0) {
+    OpNo0 = OpNo1 = OpNos[0];
+  } else {
+    OpNo0 = OpNos[0];
+    OpNo1 = OpNos[1];
+  }
+  return true;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  Return true if the VPERM can be implemented using P.
+// When returning true set OpNo0 to the VPERM operand that should be
+// used for operand 0 of P and likewise OpNo1 for operand 1 of P.
+//
+// For example, if swapping the VPERM operands allows P to match, OpNo0
+// will be 1 and OpNo1 will be 0.  If instead Bytes only refers to one
+// operand, but rewriting it to use two duplicated operands allows it to
+// match P, then OpNo0 and OpNo1 will be the same.
+static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P,
+                         unsigned &OpNo0, unsigned &OpNo1) {
+  int OpNos[] = { -1, -1 };
+  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+    int Elt = Bytes[I];
+    if (Elt >= 0) {
+      // Make sure that the two permute vectors use the same suboperand
+      // byte number.  Only the operand numbers (the high bits) are
+      // allowed to differ.
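+      // For example, selector 19 (byte 3 of operand 1) is compatible with
+      // pattern byte 3 (byte 3 of operand 0), since 19 ^ 3 == 16 has no
+      // bits set below SystemZ::VectorBytes.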
+      if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1))
+        return false;
+      int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes;
+      int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes;
+      // Make sure that the operand mappings are consistent with previous
+      // elements.
+      if (OpNos[ModelOpNo] == 1 - RealOpNo)
+        return false;
+      OpNos[ModelOpNo] = RealOpNo;
+    }
+  }
+  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
+}
+
+// As above, but search for a matching permute.
+static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes,
+                                   unsigned &OpNo0, unsigned &OpNo1) {
+  for (auto &P : PermuteForms)
+    if (matchPermute(Bytes, P, OpNo0, OpNo1))
+      return &P;
+  return nullptr;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  This permute is an operand of an outer permute.
+// See whether redistributing the -1 bytes gives a shuffle that can be
+// implemented using P.  If so, set Transform to a VPERM-like permute vector
+// that, when applied to the result of P, gives the original permute in Bytes.
+static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes,
+                               const Permute &P,
+                               SmallVectorImpl<int> &Transform) {
+  unsigned To = 0;
+  for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) {
+    int Elt = Bytes[From];
+    if (Elt < 0)
+      // Byte number From of the result is undefined.
+      Transform[From] = -1;
+    else {
+      while (P.Bytes[To] != Elt) {
+        To += 1;
+        if (To == SystemZ::VectorBytes)
+          return false;
+      }
+      Transform[From] = To;
+    }
+  }
+  return true;
+}
+
+// As above, but search for a matching permute.
+static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
+                                         SmallVectorImpl<int> &Transform) {
+  for (auto &P : PermuteForms)
+    if (matchDoublePermute(Bytes, P, Transform))
+      return &P;
+  return nullptr;
+}
+
+// Convert the mask of the given VECTOR_SHUFFLE into a byte-level mask,
+// as if it had type vNi8.
+static void getVPermMask(ShuffleVectorSDNode *VSN,
+                         SmallVectorImpl<int> &Bytes) {
+  EVT VT = VSN->getValueType(0);
+  unsigned NumElements = VT.getVectorNumElements();
+  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
+  Bytes.resize(NumElements * BytesPerElement, -1);
+  for (unsigned I = 0; I < NumElements; ++I) {
+    int Index = VSN->getMaskElt(I);
+    if (Index >= 0)
+      for (unsigned J = 0; J < BytesPerElement; ++J)
+        Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
+  }
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  See whether bytes [Start, Start + BytesPerElement) of
+// the result come from a contiguous sequence of bytes from one input.
+// Set Base to the selector for the first byte if so.
+static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
+                            unsigned BytesPerElement, int &Base) {
+  Base = -1;
+  for (unsigned I = 0; I < BytesPerElement; ++I) {
+    if (Bytes[Start + I] >= 0) {
+      unsigned Elem = Bytes[Start + I];
+      if (Base < 0) {
+        Base = Elem - I;
+        // Make sure the bytes would come from one input operand.
+        if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size())
+          return false;
+      } else if (unsigned(Base) != Elem - I)
+        return false;
+    }
+  }
+  return true;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  Return true if it can be performed using VSLDI.
+// When returning true, set StartIndex to the shift amount and OpNo0
+// and OpNo1 to the VPERM operands that should be used as the first
+// and second shift operand respectively.
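+// For example, { 1, 2, ..., 15, 16 } is a left shift by one byte: result
+// byte I comes from byte I + 1 of the 32-byte concatenation of the two
+// operands, so StartIndex is 1.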
+static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
+                               unsigned &StartIndex, unsigned &OpNo0,
+                               unsigned &OpNo1) {
+  int OpNos[] = { -1, -1 };
+  int Shift = -1;
+  for (unsigned I = 0; I < 16; ++I) {
+    int Index = Bytes[I];
+    if (Index >= 0) {
+      int ExpectedShift = (Index - I) % SystemZ::VectorBytes;
+      int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes;
+      int RealOpNo = unsigned(Index) / SystemZ::VectorBytes;
+      if (Shift < 0)
+        Shift = ExpectedShift;
+      else if (Shift != ExpectedShift)
+        return false;
+      // Make sure that the operand mappings are consistent with previous
+      // elements.
+      if (OpNos[ModelOpNo] == 1 - RealOpNo)
+        return false;
+      OpNos[ModelOpNo] = RealOpNo;
+    }
+  }
+  StartIndex = Shift;
+  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
+}
+
+// Create a node that performs P on operands Op0 and Op1, casting the
+// operands to the appropriate type.  The type of the result is determined by P.
+static SDValue getPermuteNode(SelectionDAG &DAG, SDLoc DL,
+                              const Permute &P, SDValue Op0, SDValue Op1) {
+  // VPDI (PERMUTE_DWORDS) always operates on v2i64s.  The input
+  // elements of a PACK are twice as wide as the outputs.
+  unsigned InBytes = (P.Opcode == SystemZISD::PERMUTE_DWORDS ? 8 :
+                      P.Opcode == SystemZISD::PACK ? P.Operand * 2 :
+                      P.Operand);
+  // Cast both operands to the appropriate type.
+  MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8),
+                              SystemZ::VectorBytes / InBytes);
+  Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0);
+  Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);
+  SDValue Op;
+  if (P.Opcode == SystemZISD::PERMUTE_DWORDS) {
+    SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32);
+    Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2);
+  } else if (P.Opcode == SystemZISD::PACK) {
+    MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
+                                 SystemZ::VectorBytes / P.Operand);
+    Op = DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1);
+  } else {
+    Op = DAG.getNode(P.Opcode, DL, InVT, Op0, Op1);
+  }
+  return Op;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  Implement it on operands Ops[0] and Ops[1] using
+// VSLDI or VPERM.
+static SDValue getGeneralPermuteNode(SelectionDAG &DAG, SDLoc DL, SDValue *Ops,
+                                     const SmallVectorImpl<int> &Bytes) {
+  for (unsigned I = 0; I < 2; ++I)
+    Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]);
+
+  // First see whether VSLDI can be used.
+  unsigned StartIndex, OpNo0, OpNo1;
+  if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
+    return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
+                       Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32));
+
+  // Fall back on VPERM.  Construct an SDNode for the permute vector.
+  SDValue IndexNodes[SystemZ::VectorBytes];
+  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+    if (Bytes[I] >= 0)
+      IndexNodes[I] = DAG.getConstant(Bytes[I], DL, MVT::i32);
+    else
+      IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+  SDValue Op2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, IndexNodes);
+  return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2);
+}
+
+namespace {
+// Describes a general N-operand vector shuffle.
+struct GeneralShuffle {
+  GeneralShuffle(EVT vt) : VT(vt) {}
+  void addUndef();
+  void add(SDValue, unsigned);
+  SDValue getNode(SelectionDAG &, SDLoc);
+
+  // The operands of the shuffle.
+  SmallVector<SDValue, SystemZ::VectorBytes> Ops;
+
+  // Index I is -1 if byte I of the result is undefined.  Otherwise the
+  // result comes from byte Bytes[I] % SystemZ::VectorBytes of operand
+  // Bytes[I] / SystemZ::VectorBytes.
+  SmallVector<int, SystemZ::VectorBytes> Bytes;
+
+  // The type of the shuffle result.
+  EVT VT;
+};
+}
+
+// Add an extra undefined element to the shuffle.
+void GeneralShuffle::addUndef() {
+  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
+  for (unsigned I = 0; I < BytesPerElement; ++I)
+    Bytes.push_back(-1);
+}
+
+// Add an extra element to the shuffle, taking it from element Elem of Op.
+// A null Op indicates a vector input whose value will be calculated later;
+// there is at most one such input per shuffle and it always has the same
+// type as the result.
+void GeneralShuffle::add(SDValue Op, unsigned Elem) {
+  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
+
+  // The source vector can have wider elements than the result,
+  // either through an explicit TRUNCATE or because of type legalization.
+  // We want the least significant part.
+  EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
+  unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
+  assert(FromBytesPerElement >= BytesPerElement &&
+         "Invalid EXTRACT_VECTOR_ELT");
+  unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
+                   (FromBytesPerElement - BytesPerElement));
+
+  // Look through things like shuffles and bitcasts.
+  while (Op.getNode()) {
+    if (Op.getOpcode() == ISD::BITCAST)
+      Op = Op.getOperand(0);
+    else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) {
+      // See whether the bytes we need come from a contiguous part of one
+      // operand.
+      SmallVector<int, SystemZ::VectorBytes> OpBytes;
+      getVPermMask(cast<ShuffleVectorSDNode>(Op), OpBytes);
+      int NewByte;
+      if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
+        break;
+      if (NewByte < 0) {
+        addUndef();
+        return;
+      }
+      Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
+      Byte = unsigned(NewByte) % SystemZ::VectorBytes;
+    } else if (Op.getOpcode() == ISD::UNDEF) {
+      addUndef();
+      return;
+    } else
+      break;
+  }
+
+  // Make sure that the source of the extraction is in Ops.
+  unsigned OpNo = 0;
+  for (; OpNo < Ops.size(); ++OpNo)
+    if (Ops[OpNo] == Op)
+      break;
+  if (OpNo == Ops.size())
+    Ops.push_back(Op);
+
+  // Add the element to Bytes.
+  unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
+  for (unsigned I = 0; I < BytesPerElement; ++I)
+    Bytes.push_back(Base + I);
+}
+
+// Return SDNodes for the completed shuffle.
+SDValue GeneralShuffle::getNode(SelectionDAG &DAG, SDLoc DL) {
+  assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");
+
+  if (Ops.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Make sure that there are at least two shuffle operands.
+  if (Ops.size() == 1)
+    Ops.push_back(DAG.getUNDEF(MVT::v16i8));
+
+  // Create a tree of shuffles, deferring root node until after the loop.
+  // Try to redistribute the undefined elements of non-root nodes so that
+  // the non-root shuffles match something like a pack or merge, then adjust
+  // the parent node's permute vector to compensate for the new order.
+  // Among other things, this copes with vectors like <2 x i16> that were
+  // padded with undefined elements during type legalization.
+  //
+  // In the best case this redistribution will lead to the whole tree
+  // using packs and merges.  It should rarely be a loss in other cases.
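+  // For example, with four operands the first pass combines (Ops[0],
+  // Ops[1]) into a new Ops[0] and (Ops[2], Ops[3]) into a new Ops[2],
+  // leaving a single two-operand shuffle for the root node below.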
+  unsigned Stride = 1;
+  for (; Stride * 2 < Ops.size(); Stride *= 2) {
+    for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
+      SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
+
+      // Create a mask for just these two operands.
+      SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes);
+      for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
+        unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes;
+        unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes;
+        if (OpNo == I)
+          NewBytes[J] = Byte;
+        else if (OpNo == I + Stride)
+          NewBytes[J] = SystemZ::VectorBytes + Byte;
+        else
+          NewBytes[J] = -1;
+      }
+      // See if it would be better to reorganize NewBytes to avoid using VPERM.
+      SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes);
+      if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) {
+        Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]);
+        // Applying NewBytesMap to Ops[I] gets back to NewBytes.
+        for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
+          if (NewBytes[J] >= 0) {
+            assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes &&
+                   "Invalid double permute");
+            Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J];
+          } else
+            assert(NewBytesMap[J] < 0 && "Invalid double permute");
+        }
+      } else {
+        // Just use NewBytes on the operands.
+        Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes);
+        for (unsigned J = 0; J < SystemZ::VectorBytes; ++J)
+          if (NewBytes[J] >= 0)
+            Bytes[J] = I * SystemZ::VectorBytes + J;
+      }
+    }
+  }
+
+  // Now we just have 2 inputs.  Put the second operand in Ops[1].
+  if (Stride > 1) {
+    Ops[1] = Ops[Stride];
+    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+      if (Bytes[I] >= int(SystemZ::VectorBytes))
+        Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
+  }
+
+  // Look for an instruction that can do the permute without resorting
+  // to VPERM.
+  unsigned OpNo0, OpNo1;
+  SDValue Op;
+  if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
+    Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
+  else
+    Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+}
+
+// Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64
+// vector for them.
+static SDValue joinDwords(SelectionDAG &DAG, SDLoc DL, SDValue Op0,
+                          SDValue Op1) {
+  if (Op0.getOpcode() == ISD::UNDEF && Op1.getOpcode() == ISD::UNDEF)
+    return DAG.getUNDEF(MVT::v2i64);
+  // If one of the two inputs is undefined then replicate the other one,
+  // in order to avoid using another register unnecessarily.
+  if (Op0.getOpcode() == ISD::UNDEF)
+    Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
+  else if (Op1.getOpcode() == ISD::UNDEF)
+    Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+  else {
+    Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
+  }
+  return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1);
+}
+
+// Try to represent constant BUILD_VECTOR node BVN using a
+// SystemZISD::BYTE_MASK-style mask.  Store the mask value in Mask
+// on success.
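+// For example, the v4i32 constant <-1, 0, 0, -1> yields the mask 0xf00f:
+// there is one mask bit per result byte, set wherever that byte must
+// be 0xff.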
+static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
+  EVT ElemVT = BVN->getValueType(0).getVectorElementType();
+  unsigned BytesPerElement = ElemVT.getStoreSize();
+  for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) {
+    SDValue Op = BVN->getOperand(I);
+    if (Op.getOpcode() != ISD::UNDEF) {
+      uint64_t Value;
+      if (Op.getOpcode() == ISD::Constant)
+        Value = dyn_cast<ConstantSDNode>(Op)->getZExtValue();
+      else if (Op.getOpcode() == ISD::ConstantFP)
+        Value = (dyn_cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt()
+                 .getZExtValue());
+      else
+        return false;
+      for (unsigned J = 0; J < BytesPerElement; ++J) {
+        uint64_t Byte = (Value >> (J * 8)) & 0xff;
+        if (Byte == 0xff)
+          Mask |= 1 << ((E - I - 1) * BytesPerElement + J);
+        else if (Byte != 0)
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Try to load a vector constant in which BitsPerElement-bit value Value
+// is replicated to fill the vector.  VT is the type of the resulting
+// constant, which may have elements of a different size from BitsPerElement.
+// Return the SDValue of the constant on success, otherwise return
+// an empty value.
+static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
+                                       const SystemZInstrInfo *TII,
+                                       SDLoc DL, EVT VT, uint64_t Value,
+                                       unsigned BitsPerElement) {
+  // Signed 16-bit values can be replicated using VREPI.
+  int64_t SignedValue = SignExtend64(Value, BitsPerElement);
+  if (isInt<16>(SignedValue)) {
+    MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
+                                 SystemZ::VectorBits / BitsPerElement);
+    SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT,
+                             DAG.getConstant(SignedValue, DL, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+  // See whether rotating the constant left some N places gives a value that
+  // is one less than a power of 2 (i.e. all zeros followed by all ones).
+  // If so we can use VGM.
+  unsigned Start, End;
+  if (TII->isRxSBGMask(Value, BitsPerElement, Start, End)) {
+    // isRxSBGMask returns the bit numbers for a full 64-bit value,
+    // with 0 denoting 1 << 63 and 63 denoting 1.  Convert them to
+    // bit numbers for a BitsPerElement value, so that 0 denotes
+    // 1 << (BitsPerElement-1).
+    Start -= 64 - BitsPerElement;
+    End -= 64 - BitsPerElement;
+    MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
+                                 SystemZ::VectorBits / BitsPerElement);
+    SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT,
+                             DAG.getConstant(Start, DL, MVT::i32),
+                             DAG.getConstant(End, DL, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+  return SDValue();
+}
+
+// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
+// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
+// the non-EXTRACT_VECTOR_ELT elements.  See if the given BUILD_VECTOR
+// would benefit from this representation and return it if so.
+static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
+                                     BuildVectorSDNode *BVN) {
+  EVT VT = BVN->getValueType(0);
+  unsigned NumElements = VT.getVectorNumElements();
+
+  // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation
+  // on byte vectors.  If there are non-EXTRACT_VECTOR_ELT elements that still
+  // need a BUILD_VECTOR, add an additional placeholder operand for that
+  // BUILD_VECTOR and store its operands in ResidueOps.
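+  // For example, <%a, %b, extract(%v, 0), extract(%v, 1)> roughly becomes
+  // a shuffle of %v with the residual BUILD_VECTOR <%a, %b, undef, undef>.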
+  GeneralShuffle GS(VT);
+  SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps;
+  bool FoundOne = false;
+  for (unsigned I = 0; I < NumElements; ++I) {
+    SDValue Op = BVN->getOperand(I);
+    if (Op.getOpcode() == ISD::TRUNCATE)
+      Op = Op.getOperand(0);
+    if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op.getOperand(1).getOpcode() == ISD::Constant) {
+      unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      GS.add(Op.getOperand(0), Elem);
+      FoundOne = true;
+    } else if (Op.getOpcode() == ISD::UNDEF) {
+      GS.addUndef();
+    } else {
+      GS.add(SDValue(), ResidueOps.size());
+      ResidueOps.push_back(Op);
+    }
+  }
+
+  // Nothing to do if there are no EXTRACT_VECTOR_ELTs.
+  if (!FoundOne)
+    return SDValue();
+
+  // Create the BUILD_VECTOR for the remaining elements, if any.
+  if (!ResidueOps.empty()) {
+    while (ResidueOps.size() < NumElements)
+      ResidueOps.push_back(DAG.getUNDEF(VT.getVectorElementType()));
+    for (auto &Op : GS.Ops) {
+      if (!Op.getNode()) {
+        Op = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BVN), VT, ResidueOps);
+        break;
+      }
+    }
+  }
+  return GS.getNode(DAG, SDLoc(BVN));
+}
+
+// Combine GPR scalar values Elems into a vector of type VT.
+static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
+                           SmallVectorImpl<SDValue> &Elems) {
+  // See whether there is a single replicated value.
+  SDValue Single;
+  unsigned int NumElements = Elems.size();
+  unsigned int Count = 0;
+  for (auto Elem : Elems) {
+    if (Elem.getOpcode() != ISD::UNDEF) {
+      if (!Single.getNode())
+        Single = Elem;
+      else if (Elem != Single) {
+        Single = SDValue();
+        break;
+      }
+      Count += 1;
+    }
+  }
+  // There are three cases here:
+  //
+  // - if the only defined element is a loaded one, the best sequence
+  //   is a replicating load.
+  //
+  // - otherwise, if the only defined element is an i64 value, we will
+  //   end up with the same VLVGP sequence regardless of whether we short-cut
+  //   for replication or fall through to the later code.
+  //
+  // - otherwise, if the only defined element is an i32 or smaller value,
+  //   we would need 2 instructions to replicate it: VLVGP followed by VREPx.
+  //   This is only a win if the single defined element is used more than once.
+  //   In other cases we're better off using a single VLVGx.
+  if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD))
+    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
+
+  // The best way of building a v2i64 from two i64s is to use VLVGP.
+  if (VT == MVT::v2i64)
+    return joinDwords(DAG, DL, Elems[0], Elems[1]);
+
+  // Collect the constant terms.
+  SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
+  SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
+
+  unsigned NumConstants = 0;
+  for (unsigned I = 0; I < NumElements; ++I) {
+    SDValue Elem = Elems[I];
+    if (Elem.getOpcode() == ISD::Constant ||
+        Elem.getOpcode() == ISD::ConstantFP) {
+      NumConstants += 1;
+      Constants[I] = Elem;
+      Done[I] = true;
+    }
+  }
+  // If there was at least one constant, fill in the other elements of
+  // Constants with undefs to get a full vector constant and use that
+  // as the starting point.
+  SDValue Result;
+  if (NumConstants > 0) {
+    for (unsigned I = 0; I < NumElements; ++I)
+      if (!Constants[I].getNode())
+        Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
+    Result = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Constants);
+  } else {
+    // Otherwise try to use VLVGP to start the sequence in order to
+    // avoid a false dependency on any previous contents of the vector
+    // register.  This only makes sense if one of the associated elements
+    // is defined.
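+    // VLVGP fills the two doubleword halves of the vector from two GPRs.
+    // The low bits of each GPR land in the last element of its half, so
+    // those are the elements (I1 and I2 below) that VLVGP can define.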
+ unsigned I1 = NumElements / 2 - 1; + unsigned I2 = NumElements - 1; + bool Def1 = (Elems[I1].getOpcode() != ISD::UNDEF); + bool Def2 = (Elems[I2].getOpcode() != ISD::UNDEF); + if (Def1 || Def2) { + SDValue Elem1 = Elems[Def1 ? I1 : I2]; + SDValue Elem2 = Elems[Def2 ? I2 : I1]; + Result = DAG.getNode(ISD::BITCAST, DL, VT, + joinDwords(DAG, DL, Elem1, Elem2)); + Done[I1] = true; + Done[I2] = true; + } else + Result = DAG.getUNDEF(VT); + } + + // Use VLVGx to insert the other elements. + for (unsigned I = 0; I < NumElements; ++I) + if (!Done[I] && Elems[I].getOpcode() != ISD::UNDEF) + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I], + DAG.getConstant(I, DL, MVT::i32)); + return Result; +} + +SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + const SystemZInstrInfo *TII = + static_cast(Subtarget.getInstrInfo()); + auto *BVN = cast(Op.getNode()); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + if (BVN->isConstant()) { + // Try using VECTOR GENERATE BYTE MASK. This is the architecturally- + // preferred way of creating all-zero and all-one vectors so give it + // priority over other methods below. + uint64_t Mask = 0; + if (tryBuildVectorByteMask(BVN, Mask)) { + SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, + DAG.getConstant(Mask, DL, MVT::i32)); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); + } + + // Try using some form of replication. + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, + 8, true) && + SplatBitSize <= 64) { + // First try assuming that any undefined bits above the highest set bit + // and below the lowest set bit are 1s. This increases the likelihood of + // being able to use a sign-extended element value in VECTOR REPLICATE + // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK. + uint64_t SplatBitsZ = SplatBits.getZExtValue(); + uint64_t SplatUndefZ = SplatUndef.getZExtValue(); + uint64_t Lower = (SplatUndefZ + & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1)); + uint64_t Upper = (SplatUndefZ + & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1)); + uint64_t Value = SplatBitsZ | Upper | Lower; + SDValue Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, + SplatBitSize); + if (Op.getNode()) + return Op; + + // Now try assuming that any undefined bits between the first and + // last defined set bits are set. This increases the chances of + // using a non-wraparound mask. + uint64_t Middle = SplatUndefZ & ~Upper & ~Lower; + Value = SplatBitsZ | Middle; + Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, SplatBitSize); + if (Op.getNode()) + return Op; + } + + // Fall back to loading it from memory. + return SDValue(); + } + + // See if we should use shuffles to construct the vector from other vectors. + SDValue Res = tryBuildVectorShuffle(DAG, BVN); + if (Res.getNode()) + return Res; + + // Otherwise use buildVector to build the vector up from GPRs. 
+  unsigned NumElements = Op.getNumOperands();
+  SmallVector<SDValue, SystemZ::VectorBytes> Ops(NumElements);
+  for (unsigned I = 0; I < NumElements; ++I)
+    Ops[I] = Op.getOperand(I);
+  return buildVector(DAG, DL, VT, Ops);
+}
+
+SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElements = VT.getVectorNumElements();
+
+  if (VSN->isSplat()) {
+    SDValue Op0 = Op.getOperand(0);
+    unsigned Index = VSN->getSplatIndex();
+    assert(Index < VT.getVectorNumElements() &&
+           "Splat index should be defined and in first operand");
+    // See whether the value we're splatting is directly available as a scalar.
+    if ((Index == 0 && Op0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+        Op0.getOpcode() == ISD::BUILD_VECTOR)
+      return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index));
+    // Otherwise keep it as a vector-to-vector operation.
+    return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0),
+                       DAG.getConstant(Index, DL, MVT::i32));
+  }
+
+  GeneralShuffle GS(VT);
+  for (unsigned I = 0; I < NumElements; ++I) {
+    int Elt = VSN->getMaskElt(I);
+    if (Elt < 0)
+      GS.addUndef();
+    else
+      GS.add(Op.getOperand(unsigned(Elt) / NumElements),
+             unsigned(Elt) % NumElements);
+  }
+  return GS.getNode(DAG, SDLoc(VSN));
+}
+
+SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  // Just insert the scalar into element 0 of an undefined vector.
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                     Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
+                     Op.getOperand(0), DAG.getConstant(0, DL, MVT::i32));
+}
+
+SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
+                                          unsigned ByScalar) const {
+  // Look for cases where a vector shift can use the *_BY_SCALAR form.
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  unsigned ElemBitSize = VT.getVectorElementType().getSizeInBits();
+
+  // See whether the shift vector is a splat represented as BUILD_VECTOR.
+  if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op1)) {
+    APInt SplatBits, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    // Check for constant splats.  Use ElemBitSize as the minimum element
+    // width and reject splats that need wider elements.
+    if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             ElemBitSize, true) &&
+        SplatBitSize == ElemBitSize) {
+      SDValue Shift = DAG.getConstant(SplatBits.getZExtValue() & 0xfff,
+                                      DL, MVT::i32);
+      return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
+    }
+    // Check for variable splats.
+    BitVector UndefElements;
+    SDValue Splat = BVN->getSplatValue(&UndefElements);
+    if (Splat) {
+      // Since i32 is the smallest legal type, we either need a no-op
+      // or a truncation.
+      SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Splat);
+      return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
+    }
+  }
+
+  // See whether the shift vector is a splat represented as SHUFFLE_VECTOR,
+  // and the shift amount is directly available in a GPR.
+  if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(Op1)) {
+    if (VSN->isSplat()) {
+      SDValue VSNOp0 = VSN->getOperand(0);
+      unsigned Index = VSN->getSplatIndex();
+      assert(Index < VT.getVectorNumElements() &&
+             "Splat index should be defined and in first operand");
+      if ((Index == 0 && VSNOp0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+          VSNOp0.getOpcode() == ISD::BUILD_VECTOR) {
+        // Since i32 is the smallest legal type, we either need a no-op
+        // or a truncation.
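+        // For example (illustrative): if the splatted amount was an i64
+        // GPR value, the TRUNCATE below narrows it to i32; the
+        // *_BY_SCALAR instructions only use the low bits of the scalar
+        // amount anyway.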
+ SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, + VSNOp0.getOperand(Index)); + return DAG.getNode(ByScalar, DL, VT, Op0, Shift); + } + } + } + + // Otherwise just treat the current form as legal. + return Op; +} + SDValue SystemZTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -2737,6 +3760,12 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerOR(Op, DAG); case ISD::CTPOP: return lowerCTPOP(Op, DAG); + case ISD::CTLZ_ZERO_UNDEF: + return DAG.getNode(ISD::CTLZ, SDLoc(Op), + Op.getValueType(), Op.getOperand(0)); + case ISD::CTTZ_ZERO_UNDEF: + return DAG.getNode(ISD::CTTZ, SDLoc(Op), + Op.getValueType(), Op.getOperand(0)); case ISD::ATOMIC_SWAP: return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW); case ISD::ATOMIC_STORE: @@ -2773,6 +3802,18 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerPREFETCH(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return lowerINTRINSIC_W_CHAIN(Op, DAG); + case ISD::BUILD_VECTOR: + return lowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return lowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SCALAR_TO_VECTOR: + return lowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::SHL: + return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR); + case ISD::SRL: + return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR); + case ISD::SRA: + return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR); default: llvm_unreachable("Unexpected node to lower"); } @@ -2820,6 +3861,24 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(TBEGIN); OPCODE(TBEGIN_NOFLOAT); OPCODE(TEND); + OPCODE(BYTE_MASK); + OPCODE(ROTATE_MASK); + OPCODE(REPLICATE); + OPCODE(JOIN_DWORDS); + OPCODE(SPLAT); + OPCODE(MERGE_HIGH); + OPCODE(MERGE_LOW); + OPCODE(SHL_DOUBLE); + OPCODE(PERMUTE_DWORDS); + OPCODE(PERMUTE); + OPCODE(PACK); + OPCODE(VSHL_BY_SCALAR); + OPCODE(VSRL_BY_SCALAR); + OPCODE(VSRA_BY_SCALAR); + OPCODE(VSUM); + OPCODE(VICMPE); + OPCODE(VICMPH); + OPCODE(VICMPHL); OPCODE(ATOMIC_SWAPW); OPCODE(ATOMIC_LOADW_ADD); OPCODE(ATOMIC_LOADW_SUB); @@ -2838,6 +3897,157 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { #undef OPCODE } +// Return true if VT is a vector whose elements are a whole number of bytes +// in width. +static bool canTreatAsByteVector(EVT VT) { + return VT.isVector() && VT.getVectorElementType().getSizeInBits() % 8 == 0; +} + +// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT +// producing a result of type ResVT. Op is a possibly bitcast version +// of the input vector and Index is the index (based on type VecVT) that +// should be extracted. Return the new extraction if a simplification +// was possible or if Force is true. +SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT, + SDValue Op, unsigned Index, + DAGCombinerInfo &DCI, + bool Force) const { + SelectionDAG &DAG = DCI.DAG; + + // The number of bytes being extracted. + unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize(); + + for (;;) { + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::BITCAST) + // Look through bitcasts. + Op = Op.getOperand(0); + else if (Opcode == ISD::VECTOR_SHUFFLE && + canTreatAsByteVector(Op.getValueType())) { + // Get a VPERM-like permute mask and see whether the bytes covered + // by the extracted element are a contiguous sequence from one + // source operand. 
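+      // For example (illustrative): extracting i32 element 3 of a shuffle
+      // result looks at mask bytes 12-15; if those happen to be bytes 4-7
+      // of the second source operand, the extraction can be redirected to
+      // i32 element 1 of that operand instead.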
+      SmallVector<int, SystemZ::VectorBytes> Bytes;
+      getVPermMask(cast<ShuffleVectorSDNode>(Op), Bytes);
+      int First;
+      if (!getShuffleInput(Bytes, Index * BytesPerElement,
+                           BytesPerElement, First))
+        break;
+      if (First < 0)
+        return DAG.getUNDEF(ResVT);
+      // Make sure the contiguous sequence starts at a multiple of the
+      // original element size.
+      unsigned Byte = unsigned(First) % Bytes.size();
+      if (Byte % BytesPerElement != 0)
+        break;
+      // We can get the extracted value directly from an input.
+      Index = Byte / BytesPerElement;
+      Op = Op.getOperand(unsigned(First) / Bytes.size());
+      Force = true;
+    } else if (Opcode == ISD::BUILD_VECTOR &&
+               canTreatAsByteVector(Op.getValueType())) {
+      // We can only optimize this case if the BUILD_VECTOR elements are
+      // at least as wide as the extracted value.
+      EVT OpVT = Op.getValueType();
+      unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
+      if (OpBytesPerElement < BytesPerElement)
+        break;
+      // Make sure that the least-significant bit of the extracted value
+      // is the least significant bit of an input.
+      unsigned End = (Index + 1) * BytesPerElement;
+      if (End % OpBytesPerElement != 0)
+        break;
+      // We're extracting the low part of one operand of the BUILD_VECTOR.
+      Op = Op.getOperand(End / OpBytesPerElement - 1);
+      if (!Op.getValueType().isInteger()) {
+        EVT VT = MVT::getIntegerVT(Op.getValueType().getSizeInBits());
+        Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
+        DCI.AddToWorklist(Op.getNode());
+      }
+      EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits());
+      Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+      if (VT != ResVT) {
+        DCI.AddToWorklist(Op.getNode());
+        Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op);
+      }
+      return Op;
+    } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
+                Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+                Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
+               canTreatAsByteVector(Op.getValueType()) &&
+               canTreatAsByteVector(Op.getOperand(0).getValueType())) {
+      // Make sure that only the unextended bits are significant.
+      EVT ExtVT = Op.getValueType();
+      EVT OpVT = Op.getOperand(0).getValueType();
+      unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize();
+      unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
+      unsigned Byte = Index * BytesPerElement;
+      unsigned SubByte = Byte % ExtBytesPerElement;
+      unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement;
+      if (SubByte < MinSubByte ||
+          SubByte + BytesPerElement > ExtBytesPerElement)
+        break;
+      // Get the byte offset of the unextended element
+      Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
+      // ...then add the byte offset relative to that element.
+      Byte += SubByte - MinSubByte;
+      if (Byte % BytesPerElement != 0)
+        break;
+      Op = Op.getOperand(0);
+      Index = Byte / BytesPerElement;
+      Force = true;
+    } else
+      break;
+  }
+  if (Force) {
+    if (Op.getValueType() != VecVT) {
+      Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op);
+      DCI.AddToWorklist(Op.getNode());
+    }
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op,
+                       DAG.getConstant(Index, DL, MVT::i32));
+  }
+  return SDValue();
+}
+
+// Optimize vector operations in scalar value Op on the basis that Op
+// is truncated to TruncVT.
+SDValue
+SystemZTargetLowering::combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op,
+                                              DAGCombinerInfo &DCI) const {
+  // If we have (trunc (extract_vector_elt X, Y)), try to turn it into
+  // (extract_vector_elt (bitcast X), Y'), where (bitcast X) has elements
+  // of type TruncVT.
+  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      TruncVT.getSizeInBits() % 8 == 0) {
+    SDValue Vec = Op.getOperand(0);
+    EVT VecVT = Vec.getValueType();
+    if (canTreatAsByteVector(VecVT)) {
+      if (auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+        unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
+        unsigned TruncBytes = TruncVT.getStoreSize();
+        if (BytesPerElement % TruncBytes == 0) {
+          // Calculate the value of Y' in the above description.  We are
+          // splitting the original elements into Scale equal-sized pieces
+          // and for truncation purposes want the last (least-significant)
+          // of these pieces for IndexN.  This is easiest to do by calculating
+          // the start index of the following element and then subtracting 1.
+          unsigned Scale = BytesPerElement / TruncBytes;
+          unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1;
+
+          // Defer the creation of the bitcast from X to combineExtract,
+          // which might be able to optimize the extraction.
+          VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
+                                   VecVT.getStoreSize() / TruncBytes);
+          EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
+          return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
 SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -2869,6 +4079,40 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
   }
+  // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
+  // for the extraction to be done on a vMiN value, so that we can use VSTE.
+  // If X has wider elements then convert it to:
+  // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
+  if (Opcode == ISD::STORE) {
+    auto *SN = cast<StoreSDNode>(N);
+    EVT MemVT = SN->getMemoryVT();
+    if (MemVT.isInteger()) {
+      SDValue Value = combineTruncateExtract(SDLoc(N), MemVT,
+                                             SN->getValue(), DCI);
+      if (Value.getNode()) {
+        DCI.AddToWorklist(Value.getNode());
+
+        // Rewrite the store with the new form of stored value.
+        return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value,
+                                 SN->getBasePtr(), SN->getMemoryVT(),
+                                 SN->getMemOperand());
+      }
+    }
+  }
+  // Try to simplify a vector extraction.
+  if (Opcode == ISD::EXTRACT_VECTOR_ELT) {
+    if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      SDValue Op0 = N->getOperand(0);
+      EVT VecVT = Op0.getValueType();
+      return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Op0,
+                            IndexN->getZExtValue(), DCI, false);
+    }
+  }
+  // (join_dwords X, X) == (replicate X)
+  if (Opcode == SystemZISD::JOIN_DWORDS &&
+      N->getOperand(0) == N->getOperand(1))
+    return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
+                       N->getOperand(0));
   return SDValue();
 }
 
@@ -3681,11 +4925,18 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI,
     }
   }
 
-  // Add FPR clobbers.
+  // Add FPR/VR clobbers.
   if (!NoFloat && (Control & 4) != 0) {
-    for (int I = 0; I < 16; I++) {
-      unsigned Reg = SystemZMC::FP64Regs[I];
-      MI->addOperand(MachineOperand::CreateReg(Reg, true, true));
+    if (Subtarget.hasVector()) {
+      for (int I = 0; I < 32; I++) {
+        unsigned Reg = SystemZMC::VR128Regs[I];
+        MI->addOperand(MachineOperand::CreateReg(Reg, true, true));
+      }
+    } else {
+      for (int I = 0; I < 16; I++) {
+        unsigned Reg = SystemZMC::FP64Regs[I];
+        MI->addOperand(MachineOperand::CreateReg(Reg, true, true));
+      }
     }
   }
 
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 56d7ef45568..4b7d5908946 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -155,6 +155,70 @@ enum {
   // Transaction end.  Just the chain operand.  Returns chain and glue.
   TEND,
 
+  // Create a vector constant by filling byte N of the result with bit
+  // 15-N of the single operand.
+  BYTE_MASK,
+
+  // Create a vector constant by replicating an element-sized RISBG-style mask.
+  // The first operand specifies the starting set bit and the second operand
+  // specifies the ending set bit.  Both operands count from the MSB of the
+  // element.
+  ROTATE_MASK,
+
+  // Replicate a GPR scalar value into all elements of a vector.
+  REPLICATE,
+
+  // Create a vector from two i64 GPRs.
+  JOIN_DWORDS,
+
+  // Replicate one element of a vector into all elements.  The first operand
+  // is the vector and the second is the index of the element to replicate.
+  SPLAT,
+
+  // Interleave elements from the high half of operand 0 and the high half
+  // of operand 1.
+  MERGE_HIGH,
+
+  // Likewise for the low halves.
+  MERGE_LOW,
+
+  // Concatenate the vectors in the first two operands, shift them left
+  // by the third operand, and take the first half of the result.
+  SHL_DOUBLE,
+
+  // Take one element of the first v2i64 operand and one element of the
+  // second v2i64 operand and concatenate them to form a v2i64 result.
+  // The third operand is a 4-bit value of the form 0A0B, where A and B
+  // are the element selectors for the first and second operands
+  // respectively.
+  PERMUTE_DWORDS,
+
+  // Perform a general vector permute on vector operands 0 and 1.
+  // Each byte of operand 2 controls the corresponding byte of the result,
+  // in the same way as a byte-level VECTOR_SHUFFLE mask.
+  PERMUTE,
+
+  // Pack vector operands 0 and 1 into a single vector with half-sized elements.
+  PACK,
+
+  // Shift each element of vector operand 0 by the number of bits specified
+  // by scalar operand 1.
+  VSHL_BY_SCALAR,
+  VSRL_BY_SCALAR,
+  VSRA_BY_SCALAR,
+
+  // For each element of the output type, sum across all sub-elements of
+  // operand 0 belonging to the corresponding element, and add in the
+  // rightmost sub-element of the corresponding element of operand 1.
+  VSUM,
+
+  // Compare integer vector operands 0 and 1 to produce the usual 0/-1
+  // vector result.  VICMPE is for equality, VICMPH for "signed greater than"
+  // and VICMPHL for "unsigned greater than".
+  VICMPE,
+  VICMPH,
+  VICMPHL,
+
   // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
   // ATOMIC_LOAD_<op>.
   //
@@ -222,6 +286,11 @@ public:
   MVT getScalarShiftAmountTy(EVT LHSTy) const override {
     return MVT::i32;
   }
+  MVT getVectorIdxTy() const override {
+    // Only the lower 12 bits of an element index are used, so we don't
+    // want to clobber the upper 32 bits of a GPR unnecessarily.
+    return MVT::i32;
+  }
   EVT getSetCCResultType(LLVMContext &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
@@ -328,6 +397,16 @@ private:
   SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+
+  SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
+                         unsigned Index, DAGCombinerInfo &DCI,
+                         bool Force) const;
+  SDValue combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op,
+                                 DAGCombinerInfo &DCI) const;
 
   // If the last instruction before MBBI in MBB was some form of COMPARE,
   // try to replace it with a COMPARE AND BRANCH just before MBBI.
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 2c87871cdca..d7bfc12b938 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2414,6 +2414,10 @@ class BinaryAliasRIL
 
+// An alias of a BinaryVRRf, but with different register sizes.
+class BinaryAliasVRRf<RegisterOperand cls>
+  : Alias<6, (outs VR128:$V1), (ins cls:$R2, cls:$R3), []>;
+
 // An alias of a CompareRI, but with different register sizes.
 class CompareAliasRI
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 3a028594fa4..63101a9d000 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -578,6 +578,8 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     Opcode = SystemZ::LDR;
   else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::LXR;
+  else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::VLR;
   else
     llvm_unreachable("Impossible reg-to-reg copy");
 
@@ -1116,6 +1118,10 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
   } else if (RC == &SystemZ::FP128BitRegClass) {
     LoadOpcode = SystemZ::LX;
     StoreOpcode = SystemZ::STX;
+  } else if (RC == &SystemZ::VF128BitRegClass ||
+             RC == &SystemZ::VR128BitRegClass) {
+    LoadOpcode = SystemZ::VL;
+    StoreOpcode = SystemZ::VST;
   } else
     llvm_unreachable("Unsupported regclass to load or store");
 }
@@ -1185,6 +1191,7 @@ static bool isStringOfOnes(uint64_t Mask, unsigned &LSB, unsigned &Length) {
 bool SystemZInstrInfo::isRxSBGMask(uint64_t Mask, unsigned BitSize,
                                    unsigned &Start, unsigned &End) const {
   // Reject trivial all-zero masks.
+  Mask &= allOnes(BitSize);
   if (Mask == 0)
     return false;
 
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index 650cae0b35d..d94725b7913 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -19,18 +19,34 @@ let Predicates = [FeatureVector] in {
   def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>;
   def VLGVH : BinaryVRSc<"vlgvh", 0xE721, null_frag, v128h, 1>;
   def VLGVF : BinaryVRSc<"vlgvf", 0xE721, null_frag, v128f, 2>;
-  def VLGVG : BinaryVRSc<"vlgvg", 0xE721, null_frag, v128g, 3>;
+  def VLGVG : BinaryVRSc<"vlgvg", 0xE721, z_vector_extract, v128g, 3>;
 
   // Load VR element from GR.
-  def VLVGB : TernaryVRSb<"vlvgb", 0xE722, null_frag, v128b, v128b, GR32, 0>;
-  def VLVGH : TernaryVRSb<"vlvgh", 0xE722, null_frag, v128h, v128h, GR32, 1>;
-  def VLVGF : TernaryVRSb<"vlvgf", 0xE722, null_frag, v128f, v128f, GR32, 2>;
-  def VLVGG : TernaryVRSb<"vlvgg", 0xE722, null_frag, v128g, v128g, GR64, 3>;
+  def VLVGB : TernaryVRSb<"vlvgb", 0xE722, z_vector_insert,
+                          v128b, v128b, GR32, 0>;
+  def VLVGH : TernaryVRSb<"vlvgh", 0xE722, z_vector_insert,
+                          v128h, v128h, GR32, 1>;
+  def VLVGF : TernaryVRSb<"vlvgf", 0xE722, z_vector_insert,
+                          v128f, v128f, GR32, 2>;
+  def VLVGG : TernaryVRSb<"vlvgg", 0xE722, z_vector_insert,
+                          v128g, v128g, GR64, 3>;
 
   // Load VR from GRs disjoint.
-  def VLVGP : BinaryVRRf<"vlvgp", 0xE762, null_frag, v128g>;
+  def VLVGP : BinaryVRRf<"vlvgp", 0xE762, z_join_dwords, v128g>;
+  def VLVGP32 : BinaryAliasVRRf<GR32>;
 }
 
+// Extractions always assign to the full GR64, even if the element would
+// fit in the lower 32 bits.  Sub-i64 extracts therefore need to take a
+// subreg of the result.
+class VectorExtractSubreg<ValueType type, Instruction insn>
+  : Pat<(i32 (z_vector_extract (type VR128:$vec), shift12only:$index)),
+        (EXTRACT_SUBREG (insn VR128:$vec, shift12only:$index), subreg_l32)>;
+
+def : VectorExtractSubreg<v16i8, VLGVB>;
+def : VectorExtractSubreg<v8i16, VLGVH>;
+def : VectorExtractSubreg<v4i32, VLGVF>;
+
 //===----------------------------------------------------------------------===//
 // Immediate instructions
 //===----------------------------------------------------------------------===//
@@ -39,29 +55,38 @@ let Predicates = [FeatureVector] in {
   // Generate byte mask.
   def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
   def VONE : InherentVRIa<"vone", 0xE744, 0xffff>;
-  def VGBM : UnaryVRIa<"vgbm", 0xE744, null_frag, v128b, imm32zx16>;
+  def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>;
 
   // Generate mask.
-  def VGMB : BinaryVRIb<"vgmb", 0xE746, null_frag, v128b, 0>;
-  def VGMH : BinaryVRIb<"vgmh", 0xE746, null_frag, v128h, 1>;
-  def VGMF : BinaryVRIb<"vgmf", 0xE746, null_frag, v128f, 2>;
-  def VGMG : BinaryVRIb<"vgmg", 0xE746, null_frag, v128g, 3>;
+  def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>;
+  def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>;
+  def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>;
+  def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>;
 
   // Load element immediate.
-  def VLEIB : TernaryVRIa<"vleib", 0xE740, null_frag,
-                          v128b, v128b, imm32sx16trunc, imm32zx4>;
-  def VLEIH : TernaryVRIa<"vleih", 0xE741, null_frag,
-                          v128h, v128h, imm32sx16trunc, imm32zx3>;
-  def VLEIF : TernaryVRIa<"vleif", 0xE743, null_frag,
-                          v128f, v128f, imm32sx16, imm32zx2>;
-  def VLEIG : TernaryVRIa<"vleig", 0xE742, null_frag,
-                          v128g, v128g, imm64sx16, imm32zx1>;
+  //
+  // We want these instructions to be used ahead of VLVG* where possible.
+  // However, VLVG* takes a variable BD-format index whereas VLEI takes
+  // a plain immediate index.  This means that VLVG* has an extra "base"
+  // register operand and is 3 units more complex.  Bumping the complexity
+  // of the VLEI* instructions by 4 means that they are strictly better
+  // than VLVG* in cases where both forms match.
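+  //
+  // For example (illustrative): inserting the constant 1 into element 2
+  // of a v4i32 matches both VLEIF directly and, via a separate LHI, the
+  // VLVGF pattern; the complexity bump makes the single
+  // "vleif %v24, 1, 2" win.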
+ let AddedComplexity = 4 in { + def VLEIB : TernaryVRIa<"vleib", 0xE740, z_vector_insert, + v128b, v128b, imm32sx16trunc, imm32zx4>; + def VLEIH : TernaryVRIa<"vleih", 0xE741, z_vector_insert, + v128h, v128h, imm32sx16trunc, imm32zx3>; + def VLEIF : TernaryVRIa<"vleif", 0xE743, z_vector_insert, + v128f, v128f, imm32sx16, imm32zx2>; + def VLEIG : TernaryVRIa<"vleig", 0xE742, z_vector_insert, + v128g, v128g, imm64sx16, imm32zx1>; + } // Replicate immediate. - def VREPIB : UnaryVRIa<"vrepib", 0xE745, null_frag, v128b, imm32sx16, 0>; - def VREPIH : UnaryVRIa<"vrepih", 0xE745, null_frag, v128h, imm32sx16, 1>; - def VREPIF : UnaryVRIa<"vrepif", 0xE745, null_frag, v128f, imm32sx16, 2>; - def VREPIG : UnaryVRIa<"vrepig", 0xE745, null_frag, v128g, imm32sx16, 3>; + def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>; + def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>; + def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>; + def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>; } //===----------------------------------------------------------------------===// @@ -89,28 +114,45 @@ let Predicates = [FeatureVector] in { def VLM : LoadMultipleVRSa<"vlm", 0xE736>; // Load and replicate - def VLREPB : UnaryVRX<"vlrepb", 0xE705, null_frag, v128b, 1, 0>; - def VLREPH : UnaryVRX<"vlreph", 0xE705, null_frag, v128h, 2, 1>; - def VLREPF : UnaryVRX<"vlrepf", 0xE705, null_frag, v128f, 4, 2>; - def VLREPG : UnaryVRX<"vlrepg", 0xE705, null_frag, v128g, 8, 3>; + def VLREPB : UnaryVRX<"vlrepb", 0xE705, z_replicate_loadi8, v128b, 1, 0>; + def VLREPH : UnaryVRX<"vlreph", 0xE705, z_replicate_loadi16, v128h, 2, 1>; + def VLREPF : UnaryVRX<"vlrepf", 0xE705, z_replicate_loadi32, v128f, 4, 2>; + def VLREPG : UnaryVRX<"vlrepg", 0xE705, z_replicate_loadi64, v128g, 8, 3>; // Load logical element and zero. - def VLLEZB : UnaryVRX<"vllezb", 0xE704, null_frag, v128b, 1, 0>; - def VLLEZH : UnaryVRX<"vllezh", 0xE704, null_frag, v128h, 2, 1>; - def VLLEZF : UnaryVRX<"vllezf", 0xE704, null_frag, v128f, 4, 2>; - def VLLEZG : UnaryVRX<"vllezg", 0xE704, null_frag, v128g, 8, 3>; + def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>; + def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>; + def VLLEZF : UnaryVRX<"vllezf", 0xE704, z_vllezi32, v128f, 4, 2>; + def VLLEZG : UnaryVRX<"vllezg", 0xE704, z_vllezi64, v128g, 8, 3>; // Load element. - def VLEB : TernaryVRX<"vleb", 0xE700, null_frag, v128b, v128b, 1, imm32zx4>; - def VLEH : TernaryVRX<"vleh", 0xE701, null_frag, v128h, v128h, 2, imm32zx3>; - def VLEF : TernaryVRX<"vlef", 0xE703, null_frag, v128f, v128f, 4, imm32zx2>; - def VLEG : TernaryVRX<"vleg", 0xE702, null_frag, v128g, v128g, 8, imm32zx1>; + def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>; + def VLEH : TernaryVRX<"vleh", 0xE701, z_vlei16, v128h, v128h, 2, imm32zx3>; + def VLEF : TernaryVRX<"vlef", 0xE703, z_vlei32, v128f, v128f, 4, imm32zx2>; + def VLEG : TernaryVRX<"vleg", 0xE702, z_vlei64, v128g, v128g, 8, imm32zx1>; // Gather element. def VGEF : TernaryVRV<"vgef", 0xE713, 4, imm32zx2>; def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>; } +// Use replicating loads if we're inserting a single element into an +// undefined vector. This avoids a false dependency on the previous +// register contents. 
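+//
+// For example (a sketch in the style of the new tests):
+//   %val = load i32, i32 *%ptr
+//   %vec = insertelement <4 x i32> undef, i32 %val, i32 0
+// becomes a single "vlrepf %v24, 0(%r2)" rather than a VLEF into an
+// undefined register.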
+multiclass ReplicatePeephole<ValueType vectype, Instruction vlrep,
+                             SDPatternOperator load, ValueType scalartype> {
+  def : Pat<(vectype (z_vector_insert
+                      (undef), (scalartype (load bdxaddr12only:$addr)), 0)),
+            (vlrep bdxaddr12only:$addr)>;
+  def : Pat<(vectype (scalar_to_vector
+                      (scalartype (load bdxaddr12only:$addr)))),
+            (vlrep bdxaddr12only:$addr)>;
+}
+defm : ReplicatePeephole<v16i8, VLREPB, anyextloadi8, i32>;
+defm : ReplicatePeephole<v8i16, VLREPH, anyextloadi16, i32>;
+defm : ReplicatePeephole<v4i32, VLREPF, load, i32>;
+defm : ReplicatePeephole<v2i64, VLREPG, load, i64>;
+
 //===----------------------------------------------------------------------===//
 // Stores
 //===----------------------------------------------------------------------===//
@@ -126,10 +168,10 @@ let Predicates = [FeatureVector] in {
   def VSTM : StoreMultipleVRSa<"vstm", 0xE73E>;
 
   // Store element.
-  def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, null_frag, v128b, 1, imm32zx4>;
-  def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, null_frag, v128h, 2, imm32zx3>;
-  def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, null_frag, v128f, 4, imm32zx2>;
-  def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, null_frag, v128g, 8, imm32zx1>;
+  def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, z_vstei8, v128b, 1, imm32zx4>;
+  def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, z_vstei16, v128h, 2, imm32zx3>;
+  def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, z_vstei32, v128f, 4, imm32zx2>;
+  def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, z_vstei64, v128g, 8, imm32zx1>;
 
   // Scatter element.
   def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
@@ -142,28 +184,28 @@
 let Predicates = [FeatureVector] in {
   // Merge high.
-  def VMRHB : BinaryVRRc<"vmrhb", 0xE761, null_frag, v128b, v128b, 0>;
-  def VMRHH : BinaryVRRc<"vmrhh", 0xE761, null_frag, v128h, v128h, 1>;
-  def VMRHF : BinaryVRRc<"vmrhf", 0xE761, null_frag, v128f, v128f, 2>;
-  def VMRHG : BinaryVRRc<"vmrhg", 0xE761, null_frag, v128g, v128g, 3>;
+  def VMRHB : BinaryVRRc<"vmrhb", 0xE761, z_merge_high, v128b, v128b, 0>;
+  def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>;
+  def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>;
+  def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
 
   // Merge low.
-  def VMRLB : BinaryVRRc<"vmrlb", 0xE760, null_frag, v128b, v128b, 0>;
-  def VMRLH : BinaryVRRc<"vmrlh", 0xE760, null_frag, v128h, v128h, 1>;
-  def VMRLF : BinaryVRRc<"vmrlf", 0xE760, null_frag, v128f, v128f, 2>;
-  def VMRLG : BinaryVRRc<"vmrlg", 0xE760, null_frag, v128g, v128g, 3>;
+  def VMRLB : BinaryVRRc<"vmrlb", 0xE760, z_merge_low, v128b, v128b, 0>;
+  def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>;
+  def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>;
+  def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>;
 
   // Permute.
-  def VPERM : TernaryVRRe<"vperm", 0xE78C, null_frag, v128b, v128b>;
+  def VPERM : TernaryVRRe<"vperm", 0xE78C, z_permute, v128b, v128b>;
 
   // Permute doubleword immediate.
-  def VPDI : TernaryVRRc<"vpdi", 0xE784, null_frag, v128b, v128b>;
+  def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>;
 
   // Replicate.
-  def VREPB : BinaryVRIc<"vrepb", 0xE74D, null_frag, v128b, v128b, 0>;
-  def VREPH : BinaryVRIc<"vreph", 0xE74D, null_frag, v128h, v128h, 1>;
-  def VREPF : BinaryVRIc<"vrepf", 0xE74D, null_frag, v128f, v128f, 2>;
-  def VREPG : BinaryVRIc<"vrepg", 0xE74D, null_frag, v128g, v128g, 3>;
+  def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>;
+  def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>;
+  def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>;
+  def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>;
 
   // Select.
   def VSEL : TernaryVRRe<"vsel", 0xE78D, null_frag, v128any, v128any>;
@@ -175,9 +217,9 @@ let Predicates = [FeatureVector] in {
 
 let Predicates = [FeatureVector] in {
   // Pack
-  def VPKH : BinaryVRRc<"vpkh", 0xE794, null_frag, v128b, v128h, 1>;
-  def VPKF : BinaryVRRc<"vpkf", 0xE794, null_frag, v128h, v128f, 2>;
-  def VPKG : BinaryVRRc<"vpkg", 0xE794, null_frag, v128f, v128g, 3>;
+  def VPKH : BinaryVRRc<"vpkh", 0xE794, z_pack, v128b, v128h, 1>;
+  def VPKF : BinaryVRRc<"vpkf", 0xE794, z_pack, v128h, v128f, 2>;
+  def VPKG : BinaryVRRc<"vpkg", 0xE794, z_pack, v128f, v128g, 3>;
 
   // Pack saturate.
   defm VPKSH : BinaryVRRbSPair<"vpksh", 0xE797, null_frag, null_frag,
@@ -196,9 +238,12 @@ let Predicates = [FeatureVector] in {
                                v128f, v128g, 3>;
 
   // Sign-extend to doubleword.
-  def VSEGB : UnaryVRRa<"vsegb", 0xE75F, null_frag, v128g, v128b, 0>;
-  def VSEGH : UnaryVRRa<"vsegh", 0xE75F, null_frag, v128g, v128h, 1>;
-  def VSEGF : UnaryVRRa<"vsegf", 0xE75F, null_frag, v128g, v128f, 2>;
+  def VSEGB : UnaryVRRa<"vsegb", 0xE75F, z_vsei8, v128g, v128g, 0>;
+  def VSEGH : UnaryVRRa<"vsegh", 0xE75F, z_vsei16, v128g, v128g, 1>;
+  def VSEGF : UnaryVRRa<"vsegf", 0xE75F, z_vsei32, v128g, v128g, 2>;
+  def : Pat<(z_vsei8_by_parts (v16i8 VR128:$src)), (VSEGB VR128:$src)>;
+  def : Pat<(z_vsei16_by_parts (v8i16 VR128:$src)), (VSEGH VR128:$src)>;
+  def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>;
 
   // Unpack high.
   def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>;
@@ -221,16 +266,38 @@ let Predicates = [FeatureVector] in {
   def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, null_frag, v128g, v128f, 2>;
 }
 
+//===----------------------------------------------------------------------===//
+// Instantiating generic operations for specific types.
+//===----------------------------------------------------------------------===//
+
+multiclass GenericVectorOps<ValueType type, ValueType inttype> {
+  let Predicates = [FeatureVector] in {
+    def : Pat<(type (load bdxaddr12only:$addr)),
+              (VL bdxaddr12only:$addr)>;
+    def : Pat<(store (type VR128:$src), bdxaddr12only:$addr),
+              (VST VR128:$src, bdxaddr12only:$addr)>;
+    def : Pat<(type (vselect (inttype VR128:$x), VR128:$y, VR128:$z)),
+              (VSEL VR128:$y, VR128:$z, VR128:$x)>;
+    def : Pat<(type (vselect (inttype (z_vnot VR128:$x)), VR128:$y, VR128:$z)),
+              (VSEL VR128:$z, VR128:$y, VR128:$x)>;
+  }
+}
+
+defm : GenericVectorOps<v16i8, v16i8>;
+defm : GenericVectorOps<v8i16, v8i16>;
+defm : GenericVectorOps<v4i32, v4i32>;
+defm : GenericVectorOps<v2i64, v2i64>;
+
 //===----------------------------------------------------------------------===//
 // Integer arithmetic
 //===----------------------------------------------------------------------===//
 
 let Predicates = [FeatureVector] in {
   // Add.
- def VAB : BinaryVRRc<"vab", 0xE7F3, null_frag, v128b, v128b, 0>; - def VAH : BinaryVRRc<"vah", 0xE7F3, null_frag, v128h, v128h, 1>; - def VAF : BinaryVRRc<"vaf", 0xE7F3, null_frag, v128f, v128f, 2>; - def VAG : BinaryVRRc<"vag", 0xE7F3, null_frag, v128g, v128g, 3>; + def VAB : BinaryVRRc<"vab", 0xE7F3, add, v128b, v128b, 0>; + def VAH : BinaryVRRc<"vah", 0xE7F3, add, v128h, v128h, 1>; + def VAF : BinaryVRRc<"vaf", 0xE7F3, add, v128f, v128f, 2>; + def VAG : BinaryVRRc<"vag", 0xE7F3, add, v128g, v128g, 3>; def VAQ : BinaryVRRc<"vaq", 0xE7F3, null_frag, v128q, v128q, 4>; // Add compute carry. @@ -268,16 +335,16 @@ let Predicates = [FeatureVector] in { def VCKSM : BinaryVRRc<"vcksm", 0xE766, null_frag, v128any, v128any>; // Count leading zeros. - def VCLZB : UnaryVRRa<"vclzb", 0xE753, null_frag, v128b, v128b, 0>; - def VCLZH : UnaryVRRa<"vclzh", 0xE753, null_frag, v128h, v128h, 1>; - def VCLZF : UnaryVRRa<"vclzf", 0xE753, null_frag, v128f, v128f, 2>; - def VCLZG : UnaryVRRa<"vclzg", 0xE753, null_frag, v128g, v128g, 3>; + def VCLZB : UnaryVRRa<"vclzb", 0xE753, ctlz, v128b, v128b, 0>; + def VCLZH : UnaryVRRa<"vclzh", 0xE753, ctlz, v128h, v128h, 1>; + def VCLZF : UnaryVRRa<"vclzf", 0xE753, ctlz, v128f, v128f, 2>; + def VCLZG : UnaryVRRa<"vclzg", 0xE753, ctlz, v128g, v128g, 3>; // Count trailing zeros. - def VCTZB : UnaryVRRa<"vctzb", 0xE752, null_frag, v128b, v128b, 0>; - def VCTZH : UnaryVRRa<"vctzh", 0xE752, null_frag, v128h, v128h, 1>; - def VCTZF : UnaryVRRa<"vctzf", 0xE752, null_frag, v128f, v128f, 2>; - def VCTZG : UnaryVRRa<"vctzg", 0xE752, null_frag, v128g, v128g, 3>; + def VCTZB : UnaryVRRa<"vctzb", 0xE752, cttz, v128b, v128b, 0>; + def VCTZH : UnaryVRRa<"vctzh", 0xE752, cttz, v128h, v128h, 1>; + def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>; + def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>; // Exclusive or. def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>; @@ -295,16 +362,16 @@ let Predicates = [FeatureVector] in { def VGFMAG : TernaryVRRd<"vgfmag", 0xE7BC, null_frag, v128g, v128g, 3>; // Load complement. - def VLCB : UnaryVRRa<"vlcb", 0xE7DE, null_frag, v128b, v128b, 0>; - def VLCH : UnaryVRRa<"vlch", 0xE7DE, null_frag, v128h, v128h, 1>; - def VLCF : UnaryVRRa<"vlcf", 0xE7DE, null_frag, v128f, v128f, 2>; - def VLCG : UnaryVRRa<"vlcg", 0xE7DE, null_frag, v128g, v128g, 3>; + def VLCB : UnaryVRRa<"vlcb", 0xE7DE, z_vneg, v128b, v128b, 0>; + def VLCH : UnaryVRRa<"vlch", 0xE7DE, z_vneg, v128h, v128h, 1>; + def VLCF : UnaryVRRa<"vlcf", 0xE7DE, z_vneg, v128f, v128f, 2>; + def VLCG : UnaryVRRa<"vlcg", 0xE7DE, z_vneg, v128g, v128g, 3>; // Load positive. - def VLPB : UnaryVRRa<"vlpb", 0xE7DF, null_frag, v128b, v128b, 0>; - def VLPH : UnaryVRRa<"vlph", 0xE7DF, null_frag, v128h, v128h, 1>; - def VLPF : UnaryVRRa<"vlpf", 0xE7DF, null_frag, v128f, v128f, 2>; - def VLPG : UnaryVRRa<"vlpg", 0xE7DF, null_frag, v128g, v128g, 3>; + def VLPB : UnaryVRRa<"vlpb", 0xE7DF, z_viabs8, v128b, v128b, 0>; + def VLPH : UnaryVRRa<"vlph", 0xE7DF, z_viabs16, v128h, v128h, 1>; + def VLPF : UnaryVRRa<"vlpf", 0xE7DF, z_viabs32, v128f, v128f, 2>; + def VLPG : UnaryVRRa<"vlpg", 0xE7DF, z_viabs64, v128g, v128g, 3>; // Maximum. def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>; @@ -331,9 +398,9 @@ let Predicates = [FeatureVector] in { def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>; // Multiply and add low. 
- def VMALB : TernaryVRRd<"vmalb", 0xE7AA, null_frag, v128b, v128b, 0>; - def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, null_frag, v128h, v128h, 1>; - def VMALF : TernaryVRRd<"vmalf", 0xE7AA, null_frag, v128f, v128f, 2>; + def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd, v128b, v128b, 0>; + def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>; + def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd, v128f, v128f, 2>; // Multiply and add high. def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, null_frag, v128b, v128b, 0>; @@ -376,9 +443,9 @@ let Predicates = [FeatureVector] in { def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, null_frag, v128f, v128f, 2>; // Multiply low. - def VMLB : BinaryVRRc<"vmlb", 0xE7A2, null_frag, v128b, v128b, 0>; - def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, null_frag, v128h, v128h, 1>; - def VMLF : BinaryVRRc<"vmlf", 0xE7A2, null_frag, v128f, v128f, 2>; + def VMLB : BinaryVRRc<"vmlb", 0xE7A2, mul, v128b, v128b, 0>; + def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, mul, v128h, v128h, 1>; + def VMLF : BinaryVRRc<"vmlf", 0xE7A2, mul, v128f, v128f, 2>; // Multiply even. def VMEB : BinaryVRRc<"vmeb", 0xE7A6, null_frag, v128h, v128b, 0>; @@ -408,6 +475,7 @@ let Predicates = [FeatureVector] in { // Population count. def VPOPCT : BinaryVRRa<"vpopct", 0xE750>; + def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>; // Element rotate left logical (with vector shift amount). def VERLLVB : BinaryVRRc<"verllvb", 0xE773, null_frag, v128b, v128b, 0>; @@ -428,40 +496,40 @@ let Predicates = [FeatureVector] in { def VERIMG : QuaternaryVRId<"verimg", 0xE772, null_frag, v128g, v128g, 3>; // Element shift left (with vector shift amount). - def VESLVB : BinaryVRRc<"veslvb", 0xE770, null_frag, v128b, v128b, 0>; - def VESLVH : BinaryVRRc<"veslvh", 0xE770, null_frag, v128h, v128h, 1>; - def VESLVF : BinaryVRRc<"veslvf", 0xE770, null_frag, v128f, v128f, 2>; - def VESLVG : BinaryVRRc<"veslvg", 0xE770, null_frag, v128g, v128g, 3>; + def VESLVB : BinaryVRRc<"veslvb", 0xE770, z_vshl, v128b, v128b, 0>; + def VESLVH : BinaryVRRc<"veslvh", 0xE770, z_vshl, v128h, v128h, 1>; + def VESLVF : BinaryVRRc<"veslvf", 0xE770, z_vshl, v128f, v128f, 2>; + def VESLVG : BinaryVRRc<"veslvg", 0xE770, z_vshl, v128g, v128g, 3>; // Element shift left (with scalar shift amount). - def VESLB : BinaryVRSa<"veslb", 0xE730, null_frag, v128b, v128b, 0>; - def VESLH : BinaryVRSa<"veslh", 0xE730, null_frag, v128h, v128h, 1>; - def VESLF : BinaryVRSa<"veslf", 0xE730, null_frag, v128f, v128f, 2>; - def VESLG : BinaryVRSa<"veslg", 0xE730, null_frag, v128g, v128g, 3>; + def VESLB : BinaryVRSa<"veslb", 0xE730, z_vshl_by_scalar, v128b, v128b, 0>; + def VESLH : BinaryVRSa<"veslh", 0xE730, z_vshl_by_scalar, v128h, v128h, 1>; + def VESLF : BinaryVRSa<"veslf", 0xE730, z_vshl_by_scalar, v128f, v128f, 2>; + def VESLG : BinaryVRSa<"veslg", 0xE730, z_vshl_by_scalar, v128g, v128g, 3>; // Element shift right arithmetic (with vector shift amount). 
- def VESRAVB : BinaryVRRc<"vesravb", 0xE77A, null_frag, v128b, v128b, 0>; - def VESRAVH : BinaryVRRc<"vesravh", 0xE77A, null_frag, v128h, v128h, 1>; - def VESRAVF : BinaryVRRc<"vesravf", 0xE77A, null_frag, v128f, v128f, 2>; - def VESRAVG : BinaryVRRc<"vesravg", 0xE77A, null_frag, v128g, v128g, 3>; + def VESRAVB : BinaryVRRc<"vesravb", 0xE77A, z_vsra, v128b, v128b, 0>; + def VESRAVH : BinaryVRRc<"vesravh", 0xE77A, z_vsra, v128h, v128h, 1>; + def VESRAVF : BinaryVRRc<"vesravf", 0xE77A, z_vsra, v128f, v128f, 2>; + def VESRAVG : BinaryVRRc<"vesravg", 0xE77A, z_vsra, v128g, v128g, 3>; // Element shift right arithmetic (with scalar shift amount). - def VESRAB : BinaryVRSa<"vesrab", 0xE73A, null_frag, v128b, v128b, 0>; - def VESRAH : BinaryVRSa<"vesrah", 0xE73A, null_frag, v128h, v128h, 1>; - def VESRAF : BinaryVRSa<"vesraf", 0xE73A, null_frag, v128f, v128f, 2>; - def VESRAG : BinaryVRSa<"vesrag", 0xE73A, null_frag, v128g, v128g, 3>; + def VESRAB : BinaryVRSa<"vesrab", 0xE73A, z_vsra_by_scalar, v128b, v128b, 0>; + def VESRAH : BinaryVRSa<"vesrah", 0xE73A, z_vsra_by_scalar, v128h, v128h, 1>; + def VESRAF : BinaryVRSa<"vesraf", 0xE73A, z_vsra_by_scalar, v128f, v128f, 2>; + def VESRAG : BinaryVRSa<"vesrag", 0xE73A, z_vsra_by_scalar, v128g, v128g, 3>; // Element shift right logical (with vector shift amount). - def VESRLVB : BinaryVRRc<"vesrlvb", 0xE778, null_frag, v128b, v128b, 0>; - def VESRLVH : BinaryVRRc<"vesrlvh", 0xE778, null_frag, v128h, v128h, 1>; - def VESRLVF : BinaryVRRc<"vesrlvf", 0xE778, null_frag, v128f, v128f, 2>; - def VESRLVG : BinaryVRRc<"vesrlvg", 0xE778, null_frag, v128g, v128g, 3>; + def VESRLVB : BinaryVRRc<"vesrlvb", 0xE778, z_vsrl, v128b, v128b, 0>; + def VESRLVH : BinaryVRRc<"vesrlvh", 0xE778, z_vsrl, v128h, v128h, 1>; + def VESRLVF : BinaryVRRc<"vesrlvf", 0xE778, z_vsrl, v128f, v128f, 2>; + def VESRLVG : BinaryVRRc<"vesrlvg", 0xE778, z_vsrl, v128g, v128g, 3>; // Element shift right logical (with scalar shift amount). - def VESRLB : BinaryVRSa<"vesrlb", 0xE738, null_frag, v128b, v128b, 0>; - def VESRLH : BinaryVRSa<"vesrlh", 0xE738, null_frag, v128h, v128h, 1>; - def VESRLF : BinaryVRSa<"vesrlf", 0xE738, null_frag, v128f, v128f, 2>; - def VESRLG : BinaryVRSa<"vesrlg", 0xE738, null_frag, v128g, v128g, 3>; + def VESRLB : BinaryVRSa<"vesrlb", 0xE738, z_vsrl_by_scalar, v128b, v128b, 0>; + def VESRLH : BinaryVRSa<"vesrlh", 0xE738, z_vsrl_by_scalar, v128h, v128h, 1>; + def VESRLF : BinaryVRSa<"vesrlf", 0xE738, z_vsrl_by_scalar, v128f, v128f, 2>; + def VESRLG : BinaryVRSa<"vesrlg", 0xE738, z_vsrl_by_scalar, v128g, v128g, 3>; // Shift left. def VSL : BinaryVRRc<"vsl", 0xE774, null_frag, v128b, v128b>; @@ -470,7 +538,7 @@ let Predicates = [FeatureVector] in { def VSLB : BinaryVRRc<"vslb", 0xE775, null_frag, v128b, v128b>; // Shift left double by byte. - def VSLDB : TernaryVRId<"vsldb", 0xE777, null_frag, v128b, v128b, 0>; + def VSLDB : TernaryVRId<"vsldb", 0xE777, z_shl_double, v128b, v128b, 0>; // Shift right arithmetic. def VSRA : BinaryVRRc<"vsra", 0xE77E, null_frag, v128b, v128b>; @@ -485,10 +553,10 @@ let Predicates = [FeatureVector] in { def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, null_frag, v128b, v128b>; // Subtract. 
-  def VSB : BinaryVRRc<"vsb", 0xE7F7, null_frag, v128b, v128b, 0>;
-  def VSH : BinaryVRRc<"vsh", 0xE7F7, null_frag, v128h, v128h, 1>;
-  def VSF : BinaryVRRc<"vsf", 0xE7F7, null_frag, v128f, v128f, 2>;
-  def VSG : BinaryVRRc<"vsg", 0xE7F7, null_frag, v128g, v128g, 3>;
+  def VSB : BinaryVRRc<"vsb", 0xE7F7, sub, v128b, v128b, 0>;
+  def VSH : BinaryVRRc<"vsh", 0xE7F7, sub, v128h, v128h, 1>;
+  def VSF : BinaryVRRc<"vsf", 0xE7F7, sub, v128f, v128f, 2>;
+  def VSG : BinaryVRRc<"vsg", 0xE7F7, sub, v128g, v128g, 3>;
   def VSQ : BinaryVRRc<"vsq", 0xE7F7, null_frag, v128q, v128q, 4>;
 
   // Subtract compute borrow indication.
@@ -505,18 +573,107 @@ let Predicates = [FeatureVector] in {
   def VSBCBIQ : TernaryVRRd<"vsbcbiq", 0xE7BD, null_frag, v128q, v128q, 4>;
 
   // Sum across doubleword.
-  def VSUMGH : BinaryVRRc<"vsumgh", 0xE765, null_frag, v128g, v128h, 1>;
-  def VSUMGF : BinaryVRRc<"vsumgf", 0xE765, null_frag, v128g, v128f, 2>;
+  def VSUMGH : BinaryVRRc<"vsumgh", 0xE765, z_vsum, v128g, v128h, 1>;
+  def VSUMGF : BinaryVRRc<"vsumgf", 0xE765, z_vsum, v128g, v128f, 2>;
 
   // Sum across quadword.
-  def VSUMQF : BinaryVRRc<"vsumqf", 0xE767, null_frag, v128q, v128f, 2>;
-  def VSUMQG : BinaryVRRc<"vsumqg", 0xE767, null_frag, v128q, v128g, 3>;
+  def VSUMQF : BinaryVRRc<"vsumqf", 0xE767, z_vsum, v128q, v128f, 2>;
+  def VSUMQG : BinaryVRRc<"vsumqg", 0xE767, z_vsum, v128q, v128g, 3>;
 
   // Sum across word.
-  def VSUMB : BinaryVRRc<"vsumb", 0xE764, null_frag, v128f, v128b, 0>;
-  def VSUMH : BinaryVRRc<"vsumh", 0xE764, null_frag, v128f, v128h, 1>;
+  def VSUMB : BinaryVRRc<"vsumb", 0xE764, z_vsum, v128f, v128b, 0>;
+  def VSUMH : BinaryVRRc<"vsumh", 0xE764, z_vsum, v128f, v128h, 1>;
+}
+
+// Instantiate the bitwise ops for type TYPE.
+multiclass BitwiseVectorOps<ValueType type> {
+  let Predicates = [FeatureVector] in {
+    def : Pat<(type (and VR128:$x, VR128:$y)), (VN VR128:$x, VR128:$y)>;
+    def : Pat<(type (and VR128:$x, (z_vnot VR128:$y))),
+              (VNC VR128:$x, VR128:$y)>;
+    def : Pat<(type (or VR128:$x, VR128:$y)), (VO VR128:$x, VR128:$y)>;
+    def : Pat<(type (xor VR128:$x, VR128:$y)), (VX VR128:$x, VR128:$y)>;
+    def : Pat<(type (or (and VR128:$x, VR128:$z),
+                        (and VR128:$y, (z_vnot VR128:$z)))),
+              (VSEL VR128:$x, VR128:$y, VR128:$z)>;
+    def : Pat<(type (z_vnot (or VR128:$x, VR128:$y))),
+              (VNO VR128:$x, VR128:$y)>;
+    def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>;
+  }
+}
+
+defm : BitwiseVectorOps<v16i8>;
+defm : BitwiseVectorOps<v8i16>;
+defm : BitwiseVectorOps<v4i32>;
+defm : BitwiseVectorOps<v2i64>;
+
+// Instantiate additional patterns for absolute-related expressions on
+// type TYPE.  LC is the negate instruction for TYPE and LP is the absolute
+// instruction.
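+//
+// For example (illustrative): for v4i32, "x > 0 ? -x : x" (a vselect of
+// the negation under a greater-than-zero compare) matches the first
+// pattern below and becomes "vlpf %v0, %v24; vlcf %v24, %v0", i.e. the
+// negated absolute value.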
+multiclass IntegerAbsoluteVectorOps<ValueType type, Instruction lc,
+                                    Instruction lp, int shift> {
+  let Predicates = [FeatureVector] in {
+    def : Pat<(type (vselect (type (z_vicmph_zero VR128:$x)),
+                             (z_vneg VR128:$x), VR128:$x)),
+              (lc (lp VR128:$x))>;
+    def : Pat<(type (vselect (type (z_vnot (z_vicmph_zero VR128:$x))),
+                             VR128:$x, (z_vneg VR128:$x))),
+              (lc (lp VR128:$x))>;
+    def : Pat<(type (vselect (type (z_vicmpl_zero VR128:$x)),
+                             VR128:$x, (z_vneg VR128:$x))),
+              (lc (lp VR128:$x))>;
+    def : Pat<(type (vselect (type (z_vnot (z_vicmpl_zero VR128:$x))),
+                             (z_vneg VR128:$x), VR128:$x)),
+              (lc (lp VR128:$x))>;
+    def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)),
+                             (z_vneg VR128:$x)),
+                        (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))),
+                             VR128:$x))),
+              (lp VR128:$x)>;
+    def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)),
+                             VR128:$x),
+                        (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))),
+                             (z_vneg VR128:$x)))),
+              (lc (lp VR128:$x))>;
+  }
+}
+
+defm : IntegerAbsoluteVectorOps<v16i8, VLCB, VLPB, 7>;
+defm : IntegerAbsoluteVectorOps<v8i16, VLCH, VLPH, 15>;
+defm : IntegerAbsoluteVectorOps<v4i32, VLCF, VLPF, 31>;
+defm : IntegerAbsoluteVectorOps<v2i64, VLCG, VLPG, 63>;
+
+// Instantiate minimum- and maximum-related patterns for TYPE.  CMPH is the
+// signed or unsigned "set if greater than" comparison instruction and
+// MIN and MAX are the associated minimum and maximum instructions.
+multiclass IntegerMinMaxVectorOps<ValueType type, SDPatternOperator cmph,
+                                  Instruction min, Instruction max> {
+  let Predicates = [FeatureVector] in {
+    def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$x, VR128:$y)),
+              (max VR128:$x, VR128:$y)>;
+    def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$y, VR128:$x)),
+              (min VR128:$x, VR128:$y)>;
+    def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
+                             VR128:$x, VR128:$y)),
+              (min VR128:$x, VR128:$y)>;
+    def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
+                             VR128:$y, VR128:$x)),
+              (max VR128:$x, VR128:$y)>;
+  }
+}
+
+// Signed min/max.
+defm : IntegerMinMaxVectorOps<v16i8, z_vicmph, VMNB, VMXB>;
+defm : IntegerMinMaxVectorOps<v8i16, z_vicmph, VMNH, VMXH>;
+defm : IntegerMinMaxVectorOps<v4i32, z_vicmph, VMNF, VMXF>;
+defm : IntegerMinMaxVectorOps<v2i64, z_vicmph, VMNG, VMXG>;
+
+// Unsigned min/max.
+defm : IntegerMinMaxVectorOps<v16i8, z_vicmphl, VMNLB, VMXLB>;
+defm : IntegerMinMaxVectorOps<v8i16, z_vicmphl, VMNLH, VMXLH>;
+defm : IntegerMinMaxVectorOps<v4i32, z_vicmphl, VMNLF, VMXLF>;
+defm : IntegerMinMaxVectorOps<v2i64, z_vicmphl, VMNLG, VMXLG>;
+
 //===----------------------------------------------------------------------===//
 // Integer comparison
 //===----------------------------------------------------------------------===//
@@ -539,33 +696,33 @@ let Predicates = [FeatureVector] in {
   }
 
   // Compare equal.
-  defm VCEQB : BinaryVRRbSPair<"vceqb", 0xE7F8, null_frag, null_frag,
+  defm VCEQB : BinaryVRRbSPair<"vceqb", 0xE7F8, z_vicmpe, null_frag,
                                v128b, v128b, 0>;
-  defm VCEQH : BinaryVRRbSPair<"vceqh", 0xE7F8, null_frag, null_frag,
+  defm VCEQH : BinaryVRRbSPair<"vceqh", 0xE7F8, z_vicmpe, null_frag,
                                v128h, v128h, 1>;
-  defm VCEQF : BinaryVRRbSPair<"vceqf", 0xE7F8, null_frag, null_frag,
+  defm VCEQF : BinaryVRRbSPair<"vceqf", 0xE7F8, z_vicmpe, null_frag,
                                v128f, v128f, 2>;
-  defm VCEQG : BinaryVRRbSPair<"vceqg", 0xE7F8, null_frag, null_frag,
+  defm VCEQG : BinaryVRRbSPair<"vceqg", 0xE7F8, z_vicmpe, null_frag,
                                v128g, v128g, 3>;
 
   // Compare high.
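+  // These produce the usual 0/-1 mask, so an IR "icmp sgt" whose result
+  // feeds a select can become a single VCH* followed by VSEL (illustrative;
+  // the exact selection depends on the surrounding code).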
-  defm VCHB : BinaryVRRbSPair<"vchb", 0xE7FB, null_frag, null_frag,
+  defm VCHB : BinaryVRRbSPair<"vchb", 0xE7FB, z_vicmph, null_frag,
                               v128b, v128b, 0>;
-  defm VCHH : BinaryVRRbSPair<"vchh", 0xE7FB, null_frag, null_frag,
+  defm VCHH : BinaryVRRbSPair<"vchh", 0xE7FB, z_vicmph, null_frag,
                               v128h, v128h, 1>;
-  defm VCHF : BinaryVRRbSPair<"vchf", 0xE7FB, null_frag, null_frag,
+  defm VCHF : BinaryVRRbSPair<"vchf", 0xE7FB, z_vicmph, null_frag,
                               v128f, v128f, 2>;
-  defm VCHG : BinaryVRRbSPair<"vchg", 0xE7FB, null_frag, null_frag,
+  defm VCHG : BinaryVRRbSPair<"vchg", 0xE7FB, z_vicmph, null_frag,
                               v128g, v128g, 3>;
 
   // Compare high logical.
-  defm VCHLB : BinaryVRRbSPair<"vchlb", 0xE7F9, null_frag, null_frag,
+  defm VCHLB : BinaryVRRbSPair<"vchlb", 0xE7F9, z_vicmphl, null_frag,
                                v128b, v128b, 0>;
-  defm VCHLH : BinaryVRRbSPair<"vchlh", 0xE7F9, null_frag, null_frag,
+  defm VCHLH : BinaryVRRbSPair<"vchlh", 0xE7F9, z_vicmphl, null_frag,
                                v128h, v128h, 1>;
-  defm VCHLF : BinaryVRRbSPair<"vchlf", 0xE7F9, null_frag, null_frag,
+  defm VCHLF : BinaryVRRbSPair<"vchlf", 0xE7F9, z_vicmphl, null_frag,
                                v128f, v128f, 2>;
-  defm VCHLG : BinaryVRRbSPair<"vchlg", 0xE7F9, null_frag, null_frag,
+  defm VCHLG : BinaryVRRbSPair<"vchlg", 0xE7F9, z_vicmphl, null_frag,
                                v128g, v128g, 3>;
 
   // Test under mask.
@@ -685,6 +842,44 @@ let Predicates = [FeatureVector] in {
                               v64g, v64db, 3, 8>;
 }
 
+//===----------------------------------------------------------------------===//
+// Conversions
+//===----------------------------------------------------------------------===//
+
+def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Replicating scalars
+//===----------------------------------------------------------------------===//
+
+// Define patterns for replicating a scalar GR32 into a vector of type TYPE.
+// INDEX is 8 divided by the element size in bytes, minus 1.
+class VectorReplicateScalar<ValueType type, Instruction insn, bits<16> index>
+  : Pat<(type (z_replicate GR32:$scalar)),
+        (insn (VLVGP32 GR32:$scalar, GR32:$scalar), index)>;
+
+def : VectorReplicateScalar<v16i8, VREPB, 7>;
+def : VectorReplicateScalar<v8i16, VREPH, 3>;
+def : VectorReplicateScalar<v4i32, VREPF, 1>;
+
+// i64 replications are just a single instruction.
+def : Pat<(v2i64 (z_replicate GR64:$scalar)), + (VLVGP GR64:$scalar, GR64:$scalar)>; + //===----------------------------------------------------------------------===// // String instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 3151052ecf5..2e431859a86 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -82,6 +82,45 @@ def SDT_ZPrefetch : SDTypeProfile<0, 2, def SDT_ZTBegin : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; +def SDT_ZInsertVectorElt : SDTypeProfile<1, 3, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisVT<3, i32>]>; +def SDT_ZExtractVectorElt : SDTypeProfile<1, 2, + [SDTCisVec<1>, + SDTCisVT<2, i32>]>; +def SDT_ZReplicate : SDTypeProfile<1, 1, + [SDTCisVec<0>]>; +def SDT_ZVecBinary : SDTypeProfile<1, 2, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def SDT_ZVecBinaryInt : SDTypeProfile<1, 2, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def SDT_ZVecBinaryConv : SDTypeProfile<1, 2, + [SDTCisVec<0>, + SDTCisVec<1>, + SDTCisSameAs<1, 2>]>; +def SDT_ZRotateMask : SDTypeProfile<1, 2, + [SDTCisVec<0>, + SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; +def SDT_ZJoinDwords : SDTypeProfile<1, 2, + [SDTCisVT<0, v2i64>, + SDTCisVT<1, i64>, + SDTCisVT<2, i64>]>; +def SDT_ZVecTernary : SDTypeProfile<1, 3, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def SDT_ZVecTernaryInt : SDTypeProfile<1, 3, + [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisVT<3, i32>]>; //===----------------------------------------------------------------------===// // Node definitions @@ -134,6 +173,34 @@ def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>; def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone, [SDNPHasChain, SDNPMayStore]>; +// Defined because the index is an i32 rather than a pointer. 
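+// For example (illustrative): with an i32 index, extracting element 2 of
+// a v4i32 matches (z_vector_extract ..., 2) and selects directly to
+// "vlgvf", with no extension of the index to i64 first.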
+def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
+                             SDT_ZInsertVectorElt>;
+def z_vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
+                              SDT_ZExtractVectorElt>;
+def z_byte_mask : SDNode<"SystemZISD::BYTE_MASK", SDT_ZReplicate>;
+def z_rotate_mask : SDNode<"SystemZISD::ROTATE_MASK", SDT_ZRotateMask>;
+def z_replicate : SDNode<"SystemZISD::REPLICATE", SDT_ZReplicate>;
+def z_join_dwords : SDNode<"SystemZISD::JOIN_DWORDS", SDT_ZJoinDwords>;
+def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>;
+def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>;
+def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>;
+def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>;
+def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
+                              SDT_ZVecTernaryInt>;
+def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
+def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
+def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR",
+                              SDT_ZVecBinaryInt>;
+def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR",
+                              SDT_ZVecBinaryInt>;
+def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR",
+                              SDT_ZVecBinaryInt>;
+def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>;
+def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>;
+def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>;
+def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>;
+
 class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
   : SDNode<"SystemZISD::"##name, profile,
            [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
@@ -192,6 +259,10 @@ def z_tbegin_nofloat : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin,
 def z_tend : SDNode<"SystemZISD::TEND", SDTNone,
                     [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
 
+def z_vshl : SDNode<"ISD::SHL", SDT_ZVecBinary>;
+def z_vsra : SDNode<"ISD::SRA", SDT_ZVecBinary>;
+def z_vsrl : SDNode<"ISD::SRL", SDT_ZVecBinary>;
+
 //===----------------------------------------------------------------------===//
 // Pattern fragments
 //===----------------------------------------------------------------------===//
@@ -215,11 +286,21 @@ def sext8 : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>;
 def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>;
 def sext32 : PatFrag<(ops node:$src), (sext (i32 node:$src))>;
 
+// Match extensions of an i32 to an i64, followed by an in-register sign
+// extension from a sub-i32 value.
+def sext8dbl : PatFrag<(ops node:$src), (sext8 (anyext node:$src))>;
+def sext16dbl : PatFrag<(ops node:$src), (sext16 (anyext node:$src))>;
+
 // Register zero-extend operations.  Sub-32-bit values are represented as i32s.
 def zext8 : PatFrag<(ops node:$src), (and node:$src, 0xff)>;
 def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>;
 def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
 
+// Match extensions of an i32 to an i64, followed by an AND of the low
+// i8 or i16 part.
+def zext8dbl : PatFrag<(ops node:$src), (zext8 (anyext node:$src))>;
+def zext16dbl : PatFrag<(ops node:$src), (zext16 (anyext node:$src))>;
+
 // Typed floating-point loads.
 def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>;
 def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>;
@@ -383,6 +464,10 @@ def z_iabs64 : PatFrag<(ops node:$src),
 def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>;
 def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
 
+// Integer multiply-and-add
+def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                       (add (mul node:$src1, node:$src2), node:$src3)>;
+
 // Fused multiply-add and multiply-subtract, but with the order of the
 // operands matching SystemZ's MA and MS instructions.
 def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -403,3 +488,88 @@ class loadu<SDPatternOperator operator, SDPatternOperator load = load>
 class storeu<SDPatternOperator operator, SDPatternOperator store = store>
   : PatFrag<(ops node:$value, node:$addr),
             (store (operator node:$value), node:$addr)>;
+
+// Vector representation of all-zeros and all-ones.
+def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>;
+def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>;
+
+// Load a scalar and replicate it in all elements of a vector.
+class z_replicate_load<ValueType scalartype, SDPatternOperator load>
+  : PatFrag<(ops node:$addr),
+            (z_replicate (scalartype (load node:$addr)))>;
+def z_replicate_loadi8  : z_replicate_load<i32, anyextloadi8>;
+def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>;
+def z_replicate_loadi32 : z_replicate_load<i32, load>;
+def z_replicate_loadi64 : z_replicate_load<i64, load>;
+
+// Load a scalar and insert it into a single element of a vector.
+class z_vle<ValueType scalartype, SDPatternOperator load>
  : PatFrag<(ops node:$vec, node:$addr, node:$index),
+            (z_vector_insert node:$vec, (scalartype (load node:$addr)),
+                             node:$index)>;
+def z_vlei8  : z_vle<i32, anyextloadi8>;
+def z_vlei16 : z_vle<i32, anyextloadi16>;
+def z_vlei32 : z_vle<i32, load>;
+def z_vlei64 : z_vle<i64, load>;
+
+// Load a scalar and insert it into the low element of the high i64 of a
+// zeroed vector.
+class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
+  : PatFrag<(ops node:$addr),
+            (z_vector_insert (z_vzero),
+                             (scalartype (load node:$addr)), (i32 index))>;
+def z_vllezi8  : z_vllez<i32, anyextloadi8, 7>;
+def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
+def z_vllezi32 : z_vllez<i32, load, 1>;
+def z_vllezi64 : PatFrag<(ops node:$addr),
+                         (z_join_dwords (i64 (load node:$addr)), (i64 0))>;
+
+// Store one element of a vector.
+class z_vste<ValueType scalartype, SDPatternOperator store>
+  : PatFrag<(ops node:$vec, node:$addr, node:$index),
+            (store (scalartype (z_vector_extract node:$vec, node:$index)),
+                   node:$addr)>;
+def z_vstei8  : z_vste<i32, truncstorei8>;
+def z_vstei16 : z_vste<i32, truncstorei16>;
+def z_vstei32 : z_vste<i32, store>;
+def z_vstei64 : z_vste<i64, store>;
+
+// Arithmetic negation on vectors.
+def z_vneg : PatFrag<(ops node:$x), (sub (z_vzero), node:$x)>;
+
+// Bitwise negation on vectors.
+def z_vnot : PatFrag<(ops node:$x), (xor node:$x, (z_vones))>;
+
+// Signed "integer greater than zero" on vectors.
+def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, (z_vzero))>;
+
+// Signed "integer less than zero" on vectors.
+def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph (z_vzero), node:$x)>;
+
+// Integer absolute on vectors.
+class z_viabs<int shift>
+  : PatFrag<(ops node:$src),
+            (xor (add node:$src, (z_vsra_by_scalar node:$src, (i32 shift))),
+                 (z_vsra_by_scalar node:$src, (i32 shift)))>;
+def z_viabs8  : z_viabs<7>;
+def z_viabs16 : z_viabs<15>;
+def z_viabs32 : z_viabs<31>;
+def z_viabs64 : z_viabs<63>;
+
+// Sign-extend the i64 elements of a vector.
+class z_vse<int shift>
+  : PatFrag<(ops node:$src),
+            (z_vsra_by_scalar (z_vshl_by_scalar node:$src, shift), shift)>;
+def z_vsei8  : z_vse<56>;
+def z_vsei16 : z_vse<48>;
+def z_vsei32 : z_vse<32>;
+
+// ...and again with the extensions being done on individual i64 scalars.
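+// For example (illustrative): sign-extending the low i32 of each i64
+// element, written in IR as two extractelements, two sexts and a rebuild
+// of the vector, matches z_vsei32_by_parts below and becomes a single
+// "vsegf".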
+class z_vse_by_parts<SDPatternOperator operator, int index1, int index2>
+  : PatFrag<(ops node:$src),
+            (z_join_dwords
+             (operator (z_vector_extract node:$src, index1)),
+             (operator (z_vector_extract node:$src, index2)))>;
+def z_vsei8_by_parts : z_vse_by_parts<sext8dbl, 7, 15>;
+def z_vsei16_by_parts : z_vse_by_parts<sext16dbl, 3, 7>;
+def z_vsei32_by_parts : z_vse_by_parts<sext32, 1, 3>;
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index b2f8175579f..a34cdaf8030 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -21,15 +21,70 @@ extern "C" void LLVMInitializeSystemZTarget() {
   RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget);
 }
 
+// Determine whether we use the vector ABI.
+static bool UsesVectorABI(StringRef CPU, StringRef FS) {
+  // We use the vector ABI whenever the vector facility is available.
+  // This is the case by default if CPU is z13 or later, and can be
+  // overridden via "[+-]vector" feature string elements.
+  bool VectorABI = true;
+  if (CPU.empty() || CPU == "generic" ||
+      CPU == "z10" || CPU == "z196" || CPU == "zEC12")
+    VectorABI = false;
+
+  SmallVector<StringRef, 3> Features;
+  FS.split(Features, ",", -1, false /* KeepEmpty */);
+  for (auto &Feature : Features) {
+    if (Feature == "vector" || Feature == "+vector")
+      VectorABI = true;
+    if (Feature == "-vector")
+      VectorABI = false;
+  }
+
+  return VectorABI;
+}
+
+static std::string computeDataLayout(StringRef TT, StringRef CPU,
+                                     StringRef FS) {
+  const Triple Triple(TT);
+  bool VectorABI = UsesVectorABI(CPU, FS);
+  std::string Ret = "";
+
+  // Big endian.
+  Ret += "E";
+
+  // Data mangling.
+  Ret += DataLayout::getManglingComponent(Triple);
+
+  // Make sure that global data has at least 16 bits of alignment by
+  // default, so that we can refer to it using LARL. We don't have any
+  // special requirements for stack variables though.
+  Ret += "-i1:8:16-i8:8:16";
+
+  // 64-bit integers are naturally aligned.
+  Ret += "-i64:64";
+
+  // 128-bit floats are aligned only to 64 bits.
+  Ret += "-f128:64";
+
+  // When using the vector ABI, 128-bit vectors are also aligned to 64 bits.
+  if (VectorABI)
+    Ret += "-v128:64";
+
+  // We prefer 16 bits of alignment for all globals; see above.
+  Ret += "-a:8:16";
+
+  // Integer registers are 32 or 64 bits.
+  Ret += "-n32:64";
+
+  return Ret;
+}
+
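+// For example, with the vector facility this produces
+//   "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+// on ELF targets, while without it the "-v128:64" component is omitted
+// and the result matches the string previously hardcoded below.
+
 SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT,
                                            StringRef CPU, StringRef FS,
                                            const TargetOptions &Options,
                                            Reloc::Model RM, CodeModel::Model CM,
                                            CodeGenOpt::Level OL)
-    // Make sure that global data has at least 16 bits of alignment by
-    // default, so that we can refer to it using LARL. We don't have any
-    // special requirements for stack variables though.
-    : LLVMTargetMachine(T, "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64",
+    : LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS),
                         TT, CPU, FS, Options, RM, CM, OL),
       TLOF(make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 3337f6388bd..5a87df1976c 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -238,3 +238,21 @@ SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
 
   return TTI::PSK_Software;
 }
+
+unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
+  if (!Vector)
+    // Discount the stack pointer. Also leave out %r0, since it can't
+    // be used in an address.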
+ return 14; + if (ST->hasVector()) + return 32; + return 0; +} + +unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) { + if (!Vector) + return 64; + if (ST->hasVector()) + return 128; + return 0; +} + diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index d4989130679..e9cabe968ee 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -63,6 +63,14 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + + /// @} }; } // end namespace llvm diff --git a/test/CodeGen/SystemZ/frame-19.ll b/test/CodeGen/SystemZ/frame-19.ll new file mode 100644 index 00000000000..f6e327c3ae3 --- /dev/null +++ b/test/CodeGen/SystemZ/frame-19.ll @@ -0,0 +1,314 @@ +; Test spilling of vector registers. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; We need to allocate a 16-byte spill slot and save the 8 call-saved FPRs. +; The frame size should be exactly 160 + 16 + 8 * 8 = 240. +define void @f1(<16 x i8> *%ptr) { +; CHECK-LABEL: f1: +; CHECK: aghi %r15, -240 +; CHECK-DAG: std %f8, +; CHECK-DAG: std %f9, +; CHECK-DAG: std %f10, +; CHECK-DAG: std %f11, +; CHECK-DAG: std %f12, +; CHECK-DAG: std %f13, +; CHECK-DAG: std %f14, +; CHECK-DAG: std %f15, +; CHECK: vst {{%v[0-9]+}}, 160(%r15) +; CHECK: vl {{%v[0-9]+}}, 160(%r15) +; CHECK-DAG: ld %f8, +; CHECK-DAG: ld %f9, +; CHECK-DAG: ld %f10, +; CHECK-DAG: ld %f11, +; CHECK-DAG: ld %f12, +; CHECK-DAG: ld %f13, +; CHECK-DAG: ld %f14, +; CHECK-DAG: ld %f15, +; CHECK: aghi %r15, 240 +; CHECK: br %r14 + %v0 = load volatile <16 x i8>, <16 x i8> *%ptr + %v1 = load volatile <16 x i8>, <16 x i8> *%ptr + %v2 = load volatile <16 x i8>, <16 x i8> *%ptr + %v3 = load volatile <16 x i8>, <16 x i8> *%ptr + %v4 = load volatile <16 x i8>, <16 x i8> *%ptr + %v5 = load volatile <16 x i8>, <16 x i8> *%ptr + %v6 = load volatile <16 x i8>, <16 x i8> *%ptr + %v7 = load volatile <16 x i8>, <16 x i8> *%ptr + %v8 = load volatile <16 x i8>, <16 x i8> *%ptr + %v9 = load volatile <16 x i8>, <16 x i8> *%ptr + %v10 = load volatile <16 x i8>, <16 x i8> *%ptr + %v11 = load volatile <16 x i8>, <16 x i8> *%ptr + %v12 = load volatile <16 x i8>, <16 x i8> *%ptr + %v13 = load volatile <16 x i8>, <16 x i8> *%ptr + %v14 = load volatile <16 x i8>, <16 x i8> *%ptr + %v15 = load volatile <16 x i8>, <16 x i8> *%ptr + %v16 = load volatile <16 x i8>, <16 x i8> *%ptr + %v17 = load volatile <16 x i8>, <16 x i8> *%ptr + %v18 = load volatile <16 x i8>, <16 x i8> *%ptr + %v19 = load volatile <16 x i8>, <16 x i8> *%ptr + %v20 = load volatile <16 x i8>, <16 x i8> *%ptr + %v21 = load volatile <16 x i8>, <16 x i8> *%ptr + %v22 = load volatile <16 x i8>, <16 x i8> *%ptr + %v23 = load volatile <16 x i8>, <16 x i8> *%ptr + %v24 = load volatile <16 x i8>, <16 x i8> *%ptr + %v25 = load volatile <16 x i8>, <16 x i8> *%ptr + %v26 = load volatile <16 x i8>, <16 x i8> *%ptr + %v27 = load volatile <16 x i8>, <16 x i8> *%ptr + %v28 = load volatile <16 x i8>, <16 x i8> *%ptr + %v29 = load volatile <16 x i8>, <16 x i8> *%ptr + %v30 = load volatile <16 x i8>, <16 x i8> *%ptr + %v31 = load volatile <16 x i8>, <16 x i8> *%ptr + %vx = load volatile <16 x i8>, <16 x i8> *%ptr + store volatile <16 x i8> %vx, <16 x i8> *%ptr + store volatile <16 x i8> %v31, <16 x i8> *%ptr + store volatile <16 x i8> %v30, <16 x i8> *%ptr + store 
volatile <16 x i8> %v29, <16 x i8> *%ptr + store volatile <16 x i8> %v28, <16 x i8> *%ptr + store volatile <16 x i8> %v27, <16 x i8> *%ptr + store volatile <16 x i8> %v26, <16 x i8> *%ptr + store volatile <16 x i8> %v25, <16 x i8> *%ptr + store volatile <16 x i8> %v24, <16 x i8> *%ptr + store volatile <16 x i8> %v23, <16 x i8> *%ptr + store volatile <16 x i8> %v22, <16 x i8> *%ptr + store volatile <16 x i8> %v21, <16 x i8> *%ptr + store volatile <16 x i8> %v20, <16 x i8> *%ptr + store volatile <16 x i8> %v19, <16 x i8> *%ptr + store volatile <16 x i8> %v18, <16 x i8> *%ptr + store volatile <16 x i8> %v17, <16 x i8> *%ptr + store volatile <16 x i8> %v16, <16 x i8> *%ptr + store volatile <16 x i8> %v15, <16 x i8> *%ptr + store volatile <16 x i8> %v14, <16 x i8> *%ptr + store volatile <16 x i8> %v13, <16 x i8> *%ptr + store volatile <16 x i8> %v12, <16 x i8> *%ptr + store volatile <16 x i8> %v11, <16 x i8> *%ptr + store volatile <16 x i8> %v10, <16 x i8> *%ptr + store volatile <16 x i8> %v9, <16 x i8> *%ptr + store volatile <16 x i8> %v8, <16 x i8> *%ptr + store volatile <16 x i8> %v7, <16 x i8> *%ptr + store volatile <16 x i8> %v6, <16 x i8> *%ptr + store volatile <16 x i8> %v5, <16 x i8> *%ptr + store volatile <16 x i8> %v4, <16 x i8> *%ptr + store volatile <16 x i8> %v3, <16 x i8> *%ptr + store volatile <16 x i8> %v2, <16 x i8> *%ptr + store volatile <16 x i8> %v1, <16 x i8> *%ptr + store volatile <16 x i8> %v0, <16 x i8> *%ptr + ret void +} + +; Like f1, but no 16-byte slot should be needed. +define void @f2(<16 x i8> *%ptr) { +; CHECK-LABEL: f2: +; CHECK: aghi %r15, -224 +; CHECK-DAG: std %f8, +; CHECK-DAG: std %f9, +; CHECK-DAG: std %f10, +; CHECK-DAG: std %f11, +; CHECK-DAG: std %f12, +; CHECK-DAG: std %f13, +; CHECK-DAG: std %f14, +; CHECK-DAG: std %f15, +; CHECK-NOT: vst {{.*}}(%r15) +; CHECK-NOT: vl {{.*}}(%r15) +; CHECK-DAG: ld %f8, +; CHECK-DAG: ld %f9, +; CHECK-DAG: ld %f10, +; CHECK-DAG: ld %f11, +; CHECK-DAG: ld %f12, +; CHECK-DAG: ld %f13, +; CHECK-DAG: ld %f14, +; CHECK-DAG: ld %f15, +; CHECK: aghi %r15, 224 +; CHECK: br %r14 + %v0 = load volatile <16 x i8>, <16 x i8> *%ptr + %v1 = load volatile <16 x i8>, <16 x i8> *%ptr + %v2 = load volatile <16 x i8>, <16 x i8> *%ptr + %v3 = load volatile <16 x i8>, <16 x i8> *%ptr + %v4 = load volatile <16 x i8>, <16 x i8> *%ptr + %v5 = load volatile <16 x i8>, <16 x i8> *%ptr + %v6 = load volatile <16 x i8>, <16 x i8> *%ptr + %v7 = load volatile <16 x i8>, <16 x i8> *%ptr + %v8 = load volatile <16 x i8>, <16 x i8> *%ptr + %v9 = load volatile <16 x i8>, <16 x i8> *%ptr + %v10 = load volatile <16 x i8>, <16 x i8> *%ptr + %v11 = load volatile <16 x i8>, <16 x i8> *%ptr + %v12 = load volatile <16 x i8>, <16 x i8> *%ptr + %v13 = load volatile <16 x i8>, <16 x i8> *%ptr + %v14 = load volatile <16 x i8>, <16 x i8> *%ptr + %v15 = load volatile <16 x i8>, <16 x i8> *%ptr + %v16 = load volatile <16 x i8>, <16 x i8> *%ptr + %v17 = load volatile <16 x i8>, <16 x i8> *%ptr + %v18 = load volatile <16 x i8>, <16 x i8> *%ptr + %v19 = load volatile <16 x i8>, <16 x i8> *%ptr + %v20 = load volatile <16 x i8>, <16 x i8> *%ptr + %v21 = load volatile <16 x i8>, <16 x i8> *%ptr + %v22 = load volatile <16 x i8>, <16 x i8> *%ptr + %v23 = load volatile <16 x i8>, <16 x i8> *%ptr + %v24 = load volatile <16 x i8>, <16 x i8> *%ptr + %v25 = load volatile <16 x i8>, <16 x i8> *%ptr + %v26 = load volatile <16 x i8>, <16 x i8> *%ptr + %v27 = load volatile <16 x i8>, <16 x i8> *%ptr + %v28 = load volatile <16 x i8>, <16 x i8> *%ptr + %v29 = load volatile <16 x i8>, <16 
x i8> *%ptr + %v30 = load volatile <16 x i8>, <16 x i8> *%ptr + %v31 = load volatile <16 x i8>, <16 x i8> *%ptr + store volatile <16 x i8> %v31, <16 x i8> *%ptr + store volatile <16 x i8> %v30, <16 x i8> *%ptr + store volatile <16 x i8> %v29, <16 x i8> *%ptr + store volatile <16 x i8> %v28, <16 x i8> *%ptr + store volatile <16 x i8> %v27, <16 x i8> *%ptr + store volatile <16 x i8> %v26, <16 x i8> *%ptr + store volatile <16 x i8> %v25, <16 x i8> *%ptr + store volatile <16 x i8> %v24, <16 x i8> *%ptr + store volatile <16 x i8> %v23, <16 x i8> *%ptr + store volatile <16 x i8> %v22, <16 x i8> *%ptr + store volatile <16 x i8> %v21, <16 x i8> *%ptr + store volatile <16 x i8> %v20, <16 x i8> *%ptr + store volatile <16 x i8> %v19, <16 x i8> *%ptr + store volatile <16 x i8> %v18, <16 x i8> *%ptr + store volatile <16 x i8> %v17, <16 x i8> *%ptr + store volatile <16 x i8> %v16, <16 x i8> *%ptr + store volatile <16 x i8> %v15, <16 x i8> *%ptr + store volatile <16 x i8> %v14, <16 x i8> *%ptr + store volatile <16 x i8> %v13, <16 x i8> *%ptr + store volatile <16 x i8> %v12, <16 x i8> *%ptr + store volatile <16 x i8> %v11, <16 x i8> *%ptr + store volatile <16 x i8> %v10, <16 x i8> *%ptr + store volatile <16 x i8> %v9, <16 x i8> *%ptr + store volatile <16 x i8> %v8, <16 x i8> *%ptr + store volatile <16 x i8> %v7, <16 x i8> *%ptr + store volatile <16 x i8> %v6, <16 x i8> *%ptr + store volatile <16 x i8> %v5, <16 x i8> *%ptr + store volatile <16 x i8> %v4, <16 x i8> *%ptr + store volatile <16 x i8> %v3, <16 x i8> *%ptr + store volatile <16 x i8> %v2, <16 x i8> *%ptr + store volatile <16 x i8> %v1, <16 x i8> *%ptr + store volatile <16 x i8> %v0, <16 x i8> *%ptr + ret void +} + +; Like f2, but only %f8 should be saved. +define void @f3(<16 x i8> *%ptr) { +; CHECK-LABEL: f3: +; CHECK: aghi %r15, -168 +; CHECK-DAG: std %f8, +; CHECK-NOT: vst {{.*}}(%r15) +; CHECK-NOT: vl {{.*}}(%r15) +; CHECK-NOT: %v9 +; CHECK-NOT: %v10 +; CHECK-NOT: %v11 +; CHECK-NOT: %v12 +; CHECK-NOT: %v13 +; CHECK-NOT: %v14 +; CHECK-NOT: %v15 +; CHECK-DAG: ld %f8, +; CHECK: aghi %r15, 168 +; CHECK: br %r14 + %v0 = load volatile <16 x i8>, <16 x i8> *%ptr + %v1 = load volatile <16 x i8>, <16 x i8> *%ptr + %v2 = load volatile <16 x i8>, <16 x i8> *%ptr + %v3 = load volatile <16 x i8>, <16 x i8> *%ptr + %v4 = load volatile <16 x i8>, <16 x i8> *%ptr + %v5 = load volatile <16 x i8>, <16 x i8> *%ptr + %v6 = load volatile <16 x i8>, <16 x i8> *%ptr + %v7 = load volatile <16 x i8>, <16 x i8> *%ptr + %v8 = load volatile <16 x i8>, <16 x i8> *%ptr + %v16 = load volatile <16 x i8>, <16 x i8> *%ptr + %v17 = load volatile <16 x i8>, <16 x i8> *%ptr + %v18 = load volatile <16 x i8>, <16 x i8> *%ptr + %v19 = load volatile <16 x i8>, <16 x i8> *%ptr + %v20 = load volatile <16 x i8>, <16 x i8> *%ptr + %v21 = load volatile <16 x i8>, <16 x i8> *%ptr + %v22 = load volatile <16 x i8>, <16 x i8> *%ptr + %v23 = load volatile <16 x i8>, <16 x i8> *%ptr + %v24 = load volatile <16 x i8>, <16 x i8> *%ptr + %v25 = load volatile <16 x i8>, <16 x i8> *%ptr + %v26 = load volatile <16 x i8>, <16 x i8> *%ptr + %v27 = load volatile <16 x i8>, <16 x i8> *%ptr + %v28 = load volatile <16 x i8>, <16 x i8> *%ptr + %v29 = load volatile <16 x i8>, <16 x i8> *%ptr + %v30 = load volatile <16 x i8>, <16 x i8> *%ptr + %v31 = load volatile <16 x i8>, <16 x i8> *%ptr + store volatile <16 x i8> %v31, <16 x i8> *%ptr + store volatile <16 x i8> %v30, <16 x i8> *%ptr + store volatile <16 x i8> %v29, <16 x i8> *%ptr + store volatile <16 x i8> %v28, <16 x i8> *%ptr + store volatile <16 x i8> 
%v27, <16 x i8> *%ptr + store volatile <16 x i8> %v26, <16 x i8> *%ptr + store volatile <16 x i8> %v25, <16 x i8> *%ptr + store volatile <16 x i8> %v24, <16 x i8> *%ptr + store volatile <16 x i8> %v23, <16 x i8> *%ptr + store volatile <16 x i8> %v22, <16 x i8> *%ptr + store volatile <16 x i8> %v21, <16 x i8> *%ptr + store volatile <16 x i8> %v20, <16 x i8> *%ptr + store volatile <16 x i8> %v19, <16 x i8> *%ptr + store volatile <16 x i8> %v18, <16 x i8> *%ptr + store volatile <16 x i8> %v17, <16 x i8> *%ptr + store volatile <16 x i8> %v16, <16 x i8> *%ptr + store volatile <16 x i8> %v8, <16 x i8> *%ptr + store volatile <16 x i8> %v7, <16 x i8> *%ptr + store volatile <16 x i8> %v6, <16 x i8> *%ptr + store volatile <16 x i8> %v5, <16 x i8> *%ptr + store volatile <16 x i8> %v4, <16 x i8> *%ptr + store volatile <16 x i8> %v3, <16 x i8> *%ptr + store volatile <16 x i8> %v2, <16 x i8> *%ptr + store volatile <16 x i8> %v1, <16 x i8> *%ptr + store volatile <16 x i8> %v0, <16 x i8> *%ptr + ret void +} + +; Like f2, but no registers should be saved. +define void @f4(<16 x i8> *%ptr) { +; CHECK-LABEL: f4: +; CHECK-NOT: %r15 +; CHECK: br %r14 + %v0 = load volatile <16 x i8>, <16 x i8> *%ptr + %v1 = load volatile <16 x i8>, <16 x i8> *%ptr + %v2 = load volatile <16 x i8>, <16 x i8> *%ptr + %v3 = load volatile <16 x i8>, <16 x i8> *%ptr + %v4 = load volatile <16 x i8>, <16 x i8> *%ptr + %v5 = load volatile <16 x i8>, <16 x i8> *%ptr + %v6 = load volatile <16 x i8>, <16 x i8> *%ptr + %v7 = load volatile <16 x i8>, <16 x i8> *%ptr + %v16 = load volatile <16 x i8>, <16 x i8> *%ptr + %v17 = load volatile <16 x i8>, <16 x i8> *%ptr + %v18 = load volatile <16 x i8>, <16 x i8> *%ptr + %v19 = load volatile <16 x i8>, <16 x i8> *%ptr + %v20 = load volatile <16 x i8>, <16 x i8> *%ptr + %v21 = load volatile <16 x i8>, <16 x i8> *%ptr + %v22 = load volatile <16 x i8>, <16 x i8> *%ptr + %v23 = load volatile <16 x i8>, <16 x i8> *%ptr + %v24 = load volatile <16 x i8>, <16 x i8> *%ptr + %v25 = load volatile <16 x i8>, <16 x i8> *%ptr + %v26 = load volatile <16 x i8>, <16 x i8> *%ptr + %v27 = load volatile <16 x i8>, <16 x i8> *%ptr + %v28 = load volatile <16 x i8>, <16 x i8> *%ptr + %v29 = load volatile <16 x i8>, <16 x i8> *%ptr + %v30 = load volatile <16 x i8>, <16 x i8> *%ptr + %v31 = load volatile <16 x i8>, <16 x i8> *%ptr + store volatile <16 x i8> %v31, <16 x i8> *%ptr + store volatile <16 x i8> %v30, <16 x i8> *%ptr + store volatile <16 x i8> %v29, <16 x i8> *%ptr + store volatile <16 x i8> %v28, <16 x i8> *%ptr + store volatile <16 x i8> %v27, <16 x i8> *%ptr + store volatile <16 x i8> %v26, <16 x i8> *%ptr + store volatile <16 x i8> %v25, <16 x i8> *%ptr + store volatile <16 x i8> %v24, <16 x i8> *%ptr + store volatile <16 x i8> %v23, <16 x i8> *%ptr + store volatile <16 x i8> %v22, <16 x i8> *%ptr + store volatile <16 x i8> %v21, <16 x i8> *%ptr + store volatile <16 x i8> %v20, <16 x i8> *%ptr + store volatile <16 x i8> %v19, <16 x i8> *%ptr + store volatile <16 x i8> %v18, <16 x i8> *%ptr + store volatile <16 x i8> %v17, <16 x i8> *%ptr + store volatile <16 x i8> %v16, <16 x i8> *%ptr + store volatile <16 x i8> %v7, <16 x i8> *%ptr + store volatile <16 x i8> %v6, <16 x i8> *%ptr + store volatile <16 x i8> %v5, <16 x i8> *%ptr + store volatile <16 x i8> %v4, <16 x i8> *%ptr + store volatile <16 x i8> %v3, <16 x i8> *%ptr + store volatile <16 x i8> %v2, <16 x i8> *%ptr + store volatile <16 x i8> %v1, <16 x i8> *%ptr + store volatile <16 x i8> %v0, <16 x i8> *%ptr + ret void +} diff --git 
a/test/CodeGen/SystemZ/vec-abi-align.ll b/test/CodeGen/SystemZ/vec-abi-align.ll
new file mode 100644
index 00000000000..01b97a8583e
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-abi-align.ll
@@ -0,0 +1,49 @@
+; Verify that we use the vector ABI datalayout if and only if
+; the vector facility is present.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=generic | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \
+; RUN:   FileCheck -check-prefix=CHECK-VECTOR %s
+
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=vector | \
+; RUN:   FileCheck -check-prefix=CHECK-VECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+vector | \
+; RUN:   FileCheck -check-prefix=CHECK-VECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector,vector | \
+; RUN:   FileCheck -check-prefix=CHECK-VECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector,+vector | \
+; RUN:   FileCheck -check-prefix=CHECK-VECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=vector,-vector | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+vector,-vector | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -mattr=-vector | \
+; RUN:   FileCheck -check-prefix=CHECK-NOVECTOR %s
+
+%struct.S = type { i8, <2 x i64> }
+
+define void @test(%struct.S* %s) nounwind {
+; CHECK-VECTOR-LABEL: @test
+; CHECK-VECTOR: vl %v0, 8(%r2)
+; CHECK-NOVECTOR-LABEL: @test
+; CHECK-NOVECTOR-DAG: agsi 16(%r2), 1
+; CHECK-NOVECTOR-DAG: agsi 24(%r2), 1
+  %ptr = getelementptr %struct.S, %struct.S* %s, i64 0, i32 1
+  %vec = load <2 x i64>, <2 x i64>* %ptr
+  %add = add <2 x i64> %vec, <i64 1, i64 1>
+  store <2 x i64> %add, <2 x i64>* %ptr
+  ret void
+}
+
diff --git a/test/CodeGen/SystemZ/vec-abs-01.ll b/test/CodeGen/SystemZ/vec-abs-01.ll
new file mode 100644
index 00000000000..aec3b9314f1
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-abs-01.ll
@@ -0,0 +1,146 @@
+; Test v16i8 absolute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test with slt.
+define <16 x i8> @f1(<16 x i8> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vlpb %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp slt <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
+  ret <16 x i8> %ret
+}
+
+; Test with sle.
+define <16 x i8> @f2(<16 x i8> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vlpb %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sle <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
+  ret <16 x i8> %ret
+}
+
+; Test with sgt.
+define <16 x i8> @f3(<16 x i8> %val) {
+; CHECK-LABEL: f3:
+; CHECK: vlpb %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sgt <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg
+  ret <16 x i8> %ret
+}
+
+; Test with sge.
+define <16 x i8> @f4(<16 x i8> %val) {
+; CHECK-LABEL: f4:
+; CHECK: vlpb %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sge <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg
+  ret <16 x i8> %ret
+}
+
+; Test that negative absolute uses VLPB too. There is no vector equivalent
+; of LOAD NEGATIVE.
+define <16 x i8> @f5(<16 x i8> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vlpb [[REG:%v[0-9]+]], %v24
+; CHECK: vlcb %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp slt <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %abs = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
+  %ret = sub <16 x i8> zeroinitializer, %abs
+  ret <16 x i8> %ret
+}
+
+; Try another form of negative absolute (slt version).
+define <16 x i8> @f6(<16 x i8> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vlpb [[REG:%v[0-9]+]], %v24
+; CHECK: vlcb %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp slt <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg
+  ret <16 x i8> %ret
+}
+
+; Test with sle.
+define <16 x i8> @f7(<16 x i8> %val) {
+; CHECK-LABEL: f7:
+; CHECK: vlpb [[REG:%v[0-9]+]], %v24
+; CHECK: vlcb %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sle <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg
+  ret <16 x i8> %ret
+}
+
+; Test with sgt.
+define <16 x i8> @f8(<16 x i8> %val) {
+; CHECK-LABEL: f8:
+; CHECK: vlpb [[REG:%v[0-9]+]], %v24
+; CHECK: vlcb %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sgt <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
+  ret <16 x i8> %ret
+}
+
+; Test with sge.
+define <16 x i8> @f9(<16 x i8> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vlpb [[REG:%v[0-9]+]], %v24
+; CHECK: vlcb %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sge <16 x i8> %val, zeroinitializer
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
+  ret <16 x i8> %ret
+}
+
+; Test with an SRA-based boolean vector.
+define <16 x i8> @f10(<16 x i8> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vlpb %v24, %v24
+; CHECK: br %r14
+  %shr = ashr <16 x i8> %val,
+              <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,
+               i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %and1 = and <16 x i8> %shr, %neg
+  %not = xor <16 x i8> %shr,
+             <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+              i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %and2 = and <16 x i8> %not, %val
+  %ret = or <16 x i8> %and1, %and2
+  ret <16 x i8> %ret
+}
+
+; ...and again in reverse
+define <16 x i8> @f11(<16 x i8> %val) {
+; CHECK-LABEL: f11:
+; CHECK: vlpb [[REG:%v[0-9]+]], %v24
+; CHECK: vlcb %v24, [[REG]]
+; CHECK: br %r14
+  %shr = ashr <16 x i8> %val,
+              <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,
+               i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %and1 = and <16 x i8> %shr, %val
+  %not = xor <16 x i8> %shr,
+             <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+              i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %neg = sub <16 x i8> zeroinitializer, %val
+  %and2 = and <16 x i8> %not, %neg
+  %ret = or <16 x i8> %and1, %and2
+  ret <16 x i8> %ret
+}
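+
+; In f10 and f11, %shr is -1 in each negative element and 0 elsewhere,
+; so the and/or network selects between %val and %neg without an icmp:
+; f10 computes (%shr & %neg) | (~%shr & %val), i.e. the absolute value,
+; and f11 swaps %val and %neg to compute its negation.
diff --git a/test/CodeGen/SystemZ/vec-abs-02.ll b/test/CodeGen/SystemZ/vec-abs-02.ll
new file mode 100644
index 00000000000..c5af619f0ba
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-abs-02.ll
@@ -0,0 +1,142 @@
+; Test v8i16 absolute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test with slt.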
+define <8 x i16> @f1(<8 x i16> %val) { +; CHECK-LABEL: f1: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + ret <8 x i16> %ret +} + +; Test with sle. +define <8 x i16> @f2(<8 x i16> %val) { +; CHECK-LABEL: f2: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sle <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + ret <8 x i16> %ret +} + +; Test with sgt. +define <8 x i16> @f3(<8 x i16> %val) { +; CHECK-LABEL: f3: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sgt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg + ret <8 x i16> %ret +} + +; Test with sge. +define <8 x i16> @f4(<8 x i16> %val) { +; CHECK-LABEL: f4: +; CHECK: vlph %v24, %v24 +; CHECK: br %r14 + %cmp = icmp sge <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg + ret <8 x i16> %ret +} + +; Test that negative absolute uses VLPH too. There is no vector equivalent +; of LOAD NEGATIVE. +define <8 x i16> @f5(<8 x i16> %val) { +; CHECK-LABEL: f5: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %abs = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + %ret = sub <8 x i16> zeroinitializer, %abs + ret <8 x i16> %ret +} + +; Try another form of negative absolute (slt version). +define <8 x i16> @f6(<8 x i16> %val) { +; CHECK-LABEL: f6: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg + ret <8 x i16> %ret +} + +; Test with sle. +define <8 x i16> @f7(<8 x i16> %val) { +; CHECK-LABEL: f7: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sle <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg + ret <8 x i16> %ret +} + +; Test with sgt. +define <8 x i16> @f8(<8 x i16> %val) { +; CHECK-LABEL: f8: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sgt <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + ret <8 x i16> %ret +} + +; Test with sge. +define <8 x i16> @f9(<8 x i16> %val) { +; CHECK-LABEL: f9: +; CHECK: vlph [[REG:%v[0-9]+]], %v24 +; CHECK: vlch %v24, [[REG]] +; CHECK: br %r14 + %cmp = icmp sge <8 x i16> %val, zeroinitializer + %neg = sub <8 x i16> zeroinitializer, %val + %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val + ret <8 x i16> %ret +} + +; Test with an SRA-based boolean vector. 
+define <8 x i16> @f10(<8 x i16> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vlph %v24, %v24
+; CHECK: br %r14
+  %shr = ashr <8 x i16> %val,
+              <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %neg = sub <8 x i16> zeroinitializer, %val
+  %and1 = and <8 x i16> %shr, %neg
+  %not = xor <8 x i16> %shr,
+             <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %and2 = and <8 x i16> %not, %val
+  %ret = or <8 x i16> %and1, %and2
+  ret <8 x i16> %ret
+}
+
+; ...and again in reverse
+define <8 x i16> @f11(<8 x i16> %val) {
+; CHECK-LABEL: f11:
+; CHECK: vlph [[REG:%v[0-9]+]], %v24
+; CHECK: vlch %v24, [[REG]]
+; CHECK: br %r14
+  %shr = ashr <8 x i16> %val,
+              <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %and1 = and <8 x i16> %shr, %val
+  %not = xor <8 x i16> %shr,
+             <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %neg = sub <8 x i16> zeroinitializer, %val
+  %and2 = and <8 x i16> %not, %neg
+  %ret = or <8 x i16> %and1, %and2
+  ret <8 x i16> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-abs-03.ll b/test/CodeGen/SystemZ/vec-abs-03.ll
new file mode 100644
index 00000000000..cb17a8895e1
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-abs-03.ll
@@ -0,0 +1,138 @@
+; Test v4i32 absolute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test with slt.
+define <4 x i32> @f1(<4 x i32> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vlpf %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp slt <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
+  ret <4 x i32> %ret
+}
+
+; Test with sle.
+define <4 x i32> @f2(<4 x i32> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vlpf %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sle <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
+  ret <4 x i32> %ret
+}
+
+; Test with sgt.
+define <4 x i32> @f3(<4 x i32> %val) {
+; CHECK-LABEL: f3:
+; CHECK: vlpf %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sgt <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg
+  ret <4 x i32> %ret
+}
+
+; Test with sge.
+define <4 x i32> @f4(<4 x i32> %val) {
+; CHECK-LABEL: f4:
+; CHECK: vlpf %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sge <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg
+  ret <4 x i32> %ret
+}
+
+; Test that negative absolute uses VLPF too. There is no vector equivalent
+; of LOAD NEGATIVE.
+define <4 x i32> @f5(<4 x i32> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vlpf [[REG:%v[0-9]+]], %v24
+; CHECK: vlcf %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp slt <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %abs = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
+  %ret = sub <4 x i32> zeroinitializer, %abs
+  ret <4 x i32> %ret
+}
+
+; Try another form of negative absolute (slt version).
+define <4 x i32> @f6(<4 x i32> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vlpf [[REG:%v[0-9]+]], %v24
+; CHECK: vlcf %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp slt <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg
+  ret <4 x i32> %ret
+}
+
+; Test with sle.
+define <4 x i32> @f7(<4 x i32> %val) {
+; CHECK-LABEL: f7:
+; CHECK: vlpf [[REG:%v[0-9]+]], %v24
+; CHECK: vlcf %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sle <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg
+  ret <4 x i32> %ret
+}
+
+; Test with sgt.
+define <4 x i32> @f8(<4 x i32> %val) {
+; CHECK-LABEL: f8:
+; CHECK: vlpf [[REG:%v[0-9]+]], %v24
+; CHECK: vlcf %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sgt <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
+  ret <4 x i32> %ret
+}
+
+; Test with sge.
+define <4 x i32> @f9(<4 x i32> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vlpf [[REG:%v[0-9]+]], %v24
+; CHECK: vlcf %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sge <4 x i32> %val, zeroinitializer
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
+  ret <4 x i32> %ret
+}
+
+; Test with an SRA-based boolean vector.
+define <4 x i32> @f10(<4 x i32> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vlpf %v24, %v24
+; CHECK: br %r14
+  %shr = ashr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31>
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %and1 = and <4 x i32> %shr, %neg
+  %not = xor <4 x i32> %shr, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %not, %val
+  %ret = or <4 x i32> %and1, %and2
+  ret <4 x i32> %ret
+}
+
+; ...and again in reverse
+define <4 x i32> @f11(<4 x i32> %val) {
+; CHECK-LABEL: f11:
+; CHECK: vlpf [[REG:%v[0-9]+]], %v24
+; CHECK: vlcf %v24, [[REG]]
+; CHECK: br %r14
+  %shr = ashr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31>
+  %and1 = and <4 x i32> %shr, %val
+  %not = xor <4 x i32> %shr, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %neg = sub <4 x i32> zeroinitializer, %val
+  %and2 = and <4 x i32> %not, %neg
+  %ret = or <4 x i32> %and1, %and2
+  ret <4 x i32> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-abs-04.ll b/test/CodeGen/SystemZ/vec-abs-04.ll
new file mode 100644
index 00000000000..31c489b00b3
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-abs-04.ll
@@ -0,0 +1,138 @@
+; Test v2i64 absolute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test with slt.
+define <2 x i64> @f1(<2 x i64> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vlpg %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp slt <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
+  ret <2 x i64> %ret
+}
+
+; Test with sle.
+define <2 x i64> @f2(<2 x i64> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vlpg %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sle <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
+  ret <2 x i64> %ret
+}
+
+; Test with sgt.
+define <2 x i64> @f3(<2 x i64> %val) {
+; CHECK-LABEL: f3:
+; CHECK: vlpg %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sgt <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg
+  ret <2 x i64> %ret
+}
+
+; Test with sge.
+define <2 x i64> @f4(<2 x i64> %val) {
+; CHECK-LABEL: f4:
+; CHECK: vlpg %v24, %v24
+; CHECK: br %r14
+  %cmp = icmp sge <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg
+  ret <2 x i64> %ret
+}
+
+; Test that negative absolute uses VLPG too. There is no vector equivalent
+; of LOAD NEGATIVE.
+define <2 x i64> @f5(<2 x i64> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vlpg [[REG:%v[0-9]+]], %v24
+; CHECK: vlcg %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp slt <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %abs = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
+  %ret = sub <2 x i64> zeroinitializer, %abs
+  ret <2 x i64> %ret
+}
+
+; Try another form of negative absolute (slt version).
+define <2 x i64> @f6(<2 x i64> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vlpg [[REG:%v[0-9]+]], %v24
+; CHECK: vlcg %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp slt <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg
+  ret <2 x i64> %ret
+}
+
+; Test with sle.
+define <2 x i64> @f7(<2 x i64> %val) {
+; CHECK-LABEL: f7:
+; CHECK: vlpg [[REG:%v[0-9]+]], %v24
+; CHECK: vlcg %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sle <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg
+  ret <2 x i64> %ret
+}
+
+; Test with sgt.
+define <2 x i64> @f8(<2 x i64> %val) {
+; CHECK-LABEL: f8:
+; CHECK: vlpg [[REG:%v[0-9]+]], %v24
+; CHECK: vlcg %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sgt <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
+  ret <2 x i64> %ret
+}
+
+; Test with sge.
+define <2 x i64> @f9(<2 x i64> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vlpg [[REG:%v[0-9]+]], %v24
+; CHECK: vlcg %v24, [[REG]]
+; CHECK: br %r14
+  %cmp = icmp sge <2 x i64> %val, zeroinitializer
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
+  ret <2 x i64> %ret
+}
+
+; Test with an SRA-based boolean vector.
+define <2 x i64> @f10(<2 x i64> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vlpg %v24, %v24
+; CHECK: br %r14
+  %shr = ashr <2 x i64> %val, <i64 63, i64 63>
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %and1 = and <2 x i64> %shr, %neg
+  %not = xor <2 x i64> %shr, <i64 -1, i64 -1>
+  %and2 = and <2 x i64> %not, %val
+  %ret = or <2 x i64> %and1, %and2
+  ret <2 x i64> %ret
+}
+
+; ...and again in reverse
+define <2 x i64> @f11(<2 x i64> %val) {
+; CHECK-LABEL: f11:
+; CHECK: vlpg [[REG:%v[0-9]+]], %v24
+; CHECK: vlcg %v24, [[REG]]
+; CHECK: br %r14
+  %shr = ashr <2 x i64> %val, <i64 63, i64 63>
+  %and1 = and <2 x i64> %shr, %val
+  %not = xor <2 x i64> %shr, <i64 -1, i64 -1>
+  %neg = sub <2 x i64> zeroinitializer, %val
+  %and2 = and <2 x i64> %not, %neg
+  %ret = or <2 x i64> %and1, %and2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-add-01.ll b/test/CodeGen/SystemZ/vec-add-01.ll
new file mode 100644
index 00000000000..a59a8da1cf8
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-add-01.ll
@@ -0,0 +1,39 @@
+; Test vector addition.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 addition.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vab %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = add <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 addition.
+define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vah %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = add <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 addition.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vaf %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = add <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 addition.
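+; As with vab/vah/vaf above, the mnemonic suffix selects the element
+; size: b = byte, h = halfword, f = word and g = doubleword.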
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vag %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = add <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-and-01.ll b/test/CodeGen/SystemZ/vec-and-01.ll
new file mode 100644
index 00000000000..d467de69cea
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-and-01.ll
@@ -0,0 +1,39 @@
+; Test vector AND.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 AND.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vn %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = and <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 AND.
+define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vn %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = and <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 AND.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vn %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = and <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 AND.
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vn %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = and <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-and-02.ll b/test/CodeGen/SystemZ/vec-and-02.ll
new file mode 100644
index 00000000000..30bc9241689
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-and-02.ll
@@ -0,0 +1,91 @@
+; Test vector AND-NOT.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 AND-NOT.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vnc %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <16 x i8> %val2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1>
+  %ret = and <16 x i8> %val1, %not
+  ret <16 x i8> %ret
+}
+
+; ...and again with the reverse.
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vnc %v24, %v28, %v26
+; CHECK: br %r14
+  %not = xor <16 x i8> %val1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1>
+  %ret = and <16 x i8> %not, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 AND-NOT.
+define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vnc %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <8 x i16> %val2, <i16 -1, i16 -1, i16 -1, i16 -1,
+                               i16 -1, i16 -1, i16 -1, i16 -1>
+  %ret = and <8 x i16> %val1, %not
+  ret <8 x i16> %ret
+}
+
+; ...and again with the reverse.
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vnc %v24, %v28, %v26
+; CHECK: br %r14
+  %not = xor <8 x i16> %val1, <i16 -1, i16 -1, i16 -1, i16 -1,
+                               i16 -1, i16 -1, i16 -1, i16 -1>
+  %ret = and <8 x i16> %not, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 AND-NOT.
+define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: vnc %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <4 x i32> %val2, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %ret = and <4 x i32> %val1, %not
+  ret <4 x i32> %ret
+}
+
+; ...and again with the reverse.
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: vnc %v24, %v28, %v26
+; CHECK: br %r14
+  %not = xor <4 x i32> %val1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %ret = and <4 x i32> %not, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 AND-NOT.
+define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f7:
+; CHECK: vnc %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <2 x i64> %val2, <i64 -1, i64 -1>
+  %ret = and <2 x i64> %val1, %not
+  ret <2 x i64> %ret
+}
+
+; ...and again with the reverse.
+define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vnc %v24, %v28, %v26
+; CHECK: br %r14
+  %not = xor <2 x i64> %val1, <i64 -1, i64 -1>
+  %ret = and <2 x i64> %not, %val2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-and-03.ll b/test/CodeGen/SystemZ/vec-and-03.ll
new file mode 100644
index 00000000000..c73d570fb7b
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-and-03.ll
@@ -0,0 +1,113 @@
+; Test vector zero extensions, which need to be implemented as ANDs.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i1->v16i8 extension.
+define <16 x i8> @f1(<16 x i8> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vrepib [[REG:%v[0-9]+]], 1
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <16 x i8> %val to <16 x i1>
+  %ret = zext <16 x i1> %trunc to <16 x i8>
+  ret <16 x i8> %ret
+}
+
+; Test a v8i1->v8i16 extension.
+define <8 x i16> @f2(<8 x i16> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vrepih [[REG:%v[0-9]+]], 1
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <8 x i16> %val to <8 x i1>
+  %ret = zext <8 x i1> %trunc to <8 x i16>
+  ret <8 x i16> %ret
+}
+
+; Test a v8i8->v8i16 extension.
+define <8 x i16> @f3(<8 x i16> %val) {
+; CHECK-LABEL: f3:
+; CHECK: vgbm [[REG:%v[0-9]+]], 21845
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <8 x i16> %val to <8 x i8>
+  %ret = zext <8 x i8> %trunc to <8 x i16>
+  ret <8 x i16> %ret
+}
+
+; Test a v4i1->v4i32 extension.
+define <4 x i32> @f4(<4 x i32> %val) {
+; CHECK-LABEL: f4:
+; CHECK: vrepif [[REG:%v[0-9]+]], 1
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <4 x i32> %val to <4 x i1>
+  %ret = zext <4 x i1> %trunc to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i8->v4i32 extension.
+define <4 x i32> @f5(<4 x i32> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vgbm [[REG:%v[0-9]+]], 4369
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <4 x i32> %val to <4 x i8>
+  %ret = zext <4 x i8> %trunc to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i16->v4i32 extension.
+define <4 x i32> @f6(<4 x i32> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vgbm [[REG:%v[0-9]+]], 13107
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <4 x i32> %val to <4 x i16>
+  %ret = zext <4 x i16> %trunc to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v2i1->v2i64 extension.
+define <2 x i64> @f7(<2 x i64> %val) {
+; CHECK-LABEL: f7:
+; CHECK: vrepig [[REG:%v[0-9]+]], 1
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i1>
+  %ret = zext <2 x i1> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i8->v2i64 extension.
+define <2 x i64> @f8(<2 x i64> %val) {
+; CHECK-LABEL: f8:
+; CHECK: vgbm [[REG:%v[0-9]+]], 257
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i8>
+  %ret = zext <2 x i8> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i16->v2i64 extension.
+define <2 x i64> @f9(<2 x i64> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vgbm [[REG:%v[0-9]+]], 771
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i16>
+  %ret = zext <2 x i16> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i32->v2i64 extension.
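+; (vgbm materializes the mask one byte at a time: bit i of the 16-bit
+; immediate, MSB first, selects 0xff for byte i, so e.g. 3855 = 0x0f0f
+; keeps the low 32 bits of each i64 element.)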
+define <2 x i64> @f10(<2 x i64> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vgbm [[REG:%v[0-9]+]], 3855
+; CHECK: vn %v24, %v24, [[REG]]
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i32>
+  %ret = zext <2 x i32> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-args-01.ll b/test/CodeGen/SystemZ/vec-args-01.ll
new file mode 100644
index 00000000000..e07ab7447b2
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-args-01.ll
@@ -0,0 +1,48 @@
+; Test the handling of named vector arguments.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
+
+; This routine has 6 integer arguments, which fill up r2-r6 and
+; the stack slot at offset 160, and 10 vector arguments, which
+; fill up v24-v31 and the two double-wide stack slots at 168
+; and 184.
+declare void @bar(i64, i64, i64, i64, i64, i64,
+                  <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>,
+                  <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>,
+                  <4 x i32>, <4 x i32>)
+
+define void @foo() {
+; CHECK-VEC-LABEL: foo:
+; CHECK-VEC-DAG: vrepif %v24, 1
+; CHECK-VEC-DAG: vrepif %v26, 2
+; CHECK-VEC-DAG: vrepif %v28, 3
+; CHECK-VEC-DAG: vrepif %v30, 4
+; CHECK-VEC-DAG: vrepif %v25, 5
+; CHECK-VEC-DAG: vrepif %v27, 6
+; CHECK-VEC-DAG: vrepif %v29, 7
+; CHECK-VEC-DAG: vrepif %v31, 8
+; CHECK-VEC: brasl %r14, bar@PLT
+;
+; CHECK-STACK-LABEL: foo:
+; CHECK-STACK: aghi %r15, -200
+; CHECK-STACK-DAG: mvghi 160(%r15), 6
+; CHECK-STACK-DAG: vrepif [[REG1:%v[0-9]+]], 9
+; CHECK-STACK-DAG: vst [[REG1]], 168(%r15)
+; CHECK-STACK-DAG: vrepif [[REG2:%v[0-9]+]], 10
+; CHECK-STACK-DAG: vst [[REG2]], 184(%r15)
+; CHECK-STACK: brasl %r14, bar@PLT
+
+  call void @bar (i64 1, i64 2, i64 3, i64 4, i64 5, i64 6,
+                  <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+                  <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
+                  <4 x i32> <i32 3, i32 3, i32 3, i32 3>,
+                  <4 x i32> <i32 4, i32 4, i32 4, i32 4>,
+                  <4 x i32> <i32 5, i32 5, i32 5, i32 5>,
+                  <4 x i32> <i32 6, i32 6, i32 6, i32 6>,
+                  <4 x i32> <i32 7, i32 7, i32 7, i32 7>,
+                  <4 x i32> <i32 8, i32 8, i32 8, i32 8>,
+                  <4 x i32> <i32 9, i32 9, i32 9, i32 9>,
+                  <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/vec-args-02.ll b/test/CodeGen/SystemZ/vec-args-02.ll
new file mode 100644
index 00000000000..b6081598326
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-args-02.ll
@@ -0,0 +1,31 @@
+; Test the handling of unnamed vector arguments.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
+
+; This routine is called with two named vector arguments (passed
+; in %v24 and %v26) and two unnamed vector arguments (passed
+; in the double-wide stack slots at 160 and 176).
+declare void @bar(<4 x i32>, <4 x i32>, ...)
+
+define void @foo() {
+; CHECK-VEC-LABEL: foo:
+; CHECK-VEC-DAG: vrepif %v24, 1
+; CHECK-VEC-DAG: vrepif %v26, 2
+; CHECK-VEC: brasl %r14, bar@PLT
+;
+; CHECK-STACK-LABEL: foo:
+; CHECK-STACK: aghi %r15, -192
+; CHECK-STACK-DAG: vrepif [[REG1:%v[0-9]+]], 3
+; CHECK-STACK-DAG: vst [[REG1]], 160(%r15)
+; CHECK-STACK-DAG: vrepif [[REG2:%v[0-9]+]], 4
+; CHECK-STACK-DAG: vst [[REG2]], 176(%r15)
+; CHECK-STACK: brasl %r14, bar@PLT
+
+  call void (<4 x i32>, <4 x i32>, ...) @bar
+              (<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+               <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
+               <4 x i32> <i32 3, i32 3, i32 3, i32 3>,
+               <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/vec-args-03.ll b/test/CodeGen/SystemZ/vec-args-03.ll
new file mode 100644
index 00000000000..e9f51c5e9ee
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-args-03.ll
@@ -0,0 +1,16 @@
+; Test the handling of incoming vector arguments.
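+; As the outgoing case in vec-args-01.ll shows, the even-numbered
+; registers are assigned first (%v24, %v26, %v28, %v30), then the
+; odd-numbered ones (%v25, %v27, %v29, %v31).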
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; This routine has 10 vector arguments, which fill up %v24-%v31 and +; the two double-wide stack slots at 160 and 176. +define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4, + <4 x i32> %v5, <4 x i32> %v6, <4 x i32> %v7, <4 x i32> %v8, + <4 x i32> %v9, <4 x i32> %v10) { +; CHECK-LABEL: foo: +; CHECK: vl [[REG1:%v[0-9]+]], 176(%r15) +; CHECK: vsf %v24, %v26, [[REG1]] +; CHECK: br %r14 + %y = sub <4 x i32> %v2, %v10 + ret <4 x i32> %y +} diff --git a/test/CodeGen/SystemZ/vec-cmp-01.ll b/test/CodeGen/SystemZ/vec-cmp-01.ll new file mode 100644 index 00000000000..a7546db8d7f --- /dev/null +++ b/test/CodeGen/SystemZ/vec-cmp-01.ll @@ -0,0 +1,228 @@ +; Test v16i8 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vceqb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test ne. +define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vceqb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test sgt. +define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vchb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test sge. +define <16 x i8> @f4(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vchb [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test sle. +define <16 x i8> @f5(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f5: +; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test slt. +define <16 x i8> @f6(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vchb %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test ugt. +define <16 x i8> @f7(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f7: +; CHECK: vchlb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test uge. +define <16 x i8> @f8(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f8: +; CHECK: vchlb [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test ule. 
+define <16 x i8> @f9(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f9: +; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test ult. +define <16 x i8> @f10(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f10: +; CHECK: vchlb %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <16 x i8> %val1, %val2 + %ret = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %ret +} + +; Test eq selects. +define <16 x i8> @f11(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f11: +; CHECK: vceqb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp eq <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test ne selects. +define <16 x i8> @f12(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f12: +; CHECK: vceqb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test sgt selects. +define <16 x i8> @f13(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f13: +; CHECK: vchb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test sge selects. +define <16 x i8> @f14(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f14: +; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test sle selects. +define <16 x i8> @f15(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f15: +; CHECK: vchb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test slt selects. +define <16 x i8> @f16(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f16: +; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp slt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test ugt selects. +define <16 x i8> @f17(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f17: +; CHECK: vchlb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test uge selects. 
+define <16 x i8> @f18(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f18: +; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test ule selects. +define <16 x i8> @f19(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f19: +; CHECK: vchlb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} + +; Test ult selects. +define <16 x i8> @f20(<16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3, <16 x i8> %val4) { +; CHECK-LABEL: f20: +; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ult <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %ret +} diff --git a/test/CodeGen/SystemZ/vec-cmp-02.ll b/test/CodeGen/SystemZ/vec-cmp-02.ll new file mode 100644 index 00000000000..78fb46c01c0 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-cmp-02.ll @@ -0,0 +1,228 @@ +; Test v8i16 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <8 x i16> @f1(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f1: +; CHECK: vceqh %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test ne. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vceqh [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test sgt. +define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: vchh %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test sge. +define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: vchh [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test sle. +define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f5: +; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test slt. +define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f6: +; CHECK: vchh %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test ugt. +define <8 x i16> @f7(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f7: +; CHECK: vchlh %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test uge. 
+define <8 x i16> @f8(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vchlh [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test ule. +define <8 x i16> @f9(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f9: +; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test ult. +define <8 x i16> @f10(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f10: +; CHECK: vchlh %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <8 x i16> %val1, %val2 + %ret = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %ret +} + +; Test eq selects. +define <8 x i16> @f11(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f11: +; CHECK: vceqh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp eq <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test ne selects. +define <8 x i16> @f12(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f12: +; CHECK: vceqh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test sgt selects. +define <8 x i16> @f13(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f13: +; CHECK: vchh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test sge selects. +define <8 x i16> @f14(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f14: +; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test sle selects. +define <8 x i16> @f15(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f15: +; CHECK: vchh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test slt selects. +define <8 x i16> @f16(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f16: +; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp slt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test ugt selects. 
+define <8 x i16> @f17(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f17: +; CHECK: vchlh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test uge selects. +define <8 x i16> @f18(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f18: +; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test ule selects. +define <8 x i16> @f19(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f19: +; CHECK: vchlh [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} + +; Test ult selects. +define <8 x i16> @f20(<8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3, <8 x i16> %val4) { +; CHECK-LABEL: f20: +; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ult <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %ret +} diff --git a/test/CodeGen/SystemZ/vec-cmp-03.ll b/test/CodeGen/SystemZ/vec-cmp-03.ll new file mode 100644 index 00000000000..4b070acc935 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-cmp-03.ll @@ -0,0 +1,228 @@ +; Test v4i32 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <4 x i32> @f1(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f1: +; CHECK: vceqf %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ne. +define <4 x i32> @f2(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f2: +; CHECK: vceqf [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test sgt. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vchf %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test sge. +define <4 x i32> @f4(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f4: +; CHECK: vchf [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test sle. +define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test slt. 
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: vchf %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ugt. +define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f7: +; CHECK: vchlf %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test uge. +define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f8: +; CHECK: vchlf [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ule. +define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f9: +; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ult. +define <4 x i32> @f10(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f10: +; CHECK: vchlf %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <4 x i32> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test eq selects. +define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f11: +; CHECK: vceqf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp eq <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test ne selects. +define <4 x i32> @f12(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f12: +; CHECK: vceqf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test sgt selects. +define <4 x i32> @f13(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f13: +; CHECK: vchf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test sge selects. +define <4 x i32> @f14(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f14: +; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test sle selects. +define <4 x i32> @f15(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f15: +; CHECK: vchf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test slt selects. 
+define <4 x i32> @f16(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f16: +; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp slt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test ugt selects. +define <4 x i32> @f17(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f17: +; CHECK: vchlf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test uge selects. +define <4 x i32> @f18(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f18: +; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test ule selects. +define <4 x i32> @f19(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f19: +; CHECK: vchlf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} + +; Test ult selects. +define <4 x i32> @f20(<4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3, <4 x i32> %val4) { +; CHECK-LABEL: f20: +; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ult <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %ret +} diff --git a/test/CodeGen/SystemZ/vec-cmp-04.ll b/test/CodeGen/SystemZ/vec-cmp-04.ll new file mode 100644 index 00000000000..5cecaa7251b --- /dev/null +++ b/test/CodeGen/SystemZ/vec-cmp-04.ll @@ -0,0 +1,228 @@ +; Test v2i64 comparisons. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <2 x i64> @f1(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f1: +; CHECK: vceqg %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test ne. +define <2 x i64> @f2(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f2: +; CHECK: vceqg [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test sgt. +define <2 x i64> @f3(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f3: +; CHECK: vchg %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test sge. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vchg [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test sle. 
+define <2 x i64> @f5(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f5: +; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test slt. +define <2 x i64> @f6(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f6: +; CHECK: vchg %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test ugt. +define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: vchlg %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test uge. +define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: vchlg [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test ule. +define <2 x i64> @f9(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f9: +; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test ult. +define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f10: +; CHECK: vchlg %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <2 x i64> %val1, %val2 + %ret = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %ret +} + +; Test eq selects. +define <2 x i64> @f11(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f11: +; CHECK: vceqg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp eq <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test ne selects. +define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f12: +; CHECK: vceqg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ne <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test sgt selects. +define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f13: +; CHECK: vchg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test sge selects. +define <2 x i64> @f14(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f14: +; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sge <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test sle selects. 
+define <2 x i64> @f15(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f15: +; CHECK: vchg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp sle <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test slt selects. +define <2 x i64> @f16(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f16: +; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp slt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test ugt selects. +define <2 x i64> @f17(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f17: +; CHECK: vchlg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test uge selects. +define <2 x i64> @f18(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f18: +; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp uge <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test ule selects. +define <2 x i64> @f19(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f19: +; CHECK: vchlg [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ule <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} + +; Test ult selects. +define <2 x i64> @f20(<2 x i64> %val1, <2 x i64> %val2, + <2 x i64> %val3, <2 x i64> %val4) { +; CHECK-LABEL: f20: +; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = icmp ult <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-combine-01.ll b/test/CodeGen/SystemZ/vec-combine-01.ll new file mode 100644 index 00000000000..f9da34b6475 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-combine-01.ll @@ -0,0 +1,107 @@ +; Test various target-specific DAG combiner patterns. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Check that an extraction followed by a truncation is effectively treated +; as a bitcast. +define void @f1(<4 x i32> %v1, <4 x i32> %v2, i8 *%ptr1, i8 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK: vaf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vsteb [[REG]], 0(%r2), 3 +; CHECK-DAG: vsteb [[REG]], 0(%r3), 15 +; CHECK: br %r14 + %add = add <4 x i32> %v1, %v2 + %elem1 = extractelement <4 x i32> %add, i32 0 + %elem2 = extractelement <4 x i32> %add, i32 3 + %trunc1 = trunc i32 %elem1 to i8 + %trunc2 = trunc i32 %elem2 to i8 + store i8 %trunc1, i8 *%ptr1 + store i8 %trunc2, i8 *%ptr2 + ret void +} + +; Test a case where a pack-type shuffle can be eliminated. 
+define i16 @f2(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; CHECK-LABEL: f2: +; CHECK-NOT: vpk +; CHECK-DAG: vaf [[REG1:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vaf [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG1]], 3 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG2]], 7 +; CHECK: br %r14 + %add1 = add <4 x i32> %v1, %v2 + %add2 = add <4 x i32> %v2, %v3 + %shuffle = shufflevector <4 x i32> %add1, <4 x i32> %add2, + <4 x i32> + %bitcast = bitcast <4 x i32> %shuffle to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 1 + %elem2 = extractelement <8 x i16> %bitcast, i32 7 + %res = add i16 %elem1, %elem2 + ret i16 %res +} + +; ...and again in a case where there's also a splat and a bitcast. +define i16 @f3(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) { +; CHECK-LABEL: f3: +; CHECK-NOT: vrepg +; CHECK-NOT: vpk +; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 6 +; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3 +; CHECK: br %r14 + %add = add <4 x i32> %v1, %v2 + %splat = shufflevector <2 x i64> %v3, <2 x i64> undef, + <2 x i32> + %splatcast = bitcast <2 x i64> %splat to <4 x i32> + %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast, + <4 x i32> + %bitcast = bitcast <4 x i32> %shuffle to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 2 + %elem2 = extractelement <8 x i16> %bitcast, i32 7 + %res = add i16 %elem1, %elem2 + ret i16 %res +} + +; ...and again with a merge low instead of a pack. +define i16 @f4(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) { +; CHECK-LABEL: f4: +; CHECK-NOT: vrepg +; CHECK-NOT: vmr +; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 6 +; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3 +; CHECK: br %r14 + %add = add <4 x i32> %v1, %v2 + %splat = shufflevector <2 x i64> %v3, <2 x i64> undef, + <2 x i32> + %splatcast = bitcast <2 x i64> %splat to <4 x i32> + %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast, + <4 x i32> + %bitcast = bitcast <4 x i32> %shuffle to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 4 + %elem2 = extractelement <8 x i16> %bitcast, i32 7 + %res = add i16 %elem1, %elem2 + ret i16 %res +} + +; ...and again with a merge high. +define i16 @f5(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) { +; CHECK-LABEL: f5: +; CHECK-NOT: vrepg +; CHECK-NOT: vmr +; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 2 +; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3 +; CHECK: br %r14 + %add = add <4 x i32> %v1, %v2 + %splat = shufflevector <2 x i64> %v3, <2 x i64> undef, + <2 x i32> + %splatcast = bitcast <2 x i64> %splat to <4 x i32> + %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast, + <4 x i32> + %bitcast = bitcast <4 x i32> %shuffle to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 4 + %elem2 = extractelement <8 x i16> %bitcast, i32 7 + %res = add i16 %elem1, %elem2 + ret i16 %res +} diff --git a/test/CodeGen/SystemZ/vec-const-01.ll b/test/CodeGen/SystemZ/vec-const-01.ll new file mode 100644 index 00000000000..f173b92b015 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-01.ll @@ -0,0 +1,55 @@ +; Test vector byte masks, v16i8 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <16 x i8> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <16 x i8> zeroinitializer +} + +; Test an all-ones vector. 
+define <16 x i8> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a mixed vector (mask 0x8c75). +define <16 x i8> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 35957 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test that undefs are treated as zero. +define <16 x i8> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 35957 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <16 x i8> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <16 x i8> +} diff --git a/test/CodeGen/SystemZ/vec-const-02.ll b/test/CodeGen/SystemZ/vec-const-02.ll new file mode 100644 index 00000000000..541cbb9faca --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-02.ll @@ -0,0 +1,47 @@ +; Test vector byte masks, v8i16 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <8 x i16> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <8 x i16> zeroinitializer +} + +; Test an all-ones vector. +define <8 x i16> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a mixed vector (mask 0x8c76). +define <8 x i16> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 35958 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test that undefs are treated as zero. +define <8 x i16> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 35958 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <8 x i16> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <8 x i16> +} diff --git a/test/CodeGen/SystemZ/vec-const-03.ll b/test/CodeGen/SystemZ/vec-const-03.ll new file mode 100644 index 00000000000..45ed83866d5 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-03.ll @@ -0,0 +1,43 @@ +; Test vector byte masks, v4i32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <4 x i32> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <4 x i32> zeroinitializer +} + +; Test an all-ones vector. +define <4 x i32> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a mixed vector (mask 0x8c76). +define <4 x i32> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 35958 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test that undefs are treated as zero (mask 0x8076). +define <4 x i32> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 32886 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <4 x i32> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <4 x i32> +} diff --git a/test/CodeGen/SystemZ/vec-const-04.ll b/test/CodeGen/SystemZ/vec-const-04.ll new file mode 100644 index 00000000000..1c2fb414d25 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-04.ll @@ -0,0 +1,43 @@ +; Test vector byte masks, v2i64 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test an all-zeros vector. +define <2 x i64> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK: br %r14 + ret <2 x i64> zeroinitializer +} + +; Test an all-ones vector. +define <2 x i64> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 65535 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a mixed vector (mask 0x8c76). 
+define <2 x i64> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 35958 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test that undefs are treated as zero (mask 0x8c00). +define <2 x i64> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgbm %v24, 35840 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. +define <2 x i64> @f5() { +; CHECK-LABEL: f5: +; CHECK-NOT: vgbm +; CHECK: br %r14 + ret <2 x i64> +} diff --git a/test/CodeGen/SystemZ/vec-const-07.ll b/test/CodeGen/SystemZ/vec-const-07.ll new file mode 100644 index 00000000000..6fcf95b6921 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-07.ll @@ -0,0 +1,229 @@ +; Test vector replicates, v16i8 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. +define <16 x i8> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a byte-granularity replicate with an arbitrary value. +define <16 x i8> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a byte-granularity replicate with the highest useful value. +define <16 x i8> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <16 x i8> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <16 x i8> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a halfword-granularity replicate with the highest useful value. +define <16 x i8> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate with the lowest useful positive value. +define <16 x i8> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate with the highest in-range value. +define <16 x i8> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. +define <16 x i8> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <16 x i8> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <16 x i8> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate with the highest useful negative value. +define <16 x i8> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <16 x i8> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <16 x i8> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. 
+define <16 x i8> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. +define <16 x i8> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <16 x i8> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <16 x i8> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <16 x i8> +} + +; Repeat f14 with undefs optimistically treated as 0. +define <16 x i8> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <16 x i8> +} + +; Repeat f18 with undefs optimistically treated as -1. +define <16 x i8> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <16 x i8> +} diff --git a/test/CodeGen/SystemZ/vec-const-08.ll b/test/CodeGen/SystemZ/vec-const-08.ll new file mode 100644 index 00000000000..5ab6947e548 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-08.ll @@ -0,0 +1,189 @@ +; Test vector replicates, v8i16 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. +define <8 x i16> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a byte-granularity replicate with an arbitrary value. +define <8 x i16> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a byte-granularity replicate with the highest useful value. +define <8 x i16> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <8 x i16> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <8 x i16> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a halfword-granularity replicate with the highest useful value. +define <8 x i16> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate with the lowest useful positive value. +define <8 x i16> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate with the highest in-range value. +define <8 x i16> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. +define <8 x i16> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <8 x i16> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <8 x i16> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate with the highest useful negative value. 
+define <8 x i16> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <8 x i16> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <8 x i16> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. +define <8 x i16> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. +define <8 x i16> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <8 x i16> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <8 x i16> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <8 x i16> +} + +; Repeat f14 with undefs optimistically treated as 0. +define <8 x i16> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <8 x i16> +} + +; Repeat f18 with undefs optimistically treated as -1. +define <8 x i16> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <8 x i16> +} diff --git a/test/CodeGen/SystemZ/vec-const-09.ll b/test/CodeGen/SystemZ/vec-const-09.ll new file mode 100644 index 00000000000..2cbe9259452 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-09.ll @@ -0,0 +1,169 @@ +; Test vector replicates, v4i32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. +define <4 x i32> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a byte-granularity replicate with an arbitrary value. +define <4 x i32> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a byte-granularity replicate with the highest useful value. +define <4 x i32> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <4 x i32> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <4 x i32> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a halfword-granularity replicate with the highest useful value. +define <4 x i32> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate with the lowest useful positive value. +define <4 x i32> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate with the highest in-range value. +define <4 x i32> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. 
+define <4 x i32> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <4 x i32> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <4 x i32> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate with the highest useful negative value. +define <4 x i32> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <4 x i32> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <4 x i32> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. +define <4 x i32> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. +define <4 x i32> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <4 x i32> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <4 x i32> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <4 x i32> +} + +; Repeat f14 with undefs optimistically treated as 0, 32767. +define <4 x i32> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <4 x i32> +} + +; Repeat f18 with undefs optimistically treated as -2, -1. +define <4 x i32> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <4 x i32> +} diff --git a/test/CodeGen/SystemZ/vec-const-10.ll b/test/CodeGen/SystemZ/vec-const-10.ll new file mode 100644 index 00000000000..0613b69a277 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-10.ll @@ -0,0 +1,169 @@ +; Test vector replicates, v2i64 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a byte-granularity replicate with the lowest useful value. +define <2 x i64> @f1() { +; CHECK-LABEL: f1: +; CHECK: vrepib %v24, 1 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a byte-granularity replicate with an arbitrary value. +define <2 x i64> @f2() { +; CHECK-LABEL: f2: +; CHECK: vrepib %v24, -55 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a byte-granularity replicate with the highest useful value. +define <2 x i64> @f3() { +; CHECK-LABEL: f3: +; CHECK: vrepib %v24, -2 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a halfword-granularity replicate with the lowest useful value. +define <2 x i64> @f4() { +; CHECK-LABEL: f4: +; CHECK: vrepih %v24, 1 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a halfword-granularity replicate with an arbitrary value. +define <2 x i64> @f5() { +; CHECK-LABEL: f5: +; CHECK: vrepih %v24, 25650 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a halfword-granularity replicate with the highest useful value. 
+define <2 x i64> @f6() { +; CHECK-LABEL: f6: +; CHECK: vrepih %v24, -2 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate with the lowest useful positive value. +define <2 x i64> @f7() { +; CHECK-LABEL: f7: +; CHECK: vrepif %v24, 1 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate with the highest in-range value. +define <2 x i64> @f8() { +; CHECK-LABEL: f8: +; CHECK: vrepif %v24, 32767 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate with the next highest value. +; This cannot use VREPIF. +define <2 x i64> @f9() { +; CHECK-LABEL: f9: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate with the lowest in-range value. +define <2 x i64> @f10() { +; CHECK-LABEL: f10: +; CHECK: vrepif %v24, -32768 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate with the next lowest value. +; This cannot use VREPIF. +define <2 x i64> @f11() { +; CHECK-LABEL: f11: +; CHECK-NOT: vrepif +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate with the highest useful negative value. +define <2 x i64> @f12() { +; CHECK-LABEL: f12: +; CHECK: vrepif %v24, -2 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate with the lowest useful positive +; value. +define <2 x i64> @f13() { +; CHECK-LABEL: f13: +; CHECK: vrepig %v24, 1 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate with the highest in-range value. +define <2 x i64> @f14() { +; CHECK-LABEL: f14: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate with the next highest value. +; This cannot use VREPIG. +define <2 x i64> @f15() { +; CHECK-LABEL: f15: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate with the lowest in-range value. +define <2 x i64> @f16() { +; CHECK-LABEL: f16: +; CHECK: vrepig %v24, -32768 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate with the next lowest value. +; This cannot use VREPIG. +define <2 x i64> @f17() { +; CHECK-LABEL: f17: +; CHECK-NOT: vrepig +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate with the highest useful negative +; value. +define <2 x i64> @f18() { +; CHECK-LABEL: f18: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <2 x i64> +} + +; Repeat f14 with undefs optimistically treated as 32767. +define <2 x i64> @f19() { +; CHECK-LABEL: f19: +; CHECK: vrepig %v24, 32767 +; CHECK: br %r14 + ret <2 x i64> +} + +; Repeat f18 with undefs optimistically treated as -2. +define <2 x i64> @f20() { +; CHECK-LABEL: f20: +; CHECK: vrepig %v24, -2 +; CHECK: br %r14 + ret <2 x i64> +} diff --git a/test/CodeGen/SystemZ/vec-const-13.ll b/test/CodeGen/SystemZ/vec-const-13.ll new file mode 100644 index 00000000000..2cc425252c2 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-13.ll @@ -0,0 +1,193 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v16i8 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. +define <16 x i8> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate that has the lower 17 bits set. 
+define <16 x i8> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate that has the upper 15 bits set. +define <16 x i8> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate that has middle bits set. +define <16 x i8> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 12, 17 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <16 x i8> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 17, 15 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <16 x i8> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. +define <16 x i8> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. +define <16 x i8> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <16 x i8> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <16 x i8> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. +define <16 x i8> @f10() { +; CHECK-LABEL: f10: +; CHECK: vgmg %v24, 18, 0 +; CHECK: br %r14 + ret <16 x i8> +} + +; Retest f1 with arbitrary undefs instead of 0s. +define <16 x i8> @f11() { +; CHECK-LABEL: f11: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <16 x i8> +} + +; Try a case where we want consistent undefs to be treated as 0. +define <16 x i8> @f12() { +; CHECK-LABEL: f12: +; CHECK: vgmf %v24, 15, 23 +; CHECK: br %r14 + ret <16 x i8> +} + +; ...and again with the lower bits of the replicated constant. +define <16 x i8> @f13() { +; CHECK-LABEL: f13: +; CHECK: vgmf %v24, 15, 22 +; CHECK: br %r14 + ret <16 x i8> +} + +; Try a case where we want consistent undefs to be treated as -1. +define <16 x i8> @f14() { +; CHECK-LABEL: f14: +; CHECK: vgmf %v24, 28, 8 +; CHECK: br %r14 + ret <16 x i8> +} + +; ...and again with the lower bits of the replicated constant. +define <16 x i8> @f15() { +; CHECK-LABEL: f15: +; CHECK: vgmf %v24, 18, 3 +; CHECK: br %r14 + ret <16 x i8> +} + +; Repeat f9 with arbitrary undefs. +define <16 x i8> @f16() { +; CHECK-LABEL: f16: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <16 x i8> +} + +; Try a case where we want some consistent undefs to be treated as 0 +; and some to be treated as 255. +define <16 x i8> @f17() { +; CHECK-LABEL: f17: +; CHECK: vgmg %v24, 23, 35 +; CHECK: br %r14 + ret <16 x i8> +} diff --git a/test/CodeGen/SystemZ/vec-const-14.ll b/test/CodeGen/SystemZ/vec-const-14.ll new file mode 100644 index 00000000000..0e3f124dbf6 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-14.ll @@ -0,0 +1,113 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v8i16 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. +define <8 x i16> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate that has the lower 17 bits set. 
+define <8 x i16> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate that has the upper 15 bits set. +define <8 x i16> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate that has middle bits set. +define <8 x i16> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 12, 17 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <8 x i16> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 17, 15 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <8 x i16> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. +define <8 x i16> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. +define <8 x i16> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <8 x i16> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <8 x i16> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. +define <8 x i16> @f10() { +; CHECK-LABEL: f10: +; CHECK: vgmg %v24, 18, 0 +; CHECK: br %r14 + ret <8 x i16> +} + +; Retest f1 with arbitrary undefs instead of 0s. +define <8 x i16> @f11() { +; CHECK-LABEL: f11: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <8 x i16> +} + +; ...likewise f9. +define <8 x i16> @f12() { +; CHECK-LABEL: f12: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <8 x i16> +} diff --git a/test/CodeGen/SystemZ/vec-const-15.ll b/test/CodeGen/SystemZ/vec-const-15.ll new file mode 100644 index 00000000000..cec445efe89 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-15.ll @@ -0,0 +1,85 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v4i32 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. +define <4 x i32> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate that has the lower 17 bits set. +define <4 x i32> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate that has the upper 15 bits set. +define <4 x i32> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate that has middle bits set. +define <4 x i32> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 12, 17 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <4 x i32> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 17, 15 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <4 x i32> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. 
+define <4 x i32> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. +define <4 x i32> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <4 x i32> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <4 x i32> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. +define <4 x i32> @f10() { +; CHECK-LABEL: f10: +; CHECK: vgmg %v24, 18, 0 +; CHECK: br %r14 + ret <4 x i32> +} diff --git a/test/CodeGen/SystemZ/vec-const-16.ll b/test/CodeGen/SystemZ/vec-const-16.ll new file mode 100644 index 00000000000..1ab7de2761c --- /dev/null +++ b/test/CodeGen/SystemZ/vec-const-16.ll @@ -0,0 +1,85 @@ +; Test vector replicates that use VECTOR GENERATE MASK, v2i64 version. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a word-granularity replicate with the lowest value that cannot use +; VREPIF. +define <2 x i64> @f1() { +; CHECK-LABEL: f1: +; CHECK: vgmf %v24, 16, 16 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate that has the lower 17 bits set. +define <2 x i64> @f2() { +; CHECK-LABEL: f2: +; CHECK: vgmf %v24, 15, 31 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate that has the upper 15 bits set. +define <2 x i64> @f3() { +; CHECK-LABEL: f3: +; CHECK: vgmf %v24, 0, 14 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate that has middle bits set. +define <2 x i64> @f4() { +; CHECK-LABEL: f4: +; CHECK: vgmf %v24, 12, 17 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a word-granularity replicate with a wrap-around mask. +define <2 x i64> @f5() { +; CHECK-LABEL: f5: +; CHECK: vgmf %v24, 17, 15 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate with the lowest value that cannot +; use VREPIG. +define <2 x i64> @f6() { +; CHECK-LABEL: f6: +; CHECK: vgmg %v24, 48, 48 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate that has the lower 22 bits set. +define <2 x i64> @f7() { +; CHECK-LABEL: f7: +; CHECK: vgmg %v24, 42, 63 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate that has the upper 45 bits set. +define <2 x i64> @f8() { +; CHECK-LABEL: f8: +; CHECK: vgmg %v24, 0, 44 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate that has middle bits set. +define <2 x i64> @f9() { +; CHECK-LABEL: f9: +; CHECK: vgmg %v24, 31, 42 +; CHECK: br %r14 + ret <2 x i64> +} + +; Test a doubleword-granularity replicate with a wrap-around mask. 
+define <2 x i64> @f10() {
+; CHECK-LABEL: f10:
+; CHECK: vgmg %v24, 18, 0
+; CHECK: br %r14
+  ret <2 x i64> <i64 -9223301668110598145, i64 -9223301668110598145>
+}
diff --git a/test/CodeGen/SystemZ/vec-ctlz-01.ll b/test/CodeGen/SystemZ/vec-ctlz-01.ll
new file mode 100644
index 00000000000..f6502202ef5
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-ctlz-01.ll
@@ -0,0 +1,81 @@
+; Test vector count leading zeros.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %src, i1 %is_zero_undef)
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %src, i1 %is_zero_undef)
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %src, i1 %is_zero_undef)
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %src, i1 %is_zero_undef)
+
+define <16 x i8> @f1(<16 x i8> %a) {
+; CHECK-LABEL: f1:
+; CHECK: vclzb %v24, %v24
+; CHECK: br %r14
+
+  %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @f2(<16 x i8> %a) {
+; CHECK-LABEL: f2:
+; CHECK: vclzb %v24, %v24
+; CHECK: br %r14
+
+  %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @f3(<8 x i16> %a) {
+; CHECK-LABEL: f3:
+; CHECK: vclzh %v24, %v24
+; CHECK: br %r14
+
+  %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @f4(<8 x i16> %a) {
+; CHECK-LABEL: f4:
+; CHECK: vclzh %v24, %v24
+; CHECK: br %r14
+
+  %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @f5(<4 x i32> %a) {
+; CHECK-LABEL: f5:
+; CHECK: vclzf %v24, %v24
+; CHECK: br %r14
+
+  %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @f6(<4 x i32> %a) {
+; CHECK-LABEL: f6:
+; CHECK: vclzf %v24, %v24
+; CHECK: br %r14
+
+  %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @f7(<2 x i64> %a) {
+; CHECK-LABEL: f7:
+; CHECK: vclzg %v24, %v24
+; CHECK: br %r14
+
+  %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @f8(<2 x i64> %a) {
+; CHECK-LABEL: f8:
+; CHECK: vclzg %v24, %v24
+; CHECK: br %r14
+
+  %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
+  ret <2 x i64> %res
+}
+
diff --git a/test/CodeGen/SystemZ/vec-ctpop-01.ll b/test/CodeGen/SystemZ/vec-ctpop-01.ll
new file mode 100644
index 00000000000..0056af73a2e
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-ctpop-01.ll
@@ -0,0 +1,53 @@
+; Test vector population-count instruction.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+
+define <16 x i8> @f1(<16 x i8> %a) {
+; CHECK-LABEL: f1:
+; CHECK: vpopct %v24, %v24, 0
+; CHECK: br %r14
+
+  %popcnt = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+  ret <16 x i8> %popcnt
+}
+
+define <8 x i16> @f2(<8 x i16> %a) {
+; CHECK-LABEL: f2:
+; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0
+; CHECK: veslh [[T2:%v[0-9]+]], [[T1]], 8
+; CHECK: vah [[T3:%v[0-9]+]], [[T1]], [[T2]]
+; CHECK: vesrlh %v24, [[T3]], 8
+; CHECK: br %r14
+
+  %popcnt = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+  ret <8 x i16> %popcnt
+}
+
+define <4 x i32> @f3(<4 x i32> %a) {
+; CHECK-LABEL: f3:
+; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0
+; CHECK: vgbm [[T2:%v[0-9]+]], 0
+; CHECK: vsumb %v24, [[T1]], [[T2]]
+; CHECK: br %r14
+
+  %popcnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+  ret <4 x i32> %popcnt
+}
+
+define <2 x i64> @f4(<2 x i64> %a) {
+; CHECK-LABEL: f4:
+; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0
+; CHECK: vgbm [[T2:%v[0-9]+]], 0
+; CHECK: vsumb [[T3:%v[0-9]+]], [[T1]], [[T2]]
+; CHECK: vsumgf %v24, [[T3]], [[T2]]
+; CHECK: br %r14
+
+  %popcnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+  ret <2 x i64> %popcnt
+}
+
diff --git a/test/CodeGen/SystemZ/vec-cttz-01.ll b/test/CodeGen/SystemZ/vec-cttz-01.ll
new file mode 100644
index 00000000000..00a0d21b42f
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-cttz-01.ll
@@ -0,0 +1,81 @@
+; Test vector count trailing zeros.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+declare <16 x i8> @llvm.cttz.v16i8(<16 x i8> %src, i1 %is_zero_undef)
+declare <8 x i16> @llvm.cttz.v8i16(<8 x i16> %src, i1 %is_zero_undef)
+declare <4 x i32> @llvm.cttz.v4i32(<4 x i32> %src, i1 %is_zero_undef)
+declare <2 x i64> @llvm.cttz.v2i64(<2 x i64> %src, i1 %is_zero_undef)
+
+define <16 x i8> @f1(<16 x i8> %a) {
+; CHECK-LABEL: f1:
+; CHECK: vctzb %v24, %v24
+; CHECK: br %r14
+
+  %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @f2(<16 x i8> %a) {
+; CHECK-LABEL: f2:
+; CHECK: vctzb %v24, %v24
+; CHECK: br %r14
+
+  %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @f3(<8 x i16> %a) {
+; CHECK-LABEL: f3:
+; CHECK: vctzh %v24, %v24
+; CHECK: br %r14
+
+  %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @f4(<8 x i16> %a) {
+; CHECK-LABEL: f4:
+; CHECK: vctzh %v24, %v24
+; CHECK: br %r14
+
+  %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @f5(<4 x i32> %a) {
+; CHECK-LABEL: f5:
+; CHECK: vctzf %v24, %v24
+; CHECK: br %r14
+
+  %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @f6(<4 x i32> %a) {
+; CHECK-LABEL: f6:
+; CHECK: vctzf %v24, %v24
+; CHECK: br %r14
+
+  %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @f7(<2 x i64> %a) {
+; CHECK-LABEL: f7:
+; CHECK: vctzg %v24, %v24
+; CHECK: br %r14
+
+  %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @f8(<2 x i64> %a) {
+; CHECK-LABEL: f8:
+; CHECK: vctzg %v24, %v24
+; CHECK: br %r14
+
+  %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+  ret <2 x i64> %res
+}
+
diff --git a/test/CodeGen/SystemZ/vec-div-01.ll b/test/CodeGen/SystemZ/vec-div-01.ll
new file mode 100644
index 00000000000..3c5ec4f54ee
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-div-01.ll
@@ -0,0 +1,62 @@
+; Test vector division. There is no native support for this, so it's really
+; a test of the operation legalization code.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 division.
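+; (Since the z13 has no vector divide instruction, each division here is
+; scalarized: the elements are moved to general registers, divided there,
+; and the quotients reassembled with VLVGP plus per-element VLVG inserts.
+; The byte elements 7 and 15 are not checked individually, presumably
+; because the initial VLVGP already supplies the low byte of each
+; doubleword.)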
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vlvgp [[REG:%v[0-9]+]], +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 0 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 1 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 2 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 3 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 4 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 5 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 6 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 8 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 9 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 10 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 11 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 12 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 13 +; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 14 +; CHECK: br %r14 + %ret = sdiv <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 division. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vlvgp [[REG:%v[0-9]+]], +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 0 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 1 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 2 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 4 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 5 +; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 6 +; CHECK: br %r14 + %ret = sdiv <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 division. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vlvgp [[REG:%v[0-9]+]], +; CHECK-DAG: vlvgf [[REG]], {{%r[0-5]}}, 0 +; CHECK-DAG: vlvgf [[REG]], {{%r[0-5]}}, 2 +; CHECK: br %r14 + %ret = sdiv <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 division. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vlvgp %v24, +; CHECK: br %r14 + %ret = sdiv <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-max-01.ll b/test/CodeGen/SystemZ/vec-max-01.ll new file mode 100644 index 00000000000..ca6f08aa493 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-max-01.ll @@ -0,0 +1,83 @@ +; Test v16i8 maximum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with sle. +define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with sgt. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with sge. +define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with ult. 
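+; (From here on the predicates are unsigned, so the signed VMXB pattern no
+; longer applies and the logical maximum, VMXLB, is selected instead.)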
+define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with ule. +define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with ugt. +define <16 x i8> @f7(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with uge. +define <16 x i8> @f8(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <16 x i8> %val1, %val2 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} diff --git a/test/CodeGen/SystemZ/vec-max-02.ll b/test/CodeGen/SystemZ/vec-max-02.ll new file mode 100644 index 00000000000..2c61603b6f3 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-max-02.ll @@ -0,0 +1,83 @@ +; Test v8i16 maximum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with sle. +define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with sgt. +define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with sge. +define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with ult. +define <8 x i16> @f5(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with ule. +define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with ugt. +define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with uge. 
+define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <8 x i16> %val1, %val2 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} diff --git a/test/CodeGen/SystemZ/vec-max-03.ll b/test/CodeGen/SystemZ/vec-max-03.ll new file mode 100644 index 00000000000..a4387948399 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-max-03.ll @@ -0,0 +1,83 @@ +; Test v4i32 maximum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with sle. +define <4 x i32> @f2(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with sgt. +define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with sge. +define <4 x i32> @f4(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with ult. +define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with ule. +define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with ugt. +define <4 x i32> @f7(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with uge. +define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <4 x i32> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} diff --git a/test/CodeGen/SystemZ/vec-max-04.ll b/test/CodeGen/SystemZ/vec-max-04.ll new file mode 100644 index 00000000000..ab7c6239127 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-max-04.ll @@ -0,0 +1,83 @@ +; Test v2i64 maximum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <2 x i64> @f1(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with sle. 
+define <2 x i64> @f2(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with sgt. +define <2 x i64> @f3(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with sge. +define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with ult. +define <2 x i64> @f5(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with ule. +define <2 x i64> @f6(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with ugt. +define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with uge. +define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <2 x i64> %val1, %val2 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-min-01.ll b/test/CodeGen/SystemZ/vec-min-01.ll new file mode 100644 index 00000000000..255dc57e113 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-min-01.ll @@ -0,0 +1,83 @@ +; Test v16i8 minimum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with sle. +define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with sgt. +define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with sge. +define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with ult. 
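+; (As in the maximum tests, the unsigned predicates from here on select
+; the logical form, VMNLB.)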
+define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with ule. +define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 + ret <16 x i8> %ret +} + +; Test with ugt. +define <16 x i8> @f7(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} + +; Test with uge. +define <16 x i8> @f8(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <16 x i8> %val2, %val1 + %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 + ret <16 x i8> %ret +} diff --git a/test/CodeGen/SystemZ/vec-min-02.ll b/test/CodeGen/SystemZ/vec-min-02.ll new file mode 100644 index 00000000000..cad8a61506c --- /dev/null +++ b/test/CodeGen/SystemZ/vec-min-02.ll @@ -0,0 +1,83 @@ +; Test v8i16 minimum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with sle. +define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with sgt. +define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with sge. +define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with ult. +define <8 x i16> @f5(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with ule. +define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 + ret <8 x i16> %ret +} + +; Test with ugt. +define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} + +; Test with uge. 
+define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <8 x i16> %val2, %val1 + %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 + ret <8 x i16> %ret +} diff --git a/test/CodeGen/SystemZ/vec-min-03.ll b/test/CodeGen/SystemZ/vec-min-03.ll new file mode 100644 index 00000000000..febac50aa46 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-min-03.ll @@ -0,0 +1,83 @@ +; Test v4i32 minimum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with sle. +define <4 x i32> @f2(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with sgt. +define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with sge. +define <4 x i32> @f4(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with ult. +define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with ule. +define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 + ret <4 x i32> %ret +} + +; Test with ugt. +define <4 x i32> @f7(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} + +; Test with uge. +define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <4 x i32> %val2, %val1 + %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 + ret <4 x i32> %ret +} diff --git a/test/CodeGen/SystemZ/vec-min-04.ll b/test/CodeGen/SystemZ/vec-min-04.ll new file mode 100644 index 00000000000..765ce1956b5 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-min-04.ll @@ -0,0 +1,83 @@ +; Test v2i64 minimum. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define <2 x i64> @f1(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp slt <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with sle. 
+define <2 x i64> @f2(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sle <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with sgt. +define <2 x i64> @f3(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sgt <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with sge. +define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp sge <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with ult. +define <2 x i64> @f5(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f5: +; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ult <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with ule. +define <2 x i64> @f6(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f6: +; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ule <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 + ret <2 x i64> %ret +} + +; Test with ugt. +define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp ugt <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} + +; Test with uge. +define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} +; CHECK: br %r14 + %cmp = icmp uge <2 x i64> %val2, %val1 + %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-01.ll b/test/CodeGen/SystemZ/vec-move-01.ll new file mode 100644 index 00000000000..952e5a42126 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-01.ll @@ -0,0 +1,35 @@ +; Test vector register moves. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 moves. +define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vlr %v24, %v26 +; CHECK: br %r14 + ret <16 x i8> %val2 +} + +; Test v8i16 moves. +define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vlr %v24, %v26 +; CHECK: br %r14 + ret <8 x i16> %val2 +} + +; Test v4i32 moves. +define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vlr %v24, %v26 +; CHECK: br %r14 + ret <4 x i32> %val2 +} + +; Test v2i64 moves. +define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vlr %v24, %v26 +; CHECK: br %r14 + ret <2 x i64> %val2 +} diff --git a/test/CodeGen/SystemZ/vec-move-02.ll b/test/CodeGen/SystemZ/vec-move-02.ll new file mode 100644 index 00000000000..b7b3ab6798d --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-02.ll @@ -0,0 +1,93 @@ +; Test vector loads. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 loads. +define <16 x i8> @f1(<16 x i8> *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vl %v24, 0(%r2) +; CHECK: br %r14 + %ret = load <16 x i8>, <16 x i8> *%ptr + ret <16 x i8> %ret +} + +; Test v8i16 loads. 
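+; (The element type never changes the opcode in these tests: VL is a plain
+; 16-byte load, so v16i8 through v2i64 all use the same instruction.)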
+define <8 x i16> @f2(<8 x i16> *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vl %v24, 0(%r2) +; CHECK: br %r14 + %ret = load <8 x i16>, <8 x i16> *%ptr + ret <8 x i16> %ret +} + +; Test v4i32 loads. +define <4 x i32> @f3(<4 x i32> *%ptr) { +; CHECK-LABEL: f3: +; CHECK: vl %v24, 0(%r2) +; CHECK: br %r14 + %ret = load <4 x i32>, <4 x i32> *%ptr + ret <4 x i32> %ret +} + +; Test v2i64 loads. +define <2 x i64> @f4(<2 x i64> *%ptr) { +; CHECK-LABEL: f4: +; CHECK: vl %v24, 0(%r2) +; CHECK: br %r14 + %ret = load <2 x i64>, <2 x i64> *%ptr + ret <2 x i64> %ret +} + +; Test the highest aligned in-range offset. +define <16 x i8> @f7(<16 x i8> *%base) { +; CHECK-LABEL: f7: +; CHECK: vl %v24, 4080(%r2) +; CHECK: br %r14 + %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 255 + %ret = load <16 x i8>, <16 x i8> *%ptr + ret <16 x i8> %ret +} + +; Test the highest unaligned in-range offset. +define <16 x i8> @f8(i8 *%base) { +; CHECK-LABEL: f8: +; CHECK: vl %v24, 4095(%r2) +; CHECK: br %r14 + %addr = getelementptr i8, i8 *%base, i64 4095 + %ptr = bitcast i8 *%addr to <16 x i8> * + %ret = load <16 x i8>, <16 x i8> *%ptr, align 1 + ret <16 x i8> %ret +} + +; Test the next offset up, which requires separate address logic, +define <16 x i8> @f9(<16 x i8> *%base) { +; CHECK-LABEL: f9: +; CHECK: aghi %r2, 4096 +; CHECK: vl %v24, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 256 + %ret = load <16 x i8>, <16 x i8> *%ptr + ret <16 x i8> %ret +} + +; Test negative offsets, which also require separate address logic, +define <16 x i8> @f10(<16 x i8> *%base) { +; CHECK-LABEL: f10: +; CHECK: aghi %r2, -16 +; CHECK: vl %v24, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 -1 + %ret = load <16 x i8>, <16 x i8> *%ptr + ret <16 x i8> %ret +} + +; Check that indexes are allowed. +define <16 x i8> @f11(i8 *%base, i64 %index) { +; CHECK-LABEL: f11: +; CHECK: vl %v24, 0(%r3,%r2) +; CHECK: br %r14 + %addr = getelementptr i8, i8 *%base, i64 %index + %ptr = bitcast i8 *%addr to <16 x i8> * + %ret = load <16 x i8>, <16 x i8> *%ptr, align 1 + ret <16 x i8> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-03.ll b/test/CodeGen/SystemZ/vec-move-03.ll new file mode 100644 index 00000000000..ddce4ef209a --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-03.ll @@ -0,0 +1,93 @@ +; Test vector stores. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 stores. +define void @f1(<16 x i8> %val, <16 x i8> *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vst %v24, 0(%r2) +; CHECK: br %r14 + store <16 x i8> %val, <16 x i8> *%ptr + ret void +} + +; Test v8i16 stores. +define void @f2(<8 x i16> %val, <8 x i16> *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vst %v24, 0(%r2) +; CHECK: br %r14 + store <8 x i16> %val, <8 x i16> *%ptr + ret void +} + +; Test v4i32 stores. +define void @f3(<4 x i32> %val, <4 x i32> *%ptr) { +; CHECK-LABEL: f3: +; CHECK: vst %v24, 0(%r2) +; CHECK: br %r14 + store <4 x i32> %val, <4 x i32> *%ptr + ret void +} + +; Test v2i64 stores. +define void @f4(<2 x i64> %val, <2 x i64> *%ptr) { +; CHECK-LABEL: f4: +; CHECK: vst %v24, 0(%r2) +; CHECK: br %r14 + store <2 x i64> %val, <2 x i64> *%ptr + ret void +} + +; Test the highest aligned in-range offset. +define void @f7(<16 x i8> %val, <16 x i8> *%base) { +; CHECK-LABEL: f7: +; CHECK: vst %v24, 4080(%r2) +; CHECK: br %r14 + %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 255 + store <16 x i8> %val, <16 x i8> *%ptr + ret void +} + +; Test the highest unaligned in-range offset. 
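+; (VST, like VL, is a VRX-format instruction with a 12-bit unsigned
+; displacement, so byte offsets 0 through 4095 fold into the address;
+; anything outside that range needs the separate AGHI address arithmetic
+; tested below.)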
+define void @f8(<16 x i8> %val, i8 *%base) { +; CHECK-LABEL: f8: +; CHECK: vst %v24, 4095(%r2) +; CHECK: br %r14 + %addr = getelementptr i8, i8 *%base, i64 4095 + %ptr = bitcast i8 *%addr to <16 x i8> * + store <16 x i8> %val, <16 x i8> *%ptr, align 1 + ret void +} + +; Test the next offset up, which requires separate address logic, +define void @f9(<16 x i8> %val, <16 x i8> *%base) { +; CHECK-LABEL: f9: +; CHECK: aghi %r2, 4096 +; CHECK: vst %v24, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 256 + store <16 x i8> %val, <16 x i8> *%ptr + ret void +} + +; Test negative offsets, which also require separate address logic, +define void @f10(<16 x i8> %val, <16 x i8> *%base) { +; CHECK-LABEL: f10: +; CHECK: aghi %r2, -16 +; CHECK: vst %v24, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr <16 x i8>, <16 x i8> *%base, i64 -1 + store <16 x i8> %val, <16 x i8> *%ptr + ret void +} + +; Check that indexes are allowed. +define void @f11(<16 x i8> %val, i8 *%base, i64 %index) { +; CHECK-LABEL: f11: +; CHECK: vst %v24, 0(%r3,%r2) +; CHECK: br %r14 + %addr = getelementptr i8, i8 *%base, i64 %index + %ptr = bitcast i8 *%addr to <16 x i8> * + store <16 x i8> %val, <16 x i8> *%ptr, align 1 + ret void +} diff --git a/test/CodeGen/SystemZ/vec-move-04.ll b/test/CodeGen/SystemZ/vec-move-04.ll new file mode 100644 index 00000000000..f43c0b71491 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-04.ll @@ -0,0 +1,121 @@ +; Test vector insertion of register variables. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into the first element. +define <16 x i8> @f1(<16 x i8> %val, i8 %element) { +; CHECK-LABEL: f1: +; CHECK: vlvgb %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 %element, i32 0 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into the last element. +define <16 x i8> @f2(<16 x i8> %val, i8 %element) { +; CHECK-LABEL: f2: +; CHECK: vlvgb %v24, %r2, 15 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 %element, i32 15 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into a variable element. +define <16 x i8> @f3(<16 x i8> %val, i8 %element, i32 %index) { +; CHECK-LABEL: f3: +; CHECK: vlvgb %v24, %r2, 0(%r3) +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 %element, i32 %index + ret <16 x i8> %ret +} + +; Test v8i16 insertion into the first element. +define <8 x i16> @f4(<8 x i16> %val, i16 %element) { +; CHECK-LABEL: f4: +; CHECK: vlvgh %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 %element, i32 0 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into the last element. +define <8 x i16> @f5(<8 x i16> %val, i16 %element) { +; CHECK-LABEL: f5: +; CHECK: vlvgh %v24, %r2, 7 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 %element, i32 7 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into a variable element. +define <8 x i16> @f6(<8 x i16> %val, i16 %element, i32 %index) { +; CHECK-LABEL: f6: +; CHECK: vlvgh %v24, %r2, 0(%r3) +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 %element, i32 %index + ret <8 x i16> %ret +} + +; Test v4i32 insertion into the first element. +define <4 x i32> @f7(<4 x i32> %val, i32 %element) { +; CHECK-LABEL: f7: +; CHECK: vlvgf %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 %element, i32 0 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into the last element. 
+define <4 x i32> @f8(<4 x i32> %val, i32 %element) { +; CHECK-LABEL: f8: +; CHECK: vlvgf %v24, %r2, 3 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 %element, i32 3 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into a variable element. +define <4 x i32> @f9(<4 x i32> %val, i32 %element, i32 %index) { +; CHECK-LABEL: f9: +; CHECK: vlvgf %v24, %r2, 0(%r3) +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 %element, i32 %index + ret <4 x i32> %ret +} + +; Test v2i64 insertion into the first element. +define <2 x i64> @f10(<2 x i64> %val, i64 %element) { +; CHECK-LABEL: f10: +; CHECK: vlvgg %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 %element, i32 0 + ret <2 x i64> %ret +} + +; Test v2i64 insertion into the last element. +define <2 x i64> @f11(<2 x i64> %val, i64 %element) { +; CHECK-LABEL: f11: +; CHECK: vlvgg %v24, %r2, 1 +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 %element, i32 1 + ret <2 x i64> %ret +} + +; Test v2i64 insertion into a variable element. +define <2 x i64> @f12(<2 x i64> %val, i64 %element, i32 %index) { +; CHECK-LABEL: f12: +; CHECK: vlvgg %v24, %r2, 0(%r3) +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 %element, i32 %index + ret <2 x i64> %ret +} + +; Test v16i8 insertion into a variable element plus one. +define <16 x i8> @f19(<16 x i8> %val, i8 %element, i32 %index) { +; CHECK-LABEL: f19: +; CHECK: vlvgb %v24, %r2, 1(%r3) +; CHECK: br %r14 + %add = add i32 %index, 1 + %ret = insertelement <16 x i8> %val, i8 %element, i32 %add + ret <16 x i8> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-05.ll b/test/CodeGen/SystemZ/vec-move-05.ll new file mode 100644 index 00000000000..60a0666c2f9 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-05.ll @@ -0,0 +1,161 @@ +; Test vector extraction. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 extraction of the first element. +define i8 @f1(<16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: vlgvb %r2, %v24, 0 +; CHECK: br %r14 + %ret = extractelement <16 x i8> %val, i32 0 + ret i8 %ret +} + +; Test v16i8 extraction of the last element. +define i8 @f2(<16 x i8> %val) { +; CHECK-LABEL: f2: +; CHECK: vlgvb %r2, %v24, 15 +; CHECK: br %r14 + %ret = extractelement <16 x i8> %val, i32 15 + ret i8 %ret +} + +; Test v16i8 extractions of an absurd element number. This must compile +; but we don't care what it does. +define i8 @f3(<16 x i8> %val) { +; CHECK-LABEL: f3: +; CHECK-NOT: vlgvb %r2, %v24, 100000 +; CHECK: br %r14 + %ret = extractelement <16 x i8> %val, i32 100000 + ret i8 %ret +} + +; Test v16i8 extraction of a variable element. +define i8 @f4(<16 x i8> %val, i32 %index) { +; CHECK-LABEL: f4: +; CHECK: vlgvb %r2, %v24, 0(%r2) +; CHECK: br %r14 + %ret = extractelement <16 x i8> %val, i32 %index + ret i8 %ret +} + +; Test v8i16 extraction of the first element. +define i16 @f5(<8 x i16> %val) { +; CHECK-LABEL: f5: +; CHECK: vlgvh %r2, %v24, 0 +; CHECK: br %r14 + %ret = extractelement <8 x i16> %val, i32 0 + ret i16 %ret +} + +; Test v8i16 extraction of the last element. +define i16 @f6(<8 x i16> %val) { +; CHECK-LABEL: f6: +; CHECK: vlgvh %r2, %v24, 7 +; CHECK: br %r14 + %ret = extractelement <8 x i16> %val, i32 7 + ret i16 %ret +} + +; Test v8i16 extractions of an absurd element number. This must compile +; but we don't care what it does. 
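+; (An out-of-bounds extractelement index produces an undefined result, so
+; the backend is free to emit anything; the CHECK-NOT lines only verify
+; that the impossible element number is not encoded into a VLGV
+; instruction.)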
+define i16 @f7(<8 x i16> %val) { +; CHECK-LABEL: f7: +; CHECK-NOT: vlgvh %r2, %v24, 100000 +; CHECK: br %r14 + %ret = extractelement <8 x i16> %val, i32 100000 + ret i16 %ret +} + +; Test v8i16 extraction of a variable element. +define i16 @f8(<8 x i16> %val, i32 %index) { +; CHECK-LABEL: f8: +; CHECK: vlgvh %r2, %v24, 0(%r2) +; CHECK: br %r14 + %ret = extractelement <8 x i16> %val, i32 %index + ret i16 %ret +} + +; Test v4i32 extraction of the first element. +define i32 @f9(<4 x i32> %val) { +; CHECK-LABEL: f9: +; CHECK: vlgvf %r2, %v24, 0 +; CHECK: br %r14 + %ret = extractelement <4 x i32> %val, i32 0 + ret i32 %ret +} + +; Test v4i32 extraction of the last element. +define i32 @f10(<4 x i32> %val) { +; CHECK-LABEL: f10: +; CHECK: vlgvf %r2, %v24, 3 +; CHECK: br %r14 + %ret = extractelement <4 x i32> %val, i32 3 + ret i32 %ret +} + +; Test v4i32 extractions of an absurd element number. This must compile +; but we don't care what it does. +define i32 @f11(<4 x i32> %val) { +; CHECK-LABEL: f11: +; CHECK-NOT: vlgvf %r2, %v24, 100000 +; CHECK: br %r14 + %ret = extractelement <4 x i32> %val, i32 100000 + ret i32 %ret +} + +; Test v4i32 extraction of a variable element. +define i32 @f12(<4 x i32> %val, i32 %index) { +; CHECK-LABEL: f12: +; CHECK: vlgvf %r2, %v24, 0(%r2) +; CHECK: br %r14 + %ret = extractelement <4 x i32> %val, i32 %index + ret i32 %ret +} + +; Test v2i64 extraction of the first element. +define i64 @f13(<2 x i64> %val) { +; CHECK-LABEL: f13: +; CHECK: vlgvg %r2, %v24, 0 +; CHECK: br %r14 + %ret = extractelement <2 x i64> %val, i32 0 + ret i64 %ret +} + +; Test v2i64 extraction of the last element. +define i64 @f14(<2 x i64> %val) { +; CHECK-LABEL: f14: +; CHECK: vlgvg %r2, %v24, 1 +; CHECK: br %r14 + %ret = extractelement <2 x i64> %val, i32 1 + ret i64 %ret +} + +; Test v2i64 extractions of an absurd element number. This must compile +; but we don't care what it does. +define i64 @f15(<2 x i64> %val) { +; CHECK-LABEL: f15: +; CHECK-NOT: vlgvg %r2, %v24, 100000 +; CHECK: br %r14 + %ret = extractelement <2 x i64> %val, i32 100000 + ret i64 %ret +} + +; Test v2i64 extraction of a variable element. +define i64 @f16(<2 x i64> %val, i32 %index) { +; CHECK-LABEL: f16: +; CHECK: vlgvg %r2, %v24, 0(%r2) +; CHECK: br %r14 + %ret = extractelement <2 x i64> %val, i32 %index + ret i64 %ret +} + +; Test v16i8 extraction of a variable element with an offset. +define i8 @f27(<16 x i8> %val, i32 %index) { +; CHECK-LABEL: f27: +; CHECK: vlgvb %r2, %v24, 1(%r2) +; CHECK: br %r14 + %add = add i32 %index, 1 + %ret = extractelement <16 x i8> %val, i32 %add + ret i8 %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-06.ll b/test/CodeGen/SystemZ/vec-move-06.ll new file mode 100644 index 00000000000..de3960cad95 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-06.ll @@ -0,0 +1,13 @@ +; Test vector builds using VLVGP. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test the basic v2i64 usage. +define <2 x i64> @f1(i64 %a, i64 %b) { +; CHECK-LABEL: f1: +; CHECK: vlvgp %v24, %r2, %r3 +; CHECK: br %r14 + %veca = insertelement <2 x i64> undef, i64 %a, i32 0 + %vecb = insertelement <2 x i64> %veca, i64 %b, i32 1 + ret <2 x i64> %vecb +} diff --git a/test/CodeGen/SystemZ/vec-move-07.ll b/test/CodeGen/SystemZ/vec-move-07.ll new file mode 100644 index 00000000000..a688b089b97 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-07.ll @@ -0,0 +1,39 @@ +; Test scalar_to_vector expansion. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8. 
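+; (scalar_to_vector defines only element 0 and leaves the remaining lanes
+; undefined, so a single VLVG insert into an arbitrary register suffices;
+; no zeroing of the other lanes is required.)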
+define <16 x i8> @f1(i8 %val) { +; CHECK-LABEL: f1: +; CHECK: vlvgb %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <16 x i8> undef, i8 %val, i32 0 + ret <16 x i8> %ret +} + +; Test v8i16. +define <8 x i16> @f2(i16 %val) { +; CHECK-LABEL: f2: +; CHECK: vlvgh %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <8 x i16> undef, i16 %val, i32 0 + ret <8 x i16> %ret +} + +; Test v4i32. +define <4 x i32> @f3(i32 %val) { +; CHECK-LABEL: f3: +; CHECK: vlvgf %v24, %r2, 0 +; CHECK: br %r14 + %ret = insertelement <4 x i32> undef, i32 %val, i32 0 + ret <4 x i32> %ret +} + +; Test v2i64. Here we load %val into both halves. +define <2 x i64> @f4(i64 %val) { +; CHECK-LABEL: f4: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK: br %r14 + %ret = insertelement <2 x i64> undef, i64 %val, i32 0 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-08.ll b/test/CodeGen/SystemZ/vec-move-08.ll new file mode 100644 index 00000000000..94a3b3aefba --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-08.ll @@ -0,0 +1,284 @@ +; Test vector insertion of memory values. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into the first element. +define <16 x i8> @f1(<16 x i8> %val, i8 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vleb %v24, 0(%r2), 0 +; CHECK: br %r14 + %element = load i8, i8 *%ptr + %ret = insertelement <16 x i8> %val, i8 %element, i32 0 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into the last element. +define <16 x i8> @f2(<16 x i8> %val, i8 *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vleb %v24, 0(%r2), 15 +; CHECK: br %r14 + %element = load i8, i8 *%ptr + %ret = insertelement <16 x i8> %val, i8 %element, i32 15 + ret <16 x i8> %ret +} + +; Test v16i8 insertion with the highest in-range offset. +define <16 x i8> @f3(<16 x i8> %val, i8 *%base) { +; CHECK-LABEL: f3: +; CHECK: vleb %v24, 4095(%r2), 10 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i32 4095 + %element = load i8, i8 *%ptr + %ret = insertelement <16 x i8> %val, i8 %element, i32 10 + ret <16 x i8> %ret +} + +; Test v16i8 insertion with the first ouf-of-range offset. +define <16 x i8> @f4(<16 x i8> %val, i8 *%base) { +; CHECK-LABEL: f4: +; CHECK: aghi %r2, 4096 +; CHECK: vleb %v24, 0(%r2), 5 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i32 4096 + %element = load i8, i8 *%ptr + %ret = insertelement <16 x i8> %val, i8 %element, i32 5 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into a variable element. +define <16 x i8> @f5(<16 x i8> %val, i8 *%ptr, i32 %index) { +; CHECK-LABEL: f5: +; CHECK-NOT: vleb +; CHECK: br %r14 + %element = load i8, i8 *%ptr + %ret = insertelement <16 x i8> %val, i8 %element, i32 %index + ret <16 x i8> %ret +} + +; Test v8i16 insertion into the first element. +define <8 x i16> @f6(<8 x i16> %val, i16 *%ptr) { +; CHECK-LABEL: f6: +; CHECK: vleh %v24, 0(%r2), 0 +; CHECK: br %r14 + %element = load i16, i16 *%ptr + %ret = insertelement <8 x i16> %val, i16 %element, i32 0 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into the last element. +define <8 x i16> @f7(<8 x i16> %val, i16 *%ptr) { +; CHECK-LABEL: f7: +; CHECK: vleh %v24, 0(%r2), 7 +; CHECK: br %r14 + %element = load i16, i16 *%ptr + %ret = insertelement <8 x i16> %val, i16 %element, i32 7 + ret <8 x i16> %ret +} + +; Test v8i16 insertion with the highest in-range offset. 
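+; (For halfwords the highest naturally aligned displacement is 4094, i.e.
+; element offset 2047 * 2; as with VLEB, anything past 4095 requires
+; separate address arithmetic.)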
+define <8 x i16> @f8(<8 x i16> %val, i16 *%base) { +; CHECK-LABEL: f8: +; CHECK: vleh %v24, 4094(%r2), 5 +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%base, i32 2047 + %element = load i16, i16 *%ptr + %ret = insertelement <8 x i16> %val, i16 %element, i32 5 + ret <8 x i16> %ret +} + +; Test v8i16 insertion with the first ouf-of-range offset. +define <8 x i16> @f9(<8 x i16> %val, i16 *%base) { +; CHECK-LABEL: f9: +; CHECK: aghi %r2, 4096 +; CHECK: vleh %v24, 0(%r2), 1 +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%base, i32 2048 + %element = load i16, i16 *%ptr + %ret = insertelement <8 x i16> %val, i16 %element, i32 1 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into a variable element. +define <8 x i16> @f10(<8 x i16> %val, i16 *%ptr, i32 %index) { +; CHECK-LABEL: f10: +; CHECK-NOT: vleh +; CHECK: br %r14 + %element = load i16, i16 *%ptr + %ret = insertelement <8 x i16> %val, i16 %element, i32 %index + ret <8 x i16> %ret +} + +; Test v4i32 insertion into the first element. +define <4 x i32> @f11(<4 x i32> %val, i32 *%ptr) { +; CHECK-LABEL: f11: +; CHECK: vlef %v24, 0(%r2), 0 +; CHECK: br %r14 + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 0 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into the last element. +define <4 x i32> @f12(<4 x i32> %val, i32 *%ptr) { +; CHECK-LABEL: f12: +; CHECK: vlef %v24, 0(%r2), 3 +; CHECK: br %r14 + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 3 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the highest in-range offset. +define <4 x i32> @f13(<4 x i32> %val, i32 *%base) { +; CHECK-LABEL: f13: +; CHECK: vlef %v24, 4092(%r2), 2 +; CHECK: br %r14 + %ptr = getelementptr i32, i32 *%base, i32 1023 + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the first ouf-of-range offset. +define <4 x i32> @f14(<4 x i32> %val, i32 *%base) { +; CHECK-LABEL: f14: +; CHECK: aghi %r2, 4096 +; CHECK: vlef %v24, 0(%r2), 1 +; CHECK: br %r14 + %ptr = getelementptr i32, i32 *%base, i32 1024 + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into a variable element. +define <4 x i32> @f15(<4 x i32> %val, i32 *%ptr, i32 %index) { +; CHECK-LABEL: f15: +; CHECK-NOT: vlef +; CHECK: br %r14 + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 %index + ret <4 x i32> %ret +} + +; Test v2i64 insertion into the first element. +define <2 x i64> @f16(<2 x i64> %val, i64 *%ptr) { +; CHECK-LABEL: f16: +; CHECK: vleg %v24, 0(%r2), 0 +; CHECK: br %r14 + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 0 + ret <2 x i64> %ret +} + +; Test v2i64 insertion into the last element. +define <2 x i64> @f17(<2 x i64> %val, i64 *%ptr) { +; CHECK-LABEL: f17: +; CHECK: vleg %v24, 0(%r2), 1 +; CHECK: br %r14 + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 1 + ret <2 x i64> %ret +} + +; Test v2i64 insertion with the highest in-range offset. +define <2 x i64> @f18(<2 x i64> %val, i64 *%base) { +; CHECK-LABEL: f18: +; CHECK: vleg %v24, 4088(%r2), 1 +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%base, i32 511 + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 1 + ret <2 x i64> %ret +} + +; Test v2i64 insertion with the first ouf-of-range offset. 
+define <2 x i64> @f19(<2 x i64> %val, i64 *%base) { +; CHECK-LABEL: f19: +; CHECK: aghi %r2, 4096 +; CHECK: vleg %v24, 0(%r2), 0 +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%base, i32 512 + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 0 + ret <2 x i64> %ret +} + +; Test v2i64 insertion into a variable element. +define <2 x i64> @f20(<2 x i64> %val, i64 *%ptr, i32 %index) { +; CHECK-LABEL: f20: +; CHECK-NOT: vleg +; CHECK: br %r14 + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 %index + ret <2 x i64> %ret +} + +; Test a v4i32 gather of the first element. +define <4 x i32> @f31(<4 x i32> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f31: +; CHECK: vgef %v24, 0(%v26,%r2), 0 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 0 + %ext = zext i32 %elem to i64 + %add = add i64 %base, %ext + %ptr = inttoptr i64 %add to i32 * + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 0 + ret <4 x i32> %ret +} + +; Test a v4i32 gather of the last element. +define <4 x i32> @f32(<4 x i32> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f32: +; CHECK: vgef %v24, 0(%v26,%r2), 3 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 3 + %ext = zext i32 %elem to i64 + %add = add i64 %base, %ext + %ptr = inttoptr i64 %add to i32 * + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 3 + ret <4 x i32> %ret +} + +; Test a v4i32 gather with the highest in-range offset. +define <4 x i32> @f33(<4 x i32> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f33: +; CHECK: vgef %v24, 4095(%v26,%r2), 1 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 1 + %ext = zext i32 %elem to i64 + %add1 = add i64 %base, %ext + %add2 = add i64 %add1, 4095 + %ptr = inttoptr i64 %add2 to i32 * + %element = load i32, i32 *%ptr + %ret = insertelement <4 x i32> %val, i32 %element, i32 1 + ret <4 x i32> %ret +} + +; Test a v2i64 gather of the first element. +define <2 x i64> @f34(<2 x i64> %val, <2 x i64> %index, i64 %base) { +; CHECK-LABEL: f34: +; CHECK: vgeg %v24, 0(%v26,%r2), 0 +; CHECK: br %r14 + %elem = extractelement <2 x i64> %index, i32 0 + %add = add i64 %base, %elem + %ptr = inttoptr i64 %add to i64 * + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 0 + ret <2 x i64> %ret +} + +; Test a v2i64 gather of the last element. +define <2 x i64> @f35(<2 x i64> %val, <2 x i64> %index, i64 %base) { +; CHECK-LABEL: f35: +; CHECK: vgeg %v24, 0(%v26,%r2), 1 +; CHECK: br %r14 + %elem = extractelement <2 x i64> %index, i32 1 + %add = add i64 %base, %elem + %ptr = inttoptr i64 %add to i64 * + %element = load i64, i64 *%ptr + %ret = insertelement <2 x i64> %val, i64 %element, i32 1 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-09.ll b/test/CodeGen/SystemZ/vec-move-09.ll new file mode 100644 index 00000000000..7863e4305f9 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-09.ll @@ -0,0 +1,237 @@ +; Test vector insertion of constants. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into the first element. +define <16 x i8> @f1(<16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: vleib %v24, 0, 0 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 0, i32 0 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into the last element. 
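+; (VLEIB and the other VLEI forms take a 16-bit signed immediate that is
+; truncated to the element size, which is why unsigned constants such as
+; 128 and 255 appear below in their signed forms, -128 and -1.)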
+define <16 x i8> @f2(<16 x i8> %val) { +; CHECK-LABEL: f2: +; CHECK: vleib %v24, 100, 15 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 100, i32 15 + ret <16 x i8> %ret +} + +; Test v16i8 insertion with the maximum signed value. +define <16 x i8> @f3(<16 x i8> %val) { +; CHECK-LABEL: f3: +; CHECK: vleib %v24, 127, 10 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 127, i32 10 + ret <16 x i8> %ret +} + +; Test v16i8 insertion with the minimum signed value. +define <16 x i8> @f4(<16 x i8> %val) { +; CHECK-LABEL: f4: +; CHECK: vleib %v24, -128, 11 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 128, i32 11 + ret <16 x i8> %ret +} + +; Test v16i8 insertion with the maximum unsigned value. +define <16 x i8> @f5(<16 x i8> %val) { +; CHECK-LABEL: f5: +; CHECK: vleib %v24, -1, 12 +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 255, i32 12 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into a variable element. +define <16 x i8> @f6(<16 x i8> %val, i32 %index) { +; CHECK-LABEL: f6: +; CHECK-NOT: vleib +; CHECK: br %r14 + %ret = insertelement <16 x i8> %val, i8 0, i32 %index + ret <16 x i8> %ret +} + +; Test v8i16 insertion into the first element. +define <8 x i16> @f7(<8 x i16> %val) { +; CHECK-LABEL: f7: +; CHECK: vleih %v24, 0, 0 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 0, i32 0 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into the last element. +define <8 x i16> @f8(<8 x i16> %val) { +; CHECK-LABEL: f8: +; CHECK: vleih %v24, 0, 7 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 0, i32 7 + ret <8 x i16> %ret +} + +; Test v8i16 insertion with the maximum signed value. +define <8 x i16> @f9(<8 x i16> %val) { +; CHECK-LABEL: f9: +; CHECK: vleih %v24, 32767, 4 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 32767, i32 4 + ret <8 x i16> %ret +} + +; Test v8i16 insertion with the minimum signed value. +define <8 x i16> @f10(<8 x i16> %val) { +; CHECK-LABEL: f10: +; CHECK: vleih %v24, -32768, 5 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 32768, i32 5 + ret <8 x i16> %ret +} + +; Test v8i16 insertion with the maximum unsigned value. +define <8 x i16> @f11(<8 x i16> %val) { +; CHECK-LABEL: f11: +; CHECK: vleih %v24, -1, 6 +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 65535, i32 6 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into a variable element. +define <8 x i16> @f12(<8 x i16> %val, i32 %index) { +; CHECK-LABEL: f12: +; CHECK-NOT: vleih +; CHECK: br %r14 + %ret = insertelement <8 x i16> %val, i16 0, i32 %index + ret <8 x i16> %ret +} + +; Test v4i32 insertion into the first element. +define <4 x i32> @f13(<4 x i32> %val) { +; CHECK-LABEL: f13: +; CHECK: vleif %v24, 0, 0 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 0, i32 0 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into the last element. +define <4 x i32> @f14(<4 x i32> %val) { +; CHECK-LABEL: f14: +; CHECK: vleif %v24, 0, 3 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 0, i32 3 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the maximum value allowed by VLEIF. +define <4 x i32> @f15(<4 x i32> %val) { +; CHECK-LABEL: f15: +; CHECK: vleif %v24, 32767, 1 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 32767, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the next value up. 
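+; (32768 no longer fits the signed 16-bit immediate field, so VLEIF cannot
+; be used here and the CHECK-NOT verifies that it is not emitted.)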
+define <4 x i32> @f16(<4 x i32> %val) { +; CHECK-LABEL: f16: +; CHECK-NOT: vleif +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 32768, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the minimum value allowed by VLEIF. +define <4 x i32> @f17(<4 x i32> %val) { +; CHECK-LABEL: f17: +; CHECK: vleif %v24, -32768, 2 +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 -32768, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion with the next value down. +define <4 x i32> @f18(<4 x i32> %val) { +; CHECK-LABEL: f18: +; CHECK-NOT: vleif +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 -32769, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into a variable element. +define <4 x i32> @f19(<4 x i32> %val, i32 %index) { +; CHECK-LABEL: f19: +; CHECK-NOT: vleif +; CHECK: br %r14 + %ret = insertelement <4 x i32> %val, i32 0, i32 %index + ret <4 x i32> %ret +} + +; Test v2i64 insertion into the first element. +define <2 x i64> @f20(<2 x i64> %val) { +; CHECK-LABEL: f20: +; CHECK: vleig %v24, 0, 0 +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 0, i32 0 + ret <2 x i64> %ret +} + +; Test v2i64 insertion into the last element. +define <2 x i64> @f21(<2 x i64> %val) { +; CHECK-LABEL: f21: +; CHECK: vleig %v24, 0, 1 +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 0, i32 1 + ret <2 x i64> %ret +} + +; Test v2i64 insertion with the maximum value allowed by VLEIG. +define <2 x i64> @f22(<2 x i64> %val) { +; CHECK-LABEL: f22: +; CHECK: vleig %v24, 32767, 1 +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 32767, i32 1 + ret <2 x i64> %ret +} + +; Test v2i64 insertion with the next value up. +define <2 x i64> @f23(<2 x i64> %val) { +; CHECK-LABEL: f23: +; CHECK-NOT: vleig +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 32768, i32 1 + ret <2 x i64> %ret +} + +; Test v2i64 insertion with the minimum value allowed by VLEIG. +define <2 x i64> @f24(<2 x i64> %val) { +; CHECK-LABEL: f24: +; CHECK: vleig %v24, -32768, 0 +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 -32768, i32 0 + ret <2 x i64> %ret +} + +; Test v2i64 insertion with the next value down. +define <2 x i64> @f25(<2 x i64> %val) { +; CHECK-LABEL: f25: +; CHECK-NOT: vleig +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 -32769, i32 0 + ret <2 x i64> %ret +} + +; Test v2i64 insertion into a variable element. +define <2 x i64> @f26(<2 x i64> %val, i32 %index) { +; CHECK-LABEL: f26: +; CHECK-NOT: vleig +; CHECK: br %r14 + %ret = insertelement <2 x i64> %val, i64 0, i32 %index + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-10.ll b/test/CodeGen/SystemZ/vec-move-10.ll new file mode 100644 index 00000000000..852a4a7c4ed --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-10.ll @@ -0,0 +1,328 @@ +; Test vector extraction to memory. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 extraction from the first element. +define void @f1(<16 x i8> %val, i8 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vsteb %v24, 0(%r2), 0 +; CHECK: br %r14 + %element = extractelement <16 x i8> %val, i32 0 + store i8 %element, i8 *%ptr + ret void +} + +; Test v16i8 extraction from the last element. +define void @f2(<16 x i8> %val, i8 *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vsteb %v24, 0(%r2), 15 +; CHECK: br %r14 + %element = extractelement <16 x i8> %val, i32 15 + store i8 %element, i8 *%ptr + ret void +} + +; Test v16i8 extraction of an invalid element. This must compile, +; but we don't care what it does. 
+define void @f3(<16 x i8> %val, i8 *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK-NOT: vsteb %v24, 0(%r2), 16
+; CHECK: br %r14
+  %element = extractelement <16 x i8> %val, i32 16
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v16i8 extraction with the highest in-range offset.
+define void @f4(<16 x i8> %val, i8 *%base) {
+; CHECK-LABEL: f4:
+; CHECK: vsteb %v24, 4095(%r2), 10
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i32 4095
+  %element = extractelement <16 x i8> %val, i32 10
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v16i8 extraction with the first out-of-range offset.
+define void @f5(<16 x i8> %val, i8 *%base) {
+; CHECK-LABEL: f5:
+; CHECK: aghi %r2, 4096
+; CHECK: vsteb %v24, 0(%r2), 5
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i32 4096
+  %element = extractelement <16 x i8> %val, i32 5
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v16i8 extraction from a variable element.
+define void @f6(<16 x i8> %val, i8 *%ptr, i32 %index) {
+; CHECK-LABEL: f6:
+; CHECK-NOT: vsteb
+; CHECK: br %r14
+  %element = extractelement <16 x i8> %val, i32 %index
+  store i8 %element, i8 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction from the first element.
+define void @f7(<8 x i16> %val, i16 *%ptr) {
+; CHECK-LABEL: f7:
+; CHECK: vsteh %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <8 x i16> %val, i32 0
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction from the last element.
+define void @f8(<8 x i16> %val, i16 *%ptr) {
+; CHECK-LABEL: f8:
+; CHECK: vsteh %v24, 0(%r2), 7
+; CHECK: br %r14
+  %element = extractelement <8 x i16> %val, i32 7
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction of an invalid element.  This must compile,
+; but we don't care what it does.
+define void @f9(<8 x i16> %val, i16 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK-NOT: vsteh %v24, 0(%r2), 8
+; CHECK: br %r14
+  %element = extractelement <8 x i16> %val, i32 8
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction with the highest in-range offset.
+define void @f10(<8 x i16> %val, i16 *%base) {
+; CHECK-LABEL: f10:
+; CHECK: vsteh %v24, 4094(%r2), 5
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%base, i32 2047
+  %element = extractelement <8 x i16> %val, i32 5
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction with the first out-of-range offset.
+define void @f11(<8 x i16> %val, i16 *%base) {
+; CHECK-LABEL: f11:
+; CHECK: aghi %r2, 4096
+; CHECK: vsteh %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%base, i32 2048
+  %element = extractelement <8 x i16> %val, i32 1
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v8i16 extraction from a variable element.
+define void @f12(<8 x i16> %val, i16 *%ptr, i32 %index) {
+; CHECK-LABEL: f12:
+; CHECK-NOT: vsteh
+; CHECK: br %r14
+  %element = extractelement <8 x i16> %val, i32 %index
+  store i16 %element, i16 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction from the first element.
+define void @f13(<4 x i32> %val, i32 *%ptr) {
+; CHECK-LABEL: f13:
+; CHECK: vstef %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <4 x i32> %val, i32 0
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction from the last element.
+define void @f14(<4 x i32> %val, i32 *%ptr) {
+; CHECK-LABEL: f14:
+; CHECK: vstef %v24, 0(%r2), 3
+; CHECK: br %r14
+  %element = extractelement <4 x i32> %val, i32 3
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction of an invalid element.  This must compile,
+; but we don't care what it does.
+define void @f15(<4 x i32> %val, i32 *%ptr) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: vstef %v24, 0(%r2), 4
+; CHECK: br %r14
+  %element = extractelement <4 x i32> %val, i32 4
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction with the highest in-range offset.
+define void @f16(<4 x i32> %val, i32 *%base) {
+; CHECK-LABEL: f16:
+; CHECK: vstef %v24, 4092(%r2), 2
+; CHECK: br %r14
+  %ptr = getelementptr i32, i32 *%base, i32 1023
+  %element = extractelement <4 x i32> %val, i32 2
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction with the first out-of-range offset.
+define void @f17(<4 x i32> %val, i32 *%base) {
+; CHECK-LABEL: f17:
+; CHECK: aghi %r2, 4096
+; CHECK: vstef %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i32, i32 *%base, i32 1024
+  %element = extractelement <4 x i32> %val, i32 1
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v4i32 extraction from a variable element.
+define void @f18(<4 x i32> %val, i32 *%ptr, i32 %index) {
+; CHECK-LABEL: f18:
+; CHECK-NOT: vstef
+; CHECK: br %r14
+  %element = extractelement <4 x i32> %val, i32 %index
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction from the first element.
+define void @f19(<2 x i64> %val, i64 *%ptr) {
+; CHECK-LABEL: f19:
+; CHECK: vsteg %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <2 x i64> %val, i32 0
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction from the last element.
+define void @f20(<2 x i64> %val, i64 *%ptr) {
+; CHECK-LABEL: f20:
+; CHECK: vsteg %v24, 0(%r2), 1
+; CHECK: br %r14
+  %element = extractelement <2 x i64> %val, i32 1
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction of an invalid element.  This must compile,
+; but we don't care what it does.
+define void @f21(<2 x i64> %val, i64 *%ptr) {
+; CHECK-LABEL: f21:
+; CHECK-NOT: vsteg %v24, 0(%r2), 2
+; CHECK: br %r14
+  %element = extractelement <2 x i64> %val, i32 2
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction with the highest in-range offset.
+define void @f22(<2 x i64> %val, i64 *%base) {
+; CHECK-LABEL: f22:
+; CHECK: vsteg %v24, 4088(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i32 511
+  %element = extractelement <2 x i64> %val, i32 1
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction with the first out-of-range offset.
+define void @f23(<2 x i64> %val, i64 *%base) {
+; CHECK-LABEL: f23:
+; CHECK: aghi %r2, 4096
+; CHECK: vsteg %v24, 0(%r2), 0
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i32 512
+  %element = extractelement <2 x i64> %val, i32 0
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test v2i64 extraction from a variable element.
+define void @f24(<2 x i64> %val, i64 *%ptr, i32 %index) {
+; CHECK-LABEL: f24:
+; CHECK-NOT: vsteg
+; CHECK: br %r14
+  %element = extractelement <2 x i64> %val, i32 %index
+  store i64 %element, i64 *%ptr
+  ret void
+}
+
+; Test a v4i32 scatter of the first element.
+define void @f37(<4 x i32> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f37:
+; CHECK: vscef %v24, 0(%v26,%r2), 0
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 0
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to i32 *
+  %element = extractelement <4 x i32> %val, i32 0
+  store i32 %element, i32 *%ptr
+  ret void
+}
+
+; Test a v4i32 scatter of the last element.
+define void @f38(<4 x i32> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f38: +; CHECK: vscef %v24, 0(%v26,%r2), 3 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 3 + %ext = zext i32 %elem to i64 + %add = add i64 %base, %ext + %ptr = inttoptr i64 %add to i32 * + %element = extractelement <4 x i32> %val, i32 3 + store i32 %element, i32 *%ptr + ret void +} + +; Test a v4i32 scatter with the highest in-range offset. +define void @f39(<4 x i32> %val, <4 x i32> %index, i64 %base) { +; CHECK-LABEL: f39: +; CHECK: vscef %v24, 4095(%v26,%r2), 1 +; CHECK: br %r14 + %elem = extractelement <4 x i32> %index, i32 1 + %ext = zext i32 %elem to i64 + %add1 = add i64 %base, %ext + %add2 = add i64 %add1, 4095 + %ptr = inttoptr i64 %add2 to i32 * + %element = extractelement <4 x i32> %val, i32 1 + store i32 %element, i32 *%ptr + ret void +} + +; Test a v2i64 scatter of the first element. +define void @f40(<2 x i64> %val, <2 x i64> %index, i64 %base) { +; CHECK-LABEL: f40: +; CHECK: vsceg %v24, 0(%v26,%r2), 0 +; CHECK: br %r14 + %elem = extractelement <2 x i64> %index, i32 0 + %add = add i64 %base, %elem + %ptr = inttoptr i64 %add to i64 * + %element = extractelement <2 x i64> %val, i32 0 + store i64 %element, i64 *%ptr + ret void +} + +; Test a v2i64 scatter of the last element. +define void @f41(<2 x i64> %val, <2 x i64> %index, i64 %base) { +; CHECK-LABEL: f41: +; CHECK: vsceg %v24, 0(%v26,%r2), 1 +; CHECK: br %r14 + %elem = extractelement <2 x i64> %index, i32 1 + %add = add i64 %base, %elem + %ptr = inttoptr i64 %add to i64 * + %element = extractelement <2 x i64> %val, i32 1 + store i64 %element, i64 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/vec-move-11.ll b/test/CodeGen/SystemZ/vec-move-11.ll new file mode 100644 index 00000000000..45bc91b169b --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-11.ll @@ -0,0 +1,93 @@ +; Test insertions of register values into a nonzero index of an undef. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into an undef, with an arbitrary index. +define <16 x i8> @f1(i8 %val) { +; CHECK-LABEL: f1: +; CHECK: vlvgb %v24, %r2, 12 +; CHECK-NEXT: br %r14 + %ret = insertelement <16 x i8> undef, i8 %val, i32 12 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into an undef, with the first good index for VLVGP. +define <16 x i8> @f2(i8 %val) { +; CHECK-LABEL: f2: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <16 x i8> undef, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into an undef, with the second good index for VLVGP. +define <16 x i8> @f3(i8 %val) { +; CHECK-LABEL: f3: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <16 x i8> undef, i8 %val, i32 15 + ret <16 x i8> %ret +} + +; Test v8i16 insertion into an undef, with an arbitrary index. +define <8 x i16> @f4(i16 %val) { +; CHECK-LABEL: f4: +; CHECK: vlvgh %v24, %r2, 5 +; CHECK-NEXT: br %r14 + %ret = insertelement <8 x i16> undef, i16 %val, i32 5 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into an undef, with the first good index for VLVGP. +define <8 x i16> @f5(i16 %val) { +; CHECK-LABEL: f5: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <8 x i16> undef, i16 %val, i32 3 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into an undef, with the second good index for VLVGP. 
+define <8 x i16> @f6(i16 %val) { +; CHECK-LABEL: f6: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <8 x i16> undef, i16 %val, i32 7 + ret <8 x i16> %ret +} + +; Test v4i32 insertion into an undef, with an arbitrary index. +define <4 x i32> @f7(i32 %val) { +; CHECK-LABEL: f7: +; CHECK: vlvgf %v24, %r2, 2 +; CHECK-NEXT: br %r14 + %ret = insertelement <4 x i32> undef, i32 %val, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into an undef, with the first good index for VLVGP. +define <4 x i32> @f8(i32 %val) { +; CHECK-LABEL: f8: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <4 x i32> undef, i32 %val, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into an undef, with the second good index for VLVGP. +define <4 x i32> @f9(i32 %val) { +; CHECK-LABEL: f9: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <4 x i32> undef, i32 %val, i32 3 + ret <4 x i32> %ret +} + +; Test v2i64 insertion into an undef. +define <2 x i64> @f10(i64 %val) { +; CHECK-LABEL: f10: +; CHECK: vlvgp %v24, %r2, %r2 +; CHECK-NEXT: br %r14 + %ret = insertelement <2 x i64> undef, i64 %val, i32 1 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-12.ll b/test/CodeGen/SystemZ/vec-move-12.ll new file mode 100644 index 00000000000..1fecab688e7 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-12.ll @@ -0,0 +1,103 @@ +; Test insertions of memory values into a nonzero index of an undef. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into an undef, with an arbitrary index. +define <16 x i8> @f1(i8 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vlrepb %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> undef, i8 %val, i32 12 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into an undef, with the first good index for VLVGP. +define <16 x i8> @f2(i8 *%ptr) { +; CHECK-LABEL: f2: +; CHECK: {{vlrepb|vllezb}} %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> undef, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test v16i8 insertion into an undef, with the second good index for VLVGP. +define <16 x i8> @f3(i8 *%ptr) { +; CHECK-LABEL: f3: +; CHECK: vlrepb %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> undef, i8 %val, i32 15 + ret <16 x i8> %ret +} + +; Test v8i16 insertion into an undef, with an arbitrary index. +define <8 x i16> @f4(i16 *%ptr) { +; CHECK-LABEL: f4: +; CHECK: vlreph %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i16, i16 *%ptr + %ret = insertelement <8 x i16> undef, i16 %val, i32 5 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into an undef, with the first good index for VLVGP. +define <8 x i16> @f5(i16 *%ptr) { +; CHECK-LABEL: f5: +; CHECK: {{vlreph|vllezh}} %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i16, i16 *%ptr + %ret = insertelement <8 x i16> undef, i16 %val, i32 3 + ret <8 x i16> %ret +} + +; Test v8i16 insertion into an undef, with the second good index for VLVGP. +define <8 x i16> @f6(i16 *%ptr) { +; CHECK-LABEL: f6: +; CHECK: vlreph %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i16, i16 *%ptr + %ret = insertelement <8 x i16> undef, i16 %val, i32 7 + ret <8 x i16> %ret +} + +; Test v4i32 insertion into an undef, with an arbitrary index. 
+define <4 x i32> @f7(i32 *%ptr) { +; CHECK-LABEL: f7: +; CHECK: vlrepf %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> undef, i32 %val, i32 2 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into an undef, with the first good index for VLVGP. +define <4 x i32> @f8(i32 *%ptr) { +; CHECK-LABEL: f8: +; CHECK: {{vlrepf|vllezf}} %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> undef, i32 %val, i32 1 + ret <4 x i32> %ret +} + +; Test v4i32 insertion into an undef, with the second good index for VLVGP. +define <4 x i32> @f9(i32 *%ptr) { +; CHECK-LABEL: f9: +; CHECK: vlrepf %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> undef, i32 %val, i32 3 + ret <4 x i32> %ret +} + +; Test v2i64 insertion into an undef. +define <2 x i64> @f10(i64 *%ptr) { +; CHECK-LABEL: f10: +; CHECK: vlrepg %v24, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load i64, i64 *%ptr + %ret = insertelement <2 x i64> undef, i64 %val, i32 1 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-13.ll b/test/CodeGen/SystemZ/vec-move-13.ll new file mode 100644 index 00000000000..e103affa4b1 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-13.ll @@ -0,0 +1,47 @@ +; Test insertions of register values into 0. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test v16i8 insertion into 0. +define <16 x i8> @f1(i8 %val1, i8 %val2) { +; CHECK-LABEL: f1: +; CHECK: vgbm %v24, 0 +; CHECK-DAG: vlvgb %v24, %r2, 2 +; CHECK-DAG: vlvgb %v24, %r3, 12 +; CHECK: br %r14 + %vec1 = insertelement <16 x i8> zeroinitializer, i8 %val1, i32 2 + %vec2 = insertelement <16 x i8> %vec1, i8 %val2, i32 12 + ret <16 x i8> %vec2 +} + +; Test v8i16 insertion into 0. +define <8 x i16> @f2(i16 %val1, i16 %val2) { +; CHECK-LABEL: f2: +; CHECK: vgbm %v24, 0 +; CHECK-DAG: vlvgh %v24, %r2, 3 +; CHECK-DAG: vlvgh %v24, %r3, 5 +; CHECK: br %r14 + %vec1 = insertelement <8 x i16> zeroinitializer, i16 %val1, i32 3 + %vec2 = insertelement <8 x i16> %vec1, i16 %val2, i32 5 + ret <8 x i16> %vec2 +} + +; Test v4i32 insertion into 0. +define <4 x i32> @f3(i32 %val) { +; CHECK-LABEL: f3: +; CHECK: vgbm %v24, 0 +; CHECK: vlvgf %v24, %r2, 3 +; CHECK: br %r14 + %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 3 + ret <4 x i32> %ret +} + +; Test v2i64 insertion into 0. +define <2 x i64> @f4(i64 %val) { +; CHECK-LABEL: f4: +; CHECK: lghi [[REG:%r[0-5]]], 0 +; CHECK: vlvgp %v24, [[REG]], %r2 +; CHECK: br %r14 + %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 1 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-move-14.ll b/test/CodeGen/SystemZ/vec-move-14.ll new file mode 100644 index 00000000000..f0c60e7d366 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-14.ll @@ -0,0 +1,76 @@ +; Test insertions of memory values into 0. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test VLLEZB. +define <16 x i8> @f1(i8 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vllezb %v24, 0(%r2) +; CHECK: br %r14 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test VLLEZB with the highest in-range offset. 
+define <16 x i8> @f2(i8 *%base) { +; CHECK-LABEL: f2: +; CHECK: vllezb %v24, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test VLLEZB with the next highest offset. +define <16 x i8> @f3(i8 *%base) { +; CHECK-LABEL: f3: +; CHECK-NOT: vllezb %v24, 4096(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test that VLLEZB allows an index. +define <16 x i8> @f4(i8 *%base, i64 %index) { +; CHECK-LABEL: f4: +; CHECK: vllezb %v24, 0({{%r2,%r3|%r3,%r2}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + %val = load i8, i8 *%ptr + %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 + ret <16 x i8> %ret +} + +; Test VLLEZH. +define <8 x i16> @f5(i16 *%ptr) { +; CHECK-LABEL: f5: +; CHECK: vllezh %v24, 0(%r2) +; CHECK: br %r14 + %val = load i16, i16 *%ptr + %ret = insertelement <8 x i16> zeroinitializer, i16 %val, i32 3 + ret <8 x i16> %ret +} + +; Test VLLEZF. +define <4 x i32> @f6(i32 *%ptr) { +; CHECK-LABEL: f6: +; CHECK: vllezf %v24, 0(%r2) +; CHECK: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1 + ret <4 x i32> %ret +} + +; Test VLLEZG. +define <2 x i64> @f7(i64 *%ptr) { +; CHECK-LABEL: f7: +; CHECK: vllezg %v24, 0(%r2) +; CHECK: br %r14 + %val = load i64, i64 *%ptr + %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 0 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-mul-01.ll b/test/CodeGen/SystemZ/vec-mul-01.ll new file mode 100644 index 00000000000..209582f5893 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-mul-01.ll @@ -0,0 +1,39 @@ +; Test vector multiplication. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 multiplication. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vmlb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = mul <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 multiplication. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vmlhw %v24, %v26, %v28 +; CHECK: br %r14 + %ret = mul <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 multiplication. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vmlf %v24, %v26, %v28 +; CHECK: br %r14 + %ret = mul <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 multiplication. There's no vector equivalent. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK-NOT: vmlg +; CHECK: br %r14 + %ret = mul <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-mul-02.ll b/test/CodeGen/SystemZ/vec-mul-02.ll new file mode 100644 index 00000000000..7323330919a --- /dev/null +++ b/test/CodeGen/SystemZ/vec-mul-02.ll @@ -0,0 +1,36 @@ +; Test vector multiply-and-add. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 multiply-and-add. 
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2, + <16 x i8> %val3) { +; CHECK-LABEL: f1: +; CHECK: vmalb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %mul = mul <16 x i8> %val1, %val2 + %ret = add <16 x i8> %mul, %val3 + ret <16 x i8> %ret +} + +; Test a v8i16 multiply-and-add. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2, + <8 x i16> %val3) { +; CHECK-LABEL: f2: +; CHECK: vmalhw %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %mul = mul <8 x i16> %val1, %val2 + %ret = add <8 x i16> %mul, %val3 + ret <8 x i16> %ret +} + +; Test a v4i32 multiply-and-add. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2, + <4 x i32> %val3) { +; CHECK-LABEL: f3: +; CHECK: vmalf %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %mul = mul <4 x i32> %val1, %val2 + %ret = add <4 x i32> %mul, %val3 + ret <4 x i32> %ret +} diff --git a/test/CodeGen/SystemZ/vec-neg-01.ll b/test/CodeGen/SystemZ/vec-neg-01.ll new file mode 100644 index 00000000000..357648ba4d3 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-neg-01.ll @@ -0,0 +1,39 @@ +; Test vector negation. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 negation. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val) { +; CHECK-LABEL: f1: +; CHECK: vlcb %v24, %v26 +; CHECK: br %r14 + %ret = sub <16 x i8> zeroinitializer, %val + ret <16 x i8> %ret +} + +; Test a v8i16 negation. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val) { +; CHECK-LABEL: f2: +; CHECK: vlch %v24, %v26 +; CHECK: br %r14 + %ret = sub <8 x i16> zeroinitializer, %val + ret <8 x i16> %ret +} + +; Test a v4i32 negation. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val) { +; CHECK-LABEL: f3: +; CHECK: vlcf %v24, %v26 +; CHECK: br %r14 + %ret = sub <4 x i32> zeroinitializer, %val + ret <4 x i32> %ret +} + +; Test a v2i64 negation. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val) { +; CHECK-LABEL: f4: +; CHECK: vlcg %v24, %v26 +; CHECK: br %r14 + %ret = sub <2 x i64> zeroinitializer, %val + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-or-01.ll b/test/CodeGen/SystemZ/vec-or-01.ll new file mode 100644 index 00000000000..789150ad2d1 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-or-01.ll @@ -0,0 +1,39 @@ +; Test vector OR. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test a v16i8 OR. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vo %v24, %v26, %v28 +; CHECK: br %r14 + %ret = or <16 x i8> %val1, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 OR. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vo %v24, %v26, %v28 +; CHECK: br %r14 + %ret = or <8 x i16> %val1, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 OR. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vo %v24, %v26, %v28 +; CHECK: br %r14 + %ret = or <4 x i32> %val1, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 OR. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vo %v24, %v26, %v28 +; CHECK: br %r14 + %ret = or <2 x i64> %val1, %val2 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-or-02.ll b/test/CodeGen/SystemZ/vec-or-02.ll new file mode 100644 index 00000000000..eeb86e36ff0 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-or-02.ll @@ -0,0 +1,107 @@ +; Test vector (or (and X, Z), (and Y, (not Z))) patterns. 
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) {
+; CHECK-LABEL: f1:
+; CHECK: vsel %v24, %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <16 x i8> %val3, <i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1>
+  %and1 = and <16 x i8> %val1, %val3
+  %and2 = and <16 x i8> %val2, %not
+  %ret = or <16 x i8> %and1, %and2
+  ret <16 x i8> %ret
+}
+
+; ...and again with the XOR applied to the other operand of the AND.
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) {
+; CHECK-LABEL: f2:
+; CHECK: vsel %v24, %v26, %v24, %v28
+; CHECK: br %r14
+  %not = xor <16 x i8> %val3, <i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1,
+                               i8 -1, i8 -1, i8 -1, i8 -1>
+  %and1 = and <16 x i8> %val1, %not
+  %and2 = and <16 x i8> %val2, %val3
+  %ret = or <16 x i8> %and1, %and2
+  ret <16 x i8> %ret
+}
+
+; Test v8i16.
+define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3) {
+; CHECK-LABEL: f3:
+; CHECK: vsel %v24, %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <8 x i16> %val3, <i16 -1, i16 -1, i16 -1, i16 -1,
+                               i16 -1, i16 -1, i16 -1, i16 -1>
+  %and1 = and <8 x i16> %val1, %val3
+  %and2 = and <8 x i16> %val2, %not
+  %ret = or <8 x i16> %and1, %and2
+  ret <8 x i16> %ret
+}
+
+; ...and again with the XOR applied to the other operand of the AND.
+define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3) {
+; CHECK-LABEL: f4:
+; CHECK: vsel %v24, %v26, %v24, %v28
+; CHECK: br %r14
+  %not = xor <8 x i16> %val3, <i16 -1, i16 -1, i16 -1, i16 -1,
+                               i16 -1, i16 -1, i16 -1, i16 -1>
+  %and1 = and <8 x i16> %val1, %not
+  %and2 = and <8 x i16> %val2, %val3
+  %ret = or <8 x i16> %and1, %and2
+  ret <8 x i16> %ret
+}
+
+; Test v4i32.
+define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3) {
+; CHECK-LABEL: f5:
+; CHECK: vsel %v24, %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <4 x i32> %val3, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and1 = and <4 x i32> %val1, %val3
+  %and2 = and <4 x i32> %val2, %not
+  %ret = or <4 x i32> %and1, %and2
+  ret <4 x i32> %ret
+}
+
+; ...and again with the XOR applied to the other operand of the AND.
+define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3) {
+; CHECK-LABEL: f6:
+; CHECK: vsel %v24, %v26, %v24, %v28
+; CHECK: br %r14
+  %not = xor <4 x i32> %val3, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and1 = and <4 x i32> %val1, %not
+  %and2 = and <4 x i32> %val2, %val3
+  %ret = or <4 x i32> %and1, %and2
+  ret <4 x i32> %ret
+}
+
+; Test v2i64.
+define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) {
+; CHECK-LABEL: f7:
+; CHECK: vsel %v24, %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <2 x i64> %val3, <i64 -1, i64 -1>
+  %and1 = and <2 x i64> %val1, %val3
+  %and2 = and <2 x i64> %val2, %not
+  %ret = or <2 x i64> %and1, %and2
+  ret <2 x i64> %ret
+}
+
+; ...and again with the XOR applied to the other operand of the AND.
+define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) {
+; CHECK-LABEL: f8:
+; CHECK: vsel %v24, %v26, %v24, %v28
+; CHECK: br %r14
+  %not = xor <2 x i64> %val3, <i64 -1, i64 -1>
+  %and1 = and <2 x i64> %val1, %not
+  %and2 = and <2 x i64> %val2, %val3
+  %ret = or <2 x i64> %and1, %and2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-01.ll b/test/CodeGen/SystemZ/vec-perm-01.ll
new file mode 100644
index 00000000000..520ff45e7f7
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-01.ll
@@ -0,0 +1,124 @@
+; Test vector splat.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 splat of the first element.
+define <16 x i8> @f1(<16 x i8> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vrepb %v24, %v24, 0
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> zeroinitializer
+  ret <16 x i8> %ret
+}
+
+; Test v16i8 splat of the last element.
+define <16 x i8> @f2(<16 x i8> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vrepb %v24, %v24, 15
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> <i32 15, i32 15, i32 15, i32 15,
+                                   i32 15, i32 15, i32 15, i32 15,
+                                   i32 15, i32 15, i32 15, i32 15,
+                                   i32 15, i32 15, i32 15, i32 15>
+  ret <16 x i8> %ret
+}
+
+; Test v16i8 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <16 x i8> @f3(<16 x i8> %val) {
+; CHECK-LABEL: f3:
+; CHECK: vrepb %v24, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
+                       <16 x i32> <i32 20, i32 20, i32 20, i32 20,
+                                   i32 20, i32 20, i32 20, i32 20,
+                                   i32 20, i32 20, i32 20, i32 20,
+                                   i32 20, i32 20, i32 20, i32 20>
+  ret <16 x i8> %ret
+}
+
+; Test v8i16 splat of the first element.
+define <8 x i16> @f4(<8 x i16> %val) {
+; CHECK-LABEL: f4:
+; CHECK: vreph %v24, %v24, 0
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x i16> %ret
+}
+
+; Test v8i16 splat of the last element.
+define <8 x i16> @f5(<8 x i16> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vreph %v24, %v24, 7
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
+                       <8 x i32> <i32 7, i32 7, i32 7, i32 7,
+                                  i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i16> %ret
+}
+
+; Test v8i16 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <8 x i16> @f6(<8 x i16> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vreph %v24, %v24, 2
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> undef, <8 x i16> %val,
+                       <8 x i32> <i32 10, i32 10, i32 10, i32 10,
+                                  i32 10, i32 10, i32 10, i32 10>
+  ret <8 x i16> %ret
+}
+
+; Test v4i32 splat of the first element.
+define <4 x i32> @f7(<4 x i32> %val) {
+; CHECK-LABEL: f7:
+; CHECK: vrepf %v24, %v24, 0
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+; Test v4i32 splat of the last element.
+define <4 x i32> @f8(<4 x i32> %val) {
+; CHECK-LABEL: f8:
+; CHECK: vrepf %v24, %v24, 3
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %ret
+}
+
+; Test v4i32 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <4 x i32> @f9(<4 x i32> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vrepf %v24, %v24, 1
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> undef, <4 x i32> %val,
+                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  ret <4 x i32> %ret
+}
+
+; Test v2i64 splat of the first element.
+define <2 x i64> @f10(<2 x i64> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vrepg %v24, %v24, 0
+; CHECK: br %r14
+  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
+                       <2 x i32> zeroinitializer
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 splat of the last element.
+define <2 x i64> @f11(<2 x i64> %val) {
+; CHECK-LABEL: f11:
+; CHECK: vrepg %v24, %v24, 1
+; CHECK: br %r14
+  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
+                       <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-02.ll b/test/CodeGen/SystemZ/vec-perm-02.ll
new file mode 100644
index 00000000000..93e4112c0ef
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-02.ll
@@ -0,0 +1,144 @@
+; Test replications of a scalar register value, represented as splats.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test v16i8 splat of the first element.
+define <16 x i8> @f1(i8 %scalar) {
+; CHECK-LABEL: f1:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vrepb %v24, [[REG]], 7
+; CHECK: br %r14
+  %val = insertelement <16 x i8> undef, i8 %scalar, i32 0
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> zeroinitializer
+  ret <16 x i8> %ret
+}
+
+; Test v16i8 splat of the last element.
+define <16 x i8> @f2(i8 %scalar) {
+; CHECK-LABEL: f2:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vrepb %v24, [[REG]], 7
+; CHECK: br %r14
+  %val = insertelement <16 x i8> undef, i8 %scalar, i32 15
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> <i32 15, i32 15, i32 15, i32 15,
+                                   i32 15, i32 15, i32 15, i32 15,
+                                   i32 15, i32 15, i32 15, i32 15,
+                                   i32 15, i32 15, i32 15, i32 15>
+  ret <16 x i8> %ret
+}
+
+; Test v16i8 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <16 x i8> @f3(i8 %scalar) {
+; CHECK-LABEL: f3:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vrepb %v24, [[REG]], 7
+; CHECK: br %r14
+  %val = insertelement <16 x i8> undef, i8 %scalar, i32 4
+  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
+                       <16 x i32> <i32 20, i32 20, i32 20, i32 20,
+                                   i32 20, i32 20, i32 20, i32 20,
+                                   i32 20, i32 20, i32 20, i32 20,
+                                   i32 20, i32 20, i32 20, i32 20>
+  ret <16 x i8> %ret
+}
+
+; Test v8i16 splat of the first element.
+define <8 x i16> @f4(i16 %scalar) {
+; CHECK-LABEL: f4:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vreph %v24, [[REG]], 3
+; CHECK: br %r14
+  %val = insertelement <8 x i16> undef, i16 %scalar, i32 0
+  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x i16> %ret
+}
+
+; Test v8i16 splat of the last element.
+define <8 x i16> @f5(i16 %scalar) {
+; CHECK-LABEL: f5:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vreph %v24, [[REG]], 3
+; CHECK: br %r14
+  %val = insertelement <8 x i16> undef, i16 %scalar, i32 7
+  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
+                       <8 x i32> <i32 7, i32 7, i32 7, i32 7,
+                                  i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i16> %ret
+}
+
+; Test v8i16 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <8 x i16> @f6(i16 %scalar) {
+; CHECK-LABEL: f6:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vreph %v24, [[REG]], 3
+; CHECK: br %r14
+  %val = insertelement <8 x i16> undef, i16 %scalar, i32 2
+  %ret = shufflevector <8 x i16> undef, <8 x i16> %val,
+                       <8 x i32> <i32 10, i32 10, i32 10, i32 10,
+                                  i32 10, i32 10, i32 10, i32 10>
+  ret <8 x i16> %ret
+}
+
+; Test v4i32 splat of the first element.
+define <4 x i32> @f7(i32 %scalar) {
+; CHECK-LABEL: f7:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vrepf %v24, [[REG]], 1
+; CHECK: br %r14
+  %val = insertelement <4 x i32> undef, i32 %scalar, i32 0
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+; Test v4i32 splat of the last element.
+define <4 x i32> @f8(i32 %scalar) {
+; CHECK-LABEL: f8:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vrepf %v24, [[REG]], 1
+; CHECK: br %r14
+  %val = insertelement <4 x i32> undef, i32 %scalar, i32 3
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %ret
+}
+
+; Test v4i32 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <4 x i32> @f9(i32 %scalar) {
+; CHECK-LABEL: f9:
+; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
+; CHECK: vrepf %v24, [[REG]], 1
+; CHECK: br %r14
+  %val = insertelement <4 x i32> undef, i32 %scalar, i32 1
+  %ret = shufflevector <4 x i32> undef, <4 x i32> %val,
+                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  ret <4 x i32> %ret
+}
+
+; Test v2i64 splat of the first element.
+define <2 x i64> @f10(i64 %scalar) {
+; CHECK-LABEL: f10:
+; CHECK: vlvgp %v24, %r2, %r2
+; CHECK: br %r14
+  %val = insertelement <2 x i64> undef, i64 %scalar, i32 0
+  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
+                       <2 x i32> zeroinitializer
+  ret <2 x i64> %ret
+}
+
+; Test v2i64 splat of the last element.
+define <2 x i64> @f11(i64 %scalar) {
+; CHECK-LABEL: f11:
+; CHECK: vlvgp %v24, %r2, %r2
+; CHECK: br %r14
+  %val = insertelement <2 x i64> undef, i64 %scalar, i32 1
+  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
+                       <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-03.ll b/test/CodeGen/SystemZ/vec-perm-03.ll
new file mode 100644
index 00000000000..d74948bdb51
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-03.ll
@@ -0,0 +1,173 @@
+; Test replications of a scalar memory value, represented as splats.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 replicating load with no offset.
+define <16 x i8> @f1(i8 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: vlrepb %v24, 0(%r2)
+; CHECK: br %r14
+  %scalar = load i8, i8 *%ptr
+  %val = insertelement <16 x i8> undef, i8 %scalar, i32 0
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> zeroinitializer
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 replicating load with the maximum in-range offset.
+define <16 x i8> @f2(i8 *%base) {
+; CHECK-LABEL: f2:
+; CHECK: vlrepb %v24, 4095(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4095
+  %scalar = load i8, i8 *%ptr
+  %val = insertelement <16 x i8> undef, i8 %scalar, i32 0
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> zeroinitializer
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 replicating load with the first out-of-range offset.
+define <16 x i8> @f3(i8 *%base) {
+; CHECK-LABEL: f3:
+; CHECK: aghi %r2, 4096
+; CHECK: vlrepb %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4096
+  %scalar = load i8, i8 *%ptr
+  %val = insertelement <16 x i8> undef, i8 %scalar, i32 0
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> zeroinitializer
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 replicating load with no offset.
+define <8 x i16> @f4(i16 *%ptr) {
+; CHECK-LABEL: f4:
+; CHECK: vlreph %v24, 0(%r2)
+; CHECK: br %r14
+  %scalar = load i16, i16 *%ptr
+  %val = insertelement <8 x i16> undef, i16 %scalar, i32 0
+  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 replicating load with the maximum in-range offset.
+define <8 x i16> @f5(i16 *%base) {
+; CHECK-LABEL: f5:
+; CHECK: vlreph %v24, 4094(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%base, i64 2047
+  %scalar = load i16, i16 *%ptr
+  %val = insertelement <8 x i16> undef, i16 %scalar, i32 0
+  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 replicating load with the first out-of-range offset.
+define <8 x i16> @f6(i16 *%base) {
+; CHECK-LABEL: f6:
+; CHECK: aghi %r2, 4096
+; CHECK: vlreph %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%base, i64 2048
+  %scalar = load i16, i16 *%ptr
+  %val = insertelement <8 x i16> undef, i16 %scalar, i32 0
+  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 replicating load with no offset.
+define <4 x i32> @f7(i32 *%ptr) {
+; CHECK-LABEL: f7:
+; CHECK: vlrepf %v24, 0(%r2)
+; CHECK: br %r14
+  %scalar = load i32, i32 *%ptr
+  %val = insertelement <4 x i32> undef, i32 %scalar, i32 0
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 replicating load with the maximum in-range offset.
+define <4 x i32> @f8(i32 *%base) {
+; CHECK-LABEL: f8:
+; CHECK: vlrepf %v24, 4092(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i32, i32 *%base, i64 1023
+  %scalar = load i32, i32 *%ptr
+  %val = insertelement <4 x i32> undef, i32 %scalar, i32 0
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 replicating load with the first out-of-range offset.
+define <4 x i32> @f9(i32 *%base) {
+; CHECK-LABEL: f9:
+; CHECK: aghi %r2, 4096
+; CHECK: vlrepf %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i32, i32 *%base, i64 1024
+  %scalar = load i32, i32 *%ptr
+  %val = insertelement <4 x i32> undef, i32 %scalar, i32 0
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 replicating load with no offset.
+define <2 x i64> @f10(i64 *%ptr) {
+; CHECK-LABEL: f10:
+; CHECK: vlrepg %v24, 0(%r2)
+; CHECK: br %r14
+  %scalar = load i64, i64 *%ptr
+  %val = insertelement <2 x i64> undef, i64 %scalar, i32 0
+  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
+                       <2 x i32> zeroinitializer
+  ret <2 x i64> %ret
+}
+
+; Test a v2i64 replicating load with the maximum in-range offset.
+define <2 x i64> @f11(i64 *%base) {
+; CHECK-LABEL: f11:
+; CHECK: vlrepg %v24, 4088(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i32 511
+  %scalar = load i64, i64 *%ptr
+  %val = insertelement <2 x i64> undef, i64 %scalar, i32 0
+  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
+                       <2 x i32> zeroinitializer
+  ret <2 x i64> %ret
+}
+
+; Test a v2i64 replicating load with the first out-of-range offset.
+define <2 x i64> @f12(i64 *%base) {
+; CHECK-LABEL: f12:
+; CHECK: aghi %r2, 4096
+; CHECK: vlrepg %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i32 512
+  %scalar = load i64, i64 *%ptr
+  %val = insertelement <2 x i64> undef, i64 %scalar, i32 0
+  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
+                       <2 x i32> zeroinitializer
+  ret <2 x i64> %ret
+}
+
+; Test a v16i8 replicating load with an index.
+define <16 x i8> @f19(i8 *%base, i64 %index) {
+; CHECK-LABEL: f19:
+; CHECK: vlrepb %v24, 1023(%r3,%r2)
+; CHECK: br %r14
+  %ptr1 = getelementptr i8, i8 *%base, i64 %index
+  %ptr = getelementptr i8, i8 *%ptr1, i64 1023
+  %scalar = load i8, i8 *%ptr
+  %val = insertelement <16 x i8> undef, i8 %scalar, i32 0
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> zeroinitializer
+  ret <16 x i8> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-04.ll b/test/CodeGen/SystemZ/vec-perm-04.ll
new file mode 100644
index 00000000000..1d449b9bb34
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-04.ll
@@ -0,0 +1,160 @@
+; Test vector merge high.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a canonical v16i8 merge high.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vmrhb %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 0, i32 16, i32 1, i32 17,
+                                   i32 2, i32 18, i32 3, i32 19,
+                                   i32 4, i32 20, i32 5, i32 21,
+                                   i32 6, i32 22, i32 7, i32 23>
+  ret <16 x i8> %ret
+}
+
+; Test a reversed v16i8 merge high.
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vmrhb %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 16, i32 0, i32 17, i32 1,
+                                   i32 18, i32 2, i32 19, i32 3,
+                                   i32 20, i32 4, i32 21, i32 5,
+                                   i32 22, i32 6, i32 23, i32 7>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge high with only the first operand being used.
+define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vmrhb %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 0, i32 0, i32 1, i32 1,
+                                   i32 2, i32 2, i32 3, i32 3,
+                                   i32 4, i32 4, i32 5, i32 5,
+                                   i32 6, i32 6, i32 7, i32 7>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge high with only the second operand being used.
+; This is converted into @f3 by target-independent code.
+define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vmrhb %v24, %v26, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 16, i32 16, i32 17, i32 17,
+                                   i32 18, i32 18, i32 19, i32 19,
+                                   i32 20, i32 20, i32 21, i32 21,
+                                   i32 22, i32 22, i32 23, i32 23>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge with both operands being the same.  This too is
+; converted into @f3 by target-independent code.
+define <16 x i8> @f5(<16 x i8> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vmrhb %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val, <16 x i8> %val,
+                       <16 x i32> <i32 0, i32 16, i32 1, i32 17,
+                                   i32 2, i32 18, i32 3, i32 19,
+                                   i32 4, i32 20, i32 5, i32 21,
+                                   i32 6, i32 22, i32 7, i32 23>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge in which some of the indices are don't care.
+define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: vmrhb %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 0, i32 undef, i32 1, i32 17,
+                                   i32 undef, i32 18, i32 undef, i32 undef,
+                                   i32 undef, i32 20, i32 5, i32 21,
+                                   i32 undef, i32 22, i32 7, i32 undef>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge in which one of the operands is undefined and where
+; indices for that operand are "don't care".  Target-independent code
+; converts the indices themselves into "undef"s.
+define <16 x i8> @f7(<16 x i8> %val) {
+; CHECK-LABEL: f7:
+; CHECK: vmrhb %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
+                       <16 x i32> <i32 11, i32 16, i32 17, i32 5,
+                                   i32 18, i32 10, i32 19, i32 19,
+                                   i32 20, i32 20, i32 21, i32 3,
+                                   i32 2, i32 22, i32 9, i32 23>
+  ret <16 x i8> %ret
+}
+
+; Test a canonical v8i16 merge high.
+define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vmrhh %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 0, i32 8, i32 1, i32 9,
+                                  i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i16> %ret
+}
+
+; Test a reversed v8i16 merge high.
+define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: vmrhh %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 8, i32 0, i32 9, i32 1,
+                                  i32 10, i32 2, i32 11, i32 3>
+  ret <8 x i16> %ret
+}
+
+; Test a canonical v4i32 merge high.
+define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: vmrhf %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i32> %ret
+}
+
+; Test a reversed v4i32 merge high.
+define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f11:
+; CHECK: vmrhf %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 4, i32 0, i32 5, i32 1>
+  ret <4 x i32> %ret
+}
+
+; Test a canonical v2i64 merge high.
+define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vmrhg %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
+                       <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %ret
+}
+
+; Test a reversed v2i64 merge high.
+define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vmrhg %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
+                       <2 x i32> <i32 2, i32 0>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-05.ll b/test/CodeGen/SystemZ/vec-perm-05.ll
new file mode 100644
index 00000000000..636228c56ba
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-05.ll
@@ -0,0 +1,160 @@
+; Test vector merge low.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a canonical v16i8 merge low.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vmrlb %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 8, i32 24, i32 9, i32 25,
+                                   i32 10, i32 26, i32 11, i32 27,
+                                   i32 12, i32 28, i32 13, i32 29,
+                                   i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a reversed v16i8 merge low.
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vmrlb %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 24, i32 8, i32 25, i32 9,
+                                   i32 26, i32 10, i32 27, i32 11,
+                                   i32 28, i32 12, i32 29, i32 13,
+                                   i32 30, i32 14, i32 31, i32 15>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge low with only the first operand being used.
+define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vmrlb %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 8, i32 8, i32 9, i32 9,
+                                   i32 10, i32 10, i32 11, i32 11,
+                                   i32 12, i32 12, i32 13, i32 13,
+                                   i32 14, i32 14, i32 15, i32 15>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge low with only the second operand being used.
+; This is converted into @f3 by target-independent code.
+define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vmrlb %v24, %v26, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 24, i32 24, i32 25, i32 25,
+                                   i32 26, i32 26, i32 27, i32 27,
+                                   i32 28, i32 28, i32 29, i32 29,
+                                   i32 30, i32 30, i32 31, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge with both operands being the same.  This too is
+; converted into @f3 by target-independent code.
+define <16 x i8> @f5(<16 x i8> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vmrlb %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val, <16 x i8> %val,
+                       <16 x i32> <i32 8, i32 24, i32 9, i32 25,
+                                   i32 10, i32 26, i32 11, i32 27,
+                                   i32 12, i32 28, i32 13, i32 29,
+                                   i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge in which some of the indices are don't care.
+define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: vmrlb %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 8, i32 undef, i32 9, i32 25,
+                                   i32 undef, i32 26, i32 undef, i32 undef,
+                                   i32 undef, i32 28, i32 13, i32 29,
+                                   i32 undef, i32 30, i32 15, i32 undef>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 merge in which one of the operands is undefined and where
+; indices for that operand are "don't care".  Target-independent code
+; converts the indices themselves into "undef"s.
+define <16 x i8> @f7(<16 x i8> %val) {
+; CHECK-LABEL: f7:
+; CHECK: vmrlb %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
+                       <16 x i32> <i32 4, i32 24, i32 25, i32 6,
+                                   i32 26, i32 11, i32 27, i32 27,
+                                   i32 28, i32 28, i32 29, i32 1,
+                                   i32 0, i32 30, i32 7, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a canonical v8i16 merge low.
+define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vmrlh %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 4, i32 12, i32 5, i32 13,
+                                  i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i16> %ret
+}
+
+; Test a reversed v8i16 merge low.
+define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: vmrlh %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 12, i32 4, i32 13, i32 5,
+                                  i32 14, i32 6, i32 15, i32 7>
+  ret <8 x i16> %ret
+}
+
+; Test a canonical v4i32 merge low.
+define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: vmrlf %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x i32> %ret
+}
+
+; Test a reversed v4i32 merge low.
+define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f11:
+; CHECK: vmrlf %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 6, i32 2, i32 7, i32 3>
+  ret <4 x i32> %ret
+}
+
+; Test a canonical v2i64 merge low.
+define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vmrlg %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
+                       <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %ret
+}
+
+; Test a reversed v2i64 merge low.
+define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vmrlg %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
+                       <2 x i32> <i32 3, i32 1>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-06.ll b/test/CodeGen/SystemZ/vec-perm-06.ll
new file mode 100644
index 00000000000..298fc60e851
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-06.ll
@@ -0,0 +1,140 @@
+; Test vector pack.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a canonical v16i8 pack.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vpkh %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 1, i32 3, i32 5, i32 7,
+                                   i32 9, i32 11, i32 13, i32 15,
+                                   i32 17, i32 19, i32 21, i32 23,
+                                   i32 25, i32 27, i32 29, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a reversed v16i8 pack.
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vpkh %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 17, i32 19, i32 21, i32 23,
+                                   i32 25, i32 27, i32 29, i32 31,
+                                   i32 1, i32 3, i32 5, i32 7,
+                                   i32 9, i32 11, i32 13, i32 15>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 pack with only the first operand being used.
+define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vpkh %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 1, i32 3, i32 5, i32 7,
+                                   i32 9, i32 11, i32 13, i32 15,
+                                   i32 1, i32 3, i32 5, i32 7,
+                                   i32 9, i32 11, i32 13, i32 15>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 pack with only the second operand being used.
+; This is converted into @f3 by target-independent code.
+define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vpkh %v24, %v26, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 17, i32 19, i32 21, i32 23,
+                                   i32 25, i32 27, i32 29, i32 31,
+                                   i32 17, i32 19, i32 21, i32 23,
+                                   i32 25, i32 27, i32 29, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 pack with both operands being the same.  This too is
+; converted into @f3 by target-independent code.
+define <16 x i8> @f5(<16 x i8> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vpkh %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val, <16 x i8> %val,
+                       <16 x i32> <i32 1, i32 3, i32 5, i32 7,
+                                   i32 9, i32 11, i32 13, i32 15,
+                                   i32 17, i32 19, i32 21, i32 23,
+                                   i32 25, i32 27, i32 29, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 pack in which some of the indices are don't care.
+define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: vpkh %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 1, i32 undef, i32 5, i32 7,
+                                   i32 undef, i32 11, i32 undef, i32 15,
+                                   i32 17, i32 19, i32 21, i32 undef,
+                                   i32 25, i32 27, i32 undef, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 pack in which one of the operands is undefined and where
+; indices for that operand are "don't care".  Target-independent code
+; converts the indices themselves into "undef"s.
+define <16 x i8> @f7(<16 x i8> %val) {
+; CHECK-LABEL: f7:
+; CHECK: vpkh %v24, %v24, %v24
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
+                       <16 x i32> <i32 4, i32 19, i32 21, i32 2,
+                                   i32 25, i32 10, i32 29, i32 31,
+                                   i32 17, i32 19, i32 21, i32 23,
+                                   i32 0, i32 27, i32 14, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a canonical v8i16 pack.
+define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vpkf %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 1, i32 3, i32 5, i32 7,
+                                  i32 9, i32 11, i32 13, i32 15>
+  ret <8 x i16> %ret
+}
+
+; Test a reversed v8i16 pack.
+define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: vpkf %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 9, i32 11, i32 13, i32 15,
+                                  i32 1, i32 3, i32 5, i32 7>
+  ret <8 x i16> %ret
+}
+
+; Test a canonical v4i32 pack.
+define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: vpkg %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i32> %ret
+}
+
+; Test a reversed v4i32 pack.
+define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f11:
+; CHECK: vpkg %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  ret <4 x i32> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-07.ll b/test/CodeGen/SystemZ/vec-perm-07.ll
new file mode 100644
index 00000000000..40ca3995524
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-07.ll
@@ -0,0 +1,125 @@
+; Test vector shift left double immediate.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 shift with the lowest useful shift amount.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vsldb %v24, %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 1, i32 2, i32 3, i32 4,
+                                   i32 5, i32 6, i32 7, i32 8,
+                                   i32 9, i32 10, i32 11, i32 12,
+                                   i32 13, i32 14, i32 15, i32 16>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift with the highest shift amount.
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vsldb %v24, %v24, %v26, 15
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 15, i32 16, i32 17, i32 18,
+                                   i32 19, i32 20, i32 21, i32 22,
+                                   i32 23, i32 24, i32 25, i32 26,
+                                   i32 27, i32 28, i32 29, i32 30>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift in which the operands need to be reversed.
+define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vsldb %v24, %v26, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 20, i32 21, i32 22, i32 23,
+                                   i32 24, i32 25, i32 26, i32 27,
+                                   i32 28, i32 29, i32 30, i32 31,
+                                   i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift in which the operands need to be duplicated.
+define <16 x i8> @f4(<16 x i8> %val) {
+; CHECK-LABEL: f4:
+; CHECK: vsldb %v24, %v24, %v24, 7
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> <i32 7, i32 8, i32 9, i32 10,
+                                   i32 11, i32 12, i32 13, i32 14,
+                                   i32 15, i32 0, i32 1, i32 2,
+                                   i32 3, i32 4, i32 5, i32 6>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift in which some of the indices are undefs.
+define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: vsldb %v24, %v24, %v26, 11
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 undef, i32 12, i32 13, i32 14,
+                                   i32 15, i32 16, i32 undef, i32 18,
+                                   i32 19, i32 20, i32 21, i32 22,
+                                   i32 23, i32 24, i32 undef, i32 26>
+  ret <16 x i8> %ret
+}
+
+; ...and again with reversed operands.
+define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: vsldb %v24, %v26, %v24, 13
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 29, i32 undef, i32 31, i32 0,
+                                   i32 1, i32 2, i32 3, i32 4,
+                                   i32 5, i32 undef, i32 7, i32 8,
+                                   i32 9, i32 10, i32 11, i32 12>
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 shift with the lowest useful shift amount.
+define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f7:
+; CHECK: vsldb %v24, %v24, %v26, 2
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 1, i32 2, i32 3, i32 4,
+                                  i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 shift with the highest useful shift amount.
+define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vsldb %v24, %v24, %v26, 14
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 7, i32 8, i32 9, i32 10,
+                                  i32 11, i32 12, i32 13, i32 14>
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 shift with the lowest useful shift amount.
+define <4 x i32> @f9(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: vsldb %v24, %v24, %v26, 4
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 shift with the highest useful shift amount.
+define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: vsldb %v24, %v24, %v26, 12
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i32> %ret
+}
+
+; We use VPDI for v2i64 shuffles.
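A sketch of why (not itself part of the patch): at doubleword granularity a
"shift" is just a selection of one doubleword from each operand, which VPDI
expresses directly.  The hypothetical function below should compile, with
-mcpu=z13, to "vpdi %v24, %v24, %v26, 4" (low doubleword of %val1, high
doubleword of %val2) rather than to a VSLDB:

    define <2 x i64> @vpdi_sketch(<2 x i64> %val1, <2 x i64> %val2) {
      ; Equivalent to shifting the %val1:%val2 concatenation left by 8 bytes.
      %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
                           <2 x i32> <i32 1, i32 2>
      ret <2 x i64> %ret
    }

The vec-perm-08.ll tests below cover these VPDI selections directly.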
diff --git a/test/CodeGen/SystemZ/vec-perm-08.ll b/test/CodeGen/SystemZ/vec-perm-08.ll
new file mode 100644
index 00000000000..4d06377f5a3
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-08.ll
@@ -0,0 +1,130 @@
+; Test vector permutes using VPDI.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a high1/low2 permute for v16i8.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vpdi %v24, %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                   i32 4, i32 5, i32 6, i32 7,
+                                   i32 24, i32 25, i32 26, i32 27,
+                                   i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %ret
+}
+
+; Test a low2/high1 permute for v16i8.
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vpdi %v24, %v26, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 24, i32 25, i32 26, i32 27,
+                                   i32 28, i32 29, i32 30, i32 31,
+                                   i32 0, i32 1, i32 2, i32 3,
+                                   i32 4, i32 5, i32 6, i32 7>
+  ret <16 x i8> %ret
+}
+
+; Test a low1/high2 permute for v16i8.
+define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vpdi %v24, %v24, %v26, 4
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 8, i32 9, i32 10, i32 11,
+                                   i32 12, i32 13, i32 14, i32 15,
+                                   i32 16, i32 17, i32 18, i32 19,
+                                   i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %ret
+}
+
+; Test a high2/low1 permute for v16i8.
+define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vpdi %v24, %v26, %v24, 1
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 16, i32 17, i32 18, i32 19,
+                                   i32 20, i32 21, i32 22, i32 23,
+                                   i32 8, i32 9, i32 10, i32 11,
+                                   i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %ret
+}
+
+; Test reversing two doublewords in a v16i8.
+define <16 x i8> @f5(<16 x i8> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vpdi %v24, %v24, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
+                       <16 x i32> <i32 8, i32 9, i32 10, i32 11,
+                                   i32 12, i32 13, i32 14, i32 15,
+                                   i32 0, i32 1, i32 2, i32 3,
+                                   i32 4, i32 5, i32 6, i32 7>
+  ret <16 x i8> %ret
+}
+
+; Test a high1/low2 permute for v8i16.
+define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: vpdi %v24, %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                  i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %ret
+}
+
+; Test a low2/high1 permute for v8i16.
+define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f7:
+; CHECK: vpdi %v24, %v26, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 12, i32 13, i32 14, i32 15,
+                                  i32 0, i32 1, i32 2, i32 3>
+  ret <8 x i16> %ret
+}
+
+; Test a high1/low2 permute for v4i32.
+define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vpdi %v24, %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %ret
+}
+
+; Test a low2/high1 permute for v4i32.
+define <4 x i32> @f9(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: vpdi %v24, %v26, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %ret
+}
+
+; Test a high1/low2 permute for v2i64.
+define <2 x i64> @f10(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: vpdi %v24, %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
+                       <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %ret
+}
+
+; Test low2/high1 permute for v2i64.
+define <2 x i64> @f11(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f11:
+; CHECK: vpdi %v24, %v26, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
+                       <2 x i32> <i32 3, i32 0>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-09.ll b/test/CodeGen/SystemZ/vec-perm-09.ll
new file mode 100644
index 00000000000..9c9632cf030
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-09.ll
@@ -0,0 +1,38 @@
+; Test general vector permute of a v16i8.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \
+; RUN:   FileCheck -check-prefix=CHECK-CODE %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \
+; RUN:   FileCheck -check-prefix=CHECK-VECTOR %s
+
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-CODE-LABEL: f1:
+; CHECK-CODE: larl [[REG:%r[0-5]]],
+; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]])
+; CHECK-CODE: vperm %v24, %v24, %v26, [[MASK]]
+; CHECK-CODE: br %r14
+;
+; CHECK-VECTOR: .byte 1
+; CHECK-VECTOR-NEXT: .byte 19
+; CHECK-VECTOR-NEXT: .byte 6
+; CHECK-VECTOR-NEXT: .byte 5
+; CHECK-VECTOR-NEXT: .byte 20
+; CHECK-VECTOR-NEXT: .byte 22
+; CHECK-VECTOR-NEXT: .byte 1
+; CHECK-VECTOR-NEXT: .byte 1
+; CHECK-VECTOR-NEXT: .byte 25
+; CHECK-VECTOR-NEXT: .byte 29
+; CHECK-VECTOR-NEXT: .byte 11
+; Any byte would be OK here
+; CHECK-VECTOR-NEXT: .space 1
+; CHECK-VECTOR-NEXT: .byte 31
+; CHECK-VECTOR-NEXT: .byte 4
+; CHECK-VECTOR-NEXT: .byte 15
+; CHECK-VECTOR-NEXT: .byte 19
+  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
+                       <16 x i32> <i32 1, i32 19, i32 6, i32 5,
+                                   i32 20, i32 22, i32 1, i32 1,
+                                   i32 25, i32 29, i32 11, i32 undef,
+                                   i32 31, i32 4, i32 15, i32 19>
+  ret <16 x i8> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-10.ll b/test/CodeGen/SystemZ/vec-perm-10.ll
new file mode 100644
index 00000000000..382e6dc4c3f
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-10.ll
@@ -0,0 +1,36 @@
+; Test general vector permute of a v8i16.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \
+; RUN:   FileCheck -check-prefix=CHECK-CODE %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \
+; RUN:   FileCheck -check-prefix=CHECK-VECTOR %s
+
+define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-CODE-LABEL: f1:
+; CHECK-CODE: larl [[REG:%r[0-5]]],
+; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]])
+; CHECK-CODE: vperm %v24, %v26, %v24, [[MASK]]
+; CHECK-CODE: br %r14
+;
+; CHECK-VECTOR: .byte 0
+; CHECK-VECTOR-NEXT: .byte 1
+; CHECK-VECTOR-NEXT: .byte 26
+; CHECK-VECTOR-NEXT: .byte 27
+; Any 2 bytes would be OK here
+; CHECK-VECTOR-NEXT: .space 1
+; CHECK-VECTOR-NEXT: .space 1
+; CHECK-VECTOR-NEXT: .byte 28
+; CHECK-VECTOR-NEXT: .byte 29
+; CHECK-VECTOR-NEXT: .byte 6
+; CHECK-VECTOR-NEXT: .byte 7
+; CHECK-VECTOR-NEXT: .byte 14
+; CHECK-VECTOR-NEXT: .byte 15
+; CHECK-VECTOR-NEXT: .byte 8
+; CHECK-VECTOR-NEXT: .byte 9
+; CHECK-VECTOR-NEXT: .byte 16
+; CHECK-VECTOR-NEXT: .byte 17
+  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i32> <i32 8, i32 5, i32 undef, i32 6,
+                                  i32 11, i32 15, i32 12, i32 0>
+  ret <8 x i16> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-perm-11.ll b/test/CodeGen/SystemZ/vec-perm-11.ll
new file mode 100644
index 00000000000..c9e29880fe0
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-perm-11.ll
@@ -0,0 +1,35 @@
+; Test general vector permute of a v4i32.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \
+; RUN:   FileCheck -check-prefix=CHECK-CODE %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \
+; RUN:   FileCheck -check-prefix=CHECK-VECTOR %s
+
+define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-CODE-LABEL: f1:
+; CHECK-CODE: larl [[REG:%r[0-5]]],
+; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]])
+; CHECK-CODE: vperm %v24, %v26, %v24, [[MASK]]
+; CHECK-CODE: br %r14
+;
+; CHECK-VECTOR: .byte 4
+; CHECK-VECTOR-NEXT: .byte 5
+; CHECK-VECTOR-NEXT: .byte 6
+; CHECK-VECTOR-NEXT: .byte 7
+; CHECK-VECTOR-NEXT: .byte 20
+; CHECK-VECTOR-NEXT: .byte 21
+; CHECK-VECTOR-NEXT: .byte 22
+; CHECK-VECTOR-NEXT: .byte 23
+; Any 4 bytes would be OK here
+; CHECK-VECTOR-NEXT: .space 1
+; CHECK-VECTOR-NEXT: .space 1
+; CHECK-VECTOR-NEXT: .space 1
+; CHECK-VECTOR-NEXT: .space 1
+; CHECK-VECTOR-NEXT: .byte 12
+; CHECK-VECTOR-NEXT: .byte 13
+; CHECK-VECTOR-NEXT: .byte 14
+; CHECK-VECTOR-NEXT: .byte 15
+  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i32> <i32 5, i32 1, i32 undef, i32 7>
+  ret <4 x i32> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-shift-01.ll b/test/CodeGen/SystemZ/vec-shift-01.ll
new file mode 100644
index 00000000000..be8605b182c
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-shift-01.ll
@@ -0,0 +1,39 @@
+; Test vector shift left with vector shift amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 shift.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: veslvb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = shl <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 shift.
+define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: veslvh %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = shl <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 shift.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: veslvf %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = shl <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 shift.
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: veslvg %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = shl <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-shift-02.ll b/test/CodeGen/SystemZ/vec-shift-02.ll
new file mode 100644
index 00000000000..2825872e023
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-shift-02.ll
@@ -0,0 +1,39 @@
+; Test vector arithmetic shift right with vector shift amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 shift.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vesravb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = ashr <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 shift.
+define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vesravh %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = ashr <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 shift.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vesravf %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = ashr <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 shift.
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vesravg %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = ashr <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-shift-03.ll b/test/CodeGen/SystemZ/vec-shift-03.ll
new file mode 100644
index 00000000000..c923d8b5d45
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-shift-03.ll
@@ -0,0 +1,39 @@
+; Test vector logical shift right with vector shift amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 shift.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vesrlvb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = lshr <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 shift.
+define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vesrlvh %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = lshr <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 shift.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vesrlvf %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = lshr <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 shift.
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vesrlvg %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = lshr <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-shift-04.ll b/test/CodeGen/SystemZ/vec-shift-04.ll
new file mode 100644
index 00000000000..6fd12897bf5
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-shift-04.ll
@@ -0,0 +1,134 @@
+; Test vector shift left with scalar shift amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 shift by a variable.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) {
+; CHECK-LABEL: f1:
+; CHECK: veslb %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %truncshift = trunc i32 %shift to i8
+  %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0
+  %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef,
+                        <16 x i32> zeroinitializer
+  %ret = shl <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift by the lowest useful constant.
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) {
+; CHECK-LABEL: f2:
+; CHECK: veslb %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shl <16 x i8> %val, <i8 1, i8 1, i8 1, i8 1,
+                              i8 1, i8 1, i8 1, i8 1,
+                              i8 1, i8 1, i8 1, i8 1,
+                              i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift by the highest useful constant.
+define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) {
+; CHECK-LABEL: f3:
+; CHECK: veslb %v24, %v26, 7
+; CHECK: br %r14
+  %ret = shl <16 x i8> %val, <i8 7, i8 7, i8 7, i8 7,
+                              i8 7, i8 7, i8 7, i8 7,
+                              i8 7, i8 7, i8 7, i8 7,
+                              i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 shift by a variable.
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) {
+; CHECK-LABEL: f4:
+; CHECK: veslh %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %truncshift = trunc i32 %shift to i16
+  %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0
+  %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef,
+                        <8 x i32> zeroinitializer
+  %ret = shl <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 shift by the lowest useful constant.
+define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) {
+; CHECK-LABEL: f5:
+; CHECK: veslh %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shl <8 x i16> %val, <i16 1, i16 1, i16 1, i16 1,
+                              i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 shift by the highest useful constant.
+define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) {
+; CHECK-LABEL: f6:
+; CHECK: veslh %v24, %v26, 15
+; CHECK: br %r14
+  %ret = shl <8 x i16> %val, <i16 15, i16 15, i16 15, i16 15,
+                              i16 15, i16 15, i16 15, i16 15>
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 shift by a variable.
+define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) {
+; CHECK-LABEL: f7:
+; CHECK: veslf %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0
+  %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef,
+                        <4 x i32> zeroinitializer
+  %ret = shl <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 shift by the lowest useful constant.
+define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) {
+; CHECK-LABEL: f8:
+; CHECK: veslf %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shl <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 shift by the highest useful constant.
+define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) {
+; CHECK-LABEL: f9:
+; CHECK: veslf %v24, %v26, 31
+; CHECK: br %r14
+  %ret = shl <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31>
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 shift by a variable.
+define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) {
+; CHECK-LABEL: f10:
+; CHECK: veslg %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %extshift = sext i32 %shift to i64
+  %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0
+  %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef,
+                        <2 x i32> zeroinitializer
+  %ret = shl <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
+
+; Test a v2i64 shift by the lowest useful constant.
+define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) {
+; CHECK-LABEL: f11:
+; CHECK: veslg %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shl <2 x i64> %val, <i64 1, i64 1>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i64 shift by the highest useful constant.
+define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) {
+; CHECK-LABEL: f12:
+; CHECK: veslg %v24, %v26, 63
+; CHECK: br %r14
+  %ret = shl <2 x i64> %val, <i64 63, i64 63>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-shift-05.ll b/test/CodeGen/SystemZ/vec-shift-05.ll
new file mode 100644
index 00000000000..22ce46b2d0d
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-shift-05.ll
@@ -0,0 +1,134 @@
+; Test vector arithmetic shift right with scalar shift amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 shift by a variable.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) {
+; CHECK-LABEL: f1:
+; CHECK: vesrab %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %truncshift = trunc i32 %shift to i8
+  %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0
+  %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef,
+                        <16 x i32> zeroinitializer
+  %ret = ashr <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift by the lowest useful constant.
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vesrab %v24, %v26, 1
+; CHECK: br %r14
+  %ret = ashr <16 x i8> %val, <i8 1, i8 1, i8 1, i8 1,
+                               i8 1, i8 1, i8 1, i8 1,
+                               i8 1, i8 1, i8 1, i8 1,
+                               i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift by the highest useful constant.
+define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) {
+; CHECK-LABEL: f3:
+; CHECK: vesrab %v24, %v26, 7
+; CHECK: br %r14
+  %ret = ashr <16 x i8> %val, <i8 7, i8 7, i8 7, i8 7,
+                               i8 7, i8 7, i8 7, i8 7,
+                               i8 7, i8 7, i8 7, i8 7,
+                               i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 shift by a variable.
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) {
+; CHECK-LABEL: f4:
+; CHECK: vesrah %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %truncshift = trunc i32 %shift to i16
+  %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0
+  %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef,
+                        <8 x i32> zeroinitializer
+  %ret = ashr <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 shift by the lowest useful constant.
+define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vesrah %v24, %v26, 1
+; CHECK: br %r14
+  %ret = ashr <8 x i16> %val, <i16 1, i16 1, i16 1, i16 1,
+                               i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 shift by the highest useful constant.
+define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vesrah %v24, %v26, 15
+; CHECK: br %r14
+  %ret = ashr <8 x i16> %val, <i16 15, i16 15, i16 15, i16 15,
+                               i16 15, i16 15, i16 15, i16 15>
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 shift by a variable.
+define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) {
+; CHECK-LABEL: f7:
+; CHECK: vesraf %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0
+  %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef,
+                        <4 x i32> zeroinitializer
+  %ret = ashr <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 shift by the lowest useful constant.
+define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) {
+; CHECK-LABEL: f8:
+; CHECK: vesraf %v24, %v26, 1
+; CHECK: br %r14
+  %ret = ashr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 shift by the highest useful constant.
+define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vesraf %v24, %v26, 31
+; CHECK: br %r14
+  %ret = ashr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31>
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 shift by a variable.
+define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) {
+; CHECK-LABEL: f10:
+; CHECK: vesrag %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %extshift = sext i32 %shift to i64
+  %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0
+  %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef,
+                        <2 x i32> zeroinitializer
+  %ret = ashr <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
+
+; Test a v2i64 shift by the lowest useful constant.
+define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) {
+; CHECK-LABEL: f11:
+; CHECK: vesrag %v24, %v26, 1
+; CHECK: br %r14
+  %ret = ashr <2 x i64> %val, <i64 1, i64 1>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i64 shift by the highest useful constant.
+define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) {
+; CHECK-LABEL: f12:
+; CHECK: vesrag %v24, %v26, 63
+; CHECK: br %r14
+  %ret = ashr <2 x i64> %val, <i64 63, i64 63>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-shift-06.ll b/test/CodeGen/SystemZ/vec-shift-06.ll
new file mode 100644
index 00000000000..8a5bb0a9a55
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-shift-06.ll
@@ -0,0 +1,134 @@
+; Test vector logical shift right with scalar shift amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 shift by a variable.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) {
+; CHECK-LABEL: f1:
+; CHECK: vesrlb %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %truncshift = trunc i32 %shift to i8
+  %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0
+  %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef,
+                        <16 x i32> zeroinitializer
+  %ret = lshr <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift by the lowest useful constant.
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vesrlb %v24, %v26, 1
+; CHECK: br %r14
+  %ret = lshr <16 x i8> %val, <i8 1, i8 1, i8 1, i8 1,
+                               i8 1, i8 1, i8 1, i8 1,
+                               i8 1, i8 1, i8 1, i8 1,
+                               i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %ret
+}
+
+; Test a v16i8 shift by the highest useful constant.
+define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) {
+; CHECK-LABEL: f3:
+; CHECK: vesrlb %v24, %v26, 7
+; CHECK: br %r14
+  %ret = lshr <16 x i8> %val, <i8 7, i8 7, i8 7, i8 7,
+                               i8 7, i8 7, i8 7, i8 7,
+                               i8 7, i8 7, i8 7, i8 7,
+                               i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 shift by a variable.
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) {
+; CHECK-LABEL: f4:
+; CHECK: vesrlh %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %truncshift = trunc i32 %shift to i16
+  %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0
+  %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef,
+                        <8 x i32> zeroinitializer
+  %ret = lshr <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 shift by the lowest useful constant.
+define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vesrlh %v24, %v26, 1
+; CHECK: br %r14
+  %ret = lshr <8 x i16> %val, <i16 1, i16 1, i16 1, i16 1,
+                               i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %ret
+}
+
+; Test a v8i16 shift by the highest useful constant.
+define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vesrlh %v24, %v26, 15
+; CHECK: br %r14
+  %ret = lshr <8 x i16> %val, <i16 15, i16 15, i16 15, i16 15,
+                               i16 15, i16 15, i16 15, i16 15>
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 shift by a variable.
+define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) {
+; CHECK-LABEL: f7:
+; CHECK: vesrlf %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0
+  %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef,
+                        <4 x i32> zeroinitializer
+  %ret = lshr <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 shift by the lowest useful constant.
+define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) {
+; CHECK-LABEL: f8:
+; CHECK: vesrlf %v24, %v26, 1
+; CHECK: br %r14
+  %ret = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i32 shift by the highest useful constant.
+define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vesrlf %v24, %v26, 31
+; CHECK: br %r14
+  %ret = lshr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31>
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 shift by a variable.
+define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) {
+; CHECK-LABEL: f10:
+; CHECK: vesrlg %v24, %v26, 0(%r2)
+; CHECK: br %r14
+  %extshift = sext i32 %shift to i64
+  %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0
+  %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef,
+                        <2 x i32> zeroinitializer
+  %ret = lshr <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
+
+; Test a v2i64 shift by the lowest useful constant.
+define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) {
+; CHECK-LABEL: f11:
+; CHECK: vesrlg %v24, %v26, 1
+; CHECK: br %r14
+  %ret = lshr <2 x i64> %val, <i64 1, i64 1>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i64 shift by the highest useful constant.
+define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) {
+; CHECK-LABEL: f12:
+; CHECK: vesrlg %v24, %v26, 63
+; CHECK: br %r14
+  %ret = lshr <2 x i64> %val, <i64 63, i64 63>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-shift-07.ll b/test/CodeGen/SystemZ/vec-shift-07.ll
new file mode 100644
index 00000000000..f229c5e25a4
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-shift-07.ll
@@ -0,0 +1,182 @@
+; Test vector sign extensions.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i1->v16i8 extension.
+define <16 x i8> @f1(<16 x i8> %val) {
+; CHECK-LABEL: f1:
+; CHECK: veslb [[REG:%v[0-9]+]], %v24, 7
+; CHECK: vesrab %v24, [[REG]], 7
+; CHECK: br %r14
+  %trunc = trunc <16 x i8> %val to <16 x i1>
+  %ret = sext <16 x i1> %trunc to <16 x i8>
+  ret <16 x i8> %ret
+}
+
+; Test a v8i1->v8i16 extension.
+define <8 x i16> @f2(<8 x i16> %val) {
+; CHECK-LABEL: f2:
+; CHECK: veslh [[REG:%v[0-9]+]], %v24, 15
+; CHECK: vesrah %v24, [[REG]], 15
+; CHECK: br %r14
+  %trunc = trunc <8 x i16> %val to <8 x i1>
+  %ret = sext <8 x i1> %trunc to <8 x i16>
+  ret <8 x i16> %ret
+}
+
+; Test a v8i8->v8i16 extension.
+define <8 x i16> @f3(<8 x i16> %val) {
+; CHECK-LABEL: f3:
+; CHECK: veslh [[REG:%v[0-9]+]], %v24, 8
+; CHECK: vesrah %v24, [[REG]], 8
+; CHECK: br %r14
+  %trunc = trunc <8 x i16> %val to <8 x i8>
+  %ret = sext <8 x i8> %trunc to <8 x i16>
+  ret <8 x i16> %ret
+}
+
+; Test a v4i1->v4i32 extension.
+define <4 x i32> @f4(<4 x i32> %val) {
+; CHECK-LABEL: f4:
+; CHECK: veslf [[REG:%v[0-9]+]], %v24, 31
+; CHECK: vesraf %v24, [[REG]], 31
+; CHECK: br %r14
+  %trunc = trunc <4 x i32> %val to <4 x i1>
+  %ret = sext <4 x i1> %trunc to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i8->v4i32 extension.
+define <4 x i32> @f5(<4 x i32> %val) {
+; CHECK-LABEL: f5:
+; CHECK: veslf [[REG:%v[0-9]+]], %v24, 24
+; CHECK: vesraf %v24, [[REG]], 24
+; CHECK: br %r14
+  %trunc = trunc <4 x i32> %val to <4 x i8>
+  %ret = sext <4 x i8> %trunc to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i16->v4i32 extension.
+define <4 x i32> @f6(<4 x i32> %val) {
+; CHECK-LABEL: f6:
+; CHECK: veslf [[REG:%v[0-9]+]], %v24, 16
+; CHECK: vesraf %v24, [[REG]], 16
+; CHECK: br %r14
+  %trunc = trunc <4 x i32> %val to <4 x i16>
+  %ret = sext <4 x i16> %trunc to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v2i1->v2i64 extension.
+define <2 x i64> @f7(<2 x i64> %val) {
+; CHECK-LABEL: f7:
+; CHECK: veslg [[REG:%v[0-9]+]], %v24, 63
+; CHECK: vesrag %v24, [[REG]], 63
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i1>
+  %ret = sext <2 x i1> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i8->v2i64 extension.
+define <2 x i64> @f8(<2 x i64> %val) {
+; CHECK-LABEL: f8:
+; CHECK: vsegb %v24, %v24
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i8>
+  %ret = sext <2 x i8> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i16->v2i64 extension.
+define <2 x i64> @f9(<2 x i64> %val) {
+; CHECK-LABEL: f9:
+; CHECK: vsegh %v24, %v24
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i16>
+  %ret = sext <2 x i16> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i32->v2i64 extension.
+define <2 x i64> @f10(<2 x i64> %val) {
+; CHECK-LABEL: f10:
+; CHECK: vsegf %v24, %v24
+; CHECK: br %r14
+  %trunc = trunc <2 x i64> %val to <2 x i32>
+  %ret = sext <2 x i32> %trunc to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test an alternative v2i8->v2i64 extension.
+define <2 x i64> @f11(<2 x i64> %val) {
+; CHECK-LABEL: f11:
+; CHECK: vsegb %v24, %v24
+; CHECK: br %r14
+  %shl = shl <2 x i64> %val, <i64 56, i64 56>
+  %ret = ashr <2 x i64> %shl, <i64 56, i64 56>
+  ret <2 x i64> %ret
+}
+
+; Test an alternative v2i16->v2i64 extension.
+define <2 x i64> @f12(<2 x i64> %val) {
+; CHECK-LABEL: f12:
+; CHECK: vsegh %v24, %v24
+; CHECK: br %r14
+  %shl = shl <2 x i64> %val, <i64 48, i64 48>
+  %ret = ashr <2 x i64> %shl, <i64 48, i64 48>
+  ret <2 x i64> %ret
+}
+
+; Test an alternative v2i32->v2i64 extension.
+define <2 x i64> @f13(<2 x i64> %val) {
+; CHECK-LABEL: f13:
+; CHECK: vsegf %v24, %v24
+; CHECK: br %r14
+  %shl = shl <2 x i64> %val, <i64 32, i64 32>
+  %ret = ashr <2 x i64> %shl, <i64 32, i64 32>
+  ret <2 x i64> %ret
+}
+
+; Test an extraction-based v2i8->v2i64 extension.
+define <2 x i64> @f14(<16 x i8> %val) {
+; CHECK-LABEL: f14:
+; CHECK: vsegb %v24, %v24
+; CHECK: br %r14
+  %elt0 = extractelement <16 x i8> %val, i32 7
+  %elt1 = extractelement <16 x i8> %val, i32 15
+  %ext0 = sext i8 %elt0 to i64
+  %ext1 = sext i8 %elt1 to i64
+  %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0
+  %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1
+  ret <2 x i64> %vec1
+}
+
+; Test an extraction-based v2i16->v2i64 extension.
+define <2 x i64> @f15(<8 x i16> %val) {
+; CHECK-LABEL: f15:
+; CHECK: vsegh %v24, %v24
+; CHECK: br %r14
+  %elt0 = extractelement <8 x i16> %val, i32 3
+  %elt1 = extractelement <8 x i16> %val, i32 7
+  %ext0 = sext i16 %elt0 to i64
+  %ext1 = sext i16 %elt1 to i64
+  %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0
+  %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1
+  ret <2 x i64> %vec1
+}
+
+; Test an extraction-based v2i32->v2i64 extension.
+define <2 x i64> @f16(<4 x i32> %val) {
+; CHECK-LABEL: f16:
+; CHECK: vsegf %v24, %v24
+; CHECK: br %r14
+  %elt0 = extractelement <4 x i32> %val, i32 1
+  %elt1 = extractelement <4 x i32> %val, i32 3
+  %ext0 = sext i32 %elt0 to i64
+  %ext1 = sext i32 %elt1 to i64
+  %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0
+  %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1
+  ret <2 x i64> %vec1
+}
diff --git a/test/CodeGen/SystemZ/vec-sub-01.ll b/test/CodeGen/SystemZ/vec-sub-01.ll
new file mode 100644
index 00000000000..9e5b4f81e6d
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-sub-01.ll
@@ -0,0 +1,39 @@
+; Test vector subtraction.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 subtraction.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vsb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 subtraction.
+define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vsh %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 subtraction.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vsf %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 subtraction.
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vsg %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-xor-01.ll b/test/CodeGen/SystemZ/vec-xor-01.ll
new file mode 100644
index 00000000000..063b768117c
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-xor-01.ll
@@ -0,0 +1,39 @@
+; Test vector XOR.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 XOR.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vx %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = xor <16 x i8> %val1, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 XOR.
+define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vx %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = xor <8 x i16> %val1, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 XOR.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vx %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = xor <4 x i32> %val1, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 XOR.
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vx %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = xor <2 x i64> %val1, %val2
+  ret <2 x i64> %ret
+}
-- 
2.34.1