From: Tim Northover
Date: Thu, 1 Aug 2013 09:20:35 +0000 (+0000)
Subject: AArch64: add initial NEON support
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=87773c318fcee853fb34a80a10c4347d523bdafb;p=oota-llvm.git

AArch64: add initial NEON support

Patch by Ana Pazos.

- Completed implementation of instruction formats:
  AdvSIMD three same
  AdvSIMD modified immediate
  AdvSIMD scalar pairwise

- Completed implementation of instruction classes (some of the instructions
  in these classes belong to yet unfinished instruction formats):
  Vector Arithmetic
  Vector Immediate
  Vector Pairwise Arithmetic

- Initial implementation of instruction formats:
  AdvSIMD scalar two-reg misc
  AdvSIMD scalar three same

- Initial implementation of instruction class:
  Scalar Arithmetic

- Initial clang changes to support ARMv8 intrinsics.
  Note: no clang changes for scalar intrinsic function-name mangling yet.

- Comprehensive test cases for the added instructions, verifying auto
  codegen, encoding, decoding, diagnosis, and intrinsics.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187567 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index e1023826ba8..1a849c4c30c 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -494,6 +494,7 @@ def int_convertuu  : Intrinsic<[llvm_anyint_ty],
 include "llvm/IR/IntrinsicsPowerPC.td"
 include "llvm/IR/IntrinsicsX86.td"
 include "llvm/IR/IntrinsicsARM.td"
+include "llvm/IR/IntrinsicsAArch64.td"
 include "llvm/IR/IntrinsicsXCore.td"
 include "llvm/IR/IntrinsicsHexagon.td"
 include "llvm/IR/IntrinsicsNVVM.td"
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
new file mode 100644
index 00000000000..d7b1947db12
--- /dev/null
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -0,0 +1,41 @@
+//===- IntrinsicsAArch64.td - Defines AArch64 intrinsics ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the AArch64-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON)
+
+let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
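+
+// Intrinsics defined here appear in IR under the "llvm.aarch64." prefix, with
+// each underscore in the def name becoming a dot in the IR name. For instance,
+// the vacgeq definition below is reached from IR such as (illustrative,
+// hand-written):
+//   %mask = call <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double> %a,
+//                                                    <2 x double> %b)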
+
+// Vector Absolute Compare (Floating Point)
+def int_aarch64_neon_vacgeq : Intrinsic<[llvm_v2i64_ty],
+                                        [llvm_v2f64_ty, llvm_v2f64_ty],
+                                        [IntrNoMem]>;
+def int_aarch64_neon_vacgtq : Intrinsic<[llvm_v2i64_ty],
+                                        [llvm_v2f64_ty, llvm_v2f64_ty],
+                                        [IntrNoMem]>;
+
+// Vector maxNum (Floating Point)
+def int_aarch64_neon_vmaxnm : Neon_2Arg_Intrinsic;
+
+// Vector minNum (Floating Point)
+def int_aarch64_neon_vminnm : Neon_2Arg_Intrinsic;
+
+// Vector Pairwise maxNum (Floating Point)
+def int_aarch64_neon_vpmaxnm : Neon_2Arg_Intrinsic;
+
+// Vector Pairwise minNum (Floating Point)
+def int_aarch64_neon_vpminnm : Neon_2Arg_Intrinsic;
+
+// Vector Multiply Extended (Floating Point)
+def int_aarch64_neon_vmulx : Neon_2Arg_Intrinsic;
+}
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
index b880d8373de..bff7eebe00e 100644
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ b/lib/Target/AArch64/AArch64CallingConv.td
@@ -61,7 +61,7 @@ def CC_A64_APCS : CallingConv<[
   // Vectors and Floating-point types.
   CCIfType<[v2i8], CCBitConvertToType<i16>>,
   CCIfType<[v4i8, v2i16], CCBitConvertToType<i32>>,
-  CCIfType<[v8i8, v4i16, v2i32, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64], CCBitConvertToType<f64>>,
   CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
            CCBitConvertToType<f128>>,
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index d0abc0bbd11..44b691bfcce 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -42,6 +42,8 @@ static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
 AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
 
+  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+
   // SIMD compares set the entire lane's bits to 1
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
@@ -53,6 +55,21 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
   addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
 
+  if (Subtarget->hasNEON()) {
+    // And the vectors
+    addRegisterClass(MVT::v8i8, &AArch64::VPR64RegClass);
+    addRegisterClass(MVT::v4i16, &AArch64::VPR64RegClass);
+    addRegisterClass(MVT::v2i32, &AArch64::VPR64RegClass);
+    addRegisterClass(MVT::v1i64, &AArch64::VPR64RegClass);
+    addRegisterClass(MVT::v2f32, &AArch64::VPR64RegClass);
+    addRegisterClass(MVT::v16i8, &AArch64::VPR128RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::VPR128RegClass);
+    addRegisterClass(MVT::v4i32, &AArch64::VPR128RegClass);
+    addRegisterClass(MVT::v2i64, &AArch64::VPR128RegClass);
+    addRegisterClass(MVT::v4f32, &AArch64::VPR128RegClass);
+    addRegisterClass(MVT::v2f64, &AArch64::VPR128RegClass);
+  }
+
   computeRegisterProperties();
 
   // We combine OR nodes for bitfield and NEON BSL operations.
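   // (PerformORCombine below rewrites (or (and B, A), (and C, ~A)) with a
   // constant splat mask A into a single AArch64ISD::NEON_BSL node.)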
@@ -251,6 +268,31 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) setExceptionPointerRegister(AArch64::X0); setExceptionSelectorRegister(AArch64::X1); + + if (Subtarget->hasNEON()) { + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + + setOperationAction(ISD::SETCC, MVT::v8i8, Custom); + setOperationAction(ISD::SETCC, MVT::v16i8, Custom); + setOperationAction(ISD::SETCC, MVT::v4i16, Custom); + setOperationAction(ISD::SETCC, MVT::v8i16, Custom); + setOperationAction(ISD::SETCC, MVT::v2i32, Custom); + setOperationAction(ISD::SETCC, MVT::v4i32, Custom); + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + setOperationAction(ISD::SETCC, MVT::v2f32, Custom); + setOperationAction(ISD::SETCC, MVT::v4f32, Custom); + setOperationAction(ISD::SETCC, MVT::v2f64, Custom); + } } EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { @@ -777,7 +819,22 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall"; - default: return NULL; + case AArch64ISD::NEON_BSL: + return "AArch64ISD::NEON_BSL"; + case AArch64ISD::NEON_MOVIMM: + return "AArch64ISD::NEON_MOVIMM"; + case AArch64ISD::NEON_MVNIMM: + return "AArch64ISD::NEON_MVNIMM"; + case AArch64ISD::NEON_FMOVIMM: + return "AArch64ISD::NEON_FMOVIMM"; + case AArch64ISD::NEON_CMP: + return "AArch64ISD::NEON_CMP"; + case AArch64ISD::NEON_CMPZ: + return "AArch64ISD::NEON_CMPZ"; + case AArch64ISD::NEON_TST: + return "AArch64ISD::NEON_TST"; + default: + return NULL; } } @@ -2230,6 +2287,213 @@ AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(A64CC::NE, MVT::i32)); } +static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + EVT VT = Op.getValueType(); + bool Invert = false; + SDValue Op0, Op1; + unsigned Opcode; + + if (LHS.getValueType().isInteger()) { + + // Attempt to use Vector Integer Compare Mask Test instruction. + // TST = icmp ne (and (op0, op1), zero). + if (CC == ISD::SETNE) { + if (((LHS.getOpcode() == ISD::AND) && + ISD::isBuildVectorAllZeros(RHS.getNode())) || + ((RHS.getOpcode() == ISD::AND) && + ISD::isBuildVectorAllZeros(LHS.getNode()))) { + + SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS; + SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0)); + SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1)); + return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS); + } + } + + // Attempt to use Vector Integer Compare Mask against Zero instr (Signed). + // Note: Compare against Zero does not support unsigned predicates. 
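+    // For example, (setcc (v4i32 A), (splat 0), setgt) becomes
+    // (NEON_CMPZ A, 0, SETGT), which is later selected to
+    // "cmgt Vd.4s, Vn.4s, #0" (illustrative).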
+ if ((ISD::isBuildVectorAllZeros(RHS.getNode()) || + ISD::isBuildVectorAllZeros(LHS.getNode())) && + !isUnsignedIntSetCC(CC)) { + + // If LHS is the zero value, swap operands and CondCode. + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + CC = getSetCCSwappedOperands(CC); + Op0 = RHS; + } else + Op0 = LHS; + + // Ensure valid CondCode for Compare Mask against Zero instruction: + // EQ, GE, GT, LE, LT. + if (ISD::SETNE == CC) { + Invert = true; + CC = ISD::SETEQ; + } + + // Using constant type to differentiate integer and FP compares with zero. + Op1 = DAG.getConstant(0, MVT::i32); + Opcode = AArch64ISD::NEON_CMPZ; + + } else { + // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned). + // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT. + bool Swap = false; + switch (CC) { + default: + llvm_unreachable("Illegal integer comparison."); + case ISD::SETEQ: + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + break; + case ISD::SETNE: + Invert = true; + CC = ISD::SETEQ; + break; + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + Swap = true; + CC = getSetCCSwappedOperands(CC); + } + + if (Swap) + std::swap(LHS, RHS); + + Opcode = AArch64ISD::NEON_CMP; + Op0 = LHS; + Op1 = RHS; + } + + // Generate Compare Mask instr or Compare Mask against Zero instr. + SDValue NeonCmp = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + + if (Invert) + NeonCmp = DAG.getNOT(DL, NeonCmp, VT); + + return NeonCmp; + } + + // Now handle Floating Point cases. + // Attempt to use Vector Floating Point Compare Mask against Zero instruction. + if (ISD::isBuildVectorAllZeros(RHS.getNode()) || + ISD::isBuildVectorAllZeros(LHS.getNode())) { + + // If LHS is the zero value, swap operands and CondCode. + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + CC = getSetCCSwappedOperands(CC); + Op0 = RHS; + } else + Op0 = LHS; + + // Using constant type to differentiate integer and FP compares with zero. + Op1 = DAG.getConstantFP(0, MVT::f32); + Opcode = AArch64ISD::NEON_CMPZ; + } else { + // Attempt to use Vector Floating Point Compare Mask instruction. + Op0 = LHS; + Op1 = RHS; + Opcode = AArch64ISD::NEON_CMP; + } + + SDValue NeonCmpAlt; + // Some register compares have to be implemented with swapped CC and operands, + // e.g.: OLT implemented as OGT with swapped operands. + bool SwapIfRegArgs = false; + + // Ensure valid CondCode for FP Compare Mask against Zero instruction: + // EQ, GE, GT, LE, LT. + // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT. + switch (CC) { + default: + llvm_unreachable("Illegal FP comparison"); + case ISD::SETUNE: + case ISD::SETNE: + Invert = true; // Fallthrough + case ISD::SETOEQ: + case ISD::SETEQ: + CC = ISD::SETEQ; + break; + case ISD::SETOLT: + case ISD::SETLT: + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETOGT: + case ISD::SETGT: + CC = ISD::SETGT; + break; + case ISD::SETOLE: + case ISD::SETLE: + CC = ISD::SETLE; + SwapIfRegArgs = true; + break; + case ISD::SETOGE: + case ISD::SETGE: + CC = ISD::SETGE; + break; + case ISD::SETUGE: + Invert = true; + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETULE: + Invert = true; + CC = ISD::SETGT; + break; + case ISD::SETUGT: + Invert = true; + CC = ISD::SETLE; + SwapIfRegArgs = true; + break; + case ISD::SETULT: + Invert = true; + CC = ISD::SETGE; + break; + case ISD::SETUEQ: + Invert = true; // Fallthrough + case ISD::SETONE: + // Expand this to (OGT |OLT). 
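+    // i.e. "a one b" == (a ogt b) || (a olt b); the SETUEQ case computes the
+    // same mask and then inverts it, since ueq == !one.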
+ NeonCmpAlt = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT)); + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETUO: + Invert = true; // Fallthrough + case ISD::SETO: + // Expand this to (OGE | OLT). + NeonCmpAlt = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE)); + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + } + + if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) { + CC = getSetCCSwappedOperands(CC); + std::swap(Op0, Op1); + } + + // Generate FP Compare Mask instr or FP Compare Mask against Zero instr + SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + + if (NeonCmpAlt.getNode()) + NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt); + + if (Invert) + NeonCmp = DAG.getNOT(DL, NeonCmp, VT); + + return NeonCmp; +} + // (SETCC lhs, rhs, condcode) SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { @@ -2239,6 +2503,9 @@ AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); EVT VT = Op.getValueType(); + if (VT.isVector()) + return LowerVectorSETCC(Op, DAG); + if (LHS.getValueType() == MVT::f128) { // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS // for the rest of the function (some i32 or i64 values). @@ -2395,11 +2662,155 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG, getSubtarget()); } return SDValue(); } +/// Check if the specified splat value corresponds to a valid vector constant +/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If +/// so, return the encoded 8-bit immediate and the OpCmode instruction fields +/// values. +static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG, + bool is128Bits, NeonModImmType type, EVT &VT, + unsigned &Imm, unsigned &OpCmode) { + switch (SplatBitSize) { + default: + llvm_unreachable("unexpected size for isNeonModifiedImm"); + case 8: { + if (type != Neon_Mov_Imm) + return false; + assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); + // Neon movi per byte: Op=0, Cmode=1110. + OpCmode = 0xe; + Imm = SplatBits; + VT = is128Bits ? MVT::v16i8 : MVT::v8i8; + break; + } + case 16: { + // Neon move inst per halfword + VT = is128Bits ? MVT::v8i16 : MVT::v4i16; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x00nn is 0x00nn LSL 0 + // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000 + // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001 + // Op=x, Cmode=100y + Imm = SplatBits; + OpCmode = 0x8; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0xnn00 is 0x00nn LSL 8 + // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010 + // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011 + // Op=x, Cmode=101x + Imm = SplatBits >> 8; + OpCmode = 0xa; + break; + } + // can't handle any other + return false; + } + + case 32: { + // First the LSL variants (MSL is unusable by some interested instructions). + + // Neon move instr per word, shift zeros + VT = is128Bits ? 
MVT::v4i32 : MVT::v2i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn is 0x000000nn LSL 0 + // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000 + // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001 + // Op=x, Cmode=000x + Imm = SplatBits; + OpCmode = 0; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00 is 0x000000nn LSL 8 + // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010 + // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011 + // Op=x, Cmode=001x + Imm = SplatBits >> 8; + OpCmode = 0x2; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000 is 0x000000nn LSL 16 + // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100 + // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101 + // Op=x, Cmode=010x + Imm = SplatBits >> 16; + OpCmode = 0x4; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000 is 0x000000nn LSL 24 + // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110 + // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111 + // Op=x, Cmode=011x + Imm = SplatBits >> 24; + OpCmode = 0x6; + break; + } + + // Now the MSL immediates. + + // Neon move instr per word, shift ones + if ((SplatBits & ~0xffff) == 0 && + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff is 0x000000nn MSL 8 + // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100 + // Op=x, Cmode=1100 + Imm = SplatBits >> 8; + OpCmode = 0xc; + break; + } + if ((SplatBits & ~0xffffff) == 0 && + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff is 0x000000nn MSL 16 + // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101 + // Op=x, Cmode=1101 + Imm = SplatBits >> 16; + OpCmode = 0xd; + break; + } + // can't handle any other + return false; + } + + case 64: { + if (type != Neon_Mov_Imm) + return false; + // Neon move instr bytemask, where each byte is either 0x00 or 0xff. + // movi Op=1, Cmode=1110. + OpCmode = 0x1e; + uint64_t BitMask = 0xff; + uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; + for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { + Val |= BitMask; + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { + return false; + } + BitMask <<= 8; + ImmMask <<= 1; + } + SplatBits = Val; + VT = is128Bits ? MVT::v2i64 : MVT::v1i64; + break; + } + } + + return true; +} + static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { @@ -2725,6 +3136,7 @@ static SDValue PerformORCombine(SDNode *N, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); EVT VT = N->getValueType(0); if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -2745,6 +3157,44 @@ static SDValue PerformORCombine(SDNode *N, if (Res.getNode()) return Res; + if (!Subtarget->hasNEON()) + return SDValue(); + + // Attempt to use vector immediate-form BSL + // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 
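+  // Per bit: Result = (A & B) | (~A & C), so set bits of the constant mask A
+  // pick bits from B and clear bits pick bits from C.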
+ + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + APInt SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(1)); + APInt SplatBits0; + if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, + HasAnyUndefs) && + !HasAnyUndefs) { + BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(1)); + APInt SplatBits1; + if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, + HasAnyUndefs) && + !HasAnyUndefs && SplatBits0 == ~SplatBits1) { + // Canonicalize the vector type to make instruction selection simpler. + EVT CanonicalVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8; + SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT, + N0->getOperand(1), N0->getOperand(0), + N1->getOperand(0)); + return DAG.getNode(ISD::BITCAST, DL, VT, Result); + } + } + } + return SDValue(); } @@ -2819,6 +3269,76 @@ AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; } +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. +SDValue +AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const AArch64Subtarget *ST) const { + + BuildVectorSDNode *BVN = cast(Op.getNode()); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + + // Note we favor lowering MOVI over MVNI. + // This has implications on the definition of patterns in TableGen to select + // BIC immediate instructions but not ORR immediate instructions. + // If this lowering order is changed, TableGen patterns for BIC immediate and + // ORR immediate instructions have to be updated. + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + // First attempt to use vector immediate-form MOVI + EVT NeonMovVT; + unsigned Imm = 0; + unsigned OpCmode = 0; + + if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, VT.is128BitVector(), + Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) { + SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); + SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); + + if (ImmVal.getNode() && OpCmodeVal.getNode()) { + SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT, + ImmVal, OpCmodeVal); + return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); + } + } + + // Then attempt to use vector immediate-form MVNI + uint64_t NegatedImm = (~SplatBits).getZExtValue(); + if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, + DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT, + Imm, OpCmode)) { + SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); + SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); + if (ImmVal.getNode() && OpCmodeVal.getNode()) { + SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT, + ImmVal, OpCmodeVal); + return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); + } + } + + // Attempt to use vector immediate-form FMOV + if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) || + (VT == MVT::v2f64 && SplatBitSize == 64)) { + APFloat RealVal( + SplatBitSize == 32 ? 
APFloat::IEEEsingle : APFloat::IEEEdouble, + SplatBits); + uint32_t ImmVal; + if (A64Imms::isFPImm(RealVal, ImmVal)) { + SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); + return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val); + } + } + } + } + return SDValue(); +} + AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { if (Constraint.size() == 1) { diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 320346e60b7..67a908e24ef 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -111,7 +111,28 @@ namespace AArch64ISD { // created using the small memory model style: i.e. adrp/add or // adrp/mem-op. This exists to prevent bare TargetAddresses which may never // get selected. - WrapperSmall + WrapperSmall, + + // Vector bitwise select + NEON_BSL, + + // Vector move immediate + NEON_MOVIMM, + + // Vector Move Inverted Immediate + NEON_MVNIMM, + + // Vector FP move immediate + NEON_FMOVIMM, + + // Vector compare + NEON_CMP, + + // Vector compare zero + NEON_CMPZ, + + // Vector compare bitwise test + NEON_TST }; } @@ -148,9 +169,11 @@ public: SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const; - void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, - SDLoc DL, SDValue &Chain) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const AArch64Subtarget *ST) const; + void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const; /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call @@ -253,6 +276,10 @@ private: return &getTargetMachine().getSubtarget(); } }; +enum NeonModImmType { + Neon_Mov_Imm, + Neon_Mvn_Imm +}; } // namespace llvm #endif // LLVM_TARGET_AARCH64_ISELLOWERING_H diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 9dd122f1494..09451fdc45d 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -959,3 +959,96 @@ class A64I_Breg opc, bits<5> op2, bits<6> op3, bits<5> op4, let Inst{4-0} = op4; } + +//===----------------------------------------------------------------------===// +// +// Neon Instruction Format Definitions. 
+//
+
+let Predicates = [HasNEON] in {
+
+class NeonInstAlias<string Asm, dag Result, bit Emit = 0b1>
+  : InstAlias<Asm, Result, Emit> {
+}
+
+// Format AdvSIMD 3 vector registers with same vector type
+class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-11} = opcode;
+  let Inst{10} = 0b1;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 1 vector register with modified immediate
+class NeonI_1VModImm<bit q, bit op,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64InstRd<outs, ins, asmstr, patterns, itin>
+{
+  bits<8> Imm;
+  bits<4> cmode;
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = op;
+  let Inst{28-19} = 0b0111100000;
+  let Inst{15-12} = cmode;
+  let Inst{11} = 0b0; // o2
+  let Inst{10} = 1;
+  // Inherit Rd in 4-0
+  let Inst{18-16} = Imm{7-5}; // imm a:b:c
+  let Inst{9-5} = Imm{4-0};   // imm d:e:f:g:h
+}
+
+// Format AdvSIMD 3 scalar registers with same type
+
+class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
+                        dag outs, dag ins, string asmstr,
+                        list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-11} = opcode;
+  let Inst{10} = 0b1;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+
+// Format AdvSIMD 2 vector registers miscellaneous
+class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+}
+
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 725a12164be..07289b0be14 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -11,6 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+                       AssemblerPredicate<"FeatureNEON", "neon">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                       AssemblerPredicate<"FeatureCrypto", "crypto">;
+
+// Use fused MAC if more precision in FP computation is allowed.
+def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast)">;
+
 include "AArch64InstrFormats.td"
 
 //===----------------------------------------------------------------------===//
@@ -2173,6 +2184,29 @@
 def FMSUBdddd : A64I_fpdp3Impl<"fmsub", FPR64, f64, 0b01, 0b0, 0b1, fmsub>;
 def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>;
 def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
 
+// Extra patterns for when we're allowed to optimise separate multiplication
+// and addition.
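+// Contraction applies when fast FP-op fusion is allowed (e.g. clang's
+// -ffp-contract=fast), so that, illustratively,
+//   float f(float a, float b, float c) { return a + b * c; }
+// compiles to a single fmadd instead of separate fmul and fadd.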
+let Predicates = [UseFusedMAC] in { +def : Pat<(fadd FPR32:$Ra, (fmul FPR32:$Rn, FPR32:$Rm)), + (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(fsub FPR32:$Ra, (fmul FPR32:$Rn, FPR32:$Rm)), + (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(fsub (fmul FPR32:$Rn, FPR32:$Rm), FPR32:$Ra), + (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(fsub (fneg FPR32:$Ra), (fmul FPR32:$Rn, FPR32:$Rm)), + (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(fadd FPR64:$Ra, (fmul FPR64:$Rn, FPR64:$Rm)), + (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(fsub FPR64:$Ra, (fmul FPR64:$Rn, FPR64:$Rm)), + (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(fsub (fmul FPR64:$Rn, FPR64:$Rm), FPR64:$Ra), + (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(fsub (fneg FPR64:$Ra), (fmul FPR64:$Rn, FPR64:$Rm)), + (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +} + + //===----------------------------------------------------------------------===// // Floating-point <-> fixed-point conversion instructions //===----------------------------------------------------------------------===// @@ -5123,3 +5157,9 @@ defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm), defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)), (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "AArch64InstrNEON.td" \ No newline at end of file diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td new file mode 100644 index 00000000000..98b9e3e1158 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -0,0 +1,1634 @@ +//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the AArch64 NEON instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// NEON-specific DAG Nodes. 
+//===----------------------------------------------------------------------===// +def Neon_bsl : SDNode<"AArch64ISD::NEON_BSL", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>>; + +// (outs Result), (ins Imm, OpCmode) +def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>; + +def Neon_movi : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>; + +def Neon_mvni : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>; + +// (outs Result), (ins Imm) +def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1, + [SDTCisVec<0>, SDTCisVT<1, i32>]>>; + +// (outs Result), (ins LHS, RHS, CondCode) +def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; + +// (outs Result), (ins LHS, 0/0.0 constant, CondCode) +def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisVec<1>]>>; + +// (outs Result), (ins LHS, RHS) +def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; + +//===----------------------------------------------------------------------===// +// Multiclasses +//===----------------------------------------------------------------------===// + +multiclass NeonI_3VSame_B_sizes size, bits<5> opcode, + string asmop, SDPatternOperator opnode8B, + SDPatternOperator opnode16B, + bit Commutable = 0> +{ + let isCommutable = Commutable in { + def _8B : NeonI_3VSame<0b0, u, size, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], + NoItinerary>; + + def _16B : NeonI_3VSame<0b1, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], + NoItinerary>; + } + +} + +multiclass NeonI_3VSame_HS_sizes opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> +{ + let isCommutable = Commutable in { + def _4H : NeonI_3VSame<0b0, u, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h", + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))], + NoItinerary>; + + def _8H : NeonI_3VSame<0b1, u, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h", + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))], + NoItinerary>; + + def _2S : NeonI_3VSame<0b0, u, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))], + NoItinerary>; + + def _4S : NeonI_3VSame<0b1, u, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))], + NoItinerary>; + } +} +multiclass NeonI_3VSame_BHS_sizes opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> + : NeonI_3VSame_HS_sizes +{ + let isCommutable = Commutable in { + def _8B : NeonI_3VSame<0b0, u, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], + NoItinerary>; + + def _16B : NeonI_3VSame<0b1, u, 0b00, opcode, + (outs VPR128:$Rd), (ins 
VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], + NoItinerary>; + } +} + +multiclass NeonI_3VSame_BHSD_sizes opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> + : NeonI_3VSame_BHS_sizes +{ + let isCommutable = Commutable in { + def _2D : NeonI_3VSame<0b1, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", + [(set (v2i64 VPR128:$Rd), + (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))], + NoItinerary>; + } +} + +// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types, +// but Result types can be integer or floating point types. +multiclass NeonI_3VSame_SD_sizes opcode, + string asmop, SDPatternOperator opnode2S, + SDPatternOperator opnode4S, + SDPatternOperator opnode2D, + ValueType ResTy2S, ValueType ResTy4S, + ValueType ResTy2D, bit Commutable = 0> +{ + let isCommutable = Commutable in { + def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", + [(set (ResTy2S VPR64:$Rd), + (ResTy2S (opnode2S (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))], + NoItinerary>; + + def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", + [(set (ResTy4S VPR128:$Rd), + (ResTy4S (opnode4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))], + NoItinerary>; + + def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", + [(set (ResTy2D VPR128:$Rd), + (ResTy2D (opnode2D (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))], + NoItinerary>; + } +} + +//===----------------------------------------------------------------------===// +// Instruction Definitions +//===----------------------------------------------------------------------===// + +// Vector Arithmetic Instructions + +// Vector Add (Integer and Floating-Point) + +defm ADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>; +defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd, fadd, fadd, + v2f32, v4f32, v2f64, 1>; + +// Vector Sub (Integer and Floating-Point) + +defm SUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>; +defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub, fsub, fsub, + v2f32, v4f32, v2f64, 0>; + +// Vector Multiply (Integer and Floating-Point) + +defm MULvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>; +defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul, fmul, fmul, + v2f32, v4f32, v2f64, 1>; + +// Vector Multiply (Polynomial) + +defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul", + int_arm_neon_vmulp, int_arm_neon_vmulp, 1>; + +// Vector Multiply-accumulate and Multiply-subtract (Integer) + +// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and +// two operands constraints. 
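+// The "$src = $Rd" constraint below ties the accumulator operand to the
+// destination register, matching the read-modify-write behaviour of
+// accumulating instructions such as MLA (Rd += Rn * Rm).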
+class NeonI_3VSame_Constraint_impl size, bits<5> opcode, + SDPatternOperator opnode> + : NeonI_3VSame { + let Constraints = "$src = $Rd"; +} + +def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (mul node:$Rn, node:$Rm))>; + +def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (sub node:$Ra, (mul node:$Rn, node:$Rm))>; + + +def MLAvvv_8B: NeonI_3VSame_Constraint_impl<"mla", ".8b", VPR64, v8i8, + 0b0, 0b0, 0b00, 0b10010, Neon_mla>; +def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8, + 0b1, 0b0, 0b00, 0b10010, Neon_mla>; +def MLAvvv_4H: NeonI_3VSame_Constraint_impl<"mla", ".4h", VPR64, v4i16, + 0b0, 0b0, 0b01, 0b10010, Neon_mla>; +def MLAvvv_8H: NeonI_3VSame_Constraint_impl<"mla", ".8h", VPR128, v8i16, + 0b1, 0b0, 0b01, 0b10010, Neon_mla>; +def MLAvvv_2S: NeonI_3VSame_Constraint_impl<"mla", ".2s", VPR64, v2i32, + 0b0, 0b0, 0b10, 0b10010, Neon_mla>; +def MLAvvv_4S: NeonI_3VSame_Constraint_impl<"mla", ".4s", VPR128, v4i32, + 0b1, 0b0, 0b10, 0b10010, Neon_mla>; + +def MLSvvv_8B: NeonI_3VSame_Constraint_impl<"mls", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b00, 0b10010, Neon_mls>; +def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b00, 0b10010, Neon_mls>; +def MLSvvv_4H: NeonI_3VSame_Constraint_impl<"mls", ".4h", VPR64, v4i16, + 0b0, 0b1, 0b01, 0b10010, Neon_mls>; +def MLSvvv_8H: NeonI_3VSame_Constraint_impl<"mls", ".8h", VPR128, v8i16, + 0b1, 0b1, 0b01, 0b10010, Neon_mls>; +def MLSvvv_2S: NeonI_3VSame_Constraint_impl<"mls", ".2s", VPR64, v2i32, + 0b0, 0b1, 0b10, 0b10010, Neon_mls>; +def MLSvvv_4S: NeonI_3VSame_Constraint_impl<"mls", ".4s", VPR128, v4i32, + 0b1, 0b1, 0b10, 0b10010, Neon_mls>; + +// Vector Multiply-accumulate and Multiply-subtract (Floating Point) + +def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (fadd node:$Ra, (fmul node:$Rn, node:$Rm))>; + +def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (fsub node:$Ra, (fmul node:$Rn, node:$Rm))>; + +let Predicates = [HasNEON, UseFusedMAC] in { +def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s", VPR64, v2f32, + 0b0, 0b0, 0b00, 0b11001, Neon_fmla>; +def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s", VPR128, v4f32, + 0b1, 0b0, 0b00, 0b11001, Neon_fmla>; +def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d", VPR128, v2f64, + 0b1, 0b0, 0b01, 0b11001, Neon_fmla>; + +def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s", VPR64, v2f32, + 0b0, 0b0, 0b10, 0b11001, Neon_fmls>; +def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s", VPR128, v4f32, + 0b1, 0b0, 0b10, 0b11001, Neon_fmls>; +def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d", VPR128, v2f64, + 0b1, 0b0, 0b11, 0b11001, Neon_fmls>; +} + +// We're also allowed to match the fma instruction regardless of compile +// options. 
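+// (An explicit fma node comes from the @llvm.fma.* intrinsic, which always
+// requests a fused operation, so these patterns need no UseFusedMAC guard.)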
+def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)), + (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; +def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), + (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; +def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), + (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; + +def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)), + (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; +def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), + (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; +def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), + (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; + +// Vector Divide (Floating-Point) + +defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv, fdiv, fdiv, + v2f32, v4f32, v2f64, 0>; + +// Vector Bitwise Operations + +// Vector Bitwise AND + +defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>; + +// Vector Bitwise Exclusive OR + +defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>; + +// Vector Bitwise OR + +defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>; + +// ORR disassembled as MOV if Vn==Vm + +// Vector Move - register +// Alias for ORR if Vn=Vm and it is the preferred syntax +def : NeonInstAlias<"mov $Rd.8b, $Rn.8b", + (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn)>; +def : NeonInstAlias<"mov $Rd.16b, $Rn.16b", + (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn)>; + +def Neon_immAllOnes: PatLeaf<(Neon_movi (i32 timm), (i32 imm)), [{ + ConstantSDNode *ImmConstVal = cast(N->getOperand(0)); + ConstantSDNode *OpCmodeConstVal = cast(N->getOperand(1)); + unsigned EltBits; + uint64_t EltVal = A64Imms::decodeNeonModImm(ImmConstVal->getZExtValue(), + OpCmodeConstVal->getZExtValue(), EltBits); + return (EltBits == 8 && EltVal == 0xff); +}]>; + + +def Neon_not8B : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 Neon_immAllOnes)))>; +def Neon_not16B : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 Neon_immAllOnes)))>; + +def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm), + (or node:$Rn, (Neon_not8B node:$Rm))>; + +def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm), + (or node:$Rn, (Neon_not16B node:$Rm))>; + +def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm), + (and node:$Rn, (Neon_not8B node:$Rm))>; + +def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm), + (and node:$Rn, (Neon_not16B node:$Rm))>; + + +// Vector Bitwise OR NOT - register + +defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn", + Neon_orn8B, Neon_orn16B, 0>; + +// Vector Bitwise Bit Clear (AND NOT) - register + +defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic", + Neon_bic8B, Neon_bic16B, 0>; + +multiclass Neon_bitwise2V_patterns { + def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; +} + +// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN +defm : Neon_bitwise2V_patterns; +defm : Neon_bitwise2V_patterns; +defm : Neon_bitwise2V_patterns; +defm : Neon_bitwise2V_patterns; +defm : 
Neon_bitwise2V_patterns; + +// Vector Bitwise Select +def BSLvvv_8B : NeonI_3VSame_Constraint_impl<"bsl", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b01, 0b00011, Neon_bsl>; + +def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b01, 0b00011, Neon_bsl>; + +multiclass Neon_bitwise3V_patterns { + // Disassociate type from instruction definition + def : Pat<(v2i32 (opnode VPR64:$src,VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i32 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + + // Allow to match BSL instruction pattern with non-constant operand + def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + + // Allow to match llvm.arm.* intrinsics. 
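+  // (The bitwise-select semantics are identical on ARM and AArch64, so the
+  // existing int_arm_neon_vbsl definition is reused rather than duplicated.)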
+ def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src), + (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src), + (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src), + (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src), + (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src), + (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src), + (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src), + (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src), + (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src), + (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src), + (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src), + (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; +} + +// Additional patterns for bitwise instruction BSL +defm: Neon_bitwise3V_patterns; + +def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm), + (Neon_bsl node:$src, node:$Rn, node:$Rm), + [{ (void)N; return false; }]>; + +// Vector Bitwise Insert if True + +def BITvvv_8B : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>; +def BITvvv_16B : NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>; + +// Vector Bitwise Insert if False + +def BIFvvv_8B : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>; +def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>; + +// Vector Absolute Difference and Accumulate (Signed, Unsigned) + +def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>; +def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>; + +// Vector Absolute Difference and Accumulate (Unsigned) +def UABAvvv_8B : NeonI_3VSame_Constraint_impl<"uaba", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b00, 0b01111, Neon_uaba>; +def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b00, 0b01111, Neon_uaba>; +def UABAvvv_4H : NeonI_3VSame_Constraint_impl<"uaba", ".4h", VPR64, v4i16, + 0b0, 0b1, 0b01, 0b01111, Neon_uaba>; +def UABAvvv_8H : NeonI_3VSame_Constraint_impl<"uaba", ".8h", VPR128, v8i16, + 0b1, 0b1, 0b01, 0b01111, Neon_uaba>; +def UABAvvv_2S : NeonI_3VSame_Constraint_impl<"uaba", ".2s", VPR64, v2i32, + 0b0, 0b1, 0b10, 0b01111, Neon_uaba>; +def UABAvvv_4S : NeonI_3VSame_Constraint_impl<"uaba", ".4s", VPR128, v4i32, + 0b1, 0b1, 0b10, 0b01111, Neon_uaba>; + +// Vector Absolute Difference and Accumulate (Signed) +def 
SABAvvv_8B : NeonI_3VSame_Constraint_impl<"saba", ".8b", VPR64, v8i8, + 0b0, 0b0, 0b00, 0b01111, Neon_saba>; +def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8, + 0b1, 0b0, 0b00, 0b01111, Neon_saba>; +def SABAvvv_4H : NeonI_3VSame_Constraint_impl<"saba", ".4h", VPR64, v4i16, + 0b0, 0b0, 0b01, 0b01111, Neon_saba>; +def SABAvvv_8H : NeonI_3VSame_Constraint_impl<"saba", ".8h", VPR128, v8i16, + 0b1, 0b0, 0b01, 0b01111, Neon_saba>; +def SABAvvv_2S : NeonI_3VSame_Constraint_impl<"saba", ".2s", VPR64, v2i32, + 0b0, 0b0, 0b10, 0b01111, Neon_saba>; +def SABAvvv_4S : NeonI_3VSame_Constraint_impl<"saba", ".4s", VPR128, v4i32, + 0b1, 0b0, 0b10, 0b01111, Neon_saba>; + + +// Vector Absolute Difference (Signed, Unsigned) +defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>; +defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>; + +// Vector Absolute Difference (Floating Point) +defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd", + int_arm_neon_vabds, int_arm_neon_vabds, + int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>; + +// Vector Reciprocal Step (Floating Point) +defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps", + int_arm_neon_vrecps, int_arm_neon_vrecps, + int_arm_neon_vrecps, + v2f32, v4f32, v2f64, 0>; + +// Vector Reciprocal Square Root Step (Floating Point) +defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", + int_arm_neon_vrsqrts, + int_arm_neon_vrsqrts, + int_arm_neon_vrsqrts, + v2f32, v4f32, v2f64, 0>; + +// Vector Comparisons + +def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETEQ)>; +def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETUGE)>; +def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETGE)>; +def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETUGT)>; +def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETGT)>; + +// NeonI_compare_aliases class: swaps register operands to implement +// comparison aliases, e.g., CMLE is alias for CMGE with operands reversed. +class NeonI_compare_aliases + : NeonInstAlias; + +// Vector Comparisons (Integer) + +// Vector Compare Mask Equal (Integer) +let isCommutable =1 in { +defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>; +} + +// Vector Compare Mask Higher or Same (Unsigned Integer) +defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>; + +// Vector Compare Mask Greater Than or Equal (Integer) +defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>; + +// Vector Compare Mask Higher (Unsigned Integer) +defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>; + +// Vector Compare Mask Greater Than (Integer) +defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>; + +// Vector Compare Mask Bitwise Test (Integer) +defm CMTSTvvv: NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>; + +// Vector Compare Mask Less or Same (Unsigned Integer) +// CMLS is alias for CMHS with operands reversed. 
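+// E.g. "cmls v0.8b, v1.8b, v2.8b" is encoded as "cmhs v0.8b, v2.8b, v1.8b",
+// since a <= b exactly when b >= a.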
+def CMLSvvv_8B : NeonI_compare_aliases<"cmls", ".8b", CMHSvvv_8B, VPR64>; +def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>; +def CMLSvvv_4H : NeonI_compare_aliases<"cmls", ".4h", CMHSvvv_4H, VPR64>; +def CMLSvvv_8H : NeonI_compare_aliases<"cmls", ".8h", CMHSvvv_8H, VPR128>; +def CMLSvvv_2S : NeonI_compare_aliases<"cmls", ".2s", CMHSvvv_2S, VPR64>; +def CMLSvvv_4S : NeonI_compare_aliases<"cmls", ".4s", CMHSvvv_4S, VPR128>; +def CMLSvvv_2D : NeonI_compare_aliases<"cmls", ".2d", CMHSvvv_2D, VPR128>; + +// Vector Compare Mask Less Than or Equal (Integer) +// CMLE is alias for CMGE with operands reversed. +def CMLEvvv_8B : NeonI_compare_aliases<"cmle", ".8b", CMGEvvv_8B, VPR64>; +def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>; +def CMLEvvv_4H : NeonI_compare_aliases<"cmle", ".4h", CMGEvvv_4H, VPR64>; +def CMLEvvv_8H : NeonI_compare_aliases<"cmle", ".8h", CMGEvvv_8H, VPR128>; +def CMLEvvv_2S : NeonI_compare_aliases<"cmle", ".2s", CMGEvvv_2S, VPR64>; +def CMLEvvv_4S : NeonI_compare_aliases<"cmle", ".4s", CMGEvvv_4S, VPR128>; +def CMLEvvv_2D : NeonI_compare_aliases<"cmle", ".2d", CMGEvvv_2D, VPR128>; + +// Vector Compare Mask Lower (Unsigned Integer) +// CMLO is alias for CMHI with operands reversed. +def CMLOvvv_8B : NeonI_compare_aliases<"cmlo", ".8b", CMHIvvv_8B, VPR64>; +def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>; +def CMLOvvv_4H : NeonI_compare_aliases<"cmlo", ".4h", CMHIvvv_4H, VPR64>; +def CMLOvvv_8H : NeonI_compare_aliases<"cmlo", ".8h", CMHIvvv_8H, VPR128>; +def CMLOvvv_2S : NeonI_compare_aliases<"cmlo", ".2s", CMHIvvv_2S, VPR64>; +def CMLOvvv_4S : NeonI_compare_aliases<"cmlo", ".4s", CMHIvvv_4S, VPR128>; +def CMLOvvv_2D : NeonI_compare_aliases<"cmlo", ".2d", CMHIvvv_2D, VPR128>; + +// Vector Compare Mask Less Than (Integer) +// CMLT is alias for CMGT with operands reversed. 
+def CMLTvvv_8B : NeonI_compare_aliases<"cmlt", ".8b", CMGTvvv_8B, VPR64>; +def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>; +def CMLTvvv_4H : NeonI_compare_aliases<"cmlt", ".4h", CMGTvvv_4H, VPR64>; +def CMLTvvv_8H : NeonI_compare_aliases<"cmlt", ".8h", CMGTvvv_8H, VPR128>; +def CMLTvvv_2S : NeonI_compare_aliases<"cmlt", ".2s", CMGTvvv_2S, VPR64>; +def CMLTvvv_4S : NeonI_compare_aliases<"cmlt", ".4s", CMGTvvv_4S, VPR128>; +def CMLTvvv_2D : NeonI_compare_aliases<"cmlt", ".2d", CMGTvvv_2D, VPR128>; + + +def neon_uimm0_asmoperand : AsmOperandClass +{ + let Name = "UImm0"; + let PredicateMethod = "isUImm<0>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm0 : Operand, ImmLeaf { + let ParserMatchClass = neon_uimm0_asmoperand; + let PrintMethod = "printNeonUImm0Operand"; + +} + +multiclass NeonI_cmpz_sizes opcode, string asmop, CondCode CC> +{ + def _8B : NeonI_2VMisc<0b0, u, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.8b, $Rn.8b, $Imm", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.16b, $Rn.16b, $Imm", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.4h, $Rn.4h, $Imm", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.8h, $Rn.8h, $Imm", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.2s, $Rn.2s, $Imm", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.4s, $Rn.4s, $Imm", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.2d, $Rn.2d, $Imm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; +} + +// Vector Compare Mask Equal to Zero (Integer) +defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>; + +// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer) +defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>; + +// Vector Compare Mask Greater Than Zero (Signed Integer) +defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>; + +// Vector Compare Mask Less Than or Equal To Zero (Signed Integer) +defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>; + +// Vector Compare Mask Less Than Zero (Signed Integer) +defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>; + +// Vector Comparisons (Floating Point) + +// Vector Compare Mask Equal (Floating Point) +let isCommutable =1 in { +defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq, + Neon_cmeq, Neon_cmeq, + v2i32, v4i32, v2i64, 0>; +} + +// Vector Compare Mask Greater Than Or Equal 
+defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge, + Neon_cmge, Neon_cmge, + v2i32, v4i32, v2i64, 0>; + +// Vector Compare Mask Greater Than (Floating Point) +defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt, + Neon_cmgt, Neon_cmgt, + v2i32, v4i32, v2i64, 0>; + +// Vector Compare Mask Less Than Or Equal (Floating Point) +// FCMLE is an alias for FCMGE with operands reversed. +def FCMLEvvv_2S : NeonI_compare_aliases<"fcmle", ".2s", FCMGEvvv_2S, VPR64>; +def FCMLEvvv_4S : NeonI_compare_aliases<"fcmle", ".4s", FCMGEvvv_4S, VPR128>; +def FCMLEvvv_2D : NeonI_compare_aliases<"fcmle", ".2d", FCMGEvvv_2D, VPR128>; + +// Vector Compare Mask Less Than (Floating Point) +// FCMLT is an alias for FCMGT with operands reversed. +def FCMLTvvv_2S : NeonI_compare_aliases<"fcmlt", ".2s", FCMGTvvv_2S, VPR64>; +def FCMLTvvv_4S : NeonI_compare_aliases<"fcmlt", ".4s", FCMGTvvv_4S, VPR128>; +def FCMLTvvv_2D : NeonI_compare_aliases<"fcmlt", ".2d", FCMGTvvv_2D, VPR128>; + + +multiclass NeonI_fpcmpz_sizes<bit u, bit size, bits<5> opcode, + string asmop, CondCode CC> +{ + def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.2s, $Rn.2s, $FPImm", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; + + def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.4s, $Rn.4s, $FPImm", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; + + def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.2d, $Rn.2d, $FPImm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; +} + +// Vector Compare Mask Equal to Zero (Floating Point) +defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>; + +// Vector Compare Mask Greater Than or Equal to Zero (Floating Point) +defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>; + +// Vector Compare Mask Greater Than Zero (Floating Point) +defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>; + +// Vector Compare Mask Less Than or Equal To Zero (Floating Point) +defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>; + +// Vector Compare Mask Less Than Zero (Floating Point) +defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>; + +// Vector Absolute Comparisons (Floating Point) + +// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point) +defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge", + int_arm_neon_vacged, int_arm_neon_vacgeq, + int_aarch64_neon_vacgeq, + v2i32, v4i32, v2i64, 0>; + +// Vector Absolute Compare Mask Greater Than (Floating Point) +defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt", + int_arm_neon_vacgtd, int_arm_neon_vacgtq, + int_aarch64_neon_vacgtq, + v2i32, v4i32, v2i64, 0>; + +// Vector Absolute Compare Mask Less Than Or Equal (Floating Point) +// FACLE is an alias for FACGE with operands reversed. +def FACLEvvv_2S : NeonI_compare_aliases<"facle", ".2s", FACGEvvv_2S, VPR64>; +def FACLEvvv_4S : NeonI_compare_aliases<"facle", ".4s", FACGEvvv_4S, VPR128>; +def FACLEvvv_2D : NeonI_compare_aliases<"facle", ".2d", FACGEvvv_2D, VPR128>; + +// Vector Absolute Compare Mask Less Than (Floating Point) +// FACLT is an alias for FACGT with operands reversed.
+def FACLTvvv_2S : NeonI_compare_aliases<"faclt", ".2s", FACGTvvv_2S, VPR64>; +def FACLTvvv_4S : NeonI_compare_aliases<"faclt", ".4s", FACGTvvv_4S, VPR128>; +def FACLTvvv_2D : NeonI_compare_aliases<"faclt", ".2d", FACGTvvv_2D, VPR128>; + +// Vector halving add (Integer Signed, Unsigned) +defm SHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd", + int_arm_neon_vhadds, 1>; +defm UHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd", + int_arm_neon_vhaddu, 1>; + +// Vector halving sub (Integer Signed, Unsigned) +defm SHSUBvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub", + int_arm_neon_vhsubs, 0>; +defm UHSUBvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub", + int_arm_neon_vhsubu, 0>; + +// Vector rounding halving add (Integer Signed, Unsigned) +defm SRHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd", + int_arm_neon_vrhadds, 1>; +defm URHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd", + int_arm_neon_vrhaddu, 1>; + +// Vector Saturating add (Integer Signed, Unsigned) +defm SQADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd", + int_arm_neon_vqadds, 1>; +defm UQADDvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd", + int_arm_neon_vqaddu, 1>; + +// Vector Saturating sub (Integer Signed, Unsigned) +defm SQSUBvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub", + int_arm_neon_vqsubs, 1>; +defm UQSUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub", + int_arm_neon_vqsubu, 1>; + +// Vector Shift Left (Signed and Unsigned Integer) +defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl", + int_arm_neon_vshifts, 1>; +defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl", + int_arm_neon_vshiftu, 1>; + +// Vector Saturating Shift Left (Signed and Unsigned Integer) +defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl", + int_arm_neon_vqshifts, 1>; +defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl", + int_arm_neon_vqshiftu, 1>; + +// Vector Rounding Shift Left (Signed and Unsigned Integer) +defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl", + int_arm_neon_vrshifts, 1>; +defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl", + int_arm_neon_vrshiftu, 1>; + +// Vector Saturating Rounding Shift Left (Signed and Unsigned Integer) +defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl", + int_arm_neon_vqrshifts, 1>; +defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl", + int_arm_neon_vqrshiftu, 1>; + +// Vector Maximum (Signed and Unsigned Integer) +defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>; +defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>; + +// Vector Minimum (Signed and Unsigned Integer) +defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>; +defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>; + +// Vector Maximum (Floating Point) +defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax", + int_arm_neon_vmaxs, int_arm_neon_vmaxs, + int_arm_neon_vmaxs, v2f32, v4f32, v2f64, 1>; + +// Vector Minimum (Floating Point) +defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin", + int_arm_neon_vmins, int_arm_neon_vmins, + int_arm_neon_vmins, v2f32, v4f32, v2f64, 1>; + +// Vector maxNum (Floating Point) (prefer a number over a quiet NaN) +defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm", + int_aarch64_neon_vmaxnm, + int_aarch64_neon_vmaxnm, + int_aarch64_neon_vmaxnm, + v2f32, v4f32, v2f64, 1>; + +// Vector minNum (Floating Point) (prefer a number over a quiet NaN)
+defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm", + int_aarch64_neon_vminnm, + int_aarch64_neon_vminnm, + int_aarch64_neon_vminnm, + v2f32, v4f32, v2f64, 1>; + +// Vector Maximum Pairwise (Signed and Unsigned Integer) +defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>; +defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>; + +// Vector Minimum Pairwise (Signed and Unsigned Integer) +defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>; +defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>; + +// Vector Maximum Pairwise (Floating Point) +defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp", + int_arm_neon_vpmaxs, int_arm_neon_vpmaxs, + int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>; + +// Vector Minimum Pairwise (Floating Point) +defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp", + int_arm_neon_vpmins, int_arm_neon_vpmins, + int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>; + +// Vector maxNum Pairwise (Floating Point) (prefer a number over a quiet NaN) +defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp", + int_aarch64_neon_vpmaxnm, + int_aarch64_neon_vpmaxnm, + int_aarch64_neon_vpmaxnm, + v2f32, v4f32, v2f64, 1>; + +// Vector minNum Pairwise (Floating Point) (prefer a number over a quiet NaN) +defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp", + int_aarch64_neon_vpminnm, + int_aarch64_neon_vpminnm, + int_aarch64_neon_vpminnm, + v2f32, v4f32, v2f64, 1>; + +// Vector Addition Pairwise (Integer) +defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>; + +// Vector Addition Pairwise (Floating Point) +defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp", + int_arm_neon_vpadd, + int_arm_neon_vpadd, + int_arm_neon_vpadd, + v2f32, v4f32, v2f64, 1>; + +// Vector Saturating Doubling Multiply High +defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh", + int_arm_neon_vqdmulh, 1>; + +// Vector Saturating Rounding Doubling Multiply High +defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh", + int_arm_neon_vqrdmulh, 1>; + +// Vector Multiply Extended (Floating Point) +defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx", + int_aarch64_neon_vmulx, + int_aarch64_neon_vmulx, + int_aarch64_neon_vmulx, + v2f32, v4f32, v2f64, 1>; + +// Vector Immediate Instructions + +multiclass neon_mov_imm_shift_asmoperands<string PREFIX> +{ + def _asmoperand : AsmOperandClass + { + let Name = "NeonMovImmShift" # PREFIX; + let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands"; + let PredicateMethod = "isNeonMovImmShift" # PREFIX; + } +} + +// Definition of vector immediates shift operands + +// The selectable use-cases extract the shift operation +// information from the OpCmode fields encoded in the immediate. +def neon_mod_shift_imm_XFORM : SDNodeXForm<imm, [{ + unsigned OpCmode = N->getZExtValue(); + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); + if (!HasShift) return SDValue(); + return CurDAG->getTargetConstant(ShiftImm, MVT::i32); +}]>; + +// Vector immediates shift operands which accept LSL and MSL +// shift operators with shift value in the range of 0, 8, 16, 24 (LSL), +// or 0, 8 (LSLH) or 8, 16 (MSL).
+defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">; +defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">; +// LSLH restricts shift amount to 0, 8 out of 0, 8, 16, 24 +defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">; + +multiclass neon_mov_imm_shift_operands<string PREFIX, string HALF, + string isHalf, code pred> +{ + def _operand : Operand<i32>, ImmLeaf<i32, pred> + { + let PrintMethod = + "printNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # isHalf # ">"; + let DecoderMethod = + "DecodeNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # isHalf # ">"; + let ParserMatchClass = + !cast<AsmOperandClass>("neon_mov_imm_" # PREFIX # HALF # "_asmoperand"); + } +} + +defm neon_mov_imm_LSL : neon_mov_imm_shift_operands<"LSL", "", "false", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && !ShiftOnesIn); +}]>; + +defm neon_mov_imm_MSL : neon_mov_imm_shift_operands<"MSL", "", "false", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && ShiftOnesIn); +}]>; + +defm neon_mov_imm_LSLH : neon_mov_imm_shift_operands<"LSL", "H", "true", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && !ShiftOnesIn); +}]>; + +def neon_uimm8_asmoperand : AsmOperandClass +{ + let Name = "UImm8"; + let PredicateMethod = "isUImm<8>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm8 : Operand<i32>, ImmLeaf<i32, [{return Imm >= 0 && Imm < 256;}]> { + let ParserMatchClass = neon_uimm8_asmoperand; + let PrintMethod = "printNeonUImm8Operand"; +} + +def neon_uimm64_mask_asmoperand : AsmOperandClass +{ + let Name = "NeonUImm64Mask"; + let PredicateMethod = "isNeonUImm64Mask"; + let RenderMethod = "addNeonUImm64MaskOperands"; +} + +// MCOperand for 64-bit bytemask with each byte having only the +// value 0x00 or 0xff is encoded as an unsigned 8-bit value +def neon_uimm64_mask : Operand<i32>, ImmLeaf<i32, [{return Imm >= 0 && Imm < 256;}]> { + let ParserMatchClass = neon_uimm64_mask_asmoperand; + let PrintMethod = "printNeonUImm64MaskOperand"; +} + +multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op, SDPatternOperator opnode> +{ + // shift zeros, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, " $Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (timm:$Imm), + (neon_mov_imm_LSL_operand:$Simm))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b0}; + } + + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, " $Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (timm:$Imm), + (neon_mov_imm_LSL_operand:$Simm))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b0}; + } + + // shift zeros, per halfword + def _4H : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, " $Rd.4h, $Imm$Simm"), + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (timm:$Imm), + (neon_mov_imm_LSLH_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b0}; + } + + def _8H : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, " $Rd.8h, $Imm$Simm"), + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (timm:$Imm), + (neon_mov_imm_LSLH_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b0}; + } +} + +multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op, + SDPatternOperator opnode, + SDPatternOperator neonopnode> +{ + let Constraints = "$src = $Rd" in {
"$src = $Rd" in { + // shift zeros, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, " $Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (v2i32 VPR64:$src), + (v2i32 (bitconvert (v2i32 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; + } + + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, " $Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (v4i32 VPR128:$src), + (v4i32 (bitconvert (v4i32 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; + } + + // shift zeros, per halfword + def _4H : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, " $Rd.4h, $Imm$Simm"), + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (v4i16 VPR64:$src), + (v4i16 (bitconvert (v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b1}; + } + + def _8H : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, " $Rd.8h, $Imm$Simm"), + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (v8i16 VPR128:$src), + (v8i16 (bitconvert (v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b1}; + } + } +} + +multiclass NeonI_mov_imm_msl_sizes +{ + // shift ones, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_MSL_operand:$Simm), + !strconcat(asmop, " $Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (timm:$Imm), + (neon_mov_imm_MSL_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b1, 0b0, Simm}; + } + + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_MSL_operand:$Simm), + !strconcat(asmop, " $Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (timm:$Imm), + (neon_mov_imm_MSL_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b1, 0b0, Simm}; + } +} + +// Vector Move Immediate Shifted +let isReMaterializable = 1 in { +defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>; +} + +// Vector Move Inverted Immediate Shifted +let isReMaterializable = 1 in { +defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>; +} + +// Vector Bitwise Bit Clear (AND NOT) - immediate +let isReMaterializable = 1 in { +defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1, + and, Neon_mvni>; +} + +// Vector Bitwise OR - immedidate + +let isReMaterializable = 1 in { +defm ORRvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0, + or, Neon_movi>; +} + +// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immedidate +// LowerBUILD_VECTOR favors lowering MOVI over MVNI. 
+// BIC immediate instruction selection requires additional patterns to +// transform Neon_movi operands into BIC immediate operands + +def neon_mov_imm_LSLH_transform_XFORM : SDNodeXForm<imm, [{ + unsigned OpCmode = N->getZExtValue(); + unsigned ShiftImm; + unsigned ShiftOnesIn; + (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); + // LSLH restricts shift amount to 0, 8 which are encoded as 0 and 1 + // Transform encoded shift amount 0 to 1 and 1 to 0. + return CurDAG->getTargetConstant(!ShiftImm, MVT::i32); +}]>; + +def neon_mov_imm_LSLH_transform_operand + : ImmLeaf<i32, [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && !ShiftOnesIn); }], + neon_mov_imm_LSLH_transform_XFORM>; + +// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0x00, LSL 8) +// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0x00) +def : Pat<(v4i16 (and VPR64:$src, + (v4i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))), + (BICvi_lsl_4H VPR64:$src, 0, + neon_mov_imm_LSLH_transform_operand:$Simm)>; + +// Transform (and A, (8h Neon_movi 0xff)) -> BIC 8h (A, 0x00, LSL 8) +// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0x00) +def : Pat<(v8i16 (and VPR128:$src, + (v8i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))), + (BICvi_lsl_8H VPR128:$src, 0, + neon_mov_imm_LSLH_transform_operand:$Simm)>; + + +multiclass Neon_bitwiseVi_patterns<SDPatternOperator opnode, + SDPatternOperator neonopnode, + Instruction INST4H, Instruction INST8H> { + def : Pat<(v8i8 (opnode VPR64:$src, + (bitconvert(v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST4H VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v1i64 (opnode VPR64:$src, + (bitconvert(v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST4H VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + + def : Pat<(v16i8 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v4i32 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v2i64 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; +} + +// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immediate +defm : Neon_bitwiseVi_patterns<and, Neon_mvni, BICvi_lsl_4H, BICvi_lsl_8H>; + +// Additional patterns for Vector Bitwise OR - immediate +defm : Neon_bitwiseVi_patterns<or, Neon_movi, ORRvi_lsl_4H, ORRvi_lsl_8H>; + + +// Vector Move Immediate Masked +let isReMaterializable = 1 in { +defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>; +} + +// Vector Move Inverted Immediate Masked +let isReMaterializable = 1 in { +defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>; +} + +class NeonI_mov_imm_lsl_aliases<string asmop, string asmlane, + Instruction inst, RegisterClass VPRC> + : NeonInstAlias<asmop # "\t$Rd" # asmlane # ", $Imm", + (inst VPRC:$Rd, neon_uimm8:$Imm, 0), 0b0>; + +// Aliases for Vector Move Immediate Shifted +def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>; + +// Aliases for Vector Move Inverted Immediate Shifted +def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>; + +// Aliases for Vector Bitwise Bit Clear (AND NOT) - immediate
+def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>; + +// Aliases for Vector Bitwise OR - immediate +def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>; + +// Vector Move Immediate - per byte +let isReMaterializable = 1 in { +def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0, + (outs VPR64:$Rd), (ins neon_uimm8:$Imm), + "movi\t$Rd.8b, $Imm", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} + +def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0, + (outs VPR128:$Rd), (ins neon_uimm8:$Imm), + "movi\t$Rd.16b, $Imm", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Move Immediate - bytemask, per double word +let isReMaterializable = 1 in { +def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1, + (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm), + "movi\t$Rd.2d, $Imm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Move Immediate - bytemask, one doubleword + +let isReMaterializable = 1 in { +def MOVIdi : NeonI_1VModImm<0b0, 0b1, + (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm), + "movi\t$Rd, $Imm", + [(set (f64 FPR64:$Rd), + (f64 (bitconvert + (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Floating Point Move Immediate + +class NeonI_FMOV_impl<string asmlane, RegisterClass VPRC, ValueType OpTy, + Operand immOpType, bit q, bit op> + : NeonI_1VModImm<q, op, + (outs VPRC:$Rd), (ins immOpType:$Imm), + "fmov\t$Rd" # asmlane # ", $Imm", + [(set (OpTy VPRC:$Rd), + (OpTy (Neon_fmovi (timm:$Imm))))], + NoItinerary> { + let cmode = 0b1111; + } + +let isReMaterializable = 1 in { +def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64, v2f32, fmov32_operand, 0b0, 0b0>; +def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>; +def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>; +} + +// Scalar Arithmetic + +class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop> + : NeonI_Scalar3Same<u, 0b11, opcode, + (outs FPR64:$Rd), (ins FPR64:$Rn, FPR64:$Rm), + !strconcat(asmop, " $Rd, $Rn, $Rm"), + [], + NoItinerary>; + +multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode, + string asmop, bit Commutable = 0> +{ + let isCommutable = Commutable in { + def bbb : NeonI_Scalar3Same<u, 0b00, opcode, + (outs FPR8:$Rd), (ins FPR8:$Rn, FPR8:$Rm), + !strconcat(asmop, " $Rd, $Rn, $Rm"), + [], + NoItinerary>; + def hhh : NeonI_Scalar3Same<u, 0b01, opcode, + (outs FPR16:$Rd), (ins FPR16:$Rn, FPR16:$Rm), + !strconcat(asmop, " $Rd, $Rn, $Rm"), + [], + NoItinerary>; + def sss : NeonI_Scalar3Same<u, 0b10, opcode, + (outs FPR32:$Rd), (ins FPR32:$Rn, FPR32:$Rm), + !strconcat(asmop, " $Rd, $Rn, $Rm"), + [], + NoItinerary>; + def ddd : NeonI_Scalar3Same<u, 0b11, opcode, + (outs FPR64:$Rd), (ins FPR64:$Rn, FPR64:$Rm), + !strconcat(asmop, " $Rd, $Rn, $Rm"), + [], + NoItinerary>; + } +} + +class Neon_Scalar_D_size_patterns<SDPatternOperator opnode, Instruction INSTD> + : Pat<(v1i64 (opnode (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))), + (SUBREG_TO_REG (i64 0), + (INSTD (EXTRACT_SUBREG VPR64:$Rn, sub_64), + (EXTRACT_SUBREG VPR64:$Rm, sub_64)), + sub_64)>; + + +// Scalar Integer Add +let isCommutable = 1 in { +def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">; +} + +// Scalar Integer Sub +def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">; + +// Patterns for Scalar Integer Add and Sub with D register +def : Neon_Scalar_D_size_patterns<add, ADDddd>; +def : Neon_Scalar_D_size_patterns<sub, SUBddd>; + +// Scalar Integer Saturating Add (Signed, Unsigned) +defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>; +defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>; + +// Scalar Integer Saturating Sub (Signed, Unsigned) +defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>; +defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>; + +// Patterns for Scalar Integer Saturating Add, Sub with D register only
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqadds, SQADDddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vqaddu, UQADDddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vqsubs, SQSUBddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vqsubu, UQSUBddd>; + +// Scalar Integer Shift Left (Signed, Unsigned) +def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">; +def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">; + +// Scalar Integer Saturating Shift Left (Signed, Unsigned) +defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>; +defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>; + +// Scalar Integer Rounding Shift Left (Signed, Unsigned) +def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">; +def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">; + +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>; +defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>; + +// Patterns for Scalar Integer Shift Left, Saturating Shift Left, +// Rounding Shift Left, Rounding Saturating Shift Left with D register only +def : Neon_Scalar_D_size_patterns<int_arm_neon_vshifts, SSHLddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vshiftu, USHLddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>; +def : Neon_Scalar_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>; +def : Neon_Scalar_D_size_patterns; +def : Neon_Scalar_D_size_patterns; + + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// 64-bit vector bitcasts... + +def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; + +// ..and 128-bit vector bitcasts...
+ +def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>; + + +// ...and scalar bitcasts... 
+ +def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), + (f64 (EXTRACT_SUBREG (v8i8 VPR64:$src), sub_64))>; +def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), + (f64 (EXTRACT_SUBREG (v4i16 VPR64:$src), sub_64))>; +def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), + (f64 (EXTRACT_SUBREG (v2i32 VPR64:$src), sub_64))>; +def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), + (f64 (EXTRACT_SUBREG (v2f32 VPR64:$src), sub_64))>; +def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), + (f64 (EXTRACT_SUBREG (v1i64 VPR64:$src), sub_64))>; +def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), + (f128 (EXTRACT_SUBREG (v16i8 VPR128:$src), sub_alias))>; +def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), + (f128 (EXTRACT_SUBREG (v8i16 VPR128:$src), sub_alias))>; +def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), + (f128 (EXTRACT_SUBREG (v4i32 VPR128:$src), sub_alias))>; +def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), + (f128 (EXTRACT_SUBREG (v2i64 VPR128:$src), sub_alias))>; +def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), + (f128 (EXTRACT_SUBREG (v4f32 VPR128:$src), sub_alias))>; +def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), + (f128 (EXTRACT_SUBREG (v2f64 VPR128:$src), sub_alias))>; + +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), + (v8i8 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), + (v4i16 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), + (v2i32 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), + (v2f32 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), + (v1i64 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>; +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), + (v16i8 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src), + sub_alias))>; +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), + (v8i16 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src), + sub_alias))>; +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), + (v4i32 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src), + sub_alias))>; +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), + (v2i64 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src), + sub_alias))>; +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), + (v4f32 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src), + sub_alias))>; +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), + (v2f64 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src), + sub_alias))>; diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 3d22330afe7..7ce5ce3441e 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -109,6 +109,11 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO, case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); break; + case MachineOperand::MO_FPImmediate: { + assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported"); + MCOp = MCOperand::CreateFPImm(0.0); + break; + } case MachineOperand::MO_BlockAddress: MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress())); break; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index cc2bb6135cc..b3a81b1dc0a 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -185,7 +185,7 @@ foreach Index = 0-31 in { // These two classes contain the same registers, which should be reasonably // sensible for MC and allocation 
purposes, but allows them to be treated // separately for things like stack spilling. -def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8], 64, +def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8, v1i64], 64, (sequence "V%u", 0, 31)>; def VPR128 : RegisterClass<"AArch64", diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index d17b7382099..d71bb4e9734 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -26,10 +26,8 @@ using namespace llvm; AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS) - : AArch64GenSubtargetInfo(TT, CPU, FS) - , HasNEON(true) - , HasCrypto(true) - , TargetTriple(TT) { + : AArch64GenSubtargetInfo(TT, CPU, FS), HasNEON(false), HasCrypto(false), + TargetTriple(TT) { ParseSubtargetFeatures(CPU, FS); } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 2e9205fc992..35a7c8d85db 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -48,6 +48,9 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } + bool hasNEON() const { return HasNEON; } + + bool hasCrypto() const { return HasCrypto; } }; } // End llvm namespace diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 10a9a6a4062..43e91ac4e01 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -664,8 +664,42 @@ public: return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4; } - template<int MemSize> bool isSImm7Scaled() const { - if (!isImm()) return false; + bool isNeonMovImmShiftLSL() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::LSL) + return false; + + // Valid shift amount is 0, 8, 16 and 24. + return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24; + } + + bool isNeonMovImmShiftLSLH() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::LSL) + return false; + + // Valid shift amount is 0 and 8. + return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8; + } + + bool isNeonMovImmShiftMSL() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::MSL) + return false; + + // Valid shift amount is 8 and 16. + return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16; + } + + template<int MemSize> bool isSImm7Scaled() const { + if (!isImm()) + return false; const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); if (!CE) return false; @@ -705,10 +739,27 @@ public: return isa(getImm()); } + bool isNeonUImm64Mask() const { + if (!isImm()) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return false; + + uint64_t Value = CE->getValue(); + + // i64 value with each byte being either 0x00 or 0xff.
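+ // For example, 0x00ff00ff0000ff00 satisfies this check; 0x00ff00ff0000ff01 does not.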
+ for (unsigned i = 0; i < 8; ++i, Value >>= 8) + if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) + return false; + return true; + } + static AArch64Operand *CreateImmWithLSL(const MCExpr *Val, unsigned ShiftAmount, bool ImplicitAmount, - SMLoc S, SMLoc E) { + SMLoc S,SMLoc E) { AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E); Op->ImmWithLSL.Val = Val; Op->ImmWithLSL.ShiftAmount = ShiftAmount; @@ -1026,6 +1077,40 @@ public: Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); } + // For Vector Immediates shifted imm operands. + void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3. + int64_t Imm = ShiftExtend.Amount / 8; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode LSLH shift amount 0, 8 as 0, 1. + int64_t Imm = ShiftExtend.Amount / 8; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode MSL shift amount 8, 16 as 0, 1. + int64_t Imm = ShiftExtend.Amount / 8 - 1; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + // For the extend in load-store (register offset) instructions. template<unsigned MemSize> void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const { @@ -1065,6 +1150,20 @@ public: Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); } + + void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + // A bit from each byte in the constant forms the encoded immediate. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + uint64_t Value = CE->getValue(); + + unsigned Imm = 0; + for (unsigned i = 0; i < 8; ++i, Value >>= 8) { + Imm |= (Value & 1) << i; + } + Inst.addOperand(MCOperand::CreateImm(Imm)); + } }; } // end anonymous namespace.
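As an aside on the bytemask operand used by the MOVI 2d/64-bit forms: the 64-bit constant round-trips through an 8-bit immediate, one bit per byte. The standalone C++ sketch below is illustrative only (the helper names and the main() harness are not from the patch); it mirrors the validity check in isNeonUImm64Mask, the encoding in addNeonUImm64MaskOperands above, and the decoding in printNeonUImm64MaskOperand further down:

#include <cassert>
#include <cstdint>
#include <cstdio>

// A valid bytemask has every byte equal to 0x00 or 0xff
// (cf. isNeonUImm64Mask above).
static bool isBytemask(uint64_t V) {
  for (unsigned i = 0; i < 8; ++i, V >>= 8)
    if ((V & 0xff) != 0 && (V & 0xff) != 0xff)
      return false;
  return true;
}

// Fold each byte down to one bit of an 8-bit immediate
// (cf. addNeonUImm64MaskOperands above).
static unsigned encodeBytemask(uint64_t V) {
  unsigned Imm = 0;
  for (unsigned i = 0; i < 8; ++i, V >>= 8)
    Imm |= (V & 1) << i;
  return Imm;
}

// Replicate each set bit back to a 0xff byte
// (cf. printNeonUImm64MaskOperand below).
static uint64_t decodeBytemask(unsigned Imm) {
  uint64_t Mask = 0;
  for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum)
    if ((Imm >> ByteNum) & 1)
      Mask |= (uint64_t)0xff << (8 * ByteNum);
  return Mask;
}

int main() {
  uint64_t V = 0x00ff00ff0000ff00ULL; // bytes 1, 4 and 6 are 0xff
  assert(isBytemask(V));
  unsigned Imm = encodeBytemask(V);   // bits 1, 4 and 6 set -> 0x52
  assert(Imm == 0x52);
  assert(decodeBytemask(Imm) == V);   // the mapping round-trips exactly
  printf("mask 0x%016llx <-> imm 0x%02x\n", (unsigned long long)V, Imm);
  return 0;
}

This is how an arbitrary 0x00/0xff bytemask fits the 8-bit modified-immediate field of MOVI while the instruction printer can still reconstruct the full 64-bit mask.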
@@ -1660,20 +1759,21 @@ AArch64AsmParser::ParseShiftExtend( std::string LowerID = IDVal.lower(); A64SE::ShiftExtSpecifiers Spec = - StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID) - .Case("lsl", A64SE::LSL) - .Case("lsr", A64SE::LSR) - .Case("asr", A64SE::ASR) - .Case("ror", A64SE::ROR) - .Case("uxtb", A64SE::UXTB) - .Case("uxth", A64SE::UXTH) - .Case("uxtw", A64SE::UXTW) - .Case("uxtx", A64SE::UXTX) - .Case("sxtb", A64SE::SXTB) - .Case("sxth", A64SE::SXTH) - .Case("sxtw", A64SE::SXTW) - .Case("sxtx", A64SE::SXTX) - .Default(A64SE::Invalid) + StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID) + .Case("lsl", A64SE::LSL) + .Case("msl", A64SE::MSL) + .Case("lsr", A64SE::LSR) + .Case("asr", A64SE::ASR) + .Case("ror", A64SE::ROR) + .Case("uxtb", A64SE::UXTB) + .Case("uxth", A64SE::UXTH) + .Case("uxtw", A64SE::UXTW) + .Case("uxtx", A64SE::UXTX) + .Case("sxtb", A64SE::SXTB) + .Case("sxth", A64SE::SXTH) + .Case("sxtw", A64SE::SXTW) + .Case("sxtx", A64SE::SXTX) + .Default(A64SE::Invalid); if (Spec == A64SE::Invalid) return MatchOperand_NoMatch; @@ -1683,8 +1783,8 @@ AArch64AsmParser::ParseShiftExtend( S = Parser.getTok().getLoc(); Parser.Lex(); - if (Spec != A64SE::LSL && Spec != A64SE::LSR && - Spec != A64SE::ASR && Spec != A64SE::ROR) { + if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR && + Spec != A64SE::ROR && Spec != A64SE::MSL) { // The shift amount can be omitted for the extending versions, but not real // shifts: // add x0, x0, x0, uxtb @@ -2019,7 +2119,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, "expected compatible register or floating-point constant"); case Match_FPZero: return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected floating-point constant #0.0"); + "expected floating-point constant #0.0 or invalid register type"); case Match_Label: return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), "expected label or encodable integer pc offset"); diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 36dd7041402..a88a8e8e9e6 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -85,6 +85,9 @@ static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); @@ -126,6 +129,10 @@ static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst, unsigned ShiftAmount, uint64_t Address, const void *Decoder); +template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf> +static DecodeStatus +DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, + uint64_t Address, const void *Decoder); static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, @@ -336,9 +343,20 @@ DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodeVPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::VPR64RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + static DecodeStatus
DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, const void *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -799,4 +817,24 @@ extern "C" void LLVMInitializeAArch64Disassembler() { createAArch64Disassembler); } +template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf> +static DecodeStatus +DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, + uint64_t Address, const void *Decoder) { + bool IsLSL = false; + if (Ext == A64SE::LSL) + IsLSL = true; + else if (Ext != A64SE::MSL) + return MCDisassembler::Fail; + + // MSL and LSLH accept encoded shift amount 0 or 1. + if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1) + return MCDisassembler::Fail; + + // LSL accepts encoded shift amount 0, 1, 2 or 3. + if (IsLSL && ShiftAmount > 3) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(ShiftAmount)); + return MCDisassembler::Success; +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 82ce80c8b1a..b6243310d58 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -406,3 +406,84 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, printAnnotation(O, Annot); } + +template <A64SE::ShiftExtSpecifiers Ext, bool isHalf> +void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + + assert(MO.isImm() && + "Immediate operand required for Neon vector immediate inst."); + + bool IsLSL = false; + if (Ext == A64SE::LSL) + IsLSL = true; + else if (Ext != A64SE::MSL) + llvm_unreachable("Invalid shift specifier in movi instruction"); + + int64_t Imm = MO.getImm(); + + // MSL and LSLH accept encoded shift amount 0 or 1. + if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1) + llvm_unreachable("Invalid shift amount in movi instruction"); + + // LSL accepts encoded shift amount 0, 1, 2 or 3. + if (IsLSL && (Imm < 0 || Imm > 3)) + llvm_unreachable("Invalid shift amount in movi instruction"); + + // Print shift amount as multiple of 8 with MSL encoded shift amount + // 0 and 1 printed as 8 and 16.
+ if (!IsLSL) + Imm++; + Imm *= 8; + + // LSL #0 is not printed + if (IsLSL) { + if (Imm == 0) + return; + O << ", lsl"; + } else + O << ", msl"; + + O << " #" << Imm; +} + +void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &o) { + o << "#0x0"; +} + +void AArch64InstPrinter::printNeonUImm8Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOUImm = MI->getOperand(OpNum); + + assert(MOUImm.isImm() && + "Immediate operand required for Neon vector immediate inst."); + + unsigned Imm = MOUImm.getImm(); + + O << "#0x"; + O.write_hex(Imm); +} + +void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOUImm8 = MI->getOperand(OpNum); + + assert(MOUImm8.isImm() && + "Immediate operand required for Neon vector immediate bytemask inst."); + + uint32_t UImm8 = MOUImm8.getImm(); + uint64_t Mask = 0; + + // Replicates 0x00 or 0xff byte in a 64-bit vector + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((UImm8 >> ByteNum) & 1) + Mask |= (uint64_t)0xff << (8 * ByteNum); + } + + O << "#0x"; + O.write_hex(Mask); +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 639fa869c01..f7439bec668 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -164,9 +164,14 @@ public: return RegNo == AArch64::XSP || RegNo == AArch64::WSP; } - + template <A64SE::ShiftExtSpecifiers Ext, bool isHalf> + void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNeonUImm8Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); }; - } #endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 48d48190fde..58fc95c2eaf 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -40,7 +40,7 @@ MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); - InitAArch64MCSubtargetInfo(X, TT, CPU, ""); + InitAArch64MCSubtargetInfo(X, TT, CPU, FS); return X; } diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index 79865f6aa59..2a97cd63256 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -1105,3 +1105,69 @@ bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value, return isMOVNImm(RegWidth, Value, UImm16, Shift); } + +// decodeNeonModShiftImm - Decode a Neon OpCmode value into +// the shift amount and the shift type (shift zeros or ones in), and +// return whether the OpCmode value implies a shift operation.
+bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm, + unsigned &ShiftOnesIn) { + ShiftImm = 0; + ShiftOnesIn = false; + bool HasShift = true; + + if (OpCmode == 0xe) { + // movi byte + HasShift = false; + } else if (OpCmode == 0x1e) { + // movi 64-bit bytemask + HasShift = false; + } else if ((OpCmode & 0xc) == 0x8) { + // shift zeros, per halfword + ShiftImm = ((OpCmode & 0x2) >> 1); + } else if ((OpCmode & 0x8) == 0) { + // shift zeros, per word + ShiftImm = ((OpCmode & 0x6) >> 1); + } else if ((OpCmode & 0xe) == 0xc) { + // shift ones, per word + ShiftOnesIn = true; + ShiftImm = (OpCmode & 0x1); + } else { + // per byte, per bytemask + llvm_unreachable("Unsupported Neon modified immediate"); + } + + return HasShift; +} + +// decodeNeonModImm - Decode a NEON modified immediate and OpCmode values +// into the element value and the element size in bits. +uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode, + unsigned &EltBits) { + uint64_t DecodedVal = Val; + EltBits = 0; + + if (OpCmode == 0xe) { + // movi byte + EltBits = 8; + } else if (OpCmode == 0x1e) { + // movi 64-bit bytemask + DecodedVal = 0; + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((Val >> ByteNum) & 1) + DecodedVal |= (uint64_t)0xff << (8 * ByteNum); + } + EltBits = 64; + } else if ((OpCmode & 0xc) == 0x8) { + // shift zeros, per halfword + EltBits = 16; + } else if ((OpCmode & 0x8) == 0) { + // shift zeros, per word + EltBits = 32; + } else if ((OpCmode & 0xe) == 0xc) { + // shift ones, per word + EltBits = 32; + } else { + llvm_unreachable("Unsupported Neon modified immediate"); + } + return DecodedVal; +} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 9a1ca6127ae..e675efc9d9a 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -289,6 +289,7 @@ namespace A64SE { enum ShiftExtSpecifiers { Invalid = -1, LSL, + MSL, LSR, ASR, ROR, @@ -1068,7 +1069,10 @@ namespace A64Imms { // MOVN but *not* with a MOVZ (because that would take priority). bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift); -} + uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits); + bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm, + unsigned &ShiftOnesIn); + } } // end namespace llvm; diff --git a/test/CodeGen/AArch64/complex-copy-noneon.ll b/test/CodeGen/AArch64/complex-copy-noneon.ll new file mode 100644 index 00000000000..4ae547856ec --- /dev/null +++ b/test/CodeGen/AArch64/complex-copy-noneon.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s + +; The DAG combiner decided to use a vector load/store for this struct copy +; previously. This probably shouldn't happen without NEON, but the most +; important thing is that it compiles. 
+ +define void @store_combine() nounwind { + %src = alloca { double, double }, align 8 + %dst = alloca { double, double }, align 8 + + %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0 + %src.real = load double* %src.realp + %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1 + %src.imag = load double* %src.imagp + + %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1 + store double %src.real, double* %dst.realp + store double %src.imag, double* %dst.imagp + ret void +} diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll index cfa06a4e0b2..18a3b37b41d 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s define i64 @test_inline_constraint_r(i64 %base, i32 %offset) { ; CHECK-LABEL: test_inline_constraint_r: @@ -44,6 +44,26 @@ define i32 @test_inline_constraint_Q(i32 *%ptr) { @dump = global fp128 zeroinitializer +define void @test_inline_constraint_w(<8 x i8> %vec64, <4 x float> %vec128, half %hlf, float %flt, double %dbl, fp128 %quad) { +; CHECK: test_inline_constraint_w: + call <8 x i8> asm sideeffect "add $0.8b, $1.8b, $1.8b", "=w,w"(<8 x i8> %vec64) + call <8 x i8> asm sideeffect "fadd $0.4s, $1.4s, $1.4s", "=w,w"(<4 x float> %vec128) +; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + + ; Arguably semantically dodgy to output "vN", but it's what GCC does + ; so purely for compatibility we want vector registers to be output. 
+ call float asm sideeffect "fcvt ${0:s}, ${1:h}", "=w,w"(half undef) + call float asm sideeffect "fadd $0.2s, $0.2s, $0.2s", "=w,w"(float %flt) + call double asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(double %dbl) + call fp128 asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(fp128 %quad) +; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}} +; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + ret void +} + define void @test_inline_constraint_I() { ; CHECK-LABEL: test_inline_constraint_I: call void asm sideeffect "add x0, x0, $0", "I"(i32 0) diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/neon-aba-abd.ll new file mode 100644 index 00000000000..b423666d80f --- /dev/null +++ b/test/CodeGen/AArch64/neon-aba-abd.ll @@ -0,0 +1,226 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uabd_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uabd v0.8b, v0.8b, v1.8b + ret <8 x i8> %abd +} + +define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uaba_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %aba = add <8 x i8> %lhs, %abd +; CHECK: uaba v0.8b, v0.8b, v1.8b + ret <8 x i8> %aba +} + +define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sabd_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sabd v0.8b, v0.8b, v1.8b + ret <8 x i8> %abd +} + +define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_saba_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %aba = add <8 x i8> %lhs, %abd +; CHECK: saba v0.8b, v0.8b, v1.8b + ret <8 x i8> %aba +} + +declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uabd_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uabd v0.16b, v0.16b, v1.16b + ret <16 x i8> %abd +} + +define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uaba_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %aba = add <16 x i8> %lhs, %abd +; CHECK: uaba v0.16b, v0.16b, v1.16b + ret <16 x i8> %aba +} + +define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sabd_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sabd v0.16b, v0.16b, v1.16b + ret <16 x i8> %abd +} + +define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_saba_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %aba = add <16 x i8> %lhs, %abd +; CHECK: saba v0.16b, v0.16b, v1.16b + ret <16 x i8> %aba +} + +declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uabd_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 
x i16> %lhs, <4 x i16> %rhs) +; CHECK: uabd v0.4h, v0.4h, v1.4h + ret <4 x i16> %abd +} + +define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uaba_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %aba = add <4 x i16> %lhs, %abd +; CHECK: uaba v0.4h, v0.4h, v1.4h + ret <4 x i16> %aba +} + +define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sabd_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sabd v0.4h, v0.4h, v1.4h + ret <4 x i16> %abd +} + +define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_saba_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %aba = add <4 x i16> %lhs, %abd +; CHECK: saba v0.4h, v0.4h, v1.4h + ret <4 x i16> %aba +} + +declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uabd_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uabd v0.8h, v0.8h, v1.8h + ret <8 x i16> %abd +} + +define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uaba_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %aba = add <8 x i16> %lhs, %abd +; CHECK: uaba v0.8h, v0.8h, v1.8h + ret <8 x i16> %aba +} + +define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sabd_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sabd v0.8h, v0.8h, v1.8h + ret <8 x i16> %abd +} + +define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_saba_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %aba = add <8 x i16> %lhs, %abd +; CHECK: saba v0.8h, v0.8h, v1.8h + ret <8 x i16> %aba +} + +declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uabd_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uabd v0.2s, v0.2s, v1.2s + ret <2 x i32> %abd +} + +define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uaba_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %aba = add <2 x i32> %lhs, %abd +; CHECK: uaba v0.2s, v0.2s, v1.2s + ret <2 x i32> %aba +} + +define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sabd_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sabd v0.2s, v0.2s, v1.2s + ret <2 x i32> %abd +} + +define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_saba_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %aba = add <2 x i32> %lhs, %abd +; CHECK: saba v0.2s, v0.2s, v1.2s + ret <2 x i32> %aba +} + +declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uabd_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uabd v0.4s, 
v0.4s, v1.4s
+ ret <4 x i32> %abd
+}
+
+define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uaba_v4i32:
+ %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %aba = add <4 x i32> %lhs, %abd
+; CHECK: uaba v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %aba
+}
+
+define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sabd_v4i32:
+ %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sabd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %abd
+}
+
+define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_saba_v4i32:
+ %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %aba = add <4 x i32> %lhs, %abd
+; CHECK: saba v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %aba
+}
+
+declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>)
+
+define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fabd_v2f32:
+ %abd = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fabd v0.2s, v0.2s, v1.2s
+ ret <2 x float> %abd
+}
+
+declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>)
+
+define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fabd_v4f32:
+ %abd = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fabd v0.4s, v0.4s, v1.4s
+ ret <4 x float> %abd
+}
+
+declare <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double>, <2 x double>)
+
+define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fabd_v2f64:
+ %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fabd v0.2d, v0.2d, v1.2d
+ ret <2 x double> %abd
+}
\ No newline at end of file
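The accumulate tests above rely on pattern matching rather than a dedicated intrinsic: an IR-level add of the accumulator with the vabd result is what selects uaba/saba. A minimal sketch of the same pattern with a separate accumulator operand (a hypothetical function, not part of the patch; the expected instruction is an assumption based on the tests above):

declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)

define <8 x i8> @uaba_sketch(<8 x i8> %acc, <8 x i8> %lhs, <8 x i8> %rhs) {
; uaba accumulates into its destination, so %acc would become the tied Vd
; operand, giving something like: uaba v0.8b, v1.8b, v2.8b
  %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  %aba = add <8 x i8> %acc, %abd
  ret <8 x i8> %aba
}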
diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll
new file mode 100644
index 00000000000..1abfed31908
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-add-pairwise.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; CHECK: test_addp_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: addp v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_addp_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: addp v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_addp_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: addp v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_addp_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: addp v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_addp_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: addp v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_addp_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: addp v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+
+declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_addp_v2i64:
+ %val = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: addp v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_faddp_v2f32:
+ %val = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: faddp v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_faddp_v4f32:
+ %val = call <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: faddp v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_faddp_v2f64:
+ %val = call <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: faddp v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
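ADDP and FADDP sum adjacent element pairs of the concatenation lhs:rhs, so lane i of the result is element 2i plus element 2i+1 of the combined vector. A worked example with illustrative values (not one of the tests):

; addp v0.4s, v0.4s, v1.4s with
;   lhs = <i32 1, i32 2, i32 3, i32 4>, rhs = <i32 5, i32 6, i32 7, i32 8>
; produces
;   <i32 1+2, i32 3+4, i32 5+6, i32 7+8> = <i32 3, i32 7, i32 11, i32 15>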
diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll
new file mode 100644
index 00000000000..65ec8a247e5
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-add-sub.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: add {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp3 = add <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @add16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: add {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp3 = add <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @add4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: add {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+ %tmp3 = add <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @add8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: add {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+ %tmp3 = add <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @add2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: add {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+ %tmp3 = add <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @add4x32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: add {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+ %tmp3 = add <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @add2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+ %tmp3 = add <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @add2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fadd {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+ %tmp3 = fadd <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @add4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fadd {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+ %tmp3 = fadd <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+define <2 x double> @add2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fadd {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+ %tmp3 = fadd <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+define <8 x i8> @sub8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: sub {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp3 = sub <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sub16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: sub {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp3 = sub <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sub4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: sub {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+ %tmp3 = sub <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sub8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: sub {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+ %tmp3 = sub <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sub2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: sub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+ %tmp3 = sub <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sub4x32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: sub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+ %tmp3 = sub <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sub2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+ %tmp3 = sub <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @sub2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fsub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+ %tmp3 = fsub <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @sub4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fsub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+ %tmp3 = fsub <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fsub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+ %tmp3 = fsub <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ %tmp3 = add <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ %tmp3 = sub <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
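The <1 x i64> cases above stay in the floating-point/SIMD register file and therefore select the scalar D-register forms. For contrast, a plain i64 add is done in the general-purpose registers (a minimal sketch, not part of the patch; the expected line is an assumption):

define i64 @add_i64_gpr(i64 %A, i64 %B) {
; expected: add x0, x0, x1 rather than the SIMD scalar add d0, d0, d1
  %sum = add i64 %A, %B
  ret i64 %sum
}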
diff --git a/test/CodeGen/AArch64/neon-bitcast.ll b/test/CodeGen/AArch64/neon-bitcast.ll
new file mode 100644
index 00000000000..f9ec7048402
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-bitcast.ll
@@ -0,0 +1,574 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
+; From <8 x i8>
+
+define <1 x i64> @test_v8i8_to_v1i64(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <1 x i64>
+ ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v8i8_to_v2i32(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x float> @test_v8i8_to_v1f32(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v1f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <2 x float>
+ ret <2 x float> %val
+}
+
+define <4 x i16> @test_v8i8_to_v4i16(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v8i8_to_v8i8(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <8 x i8>
+ ret <8 x i8> %val
+}
+
+; From <4 x i16>
+
+define <1 x i64> @test_v4i16_to_v1i64(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <1 x i64>
+ ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v4i16_to_v2i32(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x float> @test_v4i16_to_v1f32(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v1f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <2 x float>
+ ret <2 x float> %val
+}
+
+define <4 x i16> @test_v4i16_to_v4i16(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v4i16_to_v8i8(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <8 x i8>
+ ret <8 x i8> %val
+}
+
+; From <2 x i32>
+
+define <1 x i64> @test_v2i32_to_v1i64(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in to <1 x i64>
+ ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v2i32_to_v2i32(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x float> @test_v2i32_to_v1f32(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v1f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in to <2 x float>
+ ret <2 x float> %val
+}
+
+define <4 x i16> @test_v2i32_to_v4i16(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in
to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v2i32_to_v8i8(<2 x i32> %in) nounwind{ +; CHECK: test_v2i32_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <2 x float> + +define <1 x i64> @test_v2f32_to_v1i64(<2 x float> %in) nounwind { +; CHECK: test_v2f32_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v2f32_to_v2i32(<2 x float> %in) nounwind { +; CHECK: test_v2f32_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v2f32_to_v2f32(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v2f32_to_v4i16(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v2f32_to_v8i8(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <1 x i64> + +define <1 x i64> @test_v1i64_to_v1i64(<1 x i64> %in) nounwind { +; CHECK: test_v1i64_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v1i64_to_v2i32(<1 x i64> %in) nounwind { +; CHECK: test_v1i64_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v1i64_to_v2f32(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v1i64_to_v4i16(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v1i64_to_v8i8(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <8 x i8> + ret <8 x i8> %val +} + + +; From <16 x i8> + +define <2 x double> @test_v16i8_to_v2f64(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v16i8_to_v2i64(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v16i8_to_v4i32(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v16i8_to_v2f32(<16 x i8> %in) nounwind{ +; CHECK: test_v16i8_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v16i8_to_v8i16(<16 x i8> %in) nounwind{ +; CHECK: test_v16i8_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v16i8_to_v16i8(<16 x i8> 
%in) nounwind{ +; CHECK: test_v16i8_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <8 x i16> + +define <2 x double> @test_v8i16_to_v2f64(<8 x i16> %in) nounwind { +; CHECK: test_v8i16_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v8i16_to_v2i64(<8 x i16> %in) nounwind { +; CHECK: test_v8i16_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v8i16_to_v4i32(<8 x i16> %in) nounwind { +; CHECK: test_v8i16_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v8i16_to_v2f32(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v8i16_to_v8i16(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v8i16_to_v16i8(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <4 x i32> + +define <2 x double> @test_v4i32_to_v2f64(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v4i32_to_v2i64(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v4i32_to_v4i32(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v4i32_to_v2f32(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v4i32_to_v8i16(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v4i32_to_v16i8(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <4 x float> + +define <2 x double> @test_v4f32_to_v2f64(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v4f32_to_v2i64(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v4f32_to_v4i32(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v4f32_to_v4f32(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v4f32: +; CHECK-NEXT: // 
BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v4f32_to_v8i16(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v4f32_to_v16i8(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <2 x i64> + +define <2 x double> @test_v2i64_to_v2f64(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v2i64_to_v2i64(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v2i64_to_v4i32(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v2i64_to_v4f32(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v4f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v2i64_to_v8i16(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v2i64_to_v16i8(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <2 x double> + +define <2 x double> @test_v2f64_to_v2f64(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v2f64_to_v2i64(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v2f64_to_v4i32(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v2f64_to_v4f32(<2 x double> %in) nounwind{ +; CHECK: test_v2f64_to_v4f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v2f64_to_v8i16(<2 x double> %in) nounwind{ +; CHECK: test_v2f64_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v2f64_to_v16i8(<2 x double> %in) nounwind{ +; CHECK: test_v2f64_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <16 x i8> + ret <16 x i8> %val +} + diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll new file mode 100644 index 00000000000..1c43b979fc4 --- /dev/null +++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -0,0 +1,594 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + + +define <8 x i8> @and8xi8(<8 x 
i8> %a, <8 x i8> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + + +define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + + +define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + +define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > + %tmp3 = or <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > + %tmp3 = or <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = or <8 x i8> %a, %tmp1 + ret <8 x i8> %tmp2 +} + +define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = or <16 x i8> %a, %tmp1 + ret <16 x i8> %tmp2 +} + +define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <8 x i8> %a, %tmp1 + ret <8 x i8> %tmp2 +} + +define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <16 x i8> %a, %tmp1 + ret <16 x i8> %tmp2 +} + +define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff + %tmp1 = or <2 x i32> %a, < i32 255, i32 255> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #8 + %tmp1 = or <2 x i32> %a, < i32 65280, i32 65280> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #16 + %tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680> + ret <2 x i32> %tmp1 +} + +define <2 x 
i32> @orrimm2s_lsl24(<2 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #24
+ %tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080>
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.4s, #0xff
+ %tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #8
+ %tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl16(<4 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #16
+ %tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #24
+ %tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) {
+;CHECK: orr {{v[0-31]+}}.4h, #0xff
+ %tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 >
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) {
+;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8
+ %tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
+ ret <4 x i16> %tmp1
+}
+
+define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) {
+;CHECK: orr {{v[0-31]+}}.8h, #0xff
+ %tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) {
+;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8
+ %tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.2s, #0x10
+ %tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 >
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #8
+ %tmp1 = and <2 x i32> %a, < i32 4294963199, i32 4294963199 >
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #16
+ %tmp1 = and <2 x i32> %a, < i32 4293918719, i32 4293918719 >
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl24(<2 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #24
+ %tmp1 = and <2 x i32> %a, < i32 4026531839, i32 4026531839>
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.4s, #0x10
+ %tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #8
+ %tmp1 = and <4 x i32> %a, < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #16
+ %tmp1 = and <4 x i32> %a, < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl24(<4 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #24
+ %tmp1 = and <4 x i32> %a, < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839>
+ ret <4 x i32> %tmp1
+}
+
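The AND masks in the BIC-immediate tests are the complement of a single byte shifted into position; spelled out for #0x10 on a 32-bit lane (plain arithmetic, matching the constants used above):

; and-mask = ~(imm8 << shift)
;   lsl #0  : ~0x00000010 = 0xffffffef = 4294967279
;   lsl #8  : ~0x00001000 = 0xffffefff = 4294963199
;   lsl #16 : ~0x00100000 = 0xffefffff = 4293918719
;   lsl #24 : ~0x10000000 = 0xefffffff = 4026531839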
+define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.4h, #0x10
+ %tmp1 = and <4 x i16> %a, < i16 65519, i16 65519, i16 65519, i16 65519 >
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.4h, #0xff
+ %tmp1 = and <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.4h, #0x10, lsl #8
+ %tmp1 = and <4 x i16> %a, < i16 61439, i16 61439, i16 61439, i16 61439>
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.4h, #0xff, lsl #8
+ %tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255>
+ ret <4 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.8h, #0x10
+ %tmp1 = and <8 x i16> %a, < i16 65519, i16 65519, i16 65519, i16 65519,
+ i16 65519, i16 65519, i16 65519, i16 65519 >
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.8h, #0xff
+ %tmp1 = and <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.8h, #0x10, lsl #8
+ %tmp1 = and <8 x i16> %a, < i16 61439, i16 61439, i16 61439, i16 61439,
+ i16 61439, i16 61439, i16 61439, i16 61439>
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.8h, #0xff, lsl #8
+ %tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <2 x i32> %a, %b;
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <4 x i16> %a, %b;
+ ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <1 x i64> %a, %b;
+ ret <1 x i64> %tmp1
+}
+
+define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <4 x i32> %a, %b;
+ ret <4 x i32> %tmp1
+}
+
+define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <8 x i16> %a, %b;
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <2 x i64> %a, %b;
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = or <2 x i32> %a, %b;
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = or <4 x i16> %a, %b;
+ ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b)
{ +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <1 x i64> %a, %b; + ret <1 x i64> %tmp1 +} + +define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <4 x i32> %a, %b; + ret <4 x i32> %tmp1 +} + +define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <8 x i16> %a, %b; + ret <8 x i16> %tmp1 +} + +define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <2 x i64> %a, %b; + ret <2 x i64> %tmp1 +} + +define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %a, %b; + ret <2 x i32> %tmp1 +} + +define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <4 x i16> %a, %b; + ret <4 x i16> %tmp1 +} + +define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <1 x i64> %a, %b; + ret <1 x i64> %tmp1 +} + +define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <4 x i32> %a, %b; + ret <4 x i32> %tmp1 +} + +define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <8 x i16> %a, %b; + ret <8 x i16> %tmp1 +} + +define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <2 x i64> %a, %b; + ret <2 x i64> %tmp1 +} + + +define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 > + %tmp2 = and <2 x i32> %a, %tmp1 + ret <2 x i32> %tmp2 +} + +define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 > + %tmp2 = and <4 x i16> %a, %tmp1 + ret <4 x i16> %tmp2 +} + +define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <1 x i64> %b, < i64 -1> + %tmp2 = and <1 x i64> %a, %tmp1 + ret <1 x i64> %tmp2 +} + +define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1> + %tmp2 = and <4 x i32> %a, %tmp1 + ret <4 x i32> %tmp2 +} + +define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp2 = and <8 x i16> %a, %tmp1 + ret <8 x i16> %tmp2 +} + +define <2 x i64> @bic2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1> + %tmp2 = and <2 x i64> %a, %tmp1 + ret <2 x i64> %tmp2 +} + +define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 > + %tmp2 = or <2 x i32> %a, %tmp1 + ret <2 x i32> %tmp2 +} + +define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, 
{{v[0-31]+}}.8b
+ %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = or <4 x i16> %a, %tmp1
+ ret <4 x i16> %tmp2
+}
+
+define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <1 x i64> %b, < i64 -1>
+ %tmp2 = or <1 x i64> %a, %tmp1
+ ret <1 x i64> %tmp2
+}
+
+define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
+ %tmp2 = or <4 x i32> %a, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = or <8 x i16> %a, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
+ %tmp2 = or <2 x i64> %a, %tmp1
+ ret <2 x i64> %tmp2
+}
+define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <2 x i32> %a, < i32 -1, i32 -1 >
+ %tmp2 = and <2 x i32> %b, < i32 0, i32 0 >
+ %tmp3 = or <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+
+define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <4 x i16> %a, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = and <4 x i16> %b, < i16 0, i16 0, i16 0, i16 0 >
+ %tmp3 = or <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <1 x i64> %a, < i64 -1 >
+ %tmp2 = and <1 x i64> %b, < i64 0 >
+ %tmp3 = or <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <4 x i32> %a, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp2 = and <4 x i32> %b, < i32 0, i32 0, i32 0, i32 0 >
+ %tmp3 = or <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0 >
+ %tmp3 = or <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <2 x i64> %a, < i64 -1, i64 -1 >
+ %tmp2 = and <2 x i64> %b, < i64 0, i64 0 >
+ %tmp3 = or <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+
+define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %1 = and <8 x i8> %v1, %v2
+ %2 = xor <8 x i8> %v1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %3 = and <8 x i8> %2, %v3
+ %4 = or <8 x i8> %1, %3
+ ret <8 x i8> %4
+}
+
+define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %1 = and <4 x i16> %v1, %v2
+ %2 = xor <4 x i16> %v1, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %3 = and <4 x i16> %2, %v3
+ %4 = or <4 x i16> %1, %3
+ ret <4 x i16> %4
+}
+
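BSL computes Vd = (Vd & Vn) | (~Vd & Vm): whatever is already in Vd acts as the bit mask, selecting Vn bits where the mask bit is 1 and Vm bits where it is 0. The bsl tests in this file hand the selector that expanded and/xor/or form. A worked byte-level example (illustrative values only):

;   mask 0xf0, n 0xab, m 0xcd
;   (0xf0 & 0xab) | (0x0f & 0xcd) = 0xa0 | 0x0d = 0xad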
+define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %1 = and <2 x i32> %v1, %v2
+ %2 = xor <2 x i32> %v1, < i32 -1, i32 -1 >
+ %3 = and <2 x i32> %2, %v3
+ %4 = or <2 x i32> %1, %3
+ ret <2 x i32> %4
+}
+
+define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %1 = and <1 x i64> %v1, %v2
+ %2 = xor <1 x i64> %v1, < i64 -1 >
+ %3 = and <1 x i64> %2, %v3
+ %4 = or <1 x i64> %1, %3
+ ret <1 x i64> %4
+}
+
+define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %1 = and <16 x i8> %v1, %v2
+ %2 = xor <16 x i8> %v1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %3 = and <16 x i8> %2, %v3
+ %4 = or <16 x i8> %1, %3
+ ret <16 x i8> %4
+}
+
+define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %1 = and <8 x i16> %v1, %v2
+ %2 = xor <8 x i16> %v1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %3 = and <8 x i16> %2, %v3
+ %4 = or <8 x i16> %1, %3
+ ret <8 x i16> %4
+}
+
+define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %1 = and <4 x i32> %v1, %v2
+ %2 = xor <4 x i32> %v1, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %3 = and <4 x i32> %2, %v3
+ %4 = or <4 x i32> %1, %3
+ ret <4 x i32> %4
+}
+
+define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %1 = and <2 x i64> %v1, %v2
+ %2 = xor <2 x i64> %v1, < i64 -1, i64 -1 >
+ %3 = and <2 x i64> %2, %v3
+ %4 = or <2 x i64> %1, %3
+ ret <2 x i64> %4
+}
+
+define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) {
+;CHECK: orr {{v[0-31]+}}.4h, #0xff
+ %val = or <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0 >
+ ret <8 x i8> %val
+}
+
+define <8 x i8> @orrimm8b_as_orimm4h_lsl8(<8 x i8> %a) {
+;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8
+ %val = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255 >
+ ret <8 x i8> %val
+}
+
+define <16 x i8> @orimm16b_as_orrimm8h_lsl0(<16 x i8> %a) {
+;CHECK: orr {{v[0-31]+}}.8h, #0xff
+ %val = or <16 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0 >
+ ret <16 x i8> %val
+}
+
+define <16 x i8> @orimm16b_as_orrimm8h_lsl8(<16 x i8> %a) {
+;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8
+ %val = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255 >
+ ret <16 x i8> %val
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
new file mode 100644
index 00000000000..0848f9b03dd
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -0,0 +1,1982 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
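No single NEON compare covers every IR predicate, so the lowering exercised below combines and swaps instructions; as the comments on the individual tests note, the less-than family is obtained by reversing the operands of the greater-than instructions. A summary of the mapping:

; eq  -> cmeq                 ne  -> cmeq, then invert (movi #0xff; eor)
; sgt -> cmgt                 slt -> cmgt, operands swapped
; sge -> cmge                 sle -> cmge, operands swapped
; ugt -> cmhi                 ult -> cmhi, operands swapped
; uge -> cmhs                 ule -> cmhs, operands swapped
; (and %A, %B) != 0 -> cmtst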
+define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp eq <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp eq <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp eq <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp eq <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp eq <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp eq <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp eq <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp sgt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp sgt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp sgt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x
i16> %tmp4 +} + +define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp sgt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp sgt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp sgt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp sgt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp slt <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp slt <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp slt <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp slt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp slt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp slt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. 
+;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp slt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp sge <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp sge <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp sge <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp sge <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp sge <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp sge <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp sge <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp sle <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp sle <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp sle <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp sle <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp sle <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp sle <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp sle <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ugt <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ugt <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp ugt <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp ugt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp ugt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp ugt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp ugt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ult <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ult <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ult <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. 
+;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ult <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ult <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ult <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ult <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp uge <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp uge <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp uge <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp uge <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp uge <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp uge <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp uge <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ule <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ule <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ule <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ule <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ule <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ule <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ule <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = and <8 x i8> %A, %B + %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer + %tmp5 = sext <8 x i1> %tmp4 to <8 x i8> + ret <8 x i8> %tmp5 +} + +define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = and <16 x i8> %A, %B + %tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer + %tmp5 = sext <16 x i1> %tmp4 to <16 x i8> + ret <16 x i8> %tmp5 +} + +define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = and <4 x i16> %A, %B + %tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer + %tmp5 = sext <4 x i1> %tmp4 to <4 x i16> + ret <4 x i16> %tmp5 +} + +define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = and <8 x i16> %A, %B + %tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer + %tmp5 = sext <8 x i1> %tmp4 to <8 x i16> + ret <8 x i16> %tmp5 +} + +define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = and <2 x i32> %A, %B + %tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer + %tmp5 = sext <2 x i1> %tmp4 to <2 x i32> + ret <2 x i32> %tmp5 +} + +define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = and <4 x i32> %A, %B + %tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer + %tmp5 = sext <4 x i1> %tmp4 to <4 x i32> + ret <4 x i32> %tmp5 +} + +define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = and <2 x i64> %A, %B + %tmp4 = icmp ne <2 x i64> %tmp3, zeroinitializer + %tmp5 = sext <2 x i1> %tmp4 to <2 x i64> + ret <2 x i64> %tmp5 +} + + + +define <8 x i8> @cmeqz8xi8(<8 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp 
eq <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmeqz16xi8(<16 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp eq <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmeqz4xi16(<4 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp eq <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmeqz8xi16(<8 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp eq <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmeqz2xi32(<2 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp eq <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmeqz4xi32(<4 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp eq <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmeqz2xi64(<2 x i64> %A) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp eq <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmgez8xi8(<8 x i8> %A) { +;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp sge <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmgez16xi8(<16 x i8> %A) { +;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sge <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmgez4xi16(<4 x i16> %A) { +;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sge <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmgez8xi16(<8 x i16> %A) { +;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sge <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgez2xi32(<2 x i32> %A) { +;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sge <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgez4xi32(<4 x i32> %A) { +;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sge <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgez2xi64(<2 x i64> %A) { +;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sge <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmgtz8xi8(<8 x i8> %A) { +;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp sgt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmgtz16xi8(<16 x i8> %A) { +;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sgt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmgtz4xi16(<4 x i16> %A) { +;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sgt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x 
i16> %tmp4 +} + +define <8 x i16> @cmgtz8xi16(<8 x i16> %A) { +;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sgt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgtz2xi32(<2 x i32> %A) { +;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sgt <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgtz4xi32(<4 x i32> %A) { +;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sgt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgtz2xi64(<2 x i64> %A) { +;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sgt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlez8xi8(<8 x i8> %A) { +;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp sle <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlez16xi8(<16 x i8> %A) { +;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sle <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlez4xi16(<4 x i16> %A) { +;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sle <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlez8xi16(<8 x i16> %A) { +;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sle <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlez2xi32(<2 x i32> %A) { +;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sle <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlez4xi32(<4 x i32> %A) { +;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sle <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlez2xi64(<2 x i64> %A) { +;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sle <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmltz8xi8(<8 x i8> %A) { +;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp slt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmltz16xi8(<16 x i8> %A) { +;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp slt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmltz4xi16(<4 x i16> %A) { +;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp slt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmltz8xi16(<8 x i16> %A) { +;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp slt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmltz2xi32(<2 x i32> %A) { +;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp slt <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmltz4xi32(<4 x i32> %A) { +;CHECK: cmlt 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp slt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmltz2xi64(<2 x i64> %A) { +;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp slt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmneqz8xi8(<8 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmneqz16xi8(<16 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmneqz4xi16(<4 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmneqz8xi16(<8 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmneqz2xi32(<2 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmneqz4xi32(<4 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmneqz2xi64(<2 x i64> %A) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhsz8xi8(<8 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp uge <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhsz16xi8(<16 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp uge <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhsz4xi16(<4 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp uge <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhsz8xi16(<8 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs 
{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp uge <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhsz2xi32(<2 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp uge <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhsz4xi32(<4 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp uge <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhsz2xi64(<2 x i64> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp uge <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmhiz8xi8(<8 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ugt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhiz16xi8(<16 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ugt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhiz4xi16(<4 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp ugt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhiz8xi16(<8 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp ugt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhiz2xi32(<2 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp ugt <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhiz4xi32(<4 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp ugt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhiz2xi64(<2 x i64> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp ugt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlsz8xi8(<8 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ule <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlsz16xi8(<16 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. 
+;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ule <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlsz4xi16(<4 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ule <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlsz8xi16(<8 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ule <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlsz2xi32(<2 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ule <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlsz4xi32(<4 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ule <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlsz2xi64(<2 x i64> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ule <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmloz8xi8(<8 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ult <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmloz16xi8(<16 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ult <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmloz4xi16(<4 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ult <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmloz8xi16(<8 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. 
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ult <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmloz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ult <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmloz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ult <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ult <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmoeq2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fcmp oeq <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoeq4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fcmp oeq <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmoeq2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fcmp oeq <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmoge2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fcmp oge <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoge4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fcmp oge <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmoge2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fcmp oge <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmogt2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fcmp ogt <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmogt4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fcmp ogt <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmogt2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fcmp ogt <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
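+; Note on the operand-swapping tests in this file: two-register NEON compares
+; only come in the greater-than flavours (cmgt, cmge, cmhi, cmhs, fcmgt,
+; fcmge); the lt/le mnemonics exist only against #0. LT/LE/LO/LS are
+; therefore selected by swapping the operands of the matching GT-style
+; instruction. A minimal sketch for the unsigned less-than case (with %a in
+; v0 and %b in v1 under the calling convention):
+;   %cmp  = icmp ult <2 x i32> %a, %b        ; %a <u %b
+;   %mask = sext <2 x i1> %cmp to <2 x i32>  ; widen i1 lanes to a mask
+; is expected to select to
+;   cmhi v0.2s, v1.2s, v0.2s                 ; %b >u %a, operands reversed
+; with each result lane all-ones (true) or all-zeros (false).
+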
+define <2 x i32> @fcmole2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = fcmp ole <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmole4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = fcmp ole <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmole2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = fcmp ole <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmolt2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = fcmp olt <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmolt4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = fcmp olt <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmolt2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = fcmp olt <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmone2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp one <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmone4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ord <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + + +define <4 x i32> @fcmord4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uno <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ueq <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uge <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGT = ULT with swapped operands, ULT implemented as !OGE. 
+;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ugt <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGT = ULT with swapped operands, ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ule <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ult <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. 
+;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. +;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp une <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. +;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. +;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmoeqz2xfloat(<2 x float> %A) { +;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp oeq <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoeqz4xfloat(<4 x float> %A) { +;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp oeq <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) { +;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp oeq <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmogez2xfloat(<2 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp oge <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmogez4xfloat(<4 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp oge <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmogez2xdouble(<2 x double> %A) { +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp oge <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmogtz2xfloat(<2 x float> %A) { +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp ogt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmogtz4xfloat(<4 x float> %A) { +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp ogt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmogtz2xdouble(<2 x double> %A) { +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp ogt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> 
@fcmoltz2xfloat(<2 x float> %A) { +;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp olt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoltz4xfloat(<4 x float> %A) { +;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp olt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmoltz2xdouble(<2 x double> %A) { +;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp olt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmolez2xfloat(<2 x float> %A) { +;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp ole <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmolez4xfloat(<4 x float> %A) { +;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp ole <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmolez2xdouble(<2 x double> %A) { +;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp ole <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmonez2xfloat(<2 x float> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp one <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmonez4xfloat(<4 x float> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp one <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmonez2xdouble(<2 x double> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp one <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmordz2xfloat(<2 x float> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ord <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmordz4xfloat(<4 x float> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmordz2xdouble(<2 x double> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <2 x double> %A, zeroinitializer + %tmp4 = sext 
<2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ueq <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugez2xfloat(<2 x float> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uge <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugez4xfloat(<4 x float> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugez2xdouble(<2 x double> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ugt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, 
{{v[0-9]+}}.16b + %tmp3 = fcmp ugt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmultz2xfloat(<2 x float> %A) { +; ULT with zero = !OGE +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ult <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmultz4xfloat(<4 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmultz2xdouble(<2 x double> %A) { +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmulez2xfloat(<2 x float> %A) { +; ULE with zero = !OGT +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ule <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmulez4xfloat(<4 x float> %A) { +; ULE with zero = !OGT +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmulez2xdouble(<2 x double> %A) { +; ULE with zero = !OGT +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmunez2xfloat(<2 x float> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp une <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmunez4xfloat(<4 x float> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmunez2xdouble(<2 x double> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uno <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 + +} diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll new file mode 100644 index 00000000000..146256e4be1 --- /dev/null +++ b/test/CodeGen/AArch64/neon-facge-facgt.ll @@ -0,0 +1,56 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) +declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) +declare <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double>, <2 x double>) + +define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: facge_from_intr_v2i32: + %val = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %A, <2 x float> %B) +; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + ret <2 x i32> %val +} +define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: facge_from_intr_v4i32: + %val = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %A, <4 x float> %B) +; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + ret <4 x i32> %val +} + +define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: facge_from_intr_v2i64: + %val = call <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double> %A, <2 x double> %B) +; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + ret <2 x i64> %val +} + +declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) +declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) +declare <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double>, <2 x double>) + +define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. 
+; CHECK: facgt_from_intr_v2i32:
+ %val = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %A, <2 x float> %B)
+; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ ret <2 x i32> %val
+}
+define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: facgt_from_intr_v4i32:
+ %val = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %A, <4 x float> %B)
+; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ ret <4 x i32> %val
+}
+
+define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: facgt_from_intr_v2i64:
+ %val = call <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double> %A, <2 x double> %B)
+; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ ret <2 x i64> %val
+}
+
diff --git a/test/CodeGen/AArch64/neon-fma.ll b/test/CodeGen/AArch64/neon-fma.ll
new file mode 100644
index 00000000000..dcf4e287806
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-fma.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = fmul <2 x float> %A, %B
+ %tmp2 = fadd <2 x float> %C, %tmp1
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @fmla4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp1 = fmul <4 x float> %A, %B
+ %tmp2 = fadd <4 x float> %C, %tmp1
+ ret <4 x float> %tmp2
+}
+
+define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp1 = fmul <2 x double> %A, %B
+ %tmp2 = fadd <2 x double> %C, %tmp1
+ ret <2 x double> %tmp2
+}
+
+
+define <2 x float> @fmls2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = fmul <2 x float> %A, %B
+ %tmp2 = fsub <2 x float> %C, %tmp1
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @fmls4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp1 = fmul <4 x float> %A, %B
+ %tmp2 = fsub <4 x float> %C, %tmp1
+ ret <4 x float> %tmp2
+}
+
+define <2 x double> @fmls2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp1 = fmul <2 x double> %A, %B
+ %tmp2 = fsub <2 x double> %C, %tmp1
+ ret <2 x double> %tmp2
+}
+
+
+; Another set of tests for when the intrinsic is used.
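+; llvm.fma.* maps directly onto fmla when an accumulator is available, and
+; onto fmls when the first multiplicand is negated: fma(-a, b, c) == c - a*b,
+; which is exactly what fmls computes.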
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+define <2 x float> @fmla2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmla4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmla2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C)
+ ret <2 x double> %val
+}
+
+define <2 x float> @fmls2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %negA = fsub <2 x float> <float -0.0, float -0.0>, %A
+ %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %negA, <2 x float> %B, <2 x float> %C)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmls4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %negA = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %A
+ %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %negA, <4 x float> %B, <4 x float> %C)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmls2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %negA = fsub <2 x double> <double -0.0, double -0.0>, %A
+ %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %negA, <2 x double> %B, <2 x double> %C)
+ ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+define <2 x float> @fmuladd2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %val = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmuladd4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %val = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmuladd2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %val = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C)
+ ret <2 x double> %val
+}
diff --git a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
new file mode 100644
index 00000000000..46fe25d74d9
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; Set of tests for when the intrinsic is used.
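+; frsqrts and frecps produce the Newton-Raphson step values
+;   frsqrts(a, b) = (3.0 - a*b) / 2.0
+;   frecps(a, b)  =  2.0 - a*b
+; used to refine frsqrte/frecpe estimates of 1/sqrt(x) and 1/x.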
+ +declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @frsqrts_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frsqrts v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @frsqrts_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frsqrts v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @frsqrts_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frsqrts v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @frecps_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frecps v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @frecps_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frecps v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @frecps_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; CHECK: frecps v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} + diff --git a/test/CodeGen/AArch64/neon-halving-add-sub.ll b/test/CodeGen/AArch64/neon-halving-add-sub.ll new file mode 100644 index 00000000000..a8f59dbdb0a --- /dev/null +++ b/test/CodeGen/AArch64/neon-halving-add-sub.ll @@ -0,0 +1,207 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_shadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_shadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: shadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_shadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_shadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: shadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_shadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_shadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: shadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_shadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_shadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: shadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uhadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_shadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_shadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 
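+; Halving instructions compute (lhs + rhs) >> 1 with a double-width
+; intermediate, so the sum cannot wrap; shsub/uhsub below halve a
+; difference the same way.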
+; CHECK: shadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uhadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uhadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_shadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_shadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: shadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + + +declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uhsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uhsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uhsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_shsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_shsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: shsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uhsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uhsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uhsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_shsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_shsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: shsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uhsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uhsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uhsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_shsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_shsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: shsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uhsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uhsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uhsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_shsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_shsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: shsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uhsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uhsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uhsub v0.2s, 
v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_shsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_shsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: shsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uhsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uhsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uhsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_shsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_shsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: shsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll new file mode 100644 index 00000000000..d757aca86a6 --- /dev/null +++ b/test/CodeGen/AArch64/neon-max-min-pairwise.ll @@ -0,0 +1,310 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_smaxp_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smaxp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umaxp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smaxp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smaxp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umaxp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umaxp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smaxp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smaxp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umaxp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umaxp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smaxp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smaxp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 
x i16> @test_umaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umaxp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umaxp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_smaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smaxp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smaxp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umaxp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umaxp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smaxp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smaxp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umaxp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umaxp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_sminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_sminp_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sminp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_uminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uminp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_sminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sminp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sminp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_uminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uminp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uminp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_sminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sminp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sminp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_uminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uminp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uminp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16>, <8 x 
i16>) + +define <8 x i16> @test_sminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sminp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sminp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_uminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uminp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uminp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_sminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sminp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sminp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_uminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uminp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uminp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_sminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sminp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sminp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_uminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uminp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uminp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmaxp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxp_v2f32: + %val = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxp_v4f32: + %val = call <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxp_v2f64: + %val = call <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminp_v2f32: + %val = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminp_v4f32: + %val = call <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> 
@test_fminp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminp_v2f64: + %val = call <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmaxnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxnmp_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxnmp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxnmp_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxnmp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxnmp_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxnmp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminnmp_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminnmp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminnmp_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminnmp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminnmp_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminnmp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + diff --git a/test/CodeGen/AArch64/neon-max-min.ll b/test/CodeGen/AArch64/neon-max-min.ll new file mode 100644 index 00000000000..7889c77e37f --- /dev/null +++ b/test/CodeGen/AArch64/neon-max-min.ll @@ -0,0 +1,310 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. 
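+; These tests reuse the 32-bit ARM llvm.arm.neon.vmax/vmin intrinsics; the
+; AArch64 backend is expected to select smax/umax and smin/umin from them.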
+; CHECK: test_smax_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smax v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umax v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smax_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smax v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umax_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umax v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smax_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smax v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umax_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umax v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smax_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smax v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_umax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umax_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umax v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_smax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smax_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smax v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umax_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umax v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smax_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smax v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umax_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umax v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x 
i8>) +declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_smin_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smin v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umin v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smin_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smin v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umin_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umin v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smin_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smin v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umin_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umin v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smin_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smin v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_umin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umin_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umin v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_smin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smin_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smin v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umin_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umin v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smin_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smin v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umin_v4i32: + 
%tmp1 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umin v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmax_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmax_v2f32: + %val = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmax v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmax_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmax_v4f32: + %val = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmax v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmax_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmax_v2f64: + %val = call <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmax v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmin_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmin_v2f32: + %val = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmin v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmin_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmin_v4f32: + %val = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmin v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmin_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmin_v2f64: + %val = call <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmin v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + + +declare <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmaxnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxnm_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxnm v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxnm_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxnm v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxnm_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxnm v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminnm_v2f32: + %val = call <2 x 
float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminnm v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminnm_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminnm v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fminnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminnm_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminnm v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} diff --git a/test/CodeGen/AArch64/neon-mla-mls.ll b/test/CodeGen/AArch64/neon-mla-mls.ll new file mode 100644 index 00000000000..23e9223a8b7 --- /dev/null +++ b/test/CodeGen/AArch64/neon-mla-mls.ll @@ -0,0 +1,88 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + + +define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +;CHECK: mla {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = mul <8 x i8> %A, %B; + %tmp2 = add <8 x i8> %C, %tmp1; + ret <8 x i8> %tmp2 +} + +define <16 x i8> @mla16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +;CHECK: mla {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = mul <16 x i8> %A, %B; + %tmp2 = add <16 x i8> %C, %tmp1; + ret <16 x i8> %tmp2 +} + +define <4 x i16> @mla4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +;CHECK: mla {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp1 = mul <4 x i16> %A, %B; + %tmp2 = add <4 x i16> %C, %tmp1; + ret <4 x i16> %tmp2 +} + +define <8 x i16> @mla8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +;CHECK: mla {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp1 = mul <8 x i16> %A, %B; + %tmp2 = add <8 x i16> %C, %tmp1; + ret <8 x i16> %tmp2 +} + +define <2 x i32> @mla2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +;CHECK: mla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = mul <2 x i32> %A, %B; + %tmp2 = add <2 x i32> %C, %tmp1; + ret <2 x i32> %tmp2 +} + +define <4 x i32> @mla4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +;CHECK: mla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = mul <4 x i32> %A, %B; + %tmp2 = add <4 x i32> %C, %tmp1; + ret <4 x i32> %tmp2 +} + +define <8 x i8> @mls8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +;CHECK: mls {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = mul <8 x i8> %A, %B; + %tmp2 = sub <8 x i8> %C, %tmp1; + ret <8 x i8> %tmp2 +} + +define <16 x i8> @mls16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +;CHECK: mls {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = mul <16 x i8> %A, %B; + %tmp2 = sub <16 x i8> %C, %tmp1; + ret <16 x i8> %tmp2 +} + +define <4 x i16> @mls4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +;CHECK: mls {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp1 = mul <4 x i16> %A, %B; + %tmp2 = sub <4 x i16> %C, %tmp1; + ret <4 x i16> %tmp2 +} + +define <8 x i16> @mls8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +;CHECK: mls {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp1 = mul <8 x i16> %A, %B; + %tmp2 = sub <8 x i16> %C, %tmp1; + ret <8 x i16> %tmp2 +} + +define <2 x i32> @mls2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +;CHECK: mls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = mul <2 x i32> %A, %B; + %tmp2 = sub <2 x i32> %C, %tmp1; + ret <2 x i32> %tmp2 +} + +define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> 
%B, <4 x i32> %C) { +;CHECK: mls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = mul <4 x i32> %A, %B; + %tmp2 = sub <4 x i32> %C, %tmp1; + ret <4 x i32> %tmp2 +} + + diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll new file mode 100644 index 00000000000..42f6a894da6 --- /dev/null +++ b/test/CodeGen/AArch64/neon-mov.ll @@ -0,0 +1,205 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @movi8b() { +;CHECK: movi {{v[0-31]+}}.8b, #0x8 + ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > +} + +define <16 x i8> @movi16b() { +;CHECK: movi {{v[0-31]+}}.16b, #0x8 + ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > +} + +define <2 x i32> @movi2s_lsl0() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff + ret <2 x i32> < i32 255, i32 255 > +} + +define <2 x i32> @movi2s_lsl8() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #8 + ret <2 x i32> < i32 65280, i32 65280 > +} + +define <2 x i32> @movi2s_lsl16() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #16 + ret <2 x i32> < i32 16711680, i32 16711680 > + +} + +define <2 x i32> @movi2s_lsl24() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #24 + ret <2 x i32> < i32 4278190080, i32 4278190080 > +} + +define <4 x i32> @movi4s_lsl0() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff + ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 > +} + +define <4 x i32> @movi4s_lsl8() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #8 + ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 > +} + +define <4 x i32> @movi4s_lsl16() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #16 + ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 > + +} + +define <4 x i32> @movi4s_lsl24() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #24 + ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 > +} + +define <4 x i16> @movi4h_lsl0() { +;CHECK: movi {{v[0-31]+}}.4h, #0xff + ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 > +} + +define <4 x i16> @movi4h_lsl8() { +;CHECK: movi {{v[0-31]+}}.4h, #0xff, lsl #8 + ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 > +} + +define <8 x i16> @movi8h_lsl0() { +;CHECK: movi {{v[0-31]+}}.8h, #0xff + ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 > +} + +define <8 x i16> @movi8h_lsl8() { +;CHECK: movi {{v[0-31]+}}.8h, #0xff, lsl #8 + ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > +} + + +define <2 x i32> @mvni2s_lsl0() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10 + ret <2 x i32> < i32 4294967279, i32 4294967279 > +} + +define <2 x i32> @mvni2s_lsl8() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #8 + ret <2 x i32> < i32 4294963199, i32 4294963199 > +} + +define <2 x i32> @mvni2s_lsl16() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #16 + ret <2 x i32> < i32 4293918719, i32 4293918719 > +} + +define <2 x i32> @mvni2s_lsl24() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #24 + ret <2 x i32> < i32 4026531839, i32 4026531839 > +} + +define <4 x i32> @mvni4s_lsl0() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10 + ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 > +} + +define <4 x i32> @mvni4s_lsl8() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #8 + ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 > +} + +define <4 x i32> @mvni4s_lsl16() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #16 + ret <4 x i32> < i32 
4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
+
+}
+
+define <4 x i32> @mvni4s_lsl24() {
+;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #24
+ ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 >
+}
+
+
+define <4 x i16> @mvni4h_lsl0() {
+;CHECK: mvni {{v[0-31]+}}.4h, #0x10
+ ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
+}
+
+define <4 x i16> @mvni4h_lsl8() {
+;CHECK: mvni {{v[0-31]+}}.4h, #0x10, lsl #8
+ ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
+}
+
+define <8 x i16> @mvni8h_lsl0() {
+;CHECK: mvni {{v[0-31]+}}.8h, #0x10
+ ret <8 x i16> < i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 >
+}
+
+define <8 x i16> @mvni8h_lsl8() {
+;CHECK: mvni {{v[0-31]+}}.8h, #0x10, lsl #8
+ ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 >
+}
+
+
+define <2 x i32> @movi2s_msl8(<2 x i32> %a) {
+;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #8
+ ret <2 x i32> < i32 65535, i32 65535 >
+}
+
+define <2 x i32> @movi2s_msl16() {
+;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #16
+ ret <2 x i32> < i32 16777215, i32 16777215 >
+}
+
+
+define <4 x i32> @movi4s_msl8() {
+;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #8
+ ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 >
+}
+
+define <4 x i32> @movi4s_msl16() {
+;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #16
+ ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 >
+}
+
+define <2 x i32> @mvni2s_msl8() {
+;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #8
+ ret <2 x i32> < i32 4294962944, i32 4294962944>
+}
+
+define <2 x i32> @mvni2s_msl16() {
+;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #16
+ ret <2 x i32> < i32 4293853184, i32 4293853184>
+}
+
+define <4 x i32> @mvni4s_msl8() {
+;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #8
+ ret <4 x i32> < i32 4294962944, i32 4294962944, i32 4294962944, i32 4294962944>
+}
+
+define <4 x i32> @mvni4s_msl16() {
+;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #16
+ ret <4 x i32> < i32 4293853184, i32 4293853184, i32 4293853184, i32 4293853184>
+}
+
+define <2 x i64> @movi2d() {
+;CHECK: movi {{v[0-31]+}}.2d, #0xff0000ff0000ffff
+ ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
+}
+
+define <1 x i64> @movid() {
+;CHECK: movi {{d[0-31]+}}, #0xff0000ff0000ffff
+ ret <1 x i64> < i64 18374687574888349695 >
+}
+
+define <2 x float> @fmov2s() {
+;CHECK: fmov {{v[0-31]+}}.2s, #-12.00000000
+ ret <2 x float> < float -1.2e1, float -1.2e1>
+}
+
+define <4 x float> @fmov4s() {
+;CHECK: fmov {{v[0-31]+}}.4s, #-12.00000000
+ ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1>
+}
+
+define <2 x double> @fmov2d() {
+;CHECK: fmov {{v[0-31]+}}.2d, #-12.00000000
+ ret <2 x double> < double -1.2e1, double -1.2e1>
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll
new file mode 100644
index 00000000000..e1be3132663
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-mul-div.ll
@@ -0,0 +1,181 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: mul {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp3 = mul <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: mul {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp3 =
mul <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: mul {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp3 = mul <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: mul {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp3 = mul <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: mul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = mul <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: mul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = mul <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + + define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fmul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fmul <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fmul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fmul <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fmul {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fmul <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + + + define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fdiv {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fdiv <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fdiv {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fdiv <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fdiv {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fdiv <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) +declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) + +define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: poly_mulv8i8: + %prod = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: pmul v0.8b, v0.8b, v1.8b + ret <8 x i8> %prod +} + +define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: poly_mulv16i8: + %prod = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: pmul v0.16b, v0.16b, v1.16b + ret <16 x i8> %prod +} + +declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqdmulh_v4i16: + %prod = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqdmulh v0.4h, v0.4h, v1.4h + ret <4 x i16> %prod +} + +define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqdmulh_v8i16: + %prod = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqdmulh v0.8h, v0.8h, v1.8h + ret <8 x i16> %prod +} + +define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqdmulh_v2i32: + %prod = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqdmulh v0.2s, v0.2s, 
v1.2s + ret <2 x i32> %prod +} + +define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqdmulh_v4i32: + %prod = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqdmulh v0.4s, v0.4s, v1.4s + ret <4 x i32> %prod +} + +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqrdmulh_v4i16: + %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h + ret <4 x i16> %prod +} + +define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqrdmulh_v8i16: + %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h + ret <8 x i16> %prod +} + +define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqrdmulh_v2i32: + %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s + ret <2 x i32> %prod +} + +define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqrdmulh_v4i32: + %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s + ret <4 x i32> %prod +} + +declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: fmulx v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: fmulx v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. 
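+; fmulx behaves like fmul except that 0.0 * infinity returns +/-2.0 instead
+; of NaN, hence the dedicated llvm.aarch64.neon.vmulx intrinsic.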
+; CHECK: fmulx v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} diff --git a/test/CodeGen/AArch64/neon-rounding-halving-add.ll b/test/CodeGen/AArch64/neon-rounding-halving-add.ll new file mode 100644 index 00000000000..009da3b51a8 --- /dev/null +++ b/test/CodeGen/AArch64/neon-rounding-halving-add.ll @@ -0,0 +1,105 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_urhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_urhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: urhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_srhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_srhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: srhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_urhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_urhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: urhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_srhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_srhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: srhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_urhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_urhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: urhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_srhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_srhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: srhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_urhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_urhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: urhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_srhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_srhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: srhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_urhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_urhadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: urhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_srhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_srhadd_v2i32: + %tmp1 = call <2 x 
i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: srhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_urhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_urhadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: urhadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_srhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_srhadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: srhadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + + diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll new file mode 100644 index 00000000000..404e49185e6 --- /dev/null +++ b/test/CodeGen/AArch64/neon-rounding-shift.ll @@ -0,0 +1,138 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_urshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_urshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: urshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_srshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_srshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: srshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_urshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_urshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: urshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_srshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_srshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: srshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_urshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_urshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: urshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_srshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_srshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: srshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_urshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_urshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: urshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_srshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_srshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %lhs, <8 x 
i16> %rhs) +; CHECK: srshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_urshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_urshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: urshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_srshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_srshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: srshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_urshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_urshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: urshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_srshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: srshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_urshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: urshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_srshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: srshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_urshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_urshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: urshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_srshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_srshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: srshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll new file mode 100644 index 00000000000..b2fac1fbc1a --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-add-sub.ll @@ -0,0 +1,274 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + 
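+; Editorial note (illustrative, not part of the original patch): the
+; saturating forms clamp at the lane's type bounds instead of wrapping.
+; For i8 lanes, uqadd of 200 and 100 yields 255 (the unsigned maximum)
+; where a plain add would wrap to 44, and sqadd of 100 and 100 yields
+; 127 (the signed maximum) rather than -56.
+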
+declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqadd_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: uqadd d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 
x i64> @test_sqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqadd_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sqadd d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqadd_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqadd v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqadd_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqadd v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqsub_v2i32(<2 x i32> 
%lhs, <2 x i32> %rhs) { +; CHECK: test_uqsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqsub_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqsub v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqsub_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqsub v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqsub_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: uqsub d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqsub_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sqsub d0, d0, d1 + ret <1 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll new file mode 100644 index 00000000000..05d8dfea9de --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll @@ -0,0 +1,138 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqrshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqrshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqrshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqrshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqrshl_v16i8: + %tmp1 = 
call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqrshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqrshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqrshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqrshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqrshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqrshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqrshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqrshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqrshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqrshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqrshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqrshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqrshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqrshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqrshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqrshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqrshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqrshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqrshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqrshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: uqrshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqrshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> 
%lhs, <1 x i64> %rhs) +; CHECK: sqrshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqrshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqrshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqrshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqrshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll new file mode 100644 index 00000000000..3b7f78cc799 --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-shift.ll @@ -0,0 +1,138 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqshl v0.8h, v0.8h, 
v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll
new file mode 100644
index 00000000000..45a26057996
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-shift.ll
@@ -0,0 +1,140 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_ushl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_ushl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: ushl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16
x i8>) +declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_ushl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_ushl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: ushl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_ushl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_ushl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: ushl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_ushl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_ushl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: ushl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_ushl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_ushl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: ushl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_ushl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_ushl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: ushl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_ushl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: ushl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: 
test_sshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_ushl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_ushl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: ushl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + + + diff --git a/test/MC/AArch64/basic-a64-diagnostics.s b/test/MC/AArch64/basic-a64-diagnostics.s index 1e9024c5eed..2e6e0bbd387 100644 --- a/test/MC/AArch64/basic-a64-diagnostics.s +++ b/test/MC/AArch64/basic-a64-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 < %s 2> %t +// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s //------------------------------------------------------------------------------ @@ -2892,13 +2892,13 @@ movi wzr, #0x44444444 movi w3, #0xffff movi x9, #0x0000ffff00000000 -// CHECK-ERROR: error: invalid instruction +// CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR-NEXT: movi wzr, #0x44444444 // CHECK-ERROR-NEXT: ^ -// CHECK-ERROR: error: invalid instruction +// CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR-NEXT: movi w3, #0xffff // CHECK-ERROR-NEXT: ^ -// CHECK-ERROR: error: invalid instruction +// CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR-NEXT: movi x9, #0x0000ffff00000000 // CHECK-ERROR-NEXT: ^ diff --git a/test/MC/AArch64/basic-a64-instructions.s b/test/MC/AArch64/basic-a64-instructions.s index ad3064e5e52..e4f6b218924 100644 --- a/test/MC/AArch64/basic-a64-instructions.s +++ b/test/MC/AArch64/basic-a64-instructions.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding < %s | FileCheck %s .globl _func // Check that the assembler can handle the documented syntax from the ARM ARM. 
diff --git a/test/MC/AArch64/neon-aba-abd.s b/test/MC/AArch64/neon-aba-abd.s new file mode 100644 index 00000000000..178eb26f64c --- /dev/null +++ b/test/MC/AArch64/neon-aba-abd.s @@ -0,0 +1,78 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//---------------------------------------------------------------------- +// Vector Absolute Difference and Accumulate (Signed, Unsigned) +//---------------------------------------------------------------------- + uaba v0.8b, v1.8b, v2.8b + uaba v0.16b, v1.16b, v2.16b + uaba v0.4h, v1.4h, v2.4h + uaba v0.8h, v1.8h, v2.8h + uaba v0.2s, v1.2s, v2.2s + uaba v0.4s, v1.4s, v2.4s + +// CHECK: uaba v0.8b, v1.8b, v2.8b // encoding: [0x20,0x7c,0x22,0x2e] +// CHECK: uaba v0.16b, v1.16b, v2.16b // encoding: [0x20,0x7c,0x22,0x6e] +// CHECK: uaba v0.4h, v1.4h, v2.4h // encoding: [0x20,0x7c,0x62,0x2e] +// CHECK: uaba v0.8h, v1.8h, v2.8h // encoding: [0x20,0x7c,0x62,0x6e] +// CHECK: uaba v0.2s, v1.2s, v2.2s // encoding: [0x20,0x7c,0xa2,0x2e] +// CHECK: uaba v0.4s, v1.4s, v2.4s // encoding: [0x20,0x7c,0xa2,0x6e] + + + saba v0.8b, v1.8b, v2.8b + saba v0.16b, v1.16b, v2.16b + saba v0.4h, v1.4h, v2.4h + saba v0.8h, v1.8h, v2.8h + saba v0.2s, v1.2s, v2.2s + saba v0.4s, v1.4s, v2.4s + +// CHECK: saba v0.8b, v1.8b, v2.8b // encoding: [0x20,0x7c,0x22,0x0e] +// CHECK: saba v0.16b, v1.16b, v2.16b // encoding: [0x20,0x7c,0x22,0x4e] +// CHECK: saba v0.4h, v1.4h, v2.4h // encoding: [0x20,0x7c,0x62,0x0e] +// CHECK: saba v0.8h, v1.8h, v2.8h // encoding: [0x20,0x7c,0x62,0x4e] +// CHECK: saba v0.2s, v1.2s, v2.2s // encoding: [0x20,0x7c,0xa2,0x0e] +// CHECK: saba v0.4s, v1.4s, v2.4s // encoding: [0x20,0x7c,0xa2,0x4e] + +//---------------------------------------------------------------------- +// Vector Absolute Difference (Signed, Unsigned) +//---------------------------------------------------------------------- + uabd v0.8b, v1.8b, v2.8b + uabd v0.16b, v1.16b, v2.16b + uabd v0.4h, v1.4h, v2.4h + uabd v0.8h, v1.8h, v2.8h + uabd v0.2s, v1.2s, v2.2s + uabd v0.4s, v1.4s, v2.4s + +// CHECK: uabd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x74,0x22,0x2e] +// CHECK: uabd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x74,0x22,0x6e] +// CHECK: uabd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x74,0x62,0x2e] +// CHECK: uabd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x74,0x62,0x6e] +// CHECK: uabd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x74,0xa2,0x2e] +// CHECK: uabd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x74,0xa2,0x6e] + + sabd v0.8b, v1.8b, v2.8b + sabd v0.16b, v1.16b, v2.16b + sabd v0.4h, v1.4h, v2.4h + sabd v0.8h, v1.8h, v2.8h + sabd v0.2s, v1.2s, v2.2s + sabd v0.4s, v1.4s, v2.4s + +// CHECK: sabd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x74,0x22,0x0e] +// CHECK: sabd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x74,0x22,0x4e] +// CHECK: sabd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x74,0x62,0x0e] +// CHECK: sabd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x74,0x62,0x4e] +// CHECK: sabd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x74,0xa2,0x0e] +// CHECK: sabd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x74,0xa2,0x4e] + +//---------------------------------------------------------------------- +// Vector Absolute Difference (Floating Point) +//---------------------------------------------------------------------- + fabd v0.2s, v1.2s, v2.2s + fabd v31.4s, v15.4s, v16.4s + fabd v7.2d, v8.2d, v25.2d + +// CHECK: fabd v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0xa2,0x2e] +// CHECK: fabd v31.4s, 
v15.4s, v16.4s // encoding: [0xff,0xd5,0xb0,0x6e]
+// CHECK: fabd v7.2d, v8.2d, v25.2d // encoding: [0x07,0xd5,0xf9,0x6e]
+
diff --git a/test/MC/AArch64/neon-add-pairwise.s b/test/MC/AArch64/neon-add-pairwise.s
new file mode 100644
index 00000000000..b586c225485
--- /dev/null
+++ b/test/MC/AArch64/neon-add-pairwise.s
@@ -0,0 +1,35 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Add Pairwise (Integer)
+//------------------------------------------------------------------------------
+ addp v0.8b, v1.8b, v2.8b
+ addp v0.16b, v1.16b, v2.16b
+ addp v0.4h, v1.4h, v2.4h
+ addp v0.8h, v1.8h, v2.8h
+ addp v0.2s, v1.2s, v2.2s
+ addp v0.4s, v1.4s, v2.4s
+ addp v0.2d, v1.2d, v2.2d
+
+// CHECK: addp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xbc,0x22,0x0e]
+// CHECK: addp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xbc,0x22,0x4e]
+// CHECK: addp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xbc,0x62,0x0e]
+// CHECK: addp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xbc,0x62,0x4e]
+// CHECK: addp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xbc,0xa2,0x0e]
+// CHECK: addp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xbc,0xa2,0x4e]
+// CHECK: addp v0.2d, v1.2d, v2.2d // encoding: [0x20,0xbc,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Add Pairwise (Floating Point)
+//------------------------------------------------------------------------------
+ faddp v0.2s, v1.2s, v2.2s
+ faddp v0.4s, v1.4s, v2.4s
+ faddp v0.2d, v1.2d, v2.2d
+
+// CHECK: faddp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0x22,0x2e]
+// CHECK: faddp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0x22,0x6e]
+// CHECK: faddp v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0x62,0x6e]
+
diff --git a/test/MC/AArch64/neon-add-sub-instructions.s b/test/MC/AArch64/neon-add-sub-instructions.s
new file mode 100644
index 00000000000..863798eaf0d
--- /dev/null
+++ b/test/MC/AArch64/neon-add-sub-instructions.s
@@ -0,0 +1,82 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Add
+//------------------------------------------------------------------------------
+ add v0.8b, v1.8b, v2.8b
+ add v0.16b, v1.16b, v2.16b
+ add v0.4h, v1.4h, v2.4h
+ add v0.8h, v1.8h, v2.8h
+ add v0.2s, v1.2s, v2.2s
+ add v0.4s, v1.4s, v2.4s
+ add v0.2d, v1.2d, v2.2d
+
+// CHECK: add v0.8b, v1.8b, v2.8b // encoding: [0x20,0x84,0x22,0x0e]
+// CHECK: add v0.16b, v1.16b, v2.16b // encoding: [0x20,0x84,0x22,0x4e]
+// CHECK: add v0.4h, v1.4h, v2.4h // encoding: [0x20,0x84,0x62,0x0e]
+// CHECK: add v0.8h, v1.8h, v2.8h // encoding: [0x20,0x84,0x62,0x4e]
+// CHECK: add v0.2s, v1.2s, v2.2s // encoding: [0x20,0x84,0xa2,0x0e]
+// CHECK: add v0.4s, v1.4s, v2.4s // encoding: [0x20,0x84,0xa2,0x4e]
+// CHECK: add v0.2d, v1.2d, v2.2d // encoding: [0x20,0x84,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Sub
+//------------------------------------------------------------------------------
+ sub v0.8b, v1.8b, v2.8b
+ sub v0.16b, v1.16b, v2.16b
+ sub v0.4h, v1.4h, v2.4h
+ sub v0.8h, v1.8h, v2.8h
+ sub v0.2s, v1.2s, v2.2s
+ sub v0.4s, v1.4s, v2.4s
+ sub v0.2d,
v1.2d, v2.2d + +// CHECK: sub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x84,0x22,0x2e] +// CHECK: sub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x84,0x22,0x6e] +// CHECK: sub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x84,0x62,0x2e] +// CHECK: sub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x84,0x62,0x6e] +// CHECK: sub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x84,0xa2,0x2e] +// CHECK: sub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x84,0xa2,0x6e] +// CHECK: sub v0.2d, v1.2d, v2.2d // encoding: [0x20,0x84,0xe2,0x6e] + +//------------------------------------------------------------------------------ +// Vector Floating-Point Add +//------------------------------------------------------------------------------ + fadd v0.2s, v1.2s, v2.2s + fadd v0.4s, v1.4s, v2.4s + fadd v0.2d, v1.2d, v2.2d + +// CHECK: fadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0x22,0x0e] +// CHECK: fadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0x22,0x4e] +// CHECK: fadd v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0x62,0x4e] + + +//------------------------------------------------------------------------------ +// Vector Floating-Point Sub +//------------------------------------------------------------------------------ + fsub v0.2s, v1.2s, v2.2s + fsub v0.4s, v1.4s, v2.4s + fsub v0.2d, v1.2d, v2.2d + +// CHECK: fsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0xa2,0x0e] +// CHECK: fsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0xa2,0x4e] +// CHECK: fsub v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0xe2,0x4e] + +//------------------------------------------------------------------------------ +// Scalar Integer Add +//------------------------------------------------------------------------------ + add d31, d0, d16 + +// CHECK: add d31, d0, d16 // encoding: [0x1f,0x84,0xf0,0x5e] + +//------------------------------------------------------------------------------ +// Scalar Integer Sub +//------------------------------------------------------------------------------ + sub d1, d7, d8 + +// CHECK: sub d1, d7, d8 // encoding: [0xe1,0x84,0xe8,0x7e] + + + diff --git a/test/MC/AArch64/neon-bitwise-instructions.s b/test/MC/AArch64/neon-bitwise-instructions.s new file mode 100644 index 00000000000..79d0a9b70b5 --- /dev/null +++ b/test/MC/AArch64/neon-bitwise-instructions.s @@ -0,0 +1,60 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//------------------------------------------------------------------------------ +// Vector And +//------------------------------------------------------------------------------ + and v0.8b, v1.8b, v2.8b + and v0.16b, v1.16b, v2.16b + +// CHECK: and v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0x22,0x0e] +// CHECK: and v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0x22,0x4e] + + +//------------------------------------------------------------------------------ +// Vector Orr +//------------------------------------------------------------------------------ + orr v0.8b, v1.8b, v2.8b + orr v0.16b, v1.16b, v2.16b + +// CHECK: orr v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0xa2,0x0e] +// CHECK: orr v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0xa2,0x4e] + + +//------------------------------------------------------------------------------ +// Vector Eor +//------------------------------------------------------------------------------ + eor v0.8b, v1.8b, v2.8b + eor v0.16b, v1.16b, v2.16b + +// CHECK: eor v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0x22,0x2e] +// CHECK: eor v0.16b, v1.16b, 
v2.16b // encoding: [0x20,0x1c,0x22,0x6e] + + +//---------------------------------------------------------------------- +// Vector Bitwise +//---------------------------------------------------------------------- + + bit v0.8b, v1.8b, v2.8b + bit v0.16b, v1.16b, v2.16b + bif v0.8b, v1.8b, v2.8b + bif v0.16b, v1.16b, v2.16b + bsl v0.8b, v1.8b, v2.8b + bsl v0.16b, v1.16b, v2.16b + orn v0.8b, v1.8b, v2.8b + orn v0.16b, v1.16b, v2.16b + bic v0.8b, v1.8b, v2.8b + bic v0.16b, v1.16b, v2.16b + +// CHECK: bit v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0xa2,0x2e] +// CHECK: bit v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0xa2,0x6e] +// CHECK: bif v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0xe2,0x2e] +// CHECK: bif v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0xe2,0x6e] +// CHECK: bsl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0x62,0x2e] +// CHECK: bsl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0x62,0x6e] +// CHECK: orn v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0xe2,0x0e] +// CHECK: orn v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0xe2,0x4e] +// CHECK: bic v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0x62,0x0e] +// CHECK: bic v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0x62,0x4e] + diff --git a/test/MC/AArch64/neon-compare-instructions.s b/test/MC/AArch64/neon-compare-instructions.s new file mode 100644 index 00000000000..e4bc2025835 --- /dev/null +++ b/test/MC/AArch64/neon-compare-instructions.s @@ -0,0 +1,405 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//---------------------------------------------------------------------- +// Vector Compare Mask Equal (Integer) +//---------------------------------------------------------------------- + + cmeq v0.8b, v15.8b, v17.8b + cmeq v1.16b, v31.16b, v8.16b + cmeq v15.4h, v16.4h, v17.4h + cmeq v5.8h, v6.8h, v7.8h + cmeq v29.2s, v27.2s, v28.2s + cmeq v9.4s, v7.4s, v8.4s + cmeq v3.2d, v31.2d, v21.2d + +// CHECK: cmeq v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x8d,0x31,0x2e] +// CHECK: cmeq v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x8f,0x28,0x6e] +// CHECK: cmeq v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x8e,0x71,0x2e] +// CHECK: cmeq v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x8c,0x67,0x6e] +// CHECK: cmeq v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x8f,0xbc,0x2e] +// CHECK: cmeq v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x8c,0xa8,0x6e] +// CHECK: cmeq v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x8f,0xf5,0x6e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Higher or Same (Unsigned Integer) +// Vector Compare Mask Less or Same (Unsigned Integer) +// CMLS is alias for CMHS with operands reversed. 
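+// For example, "cmls v0.8b, v17.8b, v15.8b" below is accepted with the same
+// encoding as "cmhs v0.8b, v15.8b, v17.8b", which is why the CHECK lines for
+// the cmls group simply repeat the cmhs forms.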
+//---------------------------------------------------------------------- + + cmhs v0.8b, v15.8b, v17.8b + cmhs v1.16b, v31.16b, v8.16b + cmhs v15.4h, v16.4h, v17.4h + cmhs v5.8h, v6.8h, v7.8h + cmhs v29.2s, v27.2s, v28.2s + cmhs v9.4s, v7.4s, v8.4s + cmhs v3.2d, v31.2d, v21.2d + + cmls v0.8b, v17.8b, v15.8b + cmls v1.16b, v8.16b, v31.16b + cmls v15.4h, v17.4h, v16.4h + cmls v5.8h, v7.8h, v6.8h + cmls v29.2s, v28.2s, v27.2s + cmls v9.4s, v8.4s, v7.4s + cmls v3.2d, v21.2d, v31.2d + +// CHECK: cmhs v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x3d,0x31,0x2e] +// CHECK: cmhs v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x6e] +// CHECK: cmhs v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x3e,0x71,0x2e] +// CHECK: cmhs v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x3c,0x67,0x6e] +// CHECK: cmhs v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x3f,0xbc,0x2e] +// CHECK: cmhs v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x3c,0xa8,0x6e] +// CHECK: cmhs v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x3f,0xf5,0x6e] +// CHECK: cmhs v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x3d,0x31,0x2e] +// CHECK: cmhs v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x6e] +// CHECK: cmhs v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x3e,0x71,0x2e] +// CHECK: cmhs v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x3c,0x67,0x6e] +// CHECK: cmhs v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x3f,0xbc,0x2e] +// CHECK: cmhs v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x3c,0xa8,0x6e] +// CHECK: cmhs v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x3f,0xf5,0x6e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than or Equal (Integer) +// Vector Compare Mask Less Than or Equal (Integer) +// CMLE is alias for CMGE with operands reversed. +//---------------------------------------------------------------------- + + cmge v0.8b, v15.8b, v17.8b + cmge v1.16b, v31.16b, v8.16b + cmge v15.4h, v16.4h, v17.4h + cmge v5.8h, v6.8h, v7.8h + cmge v29.2s, v27.2s, v28.2s + cmge v9.4s, v7.4s, v8.4s + cmge v3.2d, v31.2d, v21.2d + + cmle v0.8b, v17.8b, v15.8b + cmle v1.16b, v8.16b, v31.16b + cmle v15.4h, v17.4h, v16.4h + cmle v5.8h, v7.8h, v6.8h + cmle v29.2s, v28.2s, v27.2s + cmle v9.4s, v8.4s, v7.4s + cmle v3.2d, v21.2d, v31.2d + +// CHECK: cmge v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x3d,0x31,0x0e] +// CHECK: cmge v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x4e] +// CHECK: cmge v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x3e,0x71,0x0e] +// CHECK: cmge v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x3c,0x67,0x4e] +// CHECK: cmge v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x3f,0xbc,0x0e] +// CHECK: cmge v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x3c,0xa8,0x4e] +// CHECK: cmge v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x3f,0xf5,0x4e] +// CHECK: cmge v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x3d,0x31,0x0e] +// CHECK: cmge v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x4e] +// CHECK: cmge v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x3e,0x71,0x0e] +// CHECK: cmge v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x3c,0x67,0x4e] +// CHECK: cmge v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x3f,0xbc,0x0e] +// CHECK: cmge v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x3c,0xa8,0x4e] +// CHECK: cmge v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x3f,0xf5,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Higher (Unsigned Integer) +// Vector Compare Mask Lower (Unsigned Integer) +// CMLO is alias for CMHI with operands reversed. 
+//---------------------------------------------------------------------- + + cmhi v0.8b, v15.8b, v17.8b + cmhi v1.16b, v31.16b, v8.16b + cmhi v15.4h, v16.4h, v17.4h + cmhi v5.8h, v6.8h, v7.8h + cmhi v29.2s, v27.2s, v28.2s + cmhi v9.4s, v7.4s, v8.4s + cmhi v3.2d, v31.2d, v21.2d + + cmlo v0.8b, v17.8b, v15.8b + cmlo v1.16b, v8.16b, v31.16b + cmlo v15.4h, v17.4h, v16.4h + cmlo v5.8h, v7.8h, v6.8h + cmlo v29.2s, v28.2s, v27.2s + cmlo v9.4s, v8.4s, v7.4s + cmlo v3.2d, v21.2d, v31.2d + +// CHECK: cmhi v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x35,0x31,0x2e] +// CHECK: cmhi v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x37,0x28,0x6e] +// CHECK: cmhi v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x36,0x71,0x2e] +// CHECK: cmhi v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x34,0x67,0x6e] +// CHECK: cmhi v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x37,0xbc,0x2e] +// CHECK: cmhi v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x34,0xa8,0x6e] +// CHECK: cmhi v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x37,0xf5,0x6e] +// CHECK: cmhi v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x35,0x31,0x2e] +// CHECK: cmhi v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x37,0x28,0x6e] +// CHECK: cmhi v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x36,0x71,0x2e] +// CHECK: cmhi v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x34,0x67,0x6e] +// CHECK: cmhi v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x37,0xbc,0x2e] +// CHECK: cmhi v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x34,0xa8,0x6e] +// CHECK: cmhi v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x37,0xf5,0x6e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than (Integer) +// Vector Compare Mask Less Than (Integer) +// CMLT is alias for CMGT with operands reversed. +//---------------------------------------------------------------------- + + cmgt v0.8b, v15.8b, v17.8b + cmgt v1.16b, v31.16b, v8.16b + cmgt v15.4h, v16.4h, v17.4h + cmgt v5.8h, v6.8h, v7.8h + cmgt v29.2s, v27.2s, v28.2s + cmgt v9.4s, v7.4s, v8.4s + cmgt v3.2d, v31.2d, v21.2d + + cmlt v0.8b, v17.8b, v15.8b + cmlt v1.16b, v8.16b, v31.16b + cmlt v15.4h, v17.4h, v16.4h + cmlt v5.8h, v7.8h, v6.8h + cmlt v29.2s, v28.2s, v27.2s + cmlt v9.4s, v8.4s, v7.4s + cmlt v3.2d, v21.2d, v31.2d + +// CHECK: cmgt v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x35,0x31,0x0e] +// CHECK: cmgt v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x37,0x28,0x4e] +// CHECK: cmgt v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x36,0x71,0x0e] +// CHECK: cmgt v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x34,0x67,0x4e] +// CHECK: cmgt v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x37,0xbc,0x0e] +// CHECK: cmgt v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x34,0xa8,0x4e] +// CHECK: cmgt v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x37,0xf5,0x4e] +// CHECK: cmgt v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x35,0x31,0x0e] +// CHECK: cmgt v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x37,0x28,0x4e] +// CHECK: cmgt v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x36,0x71,0x0e] +// CHECK: cmgt v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x34,0x67,0x4e] +// CHECK: cmgt v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x37,0xbc,0x0e] +// CHECK: cmgt v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x34,0xa8,0x4e] +// CHECK: cmgt v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x37,0xf5,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Bitwise Test (Integer) +//---------------------------------------------------------------------- + + cmtst v0.8b, v15.8b, v17.8b + cmtst v1.16b, v31.16b, v8.16b + cmtst v15.4h, v16.4h, v17.4h + cmtst v5.8h, v6.8h, v7.8h 
+ cmtst v29.2s, v27.2s, v28.2s + cmtst v9.4s, v7.4s, v8.4s + cmtst v3.2d, v31.2d, v21.2d + +// CHECK: cmtst v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x8d,0x31,0x0e] +// CHECK: cmtst v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x8f,0x28,0x4e] +// CHECK: cmtst v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x8e,0x71,0x0e] +// CHECK: cmtst v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x8c,0x67,0x4e] +// CHECK: cmtst v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x8f,0xbc,0x0e] +// CHECK: cmtst v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x8c,0xa8,0x4e] +// CHECK: cmtst v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x8f,0xf5,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Equal (Floating Point) +//---------------------------------------------------------------------- + + fcmeq v0.2s, v31.2s, v16.2s + fcmeq v4.4s, v7.4s, v15.4s + fcmeq v29.2d, v2.2d, v5.2d + +// CHECK: fcmeq v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0x30,0x0e] +// CHECK: fcmeq v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xe4,0x2f,0x4e] +// CHECK: fcmeq v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xe4,0x65,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than Or Equal (Floating Point) +// Vector Compare Mask Less Than Or Equal (Floating Point) +// FCMLE is alias for FCMGE with operands reversed. +//---------------------------------------------------------------------- + + fcmge v31.4s, v29.4s, v28.4s + fcmge v3.2s, v8.2s, v12.2s + fcmge v17.2d, v15.2d, v13.2d + fcmle v31.4s, v28.4s, v29.4s + fcmle v3.2s, v12.2s, v8.2s + fcmle v17.2d, v13.2d, v15.2d + +// CHECK: fcmge v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xe7,0x3c,0x6e] +// CHECK: fcmge v3.2s, v8.2s, v12.2s // encoding: [0x03,0xe5,0x2c,0x2e] +// CHECK: fcmge v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xe5,0x6d,0x6e] +// CHECK: fcmge v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xe7,0x3c,0x6e] +// CHECK: fcmge v3.2s, v8.2s, v12.2s // encoding: [0x03,0xe5,0x2c,0x2e] +// CHECK: fcmge v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xe5,0x6d,0x6e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than (Floating Point) +// Vector Compare Mask Less Than (Floating Point) +// FCMLT is alias for FCMGT with operands reversed. 
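+// As with the integer aliases, the fcmle/fcmlt spellings here are printed
+// back as fcmge/fcmgt with the operands swapped; only the comparisons
+// against #0.0 further down have encodings of their own.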
+//---------------------------------------------------------------------- + + fcmgt v0.2s, v31.2s, v16.2s + fcmgt v4.4s, v7.4s, v15.4s + fcmgt v29.2d, v2.2d, v5.2d + fcmlt v0.2s, v16.2s, v31.2s + fcmlt v4.4s, v15.4s, v7.4s + fcmlt v29.2d, v5.2d, v2.2d + +// CHECK: fcmgt v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0xb0,0x2e] +// CHECK: fcmgt v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xe4,0xaf,0x6e] +// CHECK: fcmgt v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xe4,0xe5,0x6e] +// CHECK: fcmgt v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0xb0,0x2e] +// CHECK: fcmgt v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xe4,0xaf,0x6e] +// CHECK: fcmgt v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xe4,0xe5,0x6e] + + +//---------------------------------------------------------------------- +// Vector Compare Mask Equal to Zero (Integer) +//---------------------------------------------------------------------- + + cmeq v0.8b, v15.8b, #0 + cmeq v1.16b, v31.16b, #0 + cmeq v15.4h, v16.4h, #0 + cmeq v5.8h, v6.8h, #0 + cmeq v29.2s, v27.2s, #0 + cmeq v9.4s, v7.4s, #0 + cmeq v3.2d, v31.2d, #0 + +// CHECK: cmeq v0.8b, v15.8b, #0x0 // encoding: [0xe0,0x99,0x20,0x0e] +// CHECK: cmeq v1.16b, v31.16b, #0x0 // encoding: [0xe1,0x9b,0x20,0x4e] +// CHECK: cmeq v15.4h, v16.4h, #0x0 // encoding: [0x0f,0x9a,0x60,0x0e] +// CHECK: cmeq v5.8h, v6.8h, #0x0 // encoding: [0xc5,0x98,0x60,0x4e] +// CHECK: cmeq v29.2s, v27.2s, #0x0 // encoding: [0x7d,0x9b,0xa0,0x0e] +// CHECK: cmeq v9.4s, v7.4s, #0x0 // encoding: [0xe9,0x98,0xa0,0x4e] +// CHECK: cmeq v3.2d, v31.2d, #0x0 // encoding: [0xe3,0x9b,0xe0,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer) +//---------------------------------------------------------------------- + cmge v0.8b, v15.8b, #0 + cmge v1.16b, v31.16b, #0 + cmge v15.4h, v16.4h, #0 + cmge v5.8h, v6.8h, #0 + cmge v29.2s, v27.2s, #0 + cmge v17.4s, v20.4s, #0 + cmge v3.2d, v31.2d, #0 + +// CHECK: cmge v0.8b, v15.8b, #0x0 // encoding: [0xe0,0x89,0x20,0x2e] +// CHECK: cmge v1.16b, v31.16b, #0x0 // encoding: [0xe1,0x8b,0x20,0x6e] +// CHECK: cmge v15.4h, v16.4h, #0x0 // encoding: [0x0f,0x8a,0x60,0x2e] +// CHECK: cmge v5.8h, v6.8h, #0x0 // encoding: [0xc5,0x88,0x60,0x6e] +// CHECK: cmge v29.2s, v27.2s, #0x0 // encoding: [0x7d,0x8b,0xa0,0x2e] +// CHECK: cmge v17.4s, v20.4s, #0x0 // encoding: [0x91,0x8a,0xa0,0x6e] +// CHECK: cmge v3.2d, v31.2d, #0x0 // encoding: [0xe3,0x8b,0xe0,0x6e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than Zero (Signed Integer) +//---------------------------------------------------------------------- + + cmgt v0.8b, v15.8b, #0 + cmgt v1.16b, v31.16b, #0 + cmgt v15.4h, v16.4h, #0 + cmgt v5.8h, v6.8h, #0 + cmgt v29.2s, v27.2s, #0 + cmgt v9.4s, v7.4s, #0 + cmgt v3.2d, v31.2d, #0 + +// CHECK: cmgt v0.8b, v15.8b, #0x0 // encoding: [0xe0,0x89,0x20,0x0e] +// CHECK: cmgt v1.16b, v31.16b, #0x0 // encoding: [0xe1,0x8b,0x20,0x4e] +// CHECK: cmgt v15.4h, v16.4h, #0x0 // encoding: [0x0f,0x8a,0x60,0x0e] +// CHECK: cmgt v5.8h, v6.8h, #0x0 // encoding: [0xc5,0x88,0x60,0x4e] +// CHECK: cmgt v29.2s, v27.2s, #0x0 // encoding: [0x7d,0x8b,0xa0,0x0e] +// CHECK: cmgt v9.4s, v7.4s, #0x0 // encoding: [0xe9,0x88,0xa0,0x4e] +// CHECK: cmgt v3.2d, v31.2d, #0x0 // encoding: [0xe3,0x8b,0xe0,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Less Than or Equal To Zero (Signed Integer) 
+//---------------------------------------------------------------------- + cmle v0.8b, v15.8b, #0 + cmle v1.16b, v31.16b, #0 + cmle v15.4h, v16.4h, #0 + cmle v5.8h, v6.8h, #0 + cmle v29.2s, v27.2s, #0 + cmle v9.4s, v7.4s, #0 + cmle v3.2d, v31.2d, #0 + +// CHECK: cmle v0.8b, v15.8b, #0x0 // encoding: [0xe0,0x99,0x20,0x2e] +// CHECK: cmle v1.16b, v31.16b, #0x0 // encoding: [0xe1,0x9b,0x20,0x6e] +// CHECK: cmle v15.4h, v16.4h, #0x0 // encoding: [0x0f,0x9a,0x60,0x2e] +// CHECK: cmle v5.8h, v6.8h, #0x0 // encoding: [0xc5,0x98,0x60,0x6e] +// CHECK: cmle v29.2s, v27.2s, #0x0 // encoding: [0x7d,0x9b,0xa0,0x2e] +// CHECK: cmle v9.4s, v7.4s, #0x0 // encoding: [0xe9,0x98,0xa0,0x6e] +// CHECK: cmle v3.2d, v31.2d, #0x0 // encoding: [0xe3,0x9b,0xe0,0x6e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Less Than Zero (Signed Integer) +//---------------------------------------------------------------------- + cmlt v0.8b, v15.8b, #0 + cmlt v1.16b, v31.16b, #0 + cmlt v15.4h, v16.4h, #0 + cmlt v5.8h, v6.8h, #0 + cmlt v29.2s, v27.2s, #0 + cmlt v9.4s, v7.4s, #0 + cmlt v3.2d, v31.2d, #0 + +// CHECK: cmlt v0.8b, v15.8b, #0x0 // encoding: [0xe0,0xa9,0x20,0x0e] +// CHECK: cmlt v1.16b, v31.16b, #0x0 // encoding: [0xe1,0xab,0x20,0x4e] +// CHECK: cmlt v15.4h, v16.4h, #0x0 // encoding: [0x0f,0xaa,0x60,0x0e] +// CHECK: cmlt v5.8h, v6.8h, #0x0 // encoding: [0xc5,0xa8,0x60,0x4e] +// CHECK: cmlt v29.2s, v27.2s, #0x0 // encoding: [0x7d,0xab,0xa0,0x0e] +// CHECK: cmlt v9.4s, v7.4s, #0x0 // encoding: [0xe9,0xa8,0xa0,0x4e] +// CHECK: cmlt v3.2d, v31.2d, #0x0 // encoding: [0xe3,0xab,0xe0,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Equal to Zero (Floating Point) +//---------------------------------------------------------------------- + fcmeq v0.2s, v31.2s, #0.0 + fcmeq v4.4s, v7.4s, #0.0 + fcmeq v29.2d, v2.2d, #0.0 + +// CHECK: fcmeq v0.2s, v31.2s, #0.0 // encoding: [0xe0,0xdb,0xa0,0x0e] +// CHECK: fcmeq v4.4s, v7.4s, #0.0 // encoding: [0xe4,0xd8,0xa0,0x4e] +// CHECK: fcmeq v29.2d, v2.2d, #0.0 // encoding: [0x5d,0xd8,0xe0,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than or Equal to Zero (Floating Point) +//---------------------------------------------------------------------- + fcmge v31.4s, v29.4s, #0.0 + fcmge v3.2s, v8.2s, #0.0 + fcmge v17.2d, v15.2d, #0.0 + +// CHECK: fcmge v31.4s, v29.4s, #0.0 // encoding: [0xbf,0xcb,0xa0,0x6e] +// CHECK: fcmge v3.2s, v8.2s, #0.0 // encoding: [0x03,0xc9,0xa0,0x2e] +// CHECK: fcmge v17.2d, v15.2d, #0.0 // encoding: [0xf1,0xc9,0xe0,0x6e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than Zero (Floating Point) +//---------------------------------------------------------------------- + fcmgt v0.2s, v31.2s, #0.0 + fcmgt v4.4s, v7.4s, #0.0 + fcmgt v29.2d, v2.2d, #0.0 + +// CHECK: fcmgt v0.2s, v31.2s, #0.0 // encoding: [0xe0,0xcb,0xa0,0x0e] +// CHECK: fcmgt v4.4s, v7.4s, #0.0 // encoding: [0xe4,0xc8,0xa0,0x4e] +// CHECK: fcmgt v29.2d, v2.2d, #0.0 // encoding: [0x5d,0xc8,0xe0,0x4e] + +//---------------------------------------------------------------------- +// Vector Compare Mask Less Than or Equal To Zero (Floating Point) +//---------------------------------------------------------------------- + fcmle v1.4s, v8.4s, #0.0 + fcmle v3.2s, v20.2s, #0.0 + fcmle v7.2d, v13.2d, #0.0 + +// CHECK: fcmle v1.4s, v8.4s, #0.0 // encoding: [0x01,0xd9,0xa0,0x6e] +// 
CHECK: fcmle v3.2s, v20.2s, #0.0 // encoding: [0x83,0xda,0xa0,0x2e]
+// CHECK: fcmle v7.2d, v13.2d, #0.0 // encoding: [0xa7,0xd9,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Floating Point)
+//----------------------------------------------------------------------
+ fcmlt v16.2s, v2.2s, #0.0
+ fcmlt v15.4s, v4.4s, #0.0
+ fcmlt v5.2d, v29.2d, #0.0
+
+// CHECK: fcmlt v16.2s, v2.2s, #0.0 // encoding: [0x50,0xe8,0xa0,0x0e]
+// CHECK: fcmlt v15.4s, v4.4s, #0.0 // encoding: [0x8f,0xe8,0xa0,0x4e]
+// CHECK: fcmlt v5.2d, v29.2d, #0.0 // encoding: [0xa5,0xeb,0xe0,0x4e]
+
diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s
new file mode 100644
index 00000000000..5373889222f
--- /dev/null
+++ b/test/MC/AArch64/neon-diagnostics.s
@@ -0,0 +1,1207 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon < %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
+
+//------------------------------------------------------------------------------
+// Vector Integer Add/sub
+//------------------------------------------------------------------------------
+
+ // Mismatched vector types
+ add v0.16b, v1.8b, v2.8b
+ sub v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: add v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sub v0.2d, v1.2d, v2.2s
+// CHECK-ERROR: ^
+
+//------------------------------------------------------------------------------
+// Vector Floating-Point Add/sub
+//------------------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ fadd v0.2d, v1.2s, v2.2s
+ fsub v0.4s, v1.2s, v2.4s
+ fsub v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fadd v0.2d, v1.2s, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fsub v0.4s, v1.2s, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fsub v0.8b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Mul
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ mul v0.16b, v1.8b, v2.8b
+ mul v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mul v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mul v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Mul/Div
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ fmul v0.16b, v1.8b, v2.8b
+ fdiv v0.2s, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmul v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fdiv v0.2s, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector And, Orr, Eor, Bsl, Bit, Bif, Orn, Bic
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ and v0.8b, v1.16b, v2.8b
+ orr v0.4h, v1.4h, v2.4h
+ eor v0.2s, v1.2s, v2.2s
+ bsl v0.8b, v1.16b, v2.8b
+ bsl 
v0.2s, v1.2s, v2.2s
+ bit v0.2d, v1.2d, v2.2d
+ bif v0.4h, v1.4h, v2.4h
+ orn v0.8b, v1.16b, v2.16b
+ bic v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: and v0.8b, v1.16b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: orr v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: eor v0.2s, v1.2s, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bsl v0.8b, v1.16b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bsl v0.2s, v1.2s, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bit v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bif v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: orn v0.8b, v1.16b, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bic v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Multiply-accumulate and Multiply-subtract
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ mla v0.16b, v1.8b, v2.8b
+ mls v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mla v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mls v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Multiply-accumulate and Multiply-subtract
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ fmla v0.2s, v1.2d, v2.2d
+ fmls v0.16b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmla v0.2s, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmls v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Move Immediate Shifted
+// Vector Move Inverted Immediate Shifted
+// Vector Bitwise Bit Clear (AND NOT) - immediate
+// Vector Bitwise OR - immediate
+//----------------------------------------------------------------------
+ // out of range immediate (0 to 0xff)
+ movi v0.2s, #-1
+ mvni v1.4s, #256
+ // out of range shift (0, 8, 16, 24 and 0, 8)
+ bic v15.4h, #1, lsl #7
+ orr v31.2s, #1, lsl #25
+ movi v5.4h, #10, lsl #16
+ // invalid vector type (2s, 4s, 4h, 8h)
+ movi v5.8b, #1, lsl #8
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.2s, #-1
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mvni v1.4s, #256
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bic v15.4h, #1, lsl #7
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: orr v31.2s, #1, lsl #25
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v5.4h, #10, lsl #16
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v5.8b, #1, lsl #8
+// CHECK-ERROR: ^
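+
+// For reference (informative comments only, not matched by FileCheck):
+// valid counterparts of the rejected forms above, assuming the
+// architectural shifted-immediate rules (imm8 in [0, 255]; LSL #0/8/16/24
+// for 2s/4s, LSL #0/8 for 4h/8h), would look like:
+//   movi v0.2s, #255
+//   movi v5.4h, #10, lsl #8
+//   orr v31.2s, #1, lsl #24
+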
+//----------------------------------------------------------------------
+// Vector Move Immediate Masked
+// Vector Move Inverted Immediate Masked
+//----------------------------------------------------------------------
+ // out of range immediate (0 to 0xff)
+ movi v0.2s, #-1, msl #8
+ mvni v7.4s, #256, msl #16
+ // out of range shift (8, 16)
+ movi v3.2s, #1, msl #0
+ mvni v17.4s, #255, msl #32
+ // invalid vector type (2s, 4s)
+ movi v5.4h, #31, msl #8
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.2s, #-1, msl #8
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mvni v7.4s, #256, msl #16
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v3.2s, #1, msl #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mvni v17.4s, #255, msl #32
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v5.4h, #31, msl #8
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Immediate - per byte
+//----------------------------------------------------------------------
+ // out of range immediate (0 to 0xff)
+ movi v0.8b, #-1
+ movi v1.16b, #256
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.8b, #-1
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v1.16b, #256
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, per doubleword
+//----------------------------------------------------------------------
+ // invalid bytemask (0x00 or 0xff)
+ movi v0.2d, #0x10ff00ff00ff00ff
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.2d, #0x10ff00ff00ff00ff
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, one doubleword
+//----------------------------------------------------------------------
+ // invalid bytemask (0x00 or 0xff)
+ movi v0.2d, #0xffff00ff001f00ff
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.2d, #0xffff00ff001f00ff
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Floating Point Move Immediate
+//----------------------------------------------------------------------
+ // invalid vector type (2s, 4s, 2d)
+ fmov v0.4h, #1.0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmov v0.4h, #1.0
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Move - register
+//----------------------------------------------------------------------
+ // invalid vector type (8b, 16b)
+ mov v0.2s, v31.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mov v0.2s, v31.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types (2d)
+ saba v0.16b, v1.8b, v2.8b
+ uaba v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: saba v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uaba v0.2d, 
v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+// Vector Absolute Difference (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types (2d)
+ uaba v0.16b, v1.8b, v2.8b
+ saba v0.2d, v1.2d, v2.2d
+ uabd v0.4s, v1.2s, v2.2s
+ sabd v0.4h, v1.8h, v8.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uaba v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: saba v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uabd v0.4s, v1.2s, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sabd v0.4h, v1.8h, v8.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fabd v0.2s, v1.4s, v2.2d
+ fabd v0.4h, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fabd v0.2s, v1.4s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fabd v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Multiply (Polynomial)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ pmul v0.8b, v1.8b, v2.16b
+ pmul v0.2s, v1.2s, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: pmul v0.8b, v1.8b, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: pmul v0.2s, v1.2s, v2.2s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Add and Sub
+//----------------------------------------------------------------------
+
+ // Mismatched registers
+ add d0, s1, d2
+ sub s1, d1, d2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: add d0, s1, d2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sub s1, d1, d2
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Step (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ frecps v0.4s, v1.2d, v2.4s
+ frecps v0.8h, v1.8h, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: frecps v0.4s, v1.2d, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: frecps v0.8h, v1.8h, v2.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Square Root Step (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ frsqrts v0.2d, v1.2d, v2.2s
+ frsqrts v0.4h, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: frsqrts v0.2d, v1.2d, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: frsqrts v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask 
Less Than Or Equal (Floating Point) +//---------------------------------------------------------------------- + + // Mismatched and invalid vector types + facge v0.2d, v1.2s, v2.2d + facge v0.4h, v1.4h, v2.4h + facle v0.8h, v1.4h, v2.4h + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: facge v0.2d, v1.2s, v2.2d +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: facge v0.4h, v1.4h, v2.4h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: facle v0.8h, v1.4h, v2.4h +// CHECK-ERROR: ^ +//---------------------------------------------------------------------- +// Vector Absolute Compare Mask Less Than (Floating Point) +//---------------------------------------------------------------------- + + // Mismatched and invalid vector types + facgt v0.2d, v1.2d, v2.4s + facgt v0.8h, v1.8h, v2.8h + faclt v0.8b, v1.8b, v2.8b + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: facgt v0.2d, v1.2d, v2.4s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: facgt v0.8h, v1.8h, v2.8h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: faclt v0.8b, v1.8b, v2.8b +// CHECK-ERROR: ^ + + +//---------------------------------------------------------------------- +// Vector Compare Mask Equal (Integer) +//---------------------------------------------------------------------- + + // Mismatched vector types + cmeq c0.2d, v1.2d, v2.2s + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmeq c0.2d, v1.2d, v2.2s +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Higher or Same (Unsigned Integer) +// Vector Compare Mask Less or Same (Unsigned Integer) +// CMLS is alias for CMHS with operands reversed. +//---------------------------------------------------------------------- + + // Mismatched vector types + cmhs c0.4h, v1.8b, v2.8b + cmls c0.16b, v1.16b, v2.2d + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmhs c0.4h, v1.8b, v2.8b +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmls c0.16b, v1.16b, v2.2d +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than or Equal (Integer) +// Vector Compare Mask Less Than or Equal (Integer) +// CMLE is alias for CMGE with operands reversed. +//---------------------------------------------------------------------- + + // Mismatched vector types + cmge c0.8h, v1.8b, v2.8b + cmle c0.4h, v1.2s, v2.2s + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmge c0.8h, v1.8b, v2.8b +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmle c0.4h, v1.2s, v2.2s +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Higher (Unsigned Integer) +// Vector Compare Mask Lower (Unsigned Integer) +// CMLO is alias for CMHI with operands reversed. 
+//---------------------------------------------------------------------- + + // Mismatched vector types + cmhi c0.4s, v1.4s, v2.16b + cmlo c0.8b, v1.8b, v2.2s + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmhi c0.4s, v1.4s, v2.16b +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmlo c0.8b, v1.8b, v2.2s +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than (Integer) +// Vector Compare Mask Less Than (Integer) +// CMLT is alias for CMGT with operands reversed. +//---------------------------------------------------------------------- + + // Mismatched vector types + cmgt c0.8b, v1.4s, v2.16b + cmlt c0.8h, v1.16b, v2.4s + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmgt c0.8b, v1.4s, v2.16b +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmlt c0.8h, v1.16b, v2.4s +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Bitwise Test (Integer) +//---------------------------------------------------------------------- + + // Mismatched vector types + cmtst c0.16b, v1.16b, v2.4s + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: cmtst c0.16b, v1.16b, v2.4s +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Equal (Floating Point) +//---------------------------------------------------------------------- + + // Mismatched and invalid vector types + fcmeq v0.2d, v1.2s, v2.2d + fcmeq v0.16b, v1.16b, v2.16b + fcmeq v0.8b, v1.4h, v2.4h + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmeq v0.2d, v1.2s, v2.2d +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmeq v0.16b, v1.16b, v2.16b +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmeq v0.8b, v1.4h, v2.4h +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than Or Equal (Floating Point) +// Vector Compare Mask Less Than Or Equal (Floating Point) +// FCMLE is alias for FCMGE with operands reversed. +//---------------------------------------------------------------------- + + // Mismatched and invalid vector types + fcmge v31.4s, v29.2s, v28.4s + fcmge v3.8b, v8.2s, v12.2s + fcmle v17.8h, v15.2d, v13.2d + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmge v31.4s, v29.2s, v28.4s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmge v3.8b, v8.2s, v12.2s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmle v17.8h, v15.2d, v13.2d +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than (Floating Point) +// Vector Compare Mask Less Than (Floating Point) +// FCMLT is alias for FCMGT with operands reversed. 
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ fcmgt v0.2d, v31.2s, v16.2s
+ fcmgt v4.4s, v7.4s, v15.4h
+ fcmlt v29.2d, v5.2d, v2.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmgt v0.2d, v31.2s, v16.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR: fcmgt v4.4s, v7.4s, v15.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR: fcmlt v29.2d, v5.2d, v2.16b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmeq c0.2d, v1.2s, #0
+ cmeq c0.2d, v1.2d, #1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmeq c0.2d, v1.2s, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmeq c0.2d, v1.2d, #1
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmge c0.8h, v1.8b, #0
+ cmge c0.4s, v1.4s, #-1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmge c0.8h, v1.8b, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmge c0.4s, v1.4s, #-1
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmgt c0.8b, v1.4s, #0
+ cmgt c0.8b, v1.8b, #-255
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmgt c0.8b, v1.4s, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmgt c0.8b, v1.8b, #-255
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmle c0.4h, v1.2s, #0
+ cmle c0.16b, v1.16b, #16
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmle c0.4h, v1.2s, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmle c0.16b, v1.16b, #16
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmlt c0.8h, v1.16b, #0
+ cmlt c0.8h, v1.8h, #-15
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmlt c0.8h, v1.16b, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmlt c0.8h, v1.8h, #-15
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+
+ // 
Mismatched and invalid vector types, invalid imm + fcmeq v0.2d, v1.2s, #0.0 + fcmeq v0.16b, v1.16b, #0.0 + fcmeq v0.8b, v1.4h, #1.0 + fcmeq v0.8b, v1.4h, #1 + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmeq v0.2d, v1.2s, #0.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmeq v0.16b, v1.16b, #0.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmeq v0.8b, v1.4h, #1.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: Expected floating-point immediate +// CHECK-ERROR: fcmeq v0.8b, v1.4h, #1 +// CHECK-ERROR: ^ +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than or Equal to Zero (Floating Point) +//---------------------------------------------------------------------- + + // Mismatched and invalid vector types, invalid imm + fcmge v31.4s, v29.2s, #0.0 + fcmge v3.8b, v8.2s, #0.0 + fcmle v17.8h, v15.2d, #-1.0 + fcmle v17.8h, v15.2d, #0 + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmge v31.4s, v29.2s, #0.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmge v3.8b, v8.2s, #0.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmle v17.8h, v15.2d, #-1.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: Expected floating-point immediate +// CHECK-ERROR: fcmle v17.8h, v15.2d, #0 +// CHECK-ERROR: ^ +//---------------------------------------------------------------------- +// Vector Compare Mask Greater Than Zero (Floating Point) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types, invalid imm + fcmgt v0.2d, v31.2s, #0.0 + fcmgt v4.4s, v7.4h, #0.0 + fcmlt v29.2d, v5.2d, #255.0 + fcmlt v29.2d, v5.2d, #255 + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmgt v0.2d, v31.2s, #0.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmgt v4.4s, v7.4h, #0.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type +// CHECK-ERROR: fcmlt v29.2d, v5.2d, #255.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: Expected floating-point immediate +// CHECK-ERROR: fcmlt v29.2d, v5.2d, #255 +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Less Than or Equal To Zero (Floating Point) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types, invalid imm + fcmge v31.4s, v29.2s, #0.0 + fcmge v3.8b, v8.2s, #0.0 + fcmle v17.2d, v15.2d, #15.0 + fcmle v17.2d, v15.2d, #15 + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmge v31.4s, v29.2s, #0.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fcmge v3.8b, v8.2s, #0.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type +// CHECK-ERROR: fcmle v17.2d, v15.2d, #15.0 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: Expected floating-point immediate +// CHECK-ERROR: fcmle v17.2d, v15.2d, #15 +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Compare Mask Less Than Zero (Floating Point) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types, invalid imm + fcmgt v0.2d, v31.2s, 
#0.0
+ fcmgt v4.4s, v7.4h, #0.0
+ fcmlt v29.2d, v5.2d, #16.0
+ fcmlt v29.2d, v5.2d, #2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmgt v0.2d, v31.2s, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmgt v4.4s, v7.4h, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR: fcmlt v29.2d, v5.2d, #16.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: Expected floating-point immediate
+// CHECK-ERROR: fcmlt v29.2d, v5.2d, #2
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Halving Add (Signed)
+// Vector Integer Halving Add (Unsigned)
+// Vector Integer Halving Sub (Signed)
+// Vector Integer Halving Sub (Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types (2d)
+ shadd v0.2d, v1.2d, v2.2d
+ uhadd v4.2s, v5.2s, v5.4h
+ shsub v11.4h, v12.8h, v13.4h
+ uhsub v31.16b, v29.8b, v28.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: shadd v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uhadd v4.2s, v5.2s, v5.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: shsub v11.4h, v12.8h, v13.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uhsub v31.16b, v29.8b, v28.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Signed)
+// Vector Integer Rounding Halving Add (Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types (2d)
+ srhadd v0.2s, v1.2s, v2.2d
+ urhadd v0.16b, v1.16b, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: srhadd v0.2s, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: urhadd v0.16b, v1.16b, v2.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Saturating Add (Signed)
+// Vector Integer Saturating Add (Unsigned)
+// Vector Integer Saturating Sub (Signed)
+// Vector Integer Saturating Sub (Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ sqadd v0.2s, v1.2s, v2.2d
+ uqadd v31.8h, v1.4h, v2.4h
+ sqsub v10.8h, v1.16b, v2.16b
+ uqsub v31.8b, v1.8b, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqadd v0.2s, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqadd v31.8h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqsub v10.8h, v1.16b, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqsub v31.8b, v1.8b, v2.4s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Add (Signed)
+// Scalar Integer Saturating Add (Unsigned)
+// Scalar Integer Saturating Sub (Signed)
+// Scalar Integer Saturating Sub (Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched registers
+ sqadd d0, s31, d2
+ uqadd s0, s1, d2
+ sqsub b0, b2, s18
+ uqsub h1, h2, d2
+
+// CHECK-ERROR: 
error: invalid operand for instruction
+// CHECK-ERROR: sqadd d0, s31, d2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqadd s0, s1, d2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqsub b0, b2, s18
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqsub h1, h2, d2
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ sshl v0.4s, v15.2s, v16.2s
+ ushl v1.16b, v25.16b, v6.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sshl v0.4s, v15.2s, v16.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ushl v1.16b, v25.16b, v6.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ sqshl v0.2s, v15.2s, v16.2d
+ uqshl v1.8b, v25.4h, v6.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqshl v0.2s, v15.2s, v16.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqshl v1.8b, v25.4h, v6.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Rounding Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ srshl v0.8h, v15.8h, v16.16b
+ urshl v1.2d, v25.2d, v6.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: srshl v0.8h, v15.8h, v16.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: urshl v1.2d, v25.2d, v6.4s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ sqrshl v0.2s, v15.8h, v16.16b
+ uqrshl v1.4h, v25.4h, v6.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrshl v0.2s, v15.8h, v16.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqrshl v1.4h, v25.4h, v6.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ sshl d0, d1, s2
+ ushl b2, b0, b1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sshl d0, d1, s2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ushl b2, b0, b1
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ sqshl b0, b1, s0
+ uqshl h0, h1, b0
+ sqshl s0, s1, h0
+ uqshl d0, d1, b0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqshl b0, b1, s0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand 
for instruction
+// CHECK-ERROR: uqshl h0, h1, b0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqshl s0, s1, h0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqshl d0, d1, b0
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ srshl h0, h1, h2
+ urshl s0, s1, s2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: srshl h0, h1, h2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: urshl s0, s1, s2
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ sqrshl b0, b1, s0
+ uqrshl h0, h1, b0
+ sqrshl s0, s1, h0
+ uqrshl d0, d1, b0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrshl b0, b1, s0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqrshl h0, h1, b0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrshl s0, s1, h0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqrshl d0, d1, b0
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ smax v0.2d, v1.2d, v2.2d
+ umax v0.4h, v1.4h, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smax v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umax v0.4h, v1.4h, v2.2s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Minimum (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ smin v0.2d, v1.2d, v2.2d
+ umin v0.2s, v1.2s, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smin v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umin v0.2s, v1.2s, v2.8b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmax v0.2s, v1.2s, v2.4s
+ fmax v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmax v0.2s, v1.2s, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmax v0.8b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Minimum (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmin v0.4s, v1.4s, v2.2d
+ fmin v0.8h, v1.8h, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmin v0.4s, v1.4s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: 
invalid operand for instruction +// CHECK-ERROR: fmin v0.8h, v1.8h, v2.8h +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector maxNum (Floating Point) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types + fmaxnm v0.2s, v1.2s, v2.2d + fmaxnm v0.4h, v1.8h, v2.4h + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fmaxnm v0.2s, v1.2s, v2.2d +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fmaxnm v0.4h, v1.8h, v2.4h +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector minNum (Floating Point) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types + fminnm v0.4s, v1.2s, v2.4s + fminnm v0.16b, v0.16b, v0.16b + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fminnm v0.4s, v1.2s, v2.4s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fminnm v0.16b, v0.16b, v0.16b +// CHECK-ERROR: ^ + + +//---------------------------------------------------------------------- +// Vector Maximum Pairwise (Signed, Unsigned) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types + smaxp v0.2d, v1.2d, v2.2d + umaxp v0.4h, v1.4h, v2.2s + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: smaxp v0.2d, v1.2d, v2.2d +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: umaxp v0.4h, v1.4h, v2.2s +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector Minimum Pairwise (Signed, Unsigned) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types + sminp v0.2d, v1.2d, v2.2d + uminp v0.2s, v1.2s, v2.8b + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sminp v0.2d, v1.2d, v2.2d +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: uminp v0.2s, v1.2s, v2.8b +// CHECK-ERROR: ^ + + +//---------------------------------------------------------------------- +// Vector Maximum Pairwise (Floating Point) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types + fmaxp v0.2s, v1.2s, v2.4s + fmaxp v0.8b, v1.8b, v2.8b + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fmaxp v0.2s, v1.2s, v2.4s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fmaxp v0.8b, v1.8b, v2.8b +// CHECK-ERROR: ^ +//---------------------------------------------------------------------- +// Vector Minimum Pairwise (Floating Point) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types + fminp v0.4s, v1.4s, v2.2d + fminp v0.8h, v1.8h, v2.8h + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fminp v0.4s, v1.4s, v2.2d +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: fminp v0.8h, v1.8h, v2.8h +// CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector maxNum Pairwise (Floating Point) +//---------------------------------------------------------------------- + // Mismatched and invalid vector types + fmaxnmp v0.2s, v1.2s, v2.2d + fmaxnmp v0.4h, 
v1.8h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmaxnmp v0.2s, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmaxnmp v0.4h, v1.8h, v2.4h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector minNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fminnmp v0.4s, v1.2s, v2.4s
+ fminnmp v0.16b, v0.16b, v0.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fminnmp v0.4s, v1.2s, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fminnmp v0.16b, v0.16b, v0.16b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Add Pairwise (Integer)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ addp v0.16b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: addp v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Add Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ faddp v0.16b, v1.8b, v2.8b
+ faddp v0.2d, v1.2d, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: faddp v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: faddp v0.2d, v1.2d, v2.8h
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Saturating Doubling Multiply High
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ sqdmulh v2.4h, v25.8h, v3.4h
+ sqdmulh v12.2d, v5.2d, v13.2d
+ sqdmulh v3.8b, v1.8b, v30.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v2.4h, v25.8h, v3.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v12.2d, v5.2d, v13.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v3.8b, v1.8b, v30.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Doubling Multiply High
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ sqrdmulh v2.2s, v25.4s, v3.4s
+ sqrdmulh v12.16b, v5.16b, v13.16b
+ sqrdmulh v3.4h, v1.4h, v30.2d
+
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v2.2s, v25.4s, v3.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v12.16b, v5.16b, v13.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v3.4h, v1.4h, v30.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Multiply Extended
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmulx v21.2s, v5.2s, v13.2d
+ fmulx v1.4h, v25.4h, v3.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmulx v21.2s, v5.2s, v13.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for 
instruction +// CHECK-ERROR: fmulx v1.4h, v25.4h, v3.4h +// CHECK-ERROR: ^ diff --git a/test/MC/AArch64/neon-facge-facgt.s b/test/MC/AArch64/neon-facge-facgt.s new file mode 100644 index 00000000000..212eda2f209 --- /dev/null +++ b/test/MC/AArch64/neon-facge-facgt.s @@ -0,0 +1,41 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//---------------------------------------------------------------------- +// Vector Absolute Compare Mask Less Than Or Equal (Floating Point) +// FACLE is alias for FACGE with operands reversed +//---------------------------------------------------------------------- + facge v0.2s, v31.2s, v16.2s + facge v4.4s, v7.4s, v15.4s + facge v29.2d, v2.2d, v5.2d + facle v0.2s, v16.2s, v31.2s + facle v4.4s, v15.4s, v7.4s + facle v29.2d, v5.2d, v2.2d + +// CHECK: facge v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xef,0x30,0x2e] +// CHECK: facge v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xec,0x2f,0x6e] +// CHECK: facge v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xec,0x65,0x6e] +// CHECK: facge v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xef,0x30,0x2e] +// CHECK: facge v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xec,0x2f,0x6e] +// CHECK: facge v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xec,0x65,0x6e] + +//---------------------------------------------------------------------- +// Vector Absolute Compare Mask Less Than (Floating Point) +// FACLT is alias for FACGT with operands reversed +//---------------------------------------------------------------------- + facgt v31.4s, v29.4s, v28.4s + facgt v3.2s, v8.2s, v12.2s + facgt v17.2d, v15.2d, v13.2d + faclt v31.4s, v28.4s, v29.4s + faclt v3.2s, v12.2s, v8.2s + faclt v17.2d, v13.2d, v15.2d + +// CHECK: facgt v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xef,0xbc,0x6e] +// CHECK: facgt v3.2s, v8.2s, v12.2s // encoding: [0x03,0xed,0xac,0x2e] +// CHECK: facgt v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xed,0xed,0x6e] +// CHECK: facgt v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xef,0xbc,0x6e] +// CHECK: facgt v3.2s, v8.2s, v12.2s // encoding: [0x03,0xed,0xac,0x2e] +// CHECK: facgt v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xed,0xed,0x6e] + + diff --git a/test/MC/AArch64/neon-frsqrt-frecp.s b/test/MC/AArch64/neon-frsqrt-frecp.s new file mode 100644 index 00000000000..79fe5da5e76 --- /dev/null +++ b/test/MC/AArch64/neon-frsqrt-frecp.s @@ -0,0 +1,27 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//---------------------------------------------------------------------- +// Vector Reciprocal Square Root Step (Floating Point) +//---------------------------------------------------------------------- + frsqrts v0.2s, v31.2s, v16.2s + frsqrts v4.4s, v7.4s, v15.4s + frsqrts v29.2d, v2.2d, v5.2d + +// CHECK: frsqrts v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xff,0xb0,0x0e] +// CHECK: frsqrts v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xfc,0xaf,0x4e] +// CHECK: frsqrts v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xfc,0xe5,0x4e] + +//---------------------------------------------------------------------- +// Vector Reciprocal Step (Floating Point) +//---------------------------------------------------------------------- + frecps v31.4s, v29.4s, v28.4s + frecps v3.2s, v8.2s, v12.2s + frecps v17.2d, v15.2d, v13.2d + +// CHECK: frecps v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xff,0x3c,0x4e] +// CHECK: frecps v3.2s, v8.2s, 
v12.2s // encoding: [0x03,0xfd,0x2c,0x0e] +// CHECK: frecps v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xfd,0x6d,0x4e] + + diff --git a/test/MC/AArch64/neon-halving-add-sub.s b/test/MC/AArch64/neon-halving-add-sub.s new file mode 100644 index 00000000000..555f1b83b4f --- /dev/null +++ b/test/MC/AArch64/neon-halving-add-sub.s @@ -0,0 +1,74 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + + +//------------------------------------------------------------------------------ +// Vector Integer Halving Add (Signed) +//------------------------------------------------------------------------------ + shadd v0.8b, v1.8b, v2.8b + shadd v0.16b, v1.16b, v2.16b + shadd v0.4h, v1.4h, v2.4h + shadd v0.8h, v1.8h, v2.8h + shadd v0.2s, v1.2s, v2.2s + shadd v0.4s, v1.4s, v2.4s + +// CHECK: shadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x04,0x22,0x0e] +// CHECK: shadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x04,0x22,0x4e] +// CHECK: shadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x04,0x62,0x0e] +// CHECK: shadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x04,0x62,0x4e] +// CHECK: shadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x04,0xa2,0x0e] +// CHECK: shadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x04,0xa2,0x4e] + + +//------------------------------------------------------------------------------ +// Vector Integer Halving Add (Unsigned) +//------------------------------------------------------------------------------ + uhadd v0.8b, v1.8b, v2.8b + uhadd v0.16b, v1.16b, v2.16b + uhadd v0.4h, v1.4h, v2.4h + uhadd v0.8h, v1.8h, v2.8h + uhadd v0.2s, v1.2s, v2.2s + uhadd v0.4s, v1.4s, v2.4s + +// CHECK: uhadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x04,0x22,0x2e] +// CHECK: uhadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x04,0x22,0x6e] +// CHECK: uhadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x04,0x62,0x2e] +// CHECK: uhadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x04,0x62,0x6e] +// CHECK: uhadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x04,0xa2,0x2e] +// CHECK: uhadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x04,0xa2,0x6e] + +//------------------------------------------------------------------------------ +// Vector Integer Halving Sub (Signed) +//------------------------------------------------------------------------------ + shsub v0.8b, v1.8b, v2.8b + shsub v0.16b, v1.16b, v2.16b + shsub v0.4h, v1.4h, v2.4h + shsub v0.8h, v1.8h, v2.8h + shsub v0.2s, v1.2s, v2.2s + shsub v0.4s, v1.4s, v2.4s + +// CHECK: shsub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x24,0x22,0x0e] +// CHECK: shsub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x24,0x22,0x4e] +// CHECK: shsub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x24,0x62,0x0e] +// CHECK: shsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x24,0x62,0x4e] +// CHECK: shsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x24,0xa2,0x0e] +// CHECK: shsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x24,0xa2,0x4e] + +//------------------------------------------------------------------------------ +// Vector Integer Halving Sub (Unsigned) +//------------------------------------------------------------------------------ + uhsub v0.8b, v1.8b, v2.8b + uhsub v0.16b, v1.16b, v2.16b + uhsub v0.4h, v1.4h, v2.4h + uhsub v0.8h, v1.8h, v2.8h + uhsub v0.2s, v1.2s, v2.2s + uhsub v0.4s, v1.4s, v2.4s + +// CHECK: uhsub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x24,0x22,0x2e] +// CHECK: uhsub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x24,0x22,0x6e] +// CHECK: uhsub v0.4h, v1.4h, v2.4h // encoding: 
[0x20,0x24,0x62,0x2e] +// CHECK: uhsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x24,0x62,0x6e] +// CHECK: uhsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x24,0xa2,0x2e] +// CHECK: uhsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x24,0xa2,0x6e] + diff --git a/test/MC/AArch64/neon-max-min-pairwise.s b/test/MC/AArch64/neon-max-min-pairwise.s new file mode 100644 index 00000000000..8d2dadb1997 --- /dev/null +++ b/test/MC/AArch64/neon-max-min-pairwise.s @@ -0,0 +1,110 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//---------------------------------------------------------------------- +// Vector Maximum Pairwise (Signed and Unsigned Integer) +//---------------------------------------------------------------------- + smaxp v0.8b, v1.8b, v2.8b + smaxp v0.16b, v1.16b, v2.16b + smaxp v0.4h, v1.4h, v2.4h + smaxp v0.8h, v1.8h, v2.8h + smaxp v0.2s, v1.2s, v2.2s + smaxp v0.4s, v1.4s, v2.4s + +// CHECK: smaxp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xa4,0x22,0x0e] +// CHECK: smaxp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xa4,0x22,0x4e] +// CHECK: smaxp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xa4,0x62,0x0e] +// CHECK: smaxp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xa4,0x62,0x4e] +// CHECK: smaxp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xa4,0xa2,0x0e] +// CHECK: smaxp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xa4,0xa2,0x4e] + + umaxp v0.8b, v1.8b, v2.8b + umaxp v0.16b, v1.16b, v2.16b + umaxp v0.4h, v1.4h, v2.4h + umaxp v0.8h, v1.8h, v2.8h + umaxp v0.2s, v1.2s, v2.2s + umaxp v0.4s, v1.4s, v2.4s + +// CHECK: umaxp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xa4,0x22,0x2e] +// CHECK: umaxp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xa4,0x22,0x6e] +// CHECK: umaxp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xa4,0x62,0x2e] +// CHECK: umaxp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xa4,0x62,0x6e] +// CHECK: umaxp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xa4,0xa2,0x2e] +// CHECK: umaxp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xa4,0xa2,0x6e] + +//---------------------------------------------------------------------- +// Vector Minimum Pairwise (Signed and Unsigned Integer) +//---------------------------------------------------------------------- + sminp v0.8b, v1.8b, v2.8b + sminp v0.16b, v1.16b, v2.16b + sminp v0.4h, v1.4h, v2.4h + sminp v0.8h, v1.8h, v2.8h + sminp v0.2s, v1.2s, v2.2s + sminp v0.4s, v1.4s, v2.4s + +// CHECK: sminp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xac,0x22,0x0e] +// CHECK: sminp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xac,0x22,0x4e] +// CHECK: sminp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xac,0x62,0x0e] +// CHECK: sminp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xac,0x62,0x4e] +// CHECK: sminp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xac,0xa2,0x0e] +// CHECK: sminp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xac,0xa2,0x4e] + + uminp v0.8b, v1.8b, v2.8b + uminp v0.16b, v1.16b, v2.16b + uminp v0.4h, v1.4h, v2.4h + uminp v0.8h, v1.8h, v2.8h + uminp v0.2s, v1.2s, v2.2s + uminp v0.4s, v1.4s, v2.4s + +// CHECK: uminp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xac,0x22,0x2e] +// CHECK: uminp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xac,0x22,0x6e] +// CHECK: uminp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xac,0x62,0x2e] +// CHECK: uminp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xac,0x62,0x6e] +// CHECK: uminp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xac,0xa2,0x2e] +// CHECK: uminp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xac,0xa2,0x6e] + 
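+// Note: the pairwise ops reduce adjacent element pairs taken from the
+// concatenation of the two source vectors, e.g. lane 0 of smaxp is
+// max(Vn[0], Vn[1]), rather than combining the sources lane-by-lane.
+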
+//---------------------------------------------------------------------- +// Vector Maximum Pairwise (Floating Point) +//---------------------------------------------------------------------- + fmaxp v0.2s, v1.2s, v2.2s + fmaxp v31.4s, v15.4s, v16.4s + fmaxp v7.2d, v8.2d, v25.2d + +// CHECK: fmaxp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xf4,0x22,0x2e] +// CHECK: fmaxp v31.4s, v15.4s, v16.4s // encoding: [0xff,0xf5,0x30,0x6e] +// CHECK: fmaxp v7.2d, v8.2d, v25.2d // encoding: [0x07,0xf5,0x79,0x6e] + +//---------------------------------------------------------------------- +// Vector Minimum Pairwise (Floating Point) +//---------------------------------------------------------------------- + fminp v10.2s, v15.2s, v22.2s + fminp v3.4s, v5.4s, v6.4s + fminp v17.2d, v13.2d, v2.2d + +// CHECK: fminp v10.2s, v15.2s, v22.2s // encoding: [0xea,0xf5,0xb6,0x2e] +// CHECK: fminp v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xf4,0xa6,0x6e] +// CHECK: fminp v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xf5,0xe2,0x6e] + +//---------------------------------------------------------------------- +// Vector maxNum Pairwise (Floating Point) +//---------------------------------------------------------------------- + fmaxnmp v0.2s, v1.2s, v2.2s + fmaxnmp v31.4s, v15.4s, v16.4s + fmaxnmp v7.2d, v8.2d, v25.2d + +// CHECK: fmaxnmp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xc4,0x22,0x2e] +// CHECK: fmaxnmp v31.4s, v15.4s, v16.4s // encoding: [0xff,0xc5,0x30,0x6e] +// CHECK: fmaxnmp v7.2d, v8.2d, v25.2d // encoding: [0x07,0xc5,0x79,0x6e] + +//---------------------------------------------------------------------- +// Vector minNum Pairwise (Floating Point) +//---------------------------------------------------------------------- + fminnmp v10.2s, v15.2s, v22.2s + fminnmp v3.4s, v5.4s, v6.4s + fminnmp v17.2d, v13.2d, v2.2d + +// CHECK: fminnmp v10.2s, v15.2s, v22.2s // encoding: [0xea,0xc5,0xb6,0x2e] +// CHECK: fminnmp v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xc4,0xa6,0x6e] +// CHECK: fminnmp v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xc5,0xe2,0x6e] + diff --git a/test/MC/AArch64/neon-max-min.s b/test/MC/AArch64/neon-max-min.s new file mode 100644 index 00000000000..6d1efde5077 --- /dev/null +++ b/test/MC/AArch64/neon-max-min.s @@ -0,0 +1,110 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//---------------------------------------------------------------------- +// Vector Maximum (Signed and Unsigned Integer) +//---------------------------------------------------------------------- + smax v0.8b, v1.8b, v2.8b + smax v0.16b, v1.16b, v2.16b + smax v0.4h, v1.4h, v2.4h + smax v0.8h, v1.8h, v2.8h + smax v0.2s, v1.2s, v2.2s + smax v0.4s, v1.4s, v2.4s + +// CHECK: smax v0.8b, v1.8b, v2.8b // encoding: [0x20,0x64,0x22,0x0e] +// CHECK: smax v0.16b, v1.16b, v2.16b // encoding: [0x20,0x64,0x22,0x4e] +// CHECK: smax v0.4h, v1.4h, v2.4h // encoding: [0x20,0x64,0x62,0x0e] +// CHECK: smax v0.8h, v1.8h, v2.8h // encoding: [0x20,0x64,0x62,0x4e] +// CHECK: smax v0.2s, v1.2s, v2.2s // encoding: [0x20,0x64,0xa2,0x0e] +// CHECK: smax v0.4s, v1.4s, v2.4s // encoding: [0x20,0x64,0xa2,0x4e] + + umax v0.8b, v1.8b, v2.8b + umax v0.16b, v1.16b, v2.16b + umax v0.4h, v1.4h, v2.4h + umax v0.8h, v1.8h, v2.8h + umax v0.2s, v1.2s, v2.2s + umax v0.4s, v1.4s, v2.4s + +// CHECK: umax v0.8b, v1.8b, v2.8b // encoding: [0x20,0x64,0x22,0x2e] +// CHECK: umax v0.16b, v1.16b, v2.16b // encoding: [0x20,0x64,0x22,0x6e] +// CHECK: umax v0.4h, v1.4h, 
v2.4h // encoding: [0x20,0x64,0x62,0x2e] +// CHECK: umax v0.8h, v1.8h, v2.8h // encoding: [0x20,0x64,0x62,0x6e] +// CHECK: umax v0.2s, v1.2s, v2.2s // encoding: [0x20,0x64,0xa2,0x2e] +// CHECK: umax v0.4s, v1.4s, v2.4s // encoding: [0x20,0x64,0xa2,0x6e] + +//---------------------------------------------------------------------- +// Vector Minimum (Signed and Unsigned Integer) +//---------------------------------------------------------------------- + smin v0.8b, v1.8b, v2.8b + smin v0.16b, v1.16b, v2.16b + smin v0.4h, v1.4h, v2.4h + smin v0.8h, v1.8h, v2.8h + smin v0.2s, v1.2s, v2.2s + smin v0.4s, v1.4s, v2.4s + +// CHECK: smin v0.8b, v1.8b, v2.8b // encoding: [0x20,0x6c,0x22,0x0e] +// CHECK: smin v0.16b, v1.16b, v2.16b // encoding: [0x20,0x6c,0x22,0x4e] +// CHECK: smin v0.4h, v1.4h, v2.4h // encoding: [0x20,0x6c,0x62,0x0e] +// CHECK: smin v0.8h, v1.8h, v2.8h // encoding: [0x20,0x6c,0x62,0x4e] +// CHECK: smin v0.2s, v1.2s, v2.2s // encoding: [0x20,0x6c,0xa2,0x0e] +// CHECK: smin v0.4s, v1.4s, v2.4s // encoding: [0x20,0x6c,0xa2,0x4e] + + umin v0.8b, v1.8b, v2.8b + umin v0.16b, v1.16b, v2.16b + umin v0.4h, v1.4h, v2.4h + umin v0.8h, v1.8h, v2.8h + umin v0.2s, v1.2s, v2.2s + umin v0.4s, v1.4s, v2.4s + +// CHECK: umin v0.8b, v1.8b, v2.8b // encoding: [0x20,0x6c,0x22,0x2e] +// CHECK: umin v0.16b, v1.16b, v2.16b // encoding: [0x20,0x6c,0x22,0x6e] +// CHECK: umin v0.4h, v1.4h, v2.4h // encoding: [0x20,0x6c,0x62,0x2e] +// CHECK: umin v0.8h, v1.8h, v2.8h // encoding: [0x20,0x6c,0x62,0x6e] +// CHECK: umin v0.2s, v1.2s, v2.2s // encoding: [0x20,0x6c,0xa2,0x2e] +// CHECK: umin v0.4s, v1.4s, v2.4s // encoding: [0x20,0x6c,0xa2,0x6e] + +//---------------------------------------------------------------------- +// Vector Maximum (Floating Point) +//---------------------------------------------------------------------- + fmax v0.2s, v1.2s, v2.2s + fmax v31.4s, v15.4s, v16.4s + fmax v7.2d, v8.2d, v25.2d + +// CHECK: fmax v0.2s, v1.2s, v2.2s // encoding: [0x20,0xf4,0x22,0x0e] +// CHECK: fmax v31.4s, v15.4s, v16.4s // encoding: [0xff,0xf5,0x30,0x4e] +// CHECK: fmax v7.2d, v8.2d, v25.2d // encoding: [0x07,0xf5,0x79,0x4e] + +//---------------------------------------------------------------------- +// Vector Minimum (Floating Point) +//---------------------------------------------------------------------- + fmin v10.2s, v15.2s, v22.2s + fmin v3.4s, v5.4s, v6.4s + fmin v17.2d, v13.2d, v2.2d + +// CHECK: fmin v10.2s, v15.2s, v22.2s // encoding: [0xea,0xf5,0xb6,0x0e] +// CHECK: fmin v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xf4,0xa6,0x4e] +// CHECK: fmin v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xf5,0xe2,0x4e] + +//---------------------------------------------------------------------- +// Vector maxNum (Floating Point) +//---------------------------------------------------------------------- + fmaxnm v0.2s, v1.2s, v2.2s + fmaxnm v31.4s, v15.4s, v16.4s + fmaxnm v7.2d, v8.2d, v25.2d + +// CHECK: fmaxnm v0.2s, v1.2s, v2.2s // encoding: [0x20,0xc4,0x22,0x0e] +// CHECK: fmaxnm v31.4s, v15.4s, v16.4s // encoding: [0xff,0xc5,0x30,0x4e] +// CHECK: fmaxnm v7.2d, v8.2d, v25.2d // encoding: [0x07,0xc5,0x79,0x4e] + +//---------------------------------------------------------------------- +// Vector minNum (Floating Point) +//---------------------------------------------------------------------- + fminnm v10.2s, v15.2s, v22.2s + fminnm v3.4s, v5.4s, v6.4s + fminnm v17.2d, v13.2d, v2.2d + +// CHECK: fminnm v10.2s, v15.2s, v22.2s // encoding: [0xea,0xc5,0xb6,0x0e] +// CHECK: fminnm v3.4s, v5.4s, v6.4s // encoding: 
[0xa3,0xc4,0xa6,0x4e] +// CHECK: fminnm v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xc5,0xe2,0x4e] + diff --git a/test/MC/AArch64/neon-mla-mls-instructions.s b/test/MC/AArch64/neon-mla-mls-instructions.s new file mode 100644 index 00000000000..3072e6f1200 --- /dev/null +++ b/test/MC/AArch64/neon-mla-mls-instructions.s @@ -0,0 +1,61 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//---------------------------------------------------------------------- +// Vector Integer Multiply-accumulate +//---------------------------------------------------------------------- + mla v0.8b, v1.8b, v2.8b + mla v0.16b, v1.16b, v2.16b + mla v0.4h, v1.4h, v2.4h + mla v0.8h, v1.8h, v2.8h + mla v0.2s, v1.2s, v2.2s + mla v0.4s, v1.4s, v2.4s + +// CHECK: mla v0.8b, v1.8b, v2.8b // encoding: [0x20,0x94,0x22,0x0e] +// CHECK: mla v0.16b, v1.16b, v2.16b // encoding: [0x20,0x94,0x22,0x4e] +// CHECK: mla v0.4h, v1.4h, v2.4h // encoding: [0x20,0x94,0x62,0x0e] +// CHECK: mla v0.8h, v1.8h, v2.8h // encoding: [0x20,0x94,0x62,0x4e] +// CHECK: mla v0.2s, v1.2s, v2.2s // encoding: [0x20,0x94,0xa2,0x0e] +// CHECK: mla v0.4s, v1.4s, v2.4s // encoding: [0x20,0x94,0xa2,0x4e] + + +//---------------------------------------------------------------------- +// Vector Integer Multiply-subtract +//---------------------------------------------------------------------- + mls v0.8b, v1.8b, v2.8b + mls v0.16b, v1.16b, v2.16b + mls v0.4h, v1.4h, v2.4h + mls v0.8h, v1.8h, v2.8h + mls v0.2s, v1.2s, v2.2s + mls v0.4s, v1.4s, v2.4s + +// CHECK: mls v0.8b, v1.8b, v2.8b // encoding: [0x20,0x94,0x22,0x2e] +// CHECK: mls v0.16b, v1.16b, v2.16b // encoding: [0x20,0x94,0x22,0x6e] +// CHECK: mls v0.4h, v1.4h, v2.4h // encoding: [0x20,0x94,0x62,0x2e] +// CHECK: mls v0.8h, v1.8h, v2.8h // encoding: [0x20,0x94,0x62,0x6e] +// CHECK: mls v0.2s, v1.2s, v2.2s // encoding: [0x20,0x94,0xa2,0x2e] +// CHECK: mls v0.4s, v1.4s, v2.4s // encoding: [0x20,0x94,0xa2,0x6e] + +//---------------------------------------------------------------------- +// Vector Floating-Point Multiply-accumulate +//---------------------------------------------------------------------- + fmla v0.2s, v1.2s, v2.2s + fmla v0.4s, v1.4s, v2.4s + fmla v0.2d, v1.2d, v2.2d + +// CHECK: fmla v0.2s, v1.2s, v2.2s // encoding: [0x20,0xcc,0x22,0x0e] +// CHECK: fmla v0.4s, v1.4s, v2.4s // encoding: [0x20,0xcc,0x22,0x4e] +// CHECK: fmla v0.2d, v1.2d, v2.2d // encoding: [0x20,0xcc,0x62,0x4e] + +//---------------------------------------------------------------------- +// Vector Floating-Point Multiply-subtract +//---------------------------------------------------------------------- + fmls v0.2s, v1.2s, v2.2s + fmls v0.4s, v1.4s, v2.4s + fmls v0.2d, v1.2d, v2.2d + +// CHECK: fmls v0.2s, v1.2s, v2.2s // encoding: [0x20,0xcc,0xa2,0x0e] +// CHECK: fmls v0.4s, v1.4s, v2.4s // encoding: [0x20,0xcc,0xa2,0x4e] +// CHECK: fmls v0.2d, v1.2d, v2.2d // encoding: [0x20,0xcc,0xe2,0x4e] + diff --git a/test/MC/AArch64/neon-mov.s b/test/MC/AArch64/neon-mov.s new file mode 100644 index 00000000000..83313724107 --- /dev/null +++ b/test/MC/AArch64/neon-mov.s @@ -0,0 +1,207 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + + +//---------------------------------------------------------------------- +// Vector Move Immediate Shifted 
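+// (an 8-bit immediate, optionally shifted left by 8, 16 or 24 bits
+//  within each lane)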
+//---------------------------------------------------------------------- + movi v0.2s, #1 + movi v1.2s, #0 + movi v15.2s, #1, lsl #8 + movi v16.2s, #1, lsl #16 + movi v31.2s, #1, lsl #24 + movi v0.4s, #1 + movi v0.4s, #1, lsl #8 + movi v0.4s, #1, lsl #16 + movi v0.4s, #1, lsl #24 + movi v0.4h, #1 + movi v0.4h, #1, lsl #8 + movi v0.8h, #1 + movi v0.8h, #1, lsl #8 + +// CHECK: movi v0.2s, #0x1 // encoding: [0x20,0x04,0x00,0x0f] +// CHECK: movi v1.2s, #0x0 // encoding: [0x01,0x04,0x00,0x0f] +// CHECK: movi v15.2s, #0x1, lsl #8 // encoding: [0x2f,0x24,0x00,0x0f] +// CHECK: movi v16.2s, #0x1, lsl #16 // encoding: [0x30,0x44,0x00,0x0f] +// CHECK: movi v31.2s, #0x1, lsl #24 // encoding: [0x3f,0x64,0x00,0x0f] +// CHECK: movi v0.4s, #0x1 // encoding: [0x20,0x04,0x00,0x4f] +// CHECK: movi v0.4s, #0x1, lsl #8 // encoding: [0x20,0x24,0x00,0x4f] +// CHECK: movi v0.4s, #0x1, lsl #16 // encoding: [0x20,0x44,0x00,0x4f] +// CHECK: movi v0.4s, #0x1, lsl #24 // encoding: [0x20,0x64,0x00,0x4f] +// CHECK: movi v0.4h, #0x1 // encoding: [0x20,0x84,0x00,0x0f] +// CHECK: movi v0.4h, #0x1, lsl #8 // encoding: [0x20,0xa4,0x00,0x0f] +// CHECK: movi v0.8h, #0x1 // encoding: [0x20,0x84,0x00,0x4f] +// CHECK: movi v0.8h, #0x1, lsl #8 // encoding: [0x20,0xa4,0x00,0x4f] + +//---------------------------------------------------------------------- +// Vector Move Inverted Immediate Shifted +//---------------------------------------------------------------------- + mvni v0.2s, #1 + mvni v1.2s, #0 + mvni v0.2s, #1, lsl #8 + mvni v0.2s, #1, lsl #16 + mvni v0.2s, #1, lsl #24 + mvni v0.4s, #1 + mvni v15.4s, #1, lsl #8 + mvni v16.4s, #1, lsl #16 + mvni v31.4s, #1, lsl #24 + mvni v0.4h, #1 + mvni v0.4h, #1, lsl #8 + mvni v0.8h, #1 + mvni v0.8h, #1, lsl #8 + +// CHECK: mvni v0.2s, #0x1 // encoding: [0x20,0x04,0x00,0x2f] +// CHECK: mvni v1.2s, #0x0 // encoding: [0x01,0x04,0x00,0x2f] +// CHECK: mvni v0.2s, #0x1, lsl #8 // encoding: [0x20,0x24,0x00,0x2f] +// CHECK: mvni v0.2s, #0x1, lsl #16 // encoding: [0x20,0x44,0x00,0x2f] +// CHECK: mvni v0.2s, #0x1, lsl #24 // encoding: [0x20,0x64,0x00,0x2f] +// CHECK: mvni v0.4s, #0x1 // encoding: [0x20,0x04,0x00,0x6f] +// CHECK: mvni v15.4s, #0x1, lsl #8 // encoding: [0x2f,0x24,0x00,0x6f] +// CHECK: mvni v16.4s, #0x1, lsl #16 // encoding: [0x30,0x44,0x00,0x6f] +// CHECK: mvni v31.4s, #0x1, lsl #24 // encoding: [0x3f,0x64,0x00,0x6f] +// CHECK: mvni v0.4h, #0x1 // encoding: [0x20,0x84,0x00,0x2f] +// CHECK: mvni v0.4h, #0x1, lsl #8 // encoding: [0x20,0xa4,0x00,0x2f] +// CHECK: mvni v0.8h, #0x1 // encoding: [0x20,0x84,0x00,0x6f] +// CHECK: mvni v0.8h, #0x1, lsl #8 // encoding: [0x20,0xa4,0x00,0x6f] + +//---------------------------------------------------------------------- +// Vector Bitwise Bit Clear (AND NOT) - immediate +//---------------------------------------------------------------------- + bic v0.2s, #1 + bic v1.2s, #0 + bic v0.2s, #1, lsl #8 + bic v0.2s, #1, lsl #16 + bic v0.2s, #1, lsl #24 + bic v0.4s, #1 + bic v0.4s, #1, lsl #8 + bic v0.4s, #1, lsl #16 + bic v0.4s, #1, lsl #24 + bic v15.4h, #1 + bic v16.4h, #1, lsl #8 + bic v0.8h, #1 + bic v31.8h, #1, lsl #8 + +// CHECK: bic v0.2s, #0x1 // encoding: [0x20,0x14,0x00,0x2f] +// CHECK: bic v1.2s, #0x0 // encoding: [0x01,0x14,0x00,0x2f] +// CHECK: bic v0.2s, #0x1, lsl #8 // encoding: [0x20,0x34,0x00,0x2f] +// CHECK: bic v0.2s, #0x1, lsl #16 // encoding: [0x20,0x54,0x00,0x2f] +// CHECK: bic v0.2s, #0x1, lsl #24 // encoding: [0x20,0x74,0x00,0x2f] +// CHECK: bic v0.4s, #0x1 // encoding: [0x20,0x14,0x00,0x6f] +// CHECK: bic v0.4s, #0x1, lsl #8 // 
encoding: [0x20,0x34,0x00,0x6f]
+// CHECK: bic v0.4s, #0x1, lsl #16 // encoding: [0x20,0x54,0x00,0x6f]
+// CHECK: bic v0.4s, #0x1, lsl #24 // encoding: [0x20,0x74,0x00,0x6f]
+// CHECK: bic v15.4h, #0x1 // encoding: [0x2f,0x94,0x00,0x2f]
+// CHECK: bic v16.4h, #0x1, lsl #8 // encoding: [0x30,0xb4,0x00,0x2f]
+// CHECK: bic v0.8h, #0x1 // encoding: [0x20,0x94,0x00,0x6f]
+// CHECK: bic v31.8h, #0x1, lsl #8 // encoding: [0x3f,0xb4,0x00,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Bitwise OR - immediate
+//----------------------------------------------------------------------
+ orr v0.2s, #1
+ orr v1.2s, #0
+ orr v0.2s, #1, lsl #8
+ orr v0.2s, #1, lsl #16
+ orr v0.2s, #1, lsl #24
+ orr v0.4s, #1
+ orr v0.4s, #1, lsl #8
+ orr v0.4s, #1, lsl #16
+ orr v0.4s, #1, lsl #24
+ orr v31.4h, #1
+ orr v15.4h, #1, lsl #8
+ orr v0.8h, #1
+ orr v16.8h, #1, lsl #8
+
+// CHECK: orr v0.2s, #0x1 // encoding: [0x20,0x14,0x00,0x0f]
+// CHECK: orr v1.2s, #0x0 // encoding: [0x01,0x14,0x00,0x0f]
+// CHECK: orr v0.2s, #0x1, lsl #8 // encoding: [0x20,0x34,0x00,0x0f]
+// CHECK: orr v0.2s, #0x1, lsl #16 // encoding: [0x20,0x54,0x00,0x0f]
+// CHECK: orr v0.2s, #0x1, lsl #24 // encoding: [0x20,0x74,0x00,0x0f]
+// CHECK: orr v0.4s, #0x1 // encoding: [0x20,0x14,0x00,0x4f]
+// CHECK: orr v0.4s, #0x1, lsl #8 // encoding: [0x20,0x34,0x00,0x4f]
+// CHECK: orr v0.4s, #0x1, lsl #16 // encoding: [0x20,0x54,0x00,0x4f]
+// CHECK: orr v0.4s, #0x1, lsl #24 // encoding: [0x20,0x74,0x00,0x4f]
+// CHECK: orr v31.4h, #0x1 // encoding: [0x3f,0x94,0x00,0x0f]
+// CHECK: orr v15.4h, #0x1, lsl #8 // encoding: [0x2f,0xb4,0x00,0x0f]
+// CHECK: orr v0.8h, #0x1 // encoding: [0x20,0x94,0x00,0x4f]
+// CHECK: orr v16.8h, #0x1, lsl #8 // encoding: [0x30,0xb4,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Immediate Masked
+//----------------------------------------------------------------------
+ movi v0.2s, #1, msl #8
+ movi v1.2s, #1, msl #16
+ movi v0.4s, #1, msl #8
+ movi v31.4s, #1, msl #16
+
+// CHECK: movi v0.2s, #0x1, msl #8 // encoding: [0x20,0xc4,0x00,0x0f]
+// CHECK: movi v1.2s, #0x1, msl #16 // encoding: [0x21,0xd4,0x00,0x0f]
+// CHECK: movi v0.4s, #0x1, msl #8 // encoding: [0x20,0xc4,0x00,0x4f]
+// CHECK: movi v31.4s, #0x1, msl #16 // encoding: [0x3f,0xd4,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Inverted Immediate Masked
+//----------------------------------------------------------------------
+ mvni v1.2s, #0x1, msl #8
+ mvni v0.2s, #0x1, msl #16
+ mvni v31.4s, #0x1, msl #8
+ mvni v0.4s, #0x1, msl #16
+
+// CHECK: mvni v1.2s, #0x1, msl #8 // encoding: [0x21,0xc4,0x00,0x2f]
+// CHECK: mvni v0.2s, #0x1, msl #16 // encoding: [0x20,0xd4,0x00,0x2f]
+// CHECK: mvni v31.4s, #0x1, msl #8 // encoding: [0x3f,0xc4,0x00,0x6f]
+// CHECK: mvni v0.4s, #0x1, msl #16 // encoding: [0x20,0xd4,0x00,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Immediate - per byte
+//----------------------------------------------------------------------
+ movi v0.8b, #0
+ movi v31.8b, #0xff
+ movi v15.16b, #0xf
+ movi v31.16b, #0x1f
+
+// CHECK: movi v0.8b, #0x0 // encoding: [0x00,0xe4,0x00,0x0f]
+// CHECK: movi v31.8b, #0xff // encoding: [0xff,0xe7,0x07,0x0f]
+// CHECK: movi v15.16b, #0xf // encoding: [0xef,0xe5,0x00,0x4f]
+// CHECK: movi v31.16b, #0x1f // encoding: [0xff,0xe7,0x00,0x4f]
+
+//----------------------------------------------------------------------
+//
Vector Move Immediate - bytemask, per doubleword +//--------------------------------------------------------------------- + movi v0.2d, #0xff00ff00ff00ff00 + +// CHECK: movi v0.2d, #0xff00ff00ff00ff00 // encoding: [0x40,0xe5,0x05,0x6f] + +//---------------------------------------------------------------------- +// Vector Move Immediate - bytemask, one doubleword +//---------------------------------------------------------------------- + movi d0, #0xff00ff00ff00ff00 + +// CHECK: movi d0, #0xff00ff00ff00ff00 // encoding: [0x40,0xe5,0x05,0x2f] + +//---------------------------------------------------------------------- +// Vector Floating Point Move Immediate +//---------------------------------------------------------------------- + fmov v1.2s, #1.0 + fmov v15.4s, #1.0 + fmov v31.2d, #1.0 + +// CHECK: fmov v1.2s, #1.00000000 // encoding: [0x01,0xf6,0x03,0x0f] +// CHECK: fmov v15.4s, #1.00000000 // encoding: [0x0f,0xf6,0x03,0x4f] +// CHECK: fmov v31.2d, #1.00000000 // encoding: [0x1f,0xf6,0x03,0x6f] + + +//---------------------------------------------------------------------- +// Vector Move - register +//---------------------------------------------------------------------- + mov v0.8b, v31.8b + mov v15.16b, v16.16b + orr v0.8b, v31.8b, v31.8b + orr v15.16b, v16.16b, v16.16b + +// CHECK: mov v0.8b, v31.8b // encoding: [0xe0,0x1f,0xbf,0x0e] +// CHECK: mov v15.16b, v16.16b // encoding: [0x0f,0x1e,0xb0,0x4e] +// CHECK: mov v0.8b, v31.8b // encoding: [0xe0,0x1f,0xbf,0x0e] +// CHECK: mov v15.16b, v16.16b // encoding: [0x0f,0x1e,0xb0,0x4e] + diff --git a/test/MC/AArch64/neon-mul-div-instructions.s b/test/MC/AArch64/neon-mul-div-instructions.s new file mode 100644 index 00000000000..1fe6d2b819c --- /dev/null +++ b/test/MC/AArch64/neon-mul-div-instructions.s @@ -0,0 +1,86 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//---------------------------------------------------------------------- +// Vector Integer Mul +//---------------------------------------------------------------------- + mul v0.8b, v1.8b, v2.8b + mul v0.16b, v1.16b, v2.16b + mul v0.4h, v1.4h, v2.4h + mul v0.8h, v1.8h, v2.8h + mul v0.2s, v1.2s, v2.2s + mul v0.4s, v1.4s, v2.4s + +// CHECK: mul v0.8b, v1.8b, v2.8b // encoding: [0x20,0x9c,0x22,0x0e] +// CHECK: mul v0.16b, v1.16b, v2.16b // encoding: [0x20,0x9c,0x22,0x4e] +// CHECK: mul v0.4h, v1.4h, v2.4h // encoding: [0x20,0x9c,0x62,0x0e] +// CHECK: mul v0.8h, v1.8h, v2.8h // encoding: [0x20,0x9c,0x62,0x4e] +// CHECK: mul v0.2s, v1.2s, v2.2s // encoding: [0x20,0x9c,0xa2,0x0e] +// CHECK: mul v0.4s, v1.4s, v2.4s // encoding: [0x20,0x9c,0xa2,0x4e] + + +//---------------------------------------------------------------------- +// Vector Floating-Point Mul +//---------------------------------------------------------------------- + fmul v0.2s, v1.2s, v2.2s + fmul v0.4s, v1.4s, v2.4s + fmul v0.2d, v1.2d, v2.2d + +// CHECK: fmul v0.2s, v1.2s, v2.2s // encoding: [0x20,0xdc,0x22,0x2e] +// CHECK: fmul v0.4s, v1.4s, v2.4s // encoding: [0x20,0xdc,0x22,0x6e] +// CHECK: fmul v0.2d, v1.2d, v2.2d // encoding: [0x20,0xdc,0x62,0x6e] + +//---------------------------------------------------------------------- +// Vector Floating-Point Div +//---------------------------------------------------------------------- + fdiv v0.2s, v1.2s, v2.2s + fdiv v0.4s, v1.4s, v2.4s + fdiv v0.2d, v1.2d, v2.2d + +// CHECK: fdiv v0.2s, v1.2s, v2.2s // encoding: [0x20,0xfc,0x22,0x2e] +// CHECK: fdiv 
v0.4s, v1.4s, v2.4s // encoding: [0x20,0xfc,0x22,0x6e]
+// CHECK: fdiv v0.2d, v1.2d, v2.2d // encoding: [0x20,0xfc,0x62,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Multiply (Polynomial)
+//----------------------------------------------------------------------
+ pmul v17.8b, v31.8b, v16.8b
+ pmul v0.16b, v1.16b, v2.16b
+
+// CHECK: pmul v17.8b, v31.8b, v16.8b // encoding: [0xf1,0x9f,0x30,0x2e]
+// CHECK: pmul v0.16b, v1.16b, v2.16b // encoding: [0x20,0x9c,0x22,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Saturating Doubling Multiply High
+//----------------------------------------------------------------------
+ sqdmulh v2.4h, v25.4h, v3.4h
+ sqdmulh v12.8h, v5.8h, v13.8h
+ sqdmulh v3.2s, v1.2s, v30.2s
+
+// CHECK: sqdmulh v2.4h, v25.4h, v3.4h // encoding: [0x22,0xb7,0x63,0x0e]
+// CHECK: sqdmulh v12.8h, v5.8h, v13.8h // encoding: [0xac,0xb4,0x6d,0x4e]
+// CHECK: sqdmulh v3.2s, v1.2s, v30.2s // encoding: [0x23,0xb4,0xbe,0x0e]
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Doubling Multiply High
+//----------------------------------------------------------------------
+ sqrdmulh v2.4h, v25.4h, v3.4h
+ sqrdmulh v12.8h, v5.8h, v13.8h
+ sqrdmulh v3.2s, v1.2s, v30.2s
+
+// CHECK: sqrdmulh v2.4h, v25.4h, v3.4h // encoding: [0x22,0xb7,0x63,0x2e]
+// CHECK: sqrdmulh v12.8h, v5.8h, v13.8h // encoding: [0xac,0xb4,0x6d,0x6e]
+// CHECK: sqrdmulh v3.2s, v1.2s, v30.2s // encoding: [0x23,0xb4,0xbe,0x2e]
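+
+// (sqrdmulh is the rounding form of sqdmulh; the encodings differ only in
+//  the U bit, visible as 0x2e/0x6e vs 0x0e/0x4e in the final byte above)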
+
+//----------------------------------------------------------------------
+// Vector Multiply Extended
+//----------------------------------------------------------------------
+ fmulx v21.2s, v5.2s, v13.2s
+ fmulx v1.4s, v25.4s, v3.4s
+ fmulx v31.2d, v22.2d, v2.2d
+
+// CHECK: fmulx v21.2s, v5.2s, v13.2s // encoding: [0xb5,0xdc,0x2d,0x0e]
+// CHECK: fmulx v1.4s, v25.4s, v3.4s // encoding: [0x21,0xdf,0x23,0x4e]
+// CHECK: fmulx v31.2d, v22.2d, v2.2d // encoding: [0xdf,0xde,0x62,0x4e]
+
diff --git a/test/MC/AArch64/neon-rounding-halving-add.s b/test/MC/AArch64/neon-rounding-halving-add.s
new file mode 100644
index 00000000000..47ac2126802
--- /dev/null
+++ b/test/MC/AArch64/neon-rounding-halving-add.s
@@ -0,0 +1,39 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Signed)
+//------------------------------------------------------------------------------
+ srhadd v0.8b, v1.8b, v2.8b
+ srhadd v0.16b, v1.16b, v2.16b
+ srhadd v0.4h, v1.4h, v2.4h
+ srhadd v0.8h, v1.8h, v2.8h
+ srhadd v0.2s, v1.2s, v2.2s
+ srhadd v0.4s, v1.4s, v2.4s
+
+// CHECK: srhadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x14,0x22,0x0e]
+// CHECK: srhadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x14,0x22,0x4e]
+// CHECK: srhadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x14,0x62,0x0e]
+// CHECK: srhadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x14,0x62,0x4e]
+// CHECK: srhadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x14,0xa2,0x0e]
+// CHECK: srhadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x14,0xa2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Unsigned)
+//------------------------------------------------------------------------------
+ urhadd v0.8b, v1.8b, v2.8b
+ urhadd v0.16b, v1.16b, v2.16b
+ urhadd v0.4h, v1.4h, v2.4h
+ urhadd v0.8h, v1.8h, v2.8h
+ urhadd v0.2s, v1.2s, v2.2s
+ urhadd v0.4s, v1.4s, v2.4s
+
+// CHECK: urhadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x14,0x22,0x2e]
+// CHECK: urhadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x14,0x22,0x6e]
+// CHECK: urhadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x14,0x62,0x2e]
+// CHECK: urhadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x14,0x62,0x6e]
+// CHECK: urhadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x14,0xa2,0x2e]
+// CHECK: urhadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x14,0xa2,0x6e]
+
diff --git a/test/MC/AArch64/neon-rounding-shift.s b/test/MC/AArch64/neon-rounding-shift.s
new file mode 100644
index 00000000000..f3c70d7e38e
--- /dev/null
+++ b/test/MC/AArch64/neon-rounding-shift.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+ srshl v0.8b, v1.8b, v2.8b
+ srshl v0.16b, v1.16b, v2.16b
+ srshl v0.4h, v1.4h, v2.4h
+ srshl v0.8h, v1.8h, v2.8h
+ srshl v0.2s, v1.2s, v2.2s
+ srshl v0.4s, v1.4s, v2.4s
+ srshl v0.2d, v1.2d, v2.2d
+
+// CHECK: srshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x54,0x22,0x0e]
+// CHECK: srshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x54,0x22,0x4e]
+// CHECK: srshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x54,0x62,0x0e]
+// CHECK: srshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x54,0x62,0x4e]
+// CHECK: srshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x54,0xa2,0x0e]
+// CHECK: srshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x54,0xa2,0x4e]
+// CHECK: srshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x54,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ urshl v0.8b, v1.8b, v2.8b
+ urshl v0.16b, v1.16b, v2.16b
+ urshl v0.4h, v1.4h, v2.4h
+ urshl v0.8h, v1.8h, v2.8h
+ urshl v0.2s, v1.2s, v2.2s
+ urshl v0.4s, v1.4s, v2.4s
+ urshl v0.2d, v1.2d, v2.2d
+
+// CHECK: urshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x54,0x22,0x2e]
+// CHECK: urshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x54,0x22,0x6e]
+// CHECK: urshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x54,0x62,0x2e]
+// CHECK: urshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x54,0x62,0x6e]
+// CHECK: urshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x54,0xa2,0x2e]
+// CHECK: urshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x54,0xa2,0x6e]
+// CHECK: urshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x54,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+ srshl d17, d31, d8
+
+// CHECK: srshl d17, d31, d8 // encoding: [0xf1,0x57,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ urshl d17, d31, d8
+
+// CHECK: urshl d17, d31, d8 // encoding: [0xf1,0x57,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-saturating-add-sub.s b/test/MC/AArch64/neon-saturating-add-sub.s
new file mode 100644
index 00000000000..1032ae47e20 --- /dev/null +++ b/test/MC/AArch64/neon-saturating-add-sub.s @@ -0,0 +1,133 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + + +//------------------------------------------------------------------------------ +// Vector Integer Saturating Add (Signed) +//------------------------------------------------------------------------------ + sqadd v0.8b, v1.8b, v2.8b + sqadd v0.16b, v1.16b, v2.16b + sqadd v0.4h, v1.4h, v2.4h + sqadd v0.8h, v1.8h, v2.8h + sqadd v0.2s, v1.2s, v2.2s + sqadd v0.4s, v1.4s, v2.4s + sqadd v0.2d, v1.2d, v2.2d + +// CHECK: sqadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x0c,0x22,0x0e] +// CHECK: sqadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x0c,0x22,0x4e] +// CHECK: sqadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x0c,0x62,0x0e] +// CHECK: sqadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x0c,0x62,0x4e] +// CHECK: sqadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x0c,0xa2,0x0e] +// CHECK: sqadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x0c,0xa2,0x4e] +// CHECK: sqadd v0.2d, v1.2d, v2.2d // encoding: [0x20,0x0c,0xe2,0x4e] + +//------------------------------------------------------------------------------ +// Vector Integer Saturating Add (Unsigned) +//------------------------------------------------------------------------------ + uqadd v0.8b, v1.8b, v2.8b + uqadd v0.16b, v1.16b, v2.16b + uqadd v0.4h, v1.4h, v2.4h + uqadd v0.8h, v1.8h, v2.8h + uqadd v0.2s, v1.2s, v2.2s + uqadd v0.4s, v1.4s, v2.4s + uqadd v0.2d, v1.2d, v2.2d + +// CHECK: uqadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x0c,0x22,0x2e] +// CHECK: uqadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x0c,0x22,0x6e] +// CHECK: uqadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x0c,0x62,0x2e] +// CHECK: uqadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x0c,0x62,0x6e] +// CHECK: uqadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x0c,0xa2,0x2e] +// CHECK: uqadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x0c,0xa2,0x6e] +// CHECK: uqadd v0.2d, v1.2d, v2.2d // encoding: [0x20,0x0c,0xe2,0x6e] + +//------------------------------------------------------------------------------ +// Vector Integer Saturating Sub (Signed) +//------------------------------------------------------------------------------ + sqsub v0.8b, v1.8b, v2.8b + sqsub v0.16b, v1.16b, v2.16b + sqsub v0.4h, v1.4h, v2.4h + sqsub v0.8h, v1.8h, v2.8h + sqsub v0.2s, v1.2s, v2.2s + sqsub v0.4s, v1.4s, v2.4s + sqsub v0.2d, v1.2d, v2.2d + +// CHECK: sqsub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x2c,0x22,0x0e] +// CHECK: sqsub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x2c,0x22,0x4e] +// CHECK: sqsub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x2c,0x62,0x0e] +// CHECK: sqsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x2c,0x62,0x4e] +// CHECK: sqsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x2c,0xa2,0x0e] +// CHECK: sqsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x2c,0xa2,0x4e] +// CHECK: sqsub v0.2d, v1.2d, v2.2d // encoding: [0x20,0x2c,0xe2,0x4e] + +//------------------------------------------------------------------------------ +// Vector Integer Saturating Sub (Unsigned) +//------------------------------------------------------------------------------ + uqsub v0.8b, v1.8b, v2.8b + uqsub v0.16b, v1.16b, v2.16b + uqsub v0.4h, v1.4h, v2.4h + uqsub v0.8h, v1.8h, v2.8h + uqsub v0.2s, v1.2s, v2.2s + uqsub v0.4s, v1.4s, v2.4s + uqsub v0.2d, v1.2d, v2.2d + +// CHECK: uqsub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x2c,0x22,0x2e] +// CHECK: uqsub v0.16b, v1.16b, 
v2.16b // encoding: [0x20,0x2c,0x22,0x6e]
+// CHECK: uqsub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x2c,0x62,0x2e]
+// CHECK: uqsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x2c,0x62,0x6e]
+// CHECK: uqsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x2c,0xa2,0x2e]
+// CHECK: uqsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x2c,0xa2,0x6e]
+// CHECK: uqsub v0.2d, v1.2d, v2.2d // encoding: [0x20,0x2c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Add (Signed)
+//------------------------------------------------------------------------------
+ sqadd b0, b1, b2
+ sqadd h10, h11, h12
+ sqadd s20, s21, s2
+ sqadd d17, d31, d8
+
+// CHECK: sqadd b0, b1, b2 // encoding: [0x20,0x0c,0x22,0x5e]
+// CHECK: sqadd h10, h11, h12 // encoding: [0x6a,0x0d,0x6c,0x5e]
+// CHECK: sqadd s20, s21, s2 // encoding: [0xb4,0x0e,0xa2,0x5e]
+// CHECK: sqadd d17, d31, d8 // encoding: [0xf1,0x0f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Add (Unsigned)
+//------------------------------------------------------------------------------
+ uqadd b0, b1, b2
+ uqadd h10, h11, h12
+ uqadd s20, s21, s2
+ uqadd d17, d31, d8
+
+// CHECK: uqadd b0, b1, b2 // encoding: [0x20,0x0c,0x22,0x7e]
+// CHECK: uqadd h10, h11, h12 // encoding: [0x6a,0x0d,0x6c,0x7e]
+// CHECK: uqadd s20, s21, s2 // encoding: [0xb4,0x0e,0xa2,0x7e]
+// CHECK: uqadd d17, d31, d8 // encoding: [0xf1,0x0f,0xe8,0x7e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Sub (Signed)
+//------------------------------------------------------------------------------
+ sqsub b0, b1, b2
+ sqsub h10, h11, h12
+ sqsub s20, s21, s2
+ sqsub d17, d31, d8
+
+// CHECK: sqsub b0, b1, b2 // encoding: [0x20,0x2c,0x22,0x5e]
+// CHECK: sqsub h10, h11, h12 // encoding: [0x6a,0x2d,0x6c,0x5e]
+// CHECK: sqsub s20, s21, s2 // encoding: [0xb4,0x2e,0xa2,0x5e]
+// CHECK: sqsub d17, d31, d8 // encoding: [0xf1,0x2f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Sub (Unsigned)
+//------------------------------------------------------------------------------
+ uqsub b0, b1, b2
+ uqsub h10, h11, h12
+ uqsub s20, s21, s2
+ uqsub d17, d31, d8
+
+// CHECK: uqsub b0, b1, b2 // encoding: [0x20,0x2c,0x22,0x7e]
+// CHECK: uqsub h10, h11, h12 // encoding: [0x6a,0x2d,0x6c,0x7e]
+// CHECK: uqsub s20, s21, s2 // encoding: [0xb4,0x2e,0xa2,0x7e]
+// CHECK: uqsub d17, d31, d8 // encoding: [0xf1,0x2f,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-saturating-rounding-shift.s b/test/MC/AArch64/neon-saturating-rounding-shift.s
new file mode 100644
index 00000000000..a36e68988e1
--- /dev/null
+++ b/test/MC/AArch64/neon-saturating-rounding-shift.s
@@ -0,0 +1,70 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sqrshl v0.8b, v1.8b, v2.8b
+ sqrshl v0.16b, v1.16b, v2.16b
+ sqrshl v0.4h, v1.4h, v2.4h
+ sqrshl v0.8h, v1.8h, v2.8h
+ sqrshl v0.2s, v1.2s, v2.2s
+ sqrshl v0.4s, v1.4s, v2.4s
+ sqrshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sqrshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x5c,0x22,0x0e]
+// CHECK: sqrshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x5c,0x22,0x4e]
+// CHECK: sqrshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x5c,0x62,0x0e]
+// CHECK: sqrshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x5c,0x62,0x4e]
+// CHECK: sqrshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x5c,0xa2,0x0e]
+// CHECK: sqrshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x5c,0xa2,0x4e]
+// CHECK: sqrshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x5c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ uqrshl v0.8b, v1.8b, v2.8b
+ uqrshl v0.16b, v1.16b, v2.16b
+ uqrshl v0.4h, v1.4h, v2.4h
+ uqrshl v0.8h, v1.8h, v2.8h
+ uqrshl v0.2s, v1.2s, v2.2s
+ uqrshl v0.4s, v1.4s, v2.4s
+ uqrshl v0.2d, v1.2d, v2.2d
+
+// CHECK: uqrshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x5c,0x22,0x2e]
+// CHECK: uqrshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x5c,0x22,0x6e]
+// CHECK: uqrshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x5c,0x62,0x2e]
+// CHECK: uqrshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x5c,0x62,0x6e]
+// CHECK: uqrshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x5c,0xa2,0x2e]
+// CHECK: uqrshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x5c,0xa2,0x6e]
+// CHECK: uqrshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x5c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sqrshl b0, b1, b2
+ sqrshl h10, h11, h12
+ sqrshl s20, s21, s2
+ sqrshl d17, d31, d8
+
+// CHECK: sqrshl b0, b1, b2 // encoding: [0x20,0x5c,0x22,0x5e]
+// CHECK: sqrshl h10, h11, h12 // encoding: [0x6a,0x5d,0x6c,0x5e]
+// CHECK: sqrshl s20, s21, s2 // encoding: [0xb4,0x5e,0xa2,0x5e]
+// CHECK: sqrshl d17, d31, d8 // encoding: [0xf1,0x5f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ uqrshl b0, b1, b2
+ uqrshl h10, h11, h12
+ uqrshl s20, s21, s2
+ uqrshl d17, d31, d8
+
+// CHECK: uqrshl b0, b1, b2 // encoding: [0x20,0x5c,0x22,0x7e]
+// CHECK: uqrshl h10, h11, h12 // encoding: [0x6a,0x5d,0x6c,0x7e]
+// CHECK: uqrshl s20, s21, s2 // encoding: [0xb4,0x5e,0xa2,0x7e]
+// CHECK: uqrshl d17, d31, d8 // encoding: [0xf1,0x5f,0xe8,0x7e]
+
+
diff --git a/test/MC/AArch64/neon-saturating-shift.s b/test/MC/AArch64/neon-saturating-shift.s
new file mode 100644
index 00000000000..2c8456db63e
--- /dev/null
+++ b/test/MC/AArch64/neon-saturating-shift.s
@@ -0,0 +1,69 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sqshl v0.8b, v1.8b, v2.8b
+ sqshl v0.16b, v1.16b, v2.16b
+ sqshl v0.4h, v1.4h, v2.4h
+ sqshl v0.8h, v1.8h, v2.8h
+ sqshl v0.2s, v1.2s, v2.2s
+ sqshl v0.4s, v1.4s, v2.4s
+ sqshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sqshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x4c,0x22,0x0e]
+// CHECK: sqshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x4c,0x22,0x4e]
+// CHECK: sqshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x4c,0x62,0x0e]
+// CHECK: sqshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x4c,0x62,0x4e]
+// CHECK: sqshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x4c,0xa2,0x0e]
+// CHECK: sqshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x4c,0xa2,0x4e]
+// CHECK: sqshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x4c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ uqshl v0.8b, v1.8b, v2.8b
+ uqshl v0.16b, v1.16b, v2.16b
+ uqshl v0.4h, v1.4h, v2.4h
+ uqshl v0.8h, v1.8h, v2.8h
+ uqshl v0.2s, v1.2s, v2.2s
+ uqshl v0.4s, v1.4s, v2.4s
+ uqshl v0.2d, v1.2d, v2.2d
+
+// CHECK: uqshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x4c,0x22,0x2e]
+// CHECK: uqshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x4c,0x22,0x6e]
+// CHECK: uqshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x4c,0x62,0x2e]
+// CHECK: uqshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x4c,0x62,0x6e]
+// CHECK: uqshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x4c,0xa2,0x2e]
+// CHECK: uqshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x4c,0xa2,0x6e]
+// CHECK: uqshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x4c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sqshl b0, b1, b2
+ sqshl h10, h11, h12
+ sqshl s20, s21, s2
+ sqshl d17, d31, d8
+
+// CHECK: sqshl b0, b1, b2 // encoding: [0x20,0x4c,0x22,0x5e]
+// CHECK: sqshl h10, h11, h12 // encoding: [0x6a,0x4d,0x6c,0x5e]
+// CHECK: sqshl s20, s21, s2 // encoding: [0xb4,0x4e,0xa2,0x5e]
+// CHECK: sqshl d17, d31, d8 // encoding: [0xf1,0x4f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ uqshl b0, b1, b2
+ uqshl h10, h11, h12
+ uqshl s20, s21, s2
+ uqshl d17, d31, d8
+
+// CHECK: uqshl b0, b1, b2 // encoding: [0x20,0x4c,0x22,0x7e]
+// CHECK: uqshl h10, h11, h12 // encoding: [0x6a,0x4d,0x6c,0x7e]
+// CHECK: uqshl s20, s21, s2 // encoding: [0xb4,0x4e,0xa2,0x7e]
+// CHECK: uqshl d17, d31, d8 // encoding: [0xf1,0x4f,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-shift.s b/test/MC/AArch64/neon-shift.s
new file mode 100644
index 00000000000..be1799e2c11
--- /dev/null
+++ b/test/MC/AArch64/neon-shift.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sshl v0.8b, v1.8b, v2.8b
+ sshl v0.16b, v1.16b, v2.16b
+ sshl v0.4h, v1.4h, v2.4h
+ sshl v0.8h, v1.8h, v2.8h
+ sshl v0.2s, v1.2s, v2.2s
+ sshl v0.4s, v1.4s, v2.4s
+ sshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x44,0x22,0x0e]
+// CHECK: sshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x44,0x22,0x4e]
+// CHECK: sshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x44,0x62,0x0e]
+// CHECK: sshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x44,0x62,0x4e]
+// CHECK: sshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x44,0xa2,0x0e]
+// CHECK: sshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x44,0xa2,0x4e]
+// CHECK: sshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x44,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ ushl v0.8b, v1.8b, v2.8b
+ ushl v0.16b, v1.16b, v2.16b
+ ushl v0.4h, v1.4h, v2.4h
+ ushl v0.8h, v1.8h, v2.8h
+ ushl v0.2s, v1.2s, v2.2s
+ ushl v0.4s, v1.4s, v2.4s
+ ushl v0.2d, v1.2d, v2.2d
+
+// CHECK: ushl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x44,0x22,0x2e]
+// CHECK: ushl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x44,0x22,0x6e]
+// CHECK: ushl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x44,0x62,0x2e]
+// CHECK: ushl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x44,0x62,0x6e]
+// CHECK: ushl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x44,0xa2,0x2e]
+// CHECK: ushl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x44,0xa2,0x6e]
+// CHECK: ushl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x44,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sshl d17, d31, d8
+
+// CHECK: sshl d17, d31, d8 // encoding: [0xf1,0x47,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ ushl d17, d31, d8
+
+// CHECK: ushl d17, d31, d8 // encoding: [0xf1,0x47,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/noneon-diagnostics.s b/test/MC/AArch64/noneon-diagnostics.s
new file mode 100644
index 00000000000..ea786c0ba67
--- /dev/null
+++ b/test/MC/AArch64/noneon-diagnostics.s
@@ -0,0 +1,28 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=-neon < %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
+
+ fmla v3.4s, v12.4s, v17.4s
+ fmla v1.2d, v30.2d, v20.2d
+ fmla v9.2s, v9.2s, v0.2s
+// CHECK-ERROR: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmla v3.4s, v12.4s, v17.4s
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmla v1.2d, v30.2d, v20.2d
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmla v9.2s, v9.2s, v0.2s
+// CHECK-ERROR-NEXT: ^
+
+ fmls v3.4s, v12.4s, v17.4s
+ fmls v1.2d, v30.2d, v20.2d
+ fmls v9.2s, v9.2s, v0.2s
+// CHECK-ERROR: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmls v3.4s, v12.4s, v17.4s
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmls v1.2d, v30.2d, v20.2d
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmls v9.2s, v9.2s, v0.2s
+// CHECK-ERROR-NEXT: ^
diff --git a/test/MC/Disassembler/AArch64/neon-instructions.txt b/test/MC/Disassembler/AArch64/neon-instructions.txt
new file mode 100644
index 00000000000..40d1f4c66f8
--- /dev/null
+++ b/test/MC/Disassembler/AArch64/neon-instructions.txt
@@ -0,0 +1,673 @@
+# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -disassemble < %s | FileCheck %s
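+
+# Each group of four bytes below is one little-endian instruction word; the
+# disassembly is matched against the CHECK lines that precede the bytes.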
+
+#------------------------------------------------------------------------------
+# Vector Integer Add/Sub
+#------------------------------------------------------------------------------
+# CHECK: add v31.8b, v31.8b, v31.8b
+# CHECK: sub v0.2d, v0.2d, v0.2d
+0xff 0x87 0x3f 0x0e
+0x00 0x84 0xe0 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Add/Sub
+#------------------------------------------------------------------------------
+
+# CHECK: fadd v0.4s, v0.4s, v0.4s
+# CHECK: fsub v31.2s, v31.2s, v31.2s
+0x00 0xd4 0x20 0x4e
+0xff 0xd7 0xbf 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Integer Mul
+#------------------------------------------------------------------------------
+# CHECK: mul v0.8b, v1.8b, v2.8b
+0x20 0x9c 0x22 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Mul/Div
+#------------------------------------------------------------------------------
+# CHECK: fmul v0.2s, v1.2s, v2.2s
+# CHECK: fdiv v31.2s, v31.2s, v31.2s
+0x20 0xdc 0x22 0x2e
+0xff 0xff 0x3f 0x2e
+
+#----------------------------------------------------------------------
+# Vector Polynomial Multiply
+#----------------------------------------------------------------------
+# CHECK: pmul v0.8b, v15.8b, v16.8b
+# CHECK: pmul v31.16b, v7.16b, v8.16b
+0xe0 0x9d 0x30 0x2e
+0xff 0x9c 0x28 0x6e
+
+#------------------------------------------------------------------------------
+# Vector And, Orr, Eor, Orn, Bic
+#------------------------------------------------------------------------------
+# CHECK: and v2.8b, v2.8b, v2.8b
+# CHECK: orr v31.16b, v31.16b, v30.16b
+# CHECK: eor v0.16b, v1.16b, v2.16b
+# CHECK: orn v9.16b, v10.16b, v11.16b
+# CHECK: bic v31.8b, v30.8b, v29.8b
+0x42 0x1c 0x22 0x0e
+0xff 0x1f 0xbe 0x4e
+0x20 0x1c 0x22 0x6e
+0x49 0x1d 0xeb 0x4e
+0xdf 0x1f 0x7d 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Bsl, Bit, Bif
+#------------------------------------------------------------------------------
+# CHECK: bsl v0.8b, v1.8b, v2.8b
+# CHECK: bit v31.16b, v31.16b, v31.16b
+# CHECK: bif v0.16b, v1.16b, v2.16b
+0x20 0x1c 0x62 0x2e
+0xff 0x1f 0xbf 0x6e
+0x20 0x1c 0xe2 0x6e
+
+
+#------------------------------------------------------------------------------
+# Vector Integer Multiply-accumulate and Multiply-subtract
+#------------------------------------------------------------------------------
+# CHECK: mla v0.8b, v1.8b, v2.8b
+# CHECK: mls v31.4h, v31.4h, v31.4h
+0x20 0x94 0x22 0x0e
+0xff 0x97 0x7f 0x2e
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Multiply-accumulate and Multiply-subtract
+#------------------------------------------------------------------------------
+# CHECK: fmla v0.2s, v1.2s, v2.2s
+# CHECK: fmls v31.2s, v31.2s, v31.2s
+0x20 0xcc 0x22 0x0e
+0xff 0xcf 0xbf 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Move Immediate Shifted
+# Vector Move Inverted Immediate Shifted
+# Vector Bitwise Bit Clear (AND NOT) - immediate
+# Vector Bitwise OR - immediate
+#------------------------------------------------------------------------------
+# CHECK: movi v31.4s, #0xff, lsl #24
+# CHECK: mvni v0.2s, #0x0
+# CHECK: bic v15.4h, #0xf, lsl #8
+# CHECK: orr v16.8h, #0x1f
+0xff 0x67 0x07 0x4f
+0x00 0x04 0x00 0x2f
+0xef 0xb5 0x00 0x2f
+0xf0 0x97 0x00 0x4f
+
+#------------------------------------------------------------------------------
+# Vector Move Immediate Masked
+# Vector Move Inverted Immediate Masked
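+# (the msl variants shift the immediate left and fill the vacated low bits
+#  with ones, so #0x8, msl #8 produces 0x8ff in each lane)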
+#------------------------------------------------------------------------------ +# CHECK: movi v8.2s, #0x8, msl #8 +# CHECK: mvni v16.4s, #0x10, msl #16 +0x08 0xc5 0x00 0x0f +0x10 0xd6 0x00 0x6f + +#------------------------------------------------------------------------------ +# Vector Immediate - per byte +# Vector Move Immediate - bytemask, per doubleword +# Vector Move Immediate - bytemask, one doubleword +#------------------------------------------------------------------------------ +# CHECK: movi v16.8b, #0xff +# CHECK: movi v31.16b, #0x1f +# CHECK: movi d15, #0xff00ff00ff00ff +# CHECK: movi v31.2d, #0xff0000ff0000ffff +0xf0 0xe7 0x07 0x0f +0xff 0xe7 0x00 0x4f +0xaf 0xe6 0x02 0x2f +0x7f 0xe6 0x04 0x6f + +#------------------------------------------------------------------------------ +# Vector Floating Point Move Immediate +#------------------------------------------------------------------------------ +# CHECK: fmov v0.2s, #13.0 +# CHECK: fmov v15.4s, #1.0 +# CHECK: fmov v31.2d, #-1.25 +0x40 0xf5 0x01 0x0f +0x0f 0xf6 0x03 0x4f +0x9f 0xf6 0x07 0x6f + +#------------------------------------------------------------------------------ +# Vector Move - register +#------------------------------------------------------------------------------ +# CHECK: mov v1.16b, v15.16b +# CHECK: mov v25.8b, v4.8b +0xe1 0x1d 0xaf 0x4e +0x99 0x1c 0xa4 0x0e + +#---------------------------------------------------------------------- +# Vector Absolute Difference and Accumulate (Signed, Unsigned) +# Vector Absolute Difference (Signed, Unsigned) +# Vector Absolute Difference (Floating Point) +#---------------------------------------------------------------------- + +# CHECK: uaba v0.8b, v1.8b, v2.8b +# CHECK: saba v31.16b, v30.16b, v29.16b +# CHECK: uabd v15.4h, v16.4h, v17.4h +# CHECK: sabd v5.4h, v4.4h, v6.4h +# CHECK: fabd v1.4s, v31.4s, v16.4s +0x20 0x7c 0x22 0x2e +0xdf 0x7f 0x3d 0x4e +0x0f 0x76 0x71 0x2e +0x85 0x74 0x66 0x0e +0xe1 0xd7 0xb0 0x6e + +#---------------------------------------------------------------------- +# Scalar Integer Add +# Scalar Integer Sub +#---------------------------------------------------------------------- + +# CHECK: add d17, d31, d29 +# CHECK: sub d15, d5, d16 +0xf1 0x87 0xfd 0x5e +0xaf 0x84 0xf0 0x7e + +#---------------------------------------------------------------------- +# Vector Reciprocal Square Root Step (Floating Point) +#---------------------------------------------------------------------- +# CHECK: frsqrts v31.2d, v15.2d, v8.2d +0xff 0xfd 0xe8 0x4e + +#---------------------------------------------------------------------- +# Vector Reciprocal Step (Floating Point) +#---------------------------------------------------------------------- +# CHECK: frecps v5.4s, v7.4s, v16.4s +0xe5 0xfc 0x30 0x4e + +#---------------------------------------------------------------------- +# Vector Absolute Compare Mask Less Than Or Equal (Floating Point) +#---------------------------------------------------------------------- +# CHECK: facge v0.4s, v31.4s, v16.4s +0xe0 0xef 0x30 0x6e + +#---------------------------------------------------------------------- +# Vector Absolute Compare Mask Less Than (Floating Point) +#---------------------------------------------------------------------- +# CHECK: facgt v31.2d, v29.2d, v28.2d +0xbf 0xef 0xfc 0x6e + +#---------------------------------------------------------------------- +# Vector Compare Mask Equal (Integer) +#---------------------------------------------------------------------- +# CHECK: cmeq v5.16b, v15.16b, v31.16b +0xe5 0x8d 0x3f 
0x6e + +#---------------------------------------------------------------------- +# Vector Compare Mask Higher or Same (Unsigned Integer) +#---------------------------------------------------------------------- +# CHECK: cmhs v1.8b, v16.8b, v30.8b +0x01 0x3e 0x3e 0x2e + +#---------------------------------------------------------------------- +# Vector Compare Mask Greater Than or Equal (Integer) +#---------------------------------------------------------------------- +# CHECK: cmge v20.4h, v11.4h, v23.4h +0x74 0x3d 0x77 0x0e + +#---------------------------------------------------------------------- +# Vector Compare Mask Higher (Unsigned Integer) +# CHECK: cmhi v13.8h, v3.8h, v27.8h +0x6d 0x34 0x7b 0x6e + +#---------------------------------------------------------------------- +# Vector Compare Mask Greater Than (Integer) +#---------------------------------------------------------------------- +# CHECK: cmgt v9.4s, v4.4s, v28.4s +0x89 0x34 0xbc 0x4e + +#---------------------------------------------------------------------- +# Vector Compare Mask Bitwise Test (Integer) +#---------------------------------------------------------------------- +# CHECK: cmtst v21.2s, v19.2s, v18.2s +0x75 0x8e 0xb2 0x0e + +#---------------------------------------------------------------------- +# Vector Compare Mask Equal (Floating Point) +#---------------------------------------------------------------------- +# CHECK: fcmeq v0.2s, v15.2s, v16.2s +0xe0 0xe5 0x30 0x0e + +#---------------------------------------------------------------------- +# Vector Compare Mask Greater Than Or Equal (Floating Point) +#---------------------------------------------------------------------- +# CHECK: fcmge v31.4s, v7.4s, v29.4s +0xff 0xe4 0x3d 0x6e + +#---------------------------------------------------------------------- +# Vector Compare Mask Greater Than (Floating Point) +#---------------------------------------------------------------------- +# CHECK: fcmgt v17.4s, v8.4s, v25.4s +0x11 0xe5 0xb9 0x6e + +#---------------------------------------------------------------------- +# Vector Compare Mask Equal to Zero (Integer) +#---------------------------------------------------------------------- +# CHECK: cmeq v31.16b, v15.16b, #0x0 +0xff 0x99 0x20 0x4e + +#---------------------------------------------------------------------- +# Vector Compare Mask Greater Than or Equal to Zero (Signed Integer) +#---------------------------------------------------------------------- +# CHECK: cmge v3.8b, v15.8b, #0x0 +0xe3 0x89 0x20 0x2e + +#---------------------------------------------------------------------- +# Vector Compare Mask Greater Than Zero (Signed Integer) +#---------------------------------------------------------------------- +# CHECK: cmgt v22.2s, v9.2s, #0x0 +0x36 0x89 0xa0 0x0e + +#---------------------------------------------------------------------- +# Vector Compare Mask Less Than or Equal To Zero (Signed Integer) +#---------------------------------------------------------------------- +# CHECK: cmle v5.2d, v14.2d, #0x0 +0xc5 0x99 0xe0 0x6e + +#---------------------------------------------------------------------- +# Vector Compare Mask Less Than Zero (Signed Integer) +#---------------------------------------------------------------------- +# CHECK: cmlt v13.8h, v11.8h, #0x0 +0x6d 0xa9 0x60 0x4e + +#---------------------------------------------------------------------- +# Vector Compare Mask Equal to Zero (Floating Point) +#---------------------------------------------------------------------- +# CHECK: fcmeq v15.2s, v21.2s, 
+0xaf 0xda 0xa0 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmge v14.2d, v13.2d, #0.0
+0xae 0xc9 0xe0 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmgt v9.4s, v23.4s, #0.0
+0xe9 0xca 0xa0 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than or Equal to Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmle v11.2d, v6.2d, #0.0
+0xcb 0xd8 0xe0 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmlt v12.4s, v25.4s, #0.0
+0x2c 0xeb 0xa0 0x4e
+
+
+#------------------------------------------------------------------------------
+# Vector Integer Halving Add (Signed)
+# Vector Integer Halving Add (Unsigned)
+# Vector Integer Halving Sub (Signed)
+# Vector Integer Halving Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: shadd v0.8b, v31.8b, v29.8b
+# CHECK: uhadd v15.16b, v16.16b, v17.16b
+# CHECK: shsub v0.4h, v1.4h, v2.4h
+# CHECK: uhadd v5.8h, v7.8h, v8.8h
+# CHECK: shsub v9.2s, v11.2s, v21.2s
+# CHECK: uhsub v22.4s, v30.4s, v19.4s
+0xe0 0x07 0x3d 0x0e
+0x0f 0x06 0x31 0x6e
+0x20 0x24 0x62 0x0e
+0xe5 0x04 0x68 0x6e
+0x69 0x25 0xb5 0x0e
+0xd6 0x27 0xb3 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Integer Rounding Halving Add (Signed)
+# Vector Integer Rounding Halving Add (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: srhadd v3.8b, v5.8b, v7.8b
+# CHECK: urhadd v7.16b, v17.16b, v27.16b
+# CHECK: srhadd v10.4h, v11.4h, v13.4h
+# CHECK: urhadd v1.8h, v2.8h, v3.8h
+# CHECK: srhadd v4.2s, v5.2s, v6.2s
+# CHECK: urhadd v7.4s, v7.4s, v7.4s
+0xa3 0x14 0x27 0x0e
+0x27 0x16 0x3b 0x6e
+0x6a 0x15 0x6d 0x0e
+0x41 0x14 0x63 0x6e
+0xa4 0x14 0xa6 0x0e
+0xe7 0x14 0xa7 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Integer Saturating Add (Signed)
+# Vector Integer Saturating Add (Unsigned)
+# Vector Integer Saturating Sub (Signed)
+# Vector Integer Saturating Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: sqsub v0.8b, v1.8b, v2.8b
+# CHECK: sqadd v0.16b, v1.16b, v2.16b
+# CHECK: uqsub v0.4h, v1.4h, v2.4h
+# CHECK: uqadd v0.8h, v1.8h, v2.8h
+# CHECK: sqadd v0.2s, v1.2s, v2.2s
+# CHECK: sqsub v0.4s, v1.4s, v2.4s
+# CHECK: sqsub v0.2d, v1.2d, v2.2d
+0x20 0x2c 0x22 0x0e
+0x20 0x0c 0x22 0x4e
+0x20 0x2c 0x62 0x2e
+0x20 0x0c 0x62 0x6e
+0x20 0x0c 0xa2 0x0e
+0x20 0x2c 0xa2 0x4e
+0x20 0x2c 0xe2 0x4e
+
+#------------------------------------------------------------------------------
+# Scalar Integer Saturating Add (Signed)
+# Scalar Integer Saturating Add (Unsigned)
+# Scalar Integer Saturating Sub (Signed)
+# Scalar Integer Saturating Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: sqadd b20, b11, b15
+# CHECK: uqadd h0, h1, h5
+# CHECK: sqsub s20, s10, s7
+# CHECK: uqsub d16, d16, d16
+0x74 0x0d 0x2f 0x5e
+0x20 0x0c 0x65 0x7e
+0x54 0x2d 0xa7 0x5e
+0x10 0x2e 0xf0 0x7e
+
+
+#----------------------------------------------------------------------
+# Vector Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sshl v10.8b, v15.8b, v22.8b
+# CHECK: ushl v10.16b, v5.16b, v2.16b
+# CHECK: sshl v10.4h, v15.4h, v22.4h
+# CHECK: ushl v10.8h, v5.8h, v2.8h
+# CHECK: sshl v10.2s, v15.2s, v22.2s
+# CHECK: ushl v10.4s, v5.4s, v2.4s
+# CHECK: sshl v0.2d, v1.2d, v2.2d
+0xea 0x45 0x36 0x0e
+0xaa 0x44 0x22 0x6e
+0xea 0x45 0x76 0x0e
+0xaa 0x44 0x62 0x6e
+0xea 0x45 0xb6 0x0e
+0xaa 0x44 0xa2 0x6e
+0x20 0x44 0xe2 0x4e
+
+#----------------------------------------------------------------------
+# Vector Saturating Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sqshl v1.8b, v15.8b, v22.8b
+# CHECK: uqshl v2.16b, v14.16b, v23.16b
+# CHECK: sqshl v3.4h, v13.4h, v24.4h
+# CHECK: uqshl v4.8h, v12.8h, v25.8h
+# CHECK: sqshl v5.2s, v11.2s, v26.2s
+# CHECK: uqshl v6.4s, v10.4s, v27.4s
+# CHECK: uqshl v0.2d, v1.2d, v2.2d
+0xe1 0x4d 0x36 0x0e
+0xc2 0x4d 0x37 0x6e
+0xa3 0x4d 0x78 0x0e
+0x84 0x4d 0x79 0x6e
+0x65 0x4d 0xba 0x0e
+0x46 0x4d 0xbb 0x6e
+0x20 0x4c 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Vector Rounding Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: srshl v10.8b, v5.8b, v22.8b
+# CHECK: urshl v10.16b, v5.16b, v2.16b
+# CHECK: srshl v1.4h, v5.4h, v31.4h
+# CHECK: urshl v1.8h, v5.8h, v2.8h
+# CHECK: srshl v10.2s, v15.2s, v2.2s
+# CHECK: urshl v1.4s, v5.4s, v2.4s
+# CHECK: urshl v0.2d, v1.2d, v2.2d
+0xaa 0x54 0x36 0x0e
+0xaa 0x54 0x22 0x6e
+0xa1 0x54 0x7f 0x0e
+0xa1 0x54 0x62 0x6e
+0xea 0x55 0xa2 0x0e
+0xa1 0x54 0xa2 0x6e
+0x20 0x54 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Vector Saturating Rounding Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sqrshl v1.8b, v15.8b, v22.8b
+# CHECK: uqrshl v2.16b, v14.16b, v23.16b
+# CHECK: sqrshl v3.4h, v13.4h, v24.4h
+# CHECK: uqrshl v4.8h, v12.8h, v25.8h
+# CHECK: sqrshl v5.2s, v11.2s, v26.2s
+# CHECK: uqrshl v6.4s, v10.4s, v27.4s
+# CHECK: uqrshl v0.2d, v1.2d, v2.2d
+0xe1 0x5d 0x36 0x0e
+0xc2 0x5d 0x37 0x6e
+0xa3 0x5d 0x78 0x0e
+0x84 0x5d 0x79 0x6e
+0x65 0x5d 0xba 0x0e
+0x46 0x5d 0xbb 0x6e
+0x20 0x5c 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Scalar Integer Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sshl d31, d31, d31
+# CHECK: ushl d0, d0, d0
+0xff 0x47 0xff 0x5e
+0x00 0x44 0xe0 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Saturating Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sqshl d31, d31, d31
+# CHECK: uqshl s23, s20, s16
+# CHECK: sqshl h3, h4, h15
+# CHECK: uqshl b11, b20, b30
+0xff 0x4f 0xff 0x5e
+0x97 0x4e 0xb0 0x7e
+0x83 0x4c 0x6f 0x5e
+0x8b 0x4e 0x3e 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Rounding Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: srshl d16, d16, d16
+# CHECK: urshl d8, d7, d4
+0x10 0x56 0xf0 0x5e
+0xe8 0x54 0xe4 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sqrshl d31, d31, d31
+# CHECK: uqrshl s23, s20, s16
+# CHECK: sqrshl h3, h4, h15
+# CHECK: uqrshl b11, b20, b30
+0xff 0x5f 0xff 0x5e
+0x97 0x5e 0xb0 0x7e
+0x83 0x5c 0x6f 0x5e
+0x8b 0x5e 0x3e 0x7e
+
+#----------------------------------------------------------------------
+# Vector Maximum (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: smax v1.8b, v15.8b, v22.8b
+# CHECK: umax v2.16b, v14.16b, v23.16b
+# CHECK: smax v3.4h, v13.4h, v24.4h
+# CHECK: umax v4.8h, v12.8h, v25.8h
+# CHECK: smax v5.2s, v11.2s, v26.2s
+# CHECK: umax v6.4s, v10.4s, v27.4s
+0xe1 0x65 0x36 0x0e
+0xc2 0x65 0x37 0x6e
+0xa3 0x65 0x78 0x0e
+0x84 0x65 0x79 0x6e
+0x65 0x65 0xba 0x0e
+0x46 0x65 0xbb 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: umin v1.8b, v15.8b, v22.8b
+# CHECK: smin v2.16b, v14.16b, v23.16b
+# CHECK: umin v3.4h, v13.4h, v24.4h
+# CHECK: smin v4.8h, v12.8h, v25.8h
+# CHECK: umin v5.2s, v11.2s, v26.2s
+# CHECK: smin v6.4s, v10.4s, v27.4s
+0xe1 0x6d 0x36 0x2e
+0xc2 0x6d 0x37 0x4e
+0xa3 0x6d 0x78 0x2e
+0x84 0x6d 0x79 0x4e
+0x65 0x6d 0xba 0x2e
+0x46 0x6d 0xbb 0x4e
+
+#----------------------------------------------------------------------
+# Vector Maximum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmax v29.2s, v28.2s, v25.2s
+# CHECK: fmax v9.4s, v8.4s, v5.4s
+# CHECK: fmax v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0x39 0x0e
+0x09 0xf5 0x25 0x4e
+0x4b 0xf5 0x67 0x4e
+
+#----------------------------------------------------------------------
+# Vector Minimum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmin v29.2s, v28.2s, v25.2s
+# CHECK: fmin v9.4s, v8.4s, v5.4s
+# CHECK: fmin v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0xb9 0x0e
+0x09 0xf5 0xa5 0x4e
+0x4b 0xf5 0xe7 0x4e
+
+#----------------------------------------------------------------------
+# Vector maxNum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxnm v9.2s, v8.2s, v5.2s
+# CHECK: fmaxnm v9.4s, v8.4s, v5.4s
+# CHECK: fmaxnm v11.2d, v10.2d, v7.2d
+0x09 0xc5 0x25 0x0e
+0x09 0xc5 0x25 0x4e
+0x4b 0xc5 0x67 0x4e
+
+#----------------------------------------------------------------------
+# Vector minNum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminnm v2.2s, v8.2s, v25.2s
+# CHECK: fminnm v9.4s, v8.4s, v5.4s
+# CHECK: fminnm v11.2d, v10.2d, v7.2d
+0x02 0xc5 0xb9 0x0e
+0x09 0xc5 0xa5 0x4e
+0x4b 0xc5 0xe7 0x4e
+
+
+#----------------------------------------------------------------------
+# Vector Maximum Pairwise (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: smaxp v1.8b, v15.8b, v22.8b
+# CHECK: umaxp v2.16b, v14.16b, v23.16b
+# CHECK: smaxp v3.4h, v13.4h, v24.4h
+# CHECK: umaxp v4.8h, v12.8h, v25.8h
+# CHECK: smaxp v5.2s, v11.2s, v26.2s
+# CHECK: umaxp v6.4s, v10.4s, v27.4s
+0xe1 0xa5 0x36 0x0e
+0xc2 0xa5 0x37 0x6e
+0xa3 0xa5 0x78 0x0e
+0x84 0xa5 0x79 0x6e
+0x65 0xa5 0xba 0x0e
+0x46 0xa5 0xbb 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum Pairwise (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: uminp v1.8b, v15.8b, v22.8b
+# CHECK: sminp v2.16b, v14.16b, v23.16b
+# CHECK: uminp v3.4h, v13.4h, v24.4h
+# CHECK: sminp v4.8h, v12.8h, v25.8h
+# CHECK: uminp v5.2s, v11.2s, v26.2s
+# CHECK: sminp v6.4s, v10.4s, v27.4s
+0xe1 0xad 0x36 0x2e
+0xc2 0xad 0x37 0x4e
+0xa3 0xad 0x78 0x2e
+0x84 0xad 0x79 0x4e
+0x65 0xad 0xba 0x2e
+0x46 0xad 0xbb 0x4e
+
+#----------------------------------------------------------------------
+# Vector Maximum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxp v29.2s, v28.2s, v25.2s
+# CHECK: fmaxp v9.4s, v8.4s, v5.4s
+# CHECK: fmaxp v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0x39 0x2e
+0x09 0xf5 0x25 0x6e
+0x4b 0xf5 0x67 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminp v29.2s, v28.2s, v25.2s
+# CHECK: fminp v9.4s, v8.4s, v5.4s
+# CHECK: fminp v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0xb9 0x2e
+0x09 0xf5 0xa5 0x6e
+0x4b 0xf5 0xe7 0x6e
+
+#----------------------------------------------------------------------
+# Vector maxNum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxnmp v9.2s, v8.2s, v5.2s
+# CHECK: fmaxnmp v9.4s, v8.4s, v5.4s
+# CHECK: fmaxnmp v11.2d, v10.2d, v7.2d
+0x09 0xc5 0x25 0x2e
+0x09 0xc5 0x25 0x6e
+0x4b 0xc5 0x67 0x6e
+
+#----------------------------------------------------------------------
+# Vector minNum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminnmp v2.2s, v8.2s, v25.2s
+# CHECK: fminnmp v9.4s, v8.4s, v5.4s
+# CHECK: fminnmp v11.2d, v10.2d, v7.2d
+0x02 0xc5 0xb9 0x2e
+0x09 0xc5 0xa5 0x6e
+0x4b 0xc5 0xe7 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Add Pairwise (Integer)
+#------------------------------------------------------------------------------
+# CHECK: addp v31.8b, v31.8b, v31.8b
+# CHECK: addp v0.2d, v0.2d, v0.2d
+0xff 0xbf 0x3f 0x0e
+0x00 0xbc 0xe0 0x4e
+
+#------------------------------------------------------------------------------
+# Vector Add Pairwise (Floating Point)
+#------------------------------------------------------------------------------
+# CHECK: faddp v0.4s, v0.4s, v0.4s
+# CHECK: faddp v31.2s, v31.2s, v31.2s
+0x00 0xd4 0x20 0x6e
+0xff 0xd7 0x3f 0x2e
+
+
+#------------------------------------------------------------------------------
+# Vector Saturating Doubling Multiply High
+# Vector Saturating Rounding Doubling Multiply High
+#------------------------------------------------------------------------------
+# CHECK: sqdmulh v31.2s, v31.2s, v31.2s
+# CHECK: sqdmulh v5.4s, v7.4s, v9.4s
+# CHECK: sqrdmulh v31.4h, v3.4h, v13.4h
+# CHECK: sqrdmulh v0.8h, v10.8h, v20.8h
+0xff 0xb7 0xbf 0x0e
+0xe5 0xb4 0xa9 0x4e
+0x7f 0xb4 0x6d 0x2e
+0x40 0xb5 0x74 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Multiply Extended
+#------------------------------------------------------------------------------
+# CHECK: fmulx v1.2s, v22.2s, v2.2s
+# CHECK: fmulx v21.4s, v15.4s, v3.4s
+# CHECK: fmulx v11.2d, v5.2d, v23.2d
+0xc1 0xde 0x22 0x0e
+0xf5 0xdd 0x23 0x4e
+0xab 0xdc 0x77 0x4e
+
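
For reviewers spot-checking the byte groups above: each group of four hex bytes
is one 32-bit instruction in the little-endian order it appears in the file,
and most of the vector entries use the AdvSIMD "three same" layout (Q at bit
30, U at bit 29, size at bits 23-22, Rm at bits 20-16, opcode at bits 15-11,
Rn at bits 9-5, Rd at bits 4-0). Below is a minimal standalone sketch of that
field split, assuming the layout from the ARMv8 reference manual; the program
and its names are illustrative and not part of the patch.

  // Sketch only: decode the fixed fields of an AdvSIMD "three same" encoding.
  #include <cstdint>
  #include <cstdio>

  int main() {
    // One byte group from the test above, in file (little-endian) order;
    // it is expected to disassemble as "cmtst v21.2s, v19.2s, v18.2s".
    const uint8_t B[4] = {0x75, 0x8e, 0xb2, 0x0e};
    uint32_t Insn = B[0] | (B[1] << 8) | (B[2] << 16) |
                    ((uint32_t)B[3] << 24);

    unsigned Q      = (Insn >> 30) & 0x1;  // 0 = 64-bit, 1 = 128-bit vector
    unsigned U      = (Insn >> 29) & 0x1;  // signed/unsigned (or FP) selector
    unsigned Size   = (Insn >> 22) & 0x3;  // element size field
    unsigned Rm     = (Insn >> 16) & 0x1f;
    unsigned Opcode = (Insn >> 11) & 0x1f;
    unsigned Rn     = (Insn >>  5) & 0x1f;
    unsigned Rd     = Insn & 0x1f;

    std::printf("Q=%u U=%u size=%u opcode=0x%02x vd=%u vn=%u vm=%u\n",
                Q, U, Size, Opcode, Rd, Rn, Rm);
    return 0;
  }

On the cmtst bytes this prints Q=0 U=0 size=2 opcode=0x11 vd=21 vn=19 vm=18,
matching the registers and .2s arrangement in the CHECK line above.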