setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+ // NEON does not have single instruction CTTZ for vectors.
+ setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+
// NEON only has FMA instructions as of VFP4.
if (!Subtarget->hasVFP4()) {
setOperationAction(ISD::FMA, MVT::v2f32, Expand);
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- EVT VT = N->getValueType(0);
SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ if (VT.isVector()) {
+ assert(ST->hasNEON());
+
+ // Compute the least significant set bit: LSB = X & -X
+ SDValue X = N->getOperand(0);
+ SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
+ SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
+
+ EVT ElemTy = VT.getVectorElementType();
+
+ if (ElemTy == MVT::i8) {
+ // Compute with: cttz(x) = ctpop(lsb - 1)
+ SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(1, dl, ElemTy));
+ SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
+ return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
+ }
+
+ if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
+ (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
+ // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
+ unsigned NumBits = ElemTy.getSizeInBits();
+ SDValue WidthMinus1 =
+ DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
+ SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
+ return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
+ }
+
+ // Compute with: cttz(x) = ctpop(lsb - 1)
+
+ // Since we can only compute the number of bits in a byte with vcnt.8, we
+ // have to gather the result with pairwise addition (vpaddl) for i16, i32,
+ // and i64.
+
+ // Compute LSB - 1.
+ SDValue Bits;
+ if (ElemTy == MVT::i64) {
+ // Load constant 0xffff'ffff'ffff'ffff to register.
+ SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(0x1eff, dl, MVT::i32));
+ Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
+ } else {
+ SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(1, dl, ElemTy));
+ Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
+ }
+
+ // Count #bits with vcnt.8.
+ EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+ SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
+ SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
+
+ // Gather the #bits with vpaddl (pairwise add.)
+ EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+ SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
+ DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+ Cnt8);
+ if (ElemTy == MVT::i16)
+ return Cnt16;
+
+ EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
+ SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
+ DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+ Cnt16);
+ if (ElemTy == MVT::i32)
+ return Cnt32;
+
+ assert(ElemTy == MVT::i64);
+ SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+ Cnt32);
+ return Cnt64;
+ }
if (!ST->hasV6T2Ops())
return SDValue();
case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
- case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
--- /dev/null
+; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s
+
+; This test checks the @llvm.cttz.* intrinsics for vectors.
+
+declare <1 x i8> @llvm.cttz.v1i8(<1 x i8>, i1)
+declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
+declare <4 x i8> @llvm.cttz.v4i8(<4 x i8>, i1)
+declare <8 x i8> @llvm.cttz.v8i8(<8 x i8>, i1)
+declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)
+
+declare <1 x i16> @llvm.cttz.v1i16(<1 x i16>, i1)
+declare <2 x i16> @llvm.cttz.v2i16(<2 x i16>, i1)
+declare <4 x i16> @llvm.cttz.v4i16(<4 x i16>, i1)
+declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
+
+declare <1 x i32> @llvm.cttz.v1i32(<1 x i32>, i1)
+declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1)
+declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
+
+declare <1 x i64> @llvm.cttz.v1i64(<1 x i64>, i1)
+declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
+
+;------------------------------------------------------------------------------
+
+define void @test_v1i8(<1 x i8>* %p) {
+; CHECK-LABEL: test_v1i8
+ %a = load <1 x i8>, <1 x i8>* %p
+ %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false)
+ store <1 x i8> %tmp, <1 x i8>* %p
+ ret void
+}
+
+define void @test_v2i8(<2 x i8>* %p) {
+; CHECK-LABEL: test_v2i8:
+ %a = load <2 x i8>, <2 x i8>* %p
+ %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false)
+ store <2 x i8> %tmp, <2 x i8>* %p
+ ret void
+}
+
+define void @test_v4i8(<4 x i8>* %p) {
+; CHECK-LABEL: test_v4i8:
+ %a = load <4 x i8>, <4 x i8>* %p
+ %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false)
+ store <4 x i8> %tmp, <4 x i8>* %p
+ ret void
+}
+
+define void @test_v8i8(<8 x i8>* %p) {
+; CHECK-LABEL: test_v8i8:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <8 x i8>, <8 x i8>* %p
+ %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false)
+ store <8 x i8> %tmp, <8 x i8>* %p
+ ret void
+}
+
+define void @test_v16i8(<16 x i8>* %p) {
+; CHECK-LABEL: test_v16i8:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <16 x i8>, <16 x i8>* %p
+ %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
+ store <16 x i8> %tmp, <16 x i8>* %p
+ ret void
+}
+
+define void @test_v1i16(<1 x i16>* %p) {
+; CHECK-LABEL: test_v1i16:
+ %a = load <1 x i16>, <1 x i16>* %p
+ %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false)
+ store <1 x i16> %tmp, <1 x i16>* %p
+ ret void
+}
+
+define void @test_v2i16(<2 x i16>* %p) {
+; CHECK-LABEL: test_v2i16:
+ %a = load <2 x i16>, <2 x i16>* %p
+ %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false)
+ store <2 x i16> %tmp, <2 x i16>* %p
+ ret void
+}
+
+define void @test_v4i16(<4 x i16>* %p) {
+; CHECK-LABEL: test_v4i16:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i16 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s16 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i16 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <4 x i16>, <4 x i16>* %p
+ %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false)
+ store <4 x i16> %tmp, <4 x i16>* %p
+ ret void
+}
+
+define void @test_v8i16(<8 x i16>* %p) {
+; CHECK-LABEL: test_v8i16:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i16 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s16 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i16 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <8 x i16>, <8 x i16>* %p
+ %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
+ store <8 x i16> %tmp, <8 x i16>* %p
+ ret void
+}
+
+define void @test_v1i32(<1 x i32>* %p) {
+; CHECK-LABEL: test_v1i32:
+ %a = load <1 x i32>, <1 x i32>* %p
+ %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false)
+ store <1 x i32> %tmp, <1 x i32>* %p
+ ret void
+}
+
+define void @test_v2i32(<2 x i32>* %p) {
+; CHECK-LABEL: test_v2i32:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s32 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i32 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <2 x i32>, <2 x i32>* %p
+ %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
+ store <2 x i32> %tmp, <2 x i32>* %p
+ ret void
+}
+
+define void @test_v4i32(<4 x i32>* %p) {
+; CHECK-LABEL: test_v4i32:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s32 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i32 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <4 x i32>, <4 x i32>* %p
+ %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
+ store <4 x i32> %tmp, <4 x i32>* %p
+ ret void
+}
+
+define void @test_v1i64(<1 x i64>* %p) {
+; CHECK-LABEL: test_v1i64:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0
+; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vadd.i64 [[D1]], [[D1]], [[D3]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vpaddl.u32 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <1 x i64>, <1 x i64>* %p
+ %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false)
+ store <1 x i64> %tmp, <1 x i64>* %p
+ ret void
+}
+
+define void @test_v2i64(<2 x i64>* %p) {
+; CHECK-LABEL: test_v2i64:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0
+; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vadd.i64 [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u32 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <2 x i64>, <2 x i64>* %p
+ %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
+ store <2 x i64> %tmp, <2 x i64>* %p
+ ret void
+}
+
+;------------------------------------------------------------------------------
+
+define void @test_v1i8_zero_undef(<1 x i8>* %p) {
+; CHECK-LABEL: test_v1i8_zero_undef
+ %a = load <1 x i8>, <1 x i8>* %p
+ %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true)
+ store <1 x i8> %tmp, <1 x i8>* %p
+ ret void
+}
+
+define void @test_v2i8_zero_undef(<2 x i8>* %p) {
+; CHECK-LABEL: test_v2i8_zero_undef:
+ %a = load <2 x i8>, <2 x i8>* %p
+ %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
+ store <2 x i8> %tmp, <2 x i8>* %p
+ ret void
+}
+
+define void @test_v4i8_zero_undef(<4 x i8>* %p) {
+; CHECK-LABEL: test_v4i8_zero_undef:
+ %a = load <4 x i8>, <4 x i8>* %p
+ %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
+ store <4 x i8> %tmp, <4 x i8>* %p
+ ret void
+}
+
+define void @test_v8i8_zero_undef(<8 x i8>* %p) {
+; CHECK-LABEL: test_v8i8_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <8 x i8>, <8 x i8>* %p
+ %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
+ store <8 x i8> %tmp, <8 x i8>* %p
+ ret void
+}
+
+define void @test_v16i8_zero_undef(<16 x i8>* %p) {
+; CHECK-LABEL: test_v16i8_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <16 x i8>, <16 x i8>* %p
+ %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+ store <16 x i8> %tmp, <16 x i8>* %p
+ ret void
+}
+
+define void @test_v1i16_zero_undef(<1 x i16>* %p) {
+; CHECK-LABEL: test_v1i16_zero_undef:
+ %a = load <1 x i16>, <1 x i16>* %p
+ %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true)
+ store <1 x i16> %tmp, <1 x i16>* %p
+ ret void
+}
+
+define void @test_v2i16_zero_undef(<2 x i16>* %p) {
+; CHECK-LABEL: test_v2i16_zero_undef:
+ %a = load <2 x i16>, <2 x i16>* %p
+ %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
+ store <2 x i16> %tmp, <2 x i16>* %p
+ ret void
+}
+
+define void @test_v4i16_zero_undef(<4 x i16>* %p) {
+; CHECK-LABEL: test_v4i16_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vneg.s16 [[D2:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vmov.i16 [[D3:d[0-9]+]], #0xf
+; CHECK: vclz.i16 [[D1]], [[D1]]
+; CHECK: vsub.i16 [[D1]], [[D3]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <4 x i16>, <4 x i16>* %p
+ %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
+ store <4 x i16> %tmp, <4 x i16>* %p
+ ret void
+}
+
+define void @test_v8i16_zero_undef(<8 x i16>* %p) {
+; CHECK-LABEL: test_v8i16_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vneg.s16 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vmov.i16 [[Q3:q[0-9]+]], #0xf
+; CHECK: vclz.i16 [[Q1]], [[Q1]]
+; CHECK: vsub.i16 [[Q1]], [[Q3]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <8 x i16>, <8 x i16>* %p
+ %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+ store <8 x i16> %tmp, <8 x i16>* %p
+ ret void
+}
+
+define void @test_v1i32_zero_undef(<1 x i32>* %p) {
+; CHECK-LABEL: test_v1i32_zero_undef:
+ %a = load <1 x i32>, <1 x i32>* %p
+ %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true)
+ store <1 x i32> %tmp, <1 x i32>* %p
+ ret void
+}
+
+define void @test_v2i32_zero_undef(<2 x i32>* %p) {
+; CHECK-LABEL: test_v2i32_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vneg.s32 [[D2:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vmov.i32 [[D3:d[0-9]+]], #0x1f
+; CHECK: vclz.i32 [[D1]], [[D1]]
+; CHECK: vsub.i32 [[D1]], [[D3]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <2 x i32>, <2 x i32>* %p
+ %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
+ store <2 x i32> %tmp, <2 x i32>* %p
+ ret void
+}
+
+define void @test_v4i32_zero_undef(<4 x i32>* %p) {
+; CHECK-LABEL: test_v4i32_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vneg.s32 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vmov.i32 [[Q3:q[0-9]+]], #0x1f
+; CHECK: vclz.i32 [[Q1]], [[Q1]]
+; CHECK: vsub.i32 [[Q1]], [[Q3]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <4 x i32>, <4 x i32>* %p
+ %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+ store <4 x i32> %tmp, <4 x i32>* %p
+ ret void
+}
+
+define void @test_v1i64_zero_undef(<1 x i64>* %p) {
+; CHECK-LABEL: test_v1i64_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0
+; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vadd.i64 [[D1]], [[D1]], [[D3]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vpaddl.u32 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <1 x i64>, <1 x i64>* %p
+ %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true)
+ store <1 x i64> %tmp, <1 x i64>* %p
+ ret void
+}
+
+define void @test_v2i64_zero_undef(<2 x i64>* %p) {
+; CHECK-LABEL: test_v2i64_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0
+; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vadd.i64 [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u32 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <2 x i64>, <2 x i64>* %p
+ %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+ store <2 x i64> %tmp, <2 x i64>* %p
+ ret void
+}