From: Ahmed Bougacha Date: Fri, 19 Jun 2015 02:15:34 +0000 (+0000) Subject: [ARM] Add D-sized vtrn/vuzp/vzip tests, and cleanup. NFC. X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=2120f5332b34300b5986cd898a155f148d985def;p=oota-llvm.git [ARM] Add D-sized vtrn/vuzp/vzip tests, and cleanup. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@240114 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll index caa5becac1d..4be51acf886 100644 --- a/test/CodeGen/ARM/vtrn.ll +++ b/test/CodeGen/ARM/vtrn.ll @@ -1,9 +1,14 @@ ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: vtrni8: -;CHECK: vtrn.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vtrni8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vtrn.8 d17, d16 +; CHECK-NEXT: vadd.i8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> @@ -12,10 +17,61 @@ define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ret <8 x i8> %tmp5 } +define <16 x i8> @vtrni8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { +; CHECK-LABEL: vtrni8_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d19, [r0] +; CHECK-NEXT: vldr d18, [r1] +; CHECK-NEXT: vmov.u8 r0, d19[0] +; CHECK-NEXT: vmov.8 d16[0], r0 +; CHECK-NEXT: vmov.u8 r0, d18[0] +; CHECK-NEXT: vmov.8 d16[1], r0 +; CHECK-NEXT: vmov.u8 r0, d19[2] +; CHECK-NEXT: vmov.8 d16[2], r0 +; CHECK-NEXT: vmov.u8 r0, d18[2] +; CHECK-NEXT: vmov.8 d16[3], r0 +; CHECK-NEXT: vmov.u8 r0, d19[4] +; CHECK-NEXT: vmov.8 d16[4], r0 +; CHECK-NEXT: vmov.u8 r0, d18[4] +; CHECK-NEXT: vmov.8 d16[5], r0 +; CHECK-NEXT: vmov.u8 r0, d19[6] +; CHECK-NEXT: vmov.8 d16[6], r0 +; CHECK-NEXT: vmov.u8 r0, d18[6] +; CHECK-NEXT: vmov.8 d16[7], r0 +; CHECK-NEXT: vmov.u8 r0, d19[1] +; CHECK-NEXT: vmov.8 d17[0], r0 +; CHECK-NEXT: vmov.u8 r0, d18[1] +; CHECK-NEXT: vmov.8 d17[1], r0 +; CHECK-NEXT: vmov.u8 r0, d19[3] +; CHECK-NEXT: vmov.8 d17[2], r0 +; CHECK-NEXT: vmov.u8 r0, d18[3] +; CHECK-NEXT: vmov.8 d17[3], r0 +; CHECK-NEXT: vmov.u8 r0, d19[5] +; CHECK-NEXT: vmov.8 d17[4], r0 +; CHECK-NEXT: vmov.u8 r0, d18[5] +; CHECK-NEXT: vmov.8 d17[5], r0 +; CHECK-NEXT: vmov.u8 r0, d19[7] +; CHECK-NEXT: vmov.8 d17[6], r0 +; CHECK-NEXT: vmov.u8 r0, d18[7] +; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: vtrni16: -;CHECK: vtrn.16 -;CHECK-NEXT: vadd.i16 +; CHECK-LABEL: vtrni16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vtrn.16 d17, d16 +; CHECK-NEXT: vadd.i16 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> @@ -24,10 +80,45 @@ define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ret <4 x i16> %tmp5 } +define <8 x i16> @vtrni16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind { +; CHECK-LABEL: vtrni16_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vmov.16 d18[0], r0 +; CHECK-NEXT: vmov.u16 r0, d17[0] +; CHECK-NEXT: vmov.16 d18[1], r0 +; CHECK-NEXT: vmov.u16 r0, d16[2] +; CHECK-NEXT: vmov.16 d18[2], r0 +; CHECK-NEXT: vmov.u16 r0, d17[2] +; CHECK-NEXT: vmov.16 d18[3], r0 +; CHECK-NEXT: vmov.u16 r0, d16[1] +; CHECK-NEXT: vmov.16 d19[0], r0 +; CHECK-NEXT: vmov.u16 r0, d17[1] +; CHECK-NEXT: vmov.16 d19[1], r0 +; CHECK-NEXT: vmov.u16 r0, d16[3] +; CHECK-NEXT: vmov.16 d19[2], r0 +; CHECK-NEXT: vmov.u16 r0, d17[3] +; CHECK-NEXT: vmov.16 d19[3], r0 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> + ret <8 x i16> %tmp3 +} + define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: vtrni32: -;CHECK: vtrn.32 -;CHECK-NEXT: vadd.i32 +; CHECK-LABEL: vtrni32: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vtrn.32 d17, d16 +; CHECK-NEXT: vadd.i32 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> @@ -36,10 +127,31 @@ define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ret <2 x i32> %tmp5 } +define <4 x i32> @vtrni32_Qres(<2 x i32>* %A, <2 x i32>* %B) nounwind { +; CHECK-LABEL: vtrni32_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vrev64.32 q9, q8 +; CHECK-NEXT: vuzp.32 q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <2 x i32>, <2 x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> + ret <4 x i32> %tmp3 +} + define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK-LABEL: vtrnf: -;CHECK: vtrn.32 -;CHECK-NEXT: vadd.f32 +; CHECK-LABEL: vtrnf: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vtrn.32 d17, d16 +; CHECK-NEXT: vadd.f32 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> @@ -48,10 +160,32 @@ define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind { ret <2 x float> %tmp5 } +define <4 x float> @vtrnf_Qres(<2 x float>* %A, <2 x float>* %B) nounwind { +; CHECK-LABEL: vtrnf_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vrev64.32 q9, q8 +; CHECK-NEXT: vuzp.32 q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <2 x float>, <2 x float>* %A + %tmp2 = load <2 x float>, <2 x float>* %B + %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <4 x i32> + ret <4 x float> %tmp3 +} + define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: vtrnQi8: -;CHECK: vtrn.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vtrnQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vtrn.8 q9, q8 +; CHECK-NEXT: vadd.i8 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> @@ -60,10 +194,31 @@ define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ret <16 x i8> %tmp5 } +define <32 x i8> @vtrnQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind { +; CHECK-LABEL: vtrnQi8_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vtrn.8 q9, q8 +; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> + ret <32 x i8> %tmp3 +} + define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: vtrnQi16: -;CHECK: vtrn.16 -;CHECK-NEXT: vadd.i16 +; CHECK-LABEL: vtrnQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vtrn.16 q9, q8 +; CHECK-NEXT: vadd.i16 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -72,10 +227,31 @@ define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ret <8 x i16> %tmp5 } +define <16 x i16> @vtrnQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind { +; CHECK-LABEL: vtrnQi16_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vtrn.16 q9, q8 +; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> + ret <16 x i16> %tmp3 +} + define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: vtrnQi32: -;CHECK: vtrn.32 -;CHECK-NEXT: vadd.i32 +; CHECK-LABEL: vtrnQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vtrn.32 q9, q8 +; CHECK-NEXT: vadd.i32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> @@ -84,10 +260,31 @@ define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ret <4 x i32> %tmp5 } +define <8 x i32> @vtrnQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind { +; CHECK-LABEL: vtrnQi32_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vtrn.32 q9, q8 +; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> + ret <8 x i32> %tmp3 +} + define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK-LABEL: vtrnQf: -;CHECK: vtrn.32 -;CHECK-NEXT: vadd.f32 +; CHECK-LABEL: vtrnQf: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vtrn.32 q9, q8 +; CHECK-NEXT: vadd.f32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> @@ -96,12 +293,31 @@ define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind { ret <4 x float> %tmp5 } -; Undef shuffle indices should not prevent matching to VTRN: +define <8 x float> @vtrnQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind { +; CHECK-LABEL: vtrnQf_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vtrn.32 q9, q8 +; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x float>, <4 x float>* %A + %tmp2 = load <4 x float>, <4 x float>* %B + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> + ret <8 x float> %tmp3 +} + define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: vtrni8_undef: -;CHECK: vtrn.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vtrni8_undef: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vtrn.8 d17, d16 +; CHECK-NEXT: vadd.i8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> @@ -110,10 +326,54 @@ define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { ret <8 x i8> %tmp5 } +define <16 x i8> @vtrni8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { +; CHECK-LABEL: vtrni8_undef_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vldr d19, [r1] +; CHECK-NEXT: vmov.u8 r0, d18[0] +; CHECK-NEXT: vmov.8 d16[0], r0 +; CHECK-NEXT: vmov.u8 r0, d18[2] +; CHECK-NEXT: vmov.8 d16[2], r0 +; CHECK-NEXT: vmov.u8 r0, d19[2] +; CHECK-NEXT: vmov.8 d16[3], r0 +; CHECK-NEXT: vmov.u8 r0, d19[4] +; CHECK-NEXT: vmov.8 d16[5], r0 +; CHECK-NEXT: vmov.u8 r0, d18[6] +; CHECK-NEXT: vmov.8 d16[6], r0 +; CHECK-NEXT: vmov.u8 r0, d19[6] +; CHECK-NEXT: vmov.8 d16[7], r0 +; CHECK-NEXT: vmov.u8 r0, d18[1] +; CHECK-NEXT: vmov.8 d17[0], r0 +; CHECK-NEXT: vmov.u8 r0, d19[1] +; CHECK-NEXT: vmov.8 d17[1], r0 +; CHECK-NEXT: vmov.u8 r0, d18[3] +; CHECK-NEXT: vmov.8 d17[2], r0 +; CHECK-NEXT: vmov.u8 r0, d19[3] +; CHECK-NEXT: vmov.8 d17[3], r0 +; CHECK-NEXT: vmov.u8 r0, d18[5] +; CHECK-NEXT: vmov.8 d17[4], r0 +; CHECK-NEXT: vmov.u8 r0, d19[7] +; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: vtrnQi16_undef: -;CHECK: vtrn.16 -;CHECK-NEXT: vadd.i16 +; CHECK-LABEL: vtrnQi16_undef: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vtrn.16 q9, q8 +; CHECK-NEXT: vadd.i16 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -122,3 +382,17 @@ define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind { ret <8 x i16> %tmp5 } +define <16 x i16> @vtrnQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind { +; CHECK-LABEL: vtrnQi16_undef_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vtrn.16 q9, q8 +; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> + ret <16 x i16> %tmp3 +} diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll index 7a7306a2659..973dc773aa9 100644 --- a/test/CodeGen/ARM/vuzp.ll +++ b/test/CodeGen/ARM/vuzp.ll @@ -1,9 +1,14 @@ ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: vuzpi8: -;CHECK: vuzp.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vuzpi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vuzp.8 d17, d16 +; CHECK-NEXT: vadd.i8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> @@ -12,10 +17,61 @@ define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ret <8 x i8> %tmp5 } +define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { +; CHECK-LABEL: vuzpi8_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d19, [r0] +; CHECK-NEXT: vldr d18, [r1] +; CHECK-NEXT: vmov.u8 r0, d19[0] +; CHECK-NEXT: vmov.8 d16[0], r0 +; CHECK-NEXT: vmov.u8 r0, d19[2] +; CHECK-NEXT: vmov.8 d16[1], r0 +; CHECK-NEXT: vmov.u8 r0, d19[4] +; CHECK-NEXT: vmov.8 d16[2], r0 +; CHECK-NEXT: vmov.u8 r0, d19[6] +; CHECK-NEXT: vmov.8 d16[3], r0 +; CHECK-NEXT: vmov.u8 r0, d18[0] +; CHECK-NEXT: vmov.8 d16[4], r0 +; CHECK-NEXT: vmov.u8 r0, d18[2] +; CHECK-NEXT: vmov.8 d16[5], r0 +; CHECK-NEXT: vmov.u8 r0, d18[4] +; CHECK-NEXT: vmov.8 d16[6], r0 +; CHECK-NEXT: vmov.u8 r0, d18[6] +; CHECK-NEXT: vmov.8 d16[7], r0 +; CHECK-NEXT: vmov.u8 r0, d19[1] +; CHECK-NEXT: vmov.8 d17[0], r0 +; CHECK-NEXT: vmov.u8 r0, d19[3] +; CHECK-NEXT: vmov.8 d17[1], r0 +; CHECK-NEXT: vmov.u8 r0, d19[5] +; CHECK-NEXT: vmov.8 d17[2], r0 +; CHECK-NEXT: vmov.u8 r0, d19[7] +; CHECK-NEXT: vmov.8 d17[3], r0 +; CHECK-NEXT: vmov.u8 r0, d18[1] +; CHECK-NEXT: vmov.8 d17[4], r0 +; CHECK-NEXT: vmov.u8 r0, d18[3] +; CHECK-NEXT: vmov.8 d17[5], r0 +; CHECK-NEXT: vmov.u8 r0, d18[5] +; CHECK-NEXT: vmov.8 d17[6], r0 +; CHECK-NEXT: vmov.u8 r0, d18[7] +; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: vuzpi16: -;CHECK: vuzp.16 -;CHECK-NEXT: vadd.i16 +; CHECK-LABEL: vuzpi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vuzp.16 d17, d16 +; CHECK-NEXT: vadd.i16 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> @@ -24,12 +80,48 @@ define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ret <4 x i16> %tmp5 } +define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind { +; CHECK-LABEL: vuzpi16_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vmov.16 d18[0], r0 +; CHECK-NEXT: vmov.u16 r0, d16[2] +; CHECK-NEXT: vmov.16 d18[1], r0 +; CHECK-NEXT: vmov.u16 r0, d17[0] +; CHECK-NEXT: vmov.16 d18[2], r0 +; CHECK-NEXT: vmov.u16 r0, d17[2] +; CHECK-NEXT: vmov.16 d18[3], r0 +; CHECK-NEXT: vmov.u16 r0, d16[1] +; CHECK-NEXT: vmov.16 d19[0], r0 +; CHECK-NEXT: vmov.u16 r0, d16[3] +; CHECK-NEXT: vmov.16 d19[1], r0 +; CHECK-NEXT: vmov.u16 r0, d17[1] +; CHECK-NEXT: vmov.16 d19[2], r0 +; CHECK-NEXT: vmov.u16 r0, d17[3] +; CHECK-NEXT: vmov.16 d19[3], r0 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> + ret <8 x i16> %tmp3 +} + ; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors. define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: vuzpQi8: -;CHECK: vuzp.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vuzpQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vuzp.8 q9, q8 +; CHECK-NEXT: vadd.i8 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> @@ -38,10 +130,31 @@ define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ret <16 x i8> %tmp5 } +define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind { +; CHECK-LABEL: vuzpQi8_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vuzp.8 q9, q8 +; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> + ret <32 x i8> %tmp3 +} + define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: vuzpQi16: -;CHECK: vuzp.16 -;CHECK-NEXT: vadd.i16 +; CHECK-LABEL: vuzpQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vuzp.16 q9, q8 +; CHECK-NEXT: vadd.i16 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -50,10 +163,31 @@ define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ret <8 x i16> %tmp5 } +define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind { +; CHECK-LABEL: vuzpQi16_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vuzp.16 q9, q8 +; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> + ret <16 x i16> %tmp3 +} + define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: vuzpQi32: -;CHECK: vuzp.32 -;CHECK-NEXT: vadd.i32 +; CHECK-LABEL: vuzpQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vuzp.32 q9, q8 +; CHECK-NEXT: vadd.i32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> @@ -62,10 +196,31 @@ define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ret <4 x i32> %tmp5 } +define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind { +; CHECK-LABEL: vuzpQi32_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vuzp.32 q9, q8 +; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> + ret <8 x i32> %tmp3 +} + define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK-LABEL: vuzpQf: -;CHECK: vuzp.32 -;CHECK-NEXT: vadd.f32 +; CHECK-LABEL: vuzpQf: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vuzp.32 q9, q8 +; CHECK-NEXT: vadd.f32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> @@ -74,12 +229,32 @@ define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind { ret <4 x float> %tmp5 } +define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind { +; CHECK-LABEL: vuzpQf_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vuzp.32 q9, q8 +; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x float>, <4 x float>* %A + %tmp2 = load <4 x float>, <4 x float>* %B + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> + ret <8 x float> %tmp3 +} + ; Undef shuffle indices should not prevent matching to VUZP: define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: vuzpi8_undef: -;CHECK: vuzp.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vuzpi8_undef: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vuzp.8 d17, d16 +; CHECK-NEXT: vadd.i8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> @@ -88,10 +263,54 @@ define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { ret <8 x i8> %tmp5 } +define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { +; CHECK-LABEL: vuzpi8_undef_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vldr d19, [r1] +; CHECK-NEXT: vmov.u8 r0, d18[0] +; CHECK-NEXT: vmov.8 d16[0], r0 +; CHECK-NEXT: vmov.u8 r0, d18[2] +; CHECK-NEXT: vmov.8 d16[1], r0 +; CHECK-NEXT: vmov.u8 r0, d19[0] +; CHECK-NEXT: vmov.8 d16[4], r0 +; CHECK-NEXT: vmov.u8 r0, d19[2] +; CHECK-NEXT: vmov.8 d16[5], r0 +; CHECK-NEXT: vmov.u8 r0, d19[4] +; CHECK-NEXT: vmov.8 d16[6], r0 +; CHECK-NEXT: vmov.u8 r0, d19[6] +; CHECK-NEXT: vmov.8 d16[7], r0 +; CHECK-NEXT: vmov.u8 r0, d18[1] +; CHECK-NEXT: vmov.8 d17[0], r0 +; CHECK-NEXT: vmov.u8 r0, d18[3] +; CHECK-NEXT: vmov.8 d17[1], r0 +; CHECK-NEXT: vmov.u8 r0, d18[5] +; CHECK-NEXT: vmov.8 d17[2], r0 +; CHECK-NEXT: vmov.u8 r0, d18[7] +; CHECK-NEXT: vmov.8 d17[3], r0 +; CHECK-NEXT: vmov.u8 r0, d19[5] +; CHECK-NEXT: vmov.8 d17[6], r0 +; CHECK-NEXT: vmov.u8 r0, d19[7] +; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: vuzpQi16_undef: -;CHECK: vuzp.16 -;CHECK-NEXT: vadd.i16 +; CHECK-LABEL: vuzpQi16_undef: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vuzp.16 q9, q8 +; CHECK-NEXT: vadd.i16 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -100,3 +319,17 @@ define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind { ret <8 x i16> %tmp5 } +define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind { +; CHECK-LABEL: vuzpQi16_undef_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vuzp.16 q9, q8 +; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> + ret <16 x i16> %tmp3 +} diff --git a/test/CodeGen/ARM/vzip.ll b/test/CodeGen/ARM/vzip.ll index a1b5b4549ac..20767251a67 100644 --- a/test/CodeGen/ARM/vzip.ll +++ b/test/CodeGen/ARM/vzip.ll @@ -1,9 +1,14 @@ ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: vzipi8: -;CHECK: vzip.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vzipi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vzip.8 d17, d16 +; CHECK-NEXT: vadd.i8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> @@ -12,10 +17,61 @@ define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ret <8 x i8> %tmp5 } +define <16 x i8> @vzipi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { +; CHECK-LABEL: vzipi8_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d19, [r0] +; CHECK-NEXT: vldr d18, [r1] +; CHECK-NEXT: vmov.u8 r0, d19[0] +; CHECK-NEXT: vmov.8 d16[0], r0 +; CHECK-NEXT: vmov.u8 r0, d18[0] +; CHECK-NEXT: vmov.8 d16[1], r0 +; CHECK-NEXT: vmov.u8 r0, d19[1] +; CHECK-NEXT: vmov.8 d16[2], r0 +; CHECK-NEXT: vmov.u8 r0, d18[1] +; CHECK-NEXT: vmov.8 d16[3], r0 +; CHECK-NEXT: vmov.u8 r0, d19[2] +; CHECK-NEXT: vmov.8 d16[4], r0 +; CHECK-NEXT: vmov.u8 r0, d18[2] +; CHECK-NEXT: vmov.8 d16[5], r0 +; CHECK-NEXT: vmov.u8 r0, d19[3] +; CHECK-NEXT: vmov.8 d16[6], r0 +; CHECK-NEXT: vmov.u8 r0, d18[3] +; CHECK-NEXT: vmov.8 d16[7], r0 +; CHECK-NEXT: vmov.u8 r0, d19[4] +; CHECK-NEXT: vmov.8 d17[0], r0 +; CHECK-NEXT: vmov.u8 r0, d18[4] +; CHECK-NEXT: vmov.8 d17[1], r0 +; CHECK-NEXT: vmov.u8 r0, d19[5] +; CHECK-NEXT: vmov.8 d17[2], r0 +; CHECK-NEXT: vmov.u8 r0, d18[5] +; CHECK-NEXT: vmov.8 d17[3], r0 +; CHECK-NEXT: vmov.u8 r0, d19[6] +; CHECK-NEXT: vmov.8 d17[4], r0 +; CHECK-NEXT: vmov.u8 r0, d18[6] +; CHECK-NEXT: vmov.8 d17[5], r0 +; CHECK-NEXT: vmov.u8 r0, d19[7] +; CHECK-NEXT: vmov.8 d17[6], r0 +; CHECK-NEXT: vmov.u8 r0, d18[7] +; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: vzipi16: -;CHECK: vzip.16 -;CHECK-NEXT: vadd.i16 +; CHECK-LABEL: vzipi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vzip.16 d17, d16 +; CHECK-NEXT: vadd.i16 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> @@ -24,12 +80,48 @@ define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ret <4 x i16> %tmp5 } +define <8 x i16> @vzipi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind { +; CHECK-LABEL: vzipi16_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vmov.16 d18[0], r0 +; CHECK-NEXT: vmov.u16 r0, d17[0] +; CHECK-NEXT: vmov.16 d18[1], r0 +; CHECK-NEXT: vmov.u16 r0, d16[1] +; CHECK-NEXT: vmov.16 d18[2], r0 +; CHECK-NEXT: vmov.u16 r0, d17[1] +; CHECK-NEXT: vmov.16 d18[3], r0 +; CHECK-NEXT: vmov.u16 r0, d16[2] +; CHECK-NEXT: vmov.16 d19[0], r0 +; CHECK-NEXT: vmov.u16 r0, d17[2] +; CHECK-NEXT: vmov.16 d19[1], r0 +; CHECK-NEXT: vmov.u16 r0, d16[3] +; CHECK-NEXT: vmov.16 d19[2], r0 +; CHECK-NEXT: vmov.u16 r0, d17[3] +; CHECK-NEXT: vmov.16 d19[3], r0 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> + ret <8 x i16> %tmp3 +} + ; VZIP.32 is equivalent to VTRN.32 for 64-bit vectors. define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: vzipQi8: -;CHECK: vzip.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vzipQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vzip.8 q9, q8 +; CHECK-NEXT: vadd.i8 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> @@ -38,10 +130,31 @@ define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ret <16 x i8> %tmp5 } +define <32 x i8> @vzipQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind { +; CHECK-LABEL: vzipQi8_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vzip.8 q9, q8 +; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> + ret <32 x i8> %tmp3 +} + define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: vzipQi16: -;CHECK: vzip.16 -;CHECK-NEXT: vadd.i16 +; CHECK-LABEL: vzipQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vzip.16 q9, q8 +; CHECK-NEXT: vadd.i16 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -50,10 +163,31 @@ define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ret <8 x i16> %tmp5 } +define <16 x i16> @vzipQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind { +; CHECK-LABEL: vzipQi16_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vzip.16 q9, q8 +; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> + ret <16 x i16> %tmp3 +} + define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: vzipQi32: -;CHECK: vzip.32 -;CHECK-NEXT: vadd.i32 +; CHECK-LABEL: vzipQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vzip.32 q9, q8 +; CHECK-NEXT: vadd.i32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> @@ -62,10 +196,31 @@ define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ret <4 x i32> %tmp5 } +define <8 x i32> @vzipQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind { +; CHECK-LABEL: vzipQi32_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vzip.32 q9, q8 +; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> + ret <8 x i32> %tmp3 +} + define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK-LABEL: vzipQf: -;CHECK: vzip.32 -;CHECK-NEXT: vadd.f32 +; CHECK-LABEL: vzipQf: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vzip.32 q9, q8 +; CHECK-NEXT: vadd.f32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> @@ -74,12 +229,32 @@ define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind { ret <4 x float> %tmp5 } +define <8 x float> @vzipQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind { +; CHECK-LABEL: vzipQf_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vzip.32 q9, q8 +; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <4 x float>, <4 x float>* %A + %tmp2 = load <4 x float>, <4 x float>* %B + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> + ret <8 x float> %tmp3 +} + ; Undef shuffle indices should not prevent matching to VZIP: define <8 x i8> @vzipi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: vzipi8_undef: -;CHECK: vzip.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vzipi8_undef: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vzip.8 d17, d16 +; CHECK-NEXT: vadd.i8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> @@ -88,10 +263,54 @@ define <8 x i8> @vzipi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { ret <8 x i8> %tmp5 } +define <16 x i8> @vzipi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { +; CHECK-LABEL: vzipi8_undef_Qres: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vldr d19, [r1] +; CHECK-NEXT: vmov.u8 r0, d18[0] +; CHECK-NEXT: vmov.8 d16[0], r0 +; CHECK-NEXT: vmov.u8 r0, d18[1] +; CHECK-NEXT: vmov.8 d16[2], r0 +; CHECK-NEXT: vmov.u8 r0, d19[1] +; CHECK-NEXT: vmov.8 d16[3], r0 +; CHECK-NEXT: vmov.u8 r0, d19[2] +; CHECK-NEXT: vmov.8 d16[5], r0 +; CHECK-NEXT: vmov.u8 r0, d18[3] +; CHECK-NEXT: vmov.8 d16[6], r0 +; CHECK-NEXT: vmov.u8 r0, d19[3] +; CHECK-NEXT: vmov.8 d16[7], r0 +; CHECK-NEXT: vmov.u8 r0, d18[4] +; CHECK-NEXT: vmov.8 d17[0], r0 +; CHECK-NEXT: vmov.u8 r0, d19[4] +; CHECK-NEXT: vmov.8 d17[1], r0 +; CHECK-NEXT: vmov.u8 r0, d18[5] +; CHECK-NEXT: vmov.8 d17[2], r0 +; CHECK-NEXT: vmov.u8 r0, d19[5] +; CHECK-NEXT: vmov.8 d17[3], r0 +; CHECK-NEXT: vmov.u8 r0, d18[6] +; CHECK-NEXT: vmov.8 d17[4], r0 +; CHECK-NEXT: vmov.u8 r0, d19[7] +; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + define <16 x i8> @vzipQi8_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: vzipQi8_undef: -;CHECK: vzip.8 -;CHECK-NEXT: vadd.i8 +; CHECK-LABEL: vzipQi8_undef: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vzip.8 q9, q8 +; CHECK-NEXT: vadd.i8 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> @@ -100,3 +319,17 @@ define <16 x i8> @vzipQi8_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind { ret <16 x i8> %tmp5 } +define <32 x i8> @vzipQi8_undef_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind { +; CHECK-LABEL: vzipQi8_undef_QQres: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vzip.8 q9, q8 +; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov pc, lr + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> + ret <32 x i8> %tmp3 +}