1 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
2 ; rdar://10050222, rdar://10134392
; t1: overwrite lanes 0-1 of the <4 x float> %a with the 64 bits loaded from
; %p, keeping lanes 2-3 of %a. The FileCheck directive below expects the
; instruction "movlps (%rdi), %xmm0" in the generated assembly.
4 define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
7 ; CHECK: movlps (%rdi), %xmm0
; Load the <1 x i64> with align 1 and reinterpret its 64 bits as <2 x float>.
9 %p.val = load <1 x i64>* %p, align 1
10 %0 = bitcast <1 x i64> %p.val to <2 x float>
; Widen to four lanes: lanes 0-1 carry the loaded floats, lanes 2-3 are undef.
11 %shuffle.i = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Mask <4, 5, 2, 3>: indices 4-5 select lanes 0-1 of %shuffle.i (the second
; operand); indices 2-3 select lanes 2-3 of %a.
12 %shuffle1.i = shufflevector <4 x float> %a, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
13 ret <4 x float> %shuffle1.i
; t1a: same signature and final shuffle mask as t1, but the value from %p goes
; through a double / <2 x double> path instead of <1 x i64> / <2 x float>.
; The FileCheck directive below expects "movlps (%rdi), %xmm0".
16 define <4 x float> @t1a(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
19 ; CHECK: movlps (%rdi), %xmm0
; View the pointer as double*.
21 %0 = bitcast <1 x i64>* %p to double*
; NOTE(review): %1 is not defined on any line visible here - presumably a
; load of a double through %0; confirm against the full file.
; Place %1 in lane 0 of a <2 x double>; lane 1 stays undef.
23 %2 = insertelement <2 x double> undef, double %1, i32 0
; Reinterpret the 128-bit vector as <4 x float>.
24 %3 = bitcast <2 x double> %2 to <4 x float>
; Mask <4, 5, 2, 3>: lanes 0-1 come from %3 (the second operand), lanes 2-3
; from %a.
25 %4 = shufflevector <4 x float> %a, <4 x float> %3, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; t2: store the low 64 bits of the <4 x float> %a to %p, going through an
; i64 extract. The FileCheck directive below expects "movlps %xmm0, (%rdi)".
29 define void @t2(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
32 ; CHECK: movlps %xmm0, (%rdi)
; Reinterpret %a as <2 x i64> and take element 0.
34 %cast.i = bitcast <4 x float> %a to <2 x i64>
35 %extract.i = extractelement <2 x i64> %cast.i, i32 0
; Address of element 0 of the <1 x i64> pointed to by %p, typed as i64*.
36 %0 = getelementptr inbounds <1 x i64>* %p, i64 0, i64 0
; Store the extracted i64 with 8-byte alignment.
37 store i64 %extract.i, i64* %0, align 8
; t2a: same signature as t2, but the low 64 bits of %a are extracted and
; stored as a double rather than as an i64. The FileCheck directive below
; expects "movlps %xmm0, (%rdi)".
41 define void @t2a(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
44 ; CHECK: movlps %xmm0, (%rdi)
; View the destination pointer as double*.
46 %0 = bitcast <1 x i64>* %p to double*
; Reinterpret %a as <2 x double> and take element 0.
47 %1 = bitcast <4 x float> %a to <2 x double>
48 %2 = extractelement <2 x double> %1, i32 0
; Store the double; no explicit alignment is given on this store.
49 store double %2, double* %0
; t3: build a <2 x double> whose lane 0 comes from a 64-bit <2 x i32> load
; and whose lane 1 comes from a 128-bit i128 load. The three FileCheck
; directives below expect punpcklqdq, movq and movsd, with the operands
; shown; FileCheck matches them in the order they are written.
54 define <2 x double> @t3() nounwind readonly {
57 ; CHECK: punpcklqdq %xmm1, %xmm0
58 ; CHECK: movq (%rax), %xmm1
59 ; CHECK: movsd %xmm1, %xmm0
; The addresses are deliberately meaningless: i128 from null (align 1) and
; <2 x i32> from an undef pointer (align 8).
60 %tmp0 = load i128* null, align 1
61 %tmp1 = load <2 x i32>* undef, align 8
; Reinterpret the i128 as <16 x i8> and the <2 x i32> as a single i64.
62 %tmp2 = bitcast i128 %tmp0 to <16 x i8>
63 %tmp3 = bitcast <2 x i32> %tmp1 to i64
; Put the i64 into lane 0 of a <2 x i64>; lane 1 stays undef.
64 %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
; View both 128-bit values as <2 x double> for the shuffle.
65 %tmp5 = bitcast <16 x i8> %tmp2 to <2 x double>
66 %tmp6 = bitcast <2 x i64> %tmp4 to <2 x double>
; Mask <2, 1>: index 2 selects lane 0 of %tmp6 (the second operand);
; index 1 selects lane 1 of %tmp5.
67 %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1>
68 ret <2 x double> %tmp7
72 define <2 x i64> @t4() nounwind readonly {
75 ; CHECK: punpcklqdq %xmm0, %xmm1
76 ; CHECK: movq (%rax), %xmm0
77 ; CHECK: movsd %xmm1, %xmm0
78 %tmp0 = load i128* null, align 1
79 %tmp1 = load <2 x i32>* undef, align 8
80 %tmp2 = bitcast i128 %tmp0 to <16 x i8>
81 %tmp3 = bitcast <2 x i32> %tmp1 to i64
82 %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
83 %tmp5 = bitcast <16 x i8> %tmp2 to <2 x i64>
84 %tmp6 = shufflevector <2 x i64> %tmp4, <2 x i64> %tmp5, <2 x i32> <i32 2, i32 1>