%ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
ret <8 x i16> %ret
}
+
+; A full vector load feeding the insertps intrinsic. The expected output is a
+; single insertps followed directly by the return, with no separate mov.
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; The X32 run first sees the pointer argument copied from the stack to a register.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %vec = load <4 x float>* %pb, align 16
+ %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %vec, i32 48)
+ ret <4 x float> %res
+}
+
+;; Vector load feeding insertps with an immediate whose CountS field is
+;; non-zero, so the expected memory operand carries a displacement.
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; The X32 run first sees the pointer argument copied from the stack to a register.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Match the memory operand too, because the displacement of the load matters here.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+ %vec = load <4 x float>* %pb, align 16
+ %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %vec, i32 96)
+ ret <4 x float> %res
+}
+
+; Vector load from an indexed address feeding insertps. The expected memory
+; operand uses two registers plus a displacement of 12.
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; The X32 run first sees the stack arguments copied into registers.
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Match the memory operand too, because the displacement of the load matters here.
+; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+ %slot = getelementptr inbounds <4 x float>* %pb, i64 %index
+ %vec = load <4 x float>* %slot, align 16
+ %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %vec, i32 192)
+ ret <4 x float> %res
+}
+
+; A scalar float is loaded, replicated into all four lanes with a chain of
+; insertelement instructions, and the splat is passed to insertps. The
+; expected output is one insertps followed directly by the return.
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; The X32 run first sees the stack arguments copied into registers.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds float* %fb, i64 %index
+ %scalar = load float* %addr, align 4
+ %splat0 = insertelement <4 x float> undef, float %scalar, i32 0
+ %splat1 = insertelement <4 x float> %splat0, float %scalar, i32 1
+ %splat2 = insertelement <4 x float> %splat1, float %scalar, i32 2
+ %splat = insertelement <4 x float> %splat2, float %scalar, i32 3
+ %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %splat, i32 48)
+ ret <4 x float> %res
+}
+
+; Element 0 of a loaded vector is extracted, replicated into all four lanes,
+; and the splat is passed to insertps. The expected output is one insertps
+; followed directly by the return.
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; The X32 run first sees the pointer argument copied from the stack to a register.
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %vec = load <4 x float>* %b, align 4
+ %elt0 = extractelement <4 x float> %vec, i32 0
+ %splat0 = insertelement <4 x float> undef, float %elt0, i32 0
+ %splat1 = insertelement <4 x float> %splat0, float %elt0, i32 1
+ %splat2 = insertelement <4 x float> %splat1, float %elt0, i32 2
+ %splat = insertelement <4 x float> %splat2, float %elt0, i32 3
+ %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %splat, i32 48)
+ ret <4 x float> %res
+}
+
+;; One splatted scalar load is consumed by four insertps calls whose results
+;; are summed pairwise.
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; The X32 run first sees the stack arguments copied into registers.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds float* %fb, i64 %index
+ %scalar = load float* %addr, align 4
+ %splat0 = insertelement <4 x float> undef, float %scalar, i32 0
+ %splat1 = insertelement <4 x float> %splat0, float %scalar, i32 1
+ %splat2 = insertelement <4 x float> %splat1, float %scalar, i32 2
+ %splat = insertelement <4 x float> %splat2, float %scalar, i32 3
+ %ins.a = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %splat, i32 48)
+ %ins.b = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %splat, i32 48)
+ %ins.c = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %splat, i32 48)
+ %ins.d = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %splat, i32 48)
+ %sum.ab = fadd <4 x float> %ins.a, %ins.b
+ %sum.cd = fadd <4 x float> %ins.c, %ins.d
+ %sum = fadd <4 x float> %sum.ab, %sum.cd
+ ret <4 x float> %sum
+}
+
+; A scalar load placed in lane 0 of an otherwise undef vector is combined with
+; %a by a shuffle whose mask contains undef lanes. The expected output has an
+; insertps and no shufps.
+define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
+; CHECK-LABEL: insertps_with_undefs:
+; CHECK-NOT: shufps
+; CHECK: insertps $32, %xmm0
+; CHECK: ret
+ %scalar = load float* %b, align 4
+ %lane0 = insertelement <4 x float> undef, float %scalar, i32 0
+ ; Result lanes are <%lane0[0], undef, %a[0], %lane0[3]>; lane 3 of %lane0 was never written.
+ %shuf = shufflevector <4 x float> %a, <4 x float> %lane0, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
+ ret <4 x float> %shuf
+}
+
+; Regression test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
+; the destination index to change the load, instead of the source index.
+define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
+; CHECK-LABEL: pr20087:
+; CHECK: insertps $48
+; CHECK: ret
+ %load = load <4 x float> *%ptr ; vector operand that comes from memory
+ %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2> ; result = <a[0], undef, a[2], load[2]>, so destination lane 3 takes source lane 2
+ ret <4 x float> %ret
+}
+
+; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>; the checks capture two xmm registers and expect the insertps result to combine lane 0 of one with lane 3 of the other.
+define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
+; CHECK-LABEL: insertps_pr20411:
+; CHECK: movaps {{[^,]*}}, %[[REG1:xmm.]]
+; CHECK: pshufd {{.*}} ## [[REG2:xmm.]] = mem[3,0,0,0]
+; CHECK: insertps {{.*}} ## xmm1 = [[REG2]][0],[[REG1]][3]{{.*}}
+
+ %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; 0 1 2 3 4 5 6 7 (identity mask over the constant)
+ %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; 4 5 6 7 (identity mask over the constant)
+
+ %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x (element 3 of %gather_load placed in lane 0)
+ %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x (lane 0 from %shuffle116[0], lane 1 from %shuffle109[3])
+
+ %ptrcast = bitcast i32* %RET to <4 x i32>* ; view the i32 output pointer as a <4 x i32> pointer
+ store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4 ; write the shuffled vector through %RET
+ ret void
+}