test/CodeGen/X86/sse2.ll

   1 ; Tests for SSE2 and below, without SSE3+.
   2 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
   3
   4 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
     ; Overwrites lane 0 of the vector loaded from %A with the scalar %B (shuffle
     ; mask <2,1>: lane 0 from %tmp7, lane 1 from %tmp3) and stores the result to
     ; %r. The expected output does the lane replacement with a single movlpd
     ; that reads from a stack slot.
   5 ; CHECK-LABEL: test1:
   6 ; CHECK:       ## BB#0:
   7 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
   8 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   9 ; CHECK-NEXT:    movapd (%ecx), %xmm0
  10 ; CHECK-NEXT:    movlpd {{[0-9]+}}(%esp), %xmm0
  11 ; CHECK-NEXT:    movapd %xmm0, (%eax)
  12 ; CHECK-NEXT:    retl
  13         %tmp3 = load <2 x double>* %A, align 16
  14         %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  15         %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
  16         store <2 x double> %tmp9, <2 x double>* %r, align 16
  17         ret void
  18 }
  19
  20 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
     ; Overwrites lane 1 of the vector loaded from %A with the scalar %B (shuffle
     ; mask <0,2>: lane 0 from %tmp3, lane 1 from %tmp7) and stores the result to
     ; %r. The expected output does the lane replacement with a single movhpd
     ; that reads from a stack slot.
  21 ; CHECK-LABEL: test2:
  22 ; CHECK:       ## BB#0:
  23 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
  24 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
  25 ; CHECK-NEXT:    movapd (%ecx), %xmm0
  26 ; CHECK-NEXT:    movhpd {{[0-9]+}}(%esp), %xmm0
  27 ; CHECK-NEXT:    movapd %xmm0, (%eax)
  28 ; CHECK-NEXT:    retl
  29         %tmp3 = load <2 x double>* %A, align 16
  30         %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  31         %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
  32         store <2 x double> %tmp9, <2 x double>* %r, align 16
  33         ret void
  34 }
  35
  36
  37 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
     ; Builds <A[0], B[0], A[1], B[1]> from scalar extractelement/insertelement
     ; chains over the vectors loaded from %A and %B, then stores it to %res.
     ; The expected output recognises the interleave and emits one unpcklps
     ; with a memory operand instead of per-element moves.
  38 ; CHECK-LABEL: test3:
  39 ; CHECK:       ## BB#0:
  40 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
  41 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
  42 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
  43 ; CHECK-NEXT:    movaps (%edx), %xmm0
  44 ; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
  45 ; CHECK-NEXT:    movaps %xmm0, (%eax)
  46 ; CHECK-NEXT:    retl
  47         %tmp = load <4 x float>* %B             ; <<4 x float>> [#uses=2]
  48         %tmp3 = load <4 x float>* %A            ; <<4 x float>> [#uses=2]
  49         %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0          ; <float> [#uses=1]
  50         %tmp7 = extractelement <4 x float> %tmp, i32 0          ; <float> [#uses=1]
  51         %tmp8 = extractelement <4 x float> %tmp3, i32 1         ; <float> [#uses=1]
  52         %tmp9 = extractelement <4 x float> %tmp, i32 1          ; <float> [#uses=1]
  53         %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0             ; <<4 x float>> [#uses=1]
  54         %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1           ; <<4 x float>> [#uses=1]
  55         %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2           ; <<4 x float>> [#uses=1]
  56         %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3           ; <<4 x float>> [#uses=1]
  57         store <4 x float> %tmp13, <4 x float>* %res
  58         ret void
  59 }
  60
  61 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
     ; Shuffles %X against an undef second operand with mask <2,6,3,7>; indices
     ; 6 and 7 select from the undef operand, so only result lanes 0 and 2
     ; (X[2] and X[3]) are defined. The expected output is a single pshufd
     ; followed by the store to %res.
  62 ; CHECK-LABEL: test4:
  63 ; CHECK:       ## BB#0:
  64 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
  65 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0]
  66 ; CHECK-NEXT:    movdqa %xmm0, (%eax)
  67 ; CHECK-NEXT:    retl
  68         %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >               ; <<4 x float>> [#uses=1]
  69         store <4 x float> %tmp5, <4 x float>* %res
  70         ret void
  71 }
  72
  73 define <4 x i32> @test5(i8** %ptr) nounwind {
     ; Loads a float through the pointer stored at %ptr, places it in lane 0 of
     ; a vector whose other lanes are 0.0, then widens it in two steps: the low
     ; 8 bytes are interleaved with zero bytes, and the resulting words are
     ; interleaved behind the low 4 words of a zero vector. The result is
     ; returned as <4 x i32>. The expected output is a movss load, a pxor for
     ; the zero vector, then punpcklbw and punpcklwd.
  74 ; CHECK-LABEL: test5:
  75 ; CHECK:       ## BB#0:
  76 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
  77 ; CHECK-NEXT:    movl (%eax), %eax
  78 ; CHECK-NEXT:    movss (%eax), %xmm1
  79 ; CHECK-NEXT:    pxor %xmm0, %xmm0
  80 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
  81 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  82 ; CHECK-NEXT:    retl
  83         %tmp = load i8** %ptr           ; <i8*> [#uses=1]
  84         %tmp.upgrd.1 = bitcast i8* %tmp to float*               ; <float*> [#uses=1]
  85         %tmp.upgrd.2 = load float* %tmp.upgrd.1         ; <float> [#uses=1]
  86         %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0               ; <<4 x float>> [#uses=1]
  87         %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1               ; <<4 x float>> [#uses=1]
  88         %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2             ; <<4 x float>> [#uses=1]
  89         %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3            ; <<4 x float>> [#uses=1]
  90         %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>                ; <<16 x i8>> [#uses=1]
  91         %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >               ; <<16 x i8>> [#uses=1]
  92         %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>          ; <<8 x i16>> [#uses=1]
  93         %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >                ; <<8 x i16>> [#uses=1]
  94         %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>            ; <<4 x i32>> [#uses=1]
  95         ret <4 x i32> %tmp36
  96 }
  97
  98 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
     ; Shuffles the vector loaded from %A against undef with mask <0,5,6,7>:
     ; only lane 0 is defined (A[0]); lanes 1-3 select from the undef operand.
     ; The expected output contains no shuffle instruction at all, just a
     ; movaps load from %A and a movaps store to %res.
  99 ; CHECK-LABEL: test6:
 100 ; CHECK:       ## BB#0:
 101 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 102 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 103 ; CHECK-NEXT:    movaps (%ecx), %xmm0
 104 ; CHECK-NEXT:    movaps %xmm0, (%eax)
 105 ; CHECK-NEXT:    retl
 106   %tmp1 = load <4 x float>* %A            ; <<4 x float>> [#uses=1]
 107   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
 108   store <4 x float> %tmp2, <4 x float>* %res
 109   ret void
 110 }
 111
 112 define void @test7() nounwind {
     ; Bitcasts an all-zero <4 x i32> to <4 x float>, splats its lane 0 (mask
     ; zeroinitializer) and stores the result to the null address. The two
     ; unnamed instructions define %1 and %2. The expected output materialises
     ; the zero vector with xorps and stores it to absolute address 0.
 113 ; CHECK-LABEL: test7:
 114 ; CHECK:       ## BB#0:
 115 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 116 ; CHECK-NEXT:    movaps %xmm0, 0
 117 ; CHECK-NEXT:    retl
 118   bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
 119   shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
 120   store <4 x float> %2, <4 x float>* null
 121   ret void
 122 }
 123
 124 @x = external global [4 x i32] ; external 4 x i32 array; @test8 loads each of its elements
 125
 126 define <2 x i64> @test8() nounwind {
     ; Loads the four consecutive i32 elements of @x with separate scalar
     ; loads, assembles them in order into a <4 x i32> and returns it bitcast
     ; to <2 x i64>. The expected output merges the four loads into a single
     ; movups through the non-lazy pointer for @x.
 127 ; CHECK-LABEL: test8:
 128 ; CHECK:       ## BB#0:
 129 ; CHECK-NEXT:    movl L_x$non_lazy_ptr, %eax
 130 ; CHECK-NEXT:    movups (%eax), %xmm0
 131 ; CHECK-NEXT:    retl
 132         %tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0)            ; <i32> [#uses=1]
 133         %tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1)           ; <i32> [#uses=1]
 134         %tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2)           ; <i32> [#uses=1]
 135         %tmp7 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 3)           ; <i32> [#uses=1]
 136         %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0           ; <<4 x i32>> [#uses=1]
 137         %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1         ; <<4 x i32>> [#uses=1]
 138         %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2               ; <<4 x i32>> [#uses=1]
 139         %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3               ; <<4 x i32>> [#uses=1]
 140         %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>          ; <<2 x i64>> [#uses=1]
 141         ret <2 x i64> %tmp16
 142 }
 143
 144 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
     ; Builds <a, b, c, d> from four float arguments that follow an unused i32
     ; argument. The expected output loads all four with one unaligned movups
     ; from the stack.
     ; NOTE(review): presumably %dummy is there to push the floats off 16-byte
     ; alignment, which is why movups is expected here while @test10 expects
     ; movaps -- confirm against the target's stack layout.
 145 ; CHECK-LABEL: test9:
 146 ; CHECK:       ## BB#0:
 147 ; CHECK-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
 148 ; CHECK-NEXT:    retl
 149         %tmp = insertelement <4 x float> undef, float %a, i32 0         ; <<4 x float>> [#uses=1]
 150         %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1                ; <<4 x float>> [#uses=1]
 151         %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2              ; <<4 x float>> [#uses=1]
 152         %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3              ; <<4 x float>> [#uses=1]
 153         ret <4 x float> %tmp13
 154 }
 155
 156 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
     ; Builds <a, b, c, d> from the four float arguments, with no leading
     ; padding argument. The expected output loads all four with one aligned
     ; movaps from the stack.
 157 ; CHECK-LABEL: test10:
 158 ; CHECK:       ## BB#0:
 159 ; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 160 ; CHECK-NEXT:    retl
 161         %tmp = insertelement <4 x float> undef, float %a, i32 0         ; <<4 x float>> [#uses=1]
 162         %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1                ; <<4 x float>> [#uses=1]
 163         %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2              ; <<4 x float>> [#uses=1]
 164         %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3              ; <<4 x float>> [#uses=1]
 165         ret <4 x float> %tmp13
 166 }
 167
 168 define <2 x double> @test11(double %a, double %b) nounwind {
     ; Builds <a, b> from the two double arguments. The expected output loads
     ; both with one aligned movaps from the stack.
 169 ; CHECK-LABEL: test11:
 170 ; CHECK:       ## BB#0:
 171 ; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 172 ; CHECK-NEXT:    retl
 173         %tmp = insertelement <2 x double> undef, double %a, i32 0               ; <<2 x double>> [#uses=1]
 174         %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1               ; <<2 x double>> [#uses=1]
 175         ret <2 x double> %tmp7
 176 }
 177
 178 define void @test12() nounwind {
     ; Loads a vector V from the null address and adds two shuffles of it:
     ;   %tmp2 = <V[0], V[1], 1.0, 1.0>   (mask <0,1,6,7> against a 1.0 splat)
     ;   %tmp3 = <V[2], V[3], 0.0, 0.0>   (mask <2,3,6,7> against zero)
     ; The sum is stored back to the null address. The expected output forms
     ; %tmp3 with xorps + movhlps and %tmp2 with a shufps whose second operand
     ; comes from memory, then adds them with addps.
 179 ; CHECK-LABEL: test12:
 180 ; CHECK:       ## BB#0:
 181 ; CHECK-NEXT:    movaps 0, %xmm0
 182 ; CHECK-NEXT:    xorps %xmm1, %xmm1
 183 ; CHECK-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 184 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
 185 ; CHECK-NEXT:    addps %xmm1, %xmm0
 186 ; CHECK-NEXT:    movaps %xmm0, 0
 187 ; CHECK-NEXT:    retl
 188   %tmp1 = load <4 x float>* null          ; <<4 x float>> [#uses=2]
 189   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
 190   %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
 191   %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
 192   store <4 x float> %tmp4, <4 x float>* null
 193   ret void
 194 }
 195
 196 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
     ; Shuffles the vectors loaded from %B and %C with mask <1,4,1,5>, giving
     ; <B[1], C[0], B[1], C[1]>, and stores the result to %res. %A is not used
     ; by the body. The expected output needs two shuffles: a shufps with a
     ; memory operand followed by a pshufd that reorders the lanes.
 197 ; CHECK-LABEL: test13:
 198 ; CHECK:       ## BB#0:
 199 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 200 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 201 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 202 ; CHECK-NEXT:    movaps (%edx), %xmm0
 203 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
 204 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 205 ; CHECK-NEXT:    movdqa %xmm0, (%eax)
 206 ; CHECK-NEXT:    retl
 207   %tmp3 = load <4 x float>* %B            ; <<4 x float>> [#uses=1]
 208   %tmp5 = load <4 x float>* %C            ; <<4 x float>> [#uses=1]
 209   %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
 210   store <4 x float> %tmp11, <4 x float>* %res
 211   ret void
 212 }
 213
 214 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
     ; Computes x+y and x-y on the vectors loaded from %x and %y, then returns
     ; the low two lanes of the sum followed by the low two lanes of the
     ; difference (mask <0,1,4,5>). The expected output combines the two
     ; halves with a single movlhps.
 215 ; CHECK-LABEL: test14:
 216 ; CHECK:       ## BB#0:
 217 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 218 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 219 ; CHECK-NEXT:    movaps (%ecx), %xmm1
 220 ; CHECK-NEXT:    movaps (%eax), %xmm2
 221 ; CHECK-NEXT:    movaps %xmm2, %xmm0
 222 ; CHECK-NEXT:    addps %xmm1, %xmm0
 223 ; CHECK-NEXT:    subps %xmm1, %xmm2
 224 ; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 225 ; CHECK-NEXT:    retl
 226   %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=2]
 227   %tmp5 = load <4 x float>* %x            ; <<4 x float>> [#uses=2]
 228   %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
 229   %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
 230   %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
 231   ret <4 x float> %tmp27
 232 }
 233
 234 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
     ; Returns the high two lanes of the vector loaded from %x followed by the
     ; high two lanes of the vector loaded from %y (mask <2,3,6,7>). The
     ; expected output combines the two halves with a single movhlps.
 235 ; CHECK-LABEL: test15:
 236 ; CHECK:       ## BB#0: ## %entry
 237 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 238 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 239 ; CHECK-NEXT:    movaps (%ecx), %xmm0
 240 ; CHECK-NEXT:    movaps (%eax), %xmm1
 241 ; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 242 ; CHECK-NEXT:    retl
 243 entry:
 244   %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=1]
 245   %tmp3 = load <4 x float>* %x            ; <<4 x float>> [#uses=1]
 246   %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
 247   ret <4 x float> %tmp4
 248 }
 249
 250 ; PR8900
 251
 252 define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
     ; Loads the <4 x double> at index 3 of %srcA (byte offset 96 in the
     ; expected output) and returns its elements 0 and 2 as a <2 x double>.
     ; %dst is not used by the body. The expected output loads the first
     ; 16-byte half with movapd and picks up element 2 with an unpcklpd whose
     ; second operand comes from memory.
 253 ; CHECK-LABEL: test16:
 254 ; CHECK:       ## BB#0:
 255 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 256 ; CHECK-NEXT:    movapd 96(%eax), %xmm0
 257 ; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 258 ; CHECK-NEXT:    retl
 259   %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
 260   %i6 = load <4 x double>* %i5, align 32
 261   %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
 262   ret <2 x double> %i7
 263 }
 264
 265 ; PR9009
 266 define fastcc void @test17() nounwind {
     ; Shuffles the constant <undef, undef, 32768, 32768> with a vector that
     ; only ever had undef inserted into it (mask <4,5,2,3>), so lanes 0-1 of
     ; the result are undefined and lanes 2-3 are 32768. The result is bitcast
     ; to <4 x float> and stored through an undef pointer. The expected output
     ; loads the whole value as one constant <u,u,32768,32768> with movaps and
     ; stores it.
 267 ; CHECK-LABEL: test17:
 268 ; CHECK:       ## BB#0: ## %entry
 269 ; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
 270 ; CHECK-NEXT:    movaps %xmm0, (%eax)
 271 ; CHECK-NEXT:    retl
 272 entry:
 273   %0 = insertelement <4 x i32> undef, i32 undef, i32 1
 274   %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 275   %2 = bitcast <4 x i32> %1 to <4 x float>
 276   store <4 x float> %2, <4 x float> * undef
 277   ret void
 278 }
 279
 280 ; PR9210
 281 define <4 x float> @f(<4 x double>) nounwind {
     ; Truncates the unnamed <4 x double> argument (%0) to <4 x float>. The
     ; expected output converts each 128-bit half with its own cvtpd2ps and
     ; joins the two results with movlhps.
 282 ; CHECK-LABEL: f:
 283 ; CHECK:       ## BB#0: ## %entry
 284 ; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
 285 ; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
 286 ; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 287 ; CHECK-NEXT:    retl
 288 entry:
 289  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
 290  ret <4 x float> %double2float.i
 291 }
 292
 293 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
     ; Keeps lane 0 of %i and takes lane 1 from element 0 of the constant
     ; <0, undef> (mask <0,2>), i.e. the upper 64 bits of the result are zero.
     ; The expected output is a single register-to-register movq.
 294 ; CHECK-LABEL: test_insert_64_zext:
 295 ; CHECK:       ## BB#0:
 296 ; CHECK-NEXT:    movq %xmm0, %xmm0
 297 ; CHECK-NEXT:    retl
 298   %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
 299   ret <2 x i64> %1
 300 }
 301
 302 define <4 x i32> @PR19721(<4 x i32> %i) {
     ; Reinterprets %i as an i128, ANDs it with -4294967296 (all bits set
     ; except the low 32, so the low 32 bits are cleared) and reinterprets the
     ; result as <4 x i32> again. The expected output pins the current
     ; lowering, which moves three of the lanes through general-purpose
     ; registers with movd and reassembles the vector with punpckldq and
     ; shufps.
 303 ; CHECK-LABEL: PR19721:
 304 ; CHECK:       ## BB#0:
 305 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,0,0]
 306 ; CHECK-NEXT:    movd %xmm1, %eax
 307 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,0,0,0]
 308 ; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 309 ; CHECK-NEXT:    movd %xmm0, %ecx
 310 ; CHECK-NEXT:    movd %xmm1, %edx
 311 ; CHECK-NEXT:    movd %edx, %xmm0
 312 ; CHECK-NEXT:    movd %ecx, %xmm1
 313 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 314 ; CHECK-NEXT:    movd %eax, %xmm0
 315 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
 316 ; CHECK-NEXT:    retl
 317   %bc = bitcast <4 x i32> %i to i128
 318   %insert = and i128 %bc, -4294967296
 319   %bc2 = bitcast i128 %insert to <4 x i32>
 320   ret <4 x i32> %bc2
 321 }
 322
 323 define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
     ; Multiplies two <4 x i32> vectors lane by lane. The expected output uses
     ; two pmuludq instructions, one on the operands as given and one on
     ; pshufd-shuffled copies of them, then merges the products with shufps
     ; and a final pshufd.
 324 ; CHECK-LABEL: test_mul:
 325 ; CHECK:       ## BB#0:
 326 ; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,0]
 327 ; CHECK-NEXT:    pmuludq %xmm1, %xmm0
 328 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,0,3,0]
 329 ; CHECK-NEXT:    pmuludq %xmm2, %xmm1
 330 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 331 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 332 ; CHECK-NEXT:    retl
 333   %m = mul <4 x i32> %x, %y
 334   ret <4 x i32> %m
 335 }