lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 //===---------------------------------------------------------------------===//
   6
   7 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
   8 other fast SSE modes.
   9
  10 //===---------------------------------------------------------------------===//
  11
  12 Think about doing i64 math in SSE regs.
  13
  14 //===---------------------------------------------------------------------===//
  15
  16 This testcase should have no SSE instructions in it, and only one load from
  17 a constant pool:
  18
  19 double %test3(bool %B) {
  20         %C = select bool %B, double 123.412, double 523.01123123
  21         ret double %C
  22 }
  23
  24 Currently, the select is being lowered, which prevents the dag combiner from
  25 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  26
  27 The pattern isel got this one right.
  28
  29 //===---------------------------------------------------------------------===//
  30
  31 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
  32 like this:
  33
  34   X += y
  35
  36 and the register allocator decides to spill X, it is cheaper to emit this as:
  37
  38 Y += [xslot]
  39 store Y -> [xslot]
  40
  41 than as:
  42
  43 tmp = [xslot]
  44 tmp += y
  45 store tmp -> [xslot]
  46
  47 ..and this uses one fewer register (so this should be done at load folding
  48 time, not at spiller time).  *Note* however that this can only be done
  49 if Y is dead.  Here's a testcase:
  50
  51 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
  52 implementation   ; Functions:
  53 declare void %printf(int, ...)
  54 void %main() {
  55 build_tree.exit:
  56         br label %no_exit.i7
  57 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
  58         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
  59         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
  60         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
  61         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
  62         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
  63 Compute_Tree.exit23:            ; preds = %no_exit.i7
  64         tail call void (int, ...)* %printf( int 0 )
  65         store double %tmp.34.i18, double* null
  66         ret void
  67 }
  68
  69 We currently emit:
  70
  71 .BBmain_1:
  72         xorpd %XMM1, %XMM1
  73         addsd %XMM0, %XMM1
  74 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
  75 ***     addsd %XMM2, %XMM1
  76 ***     movsd QWORD PTR [%ESP + 8], %XMM2
  77         jmp .BBmain_1   # no_exit.i7
  78
  79 This is a bugpoint reduced testcase, which is why the testcase doesn't make
  80 much sense (e.g. it's an infinite loop). :)
  81
  82 //===---------------------------------------------------------------------===//
  83
  84 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  85 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  86
  87 double %X(double %Y, double %Z, double %A, double %B) {
  88         %C = setlt double %A, %B
  89         %z = add double %Z, 0.0    ;; select operand is not a load
  90         %D = select bool %C, double %Y, double %z
  91         ret double %D
  92 }
  93
  94 We currently emit:
  95
  96 _X:
  97         subl $12, %esp
  98         xorpd %xmm0, %xmm0
  99         addsd 24(%esp), %xmm0
 100         movsd 32(%esp), %xmm1
 101         movsd 16(%esp), %xmm2
 102         ucomisd 40(%esp), %xmm1
 103         jb LBB_X_2
 104 LBB_X_1:
 105         movsd %xmm0, %xmm2
 106 LBB_X_2:
 107         movsd %xmm2, (%esp)
 108         fldl (%esp)
 109         addl $12, %esp
 110         ret
 111
 112 //===---------------------------------------------------------------------===//
 113
 114 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 115 registers. The choice may depend on subtarget information. We should do some
 116 more experiments on different x86 machines.
 117
 118 //===---------------------------------------------------------------------===//
 119
 120 Currently the x86 codegen isn't very good at mixing SSE and FPStack
 121 code:
 122
 123 unsigned int foo(double x) { return x; }
 124
 125 foo:
 126         subl $20, %esp
 127         movsd 24(%esp), %xmm0
 128         movsd %xmm0, 8(%esp)
 129         fldl 8(%esp)
 130         fisttpll (%esp)
 131         movl (%esp), %eax
 132         addl $20, %esp
 133         ret
 134
 135 This will be solved when we go to a dynamic programming based isel.
 136
 137 //===---------------------------------------------------------------------===//
 138
 139 Should generate min/max for stuff like:
 140
 141 void minf(float a, float b, float *X) {
 142   *X = a <= b ? a : b;
 143 }
 144
 145 Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
 146 and ISD::FMAX node types?
 147
 148 //===---------------------------------------------------------------------===//
 149
 150 The first BB of this code:
 151
 152 declare bool %foo()
 153 int %bar() {
 154         %V = call bool %foo()
 155         br bool %V, label %T, label %F
 156 T:
 157         ret int 1
 158 F:
 159         call bool %foo()
 160         ret int 12
 161 }
 162
 163 compiles to:
 164
 165 _bar:
 166         subl $12, %esp
 167         call L_foo$stub
 168         xorb $1, %al
 169         testb %al, %al
 170         jne LBB_bar_2   # F
 171
 172 It would be better to emit "cmp %al, 1" than a xor and test.
 173
 174 //===---------------------------------------------------------------------===//
 175
 176 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 177 feasible.
 178
 179 //===---------------------------------------------------------------------===//
 180
 181 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
 182 the reg-reg copy in this example:
 183
 184 float foo(int *x, float *y, unsigned c) {
 185   float res = 0.0;
 186   unsigned i;
 187   for (i = 0; i < c; i++) {
 188     float xx = (float)x[i];
 189     xx = xx * y[i];
 190     xx += res;
 191     res = xx;
 192   }
 193   return res;
 194 }
 195
 196 LBB_foo_3:      # no_exit
 197         cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
 198         mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
 199         addss %XMM0, %XMM1
 200         inc %ESI
 201         cmp %ESI, %ECX
 202 ****    movaps %XMM1, %XMM0
 203         jb LBB_foo_3    # no_exit
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 Codegen:
 208   if (copysign(1.0, x) == copysign(1.0, y))
 209 into:
 210   if (x^y & mask)
 211 when using SSE.
 212
 213 //===---------------------------------------------------------------------===//
 214
 215 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 216 of a v4sf value.
 217
 218 //===---------------------------------------------------------------------===//
 219
 220 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 221 Perhaps use pxor / xorp* to clear a XMM register first?
 222
 223 //===---------------------------------------------------------------------===//
 224
 225 Better codegen for:
 226
 227 void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
 228 void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
 229
 230 For the latter we generate:
 231
 232 _f:
 233         pxor %xmm0, %xmm0
 234         movss 8(%esp), %xmm1
 235         movaps %xmm0, %xmm2
 236         unpcklps %xmm1, %xmm2
 237         movss 4(%esp), %xmm1
 238         unpcklps %xmm0, %xmm1
 239         unpcklps %xmm2, %xmm1
 240         movl 12(%esp), %eax
 241         movaps %xmm1, (%eax)
 242         ret
 243
 244 This seems like it should use shufps, one for each of a & b.
 245
 246 //===---------------------------------------------------------------------===//
 247
 248 How to decide when to use the "floating point version" of logical ops? Here are
 249 some code fragments:
 250
 251         movaps LCPI5_5, %xmm2
 252         divps %xmm1, %xmm2
 253         mulps %xmm2, %xmm3
 254         mulps 8656(%ecx), %xmm3
 255         addps 8672(%ecx), %xmm3
 256         andps LCPI5_6, %xmm2
 257         andps LCPI5_1, %xmm3
 258         por %xmm2, %xmm3
 259         movdqa %xmm3, (%edi)
 260
 261         movaps LCPI5_5, %xmm1
 262         divps %xmm0, %xmm1
 263         mulps %xmm1, %xmm3
 264         mulps 8656(%ecx), %xmm3
 265         addps 8672(%ecx), %xmm3
 266         andps LCPI5_6, %xmm1
 267         andps LCPI5_1, %xmm3
 268         orps %xmm1, %xmm3
 269         movaps %xmm3, 112(%esp)
 270         movaps %xmm3, (%ebx)
 271
 272 Due to some minor source change, the latter case ended up using orps and movaps
 273 instead of por and movdqa. Does it matter?
 274
 275 //===---------------------------------------------------------------------===//
 276
 277 Use movddup to splat a v2f64 directly from a memory source. e.g.
 278
 279 #include <emmintrin.h>
 280
 281 void test(__m128d *r, double A) {
 282   *r = _mm_set1_pd(A);
 283 }
 284
 285 llc:
 286
 287 _test:
 288         movsd 8(%esp), %xmm0
 289         unpcklpd %xmm0, %xmm0
 290         movl 4(%esp), %eax
 291         movapd %xmm0, (%eax)
 292         ret
 293
 294 icc:
 295
 296 _test:
 297         movl 4(%esp), %eax
 298         movddup 8(%esp), %xmm0
 299         movapd %xmm0, (%eax)
 300         ret
 301
 302 //===---------------------------------------------------------------------===//
 303
 304 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 305 to choose between movaps, movapd, and movdqa based on types of source and
 306 destination?
 307
 308 How about andps, andpd, and pand? Do we really care about the type of the packed
 309 elements? If not, why not always use the "ps" variants, which are likely to be
 310 shorter?
 311
 312 //===---------------------------------------------------------------------===//
 313
 314 We are emitting bad code for this:
 315
 316 float %test(float* %V, int %I, int %D, float %V) {
 317 entry:
 318         %tmp = seteq int %D, 0
 319         br bool %tmp, label %cond_true, label %cond_false23
 320
 321 cond_true:
 322         %tmp3 = getelementptr float* %V, int %I
 323         %tmp = load float* %tmp3
 324         %tmp5 = setgt float %tmp, %V
 325         %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
 326         %tmp7 = or bool %tmp5, %tmp6
 327         br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
 328
 329 cond_next:
 330         %tmp10 = add int %I, 1
 331         %tmp12 = getelementptr float* %V, int %tmp10
 332         %tmp13 = load float* %tmp12
 333         %tmp15 = setle float %tmp13, %V
 334         %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
 335         %tmp17 = or bool %tmp15, %tmp16
 336         %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
 337         ret float %retval
 338
 339 cond_false23:
 340         %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
 341         ret float %tmp28
 342
 343 UnifiedReturnBlock:             ; preds = %cond_true
 344         ret float 0.000000e+00
 345 }
 346
 347 declare bool %llvm.isunordered.f32(float, float)
 348
 349 declare float %foo(float*, int, int, float)
 350
 351
 352 It exposes a known load folding problem:
 353
 354         movss (%edx,%ecx,4), %xmm1
 355         ucomiss %xmm1, %xmm0
 356
 357 As well as this:
 358
 359 LBB_test_2:     # cond_next
 360         movss LCPI1_0, %xmm2
 361         pxor %xmm3, %xmm3
 362         ucomiss %xmm0, %xmm1
 363         jbe LBB_test_6  # cond_next
 364 LBB_test_5:     # cond_next
 365         movaps %xmm2, %xmm3
 366 LBB_test_6:     # cond_next
 367         movss %xmm3, 40(%esp)
 368         flds 40(%esp)
 369         addl $44, %esp
 370         ret
 371
 372 Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
 373 three moves (movss, movaps, movss).
 374
 375 //===---------------------------------------------------------------------===//
 376
 377 External test Nurbs exposed some problems. Look for
 378 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 379 emits:
 380
 381         movaps    (%edx), %xmm2                                 #59.21
 382         movaps    (%edx), %xmm5                                 #60.21
 383         movaps    (%edx), %xmm4                                 #61.21
 384         movaps    (%edx), %xmm3                                 #62.21
 385         movl      40(%ecx), %ebp                                #69.49
 386         shufps    $0, %xmm2, %xmm5                              #60.21
 387         movl      100(%esp), %ebx                               #69.20
 388         movl      (%ebx), %edi                                  #69.20
 389         imull     %ebp, %edi                                    #69.49
 390         addl      (%eax), %edi                                  #70.33
 391         shufps    $85, %xmm2, %xmm4                             #61.21
 392         shufps    $170, %xmm2, %xmm3                            #62.21
 393         shufps    $255, %xmm2, %xmm2                            #63.21
 394         lea       (%ebp,%ebp,2), %ebx                           #69.49
 395         negl      %ebx                                          #69.49
 396         lea       -3(%edi,%ebx), %ebx                           #70.33
 397         shll      $4, %ebx                                      #68.37
 398         addl      32(%ecx), %ebx                                #68.37
 399         testb     $15, %bl                                      #91.13
 400         jne       L_B1.24       # Prob 5%                       #91.13
 401
 402 This is the llvm code after instruction scheduling:
 403
 404 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 405         %reg1078 = MOV32ri -3
 406         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 407         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 408         %reg1080 = IMUL32rr %reg1079, %reg1037
 409         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 410         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 411         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 412         %reg1082 = SHL32ri %reg1038, 4
 413         %reg1039 = ADD32rr %reg1036, %reg1082
 414         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 415         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 416         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 417         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 418         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 419         %reg1040 = MOV32rr %reg1039
 420         %reg1084 = AND32ri8 %reg1039, 15
 421         CMP32ri8 %reg1084, 0
 422         JE mbb<cond_next204,0xa914d30>
 423
 424 Still ok. After register allocation:
 425
 426 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 427         %EAX = MOV32ri -3
 428         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 429         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 430         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 431         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 432         IMUL32rr %EAX<def&use>, %EDX
 433         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 434         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 435         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 436         %EAX = LEA32r %ESI, 1, %EAX, -3
 437         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 438         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 439         %EDI = MOV32rr %EAX
 440         SHL32ri %EDI<def&use>, 4
 441         ADD32rr %EDI<def&use>, %ESI
 442         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 443         %XMM1 = MOVAPSrr %XMM0
 444         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 445         %XMM2 = MOVAPSrr %XMM0
 446         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 447         %XMM3 = MOVAPSrr %XMM0
 448         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 449         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 450         %EBX = MOV32rr %EDI
 451         AND32ri8 %EBX<def&use>, 15
 452         CMP32ri8 %EBX, 0
 453         JE mbb<cond_next204,0xa914d30>
 454
 455 This looks really bad. The problem is that shufps is a destructive opcode. Since
 456 the same register appears as operand two in more than one shufps op, this
 457 results in a number of copies. Note icc also suffers from the same problem.
 458 Either the instruction selector should select pshufd, or the register allocator
 459 should perform the two-address to three-address transformation.
 460
 461 It also exposes some other problems. See MOV32ri -3 and the spills.
 462
 463 //===---------------------------------------------------------------------===//
 464
 465 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 466
 467 LLVM is producing bad code.
 468
 469 LBB_main_4:     # cond_true44
 470         addps %xmm1, %xmm2
 471         subps %xmm3, %xmm2
 472         movaps (%ecx), %xmm4
 473         movaps %xmm2, %xmm1
 474         addps %xmm4, %xmm1
 475         addl $16, %ecx
 476         incl %edx
 477         cmpl $262144, %edx
 478         movaps %xmm3, %xmm2
 479         movaps %xmm4, %xmm3
 480         jne LBB_main_4  # cond_true44
 481
 482 There are two problems. 1) No need for two loop induction variables. We can
 483 compare against 262144 * 16. 2) Known register coalescer issue. We should
 484 be able to eliminate one of the movaps:
 485
 486         addps %xmm2, %xmm1    <=== Commute!
 487         subps %xmm3, %xmm1
 488         movaps (%ecx), %xmm4
 489         movaps %xmm1, %xmm1   <=== Eliminate!
 490         addps %xmm4, %xmm1
 491         addl $16, %ecx
 492         incl %edx
 493         cmpl $262144, %edx
 494         movaps %xmm3, %xmm2
 495         movaps %xmm4, %xmm3
 496         jne LBB_main_4  # cond_true44
 497
 498 //===---------------------------------------------------------------------===//
 499
 500 Consider:
 501
 502 __m128 test(float a) {
 503   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 504 }
 505
 506 This compiles into:
 507
 508 movss 4(%esp), %xmm1
 509 mulss %xmm1, %xmm1
 510 xorps %xmm0, %xmm0
 511 movss %xmm1, %xmm0
 512 ret
 513
 514 Because the movss load zeros the top 3 elements and mulss doesn't modify them,
 515 the top elements of xmm1 are already zero'd.  We could compile this to:
 516
 517 movss 4(%esp), %xmm0
 518 mulss %xmm0, %xmm0
 519 ret
 520
 521 //===---------------------------------------------------------------------===//
 522
 523 Here's a sick and twisted idea.  Consider code like this:
 524
 525 __m128 test(__m128 a) {
 526   float b = *(float*)&A;
 527   ...
 528   return _mm_set_ps(0.0, 0.0, 0.0, b);
 529 }
 530
 531 This might compile to this code:
 532
 533 movaps c(%esp), %xmm1
 534 xorps %xmm0, %xmm0
 535 movss %xmm1, %xmm0
 536 ret
 537
 538 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 539 this code:
 540
 541 movaps c(%esp), %xmm1
 542 movaps %xmm1, c2(%esp)
 543 ...
 544
 545 xorps %xmm0, %xmm0
 546 movaps c2(%esp), %xmm1
 547 movss %xmm1, %xmm0
 548 ret
 549
 550 However, since the reload is only used by these instructions, we could
 551 "fold" it into the uses, producing something like this:
 552
 553 movaps c(%esp), %xmm1
 554 movaps %xmm1, c2(%esp)
 555 ...
 556
 557 movss c2(%esp), %xmm0
 558 ret
 559
 560 ... saving two instructions.
 561
 562 The basic idea is that a reload from a spill slot can, if only one 4-byte
 563 chunk is used, bring in 3 zeros and the one element instead of 4 elements.
 564 This can be used to simplify a variety of shuffle operations, where the
 565 elements are fixed zeros.
 566
 567 //===---------------------------------------------------------------------===//
 568
 569 For this:
 570
 571 #include <emmintrin.h>
 572 void test(__m128d *r, __m128d *A, double B) {
 573   *r = _mm_loadl_pd(*A, &B);
 574 }
 575
 576 We generate:
 577
 578         subl $12, %esp
 579         movsd 24(%esp), %xmm0
 580         movsd %xmm0, (%esp)
 581         movl 20(%esp), %eax
 582         movapd (%eax), %xmm0
 583         movlpd (%esp), %xmm0
 584         movl 16(%esp), %eax
 585         movapd %xmm0, (%eax)
 586         addl $12, %esp
 587         ret
 588
 589 icc generates:
 590
 591         movl      4(%esp), %edx                                 #3.6
 592         movl      8(%esp), %eax                                 #3.6
 593         movapd    (%eax), %xmm0                                 #4.22
 594         movlpd    12(%esp), %xmm0                               #4.8
 595         movapd    %xmm0, (%edx)                                 #4.3
 596         ret                                                     #5.1
 597
 598 So icc is smart enough to know that B is in memory so it doesn't load it and
 599 store it back to stack.
 600
 601 //===---------------------------------------------------------------------===//
 602
 603 __m128d test1( __m128d A, __m128d B) {
 604   return _mm_shuffle_pd(A, B, 0x3);
 605 }
 606
 607 compiles to
 608
 609 shufpd $3, %xmm1, %xmm0
 610
 611 Perhaps it's better to use unpckhpd instead?
 612
 613 unpckhpd %xmm1, %xmm0
 614
 615 Don't know if unpckhpd is faster. But it is shorter.
 616
 617 //===---------------------------------------------------------------------===//
 618
 619 This code generates ugly code, probably due to costs being off or something:
 620
 621 void %test(float* %P, <4 x float>* %P2 ) {
 622         %xFloat0.688 = load float* %P
 623         %loadVector37.712 = load <4 x float>* %P2
 624         %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
 625         store <4 x float> %inFloat3.713, <4 x float>* %P2
 626         ret void
 627 }
 628
 629 Generates:
 630
 631 _test:
 632         pxor %xmm0, %xmm0
 633         movd %xmm0, %eax        ;; EAX = 0!
 634         movl 8(%esp), %ecx
 635         movaps (%ecx), %xmm0
 636         pinsrw $6, %eax, %xmm0
 637         shrl $16, %eax          ;; EAX = 0 again!
 638         pinsrw $7, %eax, %xmm0
 639         movaps %xmm0, (%ecx)
 640         ret
 641
 642 It would be better to generate:
 643
 644 _test:
 645         movl 8(%esp), %ecx
 646         movaps (%ecx), %xmm0
 647         xor %eax, %eax
 648         pinsrw $6, %eax, %xmm0
 649         pinsrw $7, %eax, %xmm0
 650         movaps %xmm0, (%ecx)
 651         ret
 652
 653 or use pxor (to make a zero vector) and shuffle (to insert it).
 654
 655 //===---------------------------------------------------------------------===//
 656
 657 Some useful information in the Apple Altivec / SSE Migration Guide:
 658
 659 http://developer.apple.com/documentation/Performance/Conceptual/
 660 Accelerate_sse_migration/index.html
 661
 662 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 663
 664 //===---------------------------------------------------------------------===//
 665
 666 Add hooks to commute some CMPP operations.