Get closer to fully working scalar FP in SSE regs. This gets singlesource

[oota-llvm.git] / lib / Target / X86 / X86InstrInfo.td
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td

index 3d723fa75b42b03da1a37874c112dab17f7c8754..53a82ba3802d23b0a58c9d764737fce334d238af 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -20,6 +20,9 @@ class X86MemOperand<ValueType Ty> : Operand<Ty> {
    let NumMIOperands = 4;
    let PrintMethod = "printMemoryOperand";
  }
+def SSECC : Operand<i8> {
+  let PrintMethod = "printSSECC";
+}
  
  def i8mem  : X86MemOperand<i8>;
  def i16mem : X86MemOperand<i16>;
@@ -119,6 +122,8 @@ class DC     { bits<4> Prefix = 7; }
  class DD     { bits<4> Prefix = 8; }
  class DE     { bits<4> Prefix = 9; }
  class DF     { bits<4> Prefix = 10; }
+class XD     { bits<4> Prefix = 11; }
+class XS     { bits<4> Prefix = 12; }
  
  
  //===----------------------------------------------------------------------===//
@@ -152,9 +157,11 @@ let isTerminator = 1 in
  //  Control Flow Instructions...
  //
  
-// Return instruction...
+// Return instructions.
  let isTerminator = 1, isReturn = 1, isBarrier = 1 in
    def RET : I<0xC3, RawFrm, (ops), "ret">;
+let isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def RETI : Ii16<0xC2, RawFrm, (ops i16imm:$amt), "ret $amt">;
  
  // All branches are RawFrm, Void, Branch, and Terminators
  let isBranch = 1, isTerminator = 1 in
@@ -170,6 +177,8 @@ def JBE : IBr<0x86, (ops i32imm:$dst), "jbe $dst">, TB;
  def JA  : IBr<0x87, (ops i32imm:$dst), "ja $dst">, TB;
  def JS  : IBr<0x88, (ops i32imm:$dst), "js $dst">, TB;
  def JNS : IBr<0x89, (ops i32imm:$dst), "jns $dst">, TB;
+def JP  : IBr<0x8A, (ops i32imm:$dst), "jp $dst">, TB;
+def JNP : IBr<0x8B, (ops i32imm:$dst), "jnp $dst">, TB;
  def JL  : IBr<0x8C, (ops i32imm:$dst), "jl $dst">, TB;
  def JGE : IBr<0x8D, (ops i32imm:$dst), "jge $dst">, TB;
  def JLE : IBr<0x8E, (ops i32imm:$dst), "jle $dst">, TB;
@@ -181,13 +190,30 @@ def JG  : IBr<0x8F, (ops i32imm:$dst), "jg $dst">, TB;
  //
  let isCall = 1 in
    // All calls clobber the non-callee saved registers...
-  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0] in {
+  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7] in {
      def CALLpcrel32 : I<0xE8, RawFrm, (ops calltarget:$dst), "call $dst">;
      def CALL32r     : I<0xFF, MRM2r, (ops R32:$dst), "call {*}$dst">;
      def CALL32m     : I<0xFF, MRM2m, (ops i32mem:$dst), "call {*}$dst">;
    }
  
-       
+// Tail call stuff.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPd : IBr<0xE9, (ops calltarget:$dst), "jmp $dst  # TAIL CALL">;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPr : I<0xFF, MRM4r, (ops R32:$dst), "jmp {*}$dst  # TAIL CALL">;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (ops i32mem:$dst), "jmp {*}$dst  # TAIL CALL">;
+
+// ADJSTACKPTRri - This is a standard ADD32ri instruction, identical in every
+// way, except that it is marked as being a terminator.  This causes the epilog
+// inserter to insert reloads of callee saved registers BEFORE this.  We need
+// this until we have a more accurate way of tracking where the stack pointer is
+// within a function.
+let isTerminator = 1, isTwoAddress = 1 in
+  def ADJSTACKPTRri : Ii32<0x81, MRM0r, (ops R32:$dst, R32:$src1, i32imm:$src2),
+                           "add{l} {$src2, $dst|$dst, $src2}">;
+
  //===----------------------------------------------------------------------===//
  //  Miscellaneous Instructions...
  //
@@ -256,32 +282,32 @@ def REP_STOSD : I<0xAB, RawFrm, (ops), "{rep;stosl|rep stosd}">,
  //  Input/Output Instructions...
  //
  def IN8rr  : I<0xEC, RawFrm, (ops),
-               "in{b} {%DX, %AL|AL, DX}">,  Imp<[DX], [AL]>;
+               "in{b} {%dx, %al|%AL, %DX}">,  Imp<[DX], [AL]>;
  def IN16rr : I<0xED, RawFrm, (ops),
-               "in{w} {%DX, %AX|AX, DX}">,  Imp<[DX], [AX]>, OpSize;
+               "in{w} {%dx, %ax|%AX, %DX}">,  Imp<[DX], [AX]>, OpSize;
  def IN32rr : I<0xED, RawFrm, (ops),
-               "in{l} {%DX, %EAX|EAX, DX}">, Imp<[DX],[EAX]>;
+               "in{l} {%dx, %eax|%EAX, %DX}">, Imp<[DX],[EAX]>;
  
  def IN8ri  : Ii16<0xE4, RawFrm, (ops i16imm:$port),
-                  "in{b} {$port, %AL|AL, $port}">,  Imp<[], [AL]>;
+                  "in{b} {$port, %al|%AL, $port}">,  Imp<[], [AL]>;
  def IN16ri : Ii16<0xE5, RawFrm, (ops i16imm:$port),
-                  "in{w} {$port, %AX|AX, $port}">,  Imp<[], [AX]>, OpSize;
+                  "in{w} {$port, %ax|%AX, $port}">,  Imp<[], [AX]>, OpSize;
  def IN32ri : Ii16<0xE5, RawFrm, (ops i16imm:$port),
-                  "in{l} {$port, %EAX|EAX, $port}">, Imp<[],[EAX]>;
+                  "in{l} {$port, %eax|%EAX, $port}">, Imp<[],[EAX]>;
  
  def OUT8rr  : I<0xEE, RawFrm, (ops),
-                "out{b} {%AL, %DX|DX, AL}">,  Imp<[DX,  AL], []>;
+                "out{b} {%al, %dx|%DX, %AL}">,  Imp<[DX,  AL], []>;
  def OUT16rr : I<0xEF, RawFrm, (ops),
-                "out{w} {%AX, %DX|DX, AX}">,  Imp<[DX,  AX], []>, OpSize;
+                "out{w} {%ax, %dx|%DX, %AX}">,  Imp<[DX,  AX], []>, OpSize;
  def OUT32rr : I<0xEF, RawFrm, (ops),
-                "out{l} {%EAX, %DX|DX, EAX}">, Imp<[DX, EAX], []>;
+                "out{l} {%eax, %dx|%DX, %EAX}">, Imp<[DX, EAX], []>;
  
  def OUT8ir  : Ii16<0xE6, RawFrm, (ops i16imm:$port),
-                   "out{b} {%AL, $port|$port, AL}">, Imp<[AL], []>;
+                   "out{b} {%al, $port|$port, %AL}">, Imp<[AL], []>;
  def OUT16ir : Ii16<0xE7, RawFrm, (ops i16imm:$port),
-                   "out{w} {%AX, $port|$port, AX}">, Imp<[AX], []>, OpSize;
+                   "out{w} {%ax, $port|$port, %AX}">, Imp<[AX], []>, OpSize;
  def OUT32ir : Ii16<0xE7, RawFrm, (ops i16imm:$port),
-                   "out{l} {%EAX, $port|$port, %EAX}">, Imp<[EAX], []>;
+                   "out{l} {%eax, $port|$port, %EAX}">, Imp<[EAX], []>;
  
  //===----------------------------------------------------------------------===//
  //  Move Instructions...
@@ -318,7 +344,7 @@ def MOV16mr : I<0x89, MRMDestMem, (ops i16mem:$dst, R16:$src),
                  "mov{w} {$src, $dst|$dst, $src}">, OpSize;
  def MOV32mr : I<0x89, MRMDestMem, (ops i32mem:$dst, R32:$src),
                  "mov{l} {$src, $dst|$dst, $src}">;
-
+                
  //===----------------------------------------------------------------------===//
  //  Fixed-Register Multiplication and Division Instructions...
  //
@@ -337,6 +363,19 @@ def MUL16m : I<0xF7, MRM4m, (ops i16mem:$src),
  def MUL32m : I<0xF7, MRM4m, (ops i32mem:$src),
                 "mul{l} $src">, Imp<[EAX],[EAX,EDX]>;   // EAX,EDX = EAX*[mem32]
  
+def IMUL8r  : I<0xF6, MRM5r, (ops R8:$src), "imul{b} $src">,
+              Imp<[AL],[AX]>;               // AL,AH = AL*R8
+def IMUL16r : I<0xF7, MRM5r, (ops R16:$src), "imul{w} $src">,
+              Imp<[AX],[AX,DX]>, OpSize;    // AX,DX = AX*R16
+def IMUL32r : I<0xF7, MRM5r, (ops R32:$src), "imul{l} $src">,
+              Imp<[EAX],[EAX,EDX]>;         // EAX,EDX = EAX*R32
+def IMUL8m  : I<0xF6, MRM5m, (ops i8mem :$src),
+                "imul{b} $src">, Imp<[AL],[AX]>;           // AL,AH = AL*[mem8]
+def IMUL16m : I<0xF7, MRM5m, (ops i16mem:$src),
+                "imul{w} $src">, Imp<[AX],[AX,DX]>, OpSize;// AX,DX = AX*[mem16]
+def IMUL32m : I<0xF7, MRM5m, (ops i32mem:$src),
+                "imul{l} $src">, Imp<[EAX],[EAX,EDX]>;  // EAX,EDX = EAX*[mem32]
+
  // unsigned division/remainder
  def DIV8r  : I<0xF6, MRM6r, (ops R8:$src),          // AX/r8 = AL,AH
                 "div{b} $src">, Imp<[AX],[AX]>;
@@ -484,6 +523,34 @@ def CMOVNS32rm: I<0x49, MRMSrcMem,       // if !signed, R32 = [mem32]
                    (ops R32:$dst, R32:$src1, i32mem:$src2),
                    "cmovns {$src2, $dst|$dst, $src2}">, TB;
  
+def CMOVP16rr : I<0x4A, MRMSrcReg,       // if parity, R16 = R16
+                  (ops R16:$dst, R16:$src1, R16:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}">, TB, OpSize;
+def CMOVP16rm : I<0x4A, MRMSrcMem,       // if parity, R16 = [mem16]
+                  (ops R16:$dst, R16:$src1, i16mem:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}">, TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg,       // if parity, R32 = R32
+                  (ops R32:$dst, R32:$src1, R32:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}">, TB;
+def CMOVP32rm : I<0x4A, MRMSrcMem,       // if parity, R32 = [mem32]
+                  (ops R32:$dst, R32:$src1, i32mem:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}">, TB;
+
+ 
+def CMOVNP16rr : I<0x4B, MRMSrcReg,       // if !parity, R16 = R16
+                  (ops R16:$dst, R16:$src1, R16:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}">, TB, OpSize;
+def CMOVNP16rm : I<0x4B, MRMSrcMem,       // if !parity, R16 = [mem16]
+                  (ops R16:$dst, R16:$src1, i16mem:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}">, TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg,       // if !parity, R32 = R32
+                  (ops R32:$dst, R32:$src1, R32:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}">, TB;
+def CMOVNP32rm : I<0x4B, MRMSrcMem,       // if !parity, R32 = [mem32]
+                  (ops R32:$dst, R32:$src1, i32mem:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}">, TB;
+
+
  def CMOVL16rr : I<0x4C, MRMSrcReg,       // if <s, R16 = R16
                    (ops R16:$dst, R16:$src1, R16:$src2),
                    "cmovl {$src2, $dst|$dst, $src2}">, TB, OpSize;
@@ -556,8 +623,10 @@ let isTwoAddress = 0 in {
  }
  
  def INC8r  : I<0xFE, MRM0r, (ops R8 :$dst, R8 :$src), "inc{b} $dst">;
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
  def INC16r : I<0xFF, MRM0r, (ops R16:$dst, R16:$src), "inc{w} $dst">, OpSize;
  def INC32r : I<0xFF, MRM0r, (ops R32:$dst, R32:$src), "inc{l} $dst">;
+}
  let isTwoAddress = 0 in {
    def INC8m  : I<0xFE, MRM0m, (ops i8mem :$dst), "inc{b} $dst">;
    def INC16m : I<0xFF, MRM0m, (ops i16mem:$dst), "inc{w} $dst">, OpSize;
@@ -565,8 +634,10 @@ let isTwoAddress = 0 in {
  }
  
  def DEC8r  : I<0xFE, MRM1r, (ops R8 :$dst, R8 :$src), "dec{b} $dst">;
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
  def DEC16r : I<0xFF, MRM1r, (ops R16:$dst, R16:$src), "dec{w} $dst">, OpSize;
  def DEC32r : I<0xFF, MRM1r, (ops R32:$dst, R32:$src), "dec{l} $dst">;
+}
  
  let isTwoAddress = 0 in {
    def DEC8m  : I<0xFE, MRM1m, (ops i8mem :$dst), "dec{b} $dst">;
@@ -575,6 +646,7 @@ let isTwoAddress = 0 in {
  }
  
  // Logical operators...
+let isCommutable = 1 in {   // X = AND Y, Z   --> X = AND Z, Y
  def AND8rr   : I<0x20, MRMDestReg,
                  (ops R8 :$dst, R8 :$src1, R8 :$src2),
                  "and{b} {$src2, $dst|$dst, $src2}">;
@@ -584,6 +656,7 @@ def AND16rr  : I<0x21, MRMDestReg,
  def AND32rr  : I<0x21, MRMDestReg, 
                   (ops R32:$dst, R32:$src1, R32:$src2),
                   "and{l} {$src2, $dst|$dst, $src2}">;
+}
  
  def AND8rm   : I<0x22, MRMSrcMem, 
                   (ops R8 :$dst, R8 :$src1, i8mem :$src2),
@@ -639,12 +712,14 @@ let isTwoAddress = 0 in {
  }
  
  
+let isCommutable = 1 in {   // X = OR Y, Z   --> X = OR Z, Y
  def OR8rr    : I<0x08, MRMDestReg, (ops R8 :$dst, R8 :$src1, R8 :$src2),
                   "or{b} {$src2, $dst|$dst, $src2}">;
  def OR16rr   : I<0x09, MRMDestReg, (ops R16:$dst, R16:$src1, R16:$src2),
                   "or{w} {$src2, $dst|$dst, $src2}">, OpSize;
  def OR32rr   : I<0x09, MRMDestReg, (ops R32:$dst, R32:$src1, R32:$src2),
                   "or{l} {$src2, $dst|$dst, $src2}">;
+}
  def OR8rm    : I<0x0A, MRMSrcMem , (ops R8 :$dst, R8 :$src1, i8mem :$src2),
                   "or{b} {$src2, $dst|$dst, $src2}">;
  def OR16rm   : I<0x0B, MRMSrcMem , (ops R16:$dst, R16:$src1, i16mem:$src2),
@@ -683,6 +758,7 @@ let isTwoAddress = 0 in {
  }
  
  
+let isCommutable = 1 in {   // X = XOR Y, Z   --> X = XOR Z, Y
  def XOR8rr   : I<0x30, MRMDestReg,
                   (ops R8 :$dst, R8 :$src1, R8 :$src2),
                   "xor{b} {$src2, $dst|$dst, $src2}">;
@@ -692,6 +768,8 @@ def XOR16rr  : I<0x31, MRMDestReg,
  def XOR32rr  : I<0x31, MRMDestReg, 
                   (ops R32:$dst, R32:$src1, R32:$src2), 
                   "xor{l} {$src2, $dst|$dst, $src2}">;
+}
+
  def XOR8rm   : I<0x32, MRMSrcMem , 
                   (ops R8 :$dst, R8:$src1, i8mem :$src2), 
                   "xor{b} {$src2, $dst|$dst, $src2}">;
@@ -752,12 +830,15 @@ def SHL16rCL : I<0xD3, MRM4r, (ops R16:$dst, R16:$src),
                   "shl{w} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>, OpSize;
  def SHL32rCL : I<0xD3, MRM4r, (ops R32:$dst, R32:$src),
                   "shl{l} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+
  def SHL8ri   : Ii8<0xC0, MRM4r, (ops R8 :$dst, R8 :$src1, i8imm:$src2),
                     "shl{b} {$src2, $dst|$dst, $src2}">;
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
  def SHL16ri  : Ii8<0xC1, MRM4r, (ops R16:$dst, R16:$src1, i8imm:$src2),
                     "shl{w} {$src2, $dst|$dst, $src2}">, OpSize;
  def SHL32ri  : Ii8<0xC1, MRM4r, (ops R32:$dst, R32:$src1, i8imm:$src2),
                     "shl{l} {$src2, $dst|$dst, $src2}">;
+}
  
  let isTwoAddress = 0 in {
    def SHL8mCL  : I<0xD2, MRM4m, (ops i8mem :$dst),
@@ -831,18 +912,98 @@ let isTwoAddress = 0 in {
                       "sar{l} {$src, $dst|$dst, $src}">;
  }
  
+// Rotate instructions
+// FIXME: provide shorter instructions when imm8 == 1
+def ROL8rCL  : I<0xD2, MRM0r, (ops R8 :$dst, R8 :$src),
+                 "rol{b} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+def ROL16rCL : I<0xD3, MRM0r, (ops R16:$dst, R16:$src),
+                 "rol{w} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (ops R32:$dst, R32:$src),
+                 "rol{l} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+
+def ROL8ri   : Ii8<0xC0, MRM0r, (ops R8 :$dst, R8 :$src1, i8imm:$src2),
+                   "rol{b} {$src2, $dst|$dst, $src2}">;
+def ROL16ri  : Ii8<0xC1, MRM0r, (ops R16:$dst, R16:$src1, i8imm:$src2),
+                   "rol{w} {$src2, $dst|$dst, $src2}">, OpSize;
+def ROL32ri  : Ii8<0xC1, MRM0r, (ops R32:$dst, R32:$src1, i8imm:$src2),
+                   "rol{l} {$src2, $dst|$dst, $src2}">;
+
+let isTwoAddress = 0 in {
+  def ROL8mCL  : I<0xD2, MRM0m, (ops i8mem :$dst),
+                   "rol{b} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+  def ROL16mCL : I<0xD3, MRM0m, (ops i16mem:$dst),
+                   "rol{w} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>, OpSize;
+  def ROL32mCL : I<0xD3, MRM0m, (ops i32mem:$dst),
+                   "rol{l} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+  def ROL8mi   : Ii8<0xC0, MRM0m, (ops i8mem :$dst, i8imm:$src),
+                     "rol{b} {$src, $dst|$dst, $src}">;
+  def ROL16mi  : Ii8<0xC1, MRM0m, (ops i16mem:$dst, i8imm:$src),
+                     "rol{w} {$src, $dst|$dst, $src}">, OpSize;
+  def ROL32mi  : Ii8<0xC1, MRM0m, (ops i32mem:$dst, i8imm:$src),
+                     "rol{l} {$src, $dst|$dst, $src}">;
+}
+
+def ROR8rCL  : I<0xD2, MRM1r, (ops R8 :$dst, R8 :$src),
+                 "ror{b} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+def ROR16rCL : I<0xD3, MRM1r, (ops R16:$dst, R16:$src),
+                 "ror{w} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (ops R32:$dst, R32:$src),
+                 "ror{l} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+
+def ROR8ri   : Ii8<0xC0, MRM1r, (ops R8 :$dst, R8 :$src1, i8imm:$src2),
+                   "ror{b} {$src2, $dst|$dst, $src2}">;
+def ROR16ri  : Ii8<0xC1, MRM1r, (ops R16:$dst, R16:$src1, i8imm:$src2),
+                   "ror{w} {$src2, $dst|$dst, $src2}">, OpSize;
+def ROR32ri  : Ii8<0xC1, MRM1r, (ops R32:$dst, R32:$src1, i8imm:$src2),
+                   "ror{l} {$src2, $dst|$dst, $src2}">;
+let isTwoAddress = 0 in {
+  def ROR8mCL  : I<0xD2, MRM1m, (ops i8mem :$dst),
+                   "ror{b} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+  def ROR16mCL : I<0xD3, MRM1m, (ops i16mem:$dst),
+                   "ror{w} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>, OpSize;
+  def ROR32mCL : I<0xD3, MRM1m, (ops i32mem:$dst), 
+                   "ror{l} {%cl, $dst|$dst, %CL}">, Imp<[CL],[]>;
+  def ROR8mi   : Ii8<0xC0, MRM1m, (ops i8mem :$dst, i8imm:$src),
+                     "ror{b} {$src, $dst|$dst, $src}">;
+  def ROR16mi  : Ii8<0xC1, MRM1m, (ops i16mem:$dst, i8imm:$src),
+                     "ror{w} {$src, $dst|$dst, $src}">, OpSize;
+  def ROR32mi  : Ii8<0xC1, MRM1m, (ops i32mem:$dst, i8imm:$src),
+                     "ror{l} {$src, $dst|$dst, $src}">;
+}
+
+
+
+// Double shift instructions (generalizations of rotate)
+
  def SHLD32rrCL : I<0xA5, MRMDestReg, (ops R32:$dst, R32:$src1, R32:$src2),
                     "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}">,
                     Imp<[CL],[]>, TB;
  def SHRD32rrCL : I<0xAD, MRMDestReg, (ops R32:$dst, R32:$src1, R32:$src2),
                     "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}">,
                     Imp<[CL],[]>, TB;
+def SHLD16rrCL : I<0xA5, MRMDestReg, (ops R16:$dst, R16:$src1, R16:$src2),
+                   "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}">,
+                   Imp<[CL],[]>, TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (ops R16:$dst, R16:$src1, R16:$src2),
+                   "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}">,
+                   Imp<[CL],[]>, TB, OpSize;
+
+let isCommutable = 1 in {  // These instructions commute to each other.
  def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
                       (ops R32:$dst, R32:$src1, R32:$src2, i8imm:$src3),
                       "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}">, TB;
  def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
                       (ops R32:$dst, R32:$src1, R32:$src2, i8imm:$src3),
                       "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}">, TB;
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+                     (ops R16:$dst, R16:$src1, R16:$src2, i8imm:$src3),
+                     "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}">,
+                     TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+                     (ops R16:$dst, R16:$src1, R16:$src2, i8imm:$src3),
+                     "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}">,
+                     TB, OpSize;
+}
  
  let isTwoAddress = 0 in {
    def SHLD32mrCL : I<0xA5, MRMDestMem, (ops i32mem:$dst, R32:$src2),
@@ -857,16 +1018,35 @@ let isTwoAddress = 0 in {
    def SHRD32mri8 : Ii8<0xAC, MRMDestMem, 
                         (ops i32mem:$dst, R32:$src2, i8imm:$src3),
                         "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}">, TB;
+
+  def SHLD16mrCL : I<0xA5, MRMDestMem, (ops i16mem:$dst, R16:$src2),
+                     "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}">,
+                     Imp<[CL],[]>, TB, OpSize;
+  def SHRD16mrCL : I<0xAD, MRMDestMem, (ops i16mem:$dst, R16:$src2),
+                    "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}">,
+                    Imp<[CL],[]>, TB, OpSize;
+  def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+                      (ops i16mem:$dst, R16:$src2, i8imm:$src3),
+                      "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}">,
+                      TB, OpSize;
+  def SHRD16mri8 : Ii8<0xAC, MRMDestMem, 
+                       (ops i16mem:$dst, R16:$src2, i8imm:$src3),
+                       "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}">,
+                       TB, OpSize;
  }
  
  
-// Arithmetic...
+// Arithmetic.
+let isCommutable = 1 in {   // X = ADD Y, Z   --> X = ADD Z, Y
  def ADD8rr   : I<0x00, MRMDestReg, (ops R8 :$dst, R8 :$src1, R8 :$src2),
                   "add{b} {$src2, $dst|$dst, $src2}">;
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
  def ADD16rr  : I<0x01, MRMDestReg, (ops R16:$dst, R16:$src1, R16:$src2),
                   "add{w} {$src2, $dst|$dst, $src2}">, OpSize;
  def ADD32rr  : I<0x01, MRMDestReg, (ops R32:$dst, R32:$src1, R32:$src2),
                   "add{l} {$src2, $dst|$dst, $src2}">;
+} // end isConvertibleToThreeAddress
+} // end isCommutable
  def ADD8rm   : I<0x02, MRMSrcMem, (ops R8 :$dst, R8 :$src1, i8mem :$src2),
                   "add{b} {$src2, $dst|$dst, $src2}">;
  def ADD16rm  : I<0x03, MRMSrcMem, (ops R16:$dst, R16:$src1, i16mem:$src2),
@@ -876,10 +1056,13 @@ def ADD32rm  : I<0x03, MRMSrcMem, (ops R32:$dst, R32:$src1, i32mem:$src2),
  
  def ADD8ri   : Ii8<0x80, MRM0r, (ops R8:$dst, R8:$src1, i8imm:$src2),
                     "add{b} {$src2, $dst|$dst, $src2}">;
+
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
  def ADD16ri  : Ii16<0x81, MRM0r, (ops R16:$dst, R16:$src1, i16imm:$src2),
                      "add{w} {$src2, $dst|$dst, $src2}">, OpSize;
  def ADD32ri  : Ii32<0x81, MRM0r, (ops R32:$dst, R32:$src1, i32imm:$src2),
                      "add{l} {$src2, $dst|$dst, $src2}">;
+}
  
  def ADD16ri8 : Ii8<0x83, MRM0r, (ops R16:$dst, R16:$src1, i8imm:$src2),
                     "add{w} {$src2, $dst|$dst, $src2}">, OpSize;
@@ -905,8 +1088,10 @@ let isTwoAddress = 0 in {
                       "add{l} {$src2, $dst|$dst, $src2}">;
  }
  
+let isCommutable = 1 in {  // X = ADC Y, Z --> X = ADC Z, Y
  def ADC32rr  : I<0x11, MRMDestReg, (ops R32:$dst, R32:$src1, R32:$src2),
                   "adc{l} {$src2, $dst|$dst, $src2}">;
+}
  def ADC32rm  : I<0x13, MRMSrcMem , (ops R32:$dst, R32:$src1, i32mem:$src2),
                   "adc{l} {$src2, $dst|$dst, $src2}">;
  def ADC32ri  : Ii32<0x81, MRM2r, (ops R32:$dst, R32:$src1, i32imm:$src2),
@@ -997,10 +1182,12 @@ def SBB16ri8 : Ii8<0x83, MRM3r, (ops R16:$dst, R16:$src1, i8imm:$src2),
  def SBB32ri8 : Ii8<0x83, MRM3r, (ops R32:$dst, R32:$src1, i8imm:$src2),
                     "sbb{l} {$src2, $dst|$dst, $src2}">;
  
+let isCommutable = 1 in {  // X = IMUL Y, Z --> X = IMUL Z, Y
  def IMUL16rr : I<0xAF, MRMSrcReg, (ops R16:$dst, R16:$src1, R16:$src2),
                   "imul{w} {$src2, $dst|$dst, $src2}">, TB, OpSize;
  def IMUL32rr : I<0xAF, MRMSrcReg, (ops R32:$dst, R32:$src1, R32:$src2),
                   "imul{l} {$src2, $dst|$dst, $src2}">, TB;
+}
  def IMUL16rm : I<0xAF, MRMSrcMem, (ops R16:$dst, R16:$src1, i16mem:$src2),
                   "imul{w} {$src2, $dst|$dst, $src2}">, TB, OpSize;
  def IMUL32rm : I<0xAF, MRMSrcMem, (ops R32:$dst, R32:$src1, i32mem:$src2),
@@ -1039,12 +1226,14 @@ def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem,                       // R32 = [mem32]*I8
  //===----------------------------------------------------------------------===//
  // Test instructions are just like AND, except they don't generate a result.
  //
+let isCommutable = 1 in {   // TEST X, Y   --> TEST Y, X
  def TEST8rr  : I<0x84, MRMDestReg, (ops R8:$src1, R8:$src2),
                   "test{b} {$src2, $src1|$src1, $src2}">;
  def TEST16rr : I<0x85, MRMDestReg, (ops R16:$src1, R16:$src2),
                   "test{w} {$src2, $src1|$src1, $src2}">, OpSize;
  def TEST32rr : I<0x85, MRMDestReg, (ops R32:$src1, R32:$src2),
                   "test{l} {$src2, $src1|$src1, $src2}">;
+}
  def TEST8mr  : I<0x84, MRMDestMem, (ops i8mem :$src1, R8 :$src2),
                   "test{b} {$src2, $src1|$src1, $src2}">;
  def TEST16mr : I<0x85, MRMDestMem, (ops i16mem:$src1, R16:$src2),
@@ -1119,6 +1308,10 @@ def SETPr    : I<0x9A, MRM0r,
                   (ops R8   :$dst), "setp $dst">, TB;    // R8 = parity
  def SETPm    : I<0x9A, MRM0m, 
                   (ops i8mem:$dst), "setp $dst">, TB;    // [mem8] = parity
+def SETNPr   : I<0x9B, MRM0r, 
+                 (ops R8   :$dst), "setnp $dst">, TB;   // R8 = not parity
+def SETNPm   : I<0x9B, MRM0m, 
+                 (ops i8mem:$dst), "setnp $dst">, TB;   // [mem8] = not parity
  def SETLr    : I<0x9C, MRM0r, 
                   (ops R8   :$dst), "setl $dst">, TB;    // R8 = <  signed
  def SETLm    : I<0x9C, MRM0m, 
@@ -1210,9 +1403,140 @@ def MOVZX32rr16: I<0xB7, MRMSrcReg, (ops R32:$dst, R16:$src),
  def MOVZX32rm16: I<0xB7, MRMSrcMem, (ops R32:$dst, i16mem:$src),
                     "movz{wl|x} {$src, $dst|$dst, $src}">, TB;
  
+//===----------------------------------------------------------------------===//
+// XMM Floating point support (requires SSE2)
+//===----------------------------------------------------------------------===//
+
+def MOVSSrm : I<0x10, MRMSrcMem, (ops RXMM:$dst, f32mem:$src),
+                "movss {$src, $dst|$dst, $src}">, XS;
+def MOVSSmr : I<0x11, MRMDestMem, (ops f32mem:$dst, RXMM:$src),
+                "movss {$src, $dst|$dst, $src}">, XS;
+def MOVSDrm : I<0x10, MRMSrcMem, (ops RXMM:$dst, f64mem:$src),
+                "movsd {$src, $dst|$dst, $src}">, XD;
+def MOVSDmr : I<0x11, MRMDestMem, (ops f64mem:$dst, RXMM:$src),
+                "movsd {$src, $dst|$dst, $src}">, XD;
+def MOVAPSrr: I<0x28, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "movaps {$src, $dst|$dst, $src}">, TB;
+def MOVAPSrm: I<0x28, MRMSrcMem, (ops RXMM:$dst, f32mem:$src),
+                "movaps {$src, $dst|$dst, $src}">, TB;
+def MOVAPSmr: I<0x29, MRMDestMem, (ops f32mem:$dst, RXMM:$src),
+                "movaps {$src, $dst|$dst, $src}">, TB;
+def MOVAPDrr: I<0x28, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "movapd {$src, $dst|$dst, $src}">, TB, OpSize;
+def MOVAPDrm: I<0x28, MRMSrcMem, (ops RXMM:$dst, f64mem:$src),
+                "movapd {$src, $dst|$dst, $src}">, TB, OpSize;
+def MOVAPDmr: I<0x29, MRMDestMem, (ops f64mem:$dst, RXMM:$src),
+                "movapd {$src, $dst|$dst, $src}">, TB, OpSize;
+
+def CVTTSD2SIrr: I<0x2C, MRMSrcReg, (ops R32:$dst, RXMM:$src),
+                "cvttsd2si {$src, $dst|$dst, $src}">, XD;
+def CVTTSD2SIrm: I<0x2C, MRMSrcMem, (ops R32:$dst, f64mem:$src),
+                "cvttsd2si {$src, $dst|$dst, $src}">, XD;
+def CVTTSS2SIrr: I<0x2C, MRMSrcReg, (ops R32:$dst, RXMM:$src),
+                "cvttss2si {$src, $dst|$dst, $src}">, XS;
+def CVTTSS2SIrm: I<0x2C, MRMSrcMem, (ops R32:$dst, f32mem:$src),
+                "cvttss2si {$src, $dst|$dst, $src}">, XS;
+def CVTSD2SSrr: I<0x5A, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "cvtsd2ss {$src, $dst|$dst, $src}">, XD;  // f32 = (f32)f64
+def CVTSD2SSrm: I<0x5A, MRMSrcMem, (ops RXMM:$dst, f64mem:$src),
+                "cvtsd2ss {$src, $dst|$dst, $src}">, XD;  // f32 = (f32)[mem64]
+def CVTSS2SDrr: I<0x5A, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "cvtss2sd {$src, $dst|$dst, $src}">, XS;  // f64 = (f64)f32
+def CVTSS2SDrm: I<0x5A, MRMSrcMem, (ops RXMM:$dst, f32mem:$src),
+                "cvtss2sd {$src, $dst|$dst, $src}">, XS;  // f64 = (f64)[mem32]
+def CVTSI2SSrr: I<0x2A, MRMSrcReg, (ops RXMM:$dst, R32:$src),
+                "cvtsi2ss {$src, $dst|$dst, $src}">, XS;  // f32 = (f32)R32
+def CVTSI2SSrm: I<0x2A, MRMSrcMem, (ops RXMM:$dst, i32mem:$src),
+                "cvtsi2ss {$src, $dst|$dst, $src}">, XS;  // f32 = (f32)[mem32]
+def CVTSI2SDrr: I<0x2A, MRMSrcReg, (ops RXMM:$dst, R32:$src),
+                "cvtsi2sd {$src, $dst|$dst, $src}">, XD;  // f64 = (f64)R32
+def CVTSI2SDrm: I<0x2A, MRMSrcMem, (ops RXMM:$dst, i32mem:$src),
+                "cvtsi2sd {$src, $dst|$dst, $src}">, XD;  // f64 = (f64)[mem32]
+
+def SQRTSSrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f32mem:$src),
+                "sqrtss {$src, $dst|$dst, $src}">, XS;   // f32 = sqrt([mem32])
+def SQRTSSrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "sqrtss {$src, $dst|$dst, $src}">, XS;   // f32 = sqrt(f32)
+def SQRTSDrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f64mem:$src),
+                "sqrtsd {$src, $dst|$dst, $src}">, XD;   // f64 = sqrt([mem64])
+def SQRTSDrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "sqrtsd {$src, $dst|$dst, $src}">, XD;   // f64 = sqrt(f64)
+
+def UCOMISDrr: I<0x2E, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "ucomisd {$src, $dst|$dst, $src}">, TB, OpSize;
+def UCOMISDrm: I<0x2E, MRMSrcMem, (ops RXMM:$dst, f64mem:$src),
+                "ucomisd {$src, $dst|$dst, $src}">, TB, OpSize;
+def UCOMISSrr: I<0x2E, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "ucomiss {$src, $dst|$dst, $src}">, TB;
+def UCOMISSrm: I<0x2E, MRMSrcMem, (ops RXMM:$dst, f32mem:$src),
+                "ucomiss {$src, $dst|$dst, $src}">, TB;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in {
+def ADDSSrr : I<0x58, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "addss {$src, $dst|$dst, $src}">, XS;
+def ADDSDrr : I<0x58, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "addsd {$src, $dst|$dst, $src}">, XD;
+def ANDPSrr : I<0x54, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "andps {$src, $dst|$dst, $src}">, TB;
+def ANDPDrr : I<0x54, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "andpd {$src, $dst|$dst, $src}">, TB, OpSize;
+def MULSSrr : I<0x59, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "mulss {$src, $dst|$dst, $src}">, XS;
+def MULSDrr : I<0x59, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "mulsd {$src, $dst|$dst, $src}">, XD;
+def ORPSrr : I<0x56, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "orps {$src, $dst|$dst, $src}">, TB;
+def ORPDrr : I<0x56, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "orpd {$src, $dst|$dst, $src}">, TB, OpSize;
+}
+def ANDNPSrr : I<0x55, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "andnps {$src, $dst|$dst, $src}">, TB;
+def ANDNPDrr : I<0x55, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "andnpd {$src, $dst|$dst, $src}">, TB, OpSize;
+def ADDSSrm : I<0x58, MRMSrcMem, (ops RXMM:$dst, RXMM:$src1, f32mem:$src),
+                "addss {$src, $dst|$dst, $src}">, XS;
+def ADDSDrm : I<0x58, MRMSrcMem, (ops RXMM:$dst, RXMM:$src1, f64mem:$src),
+                "addsd {$src, $dst|$dst, $src}">, XD;
+def MULSSrm : I<0x59, MRMSrcMem, (ops RXMM:$dst, RXMM:$src1, f32mem:$src),
+                "mulss {$src, $dst|$dst, $src}">, XS;
+def MULSDrm : I<0x59, MRMSrcMem, (ops RXMM:$dst, RXMM:$src1, f64mem:$src),
+                "mulsd {$src, $dst|$dst, $src}">, XD;
+
+def DIVSSrm : I<0x5E, MRMSrcMem, (ops RXMM:$dst, RXMM:$src1, f32mem:$src),
+                "divss {$src, $dst|$dst, $src}">, XS;
+def DIVSSrr : I<0x5E, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "divss {$src, $dst|$dst, $src}">, XS;
+def DIVSDrm : I<0x5E, MRMSrcMem, (ops RXMM:$dst, RXMM:$src1, f64mem:$src),
+                "divsd {$src, $dst|$dst, $src}">, XD;
+def DIVSDrr : I<0x5E, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "divsd {$src, $dst|$dst, $src}">, XD;
+
+def SUBSSrm : I<0x5C, MRMSrcMem, (ops RXMM:$dst, RXMM:$src1, f32mem:$src),
+                "subss {$src, $dst|$dst, $src}">, XS;
+def SUBSSrr : I<0x5C, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "subss {$src, $dst|$dst, $src}">, XS;
+def SUBSDrm : I<0x5C, MRMSrcMem, (ops RXMM:$dst, RXMM:$src1, f64mem:$src),
+                "subsd {$src, $dst|$dst, $src}">, XD;
+def SUBSDrr : I<0x5C, MRMSrcReg, (ops RXMM:$dst, RXMM:$src1, RXMM:$src),
+                "subsd {$src, $dst|$dst, $src}">, XD;
+
+def CMPSSrr : I<0xC2, MRMSrcReg, 
+                (ops RXMM:$dst, RXMM:$src1, RXMM:$src, SSECC:$cc),
+                "cmp${cc}ss {$src, $dst|$dst, $src}">, XS;
+def CMPSSrm : I<0xC2, MRMSrcMem, 
+                (ops RXMM:$dst, RXMM:$src1, f32mem:$src, SSECC:$cc),
+                "cmp${cc}ss {$src, $dst|$dst, $src}">, XS;
+def CMPSDrr : I<0xC2, MRMSrcReg, 
+                (ops RXMM:$dst, RXMM:$src1, RXMM:$src, SSECC:$cc),
+                "cmp${cc}sd {$src, $dst|$dst, $src}">, XD;
+def CMPSDrm : I<0xC2, MRMSrcMem, 
+                (ops RXMM:$dst, RXMM:$src1, f64mem:$src, SSECC:$cc),
+                "cmp${cc}sd {$src, $dst|$dst, $src}">, XD;
+}
  
  //===----------------------------------------------------------------------===//
-// Floating point support
+// Stack-based Floating point support
  //===----------------------------------------------------------------------===//
  
  // FIXME: These need to indicate mod/ref sets for FP regs... & FP 'TOP'
@@ -1322,12 +1646,16 @@ let isTwoAddress = 1, Uses = [ST0], Defs = [ST0] in {
                      (ops RST:$op), "fcmovbe {$op, %ST(0)|%ST(0), $op}">, DA;
    def FCMOVE  : FPI<0xC8, AddRegFrm, CondMovFP,
                      (ops RST:$op), "fcmove {$op, %ST(0)|%ST(0), $op}">, DA;
+  def FCMOVP  : FPI<0xD8, AddRegFrm, CondMovFP,
+                    (ops RST:$op), "fcmovu  {$op, %ST(0)|%ST(0), $op}">, DA;
    def FCMOVAE : FPI<0xC0, AddRegFrm, CondMovFP,
                      (ops RST:$op), "fcmovae {$op, %ST(0)|%ST(0), $op}">, DB;
    def FCMOVA  : FPI<0xD0, AddRegFrm, CondMovFP,
                      (ops RST:$op), "fcmova {$op, %ST(0)|%ST(0), $op}">, DB;
    def FCMOVNE : FPI<0xC8, AddRegFrm, CondMovFP,
                      (ops RST:$op), "fcmovne {$op, %ST(0)|%ST(0), $op}">, DB;
+  def FCMOVNP : FPI<0xD8, AddRegFrm, CondMovFP,
+                    (ops RST:$op), "fcmovnu {$op, %ST(0)|%ST(0), $op}">, DB;
  }
  
  // Floating point loads & stores...
@@ -1362,8 +1690,12 @@ def FLD1 : FPI<0xE8, RawFrm, ZeroArgFP, (ops), "fld1">, D9;
  
  
  // Unary operations...
-def FCHS : FPI<0xE0, RawFrm, OneArgFPRW, (ops), "fchs">, D9;   // f1 = fchs f2
-def FTST : FPI<0xE4, RawFrm, OneArgFP, (ops), "ftst">, D9;     // ftst ST(0)
+def FCHS  : FPI<0xE0, RawFrm, OneArgFPRW, (ops), "fchs" >, D9; // f1 = fchs f2
+def FABS  : FPI<0xE1, RawFrm, OneArgFPRW, (ops), "fabs" >, D9; // f1 = fabs f2
+def FSQRT : FPI<0xFA, RawFrm, OneArgFPRW, (ops), "fsqrt">, D9; // fsqrt ST(0)
+def FSIN  : FPI<0xFE, RawFrm, OneArgFPRW, (ops), "fsin" >, D9; // fsin  ST(0)
+def FCOS  : FPI<0xFF, RawFrm, OneArgFPRW, (ops), "fcos" >, D9; // fcos  ST(0)
+def FTST  : FPI<0xE4, RawFrm, OneArgFP  , (ops), "ftst" >, D9; // ftst ST(0)
  
  // Binary arithmetic operations...
  class FPST0rInst<bits<8> o, dag ops, string asm>
@@ -1389,7 +1721,7 @@ def FADDPrST0  : FPrST0PInst<0xC0, (ops RST:$op),
  
  // NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
  // of some of the 'reverse' forms of the fsub and fdiv instructions.  As such,
-// we have to put some 'r's in and take them out of wierd places.
+// we have to put some 'r's in and take them out of weird places.
  def FSUBRST0r  : FPST0rInst <0xE8, (ops RST:$op),
                               "fsubr $op">;
  def FSUBrST0   : FPrST0Inst <0xE8, (ops RST:$op),