def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
def OptForSpeed : Predicate<"!OptForSize">;
+def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
// X86 specific condition codes. These correspond to CondCode in
// X86InstrInfo.h and must be kept in sync with it (an illustrative sketch
// of the assumed enum layout follows these defs).
-def X86_COND_A : PatLeaf<(i8 0)>;
-def X86_COND_AE : PatLeaf<(i8 1)>;
-def X86_COND_B : PatLeaf<(i8 2)>;
-def X86_COND_BE : PatLeaf<(i8 3)>;
-def X86_COND_E : PatLeaf<(i8 4)>;
-def X86_COND_G : PatLeaf<(i8 5)>;
-def X86_COND_GE : PatLeaf<(i8 6)>;
-def X86_COND_L : PatLeaf<(i8 7)>;
-def X86_COND_LE : PatLeaf<(i8 8)>;
-def X86_COND_NE : PatLeaf<(i8 9)>;
+def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
+def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
+def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
+def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
+def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
+def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
def X86_COND_NO : PatLeaf<(i8 10)>;
-def X86_COND_NP : PatLeaf<(i8 11)>;
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
def X86_COND_NS : PatLeaf<(i8 12)>;
-def X86_COND_NC : PatLeaf<(i8 13)>;
-def X86_COND_O : PatLeaf<(i8 14)>;
-def X86_COND_P : PatLeaf<(i8 15)>;
-def X86_COND_S : PatLeaf<(i8 16)>;
-def X86_COND_C : PatLeaf<(i8 17)>;
+def X86_COND_O : PatLeaf<(i8 13)>;
+def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S : PatLeaf<(i8 15)>;
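+// A minimal sketch of the corresponding enum, as it is assumed to look in
+// X86InstrInfo.h; the authoritative definition lives there, and this is only
+// an illustration of the numbering the PatLeafs above must match:
+//
+//   namespace X86 {
+//     enum CondCode {
+//       COND_A  = 0,  COND_AE = 1,  COND_B  = 2,  COND_BE = 3,
+//       COND_E  = 4,  COND_G  = 5,  COND_GE = 6,  COND_L  = 7,
+//       COND_LE = 8,  COND_NE = 9,  COND_NO = 10, COND_NP = 11,
+//       COND_NS = 12, COND_O  = 13, COND_P  = 14, COND_S  = 15,
+//       COND_INVALID
+//     };
+//   }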
def i16immSExt8 : PatLeaf<(i16 imm), [{
  // i16immSExt8 predicate - True if the 16-bit immediate fits in an 8-bit
  // sign extended field.
  return (int16_t)N->getZExtValue() == (int8_t)N->getZExtValue();
}]>;
+def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ const Value *Src = LD->getSrcValue();
+ if (!Src)
+ return false;
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 256;
+ return false;
+}]>;
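+// Illustrative note: gsload is intended to match an ordinary load whose
+// pointer operand is in LLVM address space 256, e.g. IR roughly of the form
+//   %val = load i32 addrspace(256)* %ptr
+// (syntax approximate), which GS_MOV32rm below then lowers to a %gs-relative
+// move.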
+
def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
[(X86brcond bb:$dst, X86_COND_O, EFLAGS)]>, TB;
def JNO : IBr<0x81, (ins brtarget:$dst), "jno\t$dst",
[(X86brcond bb:$dst, X86_COND_NO, EFLAGS)]>, TB;
-def JC : IBr<0x82, (ins brtarget:$dst), "jc\t$dst",
- [(X86brcond bb:$dst, X86_COND_C, EFLAGS)]>, TB;
-def JNC : IBr<0x83, (ins brtarget:$dst), "jnc\t$dst",
- [(X86brcond bb:$dst, X86_COND_NC, EFLAGS)]>, TB;
} // Uses = [EFLAGS]
//===----------------------------------------------------------------------===//
[(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
X86_COND_B, EFLAGS))]>,
TB;
-
def CMOVAE16rr: I<0x43, MRMSrcReg, // if >=u, GR16 = GR16
(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"cmovae\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
X86_COND_NP, EFLAGS))]>,
TB;
-} // isCommutable = 1
-
-def CMOVNP32rm : I<0x4B, MRMSrcMem, // if !parity, GR32 = [mem32]
- (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
- "cmovnp\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- X86_COND_NP, EFLAGS))]>,
+def CMOVO16rr : I<0x40, MRMSrcReg, // if overflow, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_O, EFLAGS))]>,
+ TB, OpSize;
+def CMOVO32rr : I<0x40, MRMSrcReg, // if overflow, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_O, EFLAGS))]>,
+ TB;
+def CMOVNO16rr : I<0x41, MRMSrcReg, // if !overflow, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NO, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNO32rr : I<0x41, MRMSrcReg, // if !overflow, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NO, EFLAGS))]>,
TB;
+} // isCommutable = 1
def CMOVB16rm : I<0x42, MRMSrcMem, // if <u, GR16 = [mem16]
(outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
[(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
X86_COND_NP, EFLAGS))]>,
TB, OpSize;
+def CMOVNP32rm : I<0x4B, MRMSrcMem, // if !parity, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NP, EFLAGS))]>,
+ TB;
+def CMOVO16rm : I<0x40, MRMSrcMem, // if overflow, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_O, EFLAGS))]>,
+ TB, OpSize;
+def CMOVO32rm : I<0x40, MRMSrcMem, // if overflow, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_O, EFLAGS))]>,
+ TB;
+def CMOVNO16rm : I<0x41, MRMSrcMem, // if !overflow, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NO, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNO32rm : I<0x41, MRMSrcMem, // if !overflow, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NO, EFLAGS))]>,
+ TB;
} // Uses = [EFLAGS]
}
} // Defs = [EFLAGS]
+// Match xor -1 to not. Favor these over a move imm + xor to save code size
+// (see the size illustration after these defs).
+let AddedComplexity = 15 in {
def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src), "not{b}\t$dst",
[(set GR8:$dst, (not GR8:$src))]>;
def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src), "not{w}\t$dst",
[(set GR16:$dst, (not GR16:$src))]>, OpSize;
def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src), "not{l}\t$dst",
[(set GR32:$dst, (not GR32:$src))]>;
+}
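+// To make the size argument concrete (illustrative AT&T assembly, with
+// register choices assumed): "x = ~x" can be emitted as the single
+// instruction
+//   notl  %eax
+// whereas materializing the constant first would take a longer sequence
+// such as
+//   movl  $-1, %ecx
+//   xorl  %ecx, %eax
+// The AddedComplexity bump above makes selection prefer the NOTr forms
+// whenever the operand is an xor with -1.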
let isTwoAddress = 0 in {
def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), "not{b}\t$dst",
[(store (not (loadi8 addr:$dst)), addr:$dst)]>;
"setno\t$dst",
[(store (X86setcc X86_COND_NO, EFLAGS), addr:$dst)]>,
TB; // [mem8] = not overflow
-
-def SETCr : I<0x92, MRM0r,
- (outs GR8 :$dst), (ins),
- "setc\t$dst",
- [(set GR8:$dst, (X86setcc X86_COND_C, EFLAGS))]>,
- TB; // GR8 = carry
-def SETCm : I<0x92, MRM0m,
- (outs), (ins i8mem:$dst),
- "setc\t$dst",
- [(store (X86setcc X86_COND_C, EFLAGS), addr:$dst)]>,
- TB; // [mem8] = carry
-def SETNCr : I<0x93, MRM0r,
- (outs GR8 :$dst), (ins),
- "setnc\t$dst",
- [(set GR8:$dst, (X86setcc X86_COND_NC, EFLAGS))]>,
- TB; // GR8 = not carry
-def SETNCm : I<0x93, MRM0m,
- (outs), (ins i8mem:$dst),
- "setnc\t$dst",
- [(store (X86setcc X86_COND_NC, EFLAGS), addr:$dst)]>,
- TB; // [mem8] = not carry
} // Uses = [EFLAGS]
} // Defs = [EFLAGS]
// Bit tests.
-// TODO: BT with immediate operands
// TODO: BTC, BTR, and BTS
let Defs = [EFLAGS] in {
-def BT16rr : I<0xA3, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(X86bt GR16:$src1, GR16:$src2),
- (implicit EFLAGS)]>, OpSize;
-def BT32rr : I<0xA3, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(X86bt GR32:$src1, GR32:$src2),
- (implicit EFLAGS)]>;
-def BT16mr : I<0xA3, MRMSrcMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "bt{w}\t{$src2, $src1|$src1, $src2}",
- [(X86bt addr:$src1, GR16:$src2),
- (implicit EFLAGS)]>, OpSize;
-def BT32mr : I<0xA3, MRMSrcMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "bt{l}\t{$src2, $src1|$src1, $src2}",
- [(X86bt addr:$src1, GR32:$src2),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)]>, TB;
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Disable these instructions for now.
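+// An illustrative example of the difference (behavior per the Intel
+// manuals, registers chosen arbitrarily): with a register destination,
+// "btl %ecx, %edx" uses only %ecx modulo 32 as the bit index, but with a
+// memory destination, "btl %ecx, (%eax)" with %ecx == 100 tests bit 4 of
+// the dword at 12(%eax); the index picks which word of memory is touched
+// instead of being reduced modulo 32.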
+//def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+// "bt{w}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi16 addr:$src1), GR16:$src2),
+// (implicit EFLAGS)]>, OpSize, TB, Requires<[FastBTMem]>;
+//def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+// "bt{l}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi32 addr:$src1), GR32:$src2),
+// (implicit EFLAGS)]>, TB, Requires<[FastBTMem]>;
+
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt (loadi16 addr:$src1), i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt (loadi32 addr:$src1), i32immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
} // Defs = [EFLAGS]
// Sign/Zero extenders
"movl\t%gs:0, $dst",
[(set GR32:$dst, X86TLStp)]>, SegGS;
+let AddedComplexity = 5 in
+def GS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movl\t%gs:$src, $dst",
+ [(set GR32:$dst, (gsload addr:$src))]>, SegGS;
+
//===----------------------------------------------------------------------===//
// DWARF Pseudo Instructions
//
def : Pat<(parallel (X86cmp GR32:$src1, 0), (implicit EFLAGS)),
(TEST32rr GR32:$src1, GR32:$src1)>;
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
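+// A sketch of the idea, using the first pair below as the example: a node
+// such as
+//   (X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_B, EFLAGS)
+// has the load in the operand position the CMOVrm forms cannot fold, so it
+// is rewritten with the operands swapped and the condition inverted,
+//   (X86cmov GR32:$src2, (loadi32 addr:$src1), X86_COND_AE, EFLAGS)
+// which maps directly onto CMOVAE32rm.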
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO32rm GR32:$src2, addr:$src1)>;
+
// zextload bool -> zextload byte
def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
(implicit EFLAGS)),
(IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+// Optimize multiply with overflow by 2; see the note after these patterns.
+let AddedComplexity = 2 in {
+def : Pat<(parallel (X86smul_ovf GR16:$src1, 2),
+ (implicit EFLAGS)),
+ (ADD16rr GR16:$src1, GR16:$src1)>;
+
+def : Pat<(parallel (X86smul_ovf GR32:$src1, 2),
+ (implicit EFLAGS)),
+ (ADD32rr GR32:$src1, GR32:$src1)>;
+}
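+// Rationale, stated as an illustration: x * 2 and x + x produce the same
+// value and the same signed-overflow condition, and ADD sets OF on signed
+// overflow, so a 32-bit multiply-with-overflow by 2 (e.g. from an
+// llvm.smul.with.overflow call) can be selected as "addl %eax, %eax"
+// rather than an imul (register choice assumed for illustration).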
+
//===----------------------------------------------------------------------===//
// Floating Point Stack Support
//===----------------------------------------------------------------------===//