Remove hasSSE*orAVX functions and change all callers to use just hasSSE*. AVX is...

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d33a7e09a56dcc910e3ec98696fceac249fd5a0e..2ff1e55f38e9770e135630047215e882b61ffc96 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -44,7 +44,6 @@
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/ADT/VariadicFunction.h"
-#include "llvm/ADT/VectorExtras.h"
  #include "llvm/Support/CallSite.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/Dwarf.h"
@@ -256,7 +255,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
-    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    } else if (!TM.Options.UseSoftFloat) {
      // We have an algorithm for SSE2->double, and we turn this into a
      // 64-bit FILD followed by conditional FADD for other targets.
@@ -938,7 +937,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    }
  
-  if (Subtarget->hasSSE41orAVX()) {
+  if (Subtarget->hasSSE41()) {
      setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
      setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
      setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
@@ -1010,7 +1009,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
    }
  
-  if (Subtarget->hasSSE42orAVX())
+  if (Subtarget->hasSSE42())
      setOperationAction(ISD::SETCC,             MVT::v2i64, Custom);
  
    if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
@@ -3238,13 +3237,13 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
  /// is suitable for input to PALIGNR.
  static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                          bool hasSSSE3OrAVX) {
+                          bool hasSSSE3) {
    int i, e = VT.getVectorNumElements();
    if (VT.getSizeInBits() != 128)
      return false;
  
    // Do not handle v2i64 / v2f64 shuffles with palignr.
-  if (e < 4 || !hasSSSE3OrAVX)
+  if (e < 4 || !hasSSSE3)
      return false;
  
    for (i = 0; i != e; ++i)
@@ -3846,7 +3845,7 @@ static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
  /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
  bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
                           const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasSSE3orAVX())
+  if (!Subtarget->hasSSE3())
      return false;
  
    // The second vector must be undef
@@ -3874,7 +3873,7 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
  /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
  bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
                           const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasSSE3orAVX())
+  if (!Subtarget->hasSSE3())
      return false;
  
    // The second vector must be undef
@@ -5339,7 +5338,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        return LD;
  
      // For SSE 4.1, use insertps to put the high elements into the low element.
-    if (getSubtarget()->hasSSE41orAVX()) {
+    if (getSubtarget()->hasSSE41()) {
        SDValue Result;
        if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
          Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -5510,7 +5509,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
    // quads, disable the next transformation since it does not help SSSE3.
    bool V1Used = InputQuads[0] || InputQuads[1];
    bool V2Used = InputQuads[2] || InputQuads[3];
-  if (Subtarget->hasSSSE3orAVX()) {
+  if (Subtarget->hasSSSE3()) {
      if (InputQuads.count() == 2 && V1Used && V2Used) {
        BestLoQuad = InputQuads.find_first();
        BestHiQuad = InputQuads.find_next(BestLoQuad);
@@ -5583,7 +5582,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
    // If we have SSSE3, and all words of the result are from 1 input vector,
    // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
    // is present, fall back to case 4.
-  if (Subtarget->hasSSSE3orAVX()) {
+  if (Subtarget->hasSSSE3()) {
      SmallVector<SDValue,16> pshufbMask;
  
      // If we have elements from both input vectors, set the high bit of the
@@ -5651,7 +5650,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                  &MaskV[0]);
  
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX())
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
        NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                                 NewV.getOperand(0),
                                 X86::getShufflePSHUFLWImmediate(NewV.getNode()),
@@ -5679,7 +5678,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                  &MaskV[0]);
  
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX())
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
        NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                                NewV.getOperand(0),
                                X86::getShufflePSHUFHWImmediate(NewV.getNode()),
@@ -5745,7 +5744,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
    }
  
    // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
-  if (TLI.getSubtarget()->hasSSSE3orAVX()) {
+  if (TLI.getSubtarget()->hasSSSE3()) {
      SmallVector<SDValue,16> pshufbMask;
  
      // If all result elements are from one input vector, then only translate
@@ -6506,7 +6505,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp, HasAVX2))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
-  if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() &&
+  if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3() &&
        V2IsUndef && RelaxedMayFoldVectorLoad(V1))
      return getMOVDDup(Op, dl, V1, DAG);
  
@@ -6658,7 +6657,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    // inlined here right now to enable us to directly emit target specific
    // nodes, and remove one by one until they don't return Op anymore.
  
-  if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()))
+  if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3()))
      return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
                                  getShufflePALIGNRImmediate(SVOp),
                                  DAG);
@@ -6830,7 +6829,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
    assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
  
-  if (Subtarget->hasSSE41orAVX()) {
+  if (Subtarget->hasSSE41()) {
      SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
      if (Res.getNode())
        return Res;
@@ -6972,7 +6971,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
      return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
    }
  
-  if (Subtarget->hasSSE41orAVX())
+  if (Subtarget->hasSSE41())
      return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
  
    if (EltVT == MVT::i8)
@@ -7581,38 +7580,17 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
  // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
  SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                                 SelectionDAG &DAG) const {
-  // This algorithm is not obvious. Here it is in C code, more or less:
+  // This algorithm is not obvious. Here it is what we're trying to output:
    /*
-    double uint64_to_double( uint32_t hi, uint32_t lo ) {
-      static const __m128i exp = { 0x4330000045300000ULL, 0 };
-      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
-
-      // Copy ints to xmm registers.
-      __m128i xh = _mm_cvtsi32_si128( hi );
-      __m128i xl = _mm_cvtsi32_si128( lo );
-
-      // Combine into low half of a single xmm register.
-      __m128i x = _mm_unpacklo_epi32( xh, xl );
-      __m128d d;
-      double sd;
-
-      // Merge in appropriate exponents to give the integer bits the right
-      // magnitude.
-      x = _mm_unpacklo_epi32( x, exp );
-
-      // Subtract away the biases to deal with the IEEE-754 double precision
-      // implicit 1.
-      d = _mm_sub_pd( (__m128d) x, bias );
-
-      // All conversions up to here are exact. The correctly rounded result is
-      // calculated using the current rounding mode using the following
-      // horizontal add.
-      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
-      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
-                                // store doesn't really need to be here (except
-                                // maybe to zero the other double)
-      return sd;
-    }
+     movq       %rax,  %xmm0
+     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+     #ifdef __SSE3__
+       haddpd   %xmm0, %xmm0          
+     #else
+       pshufd   $0x4e, %xmm0, %xmm1 
+       addpd    %xmm1, %xmm0
+     #endif
    */
  
    DebugLoc dl = Op.getDebugLoc();
@@ -7620,46 +7598,51 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
  
    // Build some magic constants.
    SmallVector<Constant*,4> CV0;
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
+  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
    Constant *C0 = ConstantVector::get(CV0);
    SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
  
    SmallVector<Constant*,2> CV1;
-  CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
    CV1.push_back(
      ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
    Constant *C1 = ConstantVector::get(CV1);
    SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
  
-  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                                        Op.getOperand(0),
-                                        DAG.getIntPtrConstant(1)));
-  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                                        Op.getOperand(0),
-                                        DAG.getIntPtrConstant(0)));
-  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
+  // Load the 64-bit value into an XMM register.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                            Op.getOperand(0));
    SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                                MachinePointerInfo::getConstantPool(),
                                false, false, false, 16);
-  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
-  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
+  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+                              CLod0);
+
    SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                                MachinePointerInfo::getConstantPool(),
                                false, false, false, 16);
+  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+  SDValue Result;
+
+  if (Subtarget->hasSSE3()) {
+    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+  } else {
+    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+                                           S2F, 0x4E, DAG);
+    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+                         Sub);
+  }
  
-  // Add the halves; easiest way is to swap them into another reg first.
-  int ShufMask[2] = { 1, -1 };
-  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
-                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
-  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                       DAG.getIntPtrConstant(0));
  }
  
@@ -7729,6 +7712,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
      return LowerUINT_TO_FP_i64(Op, DAG);
    else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
      return LowerUINT_TO_FP_i32(Op, DAG);
+  else if (SrcVT == MVT::i64 && DstVT == MVT::f32)
+    return SDValue();
  
    // Make a 64-bit buffer, and use it to build an FILD.
    SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
@@ -8466,9 +8451,9 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
  
    // Check that the operation in question is available (most are plain SSE2,
    // but PCMPGTQ and PCMPEQQ have different requirements).
-  if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42orAVX())
+  if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42())
      return SDValue();
-  if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41orAVX())
+  if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41())
      return SDValue();
  
    // Since SSE has no unsigned integer comparisons, we need to flip  the sign
@@ -11139,7 +11124,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
            isPSHUFDMask(M, VT) ||
            isPSHUFHWMask(M, VT) ||
            isPSHUFLWMask(M, VT) ||
-          isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) ||
+          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
            isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
            isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
            isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
@@ -11548,7 +11533,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
  MachineBasicBlock *
  X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                              unsigned numArgs, bool memArg) const {
-  assert(Subtarget->hasSSE42orAVX() &&
+  assert(Subtarget->hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
  
    DebugLoc dl = MI->getDebugLoc();
@@ -12909,7 +12894,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
        VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
        (Subtarget->hasXMMInt() ||
-       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
+       (Subtarget->hasXMM() && VT.getScalarType() == MVT::f32))) {
      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  
      unsigned Opcode = 0;
@@ -13755,7 +13740,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
  
    // look for psign/blend
    if (VT == MVT::v2i64 || VT == MVT::v4i64) {
-    if (!Subtarget->hasSSSE3orAVX() ||
+    if (!Subtarget->hasSSSE3() ||
          (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
        return SDValue();
  
@@ -13825,7 +13810,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
          return DAG.getNode(ISD::BITCAST, DL, VT, Sign);
        }
        // PBLENDVB only available on SSE 4.1
-      if (!Subtarget->hasSSE41orAVX())
+      if (!Subtarget->hasSSE41())
          return SDValue();
  
        EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
@@ -14364,7 +14349,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
    SDValue RHS = N->getOperand(1);
  
    // Try to synthesize horizontal adds from adds of shuffles.
-  if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
         (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
        isHorizontalBinOp(LHS, RHS, true))
      return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
@@ -14379,7 +14364,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
    SDValue RHS = N->getOperand(1);
  
    // Try to synthesize horizontal subs from subs of shuffles.
-  if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
         (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
        isHorizontalBinOp(LHS, RHS, false))
      return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
@@ -14584,7 +14569,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
    SDValue Op1 = N->getOperand(1);
  
    // Try to synthesize horizontal adds from adds of shuffles.
-  if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
         (Subtarget->hasAVX2() && (VT == MVT::v16i16 || MVT::v8i32))) &&
        isHorizontalBinOp(Op0, Op1, true))
      return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
@@ -14617,7 +14602,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
  
    // Try to synthesize horizontal adds from adds of shuffles.
    EVT VT = N->getValueType(0);
-  if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
         (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
        isHorizontalBinOp(Op0, Op1, true))
      return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
@@ -14971,7 +14956,8 @@ TargetLowering::ConstraintWeight
        break;
    case 'x':
    case 'Y':
-    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
+    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) ||
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
        weight = CW_Register;
      break;
    case 'I':
@@ -15251,7 +15237,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
      case 'Y':   // SSE_REGS if SSE2 allowed
        if (!Subtarget->hasXMMInt()) break;
        // FALL THROUGH.
-    case 'x':   // SSE_REGS if SSE1 allowed
+    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
        if (!Subtarget->hasXMM()) break;
  
        switch (VT.getSimpleVT().SimpleTy) {
@@ -15271,6 +15257,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        case MVT::v4f32:
        case MVT::v2f64:
          return std::make_pair(0U, X86::VR128RegisterClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        return std::make_pair(0U, X86::VR256RegisterClass);
+        
        }
        break;
      }