//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
// Forward declarations.
static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG, DebugLoc dl);

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();
  // Set up the TargetLowering object.

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // The MS runtime is weird: it exports _setjmp, but plain longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
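  // (After a ucomiss/ucomisd, "oeq" corresponds to ZF set with PF clear and
  // "une" to ZF clear or PF set, so each needs two flag tests -- e.g. a
  // SETE/SETNP pair -- which is why they are expanded here.)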
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
  } else {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);

      // We have a faster algorithm for ui32->single only.
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
    } else
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
  // SSE has no i16 to fp conversion, only i32.
  if (X86ScalarSSEf32) {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
  setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
    setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
  } else {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
    setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
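  // (For example, code computing both a/b and a%b legalizes to a single
  // ISD::SDIVREM node, which selects to one idiv leaving the quotient in
  // EAX and the remainder in EDX.)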
  setOperationAction(ISD::MULHS , MVT::i8 , Expand);
  setOperationAction(ISD::MULHU , MVT::i8 , Expand);
  setOperationAction(ISD::SDIV , MVT::i8 , Expand);
  setOperationAction(ISD::UDIV , MVT::i8 , Expand);
  setOperationAction(ISD::SREM , MVT::i8 , Expand);
  setOperationAction(ISD::UREM , MVT::i8 , Expand);
  setOperationAction(ISD::MULHS , MVT::i16 , Expand);
  setOperationAction(ISD::MULHU , MVT::i16 , Expand);
  setOperationAction(ISD::SDIV , MVT::i16 , Expand);
  setOperationAction(ISD::UDIV , MVT::i16 , Expand);
  setOperationAction(ISD::SREM , MVT::i16 , Expand);
  setOperationAction(ISD::UREM , MVT::i16 , Expand);
  setOperationAction(ISD::MULHS , MVT::i32 , Expand);
  setOperationAction(ISD::MULHU , MVT::i32 , Expand);
  setOperationAction(ISD::SDIV , MVT::i32 , Expand);
  setOperationAction(ISD::UDIV , MVT::i32 , Expand);
  setOperationAction(ISD::SREM , MVT::i32 , Expand);
  setOperationAction(ISD::UREM , MVT::i32 , Expand);
  setOperationAction(ISD::MULHS , MVT::i64 , Expand);
  setOperationAction(ISD::MULHU , MVT::i64 , Expand);
  setOperationAction(ISD::SDIV , MVT::i64 , Expand);
  setOperationAction(ISD::UDIV , MVT::i64 , Expand);
  setOperationAction(ISD::SREM , MVT::i64 , Expand);
  setOperationAction(ISD::UREM , MVT::i64 , Expand);
  setOperationAction(ISD::BR_JT , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
  setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f64 , Expand);
  setOperationAction(ISD::FREM , MVT::f80 , Expand);
  setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

  setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
  setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
  setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
  setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT , MVT::i1 , Promote);
  setOperationAction(ISD::SELECT , MVT::i8 , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT , MVT::i16 , Custom);
  setOperationAction(ISD::SELECT , MVT::i32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f64 , Custom);
  setOperationAction(ISD::SELECT , MVT::f80 , Custom);
  setOperationAction(ISD::SETCC , MVT::i8 , Custom);
  setOperationAction(ISD::SETCC , MVT::i16 , Custom);
  setOperationAction(ISD::SETCC , MVT::i32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f64 , Custom);
  setOperationAction(ISD::SETCC , MVT::f80 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT , MVT::i64 , Custom);
    setOperationAction(ISD::SETCC , MVT::i64 , Custom);
  }
  // The X86 ret instruction may pop the stack.
  setOperationAction(ISD::RET , MVT::Other, Custom);
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
  setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
    setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
  }
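  // (A *_PARTS node carries the wide value as a lo/hi register pair; on
  // 32-bit x86 a 64-bit shift typically selects to an SHLD/SHRD plus
  // SHL/SHR sequence, with extra code for shift amounts of 32 or more.)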
  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);

  // Expand certain atomics.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }
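  // (32-bit x86 has no 64-bit xadd/xchg, so these i64 atomics are
  // custom-lowered to LOCK CMPXCHG8B loops.)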
  // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
  // FIXME: use subtarget debug flags.
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART , MVT::Other, Custom);
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY , MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps

    // Floating truncations from f80 and extensions to f80 go through memory.
    // If optimizing, we lie about this though and handle it in
    // InstructionSelectPreprocess so that dagcombine2 can hack on these.
    if (Fast) {
      setConvertAction(MVT::f32, MVT::f80, Expand);
      setConvertAction(MVT::f64, MVT::f80, Expand);
      setConvertAction(MVT::f80, MVT::f32, Expand);
      setConvertAction(MVT::f80, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    // SSE <-> X87 conversions go through memory. If optimizing, we lie about
    // this though and handle it in InstructionSelectPreprocess so that
    // dagcombine2 can hack on these.
    if (Fast) {
      setConvertAction(MVT::f32, MVT::f64, Expand);
      setConvertAction(MVT::f32, MVT::f80, Expand);
      setConvertAction(MVT::f80, MVT::f32, Expand);
      setConvertAction(MVT::f64, MVT::f32, Expand);
      // And x87->x87 truncations also.
      setConvertAction(MVT::f80, MVT::f64, Expand);
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    // Floating truncations go through memory. If optimizing, we lie about
    // this though and handle it in InstructionSelectPreprocess so that
    // dagcombine2 can hack on these.
    if (Fast) {
      setConvertAction(MVT::f80, MVT::f32, Expand);
      setConvertAction(MVT::f64, MVT::f32, Expand);
      setConvertAction(MVT::f80, MVT::f64, Expand);
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // Long double always uses X87.
  addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
  setOperationAction(ISD::UNDEF, MVT::f80, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
  {
    bool ignored;
    APFloat TmpFlt(+0.0);
    TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                   &ignored);
    addLegalFPImmediate(TmpFlt); // FLD0
    TmpFlt.changeSign();
    addLegalFPImmediate(TmpFlt); // FLD0/FCHS
    APFloat TmpFlt2(+1.0);
    TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                    &ignored);
    addLegalFPImmediate(TmpFlt2); // FLD1
    TmpFlt2.changeSign();
    addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
  }

  if (!UnsafeFPMath) {
    setOperationAction(ISD::FSIN , MVT::f80 , Expand);
    setOperationAction(ISD::FCOS , MVT::f80 , Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW , MVT::f32 , Expand);
  setOperationAction(ISD::FPOW , MVT::f64 , Expand);
  setOperationAction(ISD::FPOW , MVT::f80 , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  // First, set the operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    // FIXME: add MMX packed arithmetics.

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
  }
  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }
  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float means XMM registers cannot be used even
    // for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
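    // (SSE2 has no packed 64-bit multiply; the v2i64 MUL above is
    // custom-lowered, typically into pmuludq ops plus shifts and adds.)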
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
      setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64);
    }
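    // (These promotions are just bitcasts: e.g. an AND of v16i8 is performed
    // as a v2i64 AND, so a single PAND pattern covers every 128-bit integer
    // type.)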
    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
  }
  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }
  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);
  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::STORE);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Also, the values should
  // be smaller when optimizing for size.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
  allowUnalignedMemoryAccesses = true; // x86 supports it!
  setPrefLoopAlignment(16);
}

MVT X86TargetLowering::getSetCCResultType(MVT VT) const {
  return MVT::i8;
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
MVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  if (Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (usesGlobalOffsetTable())
    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
  if (!Subtarget->isPICStyleRIPRel())
    // This doesn't have a DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

/// LowerRET - Lower an ISD::RET node.
SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
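  // (The operands are the chain followed by (value, signness) pairs, one
  // pair per returned value, so a well-formed RET always has an odd
  // operand count.)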

  SmallVector<CCValAssign, 16> RVLocs;
  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
  CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
  CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      if (RVLocs[i].isRegLoc())
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }
  SDValue Chain = Op.getOperand(0);

  // Handle tail call return.
  Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL);
  if (Chain.getOpcode() == X86ISD::TAILCALL) {
    SDValue TailCall = Chain;
    SDValue TargetAddress = TailCall.getOperand(1);
    SDValue StackAdjustment = TailCall.getOperand(2);
    assert(((TargetAddress.getOpcode() == ISD::Register &&
             (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
              cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
            TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
            TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
           "Expecting a global address, external symbol, or register");
    assert(StackAdjustment.getOpcode() == ISD::Constant &&
           "Expecting a const value");

    SmallVector<SDValue,8> Operands;
    Operands.push_back(Chain.getOperand(0));
    Operands.push_back(TargetAddress);
    Operands.push_back(StackAdjustment);
    // Copy the registers used by the call. The last operand is a flag, so it
    // is not copied.
    for (unsigned i = 3; i < TailCall.getNumOperands() - 1; i++) {
      Operands.push_back(Chain.getOperand(i));
    }
    return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0],
                       Operands.size());
  }

  // Regular return.
  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Op.getOperand(i*2+1);

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of an ISD::CALL into the
/// appropriate copies out of the corresponding physical registers. This
/// assumes that Chain/InFlag are the input chain/flag to use, and that
/// TheCall is the call being lowered. It returns an SDNode with the same
/// number of values as the ISD::CALL.
SDNode *X86TargetLowering::
LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
                unsigned CallingConv, SelectionDAG &DAG) {

  DebugLoc dl = TheCall->getDebugLoc();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool isVarArg = TheCall->isVarArg();
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
  CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);

  SmallVector<SDValue, 8> ResultVals;

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    MVT CopyVT = VA.getValVT();

    // If this is x86-64 and we disabled SSE, we can't return FP values.
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) {
      cerr << "SSE register return with SSE disabled\n";
      exit(1);
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 and XMM1. Issue an
      // extract_vector_elt to i64 and then bit_convert it to the desired type.
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 MVT::v2i64, InFlag).getValue(1);
      Val = Chain.getValue(0);
      Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                        Val, DAG.getConstant(0, MVT::i64));
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the f80 to the right size; this also moves it to the
      // appropriate xmm register.
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }

    ResultVals.push_back(Val);
  }

  // Merge everything together with a MERGE_VALUES node.
  ResultVals.push_back(Chain);
  return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
                     &ResultVals[0], ResultVals.size()).getNode();
}

//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention seems to be standard for many Windows API
//  routines and others. It differs from the C calling convention just a
//  little: the callee should clean up the stack, not the caller. Symbols
//  should also be decorated in some fancy way :) It doesn't support any
//  vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.

/// AddLiveIn - This helper function adds the specified physical register to
/// the MachineFunction as a live-in value. It also creates a corresponding
/// virtual register for it.
static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
                          const TargetRegisterClass *RC) {
  assert(RC->contains(PReg) && "Not the correct regclass!");
  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
  MF.getRegInfo().addLiveIn(PReg, VReg);
  return VReg;
}

/// CallIsStructReturn - Determines whether a CALL node uses struct return
/// semantics.
static bool CallIsStructReturn(CallSDNode *TheCall) {
  unsigned NumOps = TheCall->getNumArgs();
  if (!NumOps)
    return false;

  return TheCall->getArgFlags(0).isSRet();
}

/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
/// return semantics.
static bool ArgsAreStructReturn(SDValue Op) {
  unsigned NumArgs = Op.getNode()->getNumValues() - 1;
  if (!NumArgs)
    return false;

  return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet();
}

/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
/// the callee to pop its own arguments. Callee pop is necessary to support
/// tail calls.
bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return PerformTailCallOpt;
  }
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else if (CC == CallingConv::Fast && PerformTailCallOpt)
      return CC_X86_64_TailCall;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
NameDecorationStyle
X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  if (CC == CallingConv::X86_FastCall)
    return FastCall;
  else if (CC == CallingConv::X86_StdCall)
    return StdCall;
  return None;
}

/// CallRequiresGOTPtrInReg - Check whether the call requires the GOT pointer
/// in a register before calling.
bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
  return !IsTailCall && !Is64Bit &&
         getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT();
}

/// CallRequiresFnAddressInReg - Check whether the call requires the function
/// address to be loaded in a register.
bool
X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
  return !Is64Bit && IsTailCall &&
         getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT();
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
/// specified by "Src" to address "Dst" with size and alignment information
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
}
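// (AlwaysInline forces the copy to be expanded inline: this copy is emitted
// while a call sequence is being built, where introducing a nested memcpy
// libcall would be problematic.)
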
SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            MachineFrameInfo *MFI,
                                            unsigned CC,
                                            SDValue Root, unsigned i) {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags =
    cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags();
  bool AlwaysUseMutable = (CC == CallingConv::Fast) && PerformTailCallOpt;
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
                                  VA.getLocMemOffset(), isImmutable);
  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
  if (Flags.isByVal())
    return FIN;
  return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN,
                     PseudoSourceValue::getFixedStack(FI), 0);
}

SDValue
X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  DebugLoc dl = Op.getDebugLoc();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  // Decorate the function name.
  FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));

  MachineFrameInfo *MFI = MF.getFrameInfo();
  SDValue Root = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
  unsigned CC = MF.getFunction()->getCallingConv();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && CC == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));

  SmallVector<SDValue, 8> ArgValues;
  unsigned LastVal = ~0U;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      MVT RegVT = VA.getLocVT();
      TargetRegisterClass *RC = NULL;
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
        RC = X86::VR128RegisterClass;
      else if (RegVT.isVector()) {
        assert(RegVT.getSizeInBits() == 64);
        if (!Is64Bit)
          RC = X86::VR64RegisterClass; // MMX values are passed in MMXs.
        else {
          // The Darwin calling convention passes MMX values in either GPRs or
          // XMMs in x86-64. Other targets pass them in memory.
          if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) {
            RC = X86::VR128RegisterClass; // MMX values are passed in XMMs.
            RegVT = MVT::v2i64;
          } else {
            RC = X86::GR64RegisterClass; // v1i64 values are passed in GPRs.
            RegVT = MVT::i64;
          }
        }
      } else {
        assert(0 && "Unknown argument type!");
      }

      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
      SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));

      if (VA.getLocInfo() != CCValAssign::Full)
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
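      // (For example, an i8 argument arrives in the low byte of a 32-bit
      // register; AssertZext records that the upper 24 bits are known zero
      // before the TRUNCATE back down to i8.)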

      // Handle MMX values passed in GPRs.
      if (Is64Bit && RegVT != VA.getLocVT()) {
        if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass)
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
        else if (RC == X86::VR128RegisterClass) {
          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                                 ArgValue, DAG.getConstant(0, MVT::i64));
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
        }
      }

      ArgValues.push_back(ArgValue);
    } else {
      assert(VA.isMemLoc());
      ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i));
    }
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. Save the argument into
  // a virtual register so that we can access it from the return points.
  if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]);
    Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root);
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align the stack specially for tail calls.
  if (PerformTailCallOpt && CC == CallingConv::Fast)
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1382 // If the function takes variable number of arguments, make a frame index for
1383 // the start of the first vararg value... for expansion of llvm.va_start.
1385 if (Is64Bit || CC != CallingConv::X86_FastCall) {
1386 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
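// Illustration: the 1-byte fixed object created above sits at offset
// StackSize from the incoming stack pointer, i.e. right past the named
// arguments, which is exactly where stack-passed varargs begin. On x86-64,
// va_start's overflow_arg_area ends up pointing at this slot, while register
// varargs are serviced from the register-save area set up below.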
1389 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1391 // FIXME: We should really autogenerate these arrays
1392 static const unsigned GPR64ArgRegsWin64[] = {
1393 X86::RCX, X86::RDX, X86::R8, X86::R9
1395 static const unsigned XMMArgRegsWin64[] = {
1396 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1398 static const unsigned GPR64ArgRegs64Bit[] = {
1399 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1401 static const unsigned XMMArgRegs64Bit[] = {
1402 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1403 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1405 const unsigned *GPR64ArgRegs, *XMMArgRegs;
1408 TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1409 GPR64ArgRegs = GPR64ArgRegsWin64;
1410 XMMArgRegs = XMMArgRegsWin64;
1412 TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1413 GPR64ArgRegs = GPR64ArgRegs64Bit;
1414 XMMArgRegs = XMMArgRegs64Bit;
1416 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1417 TotalNumIntRegs);
1418 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1419 TotalNumXMMRegs);
1421 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1422 "SSE register cannot be used when SSE is disabled!");
1423 assert(!(NumXMMRegs && UseSoftFloat) &&
1424 "SSE register cannot be used when soft-float is enabled!");
1425 if (UseSoftFloat || !Subtarget->hasSSE1()) {
1426 // Kernel mode asks for SSE to be disabled, so don't push them
1427 // on the stack.
1428 TotalNumXMMRegs = 0;
1430 // For X86-64, if there are vararg parameters that are passed via
1431 // registers, then we must store them to their spots on the stack so they
1432 // may be loaded by dereferencing the result of va_next.
1433 VarArgsGPOffset = NumIntRegs * 8;
1434 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1435 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1436 TotalNumXMMRegs * 16, 16);
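// Sketch of the x86-64 (non-Win64) register save area allocated above,
// 6 * 8 + 8 * 16 = 176 bytes, 16-byte aligned:
//   [  0] RDI  [  8] RSI  [ 16] RDX  [ 24] RCX  [ 32] R8  [ 40] R9
//   [ 48] XMM0 ... [160] XMM7
// E.g. a function whose named arguments consumed RDI and RSI gets
// VarArgsGPOffset = 16, so va_arg starts reading at the RDX slot.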
1438 // Store the integer parameter registers.
1439 SmallVector<SDValue, 8> MemOps;
1440 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
1441 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1442 DAG.getIntPtrConstant(VarArgsGPOffset));
1443 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1444 unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
1445 X86::GR64RegisterClass);
1446 SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64);
1448 DAG.getStore(Val.getValue(1), dl, Val, FIN,
1449 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
1450 MemOps.push_back(Store);
1451 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
1452 DAG.getIntPtrConstant(8));
1455 // Now store the XMM (fp + vector) parameter registers.
1456 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1457 DAG.getIntPtrConstant(VarArgsFPOffset));
1458 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1459 unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
1460 X86::VR128RegisterClass);
1461 SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32);
1463 DAG.getStore(Val.getValue(1), dl, Val, FIN,
1464 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
1465 MemOps.push_back(Store);
1466 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
1467 DAG.getIntPtrConstant(16));
1469 if (!MemOps.empty())
1470 Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1471 &MemOps[0], MemOps.size());
1475 ArgValues.push_back(Root);
1477 // Some CCs need callee pop.
1478 if (IsCalleePop(isVarArg, CC)) {
1479 BytesToPopOnReturn = StackSize; // Callee pops everything.
1480 BytesCallerReserves = 0;
1482 BytesToPopOnReturn = 0; // Callee pops nothing.
1483 // If this is an sret function, the return should pop the hidden pointer.
1484 if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op))
1485 BytesToPopOnReturn = 4;
1486 BytesCallerReserves = StackSize;
1490 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only.
1491 if (CC == CallingConv::X86_FastCall)
1492 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
1495 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
1497 // Return the new list of results.
1498 return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
1499 &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
1503 X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
1504 const SDValue &StackPtr,
1505 const CCValAssign &VA,
1506 SDValue Chain,
1507 SDValue Arg, ISD::ArgFlagsTy Flags) {
1508 DebugLoc dl = TheCall->getDebugLoc();
1509 unsigned LocMemOffset = VA.getLocMemOffset();
1510 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1511 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1512 if (Flags.isByVal()) {
1513 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1515 return DAG.getStore(Chain, dl, Arg, PtrOff,
1516 PseudoSourceValue::getStack(), LocMemOffset);
1519 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
1520 /// optimization is performed and it is required.
1522 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1523 SDValue &OutRetAddr,
1529 if (!IsTailCall || FPDiff==0) return Chain;
1531 // Adjust the Return address stack slot.
1532 MVT VT = getPointerTy();
1533 OutRetAddr = getReturnAddressFrameIndex(DAG);
1535 // Load the "old" Return address.
1536 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
1537 return SDValue(OutRetAddr.getNode(), 1);
1540 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1541 /// optimization is performed and it is required (FPDiff!=0).
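/// Illustration with made-up sizes: if the caller's argument area is 8 bytes
/// but the tail-called callee needs 24, FPDiff = 8 - 24 = -16, so the return
/// address must be rewritten 16 bytes further down the stack, into the fixed
/// object created below at offset FPDiff - SlotSize.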
1543 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1544 SDValue Chain, SDValue RetAddrFrIdx,
1545 bool Is64Bit, int FPDiff, DebugLoc dl) {
1546 // Store the return address to the appropriate stack slot.
1547 if (!FPDiff) return Chain;
1548 // Calculate the new stack slot for the return address.
1549 int SlotSize = Is64Bit ? 8 : 4;
1550 int NewReturnAddrFI =
1551 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
1552 MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1553 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1554 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1555 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
1559 SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
1560 MachineFunction &MF = DAG.getMachineFunction();
1561 CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
1562 SDValue Chain = TheCall->getChain();
1563 unsigned CC = TheCall->getCallingConv();
1564 bool isVarArg = TheCall->isVarArg();
1565 bool IsTailCall = TheCall->isTailCall() &&
1566 CC == CallingConv::Fast && PerformTailCallOpt;
1567 SDValue Callee = TheCall->getCallee();
1568 bool Is64Bit = Subtarget->is64Bit();
1569 bool IsStructRet = CallIsStructReturn(TheCall);
1570 DebugLoc dl = TheCall->getDebugLoc();
1572 assert(!(isVarArg && CC == CallingConv::Fast) &&
1573 "Var args not supported with calling convention fastcc");
1575 // Analyze operands of the call, assigning locations to each operand.
1576 SmallVector<CCValAssign, 16> ArgLocs;
1577 CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
1578 CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));
1580 // Get a count of how many bytes are to be pushed on the stack.
1581 unsigned NumBytes = CCInfo.getNextStackOffset();
1582 if (PerformTailCallOpt && CC == CallingConv::Fast)
1583 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1587 // Lower arguments at fp - stackoffset + fpdiff.
1588 unsigned NumBytesCallerPushed =
1589 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1590 FPDiff = NumBytesCallerPushed - NumBytes;
1592 // Set the delta of movement of the return-address stack slot, but only if
1593 // this delta moves the slot further down (is more negative) than before.
1594 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1595 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1598 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1600 SDValue RetAddrFrIdx;
1601 // Load the return address for tail calls.
1602 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit,
1605 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1606 SmallVector<SDValue, 8> MemOpChains;
1609 // Walk the register/memloc assignments, inserting copies/loads. In the case
1610 // of tail call optimization, arguments are handled later.
1611 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1612 CCValAssign &VA = ArgLocs[i];
1613 SDValue Arg = TheCall->getArg(i);
1614 ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
1615 bool isByVal = Flags.isByVal();
1617 // Promote the value if needed.
1618 switch (VA.getLocInfo()) {
1619 default: assert(0 && "Unknown loc info!");
1620 case CCValAssign::Full: break;
1621 case CCValAssign::SExt:
1622 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1624 case CCValAssign::ZExt:
1625 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1627 case CCValAssign::AExt:
1628 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1632 if (VA.isRegLoc()) {
1634 MVT RegVT = VA.getLocVT();
1635 if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1636 switch (VA.getLocReg()) {
1639 case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
1641 // Special case: passing MMX values in GPR registers.
1642 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1645 case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
1646 case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
1647 // Special case: passing MMX values in XMM registers.
1648 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1649 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1650 Arg = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64,
1651 DAG.getUNDEF(MVT::v2i64), Arg,
1652 getMOVLMask(2, DAG, dl));
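// Sketch of the dataflow built above for a 64-bit MMX argument x bound for
// an XMM register: x is bitcast to i64, placed into lane 0 of a v2i64 by
// SCALAR_TO_VECTOR, and the MOVL mask <2,1> then selects lane 0 from the
// second shuffle operand (x) and lane 1 from undef, so only the low quadword
// of the XMM register is defined.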
1657 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1659 if (!IsTailCall || (IsTailCall && isByVal)) {
1660 assert(VA.isMemLoc());
1661 if (StackPtr.getNode() == 0)
1662 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1664 MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
1665 Chain, Arg, Flags));
1670 if (!MemOpChains.empty())
1671 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1672 &MemOpChains[0], MemOpChains.size());
1674 // Build a sequence of copy-to-reg nodes chained together with token chain
1675 // and flag operands which copy the outgoing args into registers.
1677 // Tail call byval lowering might overwrite argument registers so in case of
1678 // tail call optimization the copies to registers are lowered later.
1680 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1681 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1682 RegsToPass[i].second, InFlag);
1683 InFlag = Chain.getValue(1);
1686 // ELF / PIC requires GOT in the EBX register before function calls via PLT
1688 if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
1689 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1690 DAG.getNode(X86ISD::GlobalBaseReg,
1691 DebugLoc::getUnknownLoc(),
1694 InFlag = Chain.getValue(1);
1696 // If we are tail calling and generating PIC/GOT style code, load the address
1697 // of the callee into a scratch register (EAX on x86-32, R9 on x86-64); that
1698 // value is the target of the tail jump. This circumvents the ebx/callee-saved
1699 // problem for tail calls on PIC/GOT architectures: normally we would just put
1700 // the address of GOT into ebx and then call target@PLT, but for tail calls
1701 // ebx would be restored (since ebx is callee saved) before jumping to target@PLT.
1702 if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
1703 // Note: the actual move into that register is done further down.
1704 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1705 if (G && !G->getGlobal()->hasHiddenVisibility() &&
1706 !G->getGlobal()->hasProtectedVisibility())
1707 Callee = LowerGlobalAddress(Callee, DAG);
1708 else if (isa<ExternalSymbolSDNode>(Callee))
1709 Callee = LowerExternalSymbol(Callee,DAG);
1712 if (Is64Bit && isVarArg) {
1713 // From AMD64 ABI document:
1714 // For calls that may call functions that use varargs or stdargs
1715 // (prototype-less calls or calls to functions containing ellipsis (...) in
1716 // the declaration) %al is used as hidden argument to specify the number
1717 // of SSE registers used. The contents of %al do not need to match exactly
1718 // the number of registers, but must be an upper bound on the number of SSE
1719 // registers used and is in the range 0 - 8 inclusive.
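// Illustration: for a call like printf("%f %f\n", a, b) with a and b in
// XMM0/XMM1, NumXMMRegs below is 2 and the lowering amounts to
//   movb $2, %al
// just before the call; any value in [2, 8] would satisfy the upper-bound
// rule, but the exact count is the natural choice.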
1721 // FIXME: Verify this on Win64
1722 // Count the number of XMM registers allocated.
1723 static const unsigned XMMArgRegs[] = {
1724 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1725 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1727 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1728 assert((Subtarget->hasSSE1() || !NumXMMRegs)
1729 && "SSE registers cannot be used when SSE is disabled");
1731 Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1732 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1733 InFlag = Chain.getValue(1);
1737 // For tail calls lower the arguments to the 'real' stack slot.
1739 SmallVector<SDValue, 8> MemOpChains2;
1742 // Do not flag preceding copytoreg stuff together with the following stuff.
1744 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1745 CCValAssign &VA = ArgLocs[i];
1746 if (!VA.isRegLoc()) {
1747 assert(VA.isMemLoc());
1748 SDValue Arg = TheCall->getArg(i);
1749 ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
1750 // Create frame index.
1751 int32_t Offset = VA.getLocMemOffset()+FPDiff;
1752 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1753 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1754 FIN = DAG.getFrameIndex(FI, getPointerTy());
1756 if (Flags.isByVal()) {
1757 // Copy relative to framepointer.
1758 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1759 if (StackPtr.getNode() == 0)
1760 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
1762 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
1764 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
1767 // Store relative to framepointer.
1768 MemOpChains2.push_back(
1769 DAG.getStore(Chain, dl, Arg, FIN,
1770 PseudoSourceValue::getFixedStack(FI), 0));
1775 if (!MemOpChains2.empty())
1776 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1777 &MemOpChains2[0], MemOpChains2.size());
1779 // Copy arguments to their registers.
1780 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1781 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1782 RegsToPass[i].second, InFlag);
1783 InFlag = Chain.getValue(1);
1787 // Store the return address to the appropriate stack slot.
1788 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
1792 // If the callee is a GlobalAddress node (quite common, every direct call is)
1793 // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
1794 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1795 // We should use an extra load for direct calls to dllimported functions in
1796 // non-JIT mode.
1797 if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
1798 getTargetMachine(), true))
1799 Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(),
1801 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1802 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
1803 } else if (IsTailCall) {
1804 unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;
1806 Chain = DAG.getCopyToReg(Chain, dl,
1807 DAG.getRegister(Opc, getPointerTy()),
1809 Callee = DAG.getRegister(Opc, getPointerTy());
1810 // Add register as live out.
1811 DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
1814 // Returns a chain & a flag for retval copy to use.
1815 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1816 SmallVector<SDValue, 8> Ops;
1819 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1820 DAG.getIntPtrConstant(0, true), InFlag);
1821 InFlag = Chain.getValue(1);
1823 // Returns a chain & a flag for retval copy to use.
1824 NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1828 Ops.push_back(Chain);
1829 Ops.push_back(Callee);
1832 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
1834 // Add argument registers to the end of the list so that they are known live
1835 // into the call.
1836 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1837 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1838 RegsToPass[i].second.getValueType()));
1840 // Add an implicit use GOT pointer in EBX.
1841 if (!IsTailCall && !Is64Bit &&
1842 getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1843 Subtarget->isPICStyleGOT())
1844 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
1846 // Add an implicit use of AL for x86-64 vararg functions.
1847 if (Is64Bit && isVarArg)
1848 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
1850 if (InFlag.getNode())
1851 Ops.push_back(InFlag);
1854 assert(InFlag.getNode() &&
1855 "Flag must be set. Depend on flag being set in LowerRET");
1856 Chain = DAG.getNode(X86ISD::TAILCALL, dl,
1857 TheCall->getVTList(), &Ops[0], Ops.size());
1859 return SDValue(Chain.getNode(), Op.getResNo());
1862 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
1863 InFlag = Chain.getValue(1);
1865 // Create the CALLSEQ_END node.
1866 unsigned NumBytesForCalleeToPush;
1867 if (IsCalleePop(isVarArg, CC))
1868 NumBytesForCalleeToPush = NumBytes; // Callee pops everything
1869 else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
1870 // If this is a call to a struct-return function, the callee
1871 // pops the hidden struct pointer, so we have to push it back.
1872 // This is common for Darwin/X86, Linux & Mingw32 targets.
1873 NumBytesForCalleeToPush = 4;
1875 NumBytesForCalleeToPush = 0; // Callee pops nothing.
1877 // Returns a flag for retval copy to use.
1878 Chain = DAG.getCALLSEQ_END(Chain,
1879 DAG.getIntPtrConstant(NumBytes, true),
1880 DAG.getIntPtrConstant(NumBytesForCalleeToPush,
1883 InFlag = Chain.getValue(1);
1885 // Handle result values, copying them out of physregs into vregs that we
1886 // return.
1887 return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
1892 //===----------------------------------------------------------------------===//
1893 // Fast Calling Convention (tail call) implementation
1894 //===----------------------------------------------------------------------===//
1896 // Like stdcall, the callee cleans up the stack arguments, except that ECX is
1897 // reserved for storing the address of the tail-called function. Only two
1898 // registers are free for argument passing (inreg). Tail call optimization is
1899 // performed provided:
1900 //   * tailcallopt is enabled
1901 //   * caller/callee are fastcc
1902 // On the X86_64 architecture with GOT-style position independent code, only
1903 // local (within-module) calls are supported at the moment.
1904 // To keep the stack aligned according to the platform ABI, the function
1905 // GetAlignedArgumentStackSize ensures that the argument delta is always a
1906 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
1907 // for example.)
1908 // If a tail-called callee has more arguments than the caller, the caller must
1909 // make sure there is room to move the RETADDR to. This is achieved by
1910 // reserving an area the size of the argument delta right after the original
1911 // RETADDR, but before the saved frame pointer or the spilled registers,
1912 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
1913 //   stack layout:
1914 //     arg1
1915 //     arg2
1916 //     RETADDR
1917 //     [ new RETADDR
1918 //       move area ]
1919 //     (possible EBP)
1920 //     ESI
1921 //     EDI
1922 //     local1 ..
1923 /// GetAlignedArgumentStackSize - Round StackSize up so that the argument area
1924 /// ends just below a return-address slot, e.g. at 16n + 12 for 16-byte alignment.
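/// Worked example, assuming StackAlignment = 16 and SlotSize = 4:
///   StackSize = 20: 20 & 15 = 4 <= 12, so 20 + (12 - 4) = 28 = 16*1 + 12
///   StackSize = 30: 30 & 15 = 14 >  12, so (30 & ~15) + 16 + 12 = 44 = 16*2 + 12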
1925 unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
1926 SelectionDAG& DAG) {
1927 MachineFunction &MF = DAG.getMachineFunction();
1928 const TargetMachine &TM = MF.getTarget();
1929 const TargetFrameInfo &TFI = *TM.getFrameInfo();
1930 unsigned StackAlignment = TFI.getStackAlignment();
1931 uint64_t AlignMask = StackAlignment - 1;
1932 int64_t Offset = StackSize;
1933 uint64_t SlotSize = TD->getPointerSize();
1934 if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
1935 // Offset already fits below the 16n + 12 boundary; just add the difference.
1936 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
1937 } else {
1938 // Mask out the low bits, then add one full stack alignment plus the
1939 // (StackAlignment - SlotSize) pad.
1940 Offset = ((~AlignMask) & Offset) + StackAlignment +
1941 (StackAlignment - SlotSize);
1942 }
1943 return Offset;
1944 }
1945 /// IsEligibleForTailCallOptimization - Check to see whether the next
1946 /// instruction following the call is a return. A function is eligible if
1947 /// caller/callee calling conventions match, currently only fastcc supports
1948 /// tail calls, and the function CALL is immediately followed by a RET.
1949 bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
1951 SelectionDAG& DAG) const {
1952 if (!PerformTailCallOpt)
1953 return false;
1955 if (CheckTailCallReturnConstraints(TheCall, Ret)) {
1956 MachineFunction &MF = DAG.getMachineFunction();
1957 unsigned CallerCC = MF.getFunction()->getCallingConv();
1958 unsigned CalleeCC= TheCall->getCallingConv();
1959 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
1960 SDValue Callee = TheCall->getCallee();
1961 // Tail calls are supported here except on x86-64 PIC/GOT targets,
1962 // which are handled below.
1963 if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
1964 !Subtarget->isPICStyleGOT() || !Subtarget->is64Bit())
1965 return true;
1966 // Can only do local tail calls (in same module, hidden or protected) on
1967 // x86_64 PIC/GOT at the moment.
1968 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1969 return G->getGlobal()->hasHiddenVisibility()
1970 || G->getGlobal()->hasProtectedVisibility();
1978 X86TargetLowering::createFastISel(MachineFunction &mf,
1979 MachineModuleInfo *mmo,
1981 DenseMap<const Value *, unsigned> &vm,
1982 DenseMap<const BasicBlock *,
1983 MachineBasicBlock *> &bm,
1984 DenseMap<const AllocaInst *, int> &am
1986 , SmallSet<Instruction*, 8> &cil
1989 return X86::createFastISel(mf, mmo, dw, vm, bm, am
1997 //===----------------------------------------------------------------------===//
1998 // Other Lowering Hooks
1999 //===----------------------------------------------------------------------===//
2002 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
2003 MachineFunction &MF = DAG.getMachineFunction();
2004 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2005 int ReturnAddrIndex = FuncInfo->getRAIndex();
2007 if (ReturnAddrIndex == 0) {
2008 // Set up a frame object for the return address.
2009 uint64_t SlotSize = TD->getPointerSize();
2010 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
2011 FuncInfo->setRAIndex(ReturnAddrIndex);
2014 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2018 /// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the X86
2019 /// specific condition code, returning the condition code and the LHS/RHS of the
2020 /// comparison to make.
2021 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2022 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2024 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2025 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2026 // X > -1 -> X == 0, jump !sign.
2027 RHS = DAG.getConstant(0, RHS.getValueType());
2028 return X86::COND_NS;
2029 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2030 // X < 0 -> X == 0, jump on sign.
2031 return X86::COND_S;
2032 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2033 // X < 1 -> X <= 0
2034 RHS = DAG.getConstant(0, RHS.getValueType());
2035 return X86::COND_LE;
2039 switch (SetCCOpcode) {
2040 default: assert(0 && "Invalid integer condition!");
2041 case ISD::SETEQ: return X86::COND_E;
2042 case ISD::SETGT: return X86::COND_G;
2043 case ISD::SETGE: return X86::COND_GE;
2044 case ISD::SETLT: return X86::COND_L;
2045 case ISD::SETLE: return X86::COND_LE;
2046 case ISD::SETNE: return X86::COND_NE;
2047 case ISD::SETULT: return X86::COND_B;
2048 case ISD::SETUGT: return X86::COND_A;
2049 case ISD::SETULE: return X86::COND_BE;
2050 case ISD::SETUGE: return X86::COND_AE;
2054 // First determine if it is required or is profitable to flip the operands.
2056 // If LHS is a foldable load, but RHS is not, flip the condition.
2057 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2058 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2059 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2060 std::swap(LHS, RHS);
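// Example of why the flips below are needed: SETOLT (ordered '<') must be
// false on NaN, but after a ucomiss an unordered result sets ZF=PF=CF=1, so
// testing 'jb' (CF==1) would wrongly fire. Swapping the operands turns
// X < Y into Y > X, which is tested with 'ja' (CF==0 && ZF==0) and is
// correctly false for unordered inputs.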
2063 switch (SetCCOpcode) {
2069 std::swap(LHS, RHS);
2073 // On a floating point condition, the flags are set as follows:
2074 //  ZF  PF  CF   op
2075 //   0 | 0 | 0 | X > Y
2076 //   0 | 0 | 1 | X < Y
2077 //   1 | 0 | 0 | X == Y
2078 //   1 | 1 | 1 | unordered
2079 switch (SetCCOpcode) {
2080 default: assert(0 && "Condcode should be pre-legalized away");
2082 case ISD::SETEQ: return X86::COND_E;
2083 case ISD::SETOLT: // flipped
2085 case ISD::SETGT: return X86::COND_A;
2086 case ISD::SETOLE: // flipped
2088 case ISD::SETGE: return X86::COND_AE;
2089 case ISD::SETUGT: // flipped
2091 case ISD::SETLT: return X86::COND_B;
2092 case ISD::SETUGE: // flipped
2094 case ISD::SETLE: return X86::COND_BE;
2096 case ISD::SETNE: return X86::COND_NE;
2097 case ISD::SETUO: return X86::COND_P;
2098 case ISD::SETO: return X86::COND_NP;
2102 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
2103 /// code. Current x86 isa includes the following FP cmov instructions:
2104 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2105 static bool hasFPCMov(unsigned X86CC) {
2106 switch (X86CC) {
2107 default:
2108 return false;
2109 case X86::COND_B:
2110 case X86::COND_BE:
2111 case X86::COND_E:
2112 case X86::COND_P:
2113 case X86::COND_A:
2114 case X86::COND_AE:
2115 case X86::COND_NE:
2116 case X86::COND_NP:
2117 return true;
2118 }
2119 }
2121 /// isUndefOrInRange - Op is either an undef node or a ConstantSDNode. Return
2122 /// true if Op is undef or if its value falls within the range [Low, Hi).
2123 static bool isUndefOrInRange(SDValue Op, unsigned Low, unsigned Hi) {
2124 if (Op.getOpcode() == ISD::UNDEF)
2127 unsigned Val = cast<ConstantSDNode>(Op)->getZExtValue();
2128 return (Val >= Low && Val < Hi);
2131 /// isUndefOrEqual - Op is either an undef node or a ConstantSDNode. Return
2132 /// true if Op is undef or if its value is equal to the specified value.
2133 static bool isUndefOrEqual(SDValue Op, unsigned Val) {
2134 if (Op.getOpcode() == ISD::UNDEF)
2136 return cast<ConstantSDNode>(Op)->getZExtValue() == Val;
2139 /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
2140 /// specifies a shuffle of elements that is suitable for input to PSHUFD.
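/// For example, on v4i32 the mask <2,3,0,1> is a valid PSHUFD shuffle (every
/// index refers to the first vector), while <0,4,1,5> is not, since index 4
/// refers to the second vector.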
2141 bool X86::isPSHUFDMask(SDNode *N) {
2142 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2144 if (N->getNumOperands() != 2 && N->getNumOperands() != 4)
2147 // Check if the value doesn't reference the second vector.
2148 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2149 SDValue Arg = N->getOperand(i);
2150 if (Arg.getOpcode() == ISD::UNDEF) continue;
2151 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2152 if (cast<ConstantSDNode>(Arg)->getZExtValue() >= e)
2159 /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
2160 /// specifies a shuffle of elements that is suitable for input to PSHUFHW.
2161 bool X86::isPSHUFHWMask(SDNode *N) {
2162 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2164 if (N->getNumOperands() != 8)
2167 // Lower quadword copied in order.
2168 for (unsigned i = 0; i != 4; ++i) {
2169 SDValue Arg = N->getOperand(i);
2170 if (Arg.getOpcode() == ISD::UNDEF) continue;
2171 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2172 if (cast<ConstantSDNode>(Arg)->getZExtValue() != i)
2176 // Upper quadword shuffled.
2177 for (unsigned i = 4; i != 8; ++i) {
2178 SDValue Arg = N->getOperand(i);
2179 if (Arg.getOpcode() == ISD::UNDEF) continue;
2180 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2181 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2182 if (Val < 4 || Val > 7)
2189 /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
2190 /// specifies a shuffle of elements that is suitable for input to PSHUFLW.
2191 bool X86::isPSHUFLWMask(SDNode *N) {
2192 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2194 if (N->getNumOperands() != 8)
2197 // Upper quadword copied in order.
2198 for (unsigned i = 4; i != 8; ++i)
2199 if (!isUndefOrEqual(N->getOperand(i), i))
2202 // Lower quadword shuffled.
2203 for (unsigned i = 0; i != 4; ++i)
2204 if (!isUndefOrInRange(N->getOperand(i), 0, 4))
2210 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2211 /// specifies a shuffle of elements that is suitable for input to SHUFP*.
2212 template<class SDOperand>
2213 static bool isSHUFPMask(SDOperand *Elems, unsigned NumElems) {
2214 if (NumElems != 2 && NumElems != 4) return false;
2216 unsigned Half = NumElems / 2;
2217 for (unsigned i = 0; i < Half; ++i)
2218 if (!isUndefOrInRange(Elems[i], 0, NumElems))
2220 for (unsigned i = Half; i < NumElems; ++i)
2221 if (!isUndefOrInRange(Elems[i], NumElems, NumElems*2))
2227 bool X86::isSHUFPMask(SDNode *N) {
2228 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2229 return ::isSHUFPMask(N->op_begin(), N->getNumOperands());
2232 /// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2233 /// the reverse of what x86 shuffles want. x86 shuffles requires the lower
2234 /// half elements to come from vector 1 (which would equal the dest.) and
2235 /// the upper half to come from vector 2.
2236 template<class SDOperand>
2237 static bool isCommutedSHUFP(SDOperand *Ops, unsigned NumOps) {
2238 if (NumOps != 2 && NumOps != 4) return false;
2240 unsigned Half = NumOps / 2;
2241 for (unsigned i = 0; i < Half; ++i)
2242 if (!isUndefOrInRange(Ops[i], NumOps, NumOps*2))
2244 for (unsigned i = Half; i < NumOps; ++i)
2245 if (!isUndefOrInRange(Ops[i], 0, NumOps))
2250 static bool isCommutedSHUFP(SDNode *N) {
2251 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2252 return isCommutedSHUFP(N->op_begin(), N->getNumOperands());
2255 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2256 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
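/// In instruction terms, 'movhlps %xmm2, %xmm1' (AT&T syntax) copies the high
/// quadword of %xmm2 into the low quadword of %xmm1, so on v4f32 the result
/// is <V2[2], V2[3], V1[2], V1[3]>, i.e. the mask <6,7,2,3> checked below.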
2257 bool X86::isMOVHLPSMask(SDNode *N) {
2258 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2260 if (N->getNumOperands() != 4)
2263 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
2264 return isUndefOrEqual(N->getOperand(0), 6) &&
2265 isUndefOrEqual(N->getOperand(1), 7) &&
2266 isUndefOrEqual(N->getOperand(2), 2) &&
2267 isUndefOrEqual(N->getOperand(3), 3);
2270 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2271 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2273 bool X86::isMOVHLPS_v_undef_Mask(SDNode *N) {
2274 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2276 if (N->getNumOperands() != 4)
2279 // Expect bit0 == 2, bit1 == 3, bit2 == 2, bit3 == 3
2280 return isUndefOrEqual(N->getOperand(0), 2) &&
2281 isUndefOrEqual(N->getOperand(1), 3) &&
2282 isUndefOrEqual(N->getOperand(2), 2) &&
2283 isUndefOrEqual(N->getOperand(3), 3);
2286 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2287 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2288 bool X86::isMOVLPMask(SDNode *N) {
2289 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2291 unsigned NumElems = N->getNumOperands();
2292 if (NumElems != 2 && NumElems != 4)
2295 for (unsigned i = 0; i < NumElems/2; ++i)
2296 if (!isUndefOrEqual(N->getOperand(i), i + NumElems))
2299 for (unsigned i = NumElems/2; i < NumElems; ++i)
2300 if (!isUndefOrEqual(N->getOperand(i), i))
2306 /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
2307 /// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
2309 bool X86::isMOVHPMask(SDNode *N) {
2310 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2312 unsigned NumElems = N->getNumOperands();
2313 if (NumElems != 2 && NumElems != 4)
2316 for (unsigned i = 0; i < NumElems/2; ++i)
2317 if (!isUndefOrEqual(N->getOperand(i), i))
2320 for (unsigned i = 0; i < NumElems/2; ++i) {
2321 SDValue Arg = N->getOperand(i + NumElems/2);
2322 if (!isUndefOrEqual(Arg, i + NumElems))
2329 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2330 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
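/// For example, on v4f32 unpcklps interleaves the low halves of its inputs,
/// producing <V1[0], V2[0], V1[1], V2[1]>, i.e. the mask <0,4,1,5>; when
/// V2IsSplat, the odd entries may instead all point at V2's first element.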
2331 template<class SDOperand>
2332 bool static isUNPCKLMask(SDOperand *Elts, unsigned NumElts,
2333 bool V2IsSplat = false) {
2334 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2337 for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
2338 SDValue BitI = Elts[i];
2339 SDValue BitI1 = Elts[i+1];
2340 if (!isUndefOrEqual(BitI, j))
2343 if (!isUndefOrEqual(BitI1, NumElts))
2346 if (!isUndefOrEqual(BitI1, j + NumElts))
2354 bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) {
2355 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2356 return ::isUNPCKLMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
2359 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2360 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
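/// For example, on v4f32 unpckhps interleaves the high halves of its inputs,
/// producing <V1[2], V2[2], V1[3], V2[3]>, i.e. the mask <2,6,3,7>.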
2361 template<class SDOperand>
2362 bool static isUNPCKHMask(SDOperand *Elts, unsigned NumElts,
2363 bool V2IsSplat = false) {
2364 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2367 for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
2368 SDValue BitI = Elts[i];
2369 SDValue BitI1 = Elts[i+1];
2370 if (!isUndefOrEqual(BitI, j + NumElts/2))
2373 if (!isUndefOrEqual(BitI1, NumElts))
2376 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2384 bool X86::isUNPCKHMask(SDNode *N, bool V2IsSplat) {
2385 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2386 return ::isUNPCKHMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
2389 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2390 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2392 bool X86::isUNPCKL_v_undef_Mask(SDNode *N) {
2393 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2395 unsigned NumElems = N->getNumOperands();
2396 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2399 for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
2400 SDValue BitI = N->getOperand(i);
2401 SDValue BitI1 = N->getOperand(i+1);
2403 if (!isUndefOrEqual(BitI, j))
2405 if (!isUndefOrEqual(BitI1, j))
2412 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2413 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2415 bool X86::isUNPCKH_v_undef_Mask(SDNode *N) {
2416 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2418 unsigned NumElems = N->getNumOperands();
2419 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2422 for (unsigned i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2423 SDValue BitI = N->getOperand(i);
2424 SDValue BitI1 = N->getOperand(i + 1);
2426 if (!isUndefOrEqual(BitI, j))
2428 if (!isUndefOrEqual(BitI1, j))
2435 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2436 /// specifies a shuffle of elements that is suitable for input to MOVSS,
2437 /// MOVSD, and MOVD, i.e. setting the lowest element.
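/// For example, on v4f32 the mask <4,1,2,3> matches movss: element 0 is taken
/// from V2 and the remaining elements are taken from V1 unchanged.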
2438 template<class SDOperand>
2439 static bool isMOVLMask(SDOperand *Elts, unsigned NumElts) {
2440 if (NumElts != 2 && NumElts != 4)
2443 if (!isUndefOrEqual(Elts[0], NumElts))
2446 for (unsigned i = 1; i < NumElts; ++i) {
2447 if (!isUndefOrEqual(Elts[i], i))
2454 bool X86::isMOVLMask(SDNode *N) {
2455 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2456 return ::isMOVLMask(N->op_begin(), N->getNumOperands());
2459 /// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
2460 /// x86 movss wants: x86 movss requires the lowest element to come from
2461 /// vector 2 and the remaining elements to come from vector 1, in order.
2462 template<class SDOperand>
2463 static bool isCommutedMOVL(SDOperand *Ops, unsigned NumOps,
2464 bool V2IsSplat = false,
2465 bool V2IsUndef = false) {
2466 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
2469 if (!isUndefOrEqual(Ops[0], 0))
2472 for (unsigned i = 1; i < NumOps; ++i) {
2473 SDValue Arg = Ops[i];
2474 if (!(isUndefOrEqual(Arg, i+NumOps) ||
2475 (V2IsUndef && isUndefOrInRange(Arg, NumOps, NumOps*2)) ||
2476 (V2IsSplat && isUndefOrEqual(Arg, NumOps))))
2483 static bool isCommutedMOVL(SDNode *N, bool V2IsSplat = false,
2484 bool V2IsUndef = false) {
2485 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2486 return isCommutedMOVL(N->op_begin(), N->getNumOperands(),
2487 V2IsSplat, V2IsUndef);
2490 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2491 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
2492 bool X86::isMOVSHDUPMask(SDNode *N) {
2493 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2495 if (N->getNumOperands() != 4)
2498 // Expect 1, 1, 3, 3
2499 for (unsigned i = 0; i < 2; ++i) {
2500 SDValue Arg = N->getOperand(i);
2501 if (Arg.getOpcode() == ISD::UNDEF) continue;
2502 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2503 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2504 if (Val != 1) return false;
2508 for (unsigned i = 2; i < 4; ++i) {
2509 SDValue Arg = N->getOperand(i);
2510 if (Arg.getOpcode() == ISD::UNDEF) continue;
2511 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2512 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2513 if (Val != 3) return false;
2517 // Don't use movshdup if it can be done with a shufps.
2521 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2522 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
2523 bool X86::isMOVSLDUPMask(SDNode *N) {
2524 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2526 if (N->getNumOperands() != 4)
2529 // Expect 0, 0, 2, 2
2530 for (unsigned i = 0; i < 2; ++i) {
2531 SDValue Arg = N->getOperand(i);
2532 if (Arg.getOpcode() == ISD::UNDEF) continue;
2533 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2534 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2535 if (Val != 0) return false;
2539 for (unsigned i = 2; i < 4; ++i) {
2540 SDValue Arg = N->getOperand(i);
2541 if (Arg.getOpcode() == ISD::UNDEF) continue;
2542 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2543 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2544 if (Val != 2) return false;
2548 // Don't use movsldup if it can be done with a shufps.
2552 /// isIdentityMask - Return true if the specified VECTOR_SHUFFLE operand
2553 /// specifies an identity operation on the LHS or RHS.
2554 static bool isIdentityMask(SDNode *N, bool RHS = false) {
2555 unsigned NumElems = N->getNumOperands();
2556 for (unsigned i = 0; i < NumElems; ++i)
2557 if (!isUndefOrEqual(N->getOperand(i), i + (RHS ? NumElems : 0)))
2562 /// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
2563 /// a splat of a single element.
2564 static bool isSplatMask(SDNode *N) {
2565 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2567 // This is a splat operation if each element of the permute is the same, and
2568 // if the value doesn't reference the second vector.
2569 unsigned NumElems = N->getNumOperands();
2570 SDValue ElementBase;
2572 for (; i != NumElems; ++i) {
2573 SDValue Elt = N->getOperand(i);
2574 if (isa<ConstantSDNode>(Elt)) {
2580 if (!ElementBase.getNode())
2583 for (; i != NumElems; ++i) {
2584 SDValue Arg = N->getOperand(i);
2585 if (Arg.getOpcode() == ISD::UNDEF) continue;
2586 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2587 if (Arg != ElementBase) return false;
2590 // Make sure it is a splat of the first vector operand.
2591 return cast<ConstantSDNode>(ElementBase)->getZExtValue() < NumElems;
2594 /// getSplatMaskEltNo - Given a splat mask, return the index to the element
2595 /// we want to splat.
2596 static SDValue getSplatMaskEltNo(SDNode *N) {
2597 assert(isSplatMask(N) && "Not a splat mask");
2598 unsigned NumElems = N->getNumOperands();
2599 SDValue ElementBase;
2601 for (; i != NumElems; ++i) {
2602 SDValue Elt = N->getOperand(i);
2603 if (isa<ConstantSDNode>(Elt))
2606 assert(0 && "No splat value found!");
2611 /// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
2612 /// a splat of a single element and it's a 2 or 4 element mask.
2613 bool X86::isSplatMask(SDNode *N) {
2614 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2616 // We can only splat 64-bit, and 32-bit quantities with a single instruction.
2617 if (N->getNumOperands() != 4 && N->getNumOperands() != 2)
2619 return ::isSplatMask(N);
2622 /// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
2623 /// specifies a splat of element zero.
2624 bool X86::isSplatLoMask(SDNode *N) {
2625 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2627 for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
2628 if (!isUndefOrEqual(N->getOperand(i), 0))
2633 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2634 /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
2635 bool X86::isMOVDDUPMask(SDNode *N) {
2636 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2638 unsigned e = N->getNumOperands() / 2;
2639 for (unsigned i = 0; i < e; ++i)
2640 if (!isUndefOrEqual(N->getOperand(i), i))
2642 for (unsigned i = 0; i < e; ++i)
2643 if (!isUndefOrEqual(N->getOperand(e+i), i))
2648 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
2649 /// the specified isShuffleMask VECTOR_SHUFFLE mask with the PSHUF* and SHUFP*
2650 /// instructions.
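/// Worked example: for the 4-element mask <3,2,1,0>, the loop below walks the
/// operands in reverse and packs two bits per element, producing
/// (0<<6)|(1<<4)|(2<<2)|3 = 0x1B, the immediate that 'pshufd $0x1b' uses to
/// reverse a 4-element vector.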
2651 unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
2652 unsigned NumOperands = N->getNumOperands();
2653 unsigned Shift = (NumOperands == 4) ? 2 : 1;
2655 for (unsigned i = 0; i < NumOperands; ++i) {
2657 SDValue Arg = N->getOperand(NumOperands-i-1);
2658 if (Arg.getOpcode() != ISD::UNDEF)
2659 Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2660 if (Val >= NumOperands) Val -= NumOperands;
2662 if (i != NumOperands - 1)
2669 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
2670 /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
2672 unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
2674 // 8 nodes, but we only care about the last 4.
2675 for (unsigned i = 7; i >= 4; --i) {
2677 SDValue Arg = N->getOperand(i);
2678 if (Arg.getOpcode() != ISD::UNDEF) {
2679 Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2689 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
2690 /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
2692 unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
2694 // 8 nodes, but we only care about the first 4.
2695 for (int i = 3; i >= 0; --i) {
2697 SDValue Arg = N->getOperand(i);
2698 if (Arg.getOpcode() != ISD::UNDEF)
2699 Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2708 /// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand
2709 /// specifies an 8 element shuffle that can be broken into a pair of
2710 /// PSHUFHW and PSHUFLW.
2711 static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
2712 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2714 if (N->getNumOperands() != 8)
2717 // Lower quadword shuffled.
2718 for (unsigned i = 0; i != 4; ++i) {
2719 SDValue Arg = N->getOperand(i);
2720 if (Arg.getOpcode() == ISD::UNDEF) continue;
2721 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2722 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2727 // Upper quadword shuffled.
2728 for (unsigned i = 4; i != 8; ++i) {
2729 SDValue Arg = N->getOperand(i);
2730 if (Arg.getOpcode() == ISD::UNDEF) continue;
2731 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2732 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2733 if (Val < 4 || Val > 7)
2740 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as
2741 /// the values in their permute mask.
2742 static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1,
2743 SDValue &V2, SDValue &Mask,
2744 SelectionDAG &DAG) {
2745 MVT VT = Op.getValueType();
2746 MVT MaskVT = Mask.getValueType();
2747 MVT EltVT = MaskVT.getVectorElementType();
2748 unsigned NumElems = Mask.getNumOperands();
2749 SmallVector<SDValue, 8> MaskVec;
2750 DebugLoc dl = Op.getDebugLoc();
2752 for (unsigned i = 0; i != NumElems; ++i) {
2753 SDValue Arg = Mask.getOperand(i);
2754 if (Arg.getOpcode() == ISD::UNDEF) {
2755 MaskVec.push_back(DAG.getUNDEF(EltVT));
2758 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2759 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2761 MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
2763 MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT));
2767 Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], NumElems);
2768 return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, Mask);
2771 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
2772 /// the two vector operands have swapped position.
2774 SDValue CommuteVectorShuffleMask(SDValue Mask, SelectionDAG &DAG, DebugLoc dl) {
2775 MVT MaskVT = Mask.getValueType();
2776 MVT EltVT = MaskVT.getVectorElementType();
2777 unsigned NumElems = Mask.getNumOperands();
2778 SmallVector<SDValue, 8> MaskVec;
2779 for (unsigned i = 0; i != NumElems; ++i) {
2780 SDValue Arg = Mask.getOperand(i);
2781 if (Arg.getOpcode() == ISD::UNDEF) {
2782 MaskVec.push_back(DAG.getUNDEF(EltVT));
2785 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2786 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2788 MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
2790 MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT));
2792 return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], NumElems);
2796 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
2797 /// match movhlps. The lower half elements should come from upper half of
2798 /// V1 (and in order), and the upper half elements should come from the upper
2799 /// half of V2 (and in order).
2800 static bool ShouldXformToMOVHLPS(SDNode *Mask) {
2801 unsigned NumElems = Mask->getNumOperands();
2804 for (unsigned i = 0, e = 2; i != e; ++i)
2805 if (!isUndefOrEqual(Mask->getOperand(i), i+2))
2807 for (unsigned i = 2; i != 4; ++i)
2808 if (!isUndefOrEqual(Mask->getOperand(i), i+4))
2813 /// isScalarLoadToVector - Returns true if the node is a scalar load that
2814 /// is promoted to a vector. It also returns the LoadSDNode by reference if
2815 /// required.
2816 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
2817 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
2819 N = N->getOperand(0).getNode();
2820 if (!ISD::isNON_EXTLoad(N))
2823 *LD = cast<LoadSDNode>(N);
2827 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
2828 /// match movlp{s|d}. The lower half elements should come from lower half of
2829 /// V1 (and in order), and the upper half elements should come from the upper
2830 /// half of V2 (and in order). And since V1 will become the source of the
2831 /// MOVLP, it must be either a vector load or a scalar load to vector.
2832 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, SDNode *Mask) {
2833 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
2835 // If V2 is a vector load, don't do this transformation; we will try to use
2836 // a load-folding shufps instead.
2837 if (ISD::isNON_EXTLoad(V2))
2840 unsigned NumElems = Mask->getNumOperands();
2841 if (NumElems != 2 && NumElems != 4)
2843 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
2844 if (!isUndefOrEqual(Mask->getOperand(i), i))
2846 for (unsigned i = NumElems/2; i != NumElems; ++i)
2847 if (!isUndefOrEqual(Mask->getOperand(i), i+NumElems))
2852 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
2853 /// all the same.
2854 static bool isSplatVector(SDNode *N) {
2855 if (N->getOpcode() != ISD::BUILD_VECTOR)
2858 SDValue SplatValue = N->getOperand(0);
2859 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
2860 if (N->getOperand(i) != SplatValue)
2865 /// isUndefShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2866 /// to an undef.
2867 static bool isUndefShuffle(SDNode *N) {
2868 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
2871 SDValue V1 = N->getOperand(0);
2872 SDValue V2 = N->getOperand(1);
2873 SDValue Mask = N->getOperand(2);
2874 unsigned NumElems = Mask.getNumOperands();
2875 for (unsigned i = 0; i != NumElems; ++i) {
2876 SDValue Arg = Mask.getOperand(i);
2877 if (Arg.getOpcode() != ISD::UNDEF) {
2878 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2879 if (Val < NumElems && V1.getOpcode() != ISD::UNDEF)
2881 else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF)
2888 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
2889 /// constant +0.0.
2890 static inline bool isZeroNode(SDValue Elt) {
2891 return ((isa<ConstantSDNode>(Elt) &&
2892 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
2893 (isa<ConstantFPSDNode>(Elt) &&
2894 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
2897 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2898 /// to a zero vector.
2899 static bool isZeroShuffle(SDNode *N) {
2900 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
2903 SDValue V1 = N->getOperand(0);
2904 SDValue V2 = N->getOperand(1);
2905 SDValue Mask = N->getOperand(2);
2906 unsigned NumElems = Mask.getNumOperands();
2907 for (unsigned i = 0; i != NumElems; ++i) {
2908 SDValue Arg = Mask.getOperand(i);
2909 if (Arg.getOpcode() == ISD::UNDEF)
2912 unsigned Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
2913 if (Idx < NumElems) {
2914 unsigned Opc = V1.getNode()->getOpcode();
2915 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
2917 if (Opc != ISD::BUILD_VECTOR ||
2918 !isZeroNode(V1.getNode()->getOperand(Idx)))
2920 } else if (Idx >= NumElems) {
2921 unsigned Opc = V2.getNode()->getOpcode();
2922 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
2924 if (Opc != ISD::BUILD_VECTOR ||
2925 !isZeroNode(V2.getNode()->getOperand(Idx - NumElems)))
2932 /// getZeroVector - Returns a vector of specified type with all zero elements.
2934 static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG,
2936 assert(VT.isVector() && "Expected a vector type");
2938 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2939 // type. This ensures they get CSE'd.
2941 if (VT.getSizeInBits() == 64) { // MMX
2942 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2943 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
2944 } else if (HasSSE2) { // SSE2
2945 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2946 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
2948 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
2949 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
2951 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
2954 /// getOnesVector - Returns a vector of specified type with all bits set.
2956 static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
2957 assert(VT.isVector() && "Expected a vector type");
2959 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2960 // type. This ensures they get CSE'd.
2961 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
2963 if (VT.getSizeInBits() == 64) // MMX
2964 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
2966 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
2967 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
2971 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
2972 /// that point to V2 point to its first element.
2973 static SDValue NormalizeMask(SDValue Mask, SelectionDAG &DAG) {
2974 assert(Mask.getOpcode() == ISD::BUILD_VECTOR);
2976 bool Changed = false;
2977 SmallVector<SDValue, 8> MaskVec;
2978 unsigned NumElems = Mask.getNumOperands();
2979 for (unsigned i = 0; i != NumElems; ++i) {
2980 SDValue Arg = Mask.getOperand(i);
2981 if (Arg.getOpcode() != ISD::UNDEF) {
2982 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2983 if (Val > NumElems) {
2984 Arg = DAG.getConstant(NumElems, Arg.getValueType());
2988 MaskVec.push_back(Arg);
2992 Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getDebugLoc(),
2993 Mask.getValueType(),
2994 &MaskVec[0], MaskVec.size());
2998 /// getMOVLMask - Returns a vector_shuffle mask for a movs{s|d}, movd
2999 /// operation of specified width.
3000 static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG, DebugLoc dl) {
3001 MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
3002 MVT BaseVT = MaskVT.getVectorElementType();
3004 SmallVector<SDValue, 8> MaskVec;
3005 MaskVec.push_back(DAG.getConstant(NumElems, BaseVT));
3006 for (unsigned i = 1; i != NumElems; ++i)
3007 MaskVec.push_back(DAG.getConstant(i, BaseVT));
3008 return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
3009 &MaskVec[0], MaskVec.size());
3012 /// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation
3013 /// of specified width.
3014 static SDValue getUnpacklMask(unsigned NumElems, SelectionDAG &DAG,
3016 MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
3017 MVT BaseVT = MaskVT.getVectorElementType();
3018 SmallVector<SDValue, 8> MaskVec;
3019 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3020 MaskVec.push_back(DAG.getConstant(i, BaseVT));
3021 MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT));
3023 return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
3024 &MaskVec[0], MaskVec.size());
3027 /// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
3028 /// of specified width.
3029 static SDValue getUnpackhMask(unsigned NumElems, SelectionDAG &DAG,
3031 MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
3032 MVT BaseVT = MaskVT.getVectorElementType();
3033 unsigned Half = NumElems/2;
3034 SmallVector<SDValue, 8> MaskVec;
3035 for (unsigned i = 0; i != Half; ++i) {
3036 MaskVec.push_back(DAG.getConstant(i + Half, BaseVT));
3037 MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
3039 return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
3040 &MaskVec[0], MaskVec.size());
3043 /// getSwapEltZeroMask - Returns a vector_shuffle mask for a shuffle that swaps
3044 /// element #0 of a vector with the element at the specified index, leaving
3045 /// the rest of the elements in place.
3046 static SDValue getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
3047 SelectionDAG &DAG, DebugLoc dl) {
3048 MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
3049 MVT BaseVT = MaskVT.getVectorElementType();
3050 SmallVector<SDValue, 8> MaskVec;
3051 // Element #0 of the result gets the elt we are replacing.
3052 MaskVec.push_back(DAG.getConstant(DestElt, BaseVT));
3053 for (unsigned i = 1; i != NumElems; ++i)
3054 MaskVec.push_back(DAG.getConstant(i == DestElt ? 0 : i, BaseVT));
3055 return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
3056 &MaskVec[0], MaskVec.size());
3059 /// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
3060 static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
3061 MVT PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32;
3062 MVT VT = Op.getValueType();
3065 SDValue V1 = Op.getOperand(0);
3066 SDValue Mask = Op.getOperand(2);
3067 unsigned MaskNumElems = Mask.getNumOperands();
3068 unsigned NumElems = MaskNumElems;
3069 DebugLoc dl = Op.getDebugLoc();
3070 // Special handling of v4f32 -> v4i32.
3071 if (VT != MVT::v4f32) {
3072 // Find which element we want to splat.
3073 SDNode* EltNoNode = getSplatMaskEltNo(Mask.getNode()).getNode();
3074 unsigned EltNo = cast<ConstantSDNode>(EltNoNode)->getZExtValue();
3075 // unpack elements to the correct location
3076 while (NumElems > 4) {
3077 if (EltNo < NumElems/2) {
3078 Mask = getUnpacklMask(MaskNumElems, DAG, dl);
3080 Mask = getUnpackhMask(MaskNumElems, DAG, dl);
3081 EltNo -= NumElems/2;
3083 V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V1, Mask);
3086 SDValue Cst = DAG.getConstant(EltNo, MVT::i32);
3087 Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3090 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3091 SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1,
3092 DAG.getUNDEF(PVT), Mask);
3093 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle);
3096 /// isVectorLoad - Returns true if the node is a vector load, a scalar
3097 /// load that is promoted to a vector, or a load that is bitcast to a vector.
3098 static bool isVectorLoad(SDValue Op) {
3099 assert(Op.getValueType().isVector() && "Expected a vector type");
3100 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR ||
3101 Op.getOpcode() == ISD::BIT_CONVERT) {
3102 return isa<LoadSDNode>(Op.getOperand(0));
3104 return isa<LoadSDNode>(Op);

/// CanonicalizeMovddup - Canonicalize a movddup shuffle to v2f64.
///
static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask,
                                   SelectionDAG &DAG, bool HasSSE3) {
  // If we have sse3 and the shuffle has more than one use or the input is a
  // load, then use movddup. Otherwise, use movlhps.
  bool UseMovddup = HasSSE3 && (!Op.hasOneUse() || isVectorLoad(V1));
  MVT PVT = UseMovddup ? MVT::v2f64 : MVT::v4f32;
  MVT VT = Op.getValueType();
  if (VT == PVT)
    return Op;
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = PVT.getVectorNumElements();
  if (NumElems == 2) {
    // movddup: duplicate the low 64-bit element.
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else {
    assert(NumElems == 4);
    // movlhps: duplicate the two low 32-bit elements.
    SDValue Cst0 = DAG.getTargetConstant(0, MVT::i32);
    SDValue Cst1 = DAG.getTargetConstant(1, MVT::i32);
    Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                       Cst0, Cst1, Cst0, Cst1);
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1,
                                DAG.getUNDEF(PVT), Mask);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle);
}
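
// For illustration, the two strategies in intrinsics form (a sketch):
//   __m128d dup_pd(__m128d v) { return _mm_movedup_pd(v); }   // SSE3 movddup
//   __m128  dup_ps(__m128  v) { return _mm_movelh_ps(v, v); } // SSE1 movlhps
// Both replicate the low 64 bits of the source across the register.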

/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector of zero or undef vector.  This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                           bool isZero, bool HasSSE2,
                                           SelectionDAG &DAG) {
  DebugLoc dl = V2.getDebugLoc();
  MVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, dl) : DAG.getUNDEF(VT);
  unsigned NumElems = V2.getValueType().getVectorNumElements();
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT EVT = MaskVT.getVectorElementType();
  SmallVector<SDValue, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    if (i == Idx)  // If this is the insertion idx, put the low elt of V2 here.
      MaskVec.push_back(DAG.getConstant(NumElems, EVT));
    else
      MaskVec.push_back(DAG.getConstant(i, EVT));
  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                             &MaskVec[0], MaskVec.size());
  return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, Mask);
}

/// getNumOfConsecutiveZeros - Return the number of elements in a result of
/// a shuffle that is zero.
static
unsigned getNumOfConsecutiveZeros(SDValue Op, SDValue Mask,
                                  unsigned NumElems, bool Low,
                                  SelectionDAG &DAG) {
  unsigned NumZeros = 0;
  // Walk from the low (or high) end and count elements known to be zero.
  for (unsigned i = 0; i < NumElems; ++i) {
    unsigned Index = Low ? i : NumElems-i-1;
    SDValue Idx = Mask.getOperand(Index);
    if (Idx.getOpcode() == ISD::UNDEF) {
      ++NumZeros;
      continue;
    }
    SDValue Elt = DAG.getShuffleScalarElt(Op.getNode(), Index);
    if (Elt.getNode() && isZeroNode(Elt))
      ++NumZeros;
    else
      break;
  }
  return NumZeros;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(SDValue Op, SDValue Mask, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = Mask.getNumOperands();

  // Zeros at the front of the result mean a left shift; zeros at the back
  // mean a right shift.
  isLeft = true;
  unsigned NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
  if (!NumZeros) {
    isLeft = false;
    NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
    if (!NumZeros)
      return false;
  }

  // The non-zero elements must form a consecutive run from exactly one of
  // the two input vectors.
  bool SeenV1 = false;
  bool SeenV2 = false;
  for (unsigned i = NumZeros; i < NumElems; ++i) {
    unsigned Val = isLeft ? (i - NumZeros) : i;
    SDValue Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));
    if (Idx.getOpcode() == ISD::UNDEF)
      continue;
    unsigned Index = cast<ConstantSDNode>(Idx)->getZExtValue();
    if (Index < NumElems)
      SeenV1 = true;
    else {
      Index -= NumElems;
      SeenV2 = true;
    }
    if (Index != Val)
      return false;
  }
  if (SeenV1 && SeenV2)
    return false;

  ShVal = SeenV1 ? Op.getOperand(0) : Op.getOperand(1);
  ShAmt = NumZeros;
  return true;
}
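
// For illustration: on v4i32, the mask <4, 0, 1, 2> with V2 known zero
// yields <0, V1[0], V1[1], V1[2]>, i.e. a whole-vector left shift by one
// 32-bit element, which is what pslldq $4 performs at the byte level.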

/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      // Combine bytes 2k and 2k+1 into one i16 and insert it as element k
      // of a v8i16, since there is no byte-sized pinsr before SSE4.1.
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
}
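
// For illustration: if bytes b0 and b1 of a pair are both non-zero, the loop
// above materializes the i16 value (b1 << 8) | b0 (little-endian byte order)
// and inserts it into halfword slot i/2.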

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, true, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }

  return V;
}

/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
                         unsigned NumBits, SelectionDAG &DAG,
                         const TargetLowering &TLI, DebugLoc dl) {
  bool isMMX = VT.getSizeInBits() == 64;
  MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(Opc, dl, ShVT, SrcOp,
                                 DAG.getConstant(NumBits,
                                                 TLI.getShiftAmountTy())));
}
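
// For illustration: getVShift(true, MVT::v4i32, V, 32, ...) bitcasts V to
// v2i64 and emits X86ISD::VSHL by 32 bits, which instruction selection
// matches as pslldq $4 (the isLeft == false flavor maps to psrldq).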

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  // All zeros are handled with pxor, all ones are handled with pcmpeqd.
  if (ISD::isBuildVectorAllZeros(Op.getNode())
      || ISD::isBuildVectorAllOnes(Op.getNode())) {
    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
    // eliminated on x86-32 hosts.
    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
      return Op;

    if (ISD::isBuildVectorAllOnes(Op.getNode()))
      return getOnesVector(Op.getValueType(), DAG, dl);
    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
  }

  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned EVTBits = EVT.getSizeInBits();

  unsigned NumElems = Op.getNumOperands();
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (isZeroNode(Elt))
      NumZero++;
    else {
      NonZeros |= (1 << i);
      NumNonZero++;
    }
  }

  if (NumNonZero == 0) {
    // All undef vector. Return an UNDEF. All zero vectors were handled above.
    return DAG.getUNDEF(VT);
  }

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1 && NumElems <= 4) {
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits
    // of the value are obviously zero, truncate the value to i32 and do the
    // insertion that way.  Only do this if the value is non-constant or if
    // the value is a constant being inserted into element 0.  It is cheaper
    // to do a constant pool load than it is to do a movd + shuffle.
    if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle MMX and SSE both.
        MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);

        // Now we have our 32-bit value zero extended in the low element of
        // a vector.  If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SDValue Ops[] = {
            Item, DAG.getUNDEF(Item.getValueType()),
            getSwapEltZeroMask(VecElts, Idx, DAG, dl)
          };
          Item = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VecVT, Ops, 3);
        }
        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
      }
    }

    // If we have a constant or non-constant insertion into the low element
    // of a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero
    // into the rest of the elements.  This will be matched as
    // movd/movq/movss/movsd depending on what the source datatype is.
    // Because we can only get here when NumElems <= 4, this only needs to
    // handle i32/f32/i64/f64.
    if (Idx == 0 &&
        // Don't do this for i64 values on x86-32.
        (EVT != MVT::i64 || Subtarget->is64Bit())) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
      return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the
    // element is a non-constant being inserted into an element other than
    // the low one, we can't use a constant pool load.  Instead, use
    // SCALAR_TO_VECTOR (aka movd/movss) to move this into the low element,
    // then shuffle it into place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
      MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
      MVT MaskEVT = MaskVT.getVectorElementType();
      SmallVector<SDValue, 8> MaskVec;
      for (unsigned i = 0; i < NumElems; i++)
        MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
      SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                 &MaskVec[0], MaskVec.size());
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, Item,
                         DAG.getUNDEF(VT), Mask);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1)
    return SDValue();

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true,
                                         Subtarget->hasSSE2(), DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2+1], V[i*2],
                             getMOVLMask(NumElems, DAG, dl));
          break;
        case 2:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1],
                             getMOVLMask(NumElems, DAG, dl));
          break;
        case 3:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1],
                             getUnpacklMask(NumElems, DAG, dl));
          break;
      }
    }

    MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
    MVT EVT = MaskVT.getVectorElementType();
    SmallVector<SDValue, 8> MaskVec;
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i, EVT));
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
    SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                   &MaskVec[0], MaskVec.size());
    return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[0], V[1], ShufMask);
  }

  if (Values.size() > 2) {
    // Expand into a number of unpckl*, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    SDValue UnpckMask = getUnpacklMask(NumElems, DAG, dl);
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    NumElems >>= 1;
    while (NumElems != 0) {
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i], V[i + NumElems],
                           UnpckMask);
      NumElems >>= 1;
    }
    return V[0];
  }

  return SDValue();
}

static
SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
                                 SDValue PermMask, SelectionDAG &DAG,
                                 TargetLowering &TLI, DebugLoc dl) {
  SDValue NewV;
  MVT MaskVT = MVT::getIntVectorWithNumElements(8);
  MVT MaskEVT = MaskVT.getVectorElementType();
  MVT PtrVT = TLI.getPointerTy();
  SmallVector<SDValue, 8> MaskElts(PermMask.getNode()->op_begin(),
                                   PermMask.getNode()->op_end());

  // First record which half of which vector the low elements come from.
  SmallVector<unsigned, 4> LowQuad(4);
  for (unsigned i = 0; i < 4; ++i) {
    SDValue Elt = MaskElts[i];
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
    int QuadIdx = EltIdx / 4;
    ++LowQuad[QuadIdx];
  }

  int BestLowQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LowQuad[i] > MaxQuad) {
      BestLowQuad = i;
      MaxQuad = LowQuad[i];
    }
  }

  // Record which half of which vector the high elements come from.
  SmallVector<unsigned, 4> HighQuad(4);
  for (unsigned i = 4; i < 8; ++i) {
    SDValue Elt = MaskElts[i];
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
    int QuadIdx = EltIdx / 4;
    ++HighQuad[QuadIdx];
  }

  int BestHighQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HighQuad[i] > MaxQuad) {
      BestHighQuad = i;
      MaxQuad = HighQuad[i];
    }
  }

  // If it's possible to sort parts of either half with PSHUF{H|L}W, then do it.
  if (BestLowQuad != -1 || BestHighQuad != -1) {
    // First sort the 4 chunks in order using shufpd.
    SmallVector<SDValue, 8> MaskVec;

    if (BestLowQuad != -1)
      MaskVec.push_back(DAG.getConstant(BestLowQuad, MVT::i32));
    else
      MaskVec.push_back(DAG.getConstant(0, MVT::i32));

    if (BestHighQuad != -1)
      MaskVec.push_back(DAG.getConstant(BestHighQuad, MVT::i32));
    else
      MaskVec.push_back(DAG.getConstant(1, MVT::i32));

    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32,
                               &MaskVec[0], 2);
    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64,
                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask);
    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);

    // Now sort high and low parts separately.
    BitVector InOrder(8);
    if (BestLowQuad != -1) {
      // Sort lower half in order using PSHUFLW.
      MaskVec.clear();
      bool AnyOutOrder = false;

      for (unsigned i = 0; i != 4; ++i) {
        SDValue Elt = MaskElts[i];
        if (Elt.getOpcode() == ISD::UNDEF) {
          MaskVec.push_back(Elt);
          InOrder.set(i);
        } else {
          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
          if (EltIdx != i)
            AnyOutOrder = true;

          MaskVec.push_back(DAG.getConstant(EltIdx % 4, MaskEVT));

          // If this element is in the right place after this shuffle, then
          // remember it.
          if ((int)(EltIdx / 4) == BestLowQuad)
            InOrder.set(i);
        }
      }

      if (AnyOutOrder) {
        for (unsigned i = 4; i != 8; ++i)
          MaskVec.push_back(DAG.getConstant(i, MaskEVT));
        SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                   &MaskVec[0], 8);
        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16,
                           NewV, NewV, Mask);
      }
    }

    if (BestHighQuad != -1) {
      // Sort high half in order using PSHUFHW if possible.
      MaskVec.clear();

      for (unsigned i = 0; i != 4; ++i)
        MaskVec.push_back(DAG.getConstant(i, MaskEVT));

      bool AnyOutOrder = false;
      for (unsigned i = 4; i != 8; ++i) {
        SDValue Elt = MaskElts[i];
        if (Elt.getOpcode() == ISD::UNDEF) {
          MaskVec.push_back(Elt);
          InOrder.set(i);
        } else {
          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
          if (EltIdx != i)
            AnyOutOrder = true;

          MaskVec.push_back(DAG.getConstant((EltIdx % 4) + 4, MaskEVT));

          // If this element is in the right place after this shuffle, then
          // remember it.
          if ((int)(EltIdx / 4) == BestHighQuad)
            InOrder.set(i);
        }
      }

      if (AnyOutOrder) {
        SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl,
                                   MaskVT, &MaskVec[0], 8);
        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16,
                           NewV, NewV, Mask);
      }
    }

    // The other elements are put in the right place using pextrw and pinsrw.
    for (unsigned i = 0; i != 8; ++i) {
      if (InOrder[i])
        continue;
      SDValue Elt = MaskElts[i];
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
      SDValue ExtOp = (EltIdx < 8)
        ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                      DAG.getConstant(EltIdx, PtrVT))
        : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                      DAG.getConstant(EltIdx - 8, PtrVT));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                         DAG.getConstant(i, PtrVT));
    }

    return NewV;
  }

  // PSHUF{H|L}W are not used. Lower into extracts and inserts but try to use
  // as few as possible. First, let's find out how many elements are already
  // in the right order.
  unsigned V1InOrder = 0;
  unsigned V1FromV1 = 0;
  unsigned V2InOrder = 0;
  unsigned V2FromV2 = 0;
  SmallVector<SDValue, 8> V1Elts;
  SmallVector<SDValue, 8> V2Elts;
  for (unsigned i = 0; i < 8; ++i) {
    SDValue Elt = MaskElts[i];
    if (Elt.getOpcode() == ISD::UNDEF) {
      V1Elts.push_back(Elt);
      V2Elts.push_back(Elt);
      ++V1InOrder;
      ++V2InOrder;
      continue;
    }
    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
    if (EltIdx == i) {
      V1Elts.push_back(Elt);
      V2Elts.push_back(DAG.getConstant(i+8, MaskEVT));
      ++V1InOrder;
    } else if (EltIdx == i+8) {
      V1Elts.push_back(Elt);
      V2Elts.push_back(DAG.getConstant(i, MaskEVT));
      ++V2InOrder;
    } else if (EltIdx < 8) {
      V1Elts.push_back(Elt);
      V2Elts.push_back(DAG.getConstant(EltIdx+8, MaskEVT));
      ++V1FromV1;
    } else {
      V1Elts.push_back(Elt);
      V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT));
      ++V2FromV2;
    }
  }

  if (V2InOrder > V1InOrder) {
    PermMask = CommuteVectorShuffleMask(PermMask, DAG, dl);
    std::swap(V1, V2);
    std::swap(V1Elts, V2Elts);
    std::swap(V1FromV1, V2FromV2);
  }

  if ((V1FromV1 + V1InOrder) != 8) {
    // Some elements are from V2.
    if (V1FromV1) {
      // If there are elements that are from V1 but out of place,
      // then first sort them in place.
      SmallVector<SDValue, 8> MaskVec;
      for (unsigned i = 0; i < 8; ++i) {
        SDValue Elt = V1Elts[i];
        if (Elt.getOpcode() == ISD::UNDEF) {
          MaskVec.push_back(DAG.getUNDEF(MaskEVT));
          continue;
        }
        unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
        if (EltIdx >= 8)
          MaskVec.push_back(DAG.getUNDEF(MaskEVT));
        else
          MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT));
      }
      SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], 8);
      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, V1, V1, Mask);
    }

    NewV = V1;
    for (unsigned i = 0; i < 8; ++i) {
      SDValue Elt = V1Elts[i];
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
      if (EltIdx < 8)
        continue;
      SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                                  DAG.getConstant(EltIdx - 8, PtrVT));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                         DAG.getConstant(i, PtrVT));
    }
    return NewV;
  }

  // All elements are from V1.
  NewV = V1;
  for (unsigned i = 0; i < 8; ++i) {
    SDValue Elt = V1Elts[i];
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
    SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                                DAG.getConstant(EltIdx, PtrVT));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getConstant(i, PtrVT));
  }
  return NewV;
}
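
// For illustration: the v8i16 mask <0, 1, 2, 3, 12, 13, 14, 15> takes the
// fast path above: BestLowQuad selects V1's low quad and BestHighQuad V2's
// high quad, so the single shufpd suffices and the pshuflw/pshufhw steps
// see every element already in order (no pextrw/pinsrw fixups needed).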

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence, e.g.
/// vector_shuffle <>, <>, < 2, 3, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2,
                                 MVT VT,
                                 SDValue PermMask, SelectionDAG &DAG,
                                 TargetLowering &TLI, DebugLoc dl) {
  unsigned NumElems = PermMask.getNumOperands();
  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
  MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
  MVT MaskEltVT = MaskVT.getVectorElementType();
  MVT NewVT = MaskVT;
  switch (VT.getSimpleVT()) {
  default: assert(false && "Unexpected!");
  case MVT::v4f32: NewVT = MVT::v2f64; break;
  case MVT::v4i32: NewVT = MVT::v2i64; break;
  case MVT::v8i16: NewVT = MVT::v4i32; break;
  case MVT::v16i8: NewVT = MVT::v4i32; break;
  }

  if (NewWidth == 2) {
    if (VT.isInteger())
      NewVT = MVT::v2i64;
    else
      NewVT = MVT::v2f64;
  }
  unsigned Scale = NumElems / NewWidth;
  SmallVector<SDValue, 8> MaskVec;
  for (unsigned i = 0; i < NumElems; i += Scale) {
    unsigned StartIdx = ~0U;
    for (unsigned j = 0; j < Scale; ++j) {
      SDValue Elt = PermMask.getOperand(i+j);
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
      if (StartIdx == ~0U)
        StartIdx = EltIdx - (EltIdx % Scale);
      if (EltIdx != StartIdx + j)
        return SDValue();
    }
    if (StartIdx == ~0U)
      MaskVec.push_back(DAG.getUNDEF(MaskEltVT));
    else
      MaskVec.push_back(DAG.getConstant(StartIdx / Scale, MaskEltVT));
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
  return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, NewVT, V1, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                 &MaskVec[0], MaskVec.size()));
}
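
// For illustration: the v8i16 mask <2, 3, | 10, 11, | 0, 1, | 14, 15> in the
// comment above collapses to the v4i32 mask <1, 5, 0, 7> (each pair's
// StartIdx divided by Scale == 2), which the 4-wide lowering then handles.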

/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(MVT VT, MVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, DebugLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = NULL;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                        .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BIT_CONVERT, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
/// shuffles.
static SDValue
LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2,
                          SDValue PermMask, MVT VT, SelectionDAG &DAG,
                          DebugLoc dl) {
  MVT MaskVT = PermMask.getValueType();
  MVT MaskEVT = MaskVT.getVectorElementType();
  SmallVector<std::pair<int, int>, 8> Locs;
  Locs.resize(4);
  SmallVector<SDValue, 8> Mask1(4, DAG.getUNDEF(MaskEVT));
  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    SDValue Elt = PermMask.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      unsigned Val = cast<ConstantSDNode>(Elt)->getZExtValue();
      assert(Val < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Val < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Elt;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Elt;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // No more than two elements come from either vector; this can be
    // implemented with two shuffles. The first shuffle gathers the
    // elements. The second shuffle, which takes the first shuffle as both
    // of its vector operands, puts the elements into the right order.
    V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                 &Mask1[0], Mask1.size()));

    SmallVector<SDValue, 8> Mask2(4, DAG.getUNDEF(MaskEVT));
    for (unsigned i = 0; i != 4; ++i) {
      if (Locs[i].first == -1)
        continue;
      else {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = DAG.getConstant(Idx, MaskEVT);
      }
    }

    return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V1,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                   &Mask2[0], Mask2.size()));
  } else if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y. First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes
    // don't matter). Then, use a shufps to build the final vector, taking
    // the half containing the element from Y from the intermediate, and the
    // other half from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      PermMask = CommuteVectorShuffleMask(PermMask, DAG, dl);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      SDValue Elt = PermMask.getOperand(HiIndex);
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned Val = cast<ConstantSDNode>(Elt)->getZExtValue();
      if (Val >= 4)
        break;
    }

    Mask1[0] = PermMask.getOperand(HiIndex);
    Mask1[1] = DAG.getUNDEF(MaskEVT);
    Mask1[2] = PermMask.getOperand(HiIndex^1);
    Mask1[3] = DAG.getUNDEF(MaskEVT);
    V2 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &Mask1[0], 4));

    if (HiIndex >= 2) {
      Mask1[0] = PermMask.getOperand(0);
      Mask1[1] = PermMask.getOperand(1);
      Mask1[2] = DAG.getConstant(HiIndex & 1 ? 6 : 4, MaskEVT);
      Mask1[3] = DAG.getConstant(HiIndex & 1 ? 4 : 6, MaskEVT);
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                         DAG.getNode(ISD::BUILD_VECTOR, dl,
                                     MaskVT, &Mask1[0], 4));
    } else {
      Mask1[0] = DAG.getConstant(HiIndex & 1 ? 2 : 0, MaskEVT);
      Mask1[1] = DAG.getConstant(HiIndex & 1 ? 0 : 2, MaskEVT);
      Mask1[2] = PermMask.getOperand(2);
      Mask1[3] = PermMask.getOperand(3);
      if (Mask1[2].getOpcode() != ISD::UNDEF)
        Mask1[2] =
          DAG.getConstant(cast<ConstantSDNode>(Mask1[2])->getZExtValue()+4,
                          MaskEVT);
      if (Mask1[3].getOpcode() != ISD::UNDEF)
        Mask1[3] =
          DAG.getConstant(cast<ConstantSDNode>(Mask1[3])->getZExtValue()+4,
                          MaskEVT);
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V2, V1,
                         DAG.getNode(ISD::BUILD_VECTOR, dl,
                                     MaskVT, &Mask1[0], 4));
    }
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  Locs.clear();
  Locs.resize(4);
  SmallVector<SDValue,8> LoMask(4, DAG.getUNDEF(MaskEVT));
  SmallVector<SDValue,8> HiMask(4, DAG.getUNDEF(MaskEVT));
  SmallVector<SDValue,8> *MaskPtr = &LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = &HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    SDValue Elt = PermMask.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (cast<ConstantSDNode>(Elt)->getZExtValue() < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      (*MaskPtr)[LoIdx] = Elt;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      (*MaskPtr)[HiIdx] = Elt;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                                  DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                              &LoMask[0], LoMask.size()));
  SDValue HiShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                                  DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                              &HiMask[0], HiMask.size()));
  SmallVector<SDValue, 8> MaskOps;
  for (unsigned i = 0; i != 4; ++i) {
    if (Locs[i].first == -1) {
      MaskOps.push_back(DAG.getUNDEF(MaskEVT));
    } else {
      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
      MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
    }
  }
  return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, LoShuffle, HiShuffle,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                 &MaskOps[0], MaskOps.size()));
}
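
// For illustration: the v4f32 mask <0, 4, 1, 5> has NumLo == NumHi == 2, so
// the two-shuffle path fires: the first shufps gathers
// <V1[0], V1[1], V2[0], V2[1]> into one register, and the second shuffles
// that register with itself into the final <V1[0], V2[0], V1[1], V2[1]>.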

SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDValue PermMask = Op.getOperand(2);
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = PermMask.getNumOperands();
  bool isMMX = VT.getSizeInBits() == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;

  if (isUndefShuffle(Op.getNode()))
    return DAG.getUNDEF(VT);

  if (isZeroShuffle(Op.getNode()))
    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);

  if (isIdentityMask(PermMask.getNode()))
    return V1;
  else if (isIdentityMask(PermMask.getNode(), true))
    return V2;

  // Canonicalize movddup shuffles.
  if (V2IsUndef && Subtarget->hasSSE2() &&
      VT.getSizeInBits() == 128 &&
      X86::isMOVDDUPMask(PermMask.getNode()))
    return CanonicalizeMovddup(Op, V1, PermMask, DAG, Subtarget->hasSSE3());

  if (isSplatMask(PermMask.getNode())) {
    if (isMMX || NumElems < 4) return Op;
    // Promote it to a v4{if}32 splat.
    return PromoteSplat(Op, DAG, Subtarget->hasSSE2());
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG,
                                             *this, dl);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                         LowerVECTOR_SHUFFLE(NewOp, DAG));
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
                                               DAG, *this, dl);
      if (NewOp.getNode()) {
        SDValue NewV1 = NewOp.getOperand(0);
        SDValue NewV2 = NewOp.getOperand(1);
        SDValue NewMask = NewOp.getOperand(2);
        if (isCommutedMOVL(NewMask.getNode(), true, false)) {
          NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
          return getVZextMovL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget,
                              dl);
        }
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
                                               DAG, *this, dl);
      if (NewOp.getNode() && X86::isMOVLMask(NewOp.getOperand(2).getNode()))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                            DAG, Subtarget, dl);
    }
  }

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    MVT EVT = VT.getVectorElementType();
    ShAmt *= EVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(PermMask.getNode())) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMMX)
      return Op;
  }

  if (!isMMX && (X86::isMOVSHDUPMask(PermMask.getNode()) ||
                 X86::isMOVSLDUPMask(PermMask.getNode()) ||
                 X86::isMOVHLPSMask(PermMask.getNode()) ||
                 X86::isMOVHPMask(PermMask.getNode()) ||
                 X86::isMOVLPMask(PermMask.getNode())))
    return Op;

  if (ShouldXformToMOVHLPS(PermMask.getNode()) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), PermMask.getNode()))
    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    MVT EVT = VT.getVectorElementType();
    ShAmt *= EVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  // FIXME: Figure out a cleaner way to do this.
  if (isCommutedMOVL(PermMask.getNode(), V2IsSplat, V2IsUndef)) {
    if (V2IsUndef) return V1;
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (V2IsSplat) {
      // V2 is a splat, so the mask may be malformed. That is, it may point
      // to any V2 element. The instruction selector won't like this. Get
      // a corrected mask and commute to form a proper MOVS{S|D}.
      SDValue NewMask = getMOVLMask(NumElems, DAG, dl);
      if (NewMask.getNode() != PermMask.getNode())
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
    }
    return Op;
  }

  if (X86::isUNPCKL_v_undef_Mask(PermMask.getNode()) ||
      X86::isUNPCKH_v_undef_Mask(PermMask.getNode()) ||
      X86::isUNPCKLMask(PermMask.getNode()) ||
      X86::isUNPCKHMask(PermMask.getNode()))
    return Op;

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 point to its first
    // element, then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SDValue NewMask = NormalizeMask(PermMask, DAG);
    if (NewMask.getNode() != PermMask.getNode()) {
      if (X86::isUNPCKLMask(NewMask.getNode(), true)) {
        SDValue NewMask = getUnpacklMask(NumElems, DAG, dl);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
      } else if (X86::isUNPCKHMask(NewMask.getNode(), true)) {
        SDValue NewMask = getUnpackhMask(NumElems, DAG, dl);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
      }
    }
  }

  // Normalize the node to match x86 shuffle ops if needed
  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.getNode()))
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  if (Commuted) {
    // Commute it back and try unpck* again.
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (X86::isUNPCKL_v_undef_Mask(PermMask.getNode()) ||
        X86::isUNPCKH_v_undef_Mask(PermMask.getNode()) ||
        X86::isUNPCKLMask(PermMask.getNode()) ||
        X86::isUNPCKHMask(PermMask.getNode()))
      return Op;
  }

  // Try PSHUF* first, then SHUFP*.
  // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically
  // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
  if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.getNode())) {
    if (V2.getOpcode() != ISD::UNDEF)
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1,
                         DAG.getUNDEF(VT), PermMask);
    return Op;
  }

  if (Subtarget->hasSSE2() &&
      (X86::isPSHUFDMask(PermMask.getNode()) ||
       X86::isPSHUFHWMask(PermMask.getNode()) ||
       X86::isPSHUFLWMask(PermMask.getNode()))) {
    MVT RVT = VT;
    if (VT == MVT::v4f32) {
      RVT = MVT::v4i32;
      Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, RVT,
                       DAG.getNode(ISD::BIT_CONVERT, dl, RVT, V1),
                       DAG.getUNDEF(RVT), PermMask);
    } else if (V2.getOpcode() != ISD::UNDEF)
      Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, RVT, V1,
                       DAG.getUNDEF(RVT), PermMask);
    if (RVT != VT)
      Op = DAG.getNode(ISD::BIT_CONVERT, dl, VT, Op);
    return Op;
  }

  // Binary or unary shufps.
  if (X86::isSHUFPMask(PermMask.getNode()) ||
      (V2.getOpcode() == ISD::UNDEF && X86::isPSHUFDMask(PermMask.getNode())))
    return Op;

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this, dl);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 4 wide cases with a number of shuffles except for MMX.
  if (NumElems == 4 && !isMMX)
    return LowerVECTOR_SHUFFLE_4wide(V1, V2, PermMask, VT, DAG, dl);

  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                  DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                  DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BIT_CONVERT ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
  } else if (VT == MVT::i32) {
    // ExtractPS works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDValue();

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
    MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EVT, Extract,
                                  DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    MVT MaskVT = MVT::getIntVectorWithNumElements(4);
    SmallVector<SDValue, 8> IdxVec;
    IdxVec.push_back(DAG.getConstant(Idx, MaskVT.getVectorElementType()));
    IdxVec.push_back(DAG.getUNDEF(MaskVT.getVectorElementType()));
    IdxVec.push_back(DAG.getUNDEF(MaskVT.getVectorElementType()));
    IdxVec.push_back(DAG.getUNDEF(MaskVT.getVectorElementType()));
    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                               &IdxVec[0], IdxVec.size());
    SDValue Vec = Op.getOperand(0);
    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, Vec.getValueType(),
                      Vec, DAG.getUNDEF(Vec.getValueType()), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  } else if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    MVT MaskVT = MVT::getIntVectorWithNumElements(2);
    SmallVector<SDValue, 8> IdxVec;
    IdxVec.push_back(DAG.getConstant(1, MaskVT.getVectorElementType()));
    IdxVec.push_back(DAG.getUNDEF(MaskVT.getVectorElementType()));
    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                               &IdxVec[0], IdxVec.size());
    SDValue Vec = Op.getOperand(0);
    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, Vec.getValueType(),
                      Vec, DAG.getUNDEF(Vec.getValueType()),
                      Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
                                              : X86ISD::PINSRW;
    // Transform it so it matches pinsr{b,w} which expects a GR32 as its
    // second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select. This will always be
    // zero here. The DAG Combiner may combine an extract_elt index into
    // these bits. For example (insert (extract, 3), 2) could be matched by
    // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select. This is the
    // value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
    // combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EVT == MVT::i32) {
    // InsertPS works with constant index.
    if (isa<ConstantSDNode>(N2))
      return Op;
  }
  return SDValue();
}
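
// For illustration, the INSERTPS immediate built above: for
// (insert v, f, 2) the constant becomes 2 << 4 == 0x20, i.e. destination
// lane 2 in bits [5:4], source lane 0 in bits [7:6], and an empty zero
// mask in bits [3:0].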

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EVT == MVT::i8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (EVT.getSizeInBits() == 16) {
    // Transform it so it matches pinsrw which expects a 16-bit value in a
    // GR32 register as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  if (Op.getValueType() == MVT::v2f32)
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
                                               Op.getOperand(0))));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  MVT VT = MVT::v2i32;
  switch (Op.getValueType().getSimpleVT()) {
  default: break;
  case MVT::v16i8:
  case MVT::v8i16:
    VT = MVT::v4i32;
    break;
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  // FIXME there isn't really any debug info here, should come from the parent
  DebugLoc dl = CP->getDebugLoc();
  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(),
                                             getPointerTy(),
                                             CP->getAlignment());
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset,
                                      SelectionDAG &DAG) const {
  bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
  bool ExtraLoadRequired =
    Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false);

  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  SDValue Result;
  if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) {
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
    Offset = 0;
  } else
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0);
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (IsPic && !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For Darwin & Mingw32, external and weak symbols are indirect, so we want
  // to load the value at address GV, not the value of GV itself. This means
  // that the GlobalAddress must be in the base or index register of the
  // address, not in the GV offset field. The platform check is inside the
  // GVRequiresExtraLoad() call. The same applies for external symbols during
  // PIC codegen.
  if (ExtraLoadRequired)
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         PseudoSourceValue::getGOT(), 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const MVT PtrVT) {
  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               DebugLoc::getUnknownLoc(),
                                               PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  // emit leal symbol@TLSGD(,%ebx,1), %eax
  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                           GA->getValueType(0),
                                           GA->getOffset());
  SDValue Ops[] = { Chain, TGA, InFlag };
  SDValue Result = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
  InFlag = Result.getValue(2);
  Chain = Result.getValue(1);

  // call ___tls_get_addr. This function receives its argument in
  // the register EAX.
  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Result, InFlag);
  InFlag = Chain.getValue(1);

  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops1[] = { Chain,
                     DAG.getTargetExternalSymbol("___tls_get_addr",
                                                 PtrVT),
                     DAG.getRegister(X86::EAX, PtrVT),
                     DAG.getRegister(X86::EBX, PtrVT),
                     InFlag };
  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops1, 5);
  InFlag = Chain.getValue(1);

  return DAG.getCopyFromReg(Chain, dl, X86::EAX, PtrVT, InFlag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const MVT PtrVT) {
  SDValue InFlag, Chain;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better

  // emit leaq symbol@TLSGD(%rip), %rdi
  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                           GA->getValueType(0),
                                           GA->getOffset());
  SDValue Ops[] = { DAG.getEntryNode(), TGA };
  SDValue Result = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
  Chain = Result.getValue(1);
  InFlag = Result.getValue(2);

  // call __tls_get_addr. This function receives its argument in
  // the register RDI.
  Chain = DAG.getCopyToReg(Chain, dl, X86::RDI, Result, InFlag);
  InFlag = Chain.getValue(1);

  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops1[] = { Chain,
                     DAG.getTargetExternalSymbol("__tls_get_addr",
                                                 PtrVT),
                     DAG.getRegister(X86::RDI, PtrVT),
                     InFlag };
  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops1, 4);
  InFlag = Chain.getValue(1);

  return DAG.getCopyFromReg(Chain, dl, X86::RAX, PtrVT, InFlag);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const MVT PtrVT) {
  DebugLoc dl = GA->getDebugLoc();
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER,
                                      DebugLoc::getUnknownLoc(), PtrVT);
  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax"
  // (initial exec)
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                           GA->getValueType(0),
                                           GA->getOffset());
  SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

  if (GA->getGlobal()->isDeclaration())   // initial exec TLS model
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         PseudoSourceValue::getGOT(), 0);

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
  // TODO: implement the "local dynamic" model
  // TODO: implement the "initial exec" model for pic executables
  assert(Subtarget->isTargetELF() &&
         "TLS not implemented for non-ELF targets");
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  // If the relocation model is PIC, use the "General Dynamic" TLS model;
  // otherwise use the "Local Exec" TLS model.
  if (Subtarget->is64Bit()) {
    return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
  }
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
    return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
  else
    return LowerToTLSExecModel(GA, DAG, getPointerTy());
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
  // FIXME there isn't really any debug info here
  DebugLoc dl = Op.getDebugLoc();
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  // FIXME there isn't really any debug info here
  DebugLoc dl = JT->getDebugLoc();
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
/// take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue Tmp1 = isSRA ?
    DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                DAG.getConstant(VTBits - 1, MVT::i8)) :
    DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
  }

  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
                             AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, 2, dl);
}
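
// For illustration: a 64-bit SHL_PARTS of (Lo, Hi) on x86-32 becomes roughly
//   shld %cl, %lo_reg, %hi_reg   // hi = (hi << cl) | (lo >> (32-cl))
//   shl  %cl, %lo_reg
// with the CMOVs above substituting Hi := Lo << (amt-32) and Lo := 0 when
// bit 5 of the amount is set, since shld/shl only honor the low five bits
// of %cl.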
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getValueType();
  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; caller falls through into that case.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && Op.getValueType() != MVT::f80 &&
      Subtarget->is64Bit())
    return Op;

  DebugLoc dl = Op.getDebugLoc();
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot,
                               PseudoSourceValue::getFixedStack(SSFI), 0);

  // Build the FILD
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(StackSlot);
  Ops.push_back(DAG.getValueType(SrcVT));
  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
                               Tys, &Ops[0], Ops.size());

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(Chain);
    Ops.push_back(Result);
    Ops.push_back(StackSlot);
    Ops.push_back(DAG.getValueType(Op.getValueType()));
    Ops.push_back(InFlag);
    Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size());
    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
                         PseudoSourceValue::getFixedStack(SSFI), 0);
  }

  return Result;
}
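
// Note on the expansion above: the x87 FILD family converts signed i16/i32/i64
// operands directly from memory, which is why the integer is first spilled to
// a stack slot; when the result is wanted in an SSE register, the extra
// FST/load pair moves it back out of the x87 stack.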
// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
  // This algorithm is not obvious. Here it is in C code, more or less:
  /*
    double uint64_to_double( uint32_t hi, uint32_t lo ) {
      static const __m128i exp = { 0x4330000045300000ULL, 0 };
      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };

      // Copy ints to xmm registers.
      __m128i xh = _mm_cvtsi32_si128( hi );
      __m128i xl = _mm_cvtsi32_si128( lo );

      // Combine into low half of a single xmm register.
      __m128i x = _mm_unpacklo_epi32( xh, xl );
      __m128d d;
      double sd;

      // Merge in appropriate exponents to give the integer bits the right
      // magnitude.
      x = _mm_unpacklo_epi32( x, exp );

      // Subtract away the biases to deal with the IEEE-754 double precision
      // implicit 1.
      d = _mm_sub_pd( (__m128d) x, bias );

      // All conversions up to here are exact. The correctly rounded result is
      // calculated using the current rounding mode with the following
      // horizontal add.
      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
      _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this
                              // store doesn't really need to be here (except
                              // maybe to zero the other double)
      return sd;
    }
  */
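  // Concretely: 0x4530000000000000 is the double 2^84 and 0x4330000000000000
  // is 2^52, so after the unpack the two lanes hold 2^84 + hi*2^32 and
  // 2^52 + lo, both exactly representable. Subtracting the biases and adding
  // the halves yields hi*2^32 + lo, rounded exactly once in the current mode.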
  DebugLoc dl = Op.getDebugLoc();

  // Build some magic constants.
  std::vector<Constant*> CV0;
  CV0.push_back(ConstantInt::get(APInt(32, 0x45300000)));
  CV0.push_back(ConstantInt::get(APInt(32, 0x43300000)));
  CV0.push_back(ConstantInt::get(APInt(32, 0)));
  CV0.push_back(ConstantInt::get(APInt(32, 0)));
  Constant *C0 = ConstantVector::get(CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 4);

  std::vector<Constant*> CV1;
  CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL))));
  CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 4);

  SmallVector<SDValue, 4> MaskVec;
  MaskVec.push_back(DAG.getConstant(0, MVT::i32));
  MaskVec.push_back(DAG.getConstant(4, MVT::i32));
  MaskVec.push_back(DAG.getConstant(1, MVT::i32));
  MaskVec.push_back(DAG.getConstant(5, MVT::i32));
  SDValue UnpcklMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                   &MaskVec[0], MaskVec.size());
  SmallVector<SDValue, 4> MaskVec2;
  MaskVec2.push_back(DAG.getConstant(1, MVT::i32));
  MaskVec2.push_back(DAG.getConstant(0, MVT::i32));
  SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32,
                                 &MaskVec2[0], MaskVec2.size());

  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(1)));
  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(0)));
  SDValue Unpck1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v4i32,
                               XR1, XR2, UnpcklMask);
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue Unpck2 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v4i32,
                               Unpck1, CLod0, UnpcklMask);
  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  // Add the halves; easiest way is to swap them into another reg first.
  SDValue Shuf = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2f64,
                             Sub, Sub, ShufMask);
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
                     DAG.getIntPtrConstant(0));
}
// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                         Op.getOperand(0),
                                         DAG.getIntPtrConstant(0)));

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64)) {
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  } else if (DestVT.bitsGT(MVT::f64)) {
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  }

  // The result is already f64; no final rounding is needed.
  return Sub;
}
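
// The i32 variant of the same bias trick: OR-ing the value into the low
// mantissa bits of 2^52 (0x4330000000000000) forms the exact double 2^52 + x,
// so subtracting 2^52 recovers x as a double; e.g. x = 5 gives
// 2^52 + 5 - 2^52 = 5.0 with no rounding.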
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();

  // Since UINT_TO_FP is not legal (it's marked custom), the dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  MVT SrcVT = N0.getValueType();
  if (SrcVT == MVT::i64) {
    // We only handle SSE2 f64 target here; caller can handle the rest.
    if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
      return SDValue();

    return LowerUINT_TO_FP_i64(Op, DAG);
  } else if (SrcVT == MVT::i32) {
    return LowerUINT_TO_FP_i32(Op, DAG);
  }

  assert(0 && "Unknown UINT_TO_FP to lower!");
  return SDValue();
}
std::pair<SDValue,SDValue> X86TargetLowering::
FP_TO_SINTHelper(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  assert(Op.getValueType().getSimpleVT() <= MVT::i64 &&
         Op.getValueType().getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_SINT to lower!");

  // These are really Legal.
  if (Op.getValueType() == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      Op.getValueType() == MVT::i64 &&
      Op.getOperand(0).getValueType() != MVT::f80)
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = Op.getValueType().getSizeInBits()/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  unsigned Opc;
  switch (Op.getValueType().getSimpleVT()) {
  default: assert(0 && "Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
    assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
                         PseudoSourceValue::getFixedStack(SSFI), 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
    };
    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  // Build the FP_TO_INT*_IN_MEM
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);

  return std::make_pair(FIST, StackSlot);
}
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
  std::pair<SDValue,SDValue> Vals = FP_TO_SINTHelper(Op, DAG);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  if (FIST.getNode() == 0) return SDValue();

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, NULL, 0);
}
SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  MVT VT = Op.getValueType();
  MVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31))));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             PseudoSourceValue::getConstantPool(), 0,
                             false, 16);
  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}
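
// The constants above make FABS a pure bitwise operation: AND-ing with
// ~(1 << 63) (or ~(1 << 31) per f32 lane) clears only the sign bit, mapping
// e.g. -3.5 to 3.5; LowerFNEG below flips the same bit with XOR instead.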
SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  MVT VT = Op.getValueType();
  MVT EltVT = VT;
  unsigned EltNum = 1;
  if (VT.isVector()) {
    EltVT = VT.getVectorElementType();
    EltNum = VT.getVectorNumElements();
  }
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63)));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31)));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             PseudoSourceValue::getConstantPool(), 0,
                             false, 16);
  if (VT.isVector()) {
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                               Op.getOperand(0)),
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                               Mask)));
  } else {
    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
  }
}
SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  MVT VT = Op.getValueType();
  MVT SrcVT = Op1.getValueType();

  // If second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
    SrcVT = VT;
  }
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
    SrcVT = VT;
  }

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.

  // First get the sign bit of second operand.
  std::vector<Constant*> CV;
  if (SrcVT == MVT::f64) {
    CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63))));
    CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
  } else {
    CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31))));
    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);

  // Shift sign bit right or left if the two operands have different types.
  if (SrcVT.bitsGT(VT)) {
    // Op0 is MVT::f32, Op1 is MVT::f64.
    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
                          DAG.getConstant(32, MVT::i32));
    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
                          DAG.getIntPtrConstant(0));
  }

  // Clear first operand sign bit.
  CV.clear();
  if (VT == MVT::f64) {
    CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))));
    CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
  } else {
    CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))));
    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
  }
  C = ConstantVector::get(CV);
  CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);

  // Or the value with the sign bit.
  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}
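
// Net effect of the two masks above:
//   copysign(a, b) = (a & ~SignMask) | (b & SignMask)
// e.g. copysign(3.0, -0.5) keeps the exponent/mantissa bits of 3.0 and takes
// only the sign bit from -0.5, giving -3.0.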
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND &&
      Op0.hasOneUse() &&
      Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue LHS, RHS;
    if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
      if (ConstantSDNode *Op010C =
            dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
        if (Op010C->getZExtValue() == 1) {
          LHS = Op0.getOperand(0);
          RHS = Op0.getOperand(1).getOperand(1);
        }
    } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
      if (ConstantSDNode *Op000C =
            dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
        if (Op000C->getZExtValue() == 1) {
          LHS = Op0.getOperand(1);
          RHS = Op0.getOperand(0).getOperand(1);
        }
    } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
      ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
      SDValue AndLHS = Op0.getOperand(0);
      if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
        LHS = AndLHS.getOperand(0);
        RHS = AndLHS.getOperand(1);
      }
    }

    if (LHS.getNode()) {
      // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT
      // instruction. Since the shift amount is in-range-or-undefined, we know
      // that doing a bittest on the i16 value is ok. We extend to i32 because
      // the encoding for the i16 version is larger than the i32 version.
      if (LHS.getValueType() == MVT::i8)
        LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

      // If the operand types disagree, extend the shift amount to match. Since
      // BT ignores high bits (like shifts) we can use anyextend.
      if (LHS.getValueType() != RHS.getValueType())
        RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

      SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
      unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                         DAG.getConstant(Cond, MVT::i8), BT);
    }
  }

  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);

  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(X86CC, MVT::i8), Cond);
}
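
// Example of the BT lowering above: (x & (1 << n)) == 0 becomes "bt x, n"
// with a SETCC on COND_AE (carry clear), and the != 0 forms use COND_B,
// replacing a shift/and/test sequence with a single bit-test instruction.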
SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
  SDValue Cond;
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
  DebugLoc dl = Op.getDebugLoc();

  if (isFP) {
    unsigned SSECC = 8;
    MVT VT0 = Op0.getValueType();
    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
    bool Swap = false;

    switch (SetCCOpcode) {
    default: break;
    case ISD::SETOEQ:
    case ISD::SETEQ:  SSECC = 0; break;
    case ISD::SETOGT:
    case ISD::SETGT: Swap = true; // Fallthrough
    case ISD::SETLT:
    case ISD::SETOLT: SSECC = 1; break;
    case ISD::SETOGE:
    case ISD::SETGE: Swap = true; // Fallthrough
    case ISD::SETLE:
    case ISD::SETOLE: SSECC = 2; break;
    case ISD::SETUO: SSECC = 3; break;
    case ISD::SETUNE:
    case ISD::SETNE: SSECC = 4; break;
    case ISD::SETULE: Swap = true;
    case ISD::SETUGE: SSECC = 5; break;
    case ISD::SETULT: Swap = true;
    case ISD::SETUGT: SSECC = 6; break;
    case ISD::SETO: SSECC = 7; break;
    }
    if (Swap)
      std::swap(Op0, Op1);

    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      if (SetCCOpcode == ISD::SETUEQ) {
        SDValue UNORD, EQ;
        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
      }
      else if (SetCCOpcode == ISD::SETONE) {
        SDValue ORD, NEQ;
        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
      }
      assert(0 && "Illegal FP comparison");
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
  }

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
  bool Swap = false, Invert = false, FlipSigns = false;

  switch (VT.getSimpleVT()) {
  default: break;
  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = EQOpc; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = GTOpc; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
                                      EltVT);
    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
                                  SignBits.size());
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}
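
// Example: a v4i32 SETUGT compare has no direct SSE instruction, so the code
// above XORs both inputs with the sign-bit vector (0x80000000 per lane) and
// issues the signed PCMPGTD; SETULE additionally inverts the result, since
// a <=u b is the complement of a >u b.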
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(unsigned Opc) {
  return Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI;
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
  bool addTest = true;
  SDValue Cond = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC)
    Cond = LowerSETCC(Cond, DAG);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Opc) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond,
                       DAG.getConstant(0, MVT::i8));
  }

  const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(),
                                         MVT::Flag);
  SmallVector<SDValue, 4> Ops;
  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  Ops.push_back(Op.getOperand(2));
  Ops.push_back(Op.getOperand(1));
  Ops.push_back(CC);
  Ops.push_back(Cond);
  return DAG.getNode(X86ISD::CMOV, dl, VTs, 2, &Ops[0], Ops.size());
}
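
// E.g. (select (setcc a, b, seteq), x, y) becomes "cmp a, b" feeding an
// X86ISD::CMOV whose condition is COND_E; when the condition value does not
// already produce flags, the addTest path above compares it against zero
// first.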
// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
// X86ISD::SETCC nodes, each of which has no other use apart from the
// AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}
// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
// the constant 1, where the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (N1C && N1C->getAPIntValue() == 1) {
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
      Op.getOperand(0).hasOneUse();
  }
  return false;
}
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Dest  = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC)
    Cond = LowerSETCC(Cond, DAG);
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD  ||
           Cond.getOpcode() == X86ISD::SUB  ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Opc) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      unsigned Opc = Cmp.getOpcode();
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Opc)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Opc) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, MVT::i8);
          SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User.getOpcode() == ISD::BR) {
            SDValue FalseBB = User.getOperand(1);
            SDValue NewBR =
              DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
            assert(NewBR == User);
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
      // It should be transformed during dag combining, except when the
      // condition is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond,
                       DAG.getConstant(0, MVT::i8));
  }
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}
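
// Worked example for the AND case above: FCMP_OEQ yields setcc(E) & setcc(NP).
// With no fall-through edge this is emitted as one comparison followed by
// "jne false" and "jp false", branching away on either not-equal or unordered
// instead of materializing both setcc values and ANDing them.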
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) {
  assert(Subtarget->isTargetCygMing() &&
         "This should be used only on Cygwin/Mingw targets");
  DebugLoc dl = Op.getDebugLoc();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  // FIXME: Ensure alignment here

  SDValue Flag;

  MVT IntPtr = getPointerTy();
  MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;

  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));

  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops[] = { Chain,
                    DAG.getTargetExternalSymbol("_alloca", IntPtr),
                    DAG.getRegister(X86::EAX, IntPtr),
                    DAG.getRegister(X86StackPtr, SPTy),
                    Flag };
  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
  Flag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(0, true),
                             DAG.getIntPtrConstant(0, true),
                             Flag);

  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);

  SDValue Ops1[2] = { Chain.getValue(0), Chain };
  return DAG.getMergeValues(Ops1, 2, dl);
}
SDValue
X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
                                           SDValue Chain,
                                           SDValue Dst, SDValue Src,
                                           SDValue Size, unsigned Align,
                                           const Value *DstSV,
                                           uint64_t DstSVOff) {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if ((Align & 3) != 0 ||
      !ConstantSize ||
      ConstantSize->getZExtValue() >
        getSubtarget()->getMaxInlineSizeThreshold()) {
    SDValue InFlag(0, 0);

    // Check to see if there is a specialized entry-point for memory zeroing.
    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);

    if (const char *bzeroEntry = V &&
        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
      MVT IntPtr = getPointerTy();
      const Type *IntPtrTy = TD->getIntPtrType();
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Dst;
      Entry.Ty = IntPtrTy;
      Args.push_back(Entry);
      Entry.Node = Size;
      Args.push_back(Entry);
      std::pair<SDValue,SDValue> CallResult =
        LowerCallTo(Chain, Type::VoidTy, false, false, false, false,
                    CallingConv::C, false,
                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
      return CallResult.second;
    }

    // Otherwise have the target-independent code call memset.
    return SDValue();
  }

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InFlag(0, 0);
  MVT AVT;
  SDValue Count;
  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
  unsigned BytesLeft = 0;
  bool TwoRepStos = false;
  if (ValC) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;

    // If the value is a constant, then we can potentially use larger sets.
    switch (Align & 3) {
    case 2:   // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
      break;
    case 0:   // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8)  | Val;
      Val = (Val << 16) | Val;
      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
      break;
    default:  // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal);
      break;
    }

    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
      BytesLeft = SizeVal % UBytes;
    }

    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
                             InFlag);
    InFlag = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count = DAG.getIntPtrConstant(SizeVal);
    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
    InFlag = Chain.getValue(1);
  }

  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                             X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
                                                             X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(DAG.getValueType(AVT));
  Ops.push_back(InFlag);
  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());

  if (TwoRepStos) {
    InFlag = Chain.getValue(1);
    Count = Size;
    MVT CVT = Count.getValueType();
    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
    Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
                                                            X86::ECX,
                             Left, InFlag);
    InFlag = Chain.getValue(1);
    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    Ops.clear();
    Ops.push_back(Chain);
    Ops.push_back(DAG.getValueType(MVT::i8));
    Ops.push_back(InFlag);
    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
  } else if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    MVT AddrVT = Dst.getValueType();
    MVT SizeVT = Size.getValueType();

    Chain = DAG.getMemset(Chain, dl,
                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                      DAG.getConstant(Offset, AddrVT)),
                          Src,
                          DAG.getConstant(BytesLeft, SizeVT),
                          Align, DstSV, DstSVOff + Offset);
  }

  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
  return Chain;
}
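
// Example: memset(p, 1, 19) with DWORD-aligned p takes the "case 0" path:
// Val is widened to 0x01010101, the count register gets 19 / 4 = 4 so
// "rep stos" writes 16 bytes, and the remaining 19 % 4 = 3 bytes are written
// by the tail DAG.getMemset call above.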
SDValue
X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
                                           SDValue Chain, SDValue Dst, SDValue Src,
                                           SDValue Size, unsigned Align,
                                           bool AlwaysInline,
                                           const Value *DstSV, uint64_t DstSVOff,
                                           const Value *SrcSV, uint64_t SrcSVOff) {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
    return SDValue();

  /// If not DWORD aligned, call the library.
  if ((Align & 3) != 0)
    return SDValue();

  // DWORD aligned
  MVT AVT = MVT::i32;
  if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned
    AVT = MVT::i64;

  unsigned UBytes = AVT.getSizeInBits() / 8;
  unsigned CountVal = SizeVal / UBytes;
  SDValue Count = DAG.getIntPtrConstant(CountVal);
  unsigned BytesLeft = SizeVal % UBytes;

  SDValue InFlag(0, 0);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                             X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
                                                             X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
                                                             X86::ESI,
                           Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(DAG.getValueType(AVT));
  Ops.push_back(InFlag);
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size());

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    MVT DstVT = Dst.getValueType();
    MVT SrcVT = Src.getValueType();
    MVT SizeVT = Size.getValueType();
    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, SrcVT)),
                                    DAG.getConstant(BytesLeft, SizeVT),
                                    Align, AlwaysInline,
                                    DstSV, DstSVOff + Offset,
                                    SrcSV, SrcSVOff + Offset));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     &Results[0], Results.size());
}
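
// The copy expansion mirrors memset: a 19-byte DWORD-aligned memcpy becomes a
// count of 4 for "rep movs" (16 bytes) plus a recursive 3-byte memcpy at
// offset 16 for the tail.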
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  DebugLoc dl = Op.getDebugLoc();

  if (!Subtarget->is64Bit()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
                               DAG.getConstant(VarArgsGPOffset, MVT::i32),
                               FIN, SV, 0);
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  Store = DAG.getStore(Op.getOperand(0), dl,
                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
                       FIN, SV, 0);
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(8));
  SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     &MemOps[0], MemOps.size());
}
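
// For reference, the fields stored above match the x86-64 ABI va_list:
//   struct __va_list_tag {
//     unsigned gp_offset;        // offset 0
//     unsigned fp_offset;        // offset 4
//     void *overflow_arg_area;   // offset 8
//     void *reg_save_area;       // offset 16
//   };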
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  SDValue SrcSV = Op.getOperand(2);

  assert(0 && "VAArgInst is not yet implemented for x86-64!");
  abort();
  return SDValue();
}
SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, i.e. 24 bytes.
  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  DebugLoc dl = Op.getDebugLoc();

  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24), 8, false,
                       DstSV, 0, SrcSV, 0);
}
SDValue
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  // Comparison intrinsics.
  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomineq_sd: {
    unsigned Opc = 0;
    ISD::CondCode CC = ISD::SETCC_INVALID;
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse_comieq_ss:
    case Intrinsic::x86_sse2_comieq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_comilt_ss:
    case Intrinsic::x86_sse2_comilt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_comile_ss:
    case Intrinsic::x86_sse2_comile_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_comigt_ss:
    case Intrinsic::x86_sse2_comigt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_comige_ss:
    case Intrinsic::x86_sse2_comige_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_comineq_ss:
    case Intrinsic::x86_sse2_comineq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETNE;
      break;
    case Intrinsic::x86_sse_ucomieq_ss:
    case Intrinsic::x86_sse2_ucomieq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_ucomilt_ss:
    case Intrinsic::x86_sse2_ucomilt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_ucomile_ss:
    case Intrinsic::x86_sse2_ucomile_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_ucomigt_ss:
    case Intrinsic::x86_sse2_ucomigt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_ucomige_ss:
    case Intrinsic::x86_sse2_ucomige_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_ucomineq_ss:
    case Intrinsic::x86_sse2_ucomineq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETNE;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86CC, MVT::i8), Cond);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
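
  // E.g. the comieq_ss intrinsic becomes a COMI node with SETEQ here, i.e. a
  // flag-setting compare followed by a flag-based SETCC, with the i8 result
  // zero-extended to the i32 the intrinsic returns.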
  // Fix vector shift instructions where the last operand is a non-immediate
  // i32 value.
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_mmx_pslli_w:
  case Intrinsic::x86_mmx_pslli_d:
  case Intrinsic::x86_mmx_pslli_q:
  case Intrinsic::x86_mmx_psrli_w:
  case Intrinsic::x86_mmx_psrli_d:
  case Intrinsic::x86_mmx_psrli_q:
  case Intrinsic::x86_mmx_psrai_w:
  case Intrinsic::x86_mmx_psrai_d: {
    SDValue ShAmt = Op.getOperand(2);
    if (isa<ConstantSDNode>(ShAmt))
      return SDValue();

    unsigned NewIntNo = 0;
    MVT ShAmtVT = MVT::v4i32;
    switch (IntNo) {
    case Intrinsic::x86_sse2_pslli_w:
      NewIntNo = Intrinsic::x86_sse2_psll_w;
      break;
    case Intrinsic::x86_sse2_pslli_d:
      NewIntNo = Intrinsic::x86_sse2_psll_d;
      break;
    case Intrinsic::x86_sse2_pslli_q:
      NewIntNo = Intrinsic::x86_sse2_psll_q;
      break;
    case Intrinsic::x86_sse2_psrli_w:
      NewIntNo = Intrinsic::x86_sse2_psrl_w;
      break;
    case Intrinsic::x86_sse2_psrli_d:
      NewIntNo = Intrinsic::x86_sse2_psrl_d;
      break;
    case Intrinsic::x86_sse2_psrli_q:
      NewIntNo = Intrinsic::x86_sse2_psrl_q;
      break;
    case Intrinsic::x86_sse2_psrai_w:
      NewIntNo = Intrinsic::x86_sse2_psra_w;
      break;
    case Intrinsic::x86_sse2_psrai_d:
      NewIntNo = Intrinsic::x86_sse2_psra_d;
      break;
    default: {
      ShAmtVT = MVT::v2i32;
      switch (IntNo) {
      case Intrinsic::x86_mmx_pslli_w:
        NewIntNo = Intrinsic::x86_mmx_psll_w;
        break;
      case Intrinsic::x86_mmx_pslli_d:
        NewIntNo = Intrinsic::x86_mmx_psll_d;
        break;
      case Intrinsic::x86_mmx_pslli_q:
        NewIntNo = Intrinsic::x86_mmx_psll_q;
        break;
      case Intrinsic::x86_mmx_psrli_w:
        NewIntNo = Intrinsic::x86_mmx_psrl_w;
        break;
      case Intrinsic::x86_mmx_psrli_d:
        NewIntNo = Intrinsic::x86_mmx_psrl_d;
        break;
      case Intrinsic::x86_mmx_psrli_q:
        NewIntNo = Intrinsic::x86_mmx_psrl_q;
        break;
      case Intrinsic::x86_mmx_psrai_w:
        NewIntNo = Intrinsic::x86_mmx_psra_w;
        break;
      case Intrinsic::x86_mmx_psrai_d:
        NewIntNo = Intrinsic::x86_mmx_psra_d;
        break;
      default: abort();  // Can't reach here.
      }
      break;
    }
    }
    MVT VT = Op.getValueType();
    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(NewIntNo, MVT::i32),
                       Op.getOperand(1), ShAmt);
  }
  }
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  DebugLoc dl = Op.getDebugLoc();

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
      DAG.getConstant(TD->getPointerSize(),
                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                   FrameAddr, Offset),
                       NULL, 0);
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                     RetAddrFI, NULL, 0);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setFrameAddressIsTaken(true);
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
  return FrameAddr;
}
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) {
  return DAG.getIntPtrConstant(2*TD->getPointerSize());
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
{
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain   = Op.getOperand(0);
  SDValue Offset  = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  DebugLoc dl     = Op.getDebugLoc();

  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
                                  getPointerTy());
  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);

  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0);
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
  MF.getRegInfo().addLiveOut(StoreAddrReg);

  return DAG.getNode(X86ISD::EH_RETURN, dl,
                     MVT::Other,
                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}
SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                           SelectionDAG &DAG) {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  DebugLoc dl  = Op.getDebugLoc();

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  const X86InstrInfo *TII =
    ((X86TargetMachine&)getTargetMachine()).getInstrInfo();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
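    // The 23-byte trampoline emitted below is laid out as:
    //   0:  49 bb <imm64>   movabsq $fptr, %r11   (opcode at 0, imm at 2)
    //   10: 49 ba <imm64>   movabsq $nest, %r10   (opcode at 10, imm at 12)
    //   20: 49 ff e3        jmpq   *%r11          (opcode at 20, modrm at 22)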
    const unsigned char JMP64r  = TII->getBaseOpcodeFor(X86::JMP64r);
    const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);

    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 10);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 20);

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                TrmpAddr, 22);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
    return DAG.getMergeValues(Ops, 2, dl);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    unsigned CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      assert(0 && "Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      const FunctionType *FTy = Func->getFunctionType();
      const AttrListPtr &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        if (InRegCount > 2) {
          cerr << "Nest register in use - reduce number of inreg parameters!\n";
          abort();
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }
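
    // The 10-byte trampoline emitted below is laid out as:
    //   0: b8+r <imm32>   movl $nest, %eax/%ecx  (opcode at 0, imm at 1)
    //   5: e9 <rel32>     jmp  fptr              (opcode at 5, rel32 at 6)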
    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, TrmpAddr, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);

    const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                TrmpAddr, 5, false, 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
    return DAG.getMergeValues(Ops, 2, dl);
  }
}
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
                              DAG.getEntryNode(), StackSlot);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x800, MVT::i16)),
                DAG.getConstant(11, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x400, MVT::i16)),
                DAG.getConstant(9, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, dl, MVT::i16,
                DAG.getNode(ISD::ADD, dl, MVT::i16,
                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, MVT::i16)),
                DAG.getConstant(3, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}
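
// Checking the formula against one mode: with RC bits 11:10 = 01 (round
// toward -inf), (FPSR & 0x800) >> 11 = 0 and (FPSR & 0x400) >> 9 = 2, so
// ((0 | 2) + 1) & 3 = 3, which is exactly the FLT_ROUNDS encoding for
// round-to-negative-infinity.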
SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If src is zero (i.e. bsr sets ZF), returns NumBits.
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(Op);
  Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT));
  Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
  Ops.push_back(Op.getValue(1));
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}
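
// The xor works because bsr returns the index of the highest set bit and
// ctlz(x) = (NumBits-1) - bsr(x): e.g. for i32 x = 0x10, bsr gives 4 and
// 4 ^ 31 = 27 leading zeros. The CMOV feeds 2*NumBits-1 through the same xor
// so that x = 0 yields NumBits.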
SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsf.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(Op);
  Ops.push_back(DAG.getConstant(NumBits, OpVT));
  Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
  Ops.push_back(Op.getValue(1));
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}
SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
  DebugLoc dl = Op.getDebugLoc();

  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
  //
  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
  //  return AloBlo + AloBhi + AhiBlo;

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       A, DAG.getConstant(32, MVT::i32));
  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       B, DAG.getConstant(32, MVT::i32));
  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, B);
  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, Bhi);
  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       Ahi, B);
  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AloBhi, DAG.getConstant(32, MVT::i32));
  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AhiBlo, DAG.getConstant(32, MVT::i32));
  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  return Res;
}
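
// This is schoolbook multiplication on 32-bit digits: writing
// a = 2^32*Ahi + Alo and b = 2^32*Bhi + Blo,
//   a*b mod 2^64 = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo)
// since the Ahi*Bhi term is shifted entirely out of the low 64 bits.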
SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  unsigned Cond = 0;
  DebugLoc dl = Op.getDebugLoc();

  switch (Op.getOpcode()) {
  default: assert(0 && "Unknown ovf instruction!");
  case ISD::SADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO:
    BaseOp = X86ISD::UMUL;
    Cond = X86::COND_B;
    break;
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);

  SDValue SetCC =
    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));

  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
  return Sum;
}
6571 SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
6572 MVT T = Op.getValueType();
6573 DebugLoc dl = Op.getDebugLoc();
6574 unsigned Reg = 0;
6575 unsigned size = 0;
6576 switch(T.getSimpleVT()) {
6577 default:
6578 assert(false && "Invalid value type!");
6579 case MVT::i8: Reg = X86::AL; size = 1; break;
6580 case MVT::i16: Reg = X86::AX; size = 2; break;
6581 case MVT::i32: Reg = X86::EAX; size = 4; break;
6582 case MVT::i64:
6583 assert(Subtarget->is64Bit() && "Node not type legal!");
6584 Reg = X86::RAX; size = 8;
6585 break;
6586 }
6587 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
6588 Op.getOperand(2), SDValue());
6589 SDValue Ops[] = { cpIn.getValue(0),
6590 Op.getOperand(1),
6591 Op.getOperand(3),
6592 DAG.getTargetConstant(size, MVT::i8),
6593 cpIn.getValue(1) };
6594 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6595 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
6596 SDValue cpOut =
6597 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
6598 return cpOut;
6599 }
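// The LCMPXCHG_DAG node ultimately becomes a single locked cmpxchg; the
// expected value travels in AL/AX/EAX/RAX and the instruction leaves the
// value observed in memory there, which the CopyFromReg above recovers.
// Roughly, for i32 (an illustrative sketch):
//   mov eax, <cmp> ; lock cmpxchg dword ptr [addr], <swap> ; result in eax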
6601 SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
6602 SelectionDAG &DAG) {
6603 assert(Subtarget->is64Bit() && "Result not type legalized?");
6604 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6605 SDValue TheChain = Op.getOperand(0);
6606 DebugLoc dl = Op.getDebugLoc();
6607 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
6608 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
6609 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
6610 rax.getValue(2));
6611 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
6612 DAG.getConstant(32, MVT::i8));
6613 SDValue Ops[] = {
6614 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
6615 rdx.getValue(1)
6616 };
6617 return DAG.getMergeValues(Ops, 2, dl);
6618 }
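// rdtsc splits the 64-bit counter across EDX:EAX (zero-extended into
// RDX/RAX in 64-bit mode), so the result is rebuilt as (rdx << 32) | rax.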
6620 SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
6621 SDNode *Node = Op.getNode();
6622 DebugLoc dl = Node->getDebugLoc();
6623 MVT T = Node->getValueType(0);
6624 SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
6625 DAG.getConstant(0, T), Node->getOperand(2));
6626 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
6627 cast<AtomicSDNode>(Node)->getMemoryVT(),
6628 Node->getOperand(0),
6629 Node->getOperand(1), negOp,
6630 cast<AtomicSDNode>(Node)->getSrcValue(),
6631 cast<AtomicSDNode>(Node)->getAlignment());
6632 }
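// There is no locked subtract that also returns the old value, so the
// subtraction is folded into the operand instead: atomic "sub [p], x" is
// rewritten as atomic "add [p], 0-x", which the ATOMIC_LOAD_ADD lowering
// can then emit as, e.g., a single lock xadd.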
6634 /// LowerOperation - Provide custom lowering hooks for some operations.
6636 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
6637 switch (Op.getOpcode()) {
6638 default: assert(0 && "Should not custom lower this!");
6639 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
6640 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
6641 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
6642 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
6643 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6644 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
6645 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
6646 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
6647 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
6648 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
6649 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
6650 case ISD::SHL_PARTS:
6651 case ISD::SRA_PARTS:
6652 case ISD::SRL_PARTS: return LowerShift(Op, DAG);
6653 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
6654 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
6655 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
6656 case ISD::FABS: return LowerFABS(Op, DAG);
6657 case ISD::FNEG: return LowerFNEG(Op, DAG);
6658 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
6659 case ISD::SETCC: return LowerSETCC(Op, DAG);
6660 case ISD::VSETCC: return LowerVSETCC(Op, DAG);
6661 case ISD::SELECT: return LowerSELECT(Op, DAG);
6662 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
6663 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
6664 case ISD::CALL: return LowerCALL(Op, DAG);
6665 case ISD::RET: return LowerRET(Op, DAG);
6666 case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
6667 case ISD::VASTART: return LowerVASTART(Op, DAG);
6668 case ISD::VAARG: return LowerVAARG(Op, DAG);
6669 case ISD::VACOPY: return LowerVACOPY(Op, DAG);
6670 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6671 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
6672 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
6673 case ISD::FRAME_TO_ARGS_OFFSET:
6674 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
6675 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
6676 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
6677 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
6678 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
6679 case ISD::CTLZ: return LowerCTLZ(Op, DAG);
6680 case ISD::CTTZ: return LowerCTTZ(Op, DAG);
6681 case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
6682 case ISD::SADDO:
6683 case ISD::UADDO:
6684 case ISD::SSUBO:
6685 case ISD::USUBO:
6686 case ISD::SMULO:
6687 case ISD::UMULO: return LowerXALUO(Op, DAG);
6688 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
6689 }
6690 }
6692 void X86TargetLowering::
6693 ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
6694 SelectionDAG &DAG, unsigned NewOp) {
6695 MVT T = Node->getValueType(0);
6696 DebugLoc dl = Node->getDebugLoc();
6697 assert (T == MVT::i64 && "Only know how to expand i64 atomics");
6699 SDValue Chain = Node->getOperand(0);
6700 SDValue In1 = Node->getOperand(1);
6701 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6702 Node->getOperand(2), DAG.getIntPtrConstant(0));
6703 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6704 Node->getOperand(2), DAG.getIntPtrConstant(1));
6705 // This is a generalized SDNode, not an AtomicSDNode, so it doesn't
6706 // have a MemOperand. Pass the info through as a normal operand.
6707 SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand());
6708 SDValue Ops[] = { Chain, In1, In2L, In2H, LSI };
6709 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
6710 SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5);
6711 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
6712 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
6713 Results.push_back(Result.getValue(2));
6714 }
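// The pseudo produces the low and high halves of the old value as two i32
// results (in EAX and EDX after cmpxchg8b); BUILD_PAIR above re-glues them
// into the i64 the type legalizer expects.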
6716 /// ReplaceNodeResults - Replace a node with an illegal result type
6717 /// with a new node built out of custom code.
6718 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
6719 SmallVectorImpl<SDValue>&Results,
6720 SelectionDAG &DAG) {
6721 DebugLoc dl = N->getDebugLoc();
6722 switch (N->getOpcode()) {
6723 default:
6724 assert(false && "Do not know how to custom type legalize this operation!");
6725 return;
6726 case ISD::FP_TO_SINT: {
6727 std::pair<SDValue,SDValue> Vals = FP_TO_SINTHelper(SDValue(N, 0), DAG);
6728 SDValue FIST = Vals.first, StackSlot = Vals.second;
6729 if (FIST.getNode() != 0) {
6730 MVT VT = N->getValueType(0);
6731 // Return a load from the stack slot.
6732 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
6733 }
6734 return;
6735 }
6736 case ISD::READCYCLECOUNTER: {
6737 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6738 SDValue TheChain = N->getOperand(0);
6739 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
6740 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
6741 rd.getValue(1));
6742 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
6743 eax.getValue(2));
6744 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
6745 SDValue Ops[] = { eax, edx };
6746 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
6747 Results.push_back(edx.getValue(1));
6748 return;
6749 }
6750 case ISD::ATOMIC_CMP_SWAP: {
6751 MVT T = N->getValueType(0);
6752 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
6753 SDValue cpInL, cpInH;
6754 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
6755 DAG.getConstant(0, MVT::i32));
6756 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
6757 DAG.getConstant(1, MVT::i32));
6758 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
6759 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
6760 cpInL.getValue(1));
6761 SDValue swapInL, swapInH;
6762 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
6763 DAG.getConstant(0, MVT::i32));
6764 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
6765 DAG.getConstant(1, MVT::i32));
6766 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
6767 cpInH.getValue(1));
6768 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
6769 swapInL.getValue(1));
6770 SDValue Ops[] = { swapInH.getValue(0),
6771 N->getOperand(1),
6772 swapInH.getValue(1) };
6773 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6774 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
6775 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
6776 MVT::i32, Result.getValue(1));
6777 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
6778 MVT::i32, cpOutL.getValue(2));
6779 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
6780 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
6781 Results.push_back(cpOutH.getValue(1));
6782 return;
6783 }
6784 case ISD::ATOMIC_LOAD_ADD:
6785 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
6786 return;
6787 case ISD::ATOMIC_LOAD_AND:
6788 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
6789 return;
6790 case ISD::ATOMIC_LOAD_NAND:
6791 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
6792 return;
6793 case ISD::ATOMIC_LOAD_OR:
6794 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
6795 return;
6796 case ISD::ATOMIC_LOAD_SUB:
6797 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
6798 return;
6799 case ISD::ATOMIC_LOAD_XOR:
6800 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
6801 return;
6802 case ISD::ATOMIC_SWAP:
6803 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
6804 return;
6805 }
6806 }
6808 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
6809 switch (Opcode) {
6810 default: return NULL;
6811 case X86ISD::BSF: return "X86ISD::BSF";
6812 case X86ISD::BSR: return "X86ISD::BSR";
6813 case X86ISD::SHLD: return "X86ISD::SHLD";
6814 case X86ISD::SHRD: return "X86ISD::SHRD";
6815 case X86ISD::FAND: return "X86ISD::FAND";
6816 case X86ISD::FOR: return "X86ISD::FOR";
6817 case X86ISD::FXOR: return "X86ISD::FXOR";
6818 case X86ISD::FSRL: return "X86ISD::FSRL";
6819 case X86ISD::FILD: return "X86ISD::FILD";
6820 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
6821 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
6822 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
6823 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
6824 case X86ISD::FLD: return "X86ISD::FLD";
6825 case X86ISD::FST: return "X86ISD::FST";
6826 case X86ISD::CALL: return "X86ISD::CALL";
6827 case X86ISD::TAILCALL: return "X86ISD::TAILCALL";
6828 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
6829 case X86ISD::BT: return "X86ISD::BT";
6830 case X86ISD::CMP: return "X86ISD::CMP";
6831 case X86ISD::COMI: return "X86ISD::COMI";
6832 case X86ISD::UCOMI: return "X86ISD::UCOMI";
6833 case X86ISD::SETCC: return "X86ISD::SETCC";
6834 case X86ISD::CMOV: return "X86ISD::CMOV";
6835 case X86ISD::BRCOND: return "X86ISD::BRCOND";
6836 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
6837 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
6838 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
6839 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
6840 case X86ISD::Wrapper: return "X86ISD::Wrapper";
6841 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
6842 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
6843 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
6844 case X86ISD::PINSRB: return "X86ISD::PINSRB";
6845 case X86ISD::PINSRW: return "X86ISD::PINSRW";
6846 case X86ISD::FMAX: return "X86ISD::FMAX";
6847 case X86ISD::FMIN: return "X86ISD::FMIN";
6848 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
6849 case X86ISD::FRCP: return "X86ISD::FRCP";
6850 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
6851 case X86ISD::THREAD_POINTER: return "X86ISD::THREAD_POINTER";
6852 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
6853 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
6854 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
6855 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
6856 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
6857 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
6858 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
6859 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
6860 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
6861 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
6862 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
6863 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
6864 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
6865 case X86ISD::VSHL: return "X86ISD::VSHL";
6866 case X86ISD::VSRL: return "X86ISD::VSRL";
6867 case X86ISD::CMPPD: return "X86ISD::CMPPD";
6868 case X86ISD::CMPPS: return "X86ISD::CMPPS";
6869 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
6870 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
6871 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
6872 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
6873 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
6874 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
6875 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
6876 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
6877 case X86ISD::ADD: return "X86ISD::ADD";
6878 case X86ISD::SUB: return "X86ISD::SUB";
6879 case X86ISD::SMUL: return "X86ISD::SMUL";
6880 case X86ISD::UMUL: return "X86ISD::UMUL";
6881 }
6882 }
6884 // isLegalAddressingMode - Return true if the addressing mode represented
6885 // by AM is legal for this target, for a load/store of the specified type.
6886 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
6887 const Type *Ty) const {
6888 // X86 supports extremely general addressing modes.
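// Roughly, the most general form accepted below is
//   [BaseReg + IndexReg*Scale + Disp32 (+ GlobalAddress)]
// where the hardware scales are 1, 2, 4 and 8; scales 3, 5 and 9 are also
// accepted when the base register slot is free, since the index register
// itself can serve as the extra addend (lea-style).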
6890 // X86 allows a sign-extended 32-bit immediate field as a displacement.
6891 if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1)
6892 return false;
6894 if (AM.BaseGV) {
6895 // We can only fold this if we don't need an extra load.
6896 if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
6897 return false;
6898 // If BaseGV requires a register, we cannot also have a BaseReg.
6899 if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) &&
6900 AM.HasBaseReg)
6901 return false;
6903 // X86-64 only supports addr of globals in small code model.
6904 if (Subtarget->is64Bit()) {
6905 if (getTargetMachine().getCodeModel() != CodeModel::Small)
6906 return false;
6907 // If lower 4G is not available, then we must use rip-relative addressing.
6908 if (AM.BaseOffs || AM.Scale > 1)
6909 return false;
6910 }
6911 }
6913 switch (AM.Scale) {
6914 case 0:
6915 case 1:
6916 case 2:
6917 case 4:
6918 case 8:
6919 // These scales always work.
6920 break;
6921 case 3:
6922 case 5:
6923 case 9:
6924 // These scales are formed with basereg+scalereg. Only accept if there is
6925 // no basereg yet.
6926 if (AM.HasBaseReg)
6927 return false;
6928 break;
6929 default: // Other stuff never works.
6930 return false;
6931 }
6933 return true;
6934 }
6937 bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
6938 if (!Ty1->isInteger() || !Ty2->isInteger())
6939 return false;
6940 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
6941 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
6942 if (NumBits1 <= NumBits2)
6943 return false;
6944 return Subtarget->is64Bit() || NumBits1 < 64;
6945 }
6947 bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
6948 if (!VT1.isInteger() || !VT2.isInteger())
6949 return false;
6950 unsigned NumBits1 = VT1.getSizeInBits();
6951 unsigned NumBits2 = VT2.getSizeInBits();
6952 if (NumBits1 <= NumBits2)
6953 return false;
6954 return Subtarget->is64Bit() || NumBits1 < 64;
6955 }
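// e.g. truncating i64 to i32 on x86-64 is free: the result is just the low
// 32-bit subregister, so no instruction needs to be emitted.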
6957 /// isShuffleMaskLegal - Targets can use this to indicate that they only
6958 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
6959 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
6960 /// are assumed to be legal.
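/// For instance, a v4i32 mask of <0,4,1,5> matches isUNPCKLMask below and
/// stays a single unpcklps/punpckldq, so it is reported as legal.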
6961 bool
6962 X86TargetLowering::isShuffleMaskLegal(SDValue Mask, MVT VT) const {
6963 // Only do shuffles on 128-bit vector types for now.
6964 if (VT.getSizeInBits() == 64) return false;
6965 return (Mask.getNode()->getNumOperands() <= 4 ||
6966 isIdentityMask(Mask.getNode()) ||
6967 isIdentityMask(Mask.getNode(), true) ||
6968 isSplatMask(Mask.getNode()) ||
6969 isPSHUFHW_PSHUFLWMask(Mask.getNode()) ||
6970 X86::isUNPCKLMask(Mask.getNode()) ||
6971 X86::isUNPCKHMask(Mask.getNode()) ||
6972 X86::isUNPCKL_v_undef_Mask(Mask.getNode()) ||
6973 X86::isUNPCKH_v_undef_Mask(Mask.getNode()));
6974 }
6976 bool
6977 X86TargetLowering::isVectorClearMaskLegal(const std::vector<SDValue> &BVOps,
6978 MVT EVT, SelectionDAG &DAG) const {
6979 unsigned NumElts = BVOps.size();
6980 // Only do shuffles on 128-bit vector types for now.
6981 if (EVT.getSizeInBits() * NumElts == 64) return false;
6982 if (NumElts == 2) return true;
6983 if (NumElts == 4) {
6984 return (isMOVLMask(&BVOps[0], 4) ||
6985 isCommutedMOVL(&BVOps[0], 4, true) ||
6986 isSHUFPMask(&BVOps[0], 4) ||
6987 isCommutedSHUFP(&BVOps[0], 4));
6988 }
6989 return false;
6990 }
6992 //===----------------------------------------------------------------------===//
6993 // X86 Scheduler Hooks
6994 //===----------------------------------------------------------------------===//
6996 // private utility function
6997 MachineBasicBlock *
6998 X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
6999 MachineBasicBlock *MBB,
7000 unsigned regOpc,
7001 unsigned immOpc,
7002 unsigned LoadOpc,
7003 unsigned CXchgOpc,
7004 unsigned copyOpc,
7005 unsigned notOpc,
7006 unsigned EAXreg,
7007 TargetRegisterClass *RC,
7008 bool invSrc) const {
7009 // For the atomic bitwise operator, we generate
7010 // thisMBB:
7011 // newMBB:
7012 // ld t1 = [bitinstr.addr]
7013 // op t2 = t1, [bitinstr.val]
7014 // mov EAX = t1
7015 // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
7016 // bz newMBB
7017 // fallthrough -->nextMBB
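//
// For ATOMAND32, for instance, the generated loop is roughly (illustrative):
//   newMBB: mov t1, dword ptr [addr]
//           and t2, t1, val
//           mov eax, t1
//           lock cmpxchg dword ptr [addr], t2 ; stores t2 only if [addr]==eax
//           jne newMBB                        ; raced with another writer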
7018 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7019 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7020 MachineFunction::iterator MBBIter = MBB;
7021 ++MBBIter;
7023 /// First build the CFG
7024 MachineFunction *F = MBB->getParent();
7025 MachineBasicBlock *thisMBB = MBB;
7026 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7027 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7028 F->insert(MBBIter, newMBB);
7029 F->insert(MBBIter, nextMBB);
7031 // Move all successors of thisMBB to nextMBB
7032 nextMBB->transferSuccessors(thisMBB);
7034 // Update thisMBB to fall through to newMBB
7035 thisMBB->addSuccessor(newMBB);
7037 // newMBB jumps to itself and falls through to nextMBB
7038 newMBB->addSuccessor(nextMBB);
7039 newMBB->addSuccessor(newMBB);
7041 // Insert instructions into newMBB based on incoming instruction
7042 assert(bInstr->getNumOperands() < 8 && "unexpected number of operands");
7043 DebugLoc dl = bInstr->getDebugLoc();
7044 MachineOperand& destOper = bInstr->getOperand(0);
7045 MachineOperand* argOpers[6];
7046 int numArgs = bInstr->getNumOperands() - 1;
7047 for (int i=0; i < numArgs; ++i)
7048 argOpers[i] = &bInstr->getOperand(i+1);
7050 // x86 address has 4 operands: base, index, scale, and displacement
7051 int lastAddrIndx = 3; // [0,3]
7054 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7055 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7056 for (int i=0; i <= lastAddrIndx; ++i)
7057 (*MIB).addOperand(*argOpers[i]);
7059 unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7060 if (invSrc) {
7061 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7062 }
7063 else
7064 tt = t1;
7065 int valArgIndx = lastAddrIndx + 1;
7066 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7067 assert((argOpers[valArgIndx]->isReg() ||
7068 argOpers[valArgIndx]->isImm()) &&
7069 "invalid operand");
7070 if (argOpers[valArgIndx]->isReg())
7071 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7072 else
7073 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7074 MIB.addReg(tt);
7075 (*MIB).addOperand(*argOpers[valArgIndx]);
7077 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7078 MIB.addReg(t1);
7080 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7081 for (int i=0; i <= lastAddrIndx; ++i)
7082 (*MIB).addOperand(*argOpers[i]);
7084 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7085 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7087 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7088 MIB.addReg(EAXreg);
7090 // insert branch
7091 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7093 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
7094 return nextMBB;
7095 }
7097 // private utility function: 64 bit atomics on 32 bit host.
7098 MachineBasicBlock *
7099 X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7100 MachineBasicBlock *MBB,
7101 unsigned regOpcL,
7102 unsigned regOpcH,
7103 unsigned immOpcL,
7104 unsigned immOpcH,
7105 bool invSrc) const {
7106 // For the atomic bitwise operator, we generate
7107 // thisMBB (instructions are in pairs, except cmpxchg8b)
7108 // ld t1,t2 = [bitinstr.addr]
7109 // newMBB:
7110 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7111 // op t5, t6 <- out1, out2, [bitinstr.val]
7112 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
7113 // mov ECX, EBX <- t5, t6
7114 // mov EAX, EDX <- t1, t2
7115 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
7116 // mov t3, t4 <- EAX, EDX
7117 // jnz newMBB
7118 // result in out1, out2
7119 // fallthrough -->nextMBB
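//
// cmpxchg8b compares EDX:EAX against the 8-byte memory operand; on a match
// it stores ECX:EBX, otherwise it reloads EDX:EAX from memory, giving a
// natural retry loop for any 64-bit read-modify-write on a 32-bit host.
// ATOMADD6432, for example, passes add/adc as the low/high opcode pair.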
7121 const TargetRegisterClass *RC = X86::GR32RegisterClass;
7122 const unsigned LoadOpc = X86::MOV32rm;
7123 const unsigned copyOpc = X86::MOV32rr;
7124 const unsigned NotOpc = X86::NOT32r;
7125 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7126 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7127 MachineFunction::iterator MBBIter = MBB;
7128 ++MBBIter;
7130 /// First build the CFG
7131 MachineFunction *F = MBB->getParent();
7132 MachineBasicBlock *thisMBB = MBB;
7133 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7134 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7135 F->insert(MBBIter, newMBB);
7136 F->insert(MBBIter, nextMBB);
7138 // Move all successors of thisMBB to nextMBB
7139 nextMBB->transferSuccessors(thisMBB);
7141 // Update thisMBB to fall through to newMBB
7142 thisMBB->addSuccessor(newMBB);
7144 // newMBB jumps to itself and falls through to nextMBB
7145 newMBB->addSuccessor(nextMBB);
7146 newMBB->addSuccessor(newMBB);
7148 DebugLoc dl = bInstr->getDebugLoc();
7149 // Insert instructions into newMBB based on incoming instruction
7150 // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
7151 assert(bInstr->getNumOperands() < 18 && "unexpected number of operands");
7152 MachineOperand& dest1Oper = bInstr->getOperand(0);
7153 MachineOperand& dest2Oper = bInstr->getOperand(1);
7154 MachineOperand* argOpers[6];
7155 for (int i=0; i < 6; ++i)
7156 argOpers[i] = &bInstr->getOperand(i+2);
7158 // x86 address has 4 operands: base, index, scale, and displacement
7159 int lastAddrIndx = 3; // [0,3]
7161 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7162 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
7163 for (int i=0; i <= lastAddrIndx; ++i)
7164 (*MIB).addOperand(*argOpers[i]);
7165 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7166 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
7167 // add 4 to displacement.
7168 for (int i=0; i <= lastAddrIndx-1; ++i)
7169 (*MIB).addOperand(*argOpers[i]);
7170 MachineOperand newOp3 = *(argOpers[3]);
7171 if (newOp3.isImm())
7172 newOp3.setImm(newOp3.getImm()+4);
7173 else
7174 newOp3.setOffset(newOp3.getOffset()+4);
7175 (*MIB).addOperand(newOp3);
7177 // t3/4 are defined later, at the bottom of the loop
7178 unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
7179 unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
7180 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
7181 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
7182 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
7183 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
7185 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC);
7186 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC);
7187 if (invSrc) {
7188 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1);
7189 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2);
7190 } else {
7191 tt1 = t1;
7192 tt2 = t2;
7193 }
7195 assert((argOpers[4]->isReg() || argOpers[4]->isImm()) &&
7196 "invalid operand");
7197 unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
7198 unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
7199 if (argOpers[4]->isReg())
7200 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
7201 else
7202 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
7203 if (regOpcL != X86::MOV32rr)
7204 MIB.addReg(tt1);
7205 (*MIB).addOperand(*argOpers[4]);
7206 assert(argOpers[5]->isReg() == argOpers[4]->isReg());
7207 assert(argOpers[5]->isImm() == argOpers[4]->isImm());
7208 if (argOpers[5]->isReg())
7209 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
7210 else
7211 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
7212 if (regOpcH != X86::MOV32rr)
7213 MIB.addReg(tt2);
7214 (*MIB).addOperand(*argOpers[5]);
7216 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
7217 MIB.addReg(t1);
7218 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
7219 MIB.addReg(t2);
7221 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
7222 MIB.addReg(t5);
7223 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
7224 MIB.addReg(t6);
7226 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
7227 for (int i=0; i <= lastAddrIndx; ++i)
7228 (*MIB).addOperand(*argOpers[i]);
7230 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7231 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7233 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
7234 MIB.addReg(X86::EAX);
7235 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
7236 MIB.addReg(X86::EDX);
7239 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7241 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
7242 return nextMBB;
7243 }
7245 // private utility function
7246 MachineBasicBlock *
7247 X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
7248 MachineBasicBlock *MBB,
7249 unsigned cmovOpc) const {
7250 // For the atomic min/max operator, we generate
7251 // thisMBB:
7252 // newMBB:
7253 // ld t1 = [min/max.addr]
7254 // mov t2 = [min/max.val]
7255 // cmp t1, t2
7256 // cmov[cond] t2 = t1
7257 // mov EAX = t1
7258 // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
7259 // bz newMBB
7260 // fallthrough -->nextMBB
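//
// For ATOMMIN32, for example, cmovOpc is CMOVL32rr: after "cmp t1, t2" the
// cmovl keeps the smaller of the loaded value and the operand, and the
// cmpxchg loop publishes it only if memory has not changed in the meantime.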
7262 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7263 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7264 MachineFunction::iterator MBBIter = MBB;
7265 ++MBBIter;
7267 /// First build the CFG
7268 MachineFunction *F = MBB->getParent();
7269 MachineBasicBlock *thisMBB = MBB;
7270 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7271 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7272 F->insert(MBBIter, newMBB);
7273 F->insert(MBBIter, nextMBB);
7276 // Move all successors of thisMBB to nextMBB
7276 nextMBB->transferSuccessors(thisMBB);
7278 // Update thisMBB to fall through to newMBB
7279 thisMBB->addSuccessor(newMBB);
7281 // newMBB jumps to itself and falls through to nextMBB
7282 newMBB->addSuccessor(nextMBB);
7283 newMBB->addSuccessor(newMBB);
7285 DebugLoc dl = mInstr->getDebugLoc();
7286 // Insert instructions into newMBB based on incoming instruction
7287 assert(mInstr->getNumOperands() < 8 && "unexpected number of operands");
7288 MachineOperand& destOper = mInstr->getOperand(0);
7289 MachineOperand* argOpers[6];
7290 int numArgs = mInstr->getNumOperands() - 1;
7291 for (int i=0; i < numArgs; ++i)
7292 argOpers[i] = &mInstr->getOperand(i+1);
7294 // x86 address has 4 operands: base, index, scale, and displacement
7295 int lastAddrIndx = 3; // [0,3]
7296 int valArgIndx = lastAddrIndx + 1;
7298 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7299 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
7300 for (int i=0; i <= lastAddrIndx; ++i)
7301 (*MIB).addOperand(*argOpers[i]);
7303 // We only support register and immediate values
7304 assert((argOpers[valArgIndx]->isReg() ||
7305 argOpers[valArgIndx]->isImm()) &&
7306 "invalid operand");
7308 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7309 if (argOpers[valArgIndx]->isReg())
7310 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7311 else
7312 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7313 (*MIB).addOperand(*argOpers[valArgIndx]);
7315 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
7316 MIB.addReg(t1);
7318 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
7319 MIB.addReg(t1);
7320 MIB.addReg(t2);
7322 // Generate movc
7323 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7324 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
7325 MIB.addReg(t2);
7326 MIB.addReg(t1);
7328 // Cmp and exchange if none has modified the memory location
7329 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
7330 for (int i=0; i <= lastAddrIndx; ++i)
7331 (*MIB).addOperand(*argOpers[i]);
7333 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7334 (*MIB).addMemOperand(*F, *mInstr->memoperands_begin());
7336 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
7337 MIB.addReg(X86::EAX);
7339 // insert branch
7340 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7342 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now.
7343 return nextMBB;
7344 }
7347 MachineBasicBlock *
7348 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
7349 MachineBasicBlock *BB) const {
7350 DebugLoc dl = MI->getDebugLoc();
7351 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7352 switch (MI->getOpcode()) {
7353 default: assert(false && "Unexpected instr type to insert");
7354 case X86::CMOV_V1I64:
7355 case X86::CMOV_FR32:
7356 case X86::CMOV_FR64:
7357 case X86::CMOV_V4F32:
7358 case X86::CMOV_V2F64:
7359 case X86::CMOV_V2I64: {
7360 // To "insert" a SELECT_CC instruction, we actually have to insert the
7361 // diamond control-flow pattern. The incoming instruction knows the
7362 // destination vreg to set, the condition code register to branch on, the
7363 // true/false values to select between, and a branch opcode to use.
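// Schematically (an illustrative sketch):
//   thisMBB:  jCC sinkMBB            ; condition true: keep TrueValue
//   copy0MBB: (empty) fallthrough    ; FalseValue path
//   sinkMBB:  %dst = phi [FalseValue, copy0MBB], [TrueValue, thisMBB]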
7364 const BasicBlock *LLVM_BB = BB->getBasicBlock();
7365 MachineFunction::iterator It = BB;
7366 ++It;
7368 // thisMBB:
7369 // ...
7370 // TrueVal = ...
7371 // cmpTY ccX, r1, r2
7372 // bCC copy1MBB
7373 // fallthrough --> copy0MBB
7374 MachineBasicBlock *thisMBB = BB;
7375 MachineFunction *F = BB->getParent();
7376 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
7377 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
7378 unsigned Opc =
7379 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
7380 BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
7381 F->insert(It, copy0MBB);
7382 F->insert(It, sinkMBB);
7383 // Update machine-CFG edges by transferring all successors of the current
7384 // block to the new block which will contain the Phi node for the select.
7385 sinkMBB->transferSuccessors(BB);
7387 // Add the true and fallthrough blocks as its successors.
7388 BB->addSuccessor(copy0MBB);
7389 BB->addSuccessor(sinkMBB);
7391 // copy0MBB:
7392 // %FalseValue = ...
7393 // # fallthrough to sinkMBB
7394 BB = copy0MBB;
7396 // Update machine-CFG edges
7397 BB->addSuccessor(sinkMBB);
7399 // sinkMBB:
7400 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
7401 // ...
7402 BB = sinkMBB;
7403 BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg())
7404 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
7405 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
7407 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
7408 return BB;
7409 }
7411 case X86::FP32_TO_INT16_IN_MEM:
7412 case X86::FP32_TO_INT32_IN_MEM:
7413 case X86::FP32_TO_INT64_IN_MEM:
7414 case X86::FP64_TO_INT16_IN_MEM:
7415 case X86::FP64_TO_INT32_IN_MEM:
7416 case X86::FP64_TO_INT64_IN_MEM:
7417 case X86::FP80_TO_INT16_IN_MEM:
7418 case X86::FP80_TO_INT32_IN_MEM:
7419 case X86::FP80_TO_INT64_IN_MEM: {
7420 // Change the floating point control register to use "round towards zero"
7421 // mode when truncating to an integer value.
7422 MachineFunction *F = BB->getParent();
7423 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
7424 addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx);
7426 // Load the old value of the high byte of the control word...
7427 unsigned OldCW =
7428 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
7429 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW),
7430 CWFrameIdx);
7432 // Set the high part to be round to zero...
7433 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx)
7434 .addImm(0xC7F);
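// 0xC7F sets the rounding-control field (bits 11:10) of the x87 control
// word to 11b, i.e. truncate toward zero, and keeps all exceptions masked;
// this is the rounding C requires for float-to-integer casts.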
7436 // Reload the modified control word now...
7437 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
7439 // Restore the memory image of control word to original value
7440 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx)
7441 .addReg(OldCW);
7443 // Get the X86 opcode to use.
7444 unsigned Opc;
7445 switch (MI->getOpcode()) {
7446 default: assert(0 && "illegal opcode!");
7447 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
7448 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
7449 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
7450 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
7451 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
7452 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
7453 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
7454 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
7455 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
7456 }
7458 X86AddressMode AM;
7459 MachineOperand &Op = MI->getOperand(0);
7460 if (Op.isReg()) {
7461 AM.BaseType = X86AddressMode::RegBase;
7462 AM.Base.Reg = Op.getReg();
7463 } else {
7464 AM.BaseType = X86AddressMode::FrameIndexBase;
7465 AM.Base.FrameIndex = Op.getIndex();
7466 }
7467 Op = MI->getOperand(1);
7468 if (Op.isImm())
7469 AM.Scale = Op.getImm();
7470 Op = MI->getOperand(2);
7471 if (Op.isImm())
7472 AM.IndexReg = Op.getImm();
7473 Op = MI->getOperand(3);
7474 if (Op.isGlobal()) {
7475 AM.GV = Op.getGlobal();
7476 } else {
7477 AM.Disp = Op.getImm();
7478 }
7479 addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM)
7480 .addReg(MI->getOperand(4).getReg());
7482 // Reload the original control word now.
7483 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
7485 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
7486 return BB;
7487 }
7488 case X86::ATOMAND32:
7489 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
7490 X86::AND32ri, X86::MOV32rm,
7491 X86::LCMPXCHG32, X86::MOV32rr,
7492 X86::NOT32r, X86::EAX,
7493 X86::GR32RegisterClass);
7494 case X86::ATOMOR32:
7495 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
7496 X86::OR32ri, X86::MOV32rm,
7497 X86::LCMPXCHG32, X86::MOV32rr,
7498 X86::NOT32r, X86::EAX,
7499 X86::GR32RegisterClass);
7500 case X86::ATOMXOR32:
7501 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
7502 X86::XOR32ri, X86::MOV32rm,
7503 X86::LCMPXCHG32, X86::MOV32rr,
7504 X86::NOT32r, X86::EAX,
7505 X86::GR32RegisterClass);
7506 case X86::ATOMNAND32:
7507 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
7508 X86::AND32ri, X86::MOV32rm,
7509 X86::LCMPXCHG32, X86::MOV32rr,
7510 X86::NOT32r, X86::EAX,
7511 X86::GR32RegisterClass, true);
7512 case X86::ATOMMIN32:
7513 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
7514 case X86::ATOMMAX32:
7515 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
7516 case X86::ATOMUMIN32:
7517 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
7518 case X86::ATOMUMAX32:
7519 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
7521 case X86::ATOMAND16:
7522 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
7523 X86::AND16ri, X86::MOV16rm,
7524 X86::LCMPXCHG16, X86::MOV16rr,
7525 X86::NOT16r, X86::AX,
7526 X86::GR16RegisterClass);
7527 case X86::ATOMOR16:
7528 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
7529 X86::OR16ri, X86::MOV16rm,
7530 X86::LCMPXCHG16, X86::MOV16rr,
7531 X86::NOT16r, X86::AX,
7532 X86::GR16RegisterClass);
7533 case X86::ATOMXOR16:
7534 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
7535 X86::XOR16ri, X86::MOV16rm,
7536 X86::LCMPXCHG16, X86::MOV16rr,
7537 X86::NOT16r, X86::AX,
7538 X86::GR16RegisterClass);
7539 case X86::ATOMNAND16:
7540 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
7541 X86::AND16ri, X86::MOV16rm,
7542 X86::LCMPXCHG16, X86::MOV16rr,
7543 X86::NOT16r, X86::AX,
7544 X86::GR16RegisterClass, true);
7545 case X86::ATOMMIN16:
7546 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
7547 case X86::ATOMMAX16:
7548 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
7549 case X86::ATOMUMIN16:
7550 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
7551 case X86::ATOMUMAX16:
7552 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
7554 case X86::ATOMAND8:
7555 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
7556 X86::AND8ri, X86::MOV8rm,
7557 X86::LCMPXCHG8, X86::MOV8rr,
7558 X86::NOT8r, X86::AL,
7559 X86::GR8RegisterClass);
7560 case X86::ATOMOR8:
7561 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
7562 X86::OR8ri, X86::MOV8rm,
7563 X86::LCMPXCHG8, X86::MOV8rr,
7564 X86::NOT8r, X86::AL,
7565 X86::GR8RegisterClass);
7566 case X86::ATOMXOR8:
7567 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
7568 X86::XOR8ri, X86::MOV8rm,
7569 X86::LCMPXCHG8, X86::MOV8rr,
7570 X86::NOT8r, X86::AL,
7571 X86::GR8RegisterClass);
7572 case X86::ATOMNAND8:
7573 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
7574 X86::AND8ri, X86::MOV8rm,
7575 X86::LCMPXCHG8, X86::MOV8rr,
7576 X86::NOT8r, X86::AL,
7577 X86::GR8RegisterClass, true);
7578 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
7579 // This group is for 64-bit host.
7580 case X86::ATOMAND64:
7581 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
7582 X86::AND64ri32, X86::MOV64rm,
7583 X86::LCMPXCHG64, X86::MOV64rr,
7584 X86::NOT64r, X86::RAX,
7585 X86::GR64RegisterClass);
7586 case X86::ATOMOR64:
7587 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
7588 X86::OR64ri32, X86::MOV64rm,
7589 X86::LCMPXCHG64, X86::MOV64rr,
7590 X86::NOT64r, X86::RAX,
7591 X86::GR64RegisterClass);
7592 case X86::ATOMXOR64:
7593 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
7594 X86::XOR64ri32, X86::MOV64rm,
7595 X86::LCMPXCHG64, X86::MOV64rr,
7596 X86::NOT64r, X86::RAX,
7597 X86::GR64RegisterClass);
7598 case X86::ATOMNAND64:
7599 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
7600 X86::AND64ri32, X86::MOV64rm,
7601 X86::LCMPXCHG64, X86::MOV64rr,
7602 X86::NOT64r, X86::RAX,
7603 X86::GR64RegisterClass, true);
7604 case X86::ATOMMIN64:
7605 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
7606 case X86::ATOMMAX64:
7607 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
7608 case X86::ATOMUMIN64:
7609 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
7610 case X86::ATOMUMAX64:
7611 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
7613 // This group does 64-bit operations on a 32-bit host.
7614 case X86::ATOMAND6432:
7615 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7616 X86::AND32rr, X86::AND32rr,
7617 X86::AND32ri, X86::AND32ri,
7618 false);
7619 case X86::ATOMOR6432:
7620 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7621 X86::OR32rr, X86::OR32rr,
7622 X86::OR32ri, X86::OR32ri,
7623 false);
7624 case X86::ATOMXOR6432:
7625 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7626 X86::XOR32rr, X86::XOR32rr,
7627 X86::XOR32ri, X86::XOR32ri,
7628 false);
7629 case X86::ATOMNAND6432:
7630 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7631 X86::AND32rr, X86::AND32rr,
7632 X86::AND32ri, X86::AND32ri,
7633 true);
7634 case X86::ATOMADD6432:
7635 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7636 X86::ADD32rr, X86::ADC32rr,
7637 X86::ADD32ri, X86::ADC32ri,
7638 false);
7639 case X86::ATOMSUB6432:
7640 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7641 X86::SUB32rr, X86::SBB32rr,
7642 X86::SUB32ri, X86::SBB32ri,
7643 false);
7644 case X86::ATOMSWAP6432:
7645 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7646 X86::MOV32rr, X86::MOV32rr,
7647 X86::MOV32ri, X86::MOV32ri,
7648 false);
7649 }
7650 }
7652 //===----------------------------------------------------------------------===//
7653 // X86 Optimization Hooks
7654 //===----------------------------------------------------------------------===//
7656 void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
7657 const APInt &Mask,
7658 APInt &KnownZero,
7659 APInt &KnownOne,
7660 const SelectionDAG &DAG,
7661 unsigned Depth) const {
7662 unsigned Opc = Op.getOpcode();
7663 assert((Opc >= ISD::BUILTIN_OP_END ||
7664 Opc == ISD::INTRINSIC_WO_CHAIN ||
7665 Opc == ISD::INTRINSIC_W_CHAIN ||
7666 Opc == ISD::INTRINSIC_VOID) &&
7667 "Should use MaskedValueIsZero if you don't know whether Op"
7668 " is a target node!");
7670 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything.
7671 switch (Opc) {
7672 default: break;
7673 case X86ISD::ADD:
7674 case X86ISD::SUB:
7675 case X86ISD::SMUL:
7676 case X86ISD::UMUL:
7677 // These nodes' second result is a boolean.
7678 if (Op.getResNo() == 0)
7679 break;
7680 // Fallthrough
7681 case X86ISD::SETCC:
7682 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
7683 Mask.getBitWidth() - 1);
7684 break;
7685 }
7686 }
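// e.g. an X86ISD::SETCC value is always 0 or 1, so every bit above bit 0 is
// reported as known-zero; generic combines can then delete redundant
// zero-extends and AND-masks of the flag value.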
7688 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
7689 /// node is a GlobalAddress + offset.
7690 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
7691 GlobalValue* &GA, int64_t &Offset) const {
7692 if (N->getOpcode() == X86ISD::Wrapper) {
7693 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
7694 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
7695 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
7696 return true;
7697 }
7698 }
7699 return TargetLowering::isGAPlusOffset(N, GA, Offset);
7702 static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
7703 const TargetLowering &TLI) {
7704 GlobalValue *GV;
7705 int64_t Offset = 0;
7706 if (TLI.isGAPlusOffset(Base, GV, Offset))
7707 return (GV->getAlignment() >= N && (Offset % N) == 0);
7708 // DAG combine handles the stack object case.
7709 return false;
7710 }
7712 static bool EltsFromConsecutiveLoads(SDNode *N, SDValue PermMask,
7713 unsigned NumElems, MVT EVT,
7714 SDNode *&Base,
7715 SelectionDAG &DAG, MachineFrameInfo *MFI,
7716 const TargetLowering &TLI) {
7717 Base = NULL;
7718 for (unsigned i = 0; i < NumElems; ++i) {
7719 SDValue Idx = PermMask.getOperand(i);
7720 if (Idx.getOpcode() == ISD::UNDEF) {
7721 if (!Base)
7722 return false;
7723 continue;
7724 }
7726 SDValue Elt = DAG.getShuffleScalarElt(N, i);
7727 if (!Elt.getNode() ||
7728 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
7729 return false;
7730 if (!Base) {
7731 Base = Elt.getNode();
7732 if (Base->getOpcode() == ISD::UNDEF)
7733 return false;
7734 continue;
7735 }
7736 if (Elt.getOpcode() == ISD::UNDEF)
7737 continue;
7739 if (!TLI.isConsecutiveLoad(Elt.getNode(), Base,
7740 EVT.getSizeInBits()/8, i, MFI))
7741 return false;
7742 }
7744 return true;
7745 }
7746 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
7747 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
7748 /// if the load addresses are consecutive, non-overlapping, and in the right
7749 /// order.
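/// For example, a <0,1,2,3> shuffle of four consecutive f32 loads starting
/// at %p can be replaced with one 16-byte vector load from %p.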
7750 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
7751 const TargetLowering &TLI) {
7752 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7753 DebugLoc dl = N->getDebugLoc();
7754 MVT VT = N->getValueType(0);
7755 MVT EVT = VT.getVectorElementType();
7756 SDValue PermMask = N->getOperand(2);
7757 unsigned NumElems = PermMask.getNumOperands();
7758 SDNode *Base = NULL;
7759 if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, Base,
7760 DAG, MFI, TLI))
7761 return SDValue();
7763 LoadSDNode *LD = cast<LoadSDNode>(Base);
7764 if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI))
7765 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
7766 LD->getSrcValue(), LD->getSrcValueOffset(),
7767 LD->isVolatile());
7768 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
7769 LD->getSrcValue(), LD->getSrcValueOffset(),
7770 LD->isVolatile(), LD->getAlignment());
7771 }
7773 /// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
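/// e.g. (v2i64 build_vector (i64 load %p), (i64 0)) becomes a single movq,
/// which loads 8 bytes and zero-fills the upper half of the xmm register.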
7774 static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
7775 TargetLowering::DAGCombinerInfo &DCI,
7776 const X86Subtarget *Subtarget,
7777 const TargetLowering &TLI) {
7778 unsigned NumOps = N->getNumOperands();
7779 DebugLoc dl = N->getDebugLoc();
7781 // Ignore single operand BUILD_VECTOR.
7782 if (NumOps == 1)
7783 return SDValue();
7785 MVT VT = N->getValueType(0);
7786 MVT EVT = VT.getVectorElementType();
7787 if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
7788 // We are looking for load i64 and zero extend. We want to transform
7789 // it before legalizer has a chance to expand it. Also look for i64
7790 // BUILD_PAIR bit casted to f64.
7791 return SDValue();
7792 // This must be an insertion into a zero vector.
7793 SDValue HighElt = N->getOperand(1);
7794 if (!isZeroNode(HighElt))
7795 return SDValue();
7797 // Value must be a load.
7798 SDNode *Base = N->getOperand(0).getNode();
7799 if (!isa<LoadSDNode>(Base)) {
7800 if (Base->getOpcode() != ISD::BIT_CONVERT)
7801 return SDValue();
7802 Base = Base->getOperand(0).getNode();
7803 if (!isa<LoadSDNode>(Base))
7804 return SDValue();
7805 }
7807 // Transform it into VZEXT_LOAD addr.
7808 LoadSDNode *LD = cast<LoadSDNode>(Base);
7810 // Load must not be an extload.
7811 if (LD->getExtensionType() != ISD::NON_EXTLOAD)
7812 return SDValue();
7814 // The load type should be a legal type, so we don't have to legalize it.
7815 if (!TLI.isTypeLegal(VT))
7816 return SDValue();
7818 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7819 SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
7820 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
7821 TargetLowering::TargetLoweringOpt TLO(DAG);
7822 TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
7823 DCI.CommitTargetLoweringOpt(TLO);
7824 return ResNode;
7825 }
7827 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
7828 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
7829 const X86Subtarget *Subtarget) {
7830 DebugLoc dl = N->getDebugLoc();
7831 SDValue Cond = N->getOperand(0);
7833 // If we have SSE[12] support, try to form min/max nodes.
7834 if (Subtarget->hasSSE2() &&
7835 (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) {
7836 if (Cond.getOpcode() == ISD::SETCC) {
7837 // Get the LHS/RHS of the select.
7838 SDValue LHS = N->getOperand(1);
7839 SDValue RHS = N->getOperand(2);
7840 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
7842 unsigned Opcode = 0;
7843 if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
7844 switch (CC) {
7845 default: break;
7846 case ISD::SETOLE: // (X <= Y) ? X : Y -> min
7847 case ISD::SETULE:
7848 case ISD::SETLE:
7849 if (!UnsafeFPMath) break;
7850 // FALL THROUGH.
7851 case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min
7852 case ISD::SETLT:
7853 Opcode = X86ISD::FMIN;
7854 break;
7856 case ISD::SETOGT: // (X > Y) ? X : Y -> max
7857 case ISD::SETUGT:
7858 case ISD::SETGT:
7859 if (!UnsafeFPMath) break;
7860 // FALL THROUGH.
7861 case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max
7862 case ISD::SETGE:
7863 Opcode = X86ISD::FMAX;
7864 break;
7865 }
7866 } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
7867 switch (CC) {
7868 default: break;
7869 case ISD::SETOGT: // (X > Y) ? Y : X -> min
7870 case ISD::SETUGT:
7871 case ISD::SETGT:
7872 if (!UnsafeFPMath) break;
7873 // FALL THROUGH.
7874 case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min
7875 case ISD::SETGE:
7876 Opcode = X86ISD::FMIN;
7877 break;
7879 case ISD::SETOLE: // (X <= Y) ? Y : X -> max
7880 case ISD::SETULE:
7881 case ISD::SETLE:
7882 if (!UnsafeFPMath) break;
7883 // FALL THROUGH.
7884 case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max
7885 case ISD::SETLT:
7886 Opcode = X86ISD::FMAX;
7887 break;
7888 }
7889 }
7891 if (Opcode)
7892 return DAG.getNode(Opcode, dl, N->getValueType(0), LHS, RHS);
7893 }
7894 }
7896 return SDValue();
7897 }
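// Note: only the condition codes whose NaN/ordering behavior lines up with
// SSE minss/maxss (e.g. the SETOLT forms above) are combined by default;
// looser variants such as SETOLE participate only when the UnsafeFPMath
// checks above allow relaxed IEEE semantics.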
7900 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
7901 /// when possible.
7902 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
7903 const X86Subtarget *Subtarget) {
7904 // On X86 with SSE2 support, we can transform this to a vector shift if
7905 // all elements are shifted by the same amount. We can't do this in legalize
7906 // because a constant vector is typically transformed to a constant pool
7907 // so we have no knowledge of the shift amount.
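// For example (illustrative): (shl v4i32 %x, <5,5,5,5>) becomes the pslld
// intrinsic form with a single scalar shift amount of 5.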
7908 if (!Subtarget->hasSSE2())
7909 return SDValue();
7911 MVT VT = N->getValueType(0);
7912 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
7913 return SDValue();
7915 SDValue ShAmtOp = N->getOperand(1);
7916 MVT EltVT = VT.getVectorElementType();
7917 DebugLoc dl = N->getDebugLoc();
7918 SDValue BaseShAmt;
7919 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
7920 unsigned NumElts = VT.getVectorNumElements();
7921 unsigned i = 0;
7922 for (; i != NumElts; ++i) {
7923 SDValue Arg = ShAmtOp.getOperand(i);
7924 if (Arg.getOpcode() == ISD::UNDEF) continue;
7925 BaseShAmt = Arg;
7926 break;
7927 }
7928 for (; i != NumElts; ++i) {
7929 SDValue Arg = ShAmtOp.getOperand(i);
7930 if (Arg.getOpcode() == ISD::UNDEF) continue;
7931 if (Arg != BaseShAmt) {
7932 return SDValue();
7933 }
7934 }
7935 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
7936 isSplatMask(ShAmtOp.getOperand(2).getNode())) {
7937 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ShAmtOp,
7938 DAG.getIntPtrConstant(0));
7939 } else
7940 return SDValue();
7942 if (EltVT.bitsGT(MVT::i32))
7943 BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
7944 else if (EltVT.bitsLT(MVT::i32))
7945 BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BaseShAmt);
7947 // The shift amount is identical so we can do a vector shift.
7948 SDValue ValOp = N->getOperand(0);
7949 switch (N->getOpcode()) {
7950 default:
7951 assert(0 && "Unknown shift opcode!");
7953 case ISD::SHL:
7954 if (VT == MVT::v2i64)
7955 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7956 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7957 ValOp, BaseShAmt);
7958 if (VT == MVT::v4i32)
7959 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7960 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
7961 ValOp, BaseShAmt);
7962 if (VT == MVT::v8i16)
7963 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7964 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
7965 ValOp, BaseShAmt);
7966 break;
7967 case ISD::SRA:
7968 if (VT == MVT::v4i32)
7969 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7970 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
7971 ValOp, BaseShAmt);
7972 if (VT == MVT::v8i16)
7973 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7974 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
7975 ValOp, BaseShAmt);
7976 break;
7977 case ISD::SRL:
7978 if (VT == MVT::v2i64)
7979 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7980 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7981 ValOp, BaseShAmt);
7982 if (VT == MVT::v4i32)
7983 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7984 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
7985 ValOp, BaseShAmt);
7986 if (VT == MVT::v8i16)
7987 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7988 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
7989 ValOp, BaseShAmt);
7990 break;
7991 }
7993 return SDValue();
7994 }
7995 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
7996 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
7997 const X86Subtarget *Subtarget) {
7998 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
7999 // the FP state in cases where an emms may be missing.
8000 // A preferable solution to the general problem is to figure out the right
8001 // places to insert EMMS. This qualifies as a quick hack.
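// For example, a 64-bit-vector copy "store (load %src), %dst" can be done
// as one i64 GPR move on x86-64 (or two i32 moves on x86-32), so no MMX
// register is written and no emms is owed afterwards.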
8002 StoreSDNode *St = cast<StoreSDNode>(N);
8003 if (St->getValue().getValueType().isVector() &&
8004 St->getValue().getValueType().getSizeInBits() == 64 &&
8005 isa<LoadSDNode>(St->getValue()) &&
8006 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
8007 St->getChain().hasOneUse() && !St->isVolatile()) {
8008 SDNode* LdVal = St->getValue().getNode();
8009 LoadSDNode *Ld = 0;
8010 int TokenFactorIndex = -1;
8011 SmallVector<SDValue, 8> Ops;
8012 SDNode* ChainVal = St->getChain().getNode();
8013 // Must be a store of a load. We currently handle two cases: the load
8014 // is a direct child, and it's under an intervening TokenFactor. It is
8015 // possible to dig deeper under nested TokenFactors.
8016 if (ChainVal == LdVal)
8017 Ld = cast<LoadSDNode>(St->getChain());
8018 else if (St->getValue().hasOneUse() &&
8019 ChainVal->getOpcode() == ISD::TokenFactor) {
8020 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
8021 if (ChainVal->getOperand(i).getNode() == LdVal) {
8022 TokenFactorIndex = i;
8023 Ld = cast<LoadSDNode>(St->getValue());
8024 } else
8025 Ops.push_back(ChainVal->getOperand(i));
8026 }
8027 }
8028 if (Ld) {
8029 DebugLoc dl = N->getDebugLoc();
8030 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
8031 if (Subtarget->is64Bit()) {
8032 SDValue NewLd = DAG.getLoad(MVT::i64, dl, Ld->getChain(),
8033 Ld->getBasePtr(), Ld->getSrcValue(),
8034 Ld->getSrcValueOffset(), Ld->isVolatile(),
8035 Ld->getAlignment());
8036 SDValue NewChain = NewLd.getValue(1);
8037 if (TokenFactorIndex != -1) {
8038 Ops.push_back(NewChain);
8039 NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Ops[0],
8040 Ops.size());
8041 }
8042 return DAG.getStore(NewChain, dl, NewLd, St->getBasePtr(),
8043 St->getSrcValue(), St->getSrcValueOffset(),
8044 St->isVolatile(), St->getAlignment());
8045 }
8047 // Otherwise, lower to two 32-bit copies.
8048 SDValue LoAddr = Ld->getBasePtr();
8049 SDValue HiAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, LoAddr,
8050 DAG.getConstant(4, MVT::i32));
8052 SDValue LoLd = DAG.getLoad(MVT::i32, dl, Ld->getChain(), LoAddr,
8053 Ld->getSrcValue(), Ld->getSrcValueOffset(),
8054 Ld->isVolatile(), Ld->getAlignment());
8055 SDValue HiLd = DAG.getLoad(MVT::i32, dl, Ld->getChain(), HiAddr,
8056 Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
8057 Ld->isVolatile(),
8058 MinAlign(Ld->getAlignment(), 4));
8060 SDValue NewChain = LoLd.getValue(1);
8061 if (TokenFactorIndex != -1) {
8062 Ops.push_back(LoLd);
8063 Ops.push_back(HiLd);
8064 NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Ops[0],
8065 Ops.size());
8066 }
8068 LoAddr = St->getBasePtr();
8069 HiAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, LoAddr,
8070 DAG.getConstant(4, MVT::i32));
8072 SDValue LoSt = DAG.getStore(NewChain, dl, LoLd, LoAddr,
8073 St->getSrcValue(), St->getSrcValueOffset(),
8074 St->isVolatile(), St->getAlignment());
8075 SDValue HiSt = DAG.getStore(NewChain, dl, HiLd, HiAddr,
8076 St->getSrcValue(),
8077 St->getSrcValueOffset() + 4,
8078 St->isVolatile(),
8079 MinAlign(St->getAlignment(), 4));
8080 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoSt, HiSt);
8081 }
8082 }
8084 return SDValue();
8085 }
8086 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
8087 /// X86ISD::FXOR nodes.
8088 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
8089 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
8090 // F[X]OR(0.0, x) -> x
8091 // F[X]OR(x, 0.0) -> x
8092 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8093 if (C->getValueAPF().isPosZero())
8094 return N->getOperand(1);
8095 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8096 if (C->getValueAPF().isPosZero())
8097 return N->getOperand(0);
8098 return SDValue();
8099 }
8101 /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
8102 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
8103 // FAND(0.0, x) -> 0.0
8104 // FAND(x, 0.0) -> 0.0
8105 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8106 if (C->getValueAPF().isPosZero())
8107 return N->getOperand(0);
8108 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8109 if (C->getValueAPF().isPosZero())
8110 return N->getOperand(1);
8111 return SDValue();
8112 }
8114 static SDValue PerformBTCombine(SDNode *N,
8115 SelectionDAG &DAG,
8116 TargetLowering::DAGCombinerInfo &DCI) {
8117 // BT ignores high bits in the bit index operand.
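// e.g. a 32-bit bt uses only the low five bits of a register bit index, so
// an index of the form (and %i, 31) can have the mask stripped by the
// SimplifyDemandedBits call below.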
8118 SDValue Op1 = N->getOperand(1);
8119 if (Op1.hasOneUse()) {
8120 unsigned BitWidth = Op1.getValueSizeInBits();
8121 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
8122 APInt KnownZero, KnownOne;
8123 TargetLowering::TargetLoweringOpt TLO(DAG);
8124 TargetLowering &TLI = DAG.getTargetLoweringInfo();
8125 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
8126 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
8127 DCI.CommitTargetLoweringOpt(TLO);
8128 }
8130 return SDValue();
8131 }
8132 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
8133 DAGCombinerInfo &DCI) const {
8134 SelectionDAG &DAG = DCI.DAG;
8135 switch (N->getOpcode()) {
8136 default: break;
8137 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
8138 case ISD::BUILD_VECTOR:
8139 return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
8140 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
8141 case ISD::SHL:
8142 case ISD::SRA:
8143 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
8144 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
8145 case X86ISD::FXOR:
8146 case X86ISD::FOR: return PerformFORCombine(N, DAG);
8147 case X86ISD::FAND: return PerformFANDCombine(N, DAG);
8148 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
8149 }
8151 return SDValue();
8152 }
8154 //===----------------------------------------------------------------------===//
8155 // X86 Inline Assembly Support
8156 //===----------------------------------------------------------------------===//
8158 /// getConstraintType - Given a constraint letter, return the type of
8159 /// constraint it is for this target.
8160 X86TargetLowering::ConstraintType
8161 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
8162 if (Constraint.size() == 1) {
8163 switch (Constraint[0]) {
8164 case 'A':
8165 return C_Register;
8166 case 'f':
8167 case 'r':
8168 case 'R':
8169 case 'l':
8170 case 'q':
8171 case 'Q':
8172 case 'x':
8173 case 'y':
8174 case 'Y':
8175 return C_RegisterClass;
8176 default:
8177 break;
8178 }
8179 }
8183 return TargetLowering::getConstraintType(Constraint);
8184 }
8186 /// LowerXConstraint - try to replace an X constraint, which matches anything,
8187 /// with another that has more specific requirements based on the type of the
8188 /// corresponding operand.
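/// For example, an "X" constraint on a floating-point operand becomes "Y"
/// when SSE2 is available, "x" when only SSE1 is, and otherwise falls back
/// to the generic handling (effectively 'f', the x87 stack).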
8189 const char *X86TargetLowering::
8190 LowerXConstraint(MVT ConstraintVT) const {
8191 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
8192 // 'f' like normal targets.
8193 if (ConstraintVT.isFloatingPoint()) {
8194 if (Subtarget->hasSSE2())
8195 return "Y";
8196 if (Subtarget->hasSSE1())
8197 return "x";
8198 }
8200 return TargetLowering::LowerXConstraint(ConstraintVT);
8201 }
8203 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
8204 /// vector. If it is invalid, don't add anything to Ops.
8205 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
8206 char Constraint,
8207 bool hasMemory,
8208 std::vector<SDValue>&Ops,
8209 SelectionDAG &DAG) const {
8210 SDValue Result(0, 0);
8212 switch (Constraint) {
8215 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8216 if (C->getZExtValue() <= 31) {
8217 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8223 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8224 if (C->getZExtValue() <= 63) {
8225 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8231 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8232 if (C->getZExtValue() <= 255) {
8233 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8239 // 32-bit signed value
8240 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8241 const ConstantInt *CI = C->getConstantIntValue();
8242 if (CI->isValueValidForType(Type::Int32Ty, C->getSExtValue())) {
8243 // Widen to 64 bits here to get it sign extended.
8244 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
8247 // FIXME gcc accepts some relocatable values here too, but only in certain
8248 // memory models; it's complicated.
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::Int32Ty, C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
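  // For reference: in 64-bit mode 'e' accepts a sign-extended 32-bit
  // immediate (what REX.W ALU forms encode) and 'Z' a zero-extended 32-bit
  // immediate; both deliberately reject constants that would need a full
  // 64-bit encoding.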
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }
    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
    int64_t Offset = 0;

    // Match either (GA) or (GA+C)
    if (GA) {
      Offset = GA->getOffset();
    } else if (Op.getOpcode() == ISD::ADD) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
      GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
      if (C && GA) {
        Offset = GA->getOffset()+C->getZExtValue();
      } else {
        // Try the commuted form, (C+GA).
        C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
        if (C && GA)
          Offset = GA->getOffset()+C->getZExtValue();
        else
          GA = 0;
      }
    }

    if (GA) {
      if (hasMemory)
        Op = LowerGlobalAddress(GA->getGlobal(), Op.getDebugLoc(),
                                Offset, DAG);
      else
        Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
                                        Offset);
      Result = Op;
      break;
    }

    // Otherwise, not valid for this mode.
    return;
  }
  }
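  // Example (illustrative; SomeGlobal is a hypothetical external variable):
  // in static (non-PIC) codegen,
  //   asm volatile ("movl %0, %%eax" : : "i"(&SomeGlobal));
  // arrives here as a GlobalAddress node (plus optional displacement) and is
  // accepted; under PIC the address is not a link-time constant, so the
  // operand is rejected.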
  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
                                                      Ops, DAG);
}
std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  MVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // Q_REGS (GENERAL_REGS in 64-bit mode)
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}
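// For reference: GCC's 'q' means the byte-addressable registers (the A, B,
// C and D families) in 32-bit mode, e.g.
//   unsigned char ok; asm("setc %0" : "=q"(ok));
// which is why only EAX/EDX/ECX/EBX and their sub-registers are listed
// above.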
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'R':   // LEGACY_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT()) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }
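  // Example (illustrative): with SSE2 enabled,
  //   __m128d a, b; asm("addpd %1, %0" : "+x"(a) : "x"(b));
  // resolves 'x' to the VR128 register class via the v2f64 entry above.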
  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // GCC calls "st(0)" just plain "st".
    if (StringsEqualNoCase("{st}", Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GRADRegisterClass;
    }
    return Res;
  }
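  // For reference: "A" is the GCC name for the EAX:EDX pair, used for
  // 64-bit results in 32-bit mode, e.g.
  //   unsigned long long t; asm("rdtsc" : "=A"(t));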
  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.
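  // Example (illustrative): IR such as
  //   %v = call i32 asm "", "={ax}"()
  // initially resolves {ax} to AX in GR16; the code below remaps it to
  // EAX/GR32 so the register matches the requested i32 type.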
  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
//===----------------------------------------------------------------------===//
//                           X86 Widen vector type
//===----------------------------------------------------------------------===//
/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
/// If there is no vector type that we want to widen to, returns MVT::Other.
/// When and where to widen is target dependent based on the cost of
/// scalarizing vs using the wider vector type.

MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
  assert(VT.isVector());
  if (isTypeLegal(VT))
    return VT;

  // TODO: In computeRegisterProperty, we can compute the list of legal vector
  //       type based on element type.  This would speed up our search (though
  //       it may not be worth it since the size of the list is relatively
  //       small).
  MVT EltVT = VT.getVectorElementType();
  unsigned NElts = VT.getVectorNumElements();

  // On X86, it makes sense to widen any vector wider than 1 element.
  if (NElts <= 1)
    return MVT::Other;

  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
    MVT SVT = (MVT::SimpleValueType)nVT;

    if (isTypeLegal(SVT) &&
        SVT.getVectorElementType() == EltVT &&
        SVT.getVectorNumElements() > NElts)
      return SVT;
  }
  return MVT::Other;
}
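// Example (illustrative): on an SSE-enabled target v3f32 is not legal, but
// v4f32 is legal, has the same f32 element type, and has more elements, so
// v3f32 widens to v4f32. A single-element vector such as v1i8 returns
// MVT::Other and is not widened.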