; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC

; This file checks that atomic (non-seq_cst) stores of immediate values are
; done in a single mov instruction, not two. More precisely, it makes sure that
; the immediate is not first copied uselessly into a register.

; Similarly, it checks that a binary operation of an immediate with an atomic
; variable that is stored back into that variable is done as a single
; instruction. For example:
;   x.store(42 + x.load(memory_order_acquire), memory_order_release)
; should be just an add instruction, instead of loading x into a register,
; doing an add and storing the result back (see the IR sketch after this
; comment block).
; The binary operations currently supported are add, and, or, xor.
; sub is not supported because it is translated into an addition of the
; negated immediate.

; We also check the same patterns:
; - For register instead of immediate operands.
; - For floating point operations.

; seq_cst stores are left as (lock) xchgl, but we try to check every other
; ordering at least once.

; Please note that these operations do not require the lock prefix: only
; sequentially consistent stores require this kind of protection on X86.
; And even for seq_cst operations, LLVM uses the xchg instruction, which has
; an implicit lock prefix, so making it explicit is not required.
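
; To make the pattern concrete, here is a small illustrative sketch in the same
; shape as the tests below. It is hypothetical (the function name is not part
; of the original test set and nothing is FileCheck-verified here); on x86-64
; the expectation is that it compiles down to a single `addl $42, (%rdi)`
; rather than a separate load, add and store.
define void @sketch_add_32i(i32* %p) {
  %old = load atomic i32, i32* %p acquire, align 4
  %new = add i32 %old, 42
  store atomic i32 %new, i32* %p release, align 4
  ret void
}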

define void @store_atomic_imm_8(i8* %p) {
; X64-LABEL: store_atomic_imm_8:
; X32-LABEL: store_atomic_imm_8:
  store atomic i8 42, i8* %p release, align 1

define void @store_atomic_imm_16(i16* %p) {
; X64-LABEL: store_atomic_imm_16:
; X32-LABEL: store_atomic_imm_16:
  store atomic i16 42, i16* %p monotonic, align 2

define void @store_atomic_imm_32(i32* %p) {
; X64-LABEL: store_atomic_imm_32:
; On 32-bit targets, there is an extra movl for each of these functions,
; to load the pointer argument from the stack.
; X32-LABEL: store_atomic_imm_32:
; X32: movl 4(%esp), %eax
  store atomic i32 42, i32* %p release, align 4

define void @store_atomic_imm_64(i64* %p) {
; X64-LABEL: store_atomic_imm_64:
; These are implemented with a CAS loop on 32-bit architectures, and thus
; cannot be optimized in the same way as the others (a sketch of that loop
; follows this function).
; X32-LABEL: store_atomic_imm_64:
  store atomic i64 42, i64* %p release, align 8
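
; As a rough illustration only (not a FileCheck pattern; register allocation
; and the retry structure may differ), the i686 CAS loop for the store above
; has this shape:
;     movl (%esi), %eax
;     movl 4(%esi), %edx
;   .retry:
;     movl $42, %ebx
;     xorl %ecx, %ecx
;     lock cmpxchg8b (%esi)
;     jne .retry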

; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov:
; even on X64 one must use movabsq, which can only target a register (see the
; sketch after this function).
define void @store_atomic_imm_64_big(i64* %p) {
; X64-LABEL: store_atomic_imm_64_big:
  store atomic i64 100000000000, i64* %p monotonic, align 8
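
; As a rough illustration only (not a FileCheck pattern), the expected x86-64
; shape for the store above is:
;     movabsq $100000000000, %rax
;     movq %rax, (%rdi)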

; It would be incorrect to replace a lock xchgl with a plain movl (see the
; sketch after this function).
define void @store_atomic_imm_32_seq_cst(i32* %p) {
; X64-LABEL: store_atomic_imm_32_seq_cst:
; X32-LABEL: store_atomic_imm_32_seq_cst:
  store atomic i32 42, i32* %p seq_cst, align 4
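
; As a rough illustration only (not a FileCheck pattern), the seq_cst store
; above is expected to lower to something like:
;     movl $42, %eax
;     xchgl %eax, (%rdi)
; where the implicit lock of xchg provides the sequentially consistent ordering.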

define void @add_8i(i8* %p) {
  %1 = load atomic i8, i8* %p seq_cst, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @add_8r(i8* %p, i8 %v) {
  %1 = load atomic i8, i8* %p seq_cst, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @add_16i(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64 (see the sketch after
; this function).
; X64-LABEL: add_16i:
; X32-LABEL: add_16i:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2
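
; As a rough illustration only (not a FileCheck pattern, and the exact
; instructions may differ), the expected shape keeps the arithmetic in a
; register instead of using a single memory-destination addw:
;     movw (%rdi), %ax
;     addw $..., %ax       ; the immediate is elided in the IR excerpt above
;     movw %ax, (%rdi)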

define void @add_16r(i16* %p, i16 %v) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16r:
; X32-LABEL: add_16r:
; X32-NOT: addw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @add_32i(i32* %p) {
; X64-LABEL: add_32i:
; X32-LABEL: add_32i:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

define void @add_32r(i32* %p, i32 %v) {
; X64-LABEL: add_32r:
; X32-LABEL: add_32r:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

; The following is a corner case where the load is added to itself. The pattern
; matching should not fold this. We only test with 32-bit add, but the same
; applies to other sizes and operations.
define void @add_32r_self(i32* %p) {
; X64-LABEL: add_32r_self:
; X64: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X64: addl %[[R]], %[[R]]
; X64: movl %[[R]], (%[[M]])
; X32-LABEL: add_32r_self:
; X32: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X32: addl %[[R]], %[[R]]
; X32: movl %[[R]], (%[[M]])
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4
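
; A folded form such as
;     movl (%rdi), %eax
;     addl %eax, (%rdi)
; would read %p a second time and could therefore combine values from two
; different atomic loads, which is why the pattern matching must keep the
; single-load shape checked above.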

; The following is a corner case where the load's result is returned. The
; optimizer isn't allowed to duplicate the load because it's atomic.
define i32 @add_32r_ret_load(i32* %p, i32 %v) {
; X64-LABEL: add_32r_ret_load:
; X64: movl (%rdi), %eax
; X64-NEXT: addl %eax, %esi
; X64-NEXT: movl %esi, (%rdi)
; X32-LABEL: add_32r_ret_load:
; X32: movl 4(%esp), %[[P:[a-z]+]]
; X32-NEXT: movl (%[[P]]),
; More code follows; we just don't want it to load from P again.
; X32: movl %{{.*}}, (%[[P]])
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

define void @add_64i(i64* %p) {
; X64-LABEL: add_64i:
; We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @add_64r(i64* %p, i64 %v) {
; X64-LABEL: add_64r:
; We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @add_32i_seq_cst(i32* %p) {
; X64-LABEL: add_32i_seq_cst:
; X32-LABEL: add_32i_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4
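
; As a rough illustration only (not a FileCheck pattern), the add above cannot
; be folded into a memory operand because the seq_cst store still has to go
; through an xchgl, so the expected shape is roughly:
;     movl (%rdi), %eax
;     addl $..., %eax      ; the immediate is elided in the IR excerpt above
;     xchgl %eax, (%rdi)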

define void @add_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: add_32r_seq_cst:
; X32-LABEL: add_32r_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @and_8i(i8* %p) {
  %1 = load atomic i8, i8* %p monotonic, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @and_8r(i8* %p, i8 %v) {
  %1 = load atomic i8, i8* %p monotonic, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @and_16i(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16i:
; X32-LABEL: and_16i:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @and_16r(i16* %p, i16 %v) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16r:
; X32-LABEL: and_16r:
; X32-NOT: andw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @and_32i(i32* %p) {
; X64-LABEL: and_32i:
; X32-LABEL: and_32i:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @and_32r(i32* %p, i32 %v) {
; X64-LABEL: and_32r:
; X32-LABEL: and_32r:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @and_64i(i64* %p) {
; X64-LABEL: and_64i:
; We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @and_64r(i64* %p, i64 %v) {
; X64-LABEL: and_64r:
; We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @and_32i_seq_cst(i32* %p) {
; X64-LABEL: and_32i_seq_cst:
; X32-LABEL: and_32i_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @and_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: and_32r_seq_cst:
; X32-LABEL: and_32r_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @or_8i(i8* %p) {
  %1 = load atomic i8, i8* %p acquire, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @or_8r(i8* %p, i8 %v) {
  %1 = load atomic i8, i8* %p acquire, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @or_16i(i16* %p) {
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @or_16r(i16* %p, i16 %v) {
; X32-NOT: orw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @or_32i(i32* %p) {
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @or_32r(i32* %p, i32 %v) {
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @or_64i(i64* %p) {
; We do not check X86-32 as it cannot do 'orq'.
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @or_64r(i64* %p, i64 %v) {
; We do not check X86-32 as it cannot do 'orq'.
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @or_32i_seq_cst(i32* %p) {
; X64-LABEL: or_32i_seq_cst:
; X32-LABEL: or_32i_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @or_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: or_32r_seq_cst:
; X32-LABEL: or_32r_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @xor_8i(i8* %p) {
  %1 = load atomic i8, i8* %p acquire, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @xor_8r(i8* %p, i8 %v) {
  %1 = load atomic i8, i8* %p acquire, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @xor_16i(i16* %p) {
; X64-LABEL: xor_16i:
; X32-LABEL: xor_16i:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @xor_16r(i16* %p, i16 %v) {
; X64-LABEL: xor_16r:
; X32-LABEL: xor_16r:
; X32-NOT: xorw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @xor_32i(i32* %p) {
; X64-LABEL: xor_32i:
; X32-LABEL: xor_32i:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @xor_32r(i32* %p, i32 %v) {
; X64-LABEL: xor_32r:
; X32-LABEL: xor_32r:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @xor_64i(i64* %p) {
; X64-LABEL: xor_64i:
; We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @xor_64r(i64* %p, i64 %v) {
; X64-LABEL: xor_64r:
; We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @xor_32i_seq_cst(i32* %p) {
; X64-LABEL: xor_32i_seq_cst:
; X32-LABEL: xor_32i_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @xor_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: xor_32r_seq_cst:
; X32-LABEL: xor_32r_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @inc_8(i8* %p) {
; SLOW_INC-LABEL: inc_8:
  %1 = load atomic i8, i8* %p seq_cst, align 1
  store atomic i8 %2, i8* %p release, align 1
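
; As a rough illustration only (not a FileCheck pattern), the default x86-64
; lowering is expected to use a single increment of the memory operand, e.g.
; `incb (%rdi)`, while the SLOW_INC run (-mattr=slow-incdec) should prefer an
; add of 1 instead, e.g. `addb $1, (%rdi)`.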

define void @inc_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; SLOW_INC-LABEL: inc_16:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @inc_32(i32* %p) {
; SLOW_INC-LABEL: inc_32:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

define void @inc_64(i64* %p) {
; We do not check X86-32 as it cannot do 'incq'.
; SLOW_INC-LABEL: inc_64:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @inc_32_seq_cst(i32* %p) {
; X64-LABEL: inc_32_seq_cst:
; X32-LABEL: inc_32_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @dec_8(i8* %p) {
; SLOW_INC-LABEL: dec_8:
  %1 = load atomic i8, i8* %p seq_cst, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @dec_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; SLOW_INC-LABEL: dec_16:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @dec_32(i32* %p) {
; SLOW_INC-LABEL: dec_32:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

define void @dec_64(i64* %p) {
; We do not check X86-32 as it cannot do 'decq'.
; SLOW_INC-LABEL: dec_64:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @dec_32_seq_cst(i32* %p) {
; X64-LABEL: dec_32_seq_cst:
; X32-LABEL: dec_32_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @fadd_32r(float* %loc, float %val) {
; X64-LABEL: fadd_32r:
; X64: addss (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movss %[[XMM]], (%[[M]])
; X32-LABEL: fadd_32r:
; Don't check x86-32.
; LLVM's SSE handling is conservative on x86-32 even without using atomics.
  %floc = bitcast float* %loc to i32*
  %1 = load atomic i32, i32* %floc seq_cst, align 4
  %2 = bitcast i32 %1 to float
  %add = fadd float %2, %val
  %3 = bitcast float %add to i32
  store atomic i32 %3, i32* %floc release, align 4

define void @fadd_64r(double* %loc, double %val) {
; X64-LABEL: fadd_64r:
; X64: addsd (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movsd %[[XMM]], (%[[M]])
; X32-LABEL: fadd_64r:
; Don't check x86-32 (see comment above).
  %floc = bitcast double* %loc to i64*
  %1 = load atomic i64, i64* %floc seq_cst, align 8
  %2 = bitcast i64 %1 to double
  %add = fadd double %2, %val
  %3 = bitcast double %add to i64
  store atomic i64 %3, i64* %floc release, align 8

@glob32 = global float 0.000000e+00, align 4
@glob64 = global double 0.000000e+00, align 8

; Floating-point add to a global using an immediate.
define void @fadd_32g() {
; X64-LABEL: fadd_32g:
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss glob32(%rip), %[[XMM]]
; X64-NEXT: movss %[[XMM]], glob32(%rip)
; X32-LABEL: fadd_32g:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* bitcast (float* @glob32 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* bitcast (float* @glob32 to i32*) monotonic, align 4

define void @fadd_64g() {
; X64-LABEL: fadd_64g:
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd glob64(%rip), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], glob64(%rip)
; X32-LABEL: fadd_64g:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* bitcast (double* @glob64 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* bitcast (double* @glob64 to i64*) monotonic, align 8

; Floating-point add to a hard-coded immediate location using an immediate.
define void @fadd_32imm() {
; X64-LABEL: fadd_32imm:
; X64: movl $3735928559, %e[[M:[a-z]+]]
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss (%r[[M]]), %[[XMM]]
; X64-NEXT: movss %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_32imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4

define void @fadd_64imm() {
; X64-LABEL: fadd_64imm:
; X64: movl $3735928559, %e[[M:[a-z]+]]
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd (%r[[M]]), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_64imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8

; Floating-point add to a stack location.
define void @fadd_32stack() {
; X64-LABEL: fadd_32stack:
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movss %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_32stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i32, align 4
  %bc3 = bitcast i32* %ptr to float*
  %load = load atomic i32, i32* %ptr acquire, align 4
  %bc0 = bitcast i32 %load to float
  %fadd = fadd float 1.000000e+00, %bc0
  %bc1 = bitcast float %fadd to i32
  store atomic i32 %bc1, i32* %ptr release, align 4

define void @fadd_64stack() {
; X64-LABEL: fadd_64stack:
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_64stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i64, align 8
  %bc3 = bitcast i64* %ptr to double*
  %load = load atomic i64, i64* %ptr acquire, align 8
  %bc0 = bitcast i64 %load to double
  %fadd = fadd double 1.000000e+00, %bc0
  %bc1 = bitcast double %fadd to i64
  store atomic i64 %bc1, i64* %ptr release, align 8