; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC

; This file checks that atomic (non-seq_cst) stores of immediate values are
; done in a single mov instruction, not two. More precisely, it makes sure that
; the immediate is not first copied uselessly into a register.

; Similarly, it checks that a binary operation of an immediate with an atomic
; variable that is stored back into that variable is done as a single
; instruction. For example:
;   x.store(42 + x.load(memory_order_acquire), memory_order_release)
; should be just an add instruction, instead of loading x into a register,
; doing an add and storing the result back (see the IR sketch after this
; comment block).
; The binary operations currently supported are add, and, or, xor.
; sub is not supported because it is translated into an addition of the
; negated immediate.

; We also check the same patterns:
; - For register instead of immediate operands.
; - For floating point operations.

; seq_cst stores are left as (lock) xchgl, but we try to check every other
; ordering at least once.

; Please note that these operations do not require the lock prefix: only
; sequentially consistent stores require this kind of protection on X86.
; And even for seq_cst operations, LLVM uses the xchg instruction, which has
; an implicit lock prefix, so making it explicit is not required.
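
; To make the pattern concrete, here is a small illustrative sketch in the same
; shape as the tests below. It is hypothetical (the function name is not part
; of the original test set and nothing is FileCheck-verified here); on x86-64
; the expectation is that it compiles down to a single `addl $42, (%rdi)`
; rather than a separate load, add and store.
define void @sketch_add_32i(i32* %p) {
  %old = load atomic i32, i32* %p acquire, align 4
  %new = add i32 %old, 42
  store atomic i32 %new, i32* %p release, align 4
  ret void
}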

define void @store_atomic_imm_8(i8* %p) {
; X64-LABEL: store_atomic_imm_8:
; X32-LABEL: store_atomic_imm_8:
  store atomic i8 42, i8* %p release, align 1

define void @store_atomic_imm_16(i16* %p) {
; X64-LABEL: store_atomic_imm_16:
; X32-LABEL: store_atomic_imm_16:
  store atomic i16 42, i16* %p monotonic, align 2

define void @store_atomic_imm_32(i32* %p) {
; X64-LABEL: store_atomic_imm_32:
; On 32-bit targets, there is an extra movl for each of these functions,
; to load the pointer argument from the stack.
; X32-LABEL: store_atomic_imm_32:
; X32: movl 4(%esp), %eax
  store atomic i32 42, i32* %p release, align 4

define void @store_atomic_imm_64(i64* %p) {
; X64-LABEL: store_atomic_imm_64:
; These are implemented with a CAS loop on 32-bit architectures, and thus
; cannot be optimized in the same way as the others (a sketch of that loop
; follows this function).
; X32-LABEL: store_atomic_imm_64:
  store atomic i64 42, i64* %p release, align 8
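
; As a rough illustration only (not a FileCheck pattern; register allocation
; and the retry structure may differ), the i686 CAS loop for the store above
; has this shape:
;     movl (%esi), %eax
;     movl 4(%esi), %edx
;   .retry:
;     movl $42, %ebx
;     xorl %ecx, %ecx
;     lock cmpxchg8b (%esi)
;     jne .retry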

; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov:
; even on X64 one must use movabsq, which can only target a register (see the
; sketch after this function).
define void @store_atomic_imm_64_big(i64* %p) {
; X64-LABEL: store_atomic_imm_64_big:
  store atomic i64 100000000000, i64* %p monotonic, align 8
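
; As a rough illustration only (not a FileCheck pattern), the expected x86-64
; shape for the store above is:
;     movabsq $100000000000, %rax
;     movq %rax, (%rdi)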

; It would be incorrect to replace a lock xchgl with a plain movl (see the
; sketch after this function).
define void @store_atomic_imm_32_seq_cst(i32* %p) {
; X64-LABEL: store_atomic_imm_32_seq_cst:
; X32-LABEL: store_atomic_imm_32_seq_cst:
  store atomic i32 42, i32* %p seq_cst, align 4
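
; As a rough illustration only (not a FileCheck pattern), the seq_cst store
; above is expected to lower to something like:
;     movl $42, %eax
;     xchgl %eax, (%rdi)
; where the implicit lock of xchg provides the sequentially consistent ordering.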

define void @add_8i(i8* %p) {
  %1 = load atomic i8, i8* %p seq_cst, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @add_8r(i8* %p, i8 %v) {
  %1 = load atomic i8, i8* %p seq_cst, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @add_16i(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64 (see the sketch after
; this function).
; X64-LABEL: add_16i:
; X32-LABEL: add_16i:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2
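
; As a rough illustration only (not a FileCheck pattern, and the exact
; instructions may differ), the expected shape keeps the arithmetic in a
; register instead of using a single memory-destination addw:
;     movw (%rdi), %ax
;     addw $..., %ax       ; the immediate is elided in the IR excerpt above
;     movw %ax, (%rdi)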

define void @add_16r(i16* %p, i16 %v) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16r:
; X32-LABEL: add_16r:
; X32-NOT: addw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @add_32i(i32* %p) {
; X64-LABEL: add_32i:
; X32-LABEL: add_32i:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

define void @add_32r(i32* %p, i32 %v) {
; X64-LABEL: add_32r:
; X32-LABEL: add_32r:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

; The following is a corner case where the load is added to itself. The pattern
; matching should not fold this. We only test with 32-bit add, but the same
; applies to other sizes and operations.
define void @add_32r_self(i32* %p) {
; X64-LABEL: add_32r_self:
; X64: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X64: addl %[[R]], %[[R]]
; X64: movl %[[R]], (%[[M]])
; X32-LABEL: add_32r_self:
; X32: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X32: addl %[[R]], %[[R]]
; X32: movl %[[R]], (%[[M]])
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4
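
; A folded form such as
;     movl (%rdi), %eax
;     addl %eax, (%rdi)
; would read %p a second time and could therefore combine values from two
; different atomic loads, which is why the pattern matching must keep the
; single-load shape checked above.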

; The following is a corner case where the load's result is returned. The
; optimizer isn't allowed to duplicate the load because it's atomic.
define i32 @add_32r_ret_load(i32* %p, i32 %v) {
; X64-LABEL: add_32r_ret_load:
; X64: movl (%rdi), %eax
; X64-NEXT: addl %eax, %esi
; X64-NEXT: movl %esi, (%rdi)
; X32-LABEL: add_32r_ret_load:
; X32: movl 4(%esp), %[[P:[a-z]+]]
; X32-NEXT: movl (%[[P]]),
; More code follows; we just don't want it to load from P again.
; X32: movl %{{.*}}, (%[[P]])
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

define void @add_64i(i64* %p) {
; X64-LABEL: add_64i:
; We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @add_64r(i64* %p, i64 %v) {
; X64-LABEL: add_64r:
; We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @add_32i_seq_cst(i32* %p) {
; X64-LABEL: add_32i_seq_cst:
; X32-LABEL: add_32i_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4
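
; As a rough illustration only (not a FileCheck pattern), the add above cannot
; be folded into a memory operand because the seq_cst store still has to go
; through an xchgl, so the expected shape is roughly:
;     movl (%rdi), %eax
;     addl $..., %eax      ; the immediate is elided in the IR excerpt above
;     xchgl %eax, (%rdi)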

define void @add_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: add_32r_seq_cst:
; X32-LABEL: add_32r_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @and_8i(i8* %p) {
  %1 = load atomic i8, i8* %p monotonic, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @and_8r(i8* %p, i8 %v) {
  %1 = load atomic i8, i8* %p monotonic, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @and_16i(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16i:
; X32-LABEL: and_16i:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @and_16r(i16* %p, i16 %v) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16r:
; X32-LABEL: and_16r:
; X32-NOT: andw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @and_32i(i32* %p) {
; X64-LABEL: and_32i:
; X32-LABEL: and_32i:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @and_32r(i32* %p, i32 %v) {
; X64-LABEL: and_32r:
; X32-LABEL: and_32r:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @and_64i(i64* %p) {
; X64-LABEL: and_64i:
; We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @and_64r(i64* %p, i64 %v) {
; X64-LABEL: and_64r:
; We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @and_32i_seq_cst(i32* %p) {
; X64-LABEL: and_32i_seq_cst:
; X32-LABEL: and_32i_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @and_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: and_32r_seq_cst:
; X32-LABEL: and_32r_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @or_8i(i8* %p) {
  %1 = load atomic i8, i8* %p acquire, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @or_8r(i8* %p, i8 %v) {
  %1 = load atomic i8, i8* %p acquire, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @or_16i(i16* %p) {
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @or_16r(i16* %p, i16 %v) {
; X32-NOT: orw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @or_32i(i32* %p) {
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @or_32r(i32* %p, i32 %v) {
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @or_64i(i64* %p) {
; We do not check X86-32 as it cannot do 'orq'.
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @or_64r(i64* %p, i64 %v) {
; We do not check X86-32 as it cannot do 'orq'.
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @or_32i_seq_cst(i32* %p) {
; X64-LABEL: or_32i_seq_cst:
; X32-LABEL: or_32i_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @or_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: or_32r_seq_cst:
; X32-LABEL: or_32r_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @xor_8i(i8* %p) {
  %1 = load atomic i8, i8* %p acquire, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @xor_8r(i8* %p, i8 %v) {
  %1 = load atomic i8, i8* %p acquire, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @xor_16i(i16* %p) {
; X64-LABEL: xor_16i:
; X32-LABEL: xor_16i:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @xor_16r(i16* %p, i16 %v) {
; X64-LABEL: xor_16r:
; X32-LABEL: xor_16r:
; X32-NOT: xorw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @xor_32i(i32* %p) {
; X64-LABEL: xor_32i:
; X32-LABEL: xor_32i:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @xor_32r(i32* %p, i32 %v) {
; X64-LABEL: xor_32r:
; X32-LABEL: xor_32r:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p release, align 4

define void @xor_64i(i64* %p) {
; X64-LABEL: xor_64i:
; We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @xor_64r(i64* %p, i64 %v) {
; X64-LABEL: xor_64r:
; We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @xor_32i_seq_cst(i32* %p) {
; X64-LABEL: xor_32i_seq_cst:
; X32-LABEL: xor_32i_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @xor_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: xor_32r_seq_cst:
; X32-LABEL: xor_32r_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @inc_8(i8* %p) {
; SLOW_INC-LABEL: inc_8:
  %1 = load atomic i8, i8* %p seq_cst, align 1
  store atomic i8 %2, i8* %p release, align 1
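
; As a rough illustration only (not a FileCheck pattern), the default x86-64
; lowering is expected to use a single increment of the memory operand, e.g.
; `incb (%rdi)`, while the SLOW_INC run (-mattr=slow-incdec) should prefer an
; add of 1 instead, e.g. `addb $1, (%rdi)`.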

define void @inc_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; SLOW_INC-LABEL: inc_16:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @inc_32(i32* %p) {
; SLOW_INC-LABEL: inc_32:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

define void @inc_64(i64* %p) {
; We do not check X86-32 as it cannot do 'incq'.
; SLOW_INC-LABEL: inc_64:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @inc_32_seq_cst(i32* %p) {
; X64-LABEL: inc_32_seq_cst:
; X32-LABEL: inc_32_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @dec_8(i8* %p) {
; SLOW_INC-LABEL: dec_8:
  %1 = load atomic i8, i8* %p seq_cst, align 1
  store atomic i8 %2, i8* %p release, align 1

define void @dec_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; SLOW_INC-LABEL: dec_16:
  %1 = load atomic i16, i16* %p acquire, align 2
  store atomic i16 %2, i16* %p release, align 2

define void @dec_32(i32* %p) {
; SLOW_INC-LABEL: dec_32:
  %1 = load atomic i32, i32* %p acquire, align 4
  store atomic i32 %2, i32* %p monotonic, align 4

define void @dec_64(i64* %p) {
; We do not check X86-32 as it cannot do 'decq'.
; SLOW_INC-LABEL: dec_64:
  %1 = load atomic i64, i64* %p acquire, align 8
  store atomic i64 %2, i64* %p release, align 8

define void @dec_32_seq_cst(i32* %p) {
; X64-LABEL: dec_32_seq_cst:
; X32-LABEL: dec_32_seq_cst:
  %1 = load atomic i32, i32* %p monotonic, align 4
  store atomic i32 %2, i32* %p seq_cst, align 4

define void @fadd_32r(float* %loc, float %val) {
; X64-LABEL: fadd_32r:
; X64: addss (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movss %[[XMM]], (%[[M]])
; X32-LABEL: fadd_32r:
; Don't check x86-32.
; LLVM's SSE handling is conservative on x86-32 even without using atomics.
  %floc = bitcast float* %loc to i32*
  %1 = load atomic i32, i32* %floc seq_cst, align 4
  %2 = bitcast i32 %1 to float
  %add = fadd float %2, %val
  %3 = bitcast float %add to i32
  store atomic i32 %3, i32* %floc release, align 4

define void @fadd_64r(double* %loc, double %val) {
; X64-LABEL: fadd_64r:
; X64: addsd (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movsd %[[XMM]], (%[[M]])
; X32-LABEL: fadd_64r:
; Don't check x86-32 (see comment above).
  %floc = bitcast double* %loc to i64*
  %1 = load atomic i64, i64* %floc seq_cst, align 8
  %2 = bitcast i64 %1 to double
  %add = fadd double %2, %val
  %3 = bitcast double %add to i64
  store atomic i64 %3, i64* %floc release, align 8

@glob32 = global float 0.000000e+00, align 4
@glob64 = global double 0.000000e+00, align 8

; Floating-point add to a global using an immediate.
define void @fadd_32g() {
; X64-LABEL: fadd_32g:
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss glob32(%rip), %[[XMM]]
; X64-NEXT: movss %[[XMM]], glob32(%rip)
; X32-LABEL: fadd_32g:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* bitcast (float* @glob32 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* bitcast (float* @glob32 to i32*) monotonic, align 4

define void @fadd_64g() {
; X64-LABEL: fadd_64g:
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd glob64(%rip), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], glob64(%rip)
; X32-LABEL: fadd_64g:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* bitcast (double* @glob64 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* bitcast (double* @glob64 to i64*) monotonic, align 8

; Floating-point add to a hard-coded immediate location using an immediate.
define void @fadd_32imm() {
; X64-LABEL: fadd_32imm:
; X64: movl $3735928559, %e[[M:[a-z]+]]
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss (%r[[M]]), %[[XMM]]
; X64-NEXT: movss %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_32imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4

define void @fadd_64imm() {
; X64-LABEL: fadd_64imm:
; X64: movl $3735928559, %e[[M:[a-z]+]]
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd (%r[[M]]), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_64imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8

; Floating-point add to a stack location.
define void @fadd_32stack() {
; X64-LABEL: fadd_32stack:
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movss %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_32stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i32, align 4
  %bc3 = bitcast i32* %ptr to float*
  %load = load atomic i32, i32* %ptr acquire, align 4
  %bc0 = bitcast i32 %load to float
  %fadd = fadd float 1.000000e+00, %bc0
  %bc1 = bitcast float %fadd to i32
  store atomic i32 %bc1, i32* %ptr release, align 4

define void @fadd_64stack() {
; X64-LABEL: fadd_64stack:
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_64stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i64, align 8
  %bc3 = bitcast i64* %ptr to double*
  %load = load atomic i64, i64* %ptr acquire, align 8
  %bc0 = bitcast i64 %load to double
  %fadd = fadd double 1.000000e+00, %bc0
  %bc1 = bitcast double %fadd to i64
  store atomic i64 %bc1, i64* %ptr release, align 8