include "SystemZPatterns.td"
include "SystemZInstrFormats.td"
include "SystemZInstrInfo.td"
-include "SystemZInstrFP.td"
include "SystemZInstrVector.td"
+include "SystemZInstrFP.td"
def SystemZInstrInfo : InstrInfo {}
Context);
}
+// MI loads the high part of a vector from memory. Return an instruction
+// that uses the replicating vector load Opcode to do the same thing.
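+// For example, VL64 is lowered to VLREPG, which replicates the loaded
+// doubleword into both elements, so element 0 (the high part) holds the
+// loaded value.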
+static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) {
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(MI->getOperand(3).getReg());
+}
+
+// MI stores the high part of a vector to memory. Return an instruction
+// that uses the elemental vector store Opcode to do the same thing.
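+// For example, VST64 is lowered to VSTEG with element index 0, which
+// stores just the high doubleword of the register.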
+static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) {
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(MI->getOperand(3).getReg())
+ .addImm(0);
+}
+
void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
SystemZMCInstLower Lower(MF->getContext(), *this);
MCInst LoweredMI;
.addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
break;
+ case SystemZ::VLR32:
+ case SystemZ::VLR64:
+ LoweredMI = MCInstBuilder(SystemZ::VLR)
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()));
+ break;
+
+ case SystemZ::VL32:
+ LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF);
+ break;
+
+ case SystemZ::VL64:
+ LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG);
+ break;
+
+ case SystemZ::VST32:
+ LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF);
+ break;
+
+ case SystemZ::VST64:
+ LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEG);
+ break;
+
case SystemZ::LFER:
LoweredMI = MCInstBuilder(SystemZ::VLGVF)
.addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
else
addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
- addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
- addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
- addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+ addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
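+ // With the vector facility, scalar f32 and f64 values can live in any
+ // of the 32 vector registers rather than just the 16 FPRs.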
+ if (Subtarget.hasVector()) {
+ addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
+ addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
+ } else {
+ addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
+ addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+ }
addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
if (Subtarget.hasVector()) {
defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
}
-defm : CompareZeroFP<LTEBRCompare, FP32>;
-defm : CompareZeroFP<LTDBRCompare, FP64>;
-defm : CompareZeroFP<LTXBRCompare, FP128>;
+// Note that the comparison against zero operation is not available if we
+// have vector support, since load-and-test instructions will partially
+// clobber the target (vector) register.
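+// A compare against zero then materializes the zero explicitly instead
+// (e.g. LZDR followed by a normal compare).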
+let Predicates = [FeatureNoVector] in {
+ defm : CompareZeroFP<LTEBRCompare, FP32>;
+ defm : CompareZeroFP<LTDBRCompare, FP64>;
+ defm : CompareZeroFP<LTXBRCompare, FP128>;
+}
// Moves between 64-bit integer and floating-point registers.
def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
+ // For z13 we prefer LDE over LE to avoid partial register dependencies.
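+ // The shortening pass also rewrites VL32 into LDE32 where possible.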
+ def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
+
// These instructions are split after register allocation, so we don't
// want a custom inserter.
let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
// A floating-point load-and-test operation. Create both a normal unary
// operation and one that acts as a comparison against zero.
+// Note that the comparison against zero operation is not available if we
+// have vector support, since load-and-test instructions will partially
+// clobber the target (vector) register.
multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
RegisterOperand cls> {
def "" : UnaryRRE<mnemonic, opcode, null_frag, cls, cls>;
- let isCodeGenOnly = 1 in
+ let isCodeGenOnly = 1, Predicates = [FeatureNoVector] in
def Compare : CompareRRE<mnemonic, opcode, null_frag, cls, cls>;
}
class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
: Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>;
+// An alias of a UnaryVRR*, but with different register sizes.
+class UnaryAliasVRR<SDPatternOperator operator, TypedReg tr1, TypedReg tr2>
+ : Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2),
+ [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>;
+
+// An alias of a UnaryVRX, but with different register sizes.
+class UnaryAliasVRX<SDPatternOperator operator, TypedReg tr,
+ AddressingMode mode = bdxaddr12only>
+ : Alias<6, (outs tr.op:$V1), (ins mode:$XBD2),
+ [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>;
+
+// An alias of a StoreVRX, but with different register sizes.
+class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
+ AddressingMode mode = bdxaddr12only>
+ : Alias<6, (outs), (ins tr.op:$V1, mode:$XBD2),
+ [(operator (tr.vt tr.op:$V1), mode:$XBD2)]>;
+
// An alias of a BinaryRI, but with different register sizes.
class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>
Opcode = SystemZ::LDR;
else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LXR;
+ else if (SystemZ::VR32BitRegClass.contains(DestReg, SrcReg))
+ Opcode = SystemZ::VLR32;
+ else if (SystemZ::VR64BitRegClass.contains(DestReg, SrcReg))
+ Opcode = SystemZ::VLR64;
else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::VLR;
else
} else if (RC == &SystemZ::FP128BitRegClass) {
LoadOpcode = SystemZ::LX;
StoreOpcode = SystemZ::STX;
+ } else if (RC == &SystemZ::VR32BitRegClass) {
+ LoadOpcode = SystemZ::VL32;
+ StoreOpcode = SystemZ::VST32;
+ } else if (RC == &SystemZ::VR64BitRegClass) {
+ LoadOpcode = SystemZ::VL64;
+ StoreOpcode = SystemZ::VST64;
} else if (RC == &SystemZ::VF128BitRegClass ||
RC == &SystemZ::VR128BitRegClass) {
LoadOpcode = SystemZ::VL;
let Predicates = [FeatureVector] in {
// Register move.
def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>;
+ def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>;
+ def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>;
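+ // (The asm printer lowers VLR32 and VLR64 to plain VLR.)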
// Load GR from VR element.
def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>;
def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)),
(VLREPG bdxaddr12only:$addr)>;
+ // Use VLREP to load subvectors. These patterns use "12pair" because
+ // LEY and LDY offer full 20-bit displacement fields. It's often better
+ // to use those instructions rather than force a 20-bit displacement
+ // into a GPR temporary.
+ def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>;
+ def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+
// Load logical element and zero.
def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>;
def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>;
imm32zx1:$index),
(VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
+ // Use VSTE to store subvectors. These patterns use "12pair" because
+ // STEY and STDY offer full 20-bit displacement fields. It's often better
+ // to use those instructions rather than force a 20-bit displacement
+ // into a GPR temporary.
+ def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>;
+ def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
+
// Scatter element.
def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>;
let Predicates = [FeatureVector] in {
// Add.
def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
- def WFADB : BinaryVRRc<"wfadb", 0xE7E3, null_frag, v64db, v64db, 3, 8>;
+ def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;
// Convert from fixed 64-bit.
def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>;
// Divide.
def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
- def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, null_frag, v64db, v64db, 3, 8>;
+ def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;
// Load FP integer.
def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, null_frag, v128db, v128db, 3, 0>;
def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
defm : VectorRounding<VFIDB, v128db>;
+ defm : VectorRounding<WFIDB, v64db>;
// Load lengthened.
def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
- def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, null_frag, v64db, v32eb, 2, 8>;
+ def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fextend, v64db, v32eb, 2, 8>;
// Load rounded.
def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
+ def : FPConversion<WLEDB, fround, v32eb, v64db, 0, 0>;
// Multiply.
def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
- def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, null_frag, v64db, v64db, 3, 8>;
+ def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>;
// Multiply and add.
def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>;
- def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, null_frag, v64db, v64db, 8, 3>;
+ def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>;
// Multiply and subtract.
def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>;
- def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, null_frag, v64db, v64db, 8, 3>;
+ def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>;
// Load complement.
def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>;
- def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 0>;
+ def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>;
// Load negative.
def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>;
- def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 1>;
+ def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>;
// Load positive.
def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>;
- def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 2>;
+ def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>;
// Square root.
def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>;
- def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, null_frag, v64db, v64db, 3, 8>;
+ def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>;
// Subtract.
def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>;
- def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, null_frag, v64db, v64db, 3, 8>;
+ def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>;
// Test data class immediate.
let Defs = [CC] in {
let Predicates = [FeatureVector] in {
// Compare scalar.
let Defs = [CC] in
- def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, null_frag, v64db, 3>;
+ def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>;
// Compare and signal scalar.
let Defs = [CC] in
#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
private:
bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
unsigned LLIxL, unsigned LLIxH);
+ bool shortenOn0(MachineInstr &MI, unsigned Opcode);
+ bool shortenOn01(MachineInstr &MI, unsigned Opcode);
+ bool shortenOn001(MachineInstr &MI, unsigned Opcode);
+ bool shortenFPConv(MachineInstr &MI, unsigned Opcode);
const SystemZInstrInfo *TII;
return false;
}
+// Change MI's opcode to Opcode if register operand 0 has a 4-bit encoding.
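+// (Only %f0-%f15, which overlap %v0-%v15, have 4-bit encodings; %v16-%v31
+// are only accessible to vector instructions.)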
+bool SystemZShortenInst::shortenOn0(MachineInstr &MI, unsigned Opcode) {
+ if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16) {
+ MI.setDesc(TII->get(Opcode));
+ return true;
+ }
+ return false;
+}
+
+// Change MI's opcode to Opcode if register operands 0 and 1 have a
+// 4-bit encoding.
+bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) {
+ if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+ SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) {
+ MI.setDesc(TII->get(Opcode));
+ return true;
+ }
+ return false;
+}
+
+// Change MI's opcode to Opcode if register operands 0, 1 and 2 have a
+// 4-bit encoding and if operands 0 and 1 are tied.
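+// For example, this turns WFADB (dest, src1, src2) into the two-address
+// ADBR, whose first operand is both source and destination.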
+bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) {
+ if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+ MI.getOperand(1).getReg() == MI.getOperand(0).getReg() &&
+ SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) {
+ MI.setDesc(TII->get(Opcode));
+ return true;
+ }
+ return false;
+}
+
+// MI is a vector-style conversion instruction with the operand order:
+// destination, source, exact-suppress, rounding-mode. If both registers
+// have a 4-bit encoding then change it to Opcode, which has operand order:
+// destination, rounding-mode, source, exact-suppress.
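+// For example, "wfidb %f0, %f1, 4, 7" would become "fidbra %f0, 7, %f1, 4".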
+bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
+ if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+ SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) {
+ MachineOperand Dest(MI.getOperand(0));
+ MachineOperand Src(MI.getOperand(1));
+ MachineOperand Suppress(MI.getOperand(2));
+ MachineOperand Mode(MI.getOperand(3));
+ MI.RemoveOperand(3);
+ MI.RemoveOperand(2);
+ MI.RemoveOperand(1);
+ MI.RemoveOperand(0);
+ MI.setDesc(TII->get(Opcode));
+ MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
+ .addOperand(Dest)
+ .addOperand(Mode)
+ .addOperand(Src)
+ .addOperand(Suppress);
+ return true;
+ }
+ return false;
+}
+
// Process all instructions in MBB. Return true if something changed.
bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
// Iterate backwards through the block looking for instructions to change.
for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
MachineInstr &MI = *MBBI;
- unsigned Opcode = MI.getOpcode();
- if (Opcode == SystemZ::IILF)
+ switch (MI.getOpcode()) {
+ case SystemZ::IILF:
Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL,
SystemZ::LLILH);
- else if (Opcode == SystemZ::IIHF)
+ break;
+
+ case SystemZ::IIHF:
Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL,
SystemZ::LLIHH);
+ break;
+
+ case SystemZ::WFADB:
+ Changed |= shortenOn001(MI, SystemZ::ADBR);
+ break;
+
+ case SystemZ::WFDDB:
+ Changed |= shortenOn001(MI, SystemZ::DDBR);
+ break;
+
+ case SystemZ::WFIDB:
+ Changed |= shortenFPConv(MI, SystemZ::FIDBRA);
+ break;
+
+ case SystemZ::WLDEB:
+ Changed |= shortenOn01(MI, SystemZ::LDEBR);
+ break;
+
+ case SystemZ::WLEDB:
+ Changed |= shortenFPConv(MI, SystemZ::LEDBRA);
+ break;
+
+ case SystemZ::WFMDB:
+ Changed |= shortenOn001(MI, SystemZ::MDBR);
+ break;
+
+ case SystemZ::WFLCDB:
+ Changed |= shortenOn01(MI, SystemZ::LCDBR);
+ break;
+
+ case SystemZ::WFLNDB:
+ Changed |= shortenOn01(MI, SystemZ::LNDBR);
+ break;
+
+ case SystemZ::WFLPDB:
+ Changed |= shortenOn01(MI, SystemZ::LPDBR);
+ break;
+
+ case SystemZ::WFSQDB:
+ Changed |= shortenOn01(MI, SystemZ::SQDBR);
+ break;
+
+ case SystemZ::WFSDB:
+ Changed |= shortenOn001(MI, SystemZ::SDBR);
+ break;
+
+ case SystemZ::WFCDB:
+ Changed |= shortenOn01(MI, SystemZ::CDBR);
+ break;
+
+ case SystemZ::VL32:
+ // For z13 we prefer LDE over LE to avoid partial register dependencies.
+ Changed |= shortenOn0(MI, SystemZ::LDE32);
+ break;
+
+ case SystemZ::VST32:
+ Changed |= shortenOn0(MI, SystemZ::STE);
+ break;
+
+ case SystemZ::VL64:
+ Changed |= shortenOn0(MI, SystemZ::LD);
+ break;
+
+ case SystemZ::VST64:
+ Changed |= shortenOn0(MI, SystemZ::STD);
+ break;
+ }
+
unsigned UsedLow = 0;
unsigned UsedHigh = 0;
for (auto MOI = MI.operands_begin(), MOE = MI.operands_end();
; Test floating-point absolute.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test f32.
declare float @llvm.fabs.f32(float %f)
; Test negated floating-point absolute.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test f32.
declare float @llvm.fabs.f32(float %f)
; Test 64-bit floating-point addition.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @foo()
; Check register addition.
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
-; CHECK: adb %f0, 160(%r15)
+; CHECK-SCALAR: adb %f0, 160(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4
; Test 64-bit floating-point comparison. The scalar tests assume a z10
; implementation of select, using conditional branches rather than LOCGR.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
declare double @foo()
define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) {
; CHECK-LABEL: f1:
; CHECK: cdbr %f0, %f2
-; CHECK-NEXT: je
-; CHECK: lgr %r2, %r3
+; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR: lgr %r2, %r3
+; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%cond = fcmp oeq double %f1, %f2
%res = select i1 %cond, i64 %a, i64 %b
define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cdb %f0, 0(%r4)
-; CHECK-NEXT: je
-; CHECK: lgr %r2, %r3
+; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR: lgr %r2, %r3
+; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%f2 = load double , double *%ptr
%cond = fcmp oeq double %f1, %f2
define i64 @f3(i64 %a, i64 %b, double %f1, double *%base) {
; CHECK-LABEL: f3:
; CHECK: cdb %f0, 4088(%r4)
-; CHECK-NEXT: je
-; CHECK: lgr %r2, %r3
+; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR: lgr %r2, %r3
+; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 511
%f2 = load double , double *%ptr
; CHECK-LABEL: f4:
; CHECK: aghi %r4, 4096
; CHECK: cdb %f0, 0(%r4)
-; CHECK-NEXT: je
-; CHECK: lgr %r2, %r3
+; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR: lgr %r2, %r3
+; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 512
%f2 = load double , double *%ptr
; CHECK-LABEL: f5:
; CHECK: aghi %r4, -8
; CHECK: cdb %f0, 0(%r4)
-; CHECK-NEXT: je
-; CHECK: lgr %r2, %r3
+; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR: lgr %r2, %r3
+; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 -1
%f2 = load double , double *%ptr
; CHECK-LABEL: f6:
; CHECK: sllg %r1, %r5, 3
; CHECK: cdb %f0, 800(%r1,%r4)
-; CHECK-NEXT: je
-; CHECK: lgr %r2, %r3
+; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR: lgr %r2, %r3
+; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%ptr1 = getelementptr double, double *%base, i64 %index
%ptr2 = getelementptr double, double *%ptr1, i64 100
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
-; CHECK: cdb {{%f[0-9]+}}, 160(%r15)
+; CHECK-SCALAR: cdb {{%f[0-9]+}}, 160(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4
; Check comparison with zero.
define i64 @f8(i64 %a, i64 %b, double %f) {
; CHECK-LABEL: f8:
-; CHECK: ltdbr %f0, %f0
-; CHECK-NEXT: je
-; CHECK: lgr %r2, %r3
+; CHECK-SCALAR: ltdbr %f0, %f0
+; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR: lgr %r2, %r3
+; CHECK-VECTOR: lzdr %f1
+; CHECK-VECTOR-NEXT: cdbr %f0, %f1
+; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%cond = fcmp oeq double %f, 0.0
%res = select i1 %cond, i64 %a, i64 %b
define i64 @f9(i64 %a, i64 %b, double %f2, double *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cdb %f0, 0(%r4)
-; CHECK-NEXT: jl {{\.L.*}}
-; CHECK: lgr %r2, %r3
+; CHECK-SCALAR-NEXT: jl
+; CHECK-SCALAR: lgr %r2, %r3
+; CHECK-VECTOR-NEXT: locgrnl %r2, %r3
; CHECK: br %r14
%f1 = load double , double *%ptr
%cond = fcmp ogt double %f1, %f2
; Test floating-point truncations.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
; Test f64->f32.
define float @f1(double %d1, double %d2) {
; CHECK-LABEL: f1:
-; CHECK: ledbr %f0, %f2
+; CHECK-SCALAR: ledbr %f0, %f2
+; CHECK-VECTOR: ledbra %f0, 0, %f2, 0
; CHECK: br %r14
%res = fptrunc double %d2 to float
ret float %res
define void @f5(double *%dst, fp128 *%ptr, double %d1, double %d2) {
; CHECK-LABEL: f5:
; CHECK: ldxbr %f1, %f1
-; CHECK: adbr %f1, %f2
-; CHECK: std %f1, 0(%r2)
+; CHECK-SCALAR: adbr %f1, %f2
+; CHECK-SCALAR: std %f1, 0(%r2)
+; CHECK-VECTOR: wfadb [[REG:%f[0-9]+]], %f1, %f2
+; CHECK-VECTOR: std [[REG]], 0(%r2)
; CHECK: br %r14
%val = load fp128 , fp128 *%ptr
%conv = fptrunc fp128 %val to double
; Test extensions of f32 to f64.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Check register extension.
define double @f1(float %val) {
; to use LDEB if possible.
define void @f7(double *%ptr1, float *%ptr2) {
; CHECK-LABEL: f7:
-; CHECK: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15)
+; CHECK-SCALAR: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15)
; CHECK: br %r14
%val0 = load volatile float , float *%ptr2
%val1 = load volatile float , float *%ptr2
; Test 64-bit floating-point division.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @foo()
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
-; CHECK: ddb %f0, 160(%r15)
+; CHECK-SCALAR: ddb %f0, 160(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4
; Test moves between FPRs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test f32 moves.
define float @f1(float %a, float %b) {
; CHECK-LABEL: f1:
; CHECK: ler %f0, %f2
+; CHECK: br %r14
ret float %b
}
define double @f2(double %a, double %b) {
; CHECK-LABEL: f2:
; CHECK: ldr %f0, %f2
+; CHECK: br %r14
ret double %b
}
; CHECK-LABEL: f3:
; CHECK: lxr
; CHECK: axbr
+; CHECK: br %r14
%val = load volatile fp128 , fp128 *%x
%sum = fadd fp128 %val, %val
store volatile fp128 %sum, fp128 *%x
; Test 64-bit floating-point loads.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test the low end of the LD range.
define double @f1(double *%src) {
; Test 64-bit floating-point stores.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test the low end of the STD range.
define void @f1(double *%src, double %val) {
--- /dev/null
+; Test 32-bit floating-point loads for z13.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that we use LDE instead of LE - low end of the LE range.
+define float @f1(float *%src) {
+; CHECK-LABEL: f1:
+; CHECK: lde %f0, 0(%r2)
+; CHECK: br %r14
+ %val = load float, float *%src
+ ret float %val
+}
+
+; Test that we use LDE instead of LE - high end of the LE range.
+define float @f2(float *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lde %f0, 4092(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr float, float *%src, i64 1023
+ %val = load float, float *%ptr
+ ret float %val
+}
+
+; Check the next word up, which should use LEY instead of LDE.
+define float @f3(float *%src) {
+; CHECK-LABEL: f3:
+; CHECK: ley %f0, 4096(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr float, float *%src, i64 1024
+ %val = load float, float *%ptr
+ ret float %val
+}
+
+; Check the high end of the aligned LEY range.
+define float @f4(float *%src) {
+; CHECK-LABEL: f4:
+; CHECK: ley %f0, 524284(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr float, float *%src, i64 131071
+ %val = load float, float *%ptr
+ ret float %val
+}
+
+; Check the next word up, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define float @f5(float *%src) {
+; CHECK-LABEL: f5:
+; CHECK: agfi %r2, 524288
+; CHECK: lde %f0, 0(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr float, float *%src, i64 131072
+ %val = load float, float *%ptr
+ ret float %val
+}
+
+; Check the high end of the negative aligned LEY range.
+define float @f6(float *%src) {
+; CHECK-LABEL: f6:
+; CHECK: ley %f0, -4(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr float, float *%src, i64 -1
+ %val = load float, float *%ptr
+ ret float %val
+}
+
+; Check the low end of the LEY range.
+define float @f7(float *%src) {
+; CHECK-LABEL: f7:
+; CHECK: ley %f0, -524288(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr float, float *%src, i64 -131072
+ %val = load float, float *%ptr
+ ret float %val
+}
+
+; Check the next word down, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define float @f8(float *%src) {
+; CHECK-LABEL: f8:
+; CHECK: agfi %r2, -524292
+; CHECK: lde %f0, 0(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr float, float *%src, i64 -131073
+ %val = load float, float *%ptr
+ ret float %val
+}
+
+; Check that LDE allows an index.
+define float @f9(i64 %src, i64 %index) {
+; CHECK-LABEL: f9:
+; CHECK: lde %f0, 4092({{%r3,%r2|%r2,%r3}})
+; CHECK: br %r14
+ %add1 = add i64 %src, %index
+ %add2 = add i64 %add1, 4092
+ %ptr = inttoptr i64 %add2 to float *
+ %val = load float, float *%ptr
+ ret float %val
+}
+
+; Check that LEY allows an index.
+define float @f10(i64 %src, i64 %index) {
+; CHECK-LABEL: f10:
+; CHECK: ley %f0, 4096({{%r3,%r2|%r2,%r3}})
+; CHECK: br %r14
+ %add1 = add i64 %src, %index
+ %add2 = add i64 %add1, 4096
+ %ptr = inttoptr i64 %add2 to float *
+ %val = load float, float *%ptr
+ ret float %val
+}
; Test multiplication of two f64s, producing an f64 result.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @foo()
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
-; CHECK: mdb %f0, 160(%r15)
+; CHECK-SCALAR: mdb %f0, 160(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
declare double @llvm.fma.f64(double %f1, double %f2, double %f3)
define double @f1(double %f1, double %f2, double %acc) {
; CHECK-LABEL: f1:
-; CHECK: madbr %f4, %f0, %f2
-; CHECK: ldr %f0, %f4
+; CHECK-SCALAR: madbr %f4, %f0, %f2
+; CHECK-SCALAR: ldr %f0, %f4
+; CHECK-VECTOR: wfmadb %f0, %f0, %f2, %f4
; CHECK: br %r14
%res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc)
ret double %res
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
declare double @llvm.fma.f64(double %f1, double %f2, double %f3)
define double @f1(double %f1, double %f2, double %acc) {
; CHECK-LABEL: f1:
-; CHECK: msdbr %f4, %f0, %f2
-; CHECK: ldr %f0, %f4
+; CHECK-SCALAR: msdbr %f4, %f0, %f2
+; CHECK-SCALAR: ldr %f0, %f4
+; CHECK-VECTOR: wfmsdb %f0, %f0, %f2, %f4
; CHECK: br %r14
%negacc = fsub double -0.0, %acc
%res = call double @llvm.fma.f64 (double %f1, double %f2, double %negacc)
; Test floating-point negation.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test f32.
define float @f1(float %f) {
; Test rounding functions for z196 and above.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
; Test rint for f32.
declare float @llvm.rint.f32(float %f)
declare double @llvm.rint.f64(double %f)
define double @f2(double %f) {
; CHECK-LABEL: f2:
-; CHECK: fidbr %f0, 0, %f0
+; CHECK-SCALAR: fidbr %f0, 0, %f0
+; CHECK-VECTOR: fidbra %f0, 0, %f0, 0
; CHECK: br %r14
%res = call double @llvm.rint.f64(double %f)
ret double %res
; Test 64-bit square root.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @llvm.sqrt.f64(double %f)
declare double @sqrt(double)
; to use SQDB if possible.
define void @f7(double *%ptr) {
; CHECK-LABEL: f7:
-; CHECK: sqdb {{%f[0-9]+}}, 160(%r15)
+; CHECK-SCALAR: sqdb {{%f[0-9]+}}, 160(%r15)
; CHECK: br %r14
%val0 = load volatile double , double *%ptr
%val1 = load volatile double , double *%ptr
; Test 64-bit floating-point subtraction.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
+; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @foo()
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
-; CHECK: sdb %f0, 16{{[04]}}(%r15)
+; CHECK-SCALAR: sdb %f0, 16{{[04]}}(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4
; uses a different register class, but the set of saved and restored
; registers should be the same.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; This function should require all FPRs, but no other spill slots.
; We need to save and restore 8 of the 16 FPRs, so the frame size
; Test the saving and restoring of FPRs in large frames.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
; Test a frame size that requires some FPRs to be saved and loaded using
; the 20-bit STDY and LDY while others can use the 12-bit STD and LD.
; Test spilling of FPRs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; We need to save and restore 8 of the 16 FPRs and allocate an additional
; 4-byte spill slot, rounded to 8 bytes. The frame size should be exactly
--- /dev/null
+; Like frame-03.ll, but for z13. In this case we have 16 more registers
+; available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; This function should require all FPRs, but no other spill slots.
+; We need to save and restore 8 of the 16 FPRs, so the frame size
+; should be exactly 160 + 8 * 8 = 224. The CFA offset is 160
+; (the caller-allocated part of the frame) + 224.
+define void @f1(double *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: aghi %r15, -224
+; CHECK: .cfi_def_cfa_offset 384
+; CHECK: std %f8, 216(%r15)
+; CHECK: std %f9, 208(%r15)
+; CHECK: std %f10, 200(%r15)
+; CHECK: std %f11, 192(%r15)
+; CHECK: std %f12, 184(%r15)
+; CHECK: std %f13, 176(%r15)
+; CHECK: std %f14, 168(%r15)
+; CHECK: std %f15, 160(%r15)
+; CHECK: .cfi_offset %f8, -168
+; CHECK: .cfi_offset %f9, -176
+; CHECK: .cfi_offset %f10, -184
+; CHECK: .cfi_offset %f11, -192
+; CHECK: .cfi_offset %f12, -200
+; CHECK: .cfi_offset %f13, -208
+; CHECK: .cfi_offset %f14, -216
+; CHECK: .cfi_offset %f15, -224
+; CHECK-DAG: ld %f0, 0(%r2)
+; CHECK-DAG: ld %f7, 0(%r2)
+; CHECK-DAG: ld %f8, 0(%r2)
+; CHECK-DAG: ld %f15, 0(%r2)
+; CHECK-DAG: vlrepg %v16, 0(%r2)
+; CHECK-DAG: vlrepg %v23, 0(%r2)
+; CHECK-DAG: vlrepg %v24, 0(%r2)
+; CHECK-DAG: vlrepg %v31, 0(%r2)
+; CHECK: ld %f8, 216(%r15)
+; CHECK: ld %f9, 208(%r15)
+; CHECK: ld %f10, 200(%r15)
+; CHECK: ld %f11, 192(%r15)
+; CHECK: ld %f12, 184(%r15)
+; CHECK: ld %f13, 176(%r15)
+; CHECK: ld %f14, 168(%r15)
+; CHECK: ld %f15, 160(%r15)
+; CHECK: aghi %r15, 224
+; CHECK: br %r14
+ %l0 = load volatile double, double *%ptr
+ %l1 = load volatile double, double *%ptr
+ %l2 = load volatile double, double *%ptr
+ %l3 = load volatile double, double *%ptr
+ %l4 = load volatile double, double *%ptr
+ %l5 = load volatile double, double *%ptr
+ %l6 = load volatile double, double *%ptr
+ %l7 = load volatile double, double *%ptr
+ %l8 = load volatile double, double *%ptr
+ %l9 = load volatile double, double *%ptr
+ %l10 = load volatile double, double *%ptr
+ %l11 = load volatile double, double *%ptr
+ %l12 = load volatile double, double *%ptr
+ %l13 = load volatile double, double *%ptr
+ %l14 = load volatile double, double *%ptr
+ %l15 = load volatile double, double *%ptr
+ %l16 = load volatile double, double *%ptr
+ %l17 = load volatile double, double *%ptr
+ %l18 = load volatile double, double *%ptr
+ %l19 = load volatile double, double *%ptr
+ %l20 = load volatile double, double *%ptr
+ %l21 = load volatile double, double *%ptr
+ %l22 = load volatile double, double *%ptr
+ %l23 = load volatile double, double *%ptr
+ %l24 = load volatile double, double *%ptr
+ %l25 = load volatile double, double *%ptr
+ %l26 = load volatile double, double *%ptr
+ %l27 = load volatile double, double *%ptr
+ %l28 = load volatile double, double *%ptr
+ %l29 = load volatile double, double *%ptr
+ %l30 = load volatile double, double *%ptr
+ %l31 = load volatile double, double *%ptr
+ %acc0 = fsub double %l0, %l0
+ %acc1 = fsub double %l1, %acc0
+ %acc2 = fsub double %l2, %acc1
+ %acc3 = fsub double %l3, %acc2
+ %acc4 = fsub double %l4, %acc3
+ %acc5 = fsub double %l5, %acc4
+ %acc6 = fsub double %l6, %acc5
+ %acc7 = fsub double %l7, %acc6
+ %acc8 = fsub double %l8, %acc7
+ %acc9 = fsub double %l9, %acc8
+ %acc10 = fsub double %l10, %acc9
+ %acc11 = fsub double %l11, %acc10
+ %acc12 = fsub double %l12, %acc11
+ %acc13 = fsub double %l13, %acc12
+ %acc14 = fsub double %l14, %acc13
+ %acc15 = fsub double %l15, %acc14
+ %acc16 = fsub double %l16, %acc15
+ %acc17 = fsub double %l17, %acc16
+ %acc18 = fsub double %l18, %acc17
+ %acc19 = fsub double %l19, %acc18
+ %acc20 = fsub double %l20, %acc19
+ %acc21 = fsub double %l21, %acc20
+ %acc22 = fsub double %l22, %acc21
+ %acc23 = fsub double %l23, %acc22
+ %acc24 = fsub double %l24, %acc23
+ %acc25 = fsub double %l25, %acc24
+ %acc26 = fsub double %l26, %acc25
+ %acc27 = fsub double %l27, %acc26
+ %acc28 = fsub double %l28, %acc27
+ %acc29 = fsub double %l29, %acc28
+ %acc30 = fsub double %l30, %acc29
+ %acc31 = fsub double %l31, %acc30
+ store volatile double %acc0, double *%ptr
+ store volatile double %acc1, double *%ptr
+ store volatile double %acc2, double *%ptr
+ store volatile double %acc3, double *%ptr
+ store volatile double %acc4, double *%ptr
+ store volatile double %acc5, double *%ptr
+ store volatile double %acc6, double *%ptr
+ store volatile double %acc7, double *%ptr
+ store volatile double %acc8, double *%ptr
+ store volatile double %acc9, double *%ptr
+ store volatile double %acc10, double *%ptr
+ store volatile double %acc11, double *%ptr
+ store volatile double %acc12, double *%ptr
+ store volatile double %acc13, double *%ptr
+ store volatile double %acc14, double *%ptr
+ store volatile double %acc15, double *%ptr
+ store volatile double %acc16, double *%ptr
+ store volatile double %acc17, double *%ptr
+ store volatile double %acc18, double *%ptr
+ store volatile double %acc19, double *%ptr
+ store volatile double %acc20, double *%ptr
+ store volatile double %acc21, double *%ptr
+ store volatile double %acc22, double *%ptr
+ store volatile double %acc23, double *%ptr
+ store volatile double %acc24, double *%ptr
+ store volatile double %acc25, double *%ptr
+ store volatile double %acc26, double *%ptr
+ store volatile double %acc27, double *%ptr
+ store volatile double %acc28, double *%ptr
+ store volatile double %acc29, double *%ptr
+ store volatile double %acc30, double *%ptr
+ store volatile double %acc31, double *%ptr
+ ret void
+}
+
+; Like f1, but requires one fewer FPR. We allocate in numerical order,
+; so %f15 is the one that gets dropped.
+define void @f2(double *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: aghi %r15, -216
+; CHECK: .cfi_def_cfa_offset 376
+; CHECK: std %f8, 208(%r15)
+; CHECK: std %f9, 200(%r15)
+; CHECK: std %f10, 192(%r15)
+; CHECK: std %f11, 184(%r15)
+; CHECK: std %f12, 176(%r15)
+; CHECK: std %f13, 168(%r15)
+; CHECK: std %f14, 160(%r15)
+; CHECK: .cfi_offset %f8, -168
+; CHECK: .cfi_offset %f9, -176
+; CHECK: .cfi_offset %f10, -184
+; CHECK: .cfi_offset %f11, -192
+; CHECK: .cfi_offset %f12, -200
+; CHECK: .cfi_offset %f13, -208
+; CHECK: .cfi_offset %f14, -216
+; CHECK-NOT: %v15
+; CHECK-NOT: %f15
+; CHECK: ld %f8, 208(%r15)
+; CHECK: ld %f9, 200(%r15)
+; CHECK: ld %f10, 192(%r15)
+; CHECK: ld %f11, 184(%r15)
+; CHECK: ld %f12, 176(%r15)
+; CHECK: ld %f13, 168(%r15)
+; CHECK: ld %f14, 160(%r15)
+; CHECK: aghi %r15, 216
+; CHECK: br %r14
+ %l0 = load volatile double, double *%ptr
+ %l1 = load volatile double, double *%ptr
+ %l2 = load volatile double, double *%ptr
+ %l3 = load volatile double, double *%ptr
+ %l4 = load volatile double, double *%ptr
+ %l5 = load volatile double, double *%ptr
+ %l6 = load volatile double, double *%ptr
+ %l7 = load volatile double, double *%ptr
+ %l8 = load volatile double, double *%ptr
+ %l9 = load volatile double, double *%ptr
+ %l10 = load volatile double, double *%ptr
+ %l11 = load volatile double, double *%ptr
+ %l12 = load volatile double, double *%ptr
+ %l13 = load volatile double, double *%ptr
+ %l14 = load volatile double, double *%ptr
+ %l16 = load volatile double, double *%ptr
+ %l17 = load volatile double, double *%ptr
+ %l18 = load volatile double, double *%ptr
+ %l19 = load volatile double, double *%ptr
+ %l20 = load volatile double, double *%ptr
+ %l21 = load volatile double, double *%ptr
+ %l22 = load volatile double, double *%ptr
+ %l23 = load volatile double, double *%ptr
+ %l24 = load volatile double, double *%ptr
+ %l25 = load volatile double, double *%ptr
+ %l26 = load volatile double, double *%ptr
+ %l27 = load volatile double, double *%ptr
+ %l28 = load volatile double, double *%ptr
+ %l29 = load volatile double, double *%ptr
+ %l30 = load volatile double, double *%ptr
+ %l31 = load volatile double, double *%ptr
+ %acc0 = fsub double %l0, %l0
+ %acc1 = fsub double %l1, %acc0
+ %acc2 = fsub double %l2, %acc1
+ %acc3 = fsub double %l3, %acc2
+ %acc4 = fsub double %l4, %acc3
+ %acc5 = fsub double %l5, %acc4
+ %acc6 = fsub double %l6, %acc5
+ %acc7 = fsub double %l7, %acc6
+ %acc8 = fsub double %l8, %acc7
+ %acc9 = fsub double %l9, %acc8
+ %acc10 = fsub double %l10, %acc9
+ %acc11 = fsub double %l11, %acc10
+ %acc12 = fsub double %l12, %acc11
+ %acc13 = fsub double %l13, %acc12
+ %acc14 = fsub double %l14, %acc13
+ %acc16 = fsub double %l16, %acc14
+ %acc17 = fsub double %l17, %acc16
+ %acc18 = fsub double %l18, %acc17
+ %acc19 = fsub double %l19, %acc18
+ %acc20 = fsub double %l20, %acc19
+ %acc21 = fsub double %l21, %acc20
+ %acc22 = fsub double %l22, %acc21
+ %acc23 = fsub double %l23, %acc22
+ %acc24 = fsub double %l24, %acc23
+ %acc25 = fsub double %l25, %acc24
+ %acc26 = fsub double %l26, %acc25
+ %acc27 = fsub double %l27, %acc26
+ %acc28 = fsub double %l28, %acc27
+ %acc29 = fsub double %l29, %acc28
+ %acc30 = fsub double %l30, %acc29
+ %acc31 = fsub double %l31, %acc30
+ store volatile double %acc0, double *%ptr
+ store volatile double %acc1, double *%ptr
+ store volatile double %acc2, double *%ptr
+ store volatile double %acc3, double *%ptr
+ store volatile double %acc4, double *%ptr
+ store volatile double %acc5, double *%ptr
+ store volatile double %acc6, double *%ptr
+ store volatile double %acc7, double *%ptr
+ store volatile double %acc8, double *%ptr
+ store volatile double %acc9, double *%ptr
+ store volatile double %acc10, double *%ptr
+ store volatile double %acc11, double *%ptr
+ store volatile double %acc12, double *%ptr
+ store volatile double %acc13, double *%ptr
+ store volatile double %acc14, double *%ptr
+ store volatile double %acc16, double *%ptr
+ store volatile double %acc17, double *%ptr
+ store volatile double %acc18, double *%ptr
+ store volatile double %acc19, double *%ptr
+ store volatile double %acc20, double *%ptr
+ store volatile double %acc21, double *%ptr
+ store volatile double %acc22, double *%ptr
+ store volatile double %acc23, double *%ptr
+ store volatile double %acc24, double *%ptr
+ store volatile double %acc25, double *%ptr
+ store volatile double %acc26, double *%ptr
+ store volatile double %acc27, double *%ptr
+ store volatile double %acc28, double *%ptr
+ store volatile double %acc29, double *%ptr
+ store volatile double %acc30, double *%ptr
+ store volatile double %acc31, double *%ptr
+ ret void
+}
+
+; Like f1, but should require only one call-saved FPR.
+define void @f3(double *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: aghi %r15, -168
+; CHECK: .cfi_def_cfa_offset 328
+; CHECK: std %f8, 160(%r15)
+; CHECK: .cfi_offset %f8, -168
+; CHECK-NOT: {{%[fv]9}}
+; CHECK-NOT: {{%[fv]1[0-5]}}
+; CHECK: ld %f8, 160(%r15)
+; CHECK: aghi %r15, 168
+; CHECK: br %r14
+ %l0 = load volatile double, double *%ptr
+ %l1 = load volatile double, double *%ptr
+ %l2 = load volatile double, double *%ptr
+ %l3 = load volatile double, double *%ptr
+ %l4 = load volatile double, double *%ptr
+ %l5 = load volatile double, double *%ptr
+ %l6 = load volatile double, double *%ptr
+ %l7 = load volatile double, double *%ptr
+ %l8 = load volatile double, double *%ptr
+ %l16 = load volatile double, double *%ptr
+ %l17 = load volatile double, double *%ptr
+ %l18 = load volatile double, double *%ptr
+ %l19 = load volatile double, double *%ptr
+ %l20 = load volatile double, double *%ptr
+ %l21 = load volatile double, double *%ptr
+ %l22 = load volatile double, double *%ptr
+ %l23 = load volatile double, double *%ptr
+ %l24 = load volatile double, double *%ptr
+ %l25 = load volatile double, double *%ptr
+ %l26 = load volatile double, double *%ptr
+ %l27 = load volatile double, double *%ptr
+ %l28 = load volatile double, double *%ptr
+ %l29 = load volatile double, double *%ptr
+ %l30 = load volatile double, double *%ptr
+ %l31 = load volatile double, double *%ptr
+ %acc0 = fsub double %l0, %l0
+ %acc1 = fsub double %l1, %acc0
+ %acc2 = fsub double %l2, %acc1
+ %acc3 = fsub double %l3, %acc2
+ %acc4 = fsub double %l4, %acc3
+ %acc5 = fsub double %l5, %acc4
+ %acc6 = fsub double %l6, %acc5
+ %acc7 = fsub double %l7, %acc6
+ %acc8 = fsub double %l8, %acc7
+ %acc16 = fsub double %l16, %acc8
+ %acc17 = fsub double %l17, %acc16
+ %acc18 = fsub double %l18, %acc17
+ %acc19 = fsub double %l19, %acc18
+ %acc20 = fsub double %l20, %acc19
+ %acc21 = fsub double %l21, %acc20
+ %acc22 = fsub double %l22, %acc21
+ %acc23 = fsub double %l23, %acc22
+ %acc24 = fsub double %l24, %acc23
+ %acc25 = fsub double %l25, %acc24
+ %acc26 = fsub double %l26, %acc25
+ %acc27 = fsub double %l27, %acc26
+ %acc28 = fsub double %l28, %acc27
+ %acc29 = fsub double %l29, %acc28
+ %acc30 = fsub double %l30, %acc29
+ %acc31 = fsub double %l31, %acc30
+ store volatile double %acc0, double *%ptr
+ store volatile double %acc1, double *%ptr
+ store volatile double %acc2, double *%ptr
+ store volatile double %acc3, double *%ptr
+ store volatile double %acc4, double *%ptr
+ store volatile double %acc5, double *%ptr
+ store volatile double %acc6, double *%ptr
+ store volatile double %acc7, double *%ptr
+ store volatile double %acc8, double *%ptr
+ store volatile double %acc16, double *%ptr
+ store volatile double %acc17, double *%ptr
+ store volatile double %acc18, double *%ptr
+ store volatile double %acc19, double *%ptr
+ store volatile double %acc20, double *%ptr
+ store volatile double %acc21, double *%ptr
+ store volatile double %acc22, double *%ptr
+ store volatile double %acc23, double *%ptr
+ store volatile double %acc24, double *%ptr
+ store volatile double %acc25, double *%ptr
+ store volatile double %acc26, double *%ptr
+ store volatile double %acc27, double *%ptr
+ store volatile double %acc28, double *%ptr
+ store volatile double %acc29, double *%ptr
+ store volatile double %acc30, double *%ptr
+ store volatile double %acc31, double *%ptr
+ ret void
+}
+
+; This function should use all call-clobbered FPRs and vector registers
+; but no call-saved ones. It shouldn't need to create a frame.
+define void @f4(double *%ptr) {
+; CHECK-LABEL: f4:
+; CHECK-NOT: %r15
+; CHECK-NOT: {{%[fv][89]}}
+; CHECK-NOT: {{%[fv]1[0-5]}}
+; CHECK: br %r14
+ %l0 = load volatile double, double *%ptr
+ %l1 = load volatile double, double *%ptr
+ %l2 = load volatile double, double *%ptr
+ %l3 = load volatile double, double *%ptr
+ %l4 = load volatile double, double *%ptr
+ %l5 = load volatile double, double *%ptr
+ %l6 = load volatile double, double *%ptr
+ %l7 = load volatile double, double *%ptr
+ %l16 = load volatile double, double *%ptr
+ %l17 = load volatile double, double *%ptr
+ %l18 = load volatile double, double *%ptr
+ %l19 = load volatile double, double *%ptr
+ %l20 = load volatile double, double *%ptr
+ %l21 = load volatile double, double *%ptr
+ %l22 = load volatile double, double *%ptr
+ %l23 = load volatile double, double *%ptr
+ %l24 = load volatile double, double *%ptr
+ %l25 = load volatile double, double *%ptr
+ %l26 = load volatile double, double *%ptr
+ %l27 = load volatile double, double *%ptr
+ %l28 = load volatile double, double *%ptr
+ %l29 = load volatile double, double *%ptr
+ %l30 = load volatile double, double *%ptr
+ %l31 = load volatile double, double *%ptr
+ %acc0 = fsub double %l0, %l0
+ %acc1 = fsub double %l1, %acc0
+ %acc2 = fsub double %l2, %acc1
+ %acc3 = fsub double %l3, %acc2
+ %acc4 = fsub double %l4, %acc3
+ %acc5 = fsub double %l5, %acc4
+ %acc6 = fsub double %l6, %acc5
+ %acc7 = fsub double %l7, %acc6
+ %acc16 = fsub double %l16, %acc7
+ %acc17 = fsub double %l17, %acc16
+ %acc18 = fsub double %l18, %acc17
+ %acc19 = fsub double %l19, %acc18
+ %acc20 = fsub double %l20, %acc19
+ %acc21 = fsub double %l21, %acc20
+ %acc22 = fsub double %l22, %acc21
+ %acc23 = fsub double %l23, %acc22
+ %acc24 = fsub double %l24, %acc23
+ %acc25 = fsub double %l25, %acc24
+ %acc26 = fsub double %l26, %acc25
+ %acc27 = fsub double %l27, %acc26
+ %acc28 = fsub double %l28, %acc27
+ %acc29 = fsub double %l29, %acc28
+ %acc30 = fsub double %l30, %acc29
+ %acc31 = fsub double %l31, %acc30
+ store volatile double %acc0, double *%ptr
+ store volatile double %acc1, double *%ptr
+ store volatile double %acc2, double *%ptr
+ store volatile double %acc3, double *%ptr
+ store volatile double %acc4, double *%ptr
+ store volatile double %acc5, double *%ptr
+ store volatile double %acc6, double *%ptr
+ store volatile double %acc7, double *%ptr
+ store volatile double %acc16, double *%ptr
+ store volatile double %acc17, double *%ptr
+ store volatile double %acc18, double *%ptr
+ store volatile double %acc19, double *%ptr
+ store volatile double %acc20, double *%ptr
+ store volatile double %acc21, double *%ptr
+ store volatile double %acc22, double *%ptr
+ store volatile double %acc23, double *%ptr
+ store volatile double %acc24, double *%ptr
+ store volatile double %acc25, double *%ptr
+ store volatile double %acc26, double *%ptr
+ store volatile double %acc27, double *%ptr
+ store volatile double %acc28, double *%ptr
+ store volatile double %acc29, double *%ptr
+ store volatile double %acc30, double *%ptr
+ store volatile double %acc31, double *%ptr
+ ret void
+}
-; Test v2f64 absolute.
+; Test f64 and v2f64 absolute.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+declare double @llvm.fabs.f64(double)
declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
; Test a plain absolute.
%ret = fsub <2 x double> <double -0.0, double -0.0>, %abs
ret <2 x double> %ret
}
+
+; Test an f64 absolute that uses vector registers.
+define double @f3(<2 x double> %val) {
+; CHECK-LABEL: f3:
+; CHECK: wflpdb %f0, %v24
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %ret = call double @llvm.fabs.f64(double %scalar)
+ ret double %ret
+}
+
+; Test an f64 negative absolute that uses vector registers.
+define double @f4(<2 x double> %val) {
+; CHECK-LABEL: f4:
+; CHECK: wflndb %f0, %v24
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %abs = call double @llvm.fabs.f64(double %scalar)
+ %ret = fsub double -0.0, %abs
+ ret double %ret
+}
%ret = fadd <2 x double> %val1, %val2
ret <2 x double> %ret
}
+
+; Test an f64 addition that uses vector registers.
+define double @f6(<2 x double> %val1, <2 x double> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: wfadb %f0, %v24, %v26
+; CHECK: br %r14
+ %scalar1 = extractelement <2 x double> %val1, i32 0
+ %scalar2 = extractelement <2 x double> %val2, i32 0
+ %ret = fadd double %scalar1, %scalar2
+ ret double %ret
+}
-; Test v2f64 comparisons.
+; Test f64 and v2f64 comparisons.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
%ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
ret <2 x double> %ret
}
+
+; Test an f64 comparison that uses vector registers.
+define i64 @f29(i64 %a, i64 %b, double %f1, <2 x double> %vec) {
+; CHECK-LABEL: f29:
+; CHECK: wfcdb %f0, %v24
+; CHECK-NEXT: locgrne %r2, %r3
+; CHECK: br %r14
+ %f2 = extractelement <2 x double> %vec, i32 0
+ %cond = fcmp oeq double %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
store <2 x float> %res, <2 x float> *%ptr
ret void
}
+
+; Test conversion of an f64 in a vector register to an f32.
+define float @f2(<2 x double> %vec) {
+; CHECK-LABEL: f2:
+; CHECK: wledb %f0, %v24
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %vec, i32 0
+ %ret = fptrunc double %scalar to float
+ ret float %ret
+}
+
+; Test conversion of an f32 in a vector register to an f64.
+define double @f3(<4 x float> %vec) {
+; CHECK-LABEL: f3:
+; CHECK: wldeb %f0, %v24
+; CHECK: br %r14
+ %scalar = extractelement <4 x float> %vec, i32 0
+ %ret = fpext float %scalar to double
+ ret double %ret
+}
%ret = fdiv <2 x double> %val1, %val2
ret <2 x double> %ret
}
+
+; Test an f64 division that uses vector registers.
+define double @f6(<2 x double> %val1, <2 x double> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: wfddb %f0, %v24, %v26
+; CHECK: br %r14
+ %scalar1 = extractelement <2 x double> %val1, i32 0
+ %scalar2 = extractelement <2 x double> %val2, i32 0
+ %ret = fdiv double %scalar1, %scalar2
+ ret double %ret
+}
%ret = fmul <2 x double> %val1, %val2
ret <2 x double> %ret
}
+
+; Test an f64 multiplication that uses vector registers.
+define double @f6(<2 x double> %val1, <2 x double> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: wfmdb %f0, %v24, %v26
+; CHECK: br %r14
+ %scalar1 = extractelement <2 x double> %val1, i32 0
+ %scalar2 = extractelement <2 x double> %val2, i32 0
+ %ret = fmul double %scalar1, %scalar2
+ ret double %ret
+}
%ret = fsub <2 x double> <double -0.0, double -0.0>, %val
ret <2 x double> %ret
}
+
+; Test an f64 negation that uses vector registers.
+define double @f6(<2 x double> %val) {
+; CHECK-LABEL: f6:
+; CHECK: wflcdb %f0, %v24
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %ret = fsub double -0.0, %scalar
+ ret double %ret
+}
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+declare double @llvm.rint.f64(double)
+declare double @llvm.nearbyint.f64(double)
+declare double @llvm.floor.f64(double)
+declare double @llvm.ceil.f64(double)
+declare double @llvm.trunc.f64(double)
+declare double @llvm.round.f64(double)
declare <2 x double> @llvm.rint.v2f64(<2 x double>)
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
declare <2 x double> @llvm.floor.v2f64(<2 x double>)
%res = call <2 x double> @llvm.round.v2f64(<2 x double> %val)
ret <2 x double> %res
}
+
+define double @f7(<2 x double> %val) {
+; CHECK-LABEL: f7:
+; CHECK: wfidb %f0, %v24, 0, 0
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %res = call double @llvm.rint.f64(double %scalar)
+ ret double %res
+}
+
+define double @f8(<2 x double> %val) {
+; CHECK-LABEL: f8:
+; CHECK: wfidb %f0, %v24, 4, 0
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %res = call double @llvm.nearbyint.f64(double %scalar)
+ ret double %res
+}
+
+define double @f9(<2 x double> %val) {
+; CHECK-LABEL: f9:
+; CHECK: wfidb %f0, %v24, 4, 7
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %res = call double @llvm.floor.f64(double %scalar)
+ ret double %res
+}
+
+define double @f10(<2 x double> %val) {
+; CHECK-LABEL: f10:
+; CHECK: wfidb %f0, %v24, 4, 6
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %res = call double @llvm.ceil.f64(double %scalar)
+ ret double %res
+}
+
+define double @f11(<2 x double> %val) {
+; CHECK-LABEL: f11:
+; CHECK: wfidb %f0, %v24, 4, 5
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %res = call double @llvm.trunc.f64(double %scalar)
+ ret double %res
+}
+
+define double @f12(<2 x double> %val) {
+; CHECK-LABEL: f12:
+; CHECK: wfidb %f0, %v24, 4, 1
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %res = call double @llvm.round.f64(double %scalar)
+ ret double %res
+}
-; Test v2f64 square root.
+; Test f64 and v2f64 square root.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
define <2 x double> @f1(<2 x double> %val) {
%ret = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %val)
ret <2 x double> %ret
}
+
+define double @f2(<2 x double> %val) {
+; CHECK-LABEL: f2:
+; CHECK: wfsqdb %f0, %v24
+; CHECK: br %r14
+ %scalar = extractelement <2 x double> %val, i32 0
+ %ret = call double @llvm.sqrt.f64(double %scalar)
+ ret double %ret
+}
%ret = fsub <2 x double> %val1, %val2
ret <2 x double> %ret
}
+
+; Test an f64 subtraction that uses vector registers.
+define double @f7(<2 x double> %val1, <2 x double> %val2) {
+; CHECK-LABEL: f7:
+; CHECK: wfsdb %f0, %v24, %v26
+; CHECK: br %r14
+ %scalar1 = extractelement <2 x double> %val1, i32 0
+ %scalar2 = extractelement <2 x double> %val2, i32 0
+ %ret = fsub double %scalar1, %scalar2
+ ret double %ret
+}