From cf0fa9b9dd296771a2de766c5262b96938cf13a3 Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Tue, 5 May 2015 19:28:34 +0000 Subject: [PATCH] [SystemZ] Add CodeGen support for scalar f64 ops in vector registers The z13 vector facility includes some instructions that operate only on the high f64 in a v2f64, effectively extending the FP register set from 16 to 32 registers. It's still better to use the old instructions if the operands happen to fit though, since the older instructions have a shorter encoding. Based on a patch by Richard Sandiford. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236524 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/SystemZ/SystemZ.td | 2 +- lib/Target/SystemZ/SystemZAsmPrinter.cpp | 44 ++ lib/Target/SystemZ/SystemZISelLowering.cpp | 11 +- lib/Target/SystemZ/SystemZInstrFP.td | 14 +- lib/Target/SystemZ/SystemZInstrFormats.td | 22 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 10 + lib/Target/SystemZ/SystemZInstrVector.td | 42 +- lib/Target/SystemZ/SystemZShortenInst.cpp | 139 ++++++- test/CodeGen/SystemZ/fp-abs-01.ll | 3 +- test/CodeGen/SystemZ/fp-abs-02.ll | 3 +- test/CodeGen/SystemZ/fp-add-02.ll | 7 +- test/CodeGen/SystemZ/fp-cmp-02.ll | 51 ++- test/CodeGen/SystemZ/fp-conv-01.ll | 14 +- test/CodeGen/SystemZ/fp-conv-02.ll | 6 +- test/CodeGen/SystemZ/fp-div-02.ll | 6 +- test/CodeGen/SystemZ/fp-move-01.ll | 6 +- test/CodeGen/SystemZ/fp-move-04.ll | 3 +- test/CodeGen/SystemZ/fp-move-07.ll | 3 +- test/CodeGen/SystemZ/fp-move-11.ll | 110 +++++ test/CodeGen/SystemZ/fp-mul-03.ll | 6 +- test/CodeGen/SystemZ/fp-mul-07.ll | 10 +- test/CodeGen/SystemZ/fp-mul-09.ll | 10 +- test/CodeGen/SystemZ/fp-neg-01.ll | 3 +- test/CodeGen/SystemZ/fp-round-02.ll | 8 +- test/CodeGen/SystemZ/fp-sqrt-02.ll | 6 +- test/CodeGen/SystemZ/fp-sub-02.ll | 6 +- test/CodeGen/SystemZ/frame-03.ll | 2 +- test/CodeGen/SystemZ/frame-07.ll | 4 +- test/CodeGen/SystemZ/frame-17.ll | 2 +- test/CodeGen/SystemZ/frame-20.ll | 445 +++++++++++++++++++++ 
test/CodeGen/SystemZ/vec-abs-05.ll | 24 +- test/CodeGen/SystemZ/vec-add-01.ll | 11 + test/CodeGen/SystemZ/vec-cmp-06.ll | 14 +- test/CodeGen/SystemZ/vec-conv-02.ll | 20 + test/CodeGen/SystemZ/vec-div-01.ll | 11 + test/CodeGen/SystemZ/vec-mul-01.ll | 11 + test/CodeGen/SystemZ/vec-neg-01.ll | 10 + test/CodeGen/SystemZ/vec-round-01.ll | 60 +++ test/CodeGen/SystemZ/vec-sqrt-01.ll | 12 +- test/CodeGen/SystemZ/vec-sub-01.ll | 11 + 40 files changed, 1102 insertions(+), 80 deletions(-) create mode 100644 test/CodeGen/SystemZ/fp-move-11.ll create mode 100644 test/CodeGen/SystemZ/frame-20.ll diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td index 4f3f18ed499..d4d636d3479 100644 --- a/lib/Target/SystemZ/SystemZ.td +++ b/lib/Target/SystemZ/SystemZ.td @@ -40,8 +40,8 @@ include "SystemZOperands.td" include "SystemZPatterns.td" include "SystemZInstrFormats.td" include "SystemZInstrInfo.td" -include "SystemZInstrFP.td" include "SystemZInstrVector.td" +include "SystemZInstrFP.td" def SystemZInstrInfo : InstrInfo {} diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp index 026a75f2140..f16488063af 100644 --- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -80,6 +80,27 @@ static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) { Context); } +// MI loads the high part of a vector from memory. Return an instruction +// that uses replicating vector load Opcode to do the same thing. +static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) { + return MCInstBuilder(Opcode) + .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) + .addReg(MI->getOperand(1).getReg()) + .addImm(MI->getOperand(2).getImm()) + .addReg(MI->getOperand(3).getReg()); +} + +// MI stores the high part of a vector to memory. Return an instruction +// that uses elemental vector store Opcode to do the same thing. 
+static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) { + return MCInstBuilder(Opcode) + .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) + .addReg(MI->getOperand(1).getReg()) + .addImm(MI->getOperand(2).getImm()) + .addReg(MI->getOperand(3).getReg()) + .addImm(0); +} + void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) { SystemZMCInstLower Lower(MF->getContext(), *this); MCInst LoweredMI; @@ -158,6 +179,29 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) { .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg())); break; + case SystemZ::VLR32: + case SystemZ::VLR64: + LoweredMI = MCInstBuilder(SystemZ::VLR) + .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) + .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg())); + break; + + case SystemZ::VL32: + LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF); + break; + + case SystemZ::VL64: + LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG); + break; + + case SystemZ::VST32: + LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF); + break; + + case SystemZ::VST64: + LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEG); + break; + case SystemZ::LFER: LoweredMI = MCInstBuilder(SystemZ::VLGVF) .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg())) diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 391cb8c6fc9..ff79a48179f 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -91,9 +91,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass); else addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass); - addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass); - addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); - addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); + addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass); + if 
(Subtarget.hasVector()) { + addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass); + addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass); + } else { + addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); + addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); + } addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); if (Subtarget.hasVector()) { diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index efa29fa9c00..27fbd7df288 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -46,9 +46,14 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>; defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>; } -defm : CompareZeroFP; -defm : CompareZeroFP; -defm : CompareZeroFP; +// Note that the comparison against zero operation is not available if we +// have vector support, since load-and-test instructions will partially +// clobber the target (vector) register. +let Predicates = [FeatureNoVector] in { + defm : CompareZeroFP; + defm : CompareZeroFP; + defm : CompareZeroFP; +} // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>; @@ -98,6 +103,9 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in { defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>; defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>; + // For z13 we prefer LDE over LE to avoid partial register dependencies. + def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>; + // These instructions are split after register allocation, so we don't // want a custom inserter. 
let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in { diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index dc9dfa801fd..71eb9986499 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2151,10 +2151,13 @@ class PrefetchRILPC opcode, // A floating-point load-and test operation. Create both a normal unary // operation and one that acts as a comparison against zero. +// Note that the comparison against zero operation is not available if we +// have vector support, since load-and-test instructions will partially +// clobber the target (vector) register. multiclass LoadAndTestRRE opcode, RegisterOperand cls> { def "" : UnaryRRE; - let isCodeGenOnly = 1 in + let isCodeGenOnly = 1, Predicates = [FeatureNoVector] in def Compare : CompareRRE; } @@ -2401,6 +2404,23 @@ class Alias pattern> class UnaryAliasVRS : Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>; +// An alias of a UnaryVRR*, but with different register sizes. +class UnaryAliasVRR + : Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2), + [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>; + +// An alias of a UnaryVRX, but with different register sizes. +class UnaryAliasVRX + : Alias<6, (outs tr.op:$V1), (ins mode:$XBD2), + [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>; + +// An alias of a StoreVRX, but with different register sizes. +class StoreAliasVRX + : Alias<6, (outs), (ins tr.op:$V1, mode:$XBD2), + [(operator (tr.vt tr.op:$V1), mode:$XBD2)]>; + // An alias of a BinaryRI, but with different register sizes. 
class BinaryAliasRI diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 63101a9d000..8dbd9df32b7 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -578,6 +578,10 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opcode = SystemZ::LDR; else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg)) Opcode = SystemZ::LXR; + else if (SystemZ::VR32BitRegClass.contains(DestReg, SrcReg)) + Opcode = SystemZ::VLR32; + else if (SystemZ::VR64BitRegClass.contains(DestReg, SrcReg)) + Opcode = SystemZ::VLR64; else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg)) Opcode = SystemZ::VLR; else @@ -1118,6 +1122,12 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC, } else if (RC == &SystemZ::FP128BitRegClass) { LoadOpcode = SystemZ::LX; StoreOpcode = SystemZ::STX; + } else if (RC == &SystemZ::VR32BitRegClass) { + LoadOpcode = SystemZ::VL32; + StoreOpcode = SystemZ::VST32; + } else if (RC == &SystemZ::VR64BitRegClass) { + LoadOpcode = SystemZ::VL64; + StoreOpcode = SystemZ::VST64; } else if (RC == &SystemZ::VF128BitRegClass || RC == &SystemZ::VR128BitRegClass) { LoadOpcode = SystemZ::VL; diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index b6c8042b3c8..8abaeb69a20 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -14,6 +14,8 @@ let Predicates = [FeatureVector] in { // Register move. def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>; + def VLR32 : UnaryAliasVRR; + def VLR64 : UnaryAliasVRR; // Load GR from VR element. def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>; @@ -123,6 +125,13 @@ let Predicates = [FeatureVector] in { def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)), (VLREPG bdxaddr12only:$addr)>; + // Use VLREP to load subvectors. 
These patterns use "12pair" because + // LEY and LDY offer full 20-bit displacement fields. It's often better + // to use those instructions rather than force a 20-bit displacement + // into a GPR temporary. + def VL32 : UnaryAliasVRX; + def VL64 : UnaryAliasVRX; + // Load logical element and zero. def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>; def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>; @@ -193,6 +202,13 @@ let Predicates = [FeatureVector] in { imm32zx1:$index), (VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>; + // Use VSTE to store subvectors. These patterns use "12pair" because + // STEY and STDY offer full 20-bit displacement fields. It's often better + // to use those instructions rather than force a 20-bit displacement + // into a GPR temporary. + def VST32 : StoreAliasVRX; + def VST64 : StoreAliasVRX; + // Scatter element. def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>; def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>; @@ -778,7 +794,7 @@ multiclass VectorRounding { let Predicates = [FeatureVector] in { // Add. def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>; - def WFADB : BinaryVRRc<"wfadb", 0xE7E3, null_frag, v64db, v64db, 3, 8>; + def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>; // Convert from fixed 64-bit. def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>; @@ -804,53 +820,55 @@ let Predicates = [FeatureVector] in { // Divide. def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>; - def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, null_frag, v64db, v64db, 3, 8>; + def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>; // Load FP integer. def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, null_frag, v128db, v128db, 3, 0>; def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>; defm : VectorRounding; + defm : VectorRounding; // Load lengthened. 
def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>; - def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, null_frag, v64db, v32eb, 2, 8>; + def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fextend, v64db, v32eb, 2, 8>; // Load rounded, def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>; def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>; def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>; + def : FPConversion; // Multiply. def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; - def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, null_frag, v64db, v64db, 3, 8>; + def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>; // Multiply and add. def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>; - def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, null_frag, v64db, v64db, 8, 3>; + def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>; // Multiply and subtract. def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>; - def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, null_frag, v64db, v64db, 8, 3>; + def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>; // Load complement, def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>; - def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 0>; + def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>; // Load negative. def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>; - def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 1>; + def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>; // Load positive. def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>; - def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 2>; + def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>; // Square root. 
def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>; - def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, null_frag, v64db, v64db, 3, 8>; + def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>; // Subtract. def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>; - def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, null_frag, v64db, v64db, 3, 8>; + def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>; // Test data class immediate. let Defs = [CC] in { @@ -866,7 +884,7 @@ let Predicates = [FeatureVector] in { let Predicates = [FeatureVector] in { // Compare scalar. let Defs = [CC] in - def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, null_frag, v64db, 3>; + def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>; // Compare and signal scalar. let Defs = [CC] in diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index ec7a8c40d18..d1a17c5500d 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -15,6 +15,7 @@ #include "SystemZTargetMachine.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" using namespace llvm; @@ -36,6 +37,10 @@ public: private: bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther, unsigned LLIxL, unsigned LLIxH); + bool shortenOn0(MachineInstr &MI, unsigned Opcode); + bool shortenOn01(MachineInstr &MI, unsigned Opcode); + bool shortenOn001(MachineInstr &MI, unsigned Opcode); + bool shortenFPConv(MachineInstr &MI, unsigned Opcode); const SystemZInstrInfo *TII; @@ -97,6 +102,64 @@ bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap, return false; } +// Change MI's opcode to Opcode if register operand 0 has a 4-bit encoding. 
+bool SystemZShortenInst::shortenOn0(MachineInstr &MI, unsigned Opcode) { + if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16) { + MI.setDesc(TII->get(Opcode)); + return true; + } + return false; +} + +// Change MI's opcode to Opcode if register operands 0 and 1 have a +// 4-bit encoding. +bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) { + if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 && + SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) { + MI.setDesc(TII->get(Opcode)); + return true; + } + return false; +} + +// Change MI's opcode to Opcode if register operands 0, 1 and 2 have a +// 4-bit encoding and if operands 0 and 1 are tied. +bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) { + if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 && + MI.getOperand(1).getReg() == MI.getOperand(0).getReg() && + SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) { + MI.setDesc(TII->get(Opcode)); + return true; + } + return false; +} + +// MI is a vector-style conversion instruction with the operand order: +// destination, source, exact-suppress, rounding-mode. If both registers +// have a 4-bit encoding then change it to Opcode, which has operand order: +// destination, rounding-mode, source, exact-suppress. 
+bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) { + if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 && + SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) { + MachineOperand Dest(MI.getOperand(0)); + MachineOperand Src(MI.getOperand(1)); + MachineOperand Suppress(MI.getOperand(2)); + MachineOperand Mode(MI.getOperand(3)); + MI.RemoveOperand(3); + MI.RemoveOperand(2); + MI.RemoveOperand(1); + MI.RemoveOperand(0); + MI.setDesc(TII->get(Opcode)); + MachineInstrBuilder(*MI.getParent()->getParent(), &MI) + .addOperand(Dest) + .addOperand(Mode) + .addOperand(Src) + .addOperand(Suppress); + return true; + } + return false; +} + // Process all instructions in MBB. Return true if something changed. bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { bool Changed = false; @@ -117,13 +180,83 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { // Iterate backwards through the block looking for instructions to change. for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) { MachineInstr &MI = *MBBI; - unsigned Opcode = MI.getOpcode(); - if (Opcode == SystemZ::IILF) + switch (MI.getOpcode()) { + case SystemZ::IILF: Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL, SystemZ::LLILH); - else if (Opcode == SystemZ::IIHF) + break; + + case SystemZ::IIHF: Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL, SystemZ::LLIHH); + break; + + case SystemZ::WFADB: + Changed |= shortenOn001(MI, SystemZ::ADBR); + break; + + case SystemZ::WFDDB: + Changed |= shortenOn001(MI, SystemZ::DDBR); + break; + + case SystemZ::WFIDB: + Changed |= shortenFPConv(MI, SystemZ::FIDBRA); + break; + + case SystemZ::WLDEB: + Changed |= shortenOn01(MI, SystemZ::LDEBR); + break; + + case SystemZ::WLEDB: + Changed |= shortenFPConv(MI, SystemZ::LEDBRA); + break; + + case SystemZ::WFMDB: + Changed |= shortenOn001(MI, SystemZ::MDBR); + break; + + case SystemZ::WFLCDB: + Changed |= shortenOn01(MI, SystemZ::LCDBR); 
+ break; + + case SystemZ::WFLNDB: + Changed |= shortenOn01(MI, SystemZ::LNDBR); + break; + + case SystemZ::WFLPDB: + Changed |= shortenOn01(MI, SystemZ::LPDBR); + break; + + case SystemZ::WFSQDB: + Changed |= shortenOn01(MI, SystemZ::SQDBR); + break; + + case SystemZ::WFSDB: + Changed |= shortenOn001(MI, SystemZ::SDBR); + break; + + case SystemZ::WFCDB: + Changed |= shortenOn01(MI, SystemZ::CDBR); + break; + + case SystemZ::VL32: + // For z13 we prefer LDE over LE to avoid partial register dependencies. + Changed |= shortenOn0(MI, SystemZ::LDE32); + break; + + case SystemZ::VST32: + Changed |= shortenOn0(MI, SystemZ::STE); + break; + + case SystemZ::VL64: + Changed |= shortenOn0(MI, SystemZ::LD); + break; + + case SystemZ::VST64: + Changed |= shortenOn0(MI, SystemZ::STD); + break; + } + unsigned UsedLow = 0; unsigned UsedHigh = 0; for (auto MOI = MI.operands_begin(), MOE = MI.operands_end(); diff --git a/test/CodeGen/SystemZ/fp-abs-01.ll b/test/CodeGen/SystemZ/fp-abs-01.ll index d14a92acae8..3b143d93315 100644 --- a/test/CodeGen/SystemZ/fp-abs-01.ll +++ b/test/CodeGen/SystemZ/fp-abs-01.ll @@ -1,6 +1,7 @@ ; Test floating-point absolute. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test f32. declare float @llvm.fabs.f32(float %f) diff --git a/test/CodeGen/SystemZ/fp-abs-02.ll b/test/CodeGen/SystemZ/fp-abs-02.ll index deec8c32b4a..e831ddb86fe 100644 --- a/test/CodeGen/SystemZ/fp-abs-02.ll +++ b/test/CodeGen/SystemZ/fp-abs-02.ll @@ -1,6 +1,7 @@ ; Test negated floating-point absolute. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test f32. 
declare float @llvm.fabs.f32(float %f) diff --git a/test/CodeGen/SystemZ/fp-add-02.ll b/test/CodeGen/SystemZ/fp-add-02.ll index 07c7462020f..5be1ad79d45 100644 --- a/test/CodeGen/SystemZ/fp-add-02.ll +++ b/test/CodeGen/SystemZ/fp-add-02.ll @@ -1,7 +1,8 @@ ; Test 64-bit floating-point addition. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s - +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @foo() ; Check register addition. @@ -76,7 +77,7 @@ define double @f6(double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: adb %f0, 160(%r15) +; CHECK-SCALAR: adb %f0, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-cmp-02.ll b/test/CodeGen/SystemZ/fp-cmp-02.ll index 95af309e795..94a256777c7 100644 --- a/test/CodeGen/SystemZ/fp-cmp-02.ll +++ b/test/CodeGen/SystemZ/fp-cmp-02.ll @@ -1,7 +1,10 @@ ; Test 64-bit floating-point comparison. The tests assume a z10 implementation ; of select, using conditional branches rather than LOCGR. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare double @foo() @@ -9,8 +12,9 @@ declare double @foo() define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) { ; CHECK-LABEL: f1: ; CHECK: cdbr %f0, %f2 -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq double %f1, %f2 %res = select i1 %cond, i64 %a, i64 %b @@ -21,8 +25,9 @@ define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) { define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) { ; CHECK-LABEL: f2: ; CHECK: cdb %f0, 0(%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %f2 = load double , double *%ptr %cond = fcmp oeq double %f1, %f2 @@ -34,8 +39,9 @@ define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) { define i64 @f3(i64 %a, i64 %b, double %f1, double *%base) { ; CHECK-LABEL: f3: ; CHECK: cdb %f0, 4088(%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 511 %f2 = load double , double *%ptr @@ -50,8 +56,9 @@ define i64 @f4(i64 %a, i64 %b, double %f1, double *%base) { ; CHECK-LABEL: f4: ; CHECK: aghi %r4, 4096 ; CHECK: cdb %f0, 0(%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 512 %f2 = load double , double *%ptr @@ -65,8 +72,9 @@ define i64 @f5(i64 %a, i64 %b, double %f1, double *%base) { ; CHECK-LABEL: f5: ; 
CHECK: aghi %r4, -8 ; CHECK: cdb %f0, 0(%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 -1 %f2 = load double , double *%ptr @@ -80,8 +88,9 @@ define i64 @f6(i64 %a, i64 %b, double %f1, double *%base, i64 %index) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r5, 3 ; CHECK: cdb %f0, 800(%r1,%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr1 = getelementptr double, double *%base, i64 %index %ptr2 = getelementptr double, double *%ptr1, i64 100 @@ -95,7 +104,7 @@ define i64 @f6(i64 %a, i64 %b, double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: cdb {{%f[0-9]+}}, 160(%r15) +; CHECK-SCALAR: cdb {{%f[0-9]+}}, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 @@ -152,9 +161,12 @@ define double @f7(double *%ptr0) { ; Check comparison with zero. 
define i64 @f8(i64 %a, i64 %b, double %f) { ; CHECK-LABEL: f8: -; CHECK: ltdbr %f0, %f0 -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR: ltdbr %f0, %f0 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR: lzdr %f1 +; CHECK-VECTOR-NEXT: cdbr %f0, %f1 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq double %f, 0.0 %res = select i1 %cond, i64 %a, i64 %b @@ -165,8 +177,9 @@ define i64 @f8(i64 %a, i64 %b, double %f) { define i64 @f9(i64 %a, i64 %b, double %f2, double *%ptr) { ; CHECK-LABEL: f9: ; CHECK: cdb %f0, 0(%r4) -; CHECK-NEXT: jl {{\.L.*}} -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: jl +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnl %r2, %r3 ; CHECK: br %r14 %f1 = load double , double *%ptr %cond = fcmp ogt double %f1, %f2 diff --git a/test/CodeGen/SystemZ/fp-conv-01.ll b/test/CodeGen/SystemZ/fp-conv-01.ll index ebc174afada..06740ed4b4a 100644 --- a/test/CodeGen/SystemZ/fp-conv-01.ll +++ b/test/CodeGen/SystemZ/fp-conv-01.ll @@ -1,11 +1,15 @@ ; Test floating-point truncations. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s ; Test f64->f32. 
define float @f1(double %d1, double %d2) { ; CHECK-LABEL: f1: -; CHECK: ledbr %f0, %f2 +; CHECK-SCALAR: ledbr %f0, %f2 +; CHECK-VECTOR: ledbra %f0, 0, %f2, 0 ; CHECK: br %r14 %res = fptrunc double %d2 to float ret float %res @@ -50,8 +54,10 @@ define double @f4(fp128 *%ptr) { define void @f5(double *%dst, fp128 *%ptr, double %d1, double %d2) { ; CHECK-LABEL: f5: ; CHECK: ldxbr %f1, %f1 -; CHECK: adbr %f1, %f2 -; CHECK: std %f1, 0(%r2) +; CHECK-SCALAR: adbr %f1, %f2 +; CHECK-SCALAR: std %f1, 0(%r2) +; CHECK-VECTOR: wfadb [[REG:%f[0-9]+]], %f1, %f2 +; CHECK-VECTOR: std [[REG]], 0(%r2) ; CHECK: br %r14 %val = load fp128 , fp128 *%ptr %conv = fptrunc fp128 %val to double diff --git a/test/CodeGen/SystemZ/fp-conv-02.ll b/test/CodeGen/SystemZ/fp-conv-02.ll index e9376ba6973..be32bfe7ba9 100644 --- a/test/CodeGen/SystemZ/fp-conv-02.ll +++ b/test/CodeGen/SystemZ/fp-conv-02.ll @@ -1,6 +1,8 @@ ; Test extensions of f32 to f64. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Check register extension. define double @f1(float %val) { @@ -74,7 +76,7 @@ define double @f6(float *%base, i64 %index) { ; to use LDEB if possible. define void @f7(double *%ptr1, float *%ptr2) { ; CHECK-LABEL: f7: -; CHECK: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15) +; CHECK-SCALAR: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15) ; CHECK: br %r14 %val0 = load volatile float , float *%ptr2 %val1 = load volatile float , float *%ptr2 diff --git a/test/CodeGen/SystemZ/fp-div-02.ll b/test/CodeGen/SystemZ/fp-div-02.ll index 82eeb480602..f120e7c923d 100644 --- a/test/CodeGen/SystemZ/fp-div-02.ll +++ b/test/CodeGen/SystemZ/fp-div-02.ll @@ -1,6 +1,8 @@ ; Test 64-bit floating-point division. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @foo() @@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: ddb %f0, 160(%r15) +; CHECK-SCALAR: ddb %f0, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-move-01.ll b/test/CodeGen/SystemZ/fp-move-01.ll index 31a8fc55d77..843b1b6a6e6 100644 --- a/test/CodeGen/SystemZ/fp-move-01.ll +++ b/test/CodeGen/SystemZ/fp-move-01.ll @@ -1,11 +1,13 @@ ; Test moves between FPRs. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test f32 moves. define float @f1(float %a, float %b) { ; CHECK-LABEL: f1: ; CHECK: ler %f0, %f2 +; CHECK: br %r14 ret float %b } @@ -13,6 +15,7 @@ define float @f1(float %a, float %b) { define double @f2(double %a, double %b) { ; CHECK-LABEL: f2: ; CHECK: ldr %f0, %f2 +; CHECK: br %r14 ret double %b } @@ -22,6 +25,7 @@ define void @f3(fp128 *%x) { ; CHECK-LABEL: f3: ; CHECK: lxr ; CHECK: axbr +; CHECK: br %r14 %val = load volatile fp128 , fp128 *%x %sum = fadd fp128 %val, %val store volatile fp128 %sum, fp128 *%x diff --git a/test/CodeGen/SystemZ/fp-move-04.ll b/test/CodeGen/SystemZ/fp-move-04.ll index d3728d0e585..6650419b2c3 100644 --- a/test/CodeGen/SystemZ/fp-move-04.ll +++ b/test/CodeGen/SystemZ/fp-move-04.ll @@ -1,6 +1,7 @@ ; Test 64-bit floating-point loads. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test the low end of the LD range. define double @f1(double *%src) { diff --git a/test/CodeGen/SystemZ/fp-move-07.ll b/test/CodeGen/SystemZ/fp-move-07.ll index c3ad2a59f66..5361002a97e 100644 --- a/test/CodeGen/SystemZ/fp-move-07.ll +++ b/test/CodeGen/SystemZ/fp-move-07.ll @@ -1,6 +1,7 @@ ; Test 64-bit floating-point stores. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test the low end of the STD range. define void @f1(double *%src, double %val) { diff --git a/test/CodeGen/SystemZ/fp-move-11.ll b/test/CodeGen/SystemZ/fp-move-11.ll new file mode 100644 index 00000000000..ce45019425c --- /dev/null +++ b/test/CodeGen/SystemZ/fp-move-11.ll @@ -0,0 +1,110 @@ +; Test 32-bit floating-point loads for z13. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test that we use LDE instead of LE - low end of the LE range. +define float @f1(float *%src) { +; CHECK-LABEL: f1: +; CHECK: lde %f0, 0(%r2) +; CHECK: br %r14 + %val = load float, float *%src + ret float %val +} + +; Test that we use LDE instead of LE - high end of the LE range. +define float @f2(float *%src) { +; CHECK-LABEL: f2: +; CHECK: lde %f0, 4092(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 1023 + %val = load float, float *%ptr + ret float %val +} + +; Check the next word up, which should use LEY instead of LDE. +define float @f3(float *%src) { +; CHECK-LABEL: f3: +; CHECK: ley %f0, 4096(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 1024 + %val = load float, float *%ptr + ret float %val +} + +; Check the high end of the aligned LEY range. 
+define float @f4(float *%src) { +; CHECK-LABEL: f4: +; CHECK: ley %f0, 524284(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 131071 + %val = load float, float *%ptr + ret float %val +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define float @f5(float *%src) { +; CHECK-LABEL: f5: +; CHECK: agfi %r2, 524288 +; CHECK: lde %f0, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 131072 + %val = load float, float *%ptr + ret float %val +} + +; Check the high end of the negative aligned LEY range. +define float @f6(float *%src) { +; CHECK-LABEL: f6: +; CHECK: ley %f0, -4(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 -1 + %val = load float, float *%ptr + ret float %val +} + +; Check the low end of the LEY range. +define float @f7(float *%src) { +; CHECK-LABEL: f7: +; CHECK: ley %f0, -524288(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 -131072 + %val = load float, float *%ptr + ret float %val +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define float @f8(float *%src) { +; CHECK-LABEL: f8: +; CHECK: agfi %r2, -524292 +; CHECK: lde %f0, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 -131073 + %val = load float, float *%ptr + ret float %val +} + +; Check that LDE allows an index. +define float @f9(i64 %src, i64 %index) { +; CHECK-LABEL: f9: +; CHECK: lde %f0, 4092({{%r3,%r2|%r2,%r3}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 4092 + %ptr = inttoptr i64 %add2 to float * + %val = load float, float *%ptr + ret float %val +} + +; Check that LEY allows an index. 
+define float @f10(i64 %src, i64 %index) { +; CHECK-LABEL: f10: +; CHECK: ley %f0, 4096({{%r3,%r2|%r2,%r3}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 4096 + %ptr = inttoptr i64 %add2 to float * + %val = load float, float *%ptr + ret float %val +} diff --git a/test/CodeGen/SystemZ/fp-mul-03.ll b/test/CodeGen/SystemZ/fp-mul-03.ll index 701304ef3ee..0d52121f41c 100644 --- a/test/CodeGen/SystemZ/fp-mul-03.ll +++ b/test/CodeGen/SystemZ/fp-mul-03.ll @@ -1,6 +1,8 @@ ; Test multiplication of two f64s, producing an f64 result. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @foo() @@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: mdb %f0, 160(%r15) +; CHECK-SCALAR: mdb %f0, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-mul-07.ll b/test/CodeGen/SystemZ/fp-mul-07.ll index b1d0ae3c520..e0b4a5c5d78 100644 --- a/test/CodeGen/SystemZ/fp-mul-07.ll +++ b/test/CodeGen/SystemZ/fp-mul-07.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare double @llvm.fma.f64(double %f1, double %f2, double %f3) define double @f1(double %f1, double %f2, double %acc) { ; CHECK-LABEL: f1: -; CHECK: madbr %f4, %f0, %f2 -; CHECK: ldr %f0, %f4 +; CHECK-SCALAR: madbr %f4, %f0, %f2 +; CHECK-SCALAR: ldr %f0, %f4 +; CHECK-VECTOR: wfmadb %f0, %f0, %f2, 
%f4 ; CHECK: br %r14 %res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc) ret double %res diff --git a/test/CodeGen/SystemZ/fp-mul-09.ll b/test/CodeGen/SystemZ/fp-mul-09.ll index f2eadf55ff3..927a8064823 100644 --- a/test/CodeGen/SystemZ/fp-mul-09.ll +++ b/test/CodeGen/SystemZ/fp-mul-09.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare double @llvm.fma.f64(double %f1, double %f2, double %f3) define double @f1(double %f1, double %f2, double %acc) { ; CHECK-LABEL: f1: -; CHECK: msdbr %f4, %f0, %f2 -; CHECK: ldr %f0, %f4 +; CHECK-SCALAR: msdbr %f4, %f0, %f2 +; CHECK-SCALAR: ldr %f0, %f4 +; CHECK-VECTOR: wfmsdb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %negacc = fsub double -0.0, %acc %res = call double @llvm.fma.f64 (double %f1, double %f2, double %negacc) diff --git a/test/CodeGen/SystemZ/fp-neg-01.ll b/test/CodeGen/SystemZ/fp-neg-01.ll index 927bcd44d02..fe2e5f67cf5 100644 --- a/test/CodeGen/SystemZ/fp-neg-01.ll +++ b/test/CodeGen/SystemZ/fp-neg-01.ll @@ -1,6 +1,7 @@ ; Test floating-point negation. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test f32. define float @f1(float %f) { diff --git a/test/CodeGen/SystemZ/fp-round-02.ll b/test/CodeGen/SystemZ/fp-round-02.ll index bd5419dad1d..428261478dc 100644 --- a/test/CodeGen/SystemZ/fp-round-02.ll +++ b/test/CodeGen/SystemZ/fp-round-02.ll @@ -1,6 +1,9 @@ ; Test rounding functions for z196 and above. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s ; Test rint for f32. declare float @llvm.rint.f32(float %f) @@ -16,7 +19,8 @@ define float @f1(float %f) { declare double @llvm.rint.f64(double %f) define double @f2(double %f) { ; CHECK-LABEL: f2: -; CHECK: fidbr %f0, 0, %f0 +; CHECK-SCALAR: fidbr %f0, 0, %f0 +; CHECK-VECTOR: fidbra %f0, 0, %f0, 0 ; CHECK: br %r14 %res = call double @llvm.rint.f64(double %f) ret double %res diff --git a/test/CodeGen/SystemZ/fp-sqrt-02.ll b/test/CodeGen/SystemZ/fp-sqrt-02.ll index a6d987b0d76..a162466064e 100644 --- a/test/CodeGen/SystemZ/fp-sqrt-02.ll +++ b/test/CodeGen/SystemZ/fp-sqrt-02.ll @@ -1,6 +1,8 @@ ; Test 64-bit square root. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @llvm.sqrt.f64(double %f) declare double @sqrt(double) @@ -77,7 +79,7 @@ define double @f6(double *%base, i64 %index) { ; to use SQDB if possible. define void @f7(double *%ptr) { ; CHECK-LABEL: f7: -; CHECK: sqdb {{%f[0-9]+}}, 160(%r15) +; CHECK-SCALAR: sqdb {{%f[0-9]+}}, 160(%r15) ; CHECK: br %r14 %val0 = load volatile double , double *%ptr %val1 = load volatile double , double *%ptr diff --git a/test/CodeGen/SystemZ/fp-sub-02.ll b/test/CodeGen/SystemZ/fp-sub-02.ll index f59ec0a31d7..143baac23e1 100644 --- a/test/CodeGen/SystemZ/fp-sub-02.ll +++ b/test/CodeGen/SystemZ/fp-sub-02.ll @@ -1,6 +1,8 @@ ; Test 64-bit floating-point subtraction. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @foo() @@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: sdb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: sdb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/frame-03.ll b/test/CodeGen/SystemZ/frame-03.ll index 029c6d6d37d..21b8fdb0d67 100644 --- a/test/CodeGen/SystemZ/frame-03.ll +++ b/test/CodeGen/SystemZ/frame-03.ll @@ -2,7 +2,7 @@ ; uses a different register class, but the set of saved and restored ; registers should be the same. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ; This function should require all FPRs, but no other spill slots. ; We need to save and restore 8 of the 16 FPRs, so the frame size diff --git a/test/CodeGen/SystemZ/frame-07.ll b/test/CodeGen/SystemZ/frame-07.ll index 253bbc26c1f..dd810142962 100644 --- a/test/CodeGen/SystemZ/frame-07.ll +++ b/test/CodeGen/SystemZ/frame-07.ll @@ -1,7 +1,7 @@ ; Test the saving and restoring of FPRs in large frames. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck -check-prefix=CHECK-NOFP %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s ; Test a frame size that requires some FPRs to be saved and loaded using ; the 20-bit STDY and LDY while others can use the 12-bit STD and LD. 
diff --git a/test/CodeGen/SystemZ/frame-17.ll b/test/CodeGen/SystemZ/frame-17.ll index 485297a2b21..502e541bafc 100644 --- a/test/CodeGen/SystemZ/frame-17.ll +++ b/test/CodeGen/SystemZ/frame-17.ll @@ -1,6 +1,6 @@ ; Test spilling of FPRs. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ; We need to save and restore 8 of the 16 FPRs and allocate an additional ; 4-byte spill slot, rounded to 8 bytes. The frame size should be exactly diff --git a/test/CodeGen/SystemZ/frame-20.ll b/test/CodeGen/SystemZ/frame-20.ll new file mode 100644 index 00000000000..8d601c6f6d5 --- /dev/null +++ b/test/CodeGen/SystemZ/frame-20.ll @@ -0,0 +1,445 @@ +; Like frame-03.ll, but for z13. In this case we have 16 more registers +; available. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; This function should require all FPRs, but no other spill slots. +; We need to save and restore 8 of the 16 FPRs, so the frame size +; should be exactly 160 + 8 * 8 = 224. The CFA offset is 160 +; (the caller-allocated part of the frame) + 224. 
+define void @f1(double *%ptr) { +; CHECK-LABEL: f1: +; CHECK: aghi %r15, -224 +; CHECK: .cfi_def_cfa_offset 384 +; CHECK: std %f8, 216(%r15) +; CHECK: std %f9, 208(%r15) +; CHECK: std %f10, 200(%r15) +; CHECK: std %f11, 192(%r15) +; CHECK: std %f12, 184(%r15) +; CHECK: std %f13, 176(%r15) +; CHECK: std %f14, 168(%r15) +; CHECK: std %f15, 160(%r15) +; CHECK: .cfi_offset %f8, -168 +; CHECK: .cfi_offset %f9, -176 +; CHECK: .cfi_offset %f10, -184 +; CHECK: .cfi_offset %f11, -192 +; CHECK: .cfi_offset %f12, -200 +; CHECK: .cfi_offset %f13, -208 +; CHECK: .cfi_offset %f14, -216 +; CHECK: .cfi_offset %f15, -224 +; CHECK-DAG: ld %f0, 0(%r2) +; CHECK-DAG: ld %f7, 0(%r2) +; CHECK-DAG: ld %f8, 0(%r2) +; CHECK-DAG: ld %f15, 0(%r2) +; CHECK-DAG: vlrepg %v16, 0(%r2) +; CHECK-DAG: vlrepg %v23, 0(%r2) +; CHECK-DAG: vlrepg %v24, 0(%r2) +; CHECK-DAG: vlrepg %v31, 0(%r2) +; CHECK: ld %f8, 216(%r15) +; CHECK: ld %f9, 208(%r15) +; CHECK: ld %f10, 200(%r15) +; CHECK: ld %f11, 192(%r15) +; CHECK: ld %f12, 184(%r15) +; CHECK: ld %f13, 176(%r15) +; CHECK: ld %f14, 168(%r15) +; CHECK: ld %f15, 160(%r15) +; CHECK: aghi %r15, 224 +; CHECK: br %r14 + %l0 = load volatile double, double *%ptr + %l1 = load volatile double, double *%ptr + %l2 = load volatile double, double *%ptr + %l3 = load volatile double, double *%ptr + %l4 = load volatile double, double *%ptr + %l5 = load volatile double, double *%ptr + %l6 = load volatile double, double *%ptr + %l7 = load volatile double, double *%ptr + %l8 = load volatile double, double *%ptr + %l9 = load volatile double, double *%ptr + %l10 = load volatile double, double *%ptr + %l11 = load volatile double, double *%ptr + %l12 = load volatile double, double *%ptr + %l13 = load volatile double, double *%ptr + %l14 = load volatile double, double *%ptr + %l15 = load volatile double, double *%ptr + %l16 = load volatile double, double *%ptr + %l17 = load volatile double, double *%ptr + %l18 = load volatile double, double *%ptr + %l19 = load volatile double, 
double *%ptr + %l20 = load volatile double, double *%ptr + %l21 = load volatile double, double *%ptr + %l22 = load volatile double, double *%ptr + %l23 = load volatile double, double *%ptr + %l24 = load volatile double, double *%ptr + %l25 = load volatile double, double *%ptr + %l26 = load volatile double, double *%ptr + %l27 = load volatile double, double *%ptr + %l28 = load volatile double, double *%ptr + %l29 = load volatile double, double *%ptr + %l30 = load volatile double, double *%ptr + %l31 = load volatile double, double *%ptr + %acc0 = fsub double %l0, %l0 + %acc1 = fsub double %l1, %acc0 + %acc2 = fsub double %l2, %acc1 + %acc3 = fsub double %l3, %acc2 + %acc4 = fsub double %l4, %acc3 + %acc5 = fsub double %l5, %acc4 + %acc6 = fsub double %l6, %acc5 + %acc7 = fsub double %l7, %acc6 + %acc8 = fsub double %l8, %acc7 + %acc9 = fsub double %l9, %acc8 + %acc10 = fsub double %l10, %acc9 + %acc11 = fsub double %l11, %acc10 + %acc12 = fsub double %l12, %acc11 + %acc13 = fsub double %l13, %acc12 + %acc14 = fsub double %l14, %acc13 + %acc15 = fsub double %l15, %acc14 + %acc16 = fsub double %l16, %acc15 + %acc17 = fsub double %l17, %acc16 + %acc18 = fsub double %l18, %acc17 + %acc19 = fsub double %l19, %acc18 + %acc20 = fsub double %l20, %acc19 + %acc21 = fsub double %l21, %acc20 + %acc22 = fsub double %l22, %acc21 + %acc23 = fsub double %l23, %acc22 + %acc24 = fsub double %l24, %acc23 + %acc25 = fsub double %l25, %acc24 + %acc26 = fsub double %l26, %acc25 + %acc27 = fsub double %l27, %acc26 + %acc28 = fsub double %l28, %acc27 + %acc29 = fsub double %l29, %acc28 + %acc30 = fsub double %l30, %acc29 + %acc31 = fsub double %l31, %acc30 + store volatile double %acc0, double *%ptr + store volatile double %acc1, double *%ptr + store volatile double %acc2, double *%ptr + store volatile double %acc3, double *%ptr + store volatile double %acc4, double *%ptr + store volatile double %acc5, double *%ptr + store volatile double %acc6, double *%ptr + store volatile double %acc7, 
double *%ptr + store volatile double %acc8, double *%ptr + store volatile double %acc9, double *%ptr + store volatile double %acc10, double *%ptr + store volatile double %acc11, double *%ptr + store volatile double %acc12, double *%ptr + store volatile double %acc13, double *%ptr + store volatile double %acc14, double *%ptr + store volatile double %acc15, double *%ptr + store volatile double %acc16, double *%ptr + store volatile double %acc17, double *%ptr + store volatile double %acc18, double *%ptr + store volatile double %acc19, double *%ptr + store volatile double %acc20, double *%ptr + store volatile double %acc21, double *%ptr + store volatile double %acc22, double *%ptr + store volatile double %acc23, double *%ptr + store volatile double %acc24, double *%ptr + store volatile double %acc25, double *%ptr + store volatile double %acc26, double *%ptr + store volatile double %acc27, double *%ptr + store volatile double %acc28, double *%ptr + store volatile double %acc29, double *%ptr + store volatile double %acc30, double *%ptr + store volatile double %acc31, double *%ptr + ret void +} + +; Like f1, but requires one fewer FPR. We allocate in numerical order, +; so %f15 is the one that gets dropped. 
+define void @f2(double *%ptr) { +; CHECK-LABEL: f2: +; CHECK: aghi %r15, -216 +; CHECK: .cfi_def_cfa_offset 376 +; CHECK: std %f8, 208(%r15) +; CHECK: std %f9, 200(%r15) +; CHECK: std %f10, 192(%r15) +; CHECK: std %f11, 184(%r15) +; CHECK: std %f12, 176(%r15) +; CHECK: std %f13, 168(%r15) +; CHECK: std %f14, 160(%r15) +; CHECK: .cfi_offset %f8, -168 +; CHECK: .cfi_offset %f9, -176 +; CHECK: .cfi_offset %f10, -184 +; CHECK: .cfi_offset %f11, -192 +; CHECK: .cfi_offset %f12, -200 +; CHECK: .cfi_offset %f13, -208 +; CHECK: .cfi_offset %f14, -216 +; CHECK-NOT: %v15 +; CHECK-NOT: %f15 +; CHECK: ld %f8, 208(%r15) +; CHECK: ld %f9, 200(%r15) +; CHECK: ld %f10, 192(%r15) +; CHECK: ld %f11, 184(%r15) +; CHECK: ld %f12, 176(%r15) +; CHECK: ld %f13, 168(%r15) +; CHECK: ld %f14, 160(%r15) +; CHECK: aghi %r15, 216 +; CHECK: br %r14 + %l0 = load volatile double, double *%ptr + %l1 = load volatile double, double *%ptr + %l2 = load volatile double, double *%ptr + %l3 = load volatile double, double *%ptr + %l4 = load volatile double, double *%ptr + %l5 = load volatile double, double *%ptr + %l6 = load volatile double, double *%ptr + %l7 = load volatile double, double *%ptr + %l8 = load volatile double, double *%ptr + %l9 = load volatile double, double *%ptr + %l10 = load volatile double, double *%ptr + %l11 = load volatile double, double *%ptr + %l12 = load volatile double, double *%ptr + %l13 = load volatile double, double *%ptr + %l14 = load volatile double, double *%ptr + %l16 = load volatile double, double *%ptr + %l17 = load volatile double, double *%ptr + %l18 = load volatile double, double *%ptr + %l19 = load volatile double, double *%ptr + %l20 = load volatile double, double *%ptr + %l21 = load volatile double, double *%ptr + %l22 = load volatile double, double *%ptr + %l23 = load volatile double, double *%ptr + %l24 = load volatile double, double *%ptr + %l25 = load volatile double, double *%ptr + %l26 = load volatile double, double *%ptr + %l27 = load volatile double, 
double *%ptr + %l28 = load volatile double, double *%ptr + %l29 = load volatile double, double *%ptr + %l30 = load volatile double, double *%ptr + %l31 = load volatile double, double *%ptr + %acc0 = fsub double %l0, %l0 + %acc1 = fsub double %l1, %acc0 + %acc2 = fsub double %l2, %acc1 + %acc3 = fsub double %l3, %acc2 + %acc4 = fsub double %l4, %acc3 + %acc5 = fsub double %l5, %acc4 + %acc6 = fsub double %l6, %acc5 + %acc7 = fsub double %l7, %acc6 + %acc8 = fsub double %l8, %acc7 + %acc9 = fsub double %l9, %acc8 + %acc10 = fsub double %l10, %acc9 + %acc11 = fsub double %l11, %acc10 + %acc12 = fsub double %l12, %acc11 + %acc13 = fsub double %l13, %acc12 + %acc14 = fsub double %l14, %acc13 + %acc16 = fsub double %l16, %acc14 + %acc17 = fsub double %l17, %acc16 + %acc18 = fsub double %l18, %acc17 + %acc19 = fsub double %l19, %acc18 + %acc20 = fsub double %l20, %acc19 + %acc21 = fsub double %l21, %acc20 + %acc22 = fsub double %l22, %acc21 + %acc23 = fsub double %l23, %acc22 + %acc24 = fsub double %l24, %acc23 + %acc25 = fsub double %l25, %acc24 + %acc26 = fsub double %l26, %acc25 + %acc27 = fsub double %l27, %acc26 + %acc28 = fsub double %l28, %acc27 + %acc29 = fsub double %l29, %acc28 + %acc30 = fsub double %l30, %acc29 + %acc31 = fsub double %l31, %acc30 + store volatile double %acc0, double *%ptr + store volatile double %acc1, double *%ptr + store volatile double %acc2, double *%ptr + store volatile double %acc3, double *%ptr + store volatile double %acc4, double *%ptr + store volatile double %acc5, double *%ptr + store volatile double %acc6, double *%ptr + store volatile double %acc7, double *%ptr + store volatile double %acc8, double *%ptr + store volatile double %acc9, double *%ptr + store volatile double %acc10, double *%ptr + store volatile double %acc11, double *%ptr + store volatile double %acc12, double *%ptr + store volatile double %acc13, double *%ptr + store volatile double %acc14, double *%ptr + store volatile double %acc16, double *%ptr + store volatile 
double %acc17, double *%ptr + store volatile double %acc18, double *%ptr + store volatile double %acc19, double *%ptr + store volatile double %acc20, double *%ptr + store volatile double %acc21, double *%ptr + store volatile double %acc22, double *%ptr + store volatile double %acc23, double *%ptr + store volatile double %acc24, double *%ptr + store volatile double %acc25, double *%ptr + store volatile double %acc26, double *%ptr + store volatile double %acc27, double *%ptr + store volatile double %acc28, double *%ptr + store volatile double %acc29, double *%ptr + store volatile double %acc30, double *%ptr + store volatile double %acc31, double *%ptr + ret void +} + +; Like f1, but should require only one call-saved FPR. +define void @f3(double *%ptr) { +; CHECK-LABEL: f3: +; CHECK: aghi %r15, -168 +; CHECK: .cfi_def_cfa_offset 328 +; CHECK: std %f8, 160(%r15) +; CHECK: .cfi_offset %f8, -168 +; CHECK-NOT: {{%[fv]9}} +; CHECK-NOT: {{%[fv]1[0-5]}} +; CHECK: ld %f8, 160(%r15) +; CHECK: aghi %r15, 168 +; CHECK: br %r14 + %l0 = load volatile double, double *%ptr + %l1 = load volatile double, double *%ptr + %l2 = load volatile double, double *%ptr + %l3 = load volatile double, double *%ptr + %l4 = load volatile double, double *%ptr + %l5 = load volatile double, double *%ptr + %l6 = load volatile double, double *%ptr + %l7 = load volatile double, double *%ptr + %l8 = load volatile double, double *%ptr + %l16 = load volatile double, double *%ptr + %l17 = load volatile double, double *%ptr + %l18 = load volatile double, double *%ptr + %l19 = load volatile double, double *%ptr + %l20 = load volatile double, double *%ptr + %l21 = load volatile double, double *%ptr + %l22 = load volatile double, double *%ptr + %l23 = load volatile double, double *%ptr + %l24 = load volatile double, double *%ptr + %l25 = load volatile double, double *%ptr + %l26 = load volatile double, double *%ptr + %l27 = load volatile double, double *%ptr + %l28 = load volatile double, double *%ptr + %l29 = 
load volatile double, double *%ptr + %l30 = load volatile double, double *%ptr + %l31 = load volatile double, double *%ptr + %acc0 = fsub double %l0, %l0 + %acc1 = fsub double %l1, %acc0 + %acc2 = fsub double %l2, %acc1 + %acc3 = fsub double %l3, %acc2 + %acc4 = fsub double %l4, %acc3 + %acc5 = fsub double %l5, %acc4 + %acc6 = fsub double %l6, %acc5 + %acc7 = fsub double %l7, %acc6 + %acc8 = fsub double %l8, %acc7 + %acc16 = fsub double %l16, %acc8 + %acc17 = fsub double %l17, %acc16 + %acc18 = fsub double %l18, %acc17 + %acc19 = fsub double %l19, %acc18 + %acc20 = fsub double %l20, %acc19 + %acc21 = fsub double %l21, %acc20 + %acc22 = fsub double %l22, %acc21 + %acc23 = fsub double %l23, %acc22 + %acc24 = fsub double %l24, %acc23 + %acc25 = fsub double %l25, %acc24 + %acc26 = fsub double %l26, %acc25 + %acc27 = fsub double %l27, %acc26 + %acc28 = fsub double %l28, %acc27 + %acc29 = fsub double %l29, %acc28 + %acc30 = fsub double %l30, %acc29 + %acc31 = fsub double %l31, %acc30 + store volatile double %acc0, double *%ptr + store volatile double %acc1, double *%ptr + store volatile double %acc2, double *%ptr + store volatile double %acc3, double *%ptr + store volatile double %acc4, double *%ptr + store volatile double %acc5, double *%ptr + store volatile double %acc6, double *%ptr + store volatile double %acc7, double *%ptr + store volatile double %acc8, double *%ptr + store volatile double %acc16, double *%ptr + store volatile double %acc17, double *%ptr + store volatile double %acc18, double *%ptr + store volatile double %acc19, double *%ptr + store volatile double %acc20, double *%ptr + store volatile double %acc21, double *%ptr + store volatile double %acc22, double *%ptr + store volatile double %acc23, double *%ptr + store volatile double %acc24, double *%ptr + store volatile double %acc25, double *%ptr + store volatile double %acc26, double *%ptr + store volatile double %acc27, double *%ptr + store volatile double %acc28, double *%ptr + store volatile double 
%acc29, double *%ptr + store volatile double %acc30, double *%ptr + store volatile double %acc31, double *%ptr + ret void +} + +; This function should use all call-clobbered FPRs and vector registers +; but no call-saved ones. It shouldn't need to create a frame. +define void @f4(double *%ptr) { +; CHECK-LABEL: f4: +; CHECK-NOT: %r15 +; CHECK-NOT: {{%[fv][89]}} +; CHECK-NOT: {{%[fv]1[0-5]}} +; CHECK: br %r14 + %l0 = load volatile double, double *%ptr + %l1 = load volatile double, double *%ptr + %l2 = load volatile double, double *%ptr + %l3 = load volatile double, double *%ptr + %l4 = load volatile double, double *%ptr + %l5 = load volatile double, double *%ptr + %l6 = load volatile double, double *%ptr + %l7 = load volatile double, double *%ptr + %l16 = load volatile double, double *%ptr + %l17 = load volatile double, double *%ptr + %l18 = load volatile double, double *%ptr + %l19 = load volatile double, double *%ptr + %l20 = load volatile double, double *%ptr + %l21 = load volatile double, double *%ptr + %l22 = load volatile double, double *%ptr + %l23 = load volatile double, double *%ptr + %l24 = load volatile double, double *%ptr + %l25 = load volatile double, double *%ptr + %l26 = load volatile double, double *%ptr + %l27 = load volatile double, double *%ptr + %l28 = load volatile double, double *%ptr + %l29 = load volatile double, double *%ptr + %l30 = load volatile double, double *%ptr + %l31 = load volatile double, double *%ptr + %acc0 = fsub double %l0, %l0 + %acc1 = fsub double %l1, %acc0 + %acc2 = fsub double %l2, %acc1 + %acc3 = fsub double %l3, %acc2 + %acc4 = fsub double %l4, %acc3 + %acc5 = fsub double %l5, %acc4 + %acc6 = fsub double %l6, %acc5 + %acc7 = fsub double %l7, %acc6 + %acc16 = fsub double %l16, %acc7 + %acc17 = fsub double %l17, %acc16 + %acc18 = fsub double %l18, %acc17 + %acc19 = fsub double %l19, %acc18 + %acc20 = fsub double %l20, %acc19 + %acc21 = fsub double %l21, %acc20 + %acc22 = fsub double %l22, %acc21 + %acc23 = fsub double 
%l23, %acc22 + %acc24 = fsub double %l24, %acc23 + %acc25 = fsub double %l25, %acc24 + %acc26 = fsub double %l26, %acc25 + %acc27 = fsub double %l27, %acc26 + %acc28 = fsub double %l28, %acc27 + %acc29 = fsub double %l29, %acc28 + %acc30 = fsub double %l30, %acc29 + %acc31 = fsub double %l31, %acc30 + store volatile double %acc0, double *%ptr + store volatile double %acc1, double *%ptr + store volatile double %acc2, double *%ptr + store volatile double %acc3, double *%ptr + store volatile double %acc4, double *%ptr + store volatile double %acc5, double *%ptr + store volatile double %acc6, double *%ptr + store volatile double %acc7, double *%ptr + store volatile double %acc16, double *%ptr + store volatile double %acc17, double *%ptr + store volatile double %acc18, double *%ptr + store volatile double %acc19, double *%ptr + store volatile double %acc20, double *%ptr + store volatile double %acc21, double *%ptr + store volatile double %acc22, double *%ptr + store volatile double %acc23, double *%ptr + store volatile double %acc24, double *%ptr + store volatile double %acc25, double *%ptr + store volatile double %acc26, double *%ptr + store volatile double %acc27, double *%ptr + store volatile double %acc28, double *%ptr + store volatile double %acc29, double *%ptr + store volatile double %acc30, double *%ptr + store volatile double %acc31, double *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/vec-abs-05.ll b/test/CodeGen/SystemZ/vec-abs-05.ll index 89142b21854..63210f87b94 100644 --- a/test/CodeGen/SystemZ/vec-abs-05.ll +++ b/test/CodeGen/SystemZ/vec-abs-05.ll @@ -1,7 +1,8 @@ -; Test v2f64 absolute. +; Test f64 and v2f64 absolute. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s +declare double @llvm.fabs.f64(double) declare <2 x double> @llvm.fabs.v2f64(<2 x double>) ; Test a plain absolute. 
@@ -22,3 +23,24 @@ define <2 x double> @f2(<2 x double> %val) { %ret = fsub <2 x double> <double -0.0, double -0.0>, %abs ret <2 x double> %ret } + +; Test an f64 absolute that uses vector registers. +define double @f3(<2 x double> %val) { +; CHECK-LABEL: f3: +; CHECK: wflpdb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %ret = call double @llvm.fabs.f64(double %scalar) + ret double %ret +} + +; Test an f64 negative absolute that uses vector registers. +define double @f4(<2 x double> %val) { +; CHECK-LABEL: f4: +; CHECK: wflndb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %abs = call double @llvm.fabs.f64(double %scalar) + %ret = fsub double -0.0, %abs + ret double %ret +} diff --git a/test/CodeGen/SystemZ/vec-add-01.ll b/test/CodeGen/SystemZ/vec-add-01.ll index 1de2aa2a1b9..31703437767 100644 --- a/test/CodeGen/SystemZ/vec-add-01.ll +++ b/test/CodeGen/SystemZ/vec-add-01.ll @@ -47,3 +47,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, %ret = fadd <2 x double> %val1, %val2 ret <2 x double> %ret } + +; Test an f64 addition that uses vector registers. +define double @f6(<2 x double> %val1, <2 x double> %val2) { +; CHECK-LABEL: f6: +; CHECK: wfadb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <2 x double> %val1, i32 0 + %scalar2 = extractelement <2 x double> %val2, i32 0 + %ret = fadd double %scalar1, %scalar2 + ret double %ret +} diff --git a/test/CodeGen/SystemZ/vec-cmp-06.ll b/test/CodeGen/SystemZ/vec-cmp-06.ll index bdb8744631a..eef57555b48 100644 --- a/test/CodeGen/SystemZ/vec-cmp-06.ll +++ b/test/CodeGen/SystemZ/vec-cmp-06.ll @@ -1,4 +1,4 @@ -; Test v2f64 comparisons. +; Test f64 and v2f64 comparisons.
; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s @@ -335,3 +335,15 @@ define <2 x double> @f28(<2 x double> %val1, <2 x double> %val2, %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ret <2 x double> %ret } + +; Test an f64 comparison that uses vector registers. +define i64 @f29(i64 %a, i64 %b, double %f1, <2 x double> %vec) { +; CHECK-LABEL: f29: +; CHECK: wfcdb %f0, %v24 +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f2 = extractelement <2 x double> %vec, i32 0 + %cond = fcmp oeq double %f1, %f2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} diff --git a/test/CodeGen/SystemZ/vec-conv-02.ll b/test/CodeGen/SystemZ/vec-conv-02.ll index ceccfc60b37..ab84389f3c8 100644 --- a/test/CodeGen/SystemZ/vec-conv-02.ll +++ b/test/CodeGen/SystemZ/vec-conv-02.ll @@ -11,3 +11,23 @@ define void @f1(<2 x double> %val, <2 x float> *%ptr) { store <2 x float> %res, <2 x float> *%ptr ret void } + +; Test conversion of an f64 in a vector register to an f32. +define float @f2(<2 x double> %vec) { +; CHECK-LABEL: f2: +; CHECK: wledb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %vec, i32 0 + %ret = fptrunc double %scalar to float + ret float %ret +} + +; Test conversion of an f32 in a vector register to an f64. +define double @f3(<4 x float> %vec) { +; CHECK-LABEL: f3: +; CHECK: wldeb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %vec, i32 0 + %ret = fpext float %scalar to double + ret double %ret +} diff --git a/test/CodeGen/SystemZ/vec-div-01.ll b/test/CodeGen/SystemZ/vec-div-01.ll index 5666444e9da..506d40861d3 100644 --- a/test/CodeGen/SystemZ/vec-div-01.ll +++ b/test/CodeGen/SystemZ/vec-div-01.ll @@ -70,3 +70,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, %ret = fdiv <2 x double> %val1, %val2 ret <2 x double> %ret } + +; Test an f64 division that uses vector registers. 
+define double @f6(<2 x double> %val1, <2 x double> %val2) { +; CHECK-LABEL: f6: +; CHECK: wfddb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <2 x double> %val1, i32 0 + %scalar2 = extractelement <2 x double> %val2, i32 0 + %ret = fdiv double %scalar1, %scalar2 + ret double %ret +} diff --git a/test/CodeGen/SystemZ/vec-mul-01.ll b/test/CodeGen/SystemZ/vec-mul-01.ll index d0018fa1f8c..5ecc30d4427 100644 --- a/test/CodeGen/SystemZ/vec-mul-01.ll +++ b/test/CodeGen/SystemZ/vec-mul-01.ll @@ -47,3 +47,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, %ret = fmul <2 x double> %val1, %val2 ret <2 x double> %ret } + +; Test an f64 multiplication that uses vector registers. +define double @f6(<2 x double> %val1, <2 x double> %val2) { +; CHECK-LABEL: f6: +; CHECK: wfmdb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <2 x double> %val1, i32 0 + %scalar2 = extractelement <2 x double> %val2, i32 0 + %ret = fmul double %scalar1, %scalar2 + ret double %ret +} diff --git a/test/CodeGen/SystemZ/vec-neg-01.ll b/test/CodeGen/SystemZ/vec-neg-01.ll index 491e24bb34f..b1389ce4d6d 100644 --- a/test/CodeGen/SystemZ/vec-neg-01.ll +++ b/test/CodeGen/SystemZ/vec-neg-01.ll @@ -46,3 +46,13 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val) { %ret = fsub <2 x double> <double -0.0, double -0.0>, %val ret <2 x double> %ret } + +; Test an f64 negation that uses vector registers.
+define double @f6(<2 x double> %val) { +; CHECK-LABEL: f6: +; CHECK: wflcdb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %ret = fsub double -0.0, %scalar + ret double %ret +} diff --git a/test/CodeGen/SystemZ/vec-round-01.ll b/test/CodeGen/SystemZ/vec-round-01.ll index 284b83e96f7..82718276bb0 100644 --- a/test/CodeGen/SystemZ/vec-round-01.ll +++ b/test/CodeGen/SystemZ/vec-round-01.ll @@ -2,6 +2,12 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s +declare double @llvm.rint.f64(double) +declare double @llvm.nearbyint.f64(double) +declare double @llvm.floor.f64(double) +declare double @llvm.ceil.f64(double) +declare double @llvm.trunc.f64(double) +declare double @llvm.round.f64(double) declare <2 x double> @llvm.rint.v2f64(<2 x double>) declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) declare <2 x double> @llvm.floor.v2f64(<2 x double>) @@ -56,3 +62,57 @@ define <2 x double> @f6(<2 x double> %val) { %res = call <2 x double> @llvm.round.v2f64(<2 x double> %val) ret <2 x double> %res } + +define double @f7(<2 x double> %val) { +; CHECK-LABEL: f7: +; CHECK: wfidb %f0, %v24, 0, 0 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %res = call double @llvm.rint.f64(double %scalar) + ret double %res +} + +define double @f8(<2 x double> %val) { +; CHECK-LABEL: f8: +; CHECK: wfidb %f0, %v24, 4, 0 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %res = call double @llvm.nearbyint.f64(double %scalar) + ret double %res +} + +define double @f9(<2 x double> %val) { +; CHECK-LABEL: f9: +; CHECK: wfidb %f0, %v24, 4, 7 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %res = call double @llvm.floor.f64(double %scalar) + ret double %res +} + +define double @f10(<2 x double> %val) { +; CHECK-LABEL: f10: +; CHECK: wfidb %f0, %v24, 4, 6 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %res = call double @llvm.ceil.f64(double %scalar) 
+ ret double %res +} + +define double @f11(<2 x double> %val) { +; CHECK-LABEL: f11: +; CHECK: wfidb %f0, %v24, 4, 5 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %res = call double @llvm.trunc.f64(double %scalar) + ret double %res +} + +define double @f12(<2 x double> %val) { +; CHECK-LABEL: f12: +; CHECK: wfidb %f0, %v24, 4, 1 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %res = call double @llvm.round.f64(double %scalar) + ret double %res +} diff --git a/test/CodeGen/SystemZ/vec-sqrt-01.ll b/test/CodeGen/SystemZ/vec-sqrt-01.ll index 0160c24a749..5c3ffb3b064 100644 --- a/test/CodeGen/SystemZ/vec-sqrt-01.ll +++ b/test/CodeGen/SystemZ/vec-sqrt-01.ll @@ -1,7 +1,8 @@ -; Test v2f64 square root. +; Test f64 and v2f64 square root. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s +declare double @llvm.sqrt.f64(double) declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) define <2 x double> @f1(<2 x double> %val) { @@ -11,3 +12,12 @@ define <2 x double> @f1(<2 x double> %val) { %ret = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %val) ret <2 x double> %ret } + +define double @f2(<2 x double> %val) { +; CHECK-LABEL: f2: +; CHECK: wfsqdb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <2 x double> %val, i32 0 + %ret = call double @llvm.sqrt.f64(double %scalar) + ret double %ret +} diff --git a/test/CodeGen/SystemZ/vec-sub-01.ll b/test/CodeGen/SystemZ/vec-sub-01.ll index aabf1c9be4a..5620ebcb8c4 100644 --- a/test/CodeGen/SystemZ/vec-sub-01.ll +++ b/test/CodeGen/SystemZ/vec-sub-01.ll @@ -74,3 +74,14 @@ define <2 x double> @f6(<2 x double> %dummy, <2 x double> %val1, %ret = fsub <2 x double> %val1, %val2 ret <2 x double> %ret } + +; Test an f64 subtraction that uses vector registers. 
+define double @f7(<2 x double> %val1, <2 x double> %val2) { +; CHECK-LABEL: f7: +; CHECK: wfsdb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <2 x double> %val1, i32 0 + %scalar2 = extractelement <2 x double> %val2, i32 0 + %ret = fsub double %scalar1, %scalar2 + ret double %ret +} -- 2.34.1