From 13c83a2a09a0842ff57ec020fe3f534de766ccd1 Mon Sep 17 00:00:00 2001
From: Chad Rosier <mcrosier@codeaurora.org>
Date: Tue, 12 Nov 2013 19:13:08 +0000
Subject: [PATCH] [AArch64] Implemented AdvSIMD scalar x indexed element format
 and AdvSIMD scalar copy in MC layer. Added the MC layer tests.  Fixed triple
 setting in test cases.

Patch by Ana Pazos <apazos@codeaurora.org>.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194501 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrFormats.td     |  28 ++
 lib/Target/AArch64/AArch64InstrInfo.td        |   2 +-
 lib/Target/AArch64/AArch64InstrNEON.td        | 318 ++++++++++++++++--
 .../Disassembler/AArch64Disassembler.cpp      |  10 +
 test/MC/AArch64/neon-3vdiff.s                 |   2 +-
 test/MC/AArch64/neon-diagnostics.s            | 247 +++++++++++++-
 test/MC/AArch64/neon-scalar-by-elem-mla.s     |  44 +++
 test/MC/AArch64/neon-scalar-by-elem-mul.s     |  37 ++
 .../neon-scalar-by-elem-saturating-mla.s      |  46 +++
 .../neon-scalar-by-elem-saturating-mul.s      |  58 ++++
 test/MC/AArch64/neon-scalar-dup.s             |  29 ++
 test/MC/AArch64/neon-simd-copy.s              |   2 +-
 test/MC/AArch64/neon-simd-shift.s             |   2 +-
 .../AArch64/neon-instructions.txt             | 213 ++++++++++++
 14 files changed, 1001 insertions(+), 37 deletions(-)
 create mode 100644 test/MC/AArch64/neon-scalar-by-elem-mla.s
 create mode 100644 test/MC/AArch64/neon-scalar-by-elem-mul.s
 create mode 100644 test/MC/AArch64/neon-scalar-by-elem-saturating-mla.s
 create mode 100644 test/MC/AArch64/neon-scalar-by-elem-saturating-mul.s
 create mode 100644 test/MC/AArch64/neon-scalar-dup.s
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index b0aadfb031a..2c8cc6bc8ec 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1359,5 +1359,33 @@ class NeonI_Crypto_3VSHA<bits<2> size, bits<3> opcode,
   // Inherit Rd in 4-0
 }
 
+// Format AdvSIMD scalar x indexed element
+class NeonI_ScalarXIndexedElem<bit u, bit szhi, bit szlo,
+                               bits<4> opcode, dag outs, dag ins,
+                               string asmstr, list<dag> patterns,
+                               InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11111;
+  let Inst{23} = szhi;
+  let Inst{22} = szlo;
+  // l in Inst{21}
+  // m in Instr{20}
+  // Inherit Rm in 19-16
+  let Inst{15-12} = opcode;
+  // h in Inst{11}
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+// Format AdvSIMD scalar copy - insert from element to scalar
+class NeonI_ScalarCopy<dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> {
+  let Inst{28} = 0b1;
+}
 }
 
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index ae217f9d4fd..23d81fc478e 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1278,7 +1278,7 @@ def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)),
 
 // UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
 // use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
-def : Pat<(zext i32:$Rn), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
+def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
                                          sub_32)>;
 
 //===-------------------------------
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index 95e54f2fcb3..83bb1fa6541 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -4671,6 +4671,294 @@ defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm,
 defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm, 
   int_aarch64_neon_vpfminnmq, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
 
+def neon_uimm0_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm == 0;}]> {
+  let ParserMatchClass = neon_uimm0_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm1_bare : Operand<i64>,
+                        ImmLeaf<i64, [{(void)Imm; return true;}]> {
+  let ParserMatchClass = neon_uimm1_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm2_bare : Operand<i64>,
+                        ImmLeaf<i64, [{(void)Imm; return true;}]> {
+  let ParserMatchClass = neon_uimm2_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm3_bare : Operand<i64>,
+                        ImmLeaf<i64, [{(void)Imm; return true;}]> {
+  let ParserMatchClass = uimm3_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm4_bare : Operand<i64>,
+                        ImmLeaf<i64, [{(void)Imm; return true;}]> {
+  let ParserMatchClass = uimm4_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+
+// Scalar by element Arithmetic
+
+class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
+                                    string rmlane, bit u, bit szhi, bit szlo,
+                                    RegisterClass ResFPR, RegisterClass OpFPR,
+                                    RegisterOperand OpVPR, Operand OpImm>
+  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
+                             (outs ResFPR:$Rd),
+                             (ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
+                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
+                             [],
+                             NoItinerary> {
+  bits<3> Imm;
+  bits<5> MRm;
+}
+
+class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop, bits<4> opcode,
+                                                    string rmlane,
+                                                    bit u, bit szhi, bit szlo,
+                                                    RegisterClass ResFPR,
+                                                    RegisterClass OpFPR,
+                                                    RegisterOperand OpVPR,
+                                                    Operand OpImm>
+  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
+                             (outs ResFPR:$Rd),
+                             (ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
+                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
+                             [],
+                             NoItinerary> {
+  let Constraints = "$src = $Rd";
+  bits<3> Imm;
+  bits<5> MRm;
+}
+
+// Scalar Floating Point  multiply (scalar, by element)
+def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul",
+  0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul",
+  0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point  multiply extended (scalar, by element)
+def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx",
+  0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx",
+  0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point fused multiply-add (scalar, by element)
+def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
+  0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
+  0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point fused multiply-subtract (scalar, by element)
+def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
+  0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
+  0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Signed saturating doubling multiply-add long (scalar, by element)
+def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Signed saturating doubling
+// multiply-subtract long (scalar, by element)
+def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Signed saturating doubling multiply long (scalar, by element)
+def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Signed saturating rounding doubling multiply
+// returning high half (scalar, by element)
+def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+
+// Scalar Copy - DUP element to scalar
+class NeonI_Scalar_DUP<string asmop, string asmlane,
+                       RegisterClass ResRC, RegisterOperand VPRC,
+                       Operand OpImm>
+  : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm),
+                     asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]",
+                     [],
+                     NoItinerary> {
+  bits<4> Imm;
+}
+
+def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
 
 
 //===----------------------------------------------------------------------===//
@@ -4792,36 +5080,6 @@ def : Pat<(v2i64  (bitconvert (f128   FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v4f32  (bitconvert (f128   FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v2f64  (bitconvert (f128   FPR128:$src))), (v2f64 FPR128:$src)>;
 
-def neon_uimm0_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm == 0;}]> {
-  let ParserMatchClass = neon_uimm0_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm1_bare : Operand<i64>,
-                        ImmLeaf<i64, [{(void)Imm; return true;}]> {
-  let ParserMatchClass = neon_uimm1_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm2_bare : Operand<i64>,
-                        ImmLeaf<i64, [{(void)Imm; return true;}]> {
-  let ParserMatchClass = neon_uimm2_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm3_bare : Operand<i64>,
-                        ImmLeaf<i64, [{(void)Imm; return true;}]> {
-  let ParserMatchClass = uimm3_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm4_bare : Operand<i64>,
-                        ImmLeaf<i64, [{(void)Imm; return true;}]> {
-  let ParserMatchClass = uimm4_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
 def neon_uimm3 : Operand<i64>,
                    ImmLeaf<i64, [{(void)Imm; return true;}]> {
   let ParserMatchClass = uimm3_asmoperand;
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 38845b6d63e..c4f30628c31 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -82,6 +82,8 @@ static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                          uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                          uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
                                               unsigned RegNo, uint64_t Address,
                                               const void *Decoder);
@@ -379,6 +381,14 @@ DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus
+DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder);
+}
 
 static DecodeStatus
 DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
diff --git a/test/MC/AArch64/neon-3vdiff.s b/test/MC/AArch64/neon-3vdiff.s
index 337b94c2bca..1de69098a75 100644
--- a/test/MC/AArch64/neon-3vdiff.s
+++ b/test/MC/AArch64/neon-3vdiff.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
 
 // Check that the assembler can handle the documented syntax for AArch64
 
diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s
index 5ada875b13a..0a2332b41e1 100644
--- a/test/MC/AArch64/neon-diagnostics.s
+++ b/test/MC/AArch64/neon-diagnostics.s
@@ -4667,7 +4667,7 @@
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:        sqdmlal s17, h27, s12
 // CHECK-ERROR:                          ^
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: too few operands for instruction
 // CHECK-ERROR:        sqdmlal d19, s24, d12
 // CHECK-ERROR:                          ^
 
@@ -4681,7 +4681,7 @@
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:        sqdmlsl s14, h12, s25
 // CHECK-ERROR:                          ^
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: too few operands for instruction
 // CHECK-ERROR:        sqdmlsl d12, s23, d13
 // CHECK-ERROR:                          ^
 
@@ -4695,7 +4695,7 @@
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:        sqdmull s12, h22, s12
 // CHECK-ERROR:                          ^
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: too few operands for instruction
 // CHECK-ERROR:        sqdmull d15, s22, d12
 // CHECK-ERROR:                          ^
 
@@ -5687,3 +5687,244 @@
 // CHECK-ERROR <stdin>:4341:17: error: invalid operand for instruction
 // CHECK-ERROR         trn2 v0.1d, v1.1d, v2.1d
 // CHECK-ERROR                 ^
+
+//----------------------------------------------------------------------
+// Floating Point  multiply (scalar, by element)
+//----------------------------------------------------------------------
+      // mismatched and invalid vector types
+      fmul    s0, s1, v1.h[0]
+      fmul    h0, h1, v1.s[0]
+      // invalid lane
+      fmul    s2, s29, v10.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmul    s0, s1, v1.h[0]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmul    h0, h1, v1.s[0]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error:  lane number incompatible with layout
+// CHECK-ERROR:          fmul    s2, s29, v10.s[4]
+// CHECK-ERROR:                                 ^
+
+//----------------------------------------------------------------------
+// Floating Point  multiply extended (scalar, by element)
+//----------------------------------------------------------------------
+      // mismatched and invalid vector types
+      fmulx    d0, d1, v1.b[0]
+      fmulx    h0, h1, v1.d[0]
+      // invalid lane
+      fmulx    d2, d29, v10.d[3]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmulx    d0, d1, v1.b[0]
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmulx    h0, h1, v1.d[0]
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error:  lane number incompatible with layout
+// CHECK-ERROR:          fmulx    d2, d29, v10.d[3]
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Floating Point fused multiply-add (scalar, by element)
+//----------------------------------------------------------------------
+      // mismatched and invalid vector types
+      fmla    b0, b1, v1.b[0]
+      fmla    d30, s11, v1.d[1]
+      // invalid lane
+      fmla    s16, s22, v16.s[5]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmla    b0, b1, v1.b[0]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmla    d30, s11, v1.d[1]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error:  lane number incompatible with layout
+// CHECK-ERROR:          fmla    s16, s22, v16.s[5]
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Floating Point fused multiply-subtract (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    fmls    s29, h10, v28.s[1]
+    fmls    h7, h17, v26.s[2]
+    // invalid lane
+    fmls    d16, d22, v16.d[-1]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmls    s29, h10, v28.s[1]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmls    h7, h17, v26.s[2]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error:  expected lane number
+// CHECK-ERROR:          fmls    d16, d22, v16.d[-1]
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply-add long
+// (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    sqdmlal s0, h0, v0.s[0]
+    sqdmlal s8, s9, v14.s[1]
+    // invalid lane
+    sqdmlal s4, s5, v1.s[5]
+    // invalid vector index
+    sqdmlal s0, h0, v17.h[0]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlal s0, h0, v0.s[0]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlal s8, s9, v14.s[1]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqdmlal s4, s5, v1.s[5]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlal s0, h0, v17.h[0]
+// CHECK-ERROR:                           ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply-subtract long
+// (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    sqdmlsl s1, h1, v1.d[0]
+    sqdmlsl d1, h1, v13.s[0]
+    // invalid lane
+    sqdmlsl d1, s1, v13.s[4]
+    // invalid vector index
+    sqdmlsl s1, h1, v20.h[7]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlsl s1, h1, v1.d[0]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlsl d1, h1, v13.s[0]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqdmlsl d1, s1, v13.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlsl s1, h1, v20.h[7]
+// CHECK-ERROR:                           ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply long (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    // invalid lane
+    // invalid vector index
+    // mismatched and invalid vector types
+    sqdmull s1, h1, v1.s[1]
+    sqdmull s1, s1, v4.s[0]
+    // invalid lane
+    sqdmull s12, h17, v9.h[9]
+    // invalid vector index
+    sqdmull s1, h1, v16.h[5]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmull s1, h1, v1.s[1]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmull s1, s1, v4.s[0]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqdmull s12, h17, v9.h[9]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmull s1, h1, v16.h[5]
+// CHECK-ERROR:                           ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    sqdmulh h0, s1, v0.h[0]
+    sqdmulh s25, s26, v27.h[3]
+    // invalid lane
+    sqdmulh s25, s26, v27.s[4]
+    // invalid vector index
+    sqdmulh s0, h1, v30.h[0]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmulh h0, s1, v0.h[0]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmulh s25, s26, v27.h[3]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqdmulh s25, s26, v27.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmulh s0, h1, v30.h[0]
+// CHECK-ERROR:                      ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating rounding doubling multiply
+// returning high half (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    sqrdmulh h31, h30, v14.s[2]
+    sqrdmulh s5, h6, v7.s[2]
+    // invalid lane
+    sqrdmulh h31, h30, v14.h[9]
+    // invalid vector index
+    sqrdmulh h31, h30, v20.h[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqrdmulh h31, h30, v14.s[2]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqrdmulh s5, h6, v7.s[2]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqrdmulh h31, h30, v14.h[9]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqrdmulh h31, h30, v20.h[4]
+// CHECK-ERROR:                              ^
+
+//----------------------------------------------------------------------
+// Scalar Duplicate element (scalar)
+//----------------------------------------------------------------------
+      // mismatched and invalid vector types
+      dup b0, v1.d[0]
+      dup h0, v31.b[8]
+      dup s0, v2.h[4]
+      dup d0, v17.s[3]
+      // invalid  lane
+      dup d0, v17.d[4]
+      dup s0, v1.s[7]
+      dup h0, v31.h[16]
+      dup b1, v3.b[16]
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          dup b0, v1.d[0]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          dup h0, v31.b[8]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          dup s0, v2.h[4]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          dup d0, v17.s[3]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          dup d0, v17.d[4]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          dup s0, v1.s[7]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          dup h0, v31.h[16]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          dup b1, v3.b[16]
+// CHECK-ERROR:                       ^
diff --git a/test/MC/AArch64/neon-scalar-by-elem-mla.s b/test/MC/AArch64/neon-scalar-by-elem-mla.s
new file mode 100644
index 00000000000..fec9d12d8b8
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-by-elem-mla.s
@@ -0,0 +1,44 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Floating Point fused multiply-add (scalar, by element)
+//------------------------------------------------------------------------------
+    fmla    s0, s1, v1.s[0]
+    fmla    s30, s11, v1.s[1]
+    fmla    s4, s5, v7.s[2]
+    fmla    s16, s22, v16.s[3]
+    fmla    d0, d1, v1.d[0]
+    fmla    d30, d11, v1.d[1]
+
+// CHECK: fmla    s0, s1, v1.s[0]         // encoding: [0x20,0x10,0x81,0x5f]
+// CHECK: fmla    s30, s11, v1.s[1]       // encoding: [0x7e,0x11,0xa1,0x5f]
+// CHECK: fmla    s4, s5, v7.s[2]         // encoding: [0xa4,0x18,0x87,0x5f]
+// CHECK: fmla    s16, s22, v16.s[3]      // encoding: [0xd0,0x1a,0xb0,0x5f]
+// CHECK: fmla    d0, d1, v1.d[0]         // encoding: [0x20,0x10,0xc1,0x5f]
+// CHECK: fmla    d30, d11, v1.d[1]       // encoding: [0x7e,0x19,0xc1,0x5f]
+ 
+//------------------------------------------------------------------------------
+// Floating Point fused multiply-subtract (scalar, by element)
+//------------------------------------------------------------------------------
+
+    fmls    s2, s3, v4.s[0]
+    fmls    s29, s10, v28.s[1]      
+    fmls    s5, s12, v23.s[2]       
+    fmls    s7, s17, v26.s[3]       
+    fmls    d0, d1, v1.d[0]         
+    fmls    d30, d11, v1.d[1]       
+
+// CHECK: fmls    s2, s3, v4.s[0]     // encoding: [0x62,0x50,0x84,0x5f]
+// CHECK: fmls    s29, s10, v28.s[1]  // encoding: [0x5d,0x51,0xbc,0x5f]
+// CHECK: fmls    s5, s12, v23.s[2]   // encoding: [0x85,0x59,0x97,0x5f]
+// CHECK: fmls    s7, s17, v26.s[3]   // encoding: [0x27,0x5a,0xba,0x5f]
+// CHECK: fmls    d0, d1, v1.d[0]     // encoding: [0x20,0x50,0xc1,0x5f]
+// CHECK: fmls    d30, d11, v1.d[1]   // encoding: [0x7e,0x59,0xc1,0x5f]
+
+        
+        
+        
+
+        
+        
+
diff --git a/test/MC/AArch64/neon-scalar-by-elem-mul.s b/test/MC/AArch64/neon-scalar-by-elem-mul.s
new file mode 100644
index 00000000000..8b8a3f57a9c
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-by-elem-mul.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Floating Point  multiply (scalar, by element)
+//------------------------------------------------------------------------------
+    fmul    s0, s1, v1.s[0]
+    fmul    s30, s11, v1.s[1]
+    fmul    s4, s5, v7.s[2]
+    fmul    s16, s22, v16.s[3]
+    fmul    d0, d1, v1.d[0]
+    fmul    d30, d11, v1.d[1]
+
+// CHECK: fmul    s0, s1, v1.s[0]      // encoding: [0x20,0x90,0x81,0x5f]
+// CHECK: fmul    s30, s11, v1.s[1]    // encoding: [0x7e,0x91,0xa1,0x5f]
+// CHECK: fmul    s4, s5, v7.s[2]      // encoding: [0xa4,0x98,0x87,0x5f]
+// CHECK: fmul    s16, s22, v16.s[3]   // encoding: [0xd0,0x9a,0xb0,0x5f]
+// CHECK: fmul    d0, d1, v1.d[0]      // encoding: [0x20,0x90,0xc1,0x5f]
+// CHECK: fmul    d30, d11, v1.d[1]    // encoding: [0x7e,0x99,0xc1,0x5f]
+
+
+//------------------------------------------------------------------------------
+// Floating Point  multiply extended (scalar, by element)
+//------------------------------------------------------------------------------
+    fmulx   s6, s2, v8.s[0]
+    fmulx   s7, s3, v13.s[1]
+    fmulx   s9, s7, v9.s[2]
+    fmulx   s13, s21, v10.s[3]
+    fmulx   d15, d9, v7.d[0]
+    fmulx   d13, d12, v11.d[1]
+
+// CHECK: fmulx   s6, s2, v8.s[0]         // encoding: [0x46,0x90,0x88,0x7f]
+// CHECK: fmulx   s7, s3, v13.s[1]        // encoding: [0x67,0x90,0xad,0x7f]
+// CHECK: fmulx   s9, s7, v9.s[2]         // encoding: [0xe9,0x98,0x89,0x7f]
+// CHECK: fmulx   s13, s21, v10.s[3]      // encoding: [0xad,0x9a,0xaa,0x7f]
+// CHECK: fmulx   d15, d9, v7.d[0]        // encoding: [0x2f,0x91,0xc7,0x7f]
+// CHECK: fmulx   d13, d12, v11.d[1]      // encoding: [0x8d,0x99,0xcb,0x7f]
+
diff --git a/test/MC/AArch64/neon-scalar-by-elem-saturating-mla.s b/test/MC/AArch64/neon-scalar-by-elem-saturating-mla.s
new file mode 100644
index 00000000000..e3d7e0514f9
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-by-elem-saturating-mla.s
@@ -0,0 +1,46 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//-----------------------------------------------------------------------------
+// Signed saturating doubling multiply-add long (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqdmlal s0, h0, v0.h[0]
+    sqdmlal s7, h1, v4.h[3]
+    sqdmlal s11, h16, v8.h[4]
+    sqdmlal s30, h30, v15.h[7]
+    sqdmlal d0, s0, v3.s[0]
+    sqdmlal d30, s30, v30.s[3]
+    sqdmlal d8, s9, v14.s[1]
+
+// CHECK: sqdmlal s0, h0, v0.h[0]       // encoding: [0x00,0x30,0x40,0x5f]
+// CHECK: sqdmlal s7, h1, v4.h[3]       // encoding: [0x27,0x30,0x74,0x5f]
+// CHECK: sqdmlal s11, h16, v8.h[4]     // encoding: [0x0b,0x3a,0x48,0x5f]
+// CHECK: sqdmlal s30, h30, v15.h[7]    // encoding: [0xde,0x3b,0x7f,0x5f]
+// CHECK: sqdmlal d0, s0, v3.s[0]       // encoding: [0x00,0x30,0x83,0x5f]
+// CHECK: sqdmlal d30, s30, v30.s[3]    // encoding: [0xde,0x3b,0xbe,0x5f]
+// CHECK: sqdmlal d8, s9, v14.s[1]      // encoding: [0x28,0x31,0xae,0x5f]
+ 
+//-----------------------------------------------------------------------------
+// Signed saturating doubling multiply-subtract long (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqdmlsl s1, h1, v1.h[0]
+    sqdmlsl s8, h2, v5.h[1]
+    sqdmlsl s12, h13, v14.h[2]
+    sqdmlsl s29, h28, v11.h[7]
+    sqdmlsl d1, s1, v13.s[0]
+    sqdmlsl d31, s31, v31.s[2]
+    sqdmlsl d16, s18, v28.s[3]
+
+// CHECK: sqdmlsl s1, h1, v1.h[0]       // encoding: [0x21,0x70,0x41,0x5f]
+// CHECK: sqdmlsl s8, h2, v5.h[1]       // encoding: [0x48,0x70,0x55,0x5f]
+// CHECK: sqdmlsl s12, h13, v14.h[2]    // encoding: [0xac,0x71,0x6e,0x5f]
+// CHECK: sqdmlsl s29, h28, v11.h[7]    // encoding: [0x9d,0x7b,0x7b,0x5f]
+// CHECK: sqdmlsl d1, s1, v13.s[0]      // encoding: [0x21,0x70,0x8d,0x5f]
+// CHECK: sqdmlsl d31, s31, v31.s[2]    // encoding: [0xff,0x7b,0x9f,0x5f]
+// CHECK: sqdmlsl d16, s18, v28.s[3]    // encoding: [0x50,0x7a,0xbc,0x5f]
+
+
+        
+
+        
+        
+
diff --git a/test/MC/AArch64/neon-scalar-by-elem-saturating-mul.s b/test/MC/AArch64/neon-scalar-by-elem-saturating-mul.s
new file mode 100644
index 00000000000..8a8405ef282
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-by-elem-saturating-mul.s
@@ -0,0 +1,58 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//-----------------------------------------------------------------------------
+// Signed saturating doubling multiply long (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqdmull s1, h1, v1.h[1]
+    sqdmull s8, h2, v5.h[2]
+    sqdmull s12, h17, v9.h[3]
+    sqdmull s31, h31, v15.h[7]
+    sqdmull d1, s1, v4.s[0]
+    sqdmull d31, s31, v31.s[3]
+    sqdmull d9, s10, v15.s[0]
+
+
+// CHECK: sqdmull s1, h1, v1.h[1]       // encoding: [0x21,0xb0,0x51,0x5f]
+// CHECK: sqdmull s8, h2, v5.h[2]       // encoding: [0x48,0xb0,0x65,0x5f]
+// CHECK: sqdmull s12, h17, v9.h[3]     // encoding: [0x2c,0xb2,0x79,0x5f]
+// CHECK: sqdmull s31, h31, v15.h[7]    // encoding: [0xff,0xbb,0x7f,0x5f]
+// CHECK: sqdmull d1, s1, v4.s[0]       // encoding: [0x21,0xb0,0x84,0x5f]
+// CHECK: sqdmull d31, s31, v31.s[3]    // encoding: [0xff,0xbb,0xbf,0x5f]
+// CHECK: sqdmull d9, s10, v15.s[0]     // encoding: [0x49,0xb1,0x8f,0x5f]
+ 
+//-----------------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqdmulh h0, h1, v0.h[0]
+    sqdmulh h10, h11, v10.h[4]
+    sqdmulh h20, h21, v15.h[7]
+    sqdmulh s25, s26, v27.s[3]
+    sqdmulh s2, s6, v7.s[0]
+
+// CHECK: sqdmulh h0, h1, v0.h[0]       // encoding: [0x20,0xc0,0x40,0x5f]
+// CHECK: sqdmulh h10, h11, v10.h[4]    // encoding: [0x6a,0xc9,0x4a,0x5f]
+// CHECK: sqdmulh h20, h21, v15.h[7]    // encoding: [0xb4,0xca,0x7f,0x5f]
+// CHECK: sqdmulh s25, s26, v27.s[3]    // encoding: [0x59,0xcb,0xbb,0x5f]
+// CHECK: sqdmulh s2, s6, v7.s[0]       // encoding: [0xc2,0xc0,0x87,0x5f]
+
+//-----------------------------------------------------------------------------
+// Signed saturating rounding doubling multiply returning
+// high half (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqrdmulh h31, h30, v14.h[2]
+    sqrdmulh h1, h1, v1.h[4]
+    sqrdmulh h21, h22, v15.h[7]
+    sqrdmulh s5, s6, v7.s[2]
+    sqrdmulh s20, s26, v27.s[1]
+
+// CHECK: sqrdmulh h31, h30, v14.h[2]   // encoding: [0xdf,0xd3,0x6e,0x5f]
+// CHECK: sqrdmulh h1, h1, v1.h[4]      // encoding: [0x21,0xd8,0x41,0x5f]
+// CHECK: sqrdmulh h21, h22, v15.h[7]   // encoding: [0xd5,0xda,0x7f,0x5f]
+// CHECK: sqrdmulh s5, s6, v7.s[2]      // encoding: [0xc5,0xd8,0x87,0x5f]
+// CHECK: sqrdmulh s20, s26, v27.s[1]   // encoding: [0x54,0xd3,0xbb,0x5f]
+        
+
+        
+        
+
diff --git a/test/MC/AArch64/neon-scalar-dup.s b/test/MC/AArch64/neon-scalar-dup.s
new file mode 100644
index 00000000000..64366f2edc0
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-dup.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Duplicate element (scalar)
+//------------------------------------------------------------------------------
+         dup b0, v0.b[15]
+         dup b1, v0.b[7]
+         dup b17, v0.b[0]
+         dup h5, v31.h[7]
+         dup h9, v1.h[4]
+         dup h11, v17.h[0]
+         dup s2, v2.s[3]
+         dup s4, v21.s[0]
+         dup s31, v21.s[2]
+         dup d3, v5.d[0]
+         dup d6, v5.d[1]
+
+// CHECK: dup b0, v0.b[15]      // encoding: [0x00,0x04,0x1f,0x5e]
+// CHECK: dup b1, v0.b[7]       // encoding: [0x01,0x04,0x0f,0x5e]
+// CHECK: dup b17, v0.b[0]      // encoding: [0x11,0x04,0x01,0x5e]
+// CHECK: dup h5, v31.h[7]      // encoding: [0xe5,0x07,0x1e,0x5e]
+// CHECK: dup h9, v1.h[4]       // encoding: [0x29,0x04,0x12,0x5e]
+// CHECK: dup h11, v17.h[0]     // encoding: [0x2b,0x06,0x02,0x5e]
+// CHECK: dup s2, v2.s[3]       // encoding: [0x42,0x04,0x1c,0x5e]
+// CHECK: dup s4, v21.s[0]      // encoding: [0xa4,0x06,0x04,0x5e]
+// CHECK: dup s31, v21.s[2]     // encoding: [0xbf,0x06,0x14,0x5e]
+// CHECK: dup d3, v5.d[0]       // encoding: [0xa3,0x04,0x08,0x5e]
+// CHECK: dup d6, v5.d[1]       // encoding: [0xa6,0x04,0x18,0x5e]
+
diff --git a/test/MC/AArch64/neon-simd-copy.s b/test/MC/AArch64/neon-simd-copy.s
index 7edcc1b2a21..67fe309f1d5 100644
--- a/test/MC/AArch64/neon-simd-copy.s
+++ b/test/MC/AArch64/neon-simd-copy.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
 
 // Check that the assembler can handle the documented syntax for AArch64
 
diff --git a/test/MC/AArch64/neon-simd-shift.s b/test/MC/AArch64/neon-simd-shift.s
index 9e6e1aaf86c..a16432324ef 100644
--- a/test/MC/AArch64/neon-simd-shift.s
+++ b/test/MC/AArch64/neon-simd-shift.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
 
 // Check that the assembler can handle the documented syntax for AArch64
 
diff --git a/test/MC/Disassembler/AArch64/neon-instructions.txt b/test/MC/Disassembler/AArch64/neon-instructions.txt
index 2f533758153..b9ea7c140be 100644
--- a/test/MC/Disassembler/AArch64/neon-instructions.txt
+++ b/test/MC/Disassembler/AArch64/neon-instructions.txt
@@ -2174,3 +2174,216 @@ G# RUN: llvm-mc  -triple aarch64-none-linux-gnu -mattr=+neon -disassemble < %s |
 0x28,0x78,0x82,0x0e
 0x19,0x78,0x82,0x4e
 0x0a,0x78,0xc2,0x4e
+
+#----------------------------------------------------------------------
+# Scalar Floating Point  multiply (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: fmul s0, s1, v1.s[0]
+# CHECK: fmul s0, s1, v1.s[3]
+# CHECK: fmul d0, d1, v1.d[0]
+# CHECK: fmul d0, d1, v1.d[1]
+# CHECK: fmul d15, d15, v15.d[1]
+0x20 0x90 0x81 0x5f
+0x20 0x98 0xa1 0x5f
+0x20 0x90 0xc1 0x5f
+0x20 0x98 0xc1 0x5f
+0xef 0x99 0xcf 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Floating Point  multiply extended (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: fmulx s3, s5, v7.s[0]
+# CHECK: fmulx s3, s5, v7.s[3]
+# CHECK: fmulx s3, s5, v15.s[3]
+# CHECK: fmulx d0, d4, v8.d[0]
+# CHECK: fmulx d0, d4, v8.d[1]
+0xa3 0x90 0x87 0x7f
+0xa3 0x98 0xa7 0x7f
+0xa3 0x98 0xaf 0x7f
+0x80 0x90 0xc8 0x7f
+0x80 0x98 0xc8 0x7f
+
+#----------------------------------------------------------------------
+# Scalar Floating Point fused multiply-add (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: fmla s0, s1, v1.s[0]
+# CHECK: fmla s0, s1, v1.s[3]
+# CHECK: fmla d0, d1, v1.d[0]
+# CHECK: fmla d0, d1, v1.d[1]
+# CHECK: fmla d15, d15, v15.d[1]
+0x20 0x10 0x81 0x5f
+0x20 0x18 0xa1 0x5f
+0x20 0x10 0xc1 0x5f
+0x20 0x18 0xc1 0x5f
+0xef 0x19 0xcf 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Floating Point fused multiply-sub (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: fmls s3, s5, v7.s[0]
+# CHECK: fmls s3, s5, v7.s[3]
+# CHECK: fmls s3, s5, v15.s[3]
+# CHECK: fmls d0, d4, v8.d[0]
+# CHECK: fmls d0, d4, v8.d[1]
+0xa3 0x50 0x87 0x5f
+0xa3 0x58 0xa7 0x5f
+0xa3 0x58 0xaf 0x5f
+0x80 0x50 0xc8 0x5f
+0x80 0x58 0xc8 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating doubling
+# multiply-add long (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqdmlal s0, h0, v0.h[0]
+# CHECK: sqdmlal s0, h0, v0.h[1]
+# CHECK: sqdmlal s0, h0, v0.h[2]
+# CHECK: sqdmlal s0, h0, v0.h[3]
+# CHECK: sqdmlal s0, h0, v0.h[4]
+# CHECK: sqdmlal s0, h0, v0.h[5]
+# CHECK: sqdmlal s0, h0, v0.h[6]
+# CHECK: sqdmlal s0, h0, v0.h[7]
+# CHECK: sqdmlal d8, s9, v15.s[0]
+# CHECK: sqdmlal d8, s9, v15.s[1]
+# CHECK: sqdmlal d8, s9, v15.s[2]
+# CHECK: sqdmlal d8, s9, v15.s[3]
+0x00 0x30 0x40 0x5f
+0x00 0x30 0x50 0x5f
+0x00 0x30 0x60 0x5f
+0x00 0x30 0x70 0x5f
+0x00 0x38 0x40 0x5f
+0x00 0x38 0x50 0x5f
+0x00 0x38 0x60 0x5f
+0x00 0x38 0x70 0x5f
+0x28 0x31 0x8f 0x5f
+0x28 0x31 0xaf 0x5f
+0x28 0x39 0x8f 0x5f
+0x28 0x39 0xaf 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating doubling
+# multiply-sub long (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqdmlsl s0, h0, v0.h[0]
+# CHECK: sqdmlsl s0, h0, v0.h[1]
+# CHECK: sqdmlsl s0, h0, v0.h[2]
+# CHECK: sqdmlsl s0, h0, v0.h[3]
+# CHECK: sqdmlsl s0, h0, v0.h[4]
+# CHECK: sqdmlsl s0, h0, v0.h[5]
+# CHECK: sqdmlsl s0, h0, v0.h[6]
+# CHECK: sqdmlsl s0, h0, v0.h[7]
+# CHECK: sqdmlsl d8, s9, v15.s[0]
+# CHECK: sqdmlsl d8, s9, v15.s[1]
+# CHECK: sqdmlsl d8, s9, v15.s[2]
+# CHECK: sqdmlsl d8, s9, v15.s[3]
+0x00 0x70 0x40 0x5f
+0x00 0x70 0x50 0x5f
+0x00 0x70 0x60 0x5f
+0x00 0x70 0x70 0x5f
+0x00 0x78 0x40 0x5f
+0x00 0x78 0x50 0x5f
+0x00 0x78 0x60 0x5f
+0x00 0x78 0x70 0x5f
+0x28 0x71 0x8f 0x5f
+0x28 0x71 0xaf 0x5f
+0x28 0x79 0x8f 0x5f
+0x28 0x79 0xaf 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating doubling multiply long (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqdmull s1, h1, v1.h[0]
+# CHECK: sqdmull s1, h1, v1.h[1]
+# CHECK: sqdmull s1, h1, v1.h[2]
+# CHECK: sqdmull s1, h1, v1.h[3]
+# CHECK: sqdmull s1, h1, v1.h[4]
+# CHECK: sqdmull s1, h1, v1.h[5]
+# CHECK: sqdmull s1, h1, v1.h[6]
+# CHECK: sqdmull s1, h1, v1.h[7]
+# CHECK: sqdmull d1, s1, v4.s[0]
+# CHECK: sqdmull d1, s1, v4.s[1]
+# CHECK: sqdmull d1, s1, v4.s[2]
+# CHECK: sqdmull d1, s1, v4.s[3]
+0x21 0xb0 0x41 0x5f
+0x21 0xb0 0x51 0x5f
+0x21 0xb0 0x61 0x5f
+0x21 0xb0 0x71 0x5f
+0x21 0xb8 0x41 0x5f
+0x21 0xb8 0x51 0x5f
+0x21 0xb8 0x61 0x5f
+0x21 0xb8 0x71 0x5f
+0x21 0xb0 0x84 0x5f
+0x21 0xb0 0xa4 0x5f
+0x21 0xb8 0x84 0x5f
+0x21 0xb8 0xa4 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating doubling multiply returning
+# high half (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqdmulh h7, h1, v14.h[0]
+# CHECK: sqdmulh h7, h15, v8.h[1]
+# CHECK: sqdmulh h7, h15, v8.h[2]
+# CHECK: sqdmulh h7, h15, v8.h[3]
+# CHECK: sqdmulh h7, h15, v8.h[4]
+# CHECK: sqdmulh h7, h15, v8.h[5]
+# CHECK: sqdmulh h7, h15, v8.h[6]
+# CHECK: sqdmulh h7, h15, v8.h[7]
+# CHECK: sqdmulh s15, s3, v4.s[0]
+# CHECK: sqdmulh s15, s14, v16.s[1]
+# CHECK: sqdmulh s15, s15, v16.s[2]
+# CHECK: sqdmulh s15, s16, v17.s[3]
+0x27 0xc0 0x4e 0x5f
+0xe7 0xc1 0x58 0x5f
+0xe7 0xc1 0x68 0x5f
+0xe7 0xc1 0x78 0x5f
+0xe7 0xc9 0x48 0x5f
+0xe7 0xc9 0x58 0x5f
+0xe7 0xc9 0x68 0x5f
+0xe7 0xc9 0x78 0x5f
+0x6f 0xc0 0x84 0x5f
+0xcf 0xc1 0xb0 0x5f
+0xef 0xc9 0x90 0x5f
+0x0f 0xca 0xb1 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating rounding doubling multiply
+# returning high half (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqrdmulh h7, h1, v14.h[0]
+# CHECK: sqrdmulh h7, h15, v8.h[1]
+# CHECK: sqrdmulh h7, h15, v8.h[2]
+# CHECK: sqrdmulh h7, h15, v8.h[3]
+# CHECK: sqrdmulh h7, h15, v8.h[4]
+# CHECK: sqrdmulh h7, h15, v8.h[5]
+# CHECK: sqrdmulh h7, h15, v8.h[6]
+# CHECK: sqrdmulh h7, h15, v8.h[7]
+# CHECK: sqrdmulh s15, s3, v4.s[0]
+# CHECK: sqrdmulh s15, s14, v16.s[1]
+# CHECK: sqrdmulh s15, s15, v16.s[2]
+# CHECK: sqrdmulh s15, s16, v17.s[3]
+0x27 0xd0 0x4e 0x5f
+0xe7 0xd1 0x58 0x5f
+0xe7 0xd1 0x68 0x5f
+0xe7 0xd1 0x78 0x5f
+0xe7 0xd9 0x48 0x5f
+0xe7 0xd9 0x58 0x5f
+0xe7 0xd9 0x68 0x5f
+0xe7 0xd9 0x78 0x5f
+0x6f 0xd0 0x84 0x5f
+0xcf 0xd1 0xb0 0x5f
+0xef 0xd9 0x90 0x5f
+0x0f 0xda 0xb1 0x5f
+
+#----------------------------------------------------------------------
+#Duplicate element (scalar)
+#----------------------------------------------------------------------
+# CHECK: dup b0, v0.b[15]
+# CHECK: dup h2, v31.h[5]
+# CHECK: dup s17, v2.s[2]
+# CHECK: dup d6, v12.d[1]
+0x00 0x04 0x1f 0x5e
+0xe2 0x07 0x16 0x5e
+0x51 0x04 0x14 0x5e
+0x86 0x05 0x18 0x5e
+
-- 
2.34.1