{ ARM::VLD3q8Pseudo_UPD, ARM::VLD3q8_UPD, true, true, EvenDblSpc, 3, 8 },
{ ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true, OddDblSpc, 3, 8 },
+{ ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd16, true, false, SingleSpc, 4, 4},
+{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, SingleSpc, 4, 4},
+{ ARM::VLD4DUPd32Pseudo, ARM::VLD4DUPd32, true, false, SingleSpc, 4, 2},
+{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, SingleSpc, 4, 2},
+{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, SingleSpc, 4, 8},
+{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, SingleSpc, 4, 8},
+
{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, SingleSpc, 4, 4 },
{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, SingleSpc, 4, 4 },
{ ARM::VLD4LNd32Pseudo, ARM::VLD4LNd32, true, false, SingleSpc, 4, 2 },
case ARM::VLD3DUPd8Pseudo_UPD:
case ARM::VLD3DUPd16Pseudo_UPD:
case ARM::VLD3DUPd32Pseudo_UPD:
+ case ARM::VLD4DUPd8Pseudo:
+ case ARM::VLD4DUPd16Pseudo:
+ case ARM::VLD4DUPd32Pseudo:
+ case ARM::VLD4DUPd8Pseudo_UPD:
+ case ARM::VLD4DUPd16Pseudo_UPD:
+ case ARM::VLD4DUPd32Pseudo_UPD:
ExpandVLD(MBBI);
break;
return SelectVLDDup(N, 3, Opcodes);
}
+ case ARMISD::VLD4DUP: {
+ unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo,
+ ARM::VLD4DUPd32Pseudo };
+ return SelectVLDDup(N, 4, Opcodes);
+ }
+
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
// VLD4DUP : Vector Load (single 4-element structure to all lanes)
-// FIXME: Not yet implemented.
+class VLD4DUP<bits<4> op7_4, string Dt>
+ : NLdSt<1, 0b10, 0b1111, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$Rn), IIC_VLD4dup,
+ "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
+ let Rm = 0b1111;
+}
+
+def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> {
+ let Inst{6} = Rn{5};
+ let Inst{4} = Rn{5};
+}
+
+def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD4DUPd8x2 : VLD4DUP<{0,0,1,?}, "8"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd16x2 : VLD4DUP<{0,1,1,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd32x2 : VLD4DUP<{1,?,1,?}, "32"> {
+ let Inst{6} = Rn{5};
+ let Inst{4} = Rn{5};
+}
+
+// ...with address register writeback:
+class VLD4DUPWB<bits<4> op7_4, string Dt>
+ : NLdSt<1, 0b10, 0b1111, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4dupu,
+ "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
+ "$Rn.addr = $wb", []>;
+
+def VLD4DUPd8_UPD : VLD4DUPWB<{0,0,0,0}, "8"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> {
+ let Inst{6} = Rn{5};
+ let Inst{4} = Rn{5};
+}
+
+def VLD4DUPd8x2_UPD : VLD4DUPWB<{0,0,1,0}, "8"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd16x2_UPD : VLD4DUPWB<{0,1,1,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd32x2_UPD : VLD4DUPWB<{1,?,1,?}, "32"> {
+ let Inst{6} = Rn{5};
+ let Inst{4} = Rn{5};
+}
+
+def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+
} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
def IIC_VLD4ln : InstrItinClass;
def IIC_VLD4u : InstrItinClass;
def IIC_VLD4lnu : InstrItinClass;
+def IIC_VLD4dup : InstrItinClass;
+def IIC_VLD4dupu : InstrItinClass;
def IIC_VST1 : InstrItinClass;
def IIC_VST1x2 : InstrItinClass;
def IIC_VST1x3 : InstrItinClass;
InstrStage<5, [A8_LSPipe]>],
[4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
//
+ // VLD4dup
+ InstrItinData<IIC_VLD4dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 3, 1]>,
+ //
+ // VLD4dupu
+ InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 3, 2, 1, 1]>,
+ //
// VST1
InstrItinData<IIC_VST1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<2, [A8_NLSPipe], 0>,
InstrStage<5, [A9_LSUnit]>],
[5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>,
//
+ // VLD4dup
+ InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [3, 3, 4, 4, 1]>,
+ //
+ // VLD4dupu
+ InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [3, 3, 4, 4, 2, 1, 1]>,
+ //
// VST1
InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
}
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+
+define <2 x i32> @vld4dupi32(i32* %A) nounwind {
+;CHECK: vld4dupi32:
+;Check the alignment value. Max for this instruction is 128 bits:
+;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0, :128]
+ %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 32)
+ %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
+ %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 2
+ %tmp6 = shufflevector <2 x i32> %tmp5, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp7 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 3
+ %tmp8 = shufflevector <2 x i32> %tmp7, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp9 = add <2 x i32> %tmp2, %tmp4
+ %tmp10 = add <2 x i32> %tmp6, %tmp8
+ %tmp11 = add <2 x i32> %tmp9, %tmp10
+ ret <2 x i32> %tmp11
+}
+
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly