Add support for NEON VLD3-dup instructions.
authorBob Wilson <bob.wilson@apple.com>
Mon, 29 Nov 2010 19:35:29 +0000 (19:35 +0000)
committerBob Wilson <bob.wilson@apple.com>
Mon, 29 Nov 2010 19:35:29 +0000 (19:35 +0000)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120312 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/ARM/ARMExpandPseudoInsts.cpp
lib/Target/ARM/ARMISelDAGToDAG.cpp
lib/Target/ARM/ARMInstrNEON.td
lib/Target/ARM/ARMSchedule.td
lib/Target/ARM/ARMScheduleA8.td
lib/Target/ARM/ARMScheduleA9.td
test/CodeGen/ARM/vlddup.ll

index f65be1cb179ea8d513495654de670fd347eb07a7..a78ed2616b0e80bec59998c645a4e4814587a987 100644 (file)
@@ -172,6 +172,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
 { ARM::VLD2q8Pseudo,        ARM::VLD2q8,       true,  false, SingleSpc,  4, 8 },
 { ARM::VLD2q8Pseudo_UPD,    ARM::VLD2q8_UPD,   true,  true,  SingleSpc,  4, 8 },
 
+{ ARM::VLD3DUPd16Pseudo,     ARM::VLD3DUPd16,     true, false, SingleSpc, 3, 4},
+{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true,  SingleSpc, 3, 4},
+{ ARM::VLD3DUPd32Pseudo,     ARM::VLD3DUPd32,     true, false, SingleSpc, 3, 2},
+{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true,  SingleSpc, 3, 2},
+{ ARM::VLD3DUPd8Pseudo,      ARM::VLD3DUPd8,      true, false, SingleSpc, 3, 8},
+{ ARM::VLD3DUPd8Pseudo_UPD,  ARM::VLD3DUPd8_UPD,  true, true,  SingleSpc, 3, 8},
+
 { ARM::VLD3LNd16Pseudo,     ARM::VLD3LNd16,     true, false, SingleSpc,  3, 4 },
 { ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true,  SingleSpc,  3, 4 },
 { ARM::VLD3LNd32Pseudo,     ARM::VLD3LNd32,     true, false, SingleSpc,  3, 2 },
@@ -946,6 +953,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
     case ARM::VLD2DUPd8Pseudo_UPD:
     case ARM::VLD2DUPd16Pseudo_UPD:
     case ARM::VLD2DUPd32Pseudo_UPD:
+    case ARM::VLD3DUPd8Pseudo:
+    case ARM::VLD3DUPd16Pseudo:
+    case ARM::VLD3DUPd32Pseudo:
+    case ARM::VLD3DUPd8Pseudo_UPD:
+    case ARM::VLD3DUPd16Pseudo_UPD:
+    case ARM::VLD3DUPd32Pseudo_UPD:
       ExpandVLD(MBBI);
       break;
 
index b9fbdc58a1eb1612943cd27fa6767582f2134e41..a3b86cb9dbd8b7416a06b6fa2ea12709b3e9d494 100644 (file)
@@ -2361,6 +2361,12 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     return SelectVLDDup(N, 2, Opcodes);
   }
 
+  case ARMISD::VLD3DUP: {
+    unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo,
+                           ARM::VLD3DUPd32Pseudo };
+    return SelectVLDDup(N, 3, Opcodes);
+  }
+
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
index fd9b823412318ad1b60e6b8b3dafa7effc2e57ee..ebb9d5d8a7fb1c5056fc02154b5a557aba54e488 100644 (file)
@@ -896,6 +896,48 @@ def VLD2DUPd16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
 def VLD2DUPd32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
 
 //   VLD3DUP  : Vector Load (single 3-element structure to all lanes)
+class VLD3DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn), IIC_VLD3dup,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
+
+def VLD3DUPd8  : VLD3DUP<{0,0,0,?}, "8">;
+def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">;
+def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">;
+
+def VLD3DUPd8Pseudo  : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD3DUPd8T  : VLD3DUP<{0,0,1,?}, "8">;
+def VLD3DUPd16T : VLD3DUP<{0,1,1,?}, "16">;
+def VLD3DUPd32T : VLD3DUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+class VLD3DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3dupu,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD3DUPd8_UPD  : VLD3DUPWB<{0,0,0,0}, "8">;
+def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">;
+def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">;
+
+def VLD3DUPd8T_UPD  : VLD3DUPWB<{0,0,1,0}, "8">;
+def VLD3DUPd16T_UPD : VLD3DUPWB<{0,1,1,?}, "16">;
+def VLD3DUPd32T_UPD : VLD3DUPWB<{1,0,1,?}, "32">;
+
+def VLD3DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+
 //   VLD4DUP  : Vector Load (single 4-element structure to all lanes)
 //   FIXME: Not yet implemented.
 } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
index 5202da4639066b132454cce6dfb50b9b1a6b6aa2..6300043114ee51854e4dfd9a2fe201e3371cb413 100644 (file)
@@ -152,6 +152,8 @@ def IIC_VLD3       : InstrItinClass;
 def IIC_VLD3ln     : InstrItinClass;
 def IIC_VLD3u      : InstrItinClass;
 def IIC_VLD3lnu    : InstrItinClass;
+def IIC_VLD3dup    : InstrItinClass;
+def IIC_VLD3dupu   : InstrItinClass;
 def IIC_VLD4       : InstrItinClass;
 def IIC_VLD4ln     : InstrItinClass;
 def IIC_VLD4u      : InstrItinClass;
index be92562a2e559284639d480a56a7057160787eb1..1e9ec0791ad7bc711144fced344a4eb2d4d57a4f 100644 (file)
@@ -559,6 +559,18 @@ def CortexA8Itineraries : ProcessorItineraries<
                                InstrStage<5, [A8_LSPipe]>],
                               [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>,
   //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1, 1]>,
+  //
   // VLD4
   InstrItinData<IIC_VLD4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<4, [A8_NLSPipe], 0>,
index d775a9f26fea6d18f0a9fd3f4426964f6322be3f..a253b988198454c32d21faecf5a9857a67263b9d 100644 (file)
@@ -941,6 +941,24 @@ def CortexA9Itineraries : ProcessorItineraries<
                                InstrStage<5, [A9_LSUnit]>],
                               [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
   //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1, 1]>,
+  //
   // VLD4
   InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
index 00dd134d770afa15433dd8acd1165eb86d2ded77..8d78dfbb90bd08ce381389529347a391decc4568 100644 (file)
@@ -71,3 +71,23 @@ define <2 x i32> @vld2dupi32(i32* %A) nounwind {
 
 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
 declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+
+%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+
+define <4 x i16> @vld3dupi16(i16* %A) nounwind {
+;CHECK: vld3dupi16:
+;Check the (default) alignment value. VLD3 does not support alignment.
+;CHECK: vld3.16 {d16[], d17[], d18[]}, [r0]
+       %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
+       %tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
+       %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+       %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
+       %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+       %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 2
+       %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
+        %tmp7 = add <4 x i16> %tmp2, %tmp4
+        %tmp8 = add <4 x i16> %tmp7, %tmp6
+        ret <4 x i16> %tmp8
+}
+
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly