return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
}
+SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  ISD::LoadExtType ExtType = Load->getExtensionType();
+
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+  unsigned Mask = 0;
+  if (MemVT == MVT::i8) {
+    Mask = 0xff;
+  } else if (MemVT == MVT::i16) {
+    Mask = 0xffff;
+  }
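+  // Extending loads of i8 and i16 from private memory are emulated with a
+  // 32-bit REGISTER_LOAD: read the dword that holds the value, then shift
+  // the addressed byte/halfword down to bit 0 and mask off the rest.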
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                Load->getBasePtr(),
+                                DAG.getConstant(0x3, MVT::i32));
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, MVT::i32));
+  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+  Ret = DAG.getNode(ISD::AND, DL, MVT::i32, Ret,
+                    DAG.getConstant(Mask, MVT::i32));
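+  // For SEXTLOAD, shift the value into the high bits and arithmetic-shift it
+  // back down so the sign bit is replicated.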
+  if (ExtType == ISD::SEXTLOAD) {
+    SDValue SExtShift = DAG.getConstant(
+        VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
+    Ret = DAG.getNode(ISD::SHL, DL, MVT::i32, Ret, SExtShift);
+    Ret = DAG.getNode(ISD::SRA, DL, MVT::i32, Ret, SExtShift);
+  }
+
+  return Ret;
+}
+
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
if (Result.getNode()) {
return Result;
}
StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Chain = Store->getChain();
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
return SplitVectorStore(Op, DAG);
}
+
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+      Store->getMemoryVT().bitsLT(MVT::i32)) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
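+    // The store becomes a read-modify-write of the containing dword: load it,
+    // clear the destination byte/halfword, OR in the shifted value and write
+    // the dword back.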
+    SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
+    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                              DAG.getConstant(2, MVT::i32));
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, TruncPtr,
+                                  DAG.getConstant(0x3, MVT::i32));
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+                                    Store->getValue());
+    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, SExtValue,
+                                      DAG.getConstant(Mask, MVT::i32));
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                  DAG.getConstant(Mask, MVT::i32), ShiftAmt);
+    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+                          DAG.getConstant(0xffffffff, MVT::i32));
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
+  }
return SDValue();
}
/// \brief Split a vector load into multiple scalar loads.
SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
return SDValue();
}
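+  // Let the common AMDGPU lowering handle this store first; it splits vector
+  // stores and emulates i8/i16 stores to the private address space.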
+  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Ret.getNode()) {
+    return Ret;
+  }
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;
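+  // Sub-dword loads from the private address space are emulated by the
+  // common AMDGPU lowering; if it produced a value, merge it with the chain.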
+  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+  if (Ret.getNode()) {
+    SDValue Ops[2];
+    Ops[0] = Ret;
+    Ops[1] = Chain;
+    return DAG.getMergeValues(Ops, 2, DL);
+  }
+
if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
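+  // i8 and i16 extending loads and truncating stores are custom lowered so
+  // they can be emulated for the private address space.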
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i128, MVT::i64, Expand);
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
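+  // The common AMDGPU code emulates i8 and i16 extloads from private memory;
+  // reuse its result here and just merge in the chain.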
+  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+  SDValue MergedValues[2];
+  MergedValues[1] = Load->getChain();
+  if (Ret.getNode()) {
+    MergedValues[0] = Ret;
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
-  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
+  }
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
-  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
-                            Load->getChain(), Ptr,
-                            DAG.getTargetConstant(0, MVT::i32),
-                            Op.getOperand(2));
-  SDValue MergedValues[2] = {
-    Ret,
-    Load->getChain()
-  };
+  Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                    Load->getChain(), Ptr,
+                    DAG.getTargetConstant(0, MVT::i32),
+                    Op.getOperand(2));
+  MergedValues[0] = Ret;
return DAG.getMergeValues(MergedValues, 2, DL);
}
SDValue Chain = Store->getChain();
SmallVector<SDValue, 8> Values;
-  if (VT == MVT::i64) {
+  if (Store->isTruncatingStore()) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
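+    // Emulate the sub-dword store as a read-modify-write of the containing
+    // dword, mirroring the emulation in AMDGPUTargetLowering::LowerSTORE.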
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Store->getBasePtr(),
+                              DAG.getConstant(0, MVT::i32));
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
+                                  DAG.getConstant(0x3, MVT::i32));
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                      Store->getValue(),
+                                      DAG.getConstant(Mask, MVT::i32));
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+    SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32,
+                                  DAG.getConstant(32, MVT::i32), ShiftAmt);
+    // Rotate the inverted mask into place so the AND below clears only the
+    // byte/halfword being replaced.
+    SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32,
+                                  DAG.getConstant(Mask ^ 0xffffffff, MVT::i32),
+                                  RotrAmt);
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+    Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+
+    Values.push_back(Dst);
+  } else if (VT == MVT::i64) {
for (unsigned i = 0; i < 2; ++i) {
Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
Store->getValue(), DAG.getConstant(i, MVT::i32)));
; EG-LABEL: @anyext_load_i8:
; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32 addrspace(1)* %cast, align 1
; EG-LABEL: @anyext_load_i16:
; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32 addrspace(1)* %cast, align 1
; EG-LABEL: @anyext_load_lds_i8:
; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32 addrspace(3)* %cast, align 1
; EG-LABEL: @anyext_load_lds_i16:
; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32 addrspace(3)* %cast, align 1
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
; This test checks that uses and defs of the AR register happen in the same
; instruction clause.
-; R600-CHECK-LABEL: @mova_same_clause
+; FUNC-LABEL: @mova_same_clause
+
; R600-CHECK: MOVA_INT
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
-; SI-CHECK-LABEL: @mova_same_clause
; SI-CHECK: V_READFIRSTLANE
; SI-CHECK: V_MOVRELD
; SI-CHECK: S_CBRANCH
; XXX: This generated code has unnecessary MOVs, we should be able to optimize
; this.
-; R600-CHECK-LABEL: @multiple_structs
+; FUNC-LABEL: @multiple_structs
; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @multiple_structs
; SI-CHECK-NOT: V_MOVREL
%struct.point = type { i32, i32 }
; loads and stores should be lowered to copies, so there shouldn't be any
; MOVA instructions.
-; R600-CHECK-LABEL: @direct_loop
+; FUNC-LABEL: @direct_loop
; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @direct_loop
; SI-CHECK-NOT: V_MOVREL
define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
store i32 %value, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @short_array
+
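+; The two i16 stores pack into one dword: (1 << 16) | 0 = 65536.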
+; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
+; R600-CHECK: 65536
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 65536
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @char_array
+
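+; The two i8 stores pack into one dword: (1 << 8) | 0 = 256.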
+; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
+; R600-CHECK: 256
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 256
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}