swr/rast: Add autogen of helper llvm intrinsics.

[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

index 99a936d17601b70d1fd9466f0db89c5dd4481006..5c8d81332df118affb7551937e4745172a92b9bc 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -55,9 +55,12 @@ enum ConversionType
  //////////////////////////////////////////////////////////////////////////
  /// Interface to Jitting a fetch shader
  //////////////////////////////////////////////////////////////////////////
-struct FetchJit : public Builder
+struct FetchJit : 
+    public Builder
  {
-    FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
+    FetchJit(JitManager* pJitMgr) :
+        Builder(pJitMgr)
+    {}
  
      Function* Create(const FETCH_COMPILE_STATE& fetchState);
  
@@ -132,7 +135,7 @@ struct FetchJit : public Builder
  
  Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
  {
-    std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
      fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
  
      Function*    fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
@@ -145,6 +148,10 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
      auto    argitr = fetch->arg_begin();
  
      // Fetch shader arguments
+    Value* privateContext = &*argitr; ++argitr;
+    privateContext->setName("privateContext");
+    SetPrivateContext(privateContext);
+
      mpFetchInfo = &*argitr; ++argitr;
      mpFetchInfo->setName("fetchInfo");
      Value*    pVtxOut = &*argitr;
@@ -246,7 +253,13 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
                                                 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
  #endif
              break; // incoming type is already 32bit int
-        default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
+        default:
+            SWR_INVALID("Unsupported index type");
+            vIndices = nullptr;
+#if USE_SIMD16_SHADERS
+            vIndices2 = nullptr;
+#endif
+            break;
      }
  
      if(fetchState.bForceSequentialAccessEnable)
@@ -354,6 +367,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
  
      JitManager::DumpToFile(fetch, "opt");
  
+
      return fetch;
  }
  
@@ -426,6 +440,10 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* str
          }
          else if (ied.InstanceStrideEnable)
          {
+            // silence unused variable warnings
+            startOffset = C(0);
+            vCurIndices = vIndices;
+
              SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
          }
          else
@@ -437,7 +455,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* str
          }
  
          // load SWR_VERTEX_BUFFER_STATE::pData
-        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
+        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
  
          // load SWR_VERTEX_BUFFER_STATE::pitch
          Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
@@ -741,7 +759,66 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB
      // only works if pixel size is <= 32bits
      SWR_ASSERT(info.bpp <= 32);
  
-    Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+    Value *pGather;
+    if (info.bpp == 32)
+    {
+        pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+    }
+    else
+    {
+        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
+        Value *pMem = ALLOCA(mSimdInt32Ty);
+        STORE(VIMMED1(0u), pMem);
+
+        pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
+        Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
+
+        for (uint32_t lane = 0; lane < mVWidth; ++lane)
+        {
+            // Get index
+            Value* index = VEXTRACT(pOffsets, C(lane));
+            Value* mask = VEXTRACT(pMask, C(lane));
+            switch (info.bpp)
+            {
+            case 8:
+            {
+                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
+                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
+                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+                break;
+            }
+
+            case 16:
+            {
+                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
+                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
+                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+                break;
+            }
+            break;
+
+            case 24:
+            {
+                // First 16-bits of data
+                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
+                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
+                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+
+                // Last 8-bits of data
+                pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
+                pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
+                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+                break;
+            }
+
+            default:
+                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
+                break;
+            }
+        }
+
+        pGather = LOAD(pMem);
+    }
  
      for (uint32_t comp = 0; comp < 4; ++comp)
      {
@@ -866,10 +943,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
          uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
  
-        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
+        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
  
          // VGATHER* takes an *i8 src pointer
-        Value *pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
+        Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
  
          Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
  #if USE_SIMD16_GATHERS
@@ -972,6 +1049,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          // calculate byte offset to the start of the VB
          Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
          pStreamBase = GEP(pStreamBase, baseOffset);
+        Value* pStreamBaseGFX = ADD(stream, baseOffset);
  
          // if we have a start offset, subtract from max vertex. Used for OOB check
          maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
@@ -1265,7 +1343,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                  // But, we know that elements must be aligned for FETCH. :)
                                  // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
                                  Value *shiftedOffsets16 = LSHR(vOffsets16, 1);
-                                pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2);
+                                pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBaseGFX, shiftedOffsets16, vGatherMask16, 2, GFX_MEM_CLIENT_FETCH);
                              }
                              else
                              {
@@ -1282,9 +1360,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                  currentVertexElement = 0;
                              }
                          }
-
-                        // offset base to the next component in the vertex to gather
-                        pStreamBase = GEP(pStreamBase, C((char)4));
  #else
                          if (isComponentEnabled(compMask, i))
                          {
@@ -1297,7 +1372,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                  // But, we know that elements must be aligned for FETCH. :)
                                  // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
                                  Value *vShiftedOffsets = LSHR(vOffsets, 1);
-                                vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
+                                vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
                              }
                              else
                              {
@@ -1315,10 +1390,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                  currentVertexElement = 0;
                              }
                          }
+#endif
  
                          // offset base to the next component in the vertex to gather
                          pStreamBase = GEP(pStreamBase, C((char)4));
-#endif
+                        pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
                      }
                  }
                      break;
@@ -1772,7 +1848,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
  
          // if valid, load the index. if not, load 0 from the stack
          Value* pValid = SELECT(mask, pIndex, pZeroIndex);
-        Value *index = LOAD(pValid, "valid index");
+        Value *index = LOAD(pValid, "valid index", GFX_MEM_CLIENT_FETCH);
  
          // zero extended index to 32 bits and insert into the correct simd lane
          index = Z_EXT(index, mInt32Ty);
@@ -1808,13 +1884,11 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
      //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
      //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
      Value* vMaxIndex = VBROADCAST(numIndicesLeft);
-    Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
-
-    // VMASKLOAD takes an *i8 src pointer
-    pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
+    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
  
      // Load the indices; OOB loads 0
-    return MASKLOADD(pIndices,vIndexMask);
+    pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
+    return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -2747,8 +2821,10 @@ Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl)
              Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
              return VBROADCAST_16(pId);
          }
+
+
          case StoreSrc:
-        default:        
+        default:
              SWR_INVALID("Invalid component control");
              return VUNDEF_I_16();
      }
@@ -2763,15 +2839,15 @@ Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
  {
      switch (ctrl)
      {
-        case NoStore:
-            return VUNDEF_I();
-        case Store0:
-            return VIMMED1(0);
-        case Store1Fp:
-            return VIMMED1(1.0f);
-        case Store1Int:
-            return VIMMED1(1);
-        case StoreVertexId:
+    case NoStore:
+        return VUNDEF_I();
+    case Store0:
+        return VIMMED1(0);
+    case Store1Fp:
+        return VIMMED1(1.0f);
+    case Store1Int:
+        return VIMMED1(1);
+    case StoreVertexId:
          {
  #if USE_SIMD16_SHADERS
              Value *pId;
@@ -2788,15 +2864,17 @@ Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
  #endif
              return pId;
          }
-        case StoreInstanceId:
+    case StoreInstanceId:
          {
              Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
              return VBROADCAST(pId);
          }
-        case StoreSrc:
-        default:
-            SWR_INVALID("Invalid component control");
-            return VUNDEF_I();
+
+
+    case StoreSrc:
+    default:
+        SWR_INVALID("Invalid component control");
+        return VUNDEF_I();
      }
  }
  
@@ -2822,6 +2900,10 @@ bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
      }
  }
  
+// Serialize fetch shader codegen: two threads compiling the same fetch shader
+// simultaneously causes problems in the JIT cache implementation.
+// Only fetch is affected by this right now.
+static std::mutex gFetchCodegenMutex;
  
  //////////////////////////////////////////////////////////////////////////
  /// @brief JITs from fetch shader IR
@@ -2834,6 +2916,7 @@ PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
      JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
      PFN_FETCH_FUNC pfnFetch;
  
+    gFetchCodegenMutex.lock();
      pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
      // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
      pJitMgr->mIsModuleFinalized = true;
@@ -2848,6 +2931,9 @@ PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
  #endif
  
      pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
+    gFetchCodegenMutex.unlock();
+
+
  
      return pfnFetch;
  }