{ 0.0f, 0.0f, 0.0f, 0.0f },
1, 1
},
- // padding (0x5)
+ // R64G64_FLOAT (0x5)
{
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
+ "R64G64_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 64, 64, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ false, // isLuminance
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
},
+
// R32G32B32X32_FLOAT (0x6)
{
"R32G32B32X32_FLOAT",
{ 0.0f, 0.0f, 0.0f, 0.0f },
1, 1
},
- // padding (0x8D)
+ // R64_FLOAT (0x8D)
{
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
+ "R64_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 64, 0, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ false, // isLuminance
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
},
+
// R16G16B16X16_UNORM (0x8E)
{
"R16G16B16X16_UNORM",
1, // bcHeight
},
- // padding (0x197)
+ // R64G64B64A64_FLOAT (0x197)
{
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
+ "R64G64B64A64_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 64, 64, 64, 64 }, // Bits per component
+ 256, // Bits per element
+ 32, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ false, // isLuminance
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
},
- // padding (0x198)
+
+ // R64G64B64_FLOAT (0x198)
{
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
+ "R64G64B64_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 64, 64, 64, 0 }, // Bits per component
+ 192, // Bits per element
+ 24, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ false, // isLuminance
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
},
+
// BC4_SNORM (0x199)
{
"BC4_SNORM",
R32G32B32A32_FLOAT = 0x0,
R32G32B32A32_SINT = 0x1,
R32G32B32A32_UINT = 0x2,
+ R64G64_FLOAT = 0x5,
R32G32B32X32_FLOAT = 0x6,
R32G32B32A32_SSCALED = 0x7,
R32G32B32A32_USCALED = 0x8,
R32_FLOAT_X8X24_TYPELESS = 0x88,
X32_TYPELESS_G8X24_UINT = 0x89,
L32A32_FLOAT = 0x8A,
+ R64_FLOAT = 0x8D,
R16G16B16X16_UNORM = 0x8E,
R16G16B16X16_FLOAT = 0x8F,
L32X32_FLOAT = 0x91,
R8G8B8_SNORM = 0x194,
R8G8B8_SSCALED = 0x195,
R8G8B8_USCALED = 0x196,
+ R64G64B64A64_FLOAT = 0x197,
+ R64G64B64_FLOAT = 0x198,
BC4_SNORM = 0x199,
BC5_SNORM = 0x19A,
R16G16B16_FLOAT = 0x19B,
typedef Format4<32, 32, 32, 32> FormatT;
};
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R64G64_FLOAT> - Format traits specialization for R64G64_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R64G64_FLOAT> :
+ ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
+ FormatSwizzle<0, 1>,
+ Defaults<0, 0, 0, 0x3f800000>
+{
+ static const uint32_t bpp{ 128 };
+ static const uint32_t numComps{ 2 };
+ static const bool hasAlpha{ false };
+ static const uint32_t alphaComp{ 0 };
+ static const bool isSRGB{ false };
+ static const bool isBC{ false };
+ static const bool isSubsampled{ false };
+ static const uint32_t bcWidth{ 1 };
+ static const uint32_t bcHeight{ 1 };
+
+ typedef Transpose64_64 TransposeT;
+ typedef Format2<64, 64> FormatT;
+};
+
//////////////////////////////////////////////////////////////////////////
/// FormatTraits<R32G32B32X32_FLOAT> - Format traits specialization for R32G32B32X32_FLOAT
//////////////////////////////////////////////////////////////////////////
typedef Format2<32, 32> FormatT;
};
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R64_FLOAT> - Format traits specialization for R64_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R64_FLOAT> :
+ ComponentTraits<SWR_TYPE_FLOAT, 64>,
+ FormatSwizzle<0>,
+ Defaults<0, 0, 0, 0x3f800000>
+{
+ static const uint32_t bpp{ 64 };
+ static const uint32_t numComps{ 1 };
+ static const bool hasAlpha{ false };
+ static const uint32_t alphaComp{ 0 };
+ static const bool isSRGB{ false };
+ static const bool isBC{ false };
+ static const bool isSubsampled{ false };
+ static const uint32_t bcWidth{ 1 };
+ static const uint32_t bcHeight{ 1 };
+
+ typedef TransposeSingleComponent<64> TransposeT;
+ typedef Format1<64> FormatT;
+};
+
//////////////////////////////////////////////////////////////////////////
/// FormatTraits<R16G16B16X16_UNORM> - Format traits specialization for R16G16B16X16_UNORM
//////////////////////////////////////////////////////////////////////////
typedef Format3<8, 8, 8> FormatT;
};
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R64G64B64A64_FLOAT> - Format traits specialization for R64G64B64A64_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R64G64B64A64_FLOAT> :
+ ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
+ FormatSwizzle<0, 1, 2, 3>,
+ Defaults<0, 0, 0, 0x3f800000>
+{
+ static const uint32_t bpp{ 256 };
+ static const uint32_t numComps{ 4 };
+ static const bool hasAlpha{ true };
+ static const uint32_t alphaComp{ 3 };
+ static const bool isSRGB{ false };
+ static const bool isBC{ false };
+ static const bool isSubsampled{ false };
+ static const uint32_t bcWidth{ 1 };
+ static const uint32_t bcHeight{ 1 };
+
+ typedef Transpose64_64_64_64 TransposeT;
+ typedef Format4<64, 64, 64, 64> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R64G64B64_FLOAT> - Format traits specialization for R64G64B64_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R64G64B64_FLOAT> :
+ ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
+ FormatSwizzle<0, 1, 2>,
+ Defaults<0, 0, 0, 0x3f800000>
+{
+ static const uint32_t bpp{ 192 };
+ static const uint32_t numComps{ 3 };
+ static const bool hasAlpha{ false };
+ static const uint32_t alphaComp{ 0 };
+ static const bool isSRGB{ false };
+ static const bool isBC{ false };
+ static const bool isSubsampled{ false };
+ static const uint32_t bcWidth{ 1 };
+ static const uint32_t bcHeight{ 1 };
+
+ typedef Transpose64_64_64 TransposeT;
+ typedef Format3<64, 64, 64> FormatT;
+};
+
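As a quick sanity check, the new specializations can be exercised with compile-time assertions like the following (illustrative only, not part of the patch; assumes the header that defines FormatTraits is included):

    static_assert(FormatTraits<R64G64_FLOAT>::bpp == 128, "two 64-bit components");
    static_assert(FormatTraits<R64_FLOAT>::numComps == 1, "scalar 64-bit float");
    static_assert(FormatTraits<R64G64B64_FLOAT>::bpp == 192, "three 64-bit components");
    static_assert(FormatTraits<R64G64B64A64_FLOAT>::hasAlpha, "alpha is component 3");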
//////////////////////////////////////////////////////////////////////////
/// FormatTraits<BC4_SNORM> - Format traits specialization for BC4_SNORM
//////////////////////////////////////////////////////////////////////////
#endif
};
+//////////////////////////////////////////////////////////////////////////
+/// Transpose64
+//////////////////////////////////////////////////////////////////////////
+struct Transpose64
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Performs an SOA to AOS conversion
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+ static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose64_64
+//////////////////////////////////////////////////////////////////////////
+struct Transpose64_64
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Performs an SOA to AOS conversion
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+ static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose64_64_64
+//////////////////////////////////////////////////////////////////////////
+struct Transpose64_64_64
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Performs an SOA to AOS conversion
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+ static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose64_64_64_64
+//////////////////////////////////////////////////////////////////////////
+struct Transpose64_64_64_64
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Performs an SOA to AOS conversion
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+ static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#endif
+};
+
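A note on the deleted Transpose methods above: the 64-bit formats are vertex-fetch-only, so the transposes are declared (to satisfy the FormatTraits TransposeT typedefs) but deleted, presumably so that any tile load/store path that tried to instantiate an SOA-to-AOS conversion for them fails at compile time instead of silently doing nothing. A hypothetical call site would be rejected like this:

    // Transpose64_64::Transpose(pSrcSOA, pDstAOS);
    // error: use of deleted function 'static void Transpose64_64::Transpose(const uint8_t*, uint8_t*)'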
// helper function to unroll loops
template<int Begin, int End, int Step = 1>
struct UnrollerL {
return vGather;
}
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Generate a masked gather operation in LLVM IR. If not
+ /// supported on the underlying platform, emulate it with loads
+ /// @param vSrc - SIMD wide value that is returned for lanes whose mask bit is not set
+ /// @param pBase - Int8* base VB address pointer value
+ /// @param vIndices - SIMD wide value of VB byte offsets
+ /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+ /// @param scale - value to scale indices by
+ Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ {
+ Value* vGather;
+
+ // use avx2 gather instruction if available
+ if(JM()->mArch.AVX2())
+ {
+ vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
+ }
+ else
+ {
+ Value* pStack = STACKSAVE();
+
+ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
+ Value* vSrcPtr = ALLOCA(vSrc->getType());
+ STORE(vSrc, vSrcPtr);
+
+ vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
+ Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
+ Value *vOffsets = MUL(vIndices,vScaleVec);
+ Value *mask = MASK(vMask);
+ for(uint32_t i = 0; i < mVWidth/2; ++i)
+ {
+ // single component byte index
+ Value *offset = VEXTRACT(vOffsets,C(i));
+ // byte pointer to component
+ Value *loadAddress = GEP(pBase,offset);
+ loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
+ // pointer to the value to load if we're masking off a component
+ Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
+ Value *selMask = VEXTRACT(mask,C(i));
+ // select the real load address for active lanes; masked-off lanes re-load their vSrc copy instead
+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
+ Value *val = LOAD(validAddress);
+ vGather = VINSERT(vGather,val,C(i));
+ }
+ STACKRESTORE(pStack);
+ }
+ return vGather;
+ }
+
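Both branches of GATHERPD above compute the same per-lane result; in scalar terms it is roughly the following (a reference sketch only, not part of the patch; the helper name and the 4-lane width of the 256-bit case are illustrative):

    #include <cstdint>

    static inline void gather_pd_ref(double dst[4], const double src[4], const uint8_t* pBase,
                                     const int32_t indices[4], const bool mask[4], int32_t scale)
    {
        for (uint32_t i = 0; i < 4; ++i)
        {
            // active lanes load from the vertex buffer; masked-off lanes keep the src value
            dst[i] = mask[i] ? *(const double*)(pBase + (int64_t)indices[i] * scale) : src[i];
        }
    }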
//////////////////////////////////////////////////////////////////////////
/// @brief convert x86 <N x float> mask to llvm <N x i1> mask
Value* Builder::MASK(Value* vmask)
void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
+Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
+
void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
bool FetchJit::IsOddFormat(SWR_FORMAT format)
{
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
- if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
+ if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
{
return true;
}
}
}
break;
+ case 64:
+ {
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ if (isComponentEnabled(compMask, i))
+ {
+ // if we need to gather the component
+ if (compCtrl[i] == StoreSrc)
+ {
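+ // 256-bit double gathers hold only four lanes, so split the 8-wide mask (and the offsets below) into lo/hi halves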
+ Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
+ Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
+ vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
+ vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
+ vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
+ vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
+
+ Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
+ Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
+
+ Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
+
+ Value* pGatherLo = GATHERPD(vZeroDouble,
+ pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
+ Value* pGatherHi = GATHERPD(vZeroDouble,
+ pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
+
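+ // narrow each four-wide double gather to single precision, then repack lo/hi into one 8-wide float result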
+ pGatherLo = VCVTPD2PS(pGatherLo);
+ pGatherHi = VCVTPD2PS(pGatherHi);
+
+ Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
+
+ vVertexElements[currentVertexElement++] = pGather;
+ }
+ else
+ {
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+ }
+
+ if (currentVertexElement > 3)
+ {
+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+
+ }
+
+ // offset base to the next component in the vertex to gather
+ pStreamBase = GEP(pStreamBase, C((char)8));
+ }
+ }
+ break;
default:
SWR_ASSERT(0, "Tried to fetch invalid FP format");
break;
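For reference, the new 64-bit case reduces to this per-lane behavior for each enabled component (an illustrative sketch; laneMask, vOffsets and vVertexElements stand in for the SIMD values built above, and the 8-wide AVX path is assumed):

    for (uint32_t lane = 0; lane < 8; ++lane)
    {
        // gather the 64-bit source value, or 0.0 for masked-off lanes ...
        double d = laneMask[lane] ? *(const double*)(pStreamBase + vOffsets[lane]) : 0.0;
        // ... then store it to the vertex element, demoted to 32-bit float
        vVertexElements[lane] = (float)d;
    }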
fclose(fd);
#endif
+ pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
+
return pfnFetch;
}
}
intrinsics = [
+ ["VGATHERPD", "x86_avx2_gather_d_pd_256", ["src", "pBase", "indices", "mask", "scale"]],
["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]],
["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
["VPSHUFB", "x86_avx2_pshuf_b", ["a", "b"]],
["VPERMD", "x86_avx2_permd", ["a", "idx"]],
["VPERMPS", "x86_avx2_permps", ["idx", "a"]],
+ ["VCVTPD2PS", "x86_avx_cvt_pd2_ps_256", ["a"]],
["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]],
{PIPE_FORMAT_R32G32B32_FIXED, R32G32B32_SFIXED},
{PIPE_FORMAT_R32G32B32A32_FIXED, R32G32B32A32_SFIXED},
+ {PIPE_FORMAT_R64_FLOAT, R64_FLOAT},
+ {PIPE_FORMAT_R64G64_FLOAT, R64G64_FLOAT},
+ {PIPE_FORMAT_R64G64B64_FLOAT, R64G64B64_FLOAT},
+ {PIPE_FORMAT_R64G64B64A64_FLOAT, R64G64B64A64_FLOAT},
+
/* These formats have entries in SWR but don't have Load/StoreTile
* implementations. That means these aren't renderable, and thus having
* a mapping entry here is detrimental.