X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fswr%2Fswr_shader.cpp;h=afa184fc4664ab3d7373c56306595e72279d4e24;hb=5131b7a43f8488a797ec880ac2c905614da0aa03;hp=6fc0596ed6011f91211c3a190f204b309849b9f2;hpb=7bd5057fd153d5aa55c74624c2bda9651377711d;p=mesa.git diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp index 6fc0596ed60..afa184fc466 100644 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -27,11 +27,13 @@ #include "JitManager.h" #include "llvm-c/Core.h" #include "llvm/Support/CBindingWrapping.h" +#include "llvm/IR/LegacyPassManager.h" #pragma pop_macro("DEBUG") #include "state.h" #include "gen_state_llvm.h" #include "builder.h" +#include "functionpasses/passes.h" #include "tgsi/tgsi_strings.h" #include "util/u_format.h" @@ -47,12 +49,6 @@ #include "swr_state.h" #include "swr_screen.h" -#if HAVE_LLVM < 0x0500 -namespace llvm { -typedef AttributeSet AttributeList; -} -#endif - using namespace SwrJit; using namespace llvm; @@ -104,7 +100,7 @@ swr_generate_sampler_key(const struct lp_tgsi_info &info, key.nr_sampler_views = info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; for (unsigned i = 0; i < key.nr_sampler_views; i++) { - if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { + if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) { const struct pipe_sampler_view *view = ctx->sampler_views[shader_type][i]; lp_sampler_static_texture_state( @@ -165,6 +161,9 @@ swr_generate_fs_key(struct swr_jit_fs_key &key, sizeof(key.vs_output_semantic_idx)); swr_generate_sampler_key(swr_fs->info, ctx, PIPE_SHADER_FRAGMENT, key); + + key.poly_stipple_enable = ctx->rasterizer->poly_stipple_enable && + ctx->poly_stipple.prim_is_poly; } void @@ -223,6 +222,9 @@ struct BuilderSWR : public Builder { gallivm_free_ir(gallivm); } + void WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, + unsigned slot, unsigned channel); + struct gallivm_state *gallivm; PFN_VERTEX_FUNC CompileVS(struct swr_context *ctx, swr_jit_vs_key &key); PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_fs_key &key); @@ -339,26 +341,59 @@ BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_ifac LLVMValueRef swizzle_index) { swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_iface; + Value *vert_index = unwrap(vertex_index); + Value *attr_index = unwrap(attrib_index); IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - assert(is_vindex_indirect == false && is_aindex_indirect == false); + if (is_vindex_indirect || is_aindex_indirect) { + int i; + Value *res = unwrap(bld_base->base.zero); + struct lp_type type = bld_base->base.type; - Value *attrib = - LOAD(GEP(iface->pVtxAttribMap, {C(0), unwrap(attrib_index)})); + for (i = 0; i < type.length; i++) { + Value *vert_chan_index = vert_index; + Value *attr_chan_index = attr_index; - Value *pInput = - LOAD(GEP(iface->pGsCtx, - {C(0), - C(SWR_GS_CONTEXT_vert), - unwrap(vertex_index), - C(0), - attrib, - unwrap(swizzle_index)})); + if (is_vindex_indirect) { + vert_chan_index = VEXTRACT(vert_index, C(i)); + } + if (is_aindex_indirect) { + attr_chan_index = VEXTRACT(attr_index, C(i)); + } + + Value *attrib = + LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_chan_index})); + + Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts}); + Value *pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride}); + + Value *pVector = ADD(MUL(vert_chan_index, pInputVertStride), attrib); + Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)})); + + Value *value = VEXTRACT(pInput, C(i)); + res = VINSERT(res, value, C(i)); + } - return wrap(pInput); + return wrap(res); + } else { + Value *attrib = LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_index})); + + Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts}); + Value *pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride}); + + Value *pVector = ADD(MUL(vert_index, pInputVertStride), attrib); + + Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)})); + + return wrap(pInput); + } } +// GS output stream layout +#define VERTEX_COUNT_SIZE 32 +#define CONTROL_HEADER_SIZE (8*32) + void BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base, struct lp_build_tgsi_context * bld_base, @@ -366,60 +401,68 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base LLVMValueRef emitted_vertices_vec) { swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - SWR_GS_STATE *pGS = iface->pGsState; IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - const uint32_t simdVertexStride = sizeof(simdvertex); - const uint32_t numSimdBatches = (pGS->maxNumVerts + 7) / 8; - const uint32_t inputPrimStride = numSimdBatches * simdVertexStride; + const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE; + const uint32_t attribSize = 4 * sizeof(float); + const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS; + Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), VIMMED1(vertSize)); - Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream }); - Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask }); - Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8)); + Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask}); + Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, mVWidth)); - Value *vOffsets = C({ - inputPrimStride * 0, - inputPrimStride * 1, - inputPrimStride * 2, - inputPrimStride * 3, - inputPrimStride * 4, - inputPrimStride * 5, - inputPrimStride * 6, - inputPrimStride * 7 } ); - - Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), 3); - Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), 7); + Value *pStack = STACKSAVE(); + Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane masking for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) { uint32_t attribSlot = attrib; - if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) - attribSlot = VERTEX_POINT_SIZE_SLOT; - else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PRIMID) - attribSlot = VERTEX_PRIMID_SLOT; - else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER) - attribSlot = VERTEX_RTAI_SLOT; - - Value *vOffsetsAttrib = - ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex)))); - vOffsetsAttrib = - ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector)))); - vOffsetsAttrib = - ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float)))); - - for (uint32_t channel = 0; channel < 4; ++channel) { - Value *vData = LOAD(unwrap(outputs[attrib][channel])); - Value *vPtrs = GEP(pStream, vOffsetsAttrib); - - vPtrs = BITCAST(vPtrs, - VectorType::get(PointerType::get(mFP32Ty, 0), 8)); - - MASKED_SCATTER(vData, vPtrs, 32, vMask1); - - vOffsetsAttrib = - ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar))); + uint32_t sgvChannel = 0; + if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) { + attribSlot = VERTEX_SGV_SLOT; + sgvChannel = VERTEX_SGV_POINT_SIZE_COMP; + } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER) { + attribSlot = VERTEX_SGV_SLOT; + sgvChannel = VERTEX_SGV_RTAI_COMP; + } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) { + attribSlot = VERTEX_POSITION_SLOT; + } else { + attribSlot = VERTEX_ATTRIB_START_SLOT + attrib; + if (iface->info->writes_position) { + attribSlot--; + } + } + + Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ? + + for (uint32_t lane = 0; lane < mVWidth; ++lane) { + Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane)); + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); + Value *pStreamOffset = GEP(pStream, pLaneOffset); + pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy); + + Value *pLaneMask = VEXTRACT(vMask1, C(lane)); + pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); + + for (uint32_t channel = 0; channel < 4; ++channel) { + Value *vData; + + if (attribSlot == VERTEX_SGV_SLOT) + vData = LOAD(unwrap(outputs[attrib][0])); + else + vData = LOAD(unwrap(outputs[attrib][channel])); + + if (attribSlot != VERTEX_SGV_SLOT || + sgvChannel == channel) { + vData = VEXTRACT(vData, C(lane)); + STORE(vData, pStreamOffset); + } + pStreamOffset = GEP(pStreamOffset, C(1)); + } } } + + STACKRESTORE(pStack); } void @@ -429,12 +472,9 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba LLVMValueRef emitted_prims_vec) { swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - SWR_GS_STATE *pGS = iface->pGsState; IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - Value *pCutBuffer = - LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}); Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask }); Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8)); @@ -456,31 +496,29 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba mask = AND(mask, cmpMask); vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8)); - const uint32_t cutPrimStride = - (pGS->maxNumVerts + JM()->mVWidth - 1) / JM()->mVWidth; - Value *vOffsets = C({ - (uint32_t)(cutPrimStride * 0), - (uint32_t)(cutPrimStride * 1), - (uint32_t)(cutPrimStride * 2), - (uint32_t)(cutPrimStride * 3), - (uint32_t)(cutPrimStride * 4), - (uint32_t)(cutPrimStride * 5), - (uint32_t)(cutPrimStride * 6), - (uint32_t)(cutPrimStride * 7) } ); - vCount = SUB(vCount, VIMMED1(1)); - Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), vOffsets); + Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE)); Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8))); vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8)); - Value *vPtrs = GEP(pCutBuffer, vOffset); - vPtrs = - BITCAST(vPtrs, VectorType::get(PointerType::get(mInt8Ty, 0), JM()->mVWidth)); + Value *pStack = STACKSAVE(); + Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking + + for (uint32_t lane = 0; lane < mVWidth; ++lane) { + Value *vLaneOffset = VEXTRACT(vOffset, C(lane)); + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); + Value *pStreamOffset = GEP(pStream, vLaneOffset); - Value *vGather = MASKED_GATHER(vPtrs, 32, vMask1); - vValue = OR(vGather, vValue); - MASKED_SCATTER(vValue, vPtrs, 32, vMask1); + Value *pLaneMask = VEXTRACT(vMask1, C(lane)); + pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); + + Value *vVal = LOAD(pStreamOffset); + vVal = OR(vVal, VEXTRACT(vValue, C(lane))); + STORE(vVal, pStreamOffset); + } + + STACKRESTORE(pStack); } void @@ -493,7 +531,14 @@ BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base, IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - STORE(unwrap(total_emitted_vertices_vec), iface->pGsCtx, {0, SWR_GS_CONTEXT_vertexCount}); + // Store emit count to each output stream in the first DWORD + for (uint32_t lane = 0; lane < mVWidth; ++lane) + { + Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); + pStream = BITCAST(pStream, mInt32PtrTy); + Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane)); + STORE(pLaneCount, pStream); + } } PFN_GS_FUNC @@ -502,6 +547,8 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) SWR_GS_STATE *pGS = &ctx->gs->gsState; struct tgsi_shader_info *info = &ctx->gs->info.base; + memset(pGS, 0, sizeof(*pGS)); + pGS->gsEnable = true; pGS->numInputAttribs = info->num_inputs; @@ -510,14 +557,24 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) pGS->maxNumVerts = info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; pGS->instanceCount = info->properties[TGSI_PROPERTY_GS_INVOCATIONS]; - pGS->emitsRenderTargetArrayIndex = info->writes_layer; - pGS->emitsPrimitiveID = info->writes_primid; - pGS->emitsViewportArrayIndex = info->writes_viewport_index; - // XXX: single stream for now... pGS->isSingleStream = true; pGS->singleStreamID = 0; + pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize + pGS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize + pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset; + pGS->outputVertexSize = SWR_VTX_NUM_SLOTS; + pGS->controlDataSize = 8; // GS ouputs max of 8 32B units + pGS->controlDataOffset = VERTEX_COUNT_SIZE; + pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE; + + pGS->allocationSize = + VERTEX_COUNT_SIZE + // vertex count + CONTROL_HEADER_SIZE + // control header + (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex + pGS->maxNumVerts; // num verts + struct swr_geometry_shader *gs = ctx->gs; LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; @@ -527,10 +584,9 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) AttrBuilder attrBuilder; attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - AttributeList attrSet = AttributeList::get( - JM()->mContext, AttributeList::FunctionIndex, attrBuilder); std::vector gsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(mInt8Ty, 0), PointerType::get(Gen_SWR_GS_CONTEXT(JM()), 0)}; FunctionType *vsFuncType = FunctionType::get(Type::getVoidTy(JM()->mContext), gsArgs, false); @@ -540,7 +596,13 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) GlobalValue::ExternalLinkage, "GS", JM()->mpCurrentModule); - pFunction->addAttributes(AttributeList::FunctionIndex, attrSet); +#if HAVE_LLVM < 0x0500 + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); +#else + pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); +#endif BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); IRB()->SetInsertPoint(block); @@ -549,6 +611,8 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) auto argitr = pFunction->arg_begin(); Value *hPrivateData = &*argitr++; hPrivateData->setName("hPrivateData"); + Value *pWorkerData = &*argitr++; + pWorkerData->setName("pWorkerData"); Value *pGsCtx = &*argitr++; pGsCtx->setName("gsCtx"); @@ -573,8 +637,15 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) ubyte semantic_name = info->input_semantic_name[slot]; ubyte semantic_idx = info->input_semantic_index[slot]; - unsigned vs_slot = - locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base) + 1; + unsigned vs_slot = locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); + + vs_slot += VERTEX_ATTRIB_START_SLOT; + + if (ctx->vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) + vs_slot--; + + if (semantic_name == TGSI_SEMANTIC_POSITION) + vs_slot = VERTEX_POSITION_SLOT; STORE(C(vs_slot), vtxAttribMap, {0, slot}); mapConstants.push_back(C(vs_slot)); @@ -586,10 +657,11 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) lp_type_float_vec(32, 32 * 8), wrap(mask_val)); // zero out cut buffer so we can load/modify/store bits - MEMSET(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}), - C((char)0), - pGS->instanceCount * ((pGS->maxNumVerts + 7) / 8) * JM()->mVWidth, - sizeof(float) * KNOB_SIMD_WIDTH); + for (uint32_t lane = 0; lane < mVWidth; ++lane) + { + Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); + MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH); + } struct swr_gs_llvm_iface gs_iface; gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input; @@ -654,6 +726,23 @@ swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key) return func; } +void +BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, unsigned slot, unsigned channel) +{ +#if USE_SIMD16_FRONTEND && !USE_SIMD16_VS + // interleave the simdvertex components into the dest simd16vertex + // slot16offset = slot8offset * 2 + // comp16offset = comp8offset * 2 + alternateOffset + + Value *offset = LOAD(pVsContext, { 0, SWR_VS_CONTEXT_AlternateOffset }); + Value *pOut = GEP(pVtxOutput, { C(0), C(0), C(slot * 2), offset } ); + STORE(pVal, pOut, {channel * 2}); +#else + Value *pOut = GEP(pVtxOutput, {0, 0, slot}); + STORE(pVal, pOut, {0, channel}); +#endif +} + PFN_VERTEX_FUNC BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) { @@ -666,10 +755,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) AttrBuilder attrBuilder; attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - AttributeList attrSet = AttributeList::get( - JM()->mContext, AttributeList::FunctionIndex, attrBuilder); std::vector vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(mInt8Ty, 0), PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)}; FunctionType *vsFuncType = FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false); @@ -679,7 +767,13 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) GlobalValue::ExternalLinkage, "VS", JM()->mpCurrentModule); - pFunction->addAttributes(AttributeList::FunctionIndex, attrSet); +#if HAVE_LLVM < 0x0500 + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); +#else + pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); +#endif BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); IRB()->SetInsertPoint(block); @@ -688,6 +782,8 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) auto argitr = pFunction->arg_begin(); Value *hPrivateData = &*argitr++; hPrivateData->setName("hPrivateData"); + Value *pWorkerData = &*argitr++; + pWorkerData->setName("pWorkerData"); Value *pVsCtx = &*argitr++; pVsCtx->setName("vsCtx"); @@ -699,6 +795,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) const_sizes_ptr->setName("num_vs_constants"); Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin}); +#if USE_SIMD16_VS + vtxInput = BITCAST(vtxInput, PointerType::get(Gen_simd16vertex(JM()), 0)); +#endif for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { const unsigned mask = swr_vs->info.base.input_usage_mask[attrib]; @@ -716,11 +815,22 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) struct lp_bld_tgsi_system_values system_values; memset(&system_values, 0, sizeof(system_values)); system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID})); + +#if USE_SIMD16_VS + system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID16})); +#else system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID})); +#endif + +#if USE_SIMD16_VS + uint32_t vectorWidth = mVWidth16; +#else + uint32_t vectorWidth = mVWidth; +#endif lp_build_tgsi_soa(gallivm, swr_vs->pipe.tokens, - lp_type_float_vec(32, 32 * 8), + lp_type_float_vec(32, 32 * vectorWidth), NULL, // mask wrap(consts_ptr), wrap(const_sizes_ptr), @@ -738,18 +848,34 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout}); +#if USE_SIMD16_VS + vtxOutput = BITCAST(vtxOutput, PointerType::get(Gen_simd16vertex(JM()), 0)); +#endif for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { if (!outputs[attrib][channel]) continue; - Value *val = LOAD(unwrap(outputs[attrib][channel])); + Value *val; + uint32_t outSlot; - uint32_t outSlot = attrib; - if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) - outSlot = VERTEX_POINT_SIZE_SLOT; - STORE(val, vtxOutput, {0, 0, outSlot, channel}); + if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) { + if (channel != VERTEX_SGV_POINT_SIZE_COMP) + continue; + val = LOAD(unwrap(outputs[attrib][0])); + outSlot = VERTEX_SGV_SLOT; + } else if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) { + val = LOAD(unwrap(outputs[attrib][channel])); + outSlot = VERTEX_POSITION_SLOT; + } else { + val = LOAD(unwrap(outputs[attrib][channel])); + outSlot = VERTEX_ATTRIB_START_SLOT + attrib; + if (swr_vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) + outSlot--; + } + + WriteVS(val, pVsCtx, vtxOutput, outSlot, channel); } } @@ -759,8 +885,8 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) unsigned cv = 0; if (swr_vs->info.base.writes_clipvertex) { - cv = 1 + locate_linkage(TGSI_SEMANTIC_CLIPVERTEX, 0, - &swr_vs->info.base); + cv = locate_linkage(TGSI_SEMANTIC_CLIPVERTEX, 0, + &swr_vs->info.base); } else { for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { if (swr_vs->info.base.output_semantic_name[i] == TGSI_SEMANTIC_POSITION && @@ -779,14 +905,14 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) // clip distance overrides user clip planes if ((swr_vs->info.base.clipdist_writemask & clip_mask & (1 << val)) || ((swr_vs->info.base.culldist_writemask << swr_vs->info.base.num_written_clipdistance) & (1 << val))) { - unsigned cv = 1 + locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1, - &swr_vs->info.base); + unsigned cv = locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1, + &swr_vs->info.base); if (val < 4) { LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val], ""); - STORE(unwrap(dist), vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_LO_SLOT, val}); + WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val); } else { LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val - 4], ""); - STORE(unwrap(dist), vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4}); + WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4); } continue; } @@ -798,15 +924,26 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) Value *py = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 1})); Value *pz = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 2})); Value *pw = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 3})); - Value *dist = FADD(FMUL(unwrap(cx), VBROADCAST(px)), - FADD(FMUL(unwrap(cy), VBROADCAST(py)), - FADD(FMUL(unwrap(cz), VBROADCAST(pz)), - FMUL(unwrap(cw), VBROADCAST(pw))))); +#if USE_SIMD16_VS + Value *bpx = VBROADCAST_16(px); + Value *bpy = VBROADCAST_16(py); + Value *bpz = VBROADCAST_16(pz); + Value *bpw = VBROADCAST_16(pw); +#else + Value *bpx = VBROADCAST(px); + Value *bpy = VBROADCAST(py); + Value *bpz = VBROADCAST(pz); + Value *bpw = VBROADCAST(pw); +#endif + Value *dist = FADD(FMUL(unwrap(cx), bpx), + FADD(FMUL(unwrap(cy), bpy), + FADD(FMUL(unwrap(cz), bpz), + FMUL(unwrap(cw), bpw)))); if (val < 4) - STORE(dist, vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_LO_SLOT, val}); + WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val); else - STORE(dist, vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4}); + WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4); } } @@ -843,13 +980,40 @@ swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key) return func; } +unsigned +swr_so_adjust_attrib(unsigned in_attrib, + swr_vertex_shader *swr_vs) +{ + ubyte semantic_name; + unsigned attrib; + + attrib = in_attrib + VERTEX_ATTRIB_START_SLOT; + + if (swr_vs) { + semantic_name = swr_vs->info.base.output_semantic_name[in_attrib]; + if (semantic_name == TGSI_SEMANTIC_POSITION) { + attrib = VERTEX_POSITION_SLOT; + } else if (semantic_name == TGSI_SEMANTIC_PSIZE) { + attrib = VERTEX_SGV_SLOT; + } else if (semantic_name == TGSI_SEMANTIC_LAYER) { + attrib = VERTEX_SGV_SLOT; + } else { + if (swr_vs->info.base.writes_position) { + attrib--; + } + } + } + + return attrib; +} + static unsigned locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info) { for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { if ((info->output_semantic_name[i] == name) && (info->output_semantic_index[i] == index)) { - return i - 1; // position is not part of the linkage + return i; } } @@ -877,10 +1041,9 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) AttrBuilder attrBuilder; attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - AttributeList attrSet = AttributeList::get( - JM()->mContext, AttributeList::FunctionIndex, attrBuilder); std::vector fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(mInt8Ty, 0), PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)}; FunctionType *funcType = FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false); @@ -889,7 +1052,13 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) GlobalValue::ExternalLinkage, "FS", JM()->mpCurrentModule); - pFunction->addAttributes(AttributeList::FunctionIndex, attrSet); +#if HAVE_LLVM < 0x0500 + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); +#else + pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); +#endif BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); IRB()->SetInsertPoint(block); @@ -898,6 +1067,8 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) auto args = pFunction->arg_begin(); Value *hPrivateData = &*args++; hPrivateData->setName("hPrivateData"); + Value *pWorkerData = &*args++; + pWorkerData->setName("pWorkerData"); Value *pPS = &*args++; pPS->setName("psCtx"); @@ -991,23 +1162,23 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) inputs[attrib][3] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center}, "vOneOverW")); continue; - } else if (semantic_name == TGSI_SEMANTIC_PRIMID) { - Value *primID = LOAD(pPS, {0, SWR_PS_CONTEXT_primID}, "primID"); - inputs[attrib][0] = wrap(VECTOR_SPLAT(JM()->mVWidth, primID)); - inputs[attrib][1] = wrap(VIMMED1(0)); - inputs[attrib][2] = wrap(VIMMED1(0)); - inputs[attrib][3] = wrap(VIMMED1(0)); - continue; } unsigned linkedAttrib = - locate_linkage(semantic_name, semantic_idx, pPrevShader); + locate_linkage(semantic_name, semantic_idx, pPrevShader) - 1; - if (semantic_name == TGSI_SEMANTIC_GENERIC && + uint32_t extraAttribs = 0; + if (semantic_name == TGSI_SEMANTIC_PRIMID && !ctx->gs) { + /* non-gs generated primID - need to grab from swizzleMap override */ + linkedAttrib = pPrevShader->num_outputs - 1; + swr_fs->constantMask |= 1 << linkedAttrib; + extraAttribs++; + } else if (semantic_name == TGSI_SEMANTIC_GENERIC && key.sprite_coord_enable & (1 << semantic_idx)) { /* we add an extra attrib to the backendState in swr_update_derived. */ - linkedAttrib = pPrevShader->num_outputs - 1; + linkedAttrib = pPrevShader->num_outputs + extraAttribs - 1; swr_fs->pointSpriteMask |= (1 << linkedAttrib); + extraAttribs++; } else if (linkedAttrib == 0xFFFFFFFF) { inputs[attrib][0] = wrap(VIMMED1(0.0f)); inputs[attrib][1] = wrap(VIMMED1(0.0f)); @@ -1030,7 +1201,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) Value *offset = NULL; if (semantic_name == TGSI_SEMANTIC_COLOR && key.light_twoside) { bcolorAttrib = locate_linkage( - TGSI_SEMANTIC_BCOLOR, semantic_idx, pPrevShader); + TGSI_SEMANTIC_BCOLOR, semantic_idx, pPrevShader) - 1; /* Neither front nor back colors were available. Nothing to load. */ if (bcolorAttrib == 0xFFFFFFFF && linkedAttrib == 0xFFFFFFFF) continue; @@ -1099,17 +1270,58 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) memset(&system_values, 0, sizeof(system_values)); struct lp_build_mask_context mask; + bool uses_mask = false; - if (swr_fs->info.base.uses_kill) { - Value *mask_val = LOAD(pPS, {0, SWR_PS_CONTEXT_activeMask}, "activeMask"); + if (swr_fs->info.base.uses_kill || + key.poly_stipple_enable) { + Value *vActiveMask = NULL; + if (swr_fs->info.base.uses_kill) { + vActiveMask = LOAD(pPS, {0, SWR_PS_CONTEXT_activeMask}, "activeMask"); + } + if (key.poly_stipple_enable) { + // first get fragment xy coords and clip to stipple bounds + Value *vXf = LOAD(pPS, {0, SWR_PS_CONTEXT_vX, PixelPositions_UL}); + Value *vYf = LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_UL}); + Value *vXu = FP_TO_UI(vXf, mSimdInt32Ty); + Value *vYu = FP_TO_UI(vYf, mSimdInt32Ty); + + // stipple pattern is 32x32, which means that one line of stipple + // is stored in one word: + // vXstipple is bit offset inside 32-bit stipple word + // vYstipple is word index is stipple array + Value *vXstipple = AND(vXu, VIMMED1(0x1f)); // & (32-1) + Value *vYstipple = AND(vYu, VIMMED1(0x1f)); // & (32-1) + + // grab stipple pattern base address + Value *stipplePtr = GEP(hPrivateData, {0, swr_draw_context_polyStipple, 0}); + stipplePtr = BITCAST(stipplePtr, mInt8PtrTy); + + // peform a gather to grab stipple words for each lane + Value *vStipple = GATHERDD(VUNDEF_I(), stipplePtr, vYstipple, + VIMMED1(0xffffffff), 4); + + // create a mask with one bit corresponding to the x stipple + // and AND it with the pattern, to see if we have a bit + Value *vBitMask = LSHR(VIMMED1(0x80000000), vXstipple); + Value *vStippleMask = AND(vStipple, vBitMask); + vStippleMask = ICMP_NE(vStippleMask, VIMMED1(0)); + vStippleMask = VMASK(vStippleMask); + + if (swr_fs->info.base.uses_kill) { + vActiveMask = AND(vActiveMask, vStippleMask); + } else { + vActiveMask = vStippleMask; + } + } lp_build_mask_begin( - &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val)); + &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(vActiveMask)); + uses_mask = true; } lp_build_tgsi_soa(gallivm, swr_fs->pipe.tokens, lp_type_float_vec(32, 32 * 8), - swr_fs->info.base.uses_kill ? &mask : NULL, // mask + uses_mask ? &mask : NULL, // mask wrap(consts_ptr), wrap(const_sizes_ptr), &system_values, @@ -1172,13 +1384,13 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) } LLVMValueRef mask_result = 0; - if (swr_fs->info.base.uses_kill) { + if (uses_mask) { mask_result = lp_build_mask_end(&mask); } IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - if (swr_fs->info.base.uses_kill) { + if (uses_mask) { STORE(unwrap(mask_result), pPS, {0, SWR_PS_CONTEXT_activeMask}); } @@ -1188,6 +1400,11 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) gallivm_compile_module(gallivm); + // after the gallivm passes, we have to lower the core's intrinsics + llvm::legacy::FunctionPassManager lowerPass(JM()->mpCurrentModule); + lowerPass.add(createLowerX86Pass(this)); + lowerPass.run(*pFunction); + PFN_PIXEL_KERNEL kernel = (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction)); debug_printf("frag shader %p\n", kernel);