X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Ffreedreno%2Fa3xx%2Ffd3_program.c;h=64eeb106e535ad8f95c97ade27a2dad7e170e7cd;hb=5b2ef7853246b455f793417e5ae74e2a861afcae;hp=b5544e8c358b14ef9ef0d06f0b7ed59596edeedc;hpb=ee839cc6ef92d37ec6a44e6036e7a2c46172a16a;p=mesa.git diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index b5544e8c358..64eeb106e53 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -28,221 +28,73 @@ #include "pipe/p_state.h" #include "util/u_string.h" +#include "util/u_math.h" #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/u_format.h" -#include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_parse.h" -#include "freedreno_lowering.h" #include "freedreno_program.h" #include "fd3_program.h" -#include "fd3_compiler.h" #include "fd3_emit.h" #include "fd3_texture.h" -#include "fd3_util.h" +#include "fd3_format.h" -static void -delete_variant(struct fd3_shader_variant *v) -{ - ir3_shader_destroy(v->ir); - fd_bo_del(v->bo); - free(v); -} - -static void -assemble_variant(struct fd3_shader_variant *so) -{ - struct fd_context *ctx = fd_context(so->so->pctx); - uint32_t sz, *bin; - - bin = ir3_shader_assemble(so->ir, &so->info); - sz = so->info.sizedwords * 4; - - so->bo = fd_bo_new(ctx->dev, sz, - DRM_FREEDRENO_GEM_CACHE_WCOMBINE | - DRM_FREEDRENO_GEM_TYPE_KMEM); - - memcpy(fd_bo_map(so->bo), bin, sz); - - free(bin); - - so->instrlen = so->info.sizedwords / 8; - so->constlen = so->info.max_const + 1; -} - -/* for vertex shader, the inputs are loaded into registers before the shader - * is executed, so max_regs from the shader instructions might not properly - * reflect the # of registers actually used: - */ -static void -fixup_vp_regfootprint(struct fd3_shader_variant *so) -{ - unsigned i; - for (i = 0; i < so->inputs_count; i++) { - if (so->inputs[i].compmask) { - uint32_t regid = (so->inputs[i].regid + 3) >> 2; - so->info.max_reg = MAX2(so->info.max_reg, regid); - } - } - for (i = 0; i < so->outputs_count; i++) { - uint32_t regid = (so->outputs[i].regid + 3) >> 2; - so->info.max_reg = MAX2(so->info.max_reg, regid); - } -} - -static struct fd3_shader_variant * -create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key) -{ - struct fd3_shader_variant *v = CALLOC_STRUCT(fd3_shader_variant); - const struct tgsi_token *tokens = so->tokens; - int ret; - - if (!v) - return NULL; - - v->so = so; - v->key = key; - v->type = so->type; - - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type, - key.binning_pass, key.color_two_side, key.half_precision); - tgsi_dump(tokens, 0); - } - - if (!(fd_mesa_debug & FD_DBG_NOOPT)) { - ret = fd3_compile_shader(v, tokens, key); - if (ret) { - debug_error("new compiler failed, trying fallback!"); - - v->inputs_count = 0; - v->outputs_count = 0; - v->total_in = 0; - v->has_samp = false; - v->immediates_count = 0; - } - } else { - ret = -1; /* force fallback to old compiler */ - } - - if (ret) - ret = fd3_compile_shader_old(v, tokens, key); - - if (ret) { - debug_error("compile failed!"); - goto fail; - } - - assemble_variant(v); - if (!v->bo) { - debug_error("assemble failed!"); - goto fail; - } - - if (so->type == SHADER_VERTEX) - fixup_vp_regfootprint(v); - - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, - key.binning_pass, key.color_two_side, key.half_precision); - disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type); - } - - return v; - -fail: - delete_variant(v); - return NULL; -} - -struct fd3_shader_variant * -fd3_shader_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key) -{ - struct fd3_shader_variant *v; - - /* some shader key values only apply to vertex or frag shader, - * so normalize the key to avoid constructing multiple identical - * variants: - */ - if (so->type == SHADER_FRAGMENT) { - key.binning_pass = false; - } - if (so->type == SHADER_VERTEX) { - key.color_two_side = false; - key.half_precision = false; - } - - for (v = so->variants; v; v = v->next) - if (!memcmp(&key, &v->key, sizeof(key))) - return v; - - /* compile new variant if it doesn't exist already: */ - v = create_variant(so, key); - v->next = so->variants; - so->variants = v; - - return v; -} - - -static void -delete_shader(struct fd3_shader_stateobj *so) -{ - struct fd3_shader_variant *v, *t; - for (v = so->variants; v; ) { - t = v; - v = v->next; - delete_variant(t); - } - free((void *)so->tokens); - free(so); -} - -static struct fd3_shader_stateobj * -create_shader(struct pipe_context *pctx, const struct pipe_shader_state *cso, +static struct ir3_shader * +create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso, enum shader_t type) { - struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj); - so->pctx = pctx; - so->type = type; - so->tokens = tgsi_dup_tokens(cso->tokens); - return so; + struct fd_context *ctx = fd_context(pctx); + struct ir3_compiler *compiler = ctx->screen->compiler; + return ir3_shader_create(compiler, cso, type, &ctx->debug); } static void * fd3_fp_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) { - return create_shader(pctx, cso, SHADER_FRAGMENT); + return create_shader_stateobj(pctx, cso, SHADER_FRAGMENT); } static void fd3_fp_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd3_shader_stateobj *so = hwcso; - delete_shader(so); + struct ir3_shader *so = hwcso; + ir3_shader_destroy(so); } static void * fd3_vp_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) { - return create_shader(pctx, cso, SHADER_VERTEX); + return create_shader_stateobj(pctx, cso, SHADER_VERTEX); } static void fd3_vp_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd3_shader_stateobj *so = hwcso; - delete_shader(so); + struct ir3_shader *so = hwcso; + ir3_shader_destroy(so); +} + +bool +fd3_needs_manual_clipping(const struct ir3_shader *shader, + const struct pipe_rasterizer_state *rast) +{ + uint64_t outputs = ir3_shader_outputs(shader); + + return (!rast->depth_clip || + util_bitcount(rast->clip_plane_enable) > 6 || + outputs & ((1ULL << VARYING_SLOT_CLIP_VERTEX) | + (1ULL << VARYING_SLOT_CLIP_DIST0) | + (1ULL << VARYING_SLOT_CLIP_DIST1))); } + static void -emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_variant *so) +emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) { - const struct ir3_shader_info *si = &so->info; + const struct ir3_info *si = &so->info; enum adreno_state_block sb; enum adreno_state_src src; uint32_t i, sz, *bin; @@ -280,80 +132,88 @@ emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_variant *so) } } -static int -find_output(const struct fd3_shader_variant *so, fd3_semantic semantic) +void +fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, + int nr, struct pipe_surface **bufs) { - int j; - - for (j = 0; j < so->outputs_count; j++) - if (so->outputs[j].semantic == semantic) - return j; + const struct ir3_shader_variant *vp, *fp; + const struct ir3_info *vsi, *fsi; + enum a3xx_instrbuffermode fpbuffer, vpbuffer; + uint32_t fpbuffersz, vpbuffersz, fsoff; + uint32_t pos_regid, posz_regid, psize_regid, color_regid[4] = {0}; + int constmode; + int i, j; - /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n] - * in the vertex shader.. but the fragment shader doesn't know this - * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So - * at link time if there is no matching OUT.BCOLOR[n], we must map - * OUT.COLOR[n] to IN.BCOLOR[n]. - */ - if (sem2name(semantic) == TGSI_SEMANTIC_BCOLOR) { - unsigned idx = sem2idx(semantic); - return find_output(so, fd3_semantic_name(TGSI_SEMANTIC_COLOR, idx)); - } + debug_assert(nr <= ARRAY_SIZE(color_regid)); - assert(0); + vp = fd3_emit_get_vp(emit); + fp = fd3_emit_get_fp(emit); - return 0; -} + vsi = &vp->info; + fsi = &fp->info; -static int -next_varying(const struct fd3_shader_variant *so, int i) -{ - while (++i < so->inputs_count) - if (so->inputs[i].compmask && so->inputs[i].bary) - break; - return i; -} + fpbuffer = BUFFER; + vpbuffer = BUFFER; + fpbuffersz = fp->instrlen; + vpbuffersz = vp->instrlen; + + /* + * Decide whether to use BUFFER or CACHE mode for VS and FS. It + * appears like 256 is the hard limit, but when the combined size + * exceeds 128 then blob will try to keep FS in BUFFER mode and + * switch to CACHE for VS until VS is too large. The blob seems + * to switch FS out of BUFFER mode at slightly under 128. But + * a bit fuzzy on the decision tree, so use slightly conservative + * limits. + * + * TODO check if these thresholds for BUFFER vs CACHE mode are the + * same for all a3xx or whether we need to consider the gpuid + */ -static uint32_t -find_output_regid(const struct fd3_shader_variant *so, fd3_semantic semantic) -{ - int j; - for (j = 0; j < so->outputs_count; j++) - if (so->outputs[j].semantic == semantic) - return so->outputs[j].regid; - return regid(63, 0); -} + if ((fpbuffersz + vpbuffersz) > 128) { + if (fpbuffersz < 112) { + /* FP:BUFFER VP:CACHE */ + vpbuffer = CACHE; + vpbuffersz = 256 - fpbuffersz; + } else if (vpbuffersz < 112) { + /* FP:CACHE VP:BUFFER */ + fpbuffer = CACHE; + fpbuffersz = 256 - vpbuffersz; + } else { + /* FP:CACHE VP:CACHE */ + vpbuffer = fpbuffer = CACHE; + vpbuffersz = fpbuffersz = 192; + } + } -void -fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog, struct fd3_shader_key key) -{ - const struct fd3_shader_variant *vp, *fp; - const struct ir3_shader_info *vsi, *fsi; - uint32_t pos_regid, posz_regid, psize_regid, color_regid; - int i, j, k; + if (fpbuffer == BUFFER) { + fsoff = 128 - fpbuffersz; + } else { + fsoff = 256 - fpbuffersz; + } - vp = fd3_shader_variant(prog->vp, key); + /* seems like vs->constlen + fs->constlen > 256, then CONSTMODE=1 */ + constmode = ((vp->constlen + fp->constlen) > 256) ? 1 : 0; - if (key.binning_pass) { - /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct fd3_shader_variant binning_fp = {}; - fp = &binning_fp; + pos_regid = ir3_find_output_regid(vp, VARYING_SLOT_POS); + posz_regid = ir3_find_output_regid(fp, FRAG_RESULT_DEPTH); + psize_regid = ir3_find_output_regid(vp, VARYING_SLOT_PSIZ); + if (fp->color0_mrt) { + color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = + ir3_find_output_regid(fp, FRAG_RESULT_COLOR); } else { - fp = fd3_shader_variant(prog->fp, key); + color_regid[0] = ir3_find_output_regid(fp, FRAG_RESULT_DATA0); + color_regid[1] = ir3_find_output_regid(fp, FRAG_RESULT_DATA1); + color_regid[2] = ir3_find_output_regid(fp, FRAG_RESULT_DATA2); + color_regid[3] = ir3_find_output_regid(fp, FRAG_RESULT_DATA3); } - vsi = &vp->info; - fsi = &fp->info; - - pos_regid = find_output_regid(vp, - fd3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); - posz_regid = find_output_regid(fp, - fd3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); - psize_regid = find_output_regid(vp, - fd3_semantic_name(TGSI_SEMANTIC_PSIZE, 0)); - color_regid = find_output_regid(fp, - fd3_semantic_name(TGSI_SEMANTIC_COLOR, 0)); + /* adjust regids for alpha output formats. there is no alpha render + * format, so it's just treated like red + */ + for (i = 0; i < nr; i++) + if (util_format_is_alpha(pipe_surface_format(bufs[i]))) + color_regid[i] += 3; /* we could probably divide this up into things that need to be * emitted if frag-prog is dirty vs if vert-prog is dirty.. @@ -361,6 +221,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6); OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | + A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) | /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe * flush some caches? I think we only need to set those * bits if we have updated const or shader.. @@ -369,19 +230,20 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE | - COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_ZWCOORD)); + COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(regid(0,0)) | + A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(regid(0,2)))); OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid)); OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) | A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) | - A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vp->instrlen)); + A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz)); OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) | A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) | - A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fp->instrlen)); + A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz)); OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); - OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) | - COND(key.binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) | + OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(constmode) | + COND(emit->key.binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) | A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | A3XX_SP_SP_CTRL_REG_L0MODE(0)); @@ -390,61 +252,48 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3); OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | - A3XX_SP_VS_CTRL_REG0_CACHEINVALID | + A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) | + COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) | A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | - A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | - COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) | - A3XX_SP_VS_CTRL_REG0_LENGTH(vp->instrlen)); + A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz)); OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) | A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) | - A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vsi->max_const, 0))); + A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0))); OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) | A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) | - A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(fp->total_in, 4) / 4)); + A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->varying_in)); + + struct ir3_shader_linkage l = {0}; + ir3_link_shaders(&l, vp, fp); - for (i = 0, j = -1; j < (int)fp->inputs_count; i++) { + for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { uint32_t reg = 0; OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i), 1); - j = next_varying(fp, j); - if (j < fp->inputs_count) { - k = find_output(vp, fp->inputs[j].semantic); - reg |= A3XX_SP_VS_OUT_REG_A_REGID(vp->outputs[k].regid); - reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(fp->inputs[j].compmask); - } + reg |= A3XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); + reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); + j++; - j = next_varying(fp, j); - if (j < fp->inputs_count) { - k = find_output(vp, fp->inputs[j].semantic); - reg |= A3XX_SP_VS_OUT_REG_B_REGID(vp->outputs[k].regid); - reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(fp->inputs[j].compmask); - } + reg |= A3XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); + reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); + j++; OUT_RING(ring, reg); } - for (i = 0, j = -1; j < (int)fp->inputs_count; i++) { + for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { uint32_t reg = 0; OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1); - j = next_varying(fp, j); - if (j < fp->inputs_count) - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(fp->inputs[j].inloc); - j = next_varying(fp, j); - if (j < fp->inputs_count) - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(fp->inputs[j].inloc); - j = next_varying(fp, j); - if (j < fp->inputs_count) - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(fp->inputs[j].inloc); - j = next_varying(fp, j); - if (j < fp->inputs_count) - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(fp->inputs[j].inloc); + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8); + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8); + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8); + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8); OUT_RING(ring, reg); } @@ -454,7 +303,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); OUT_RELOC(ring, vp->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - if (key.binning_pass) { + if (emit->key.binning_pass) { OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); OUT_RING(ring, 0x00000000); @@ -462,57 +311,130 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER)); OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1); + OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); } else { OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | - A3XX_SP_FS_CTRL_REG0_CACHEINVALID | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) | + COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) | A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | - A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP | A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | - A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); + A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz)); OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | - A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) | + A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fp->constlen + 1, 0)) | A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | - A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); + OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET( + MAX2(128, vp->constlen)) | + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff)); OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ } - OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2); - OUT_RING(ring, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_0 */ - OUT_RING(ring, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_1 */ - OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1); - if (fp->writes_pos) { - OUT_RING(ring, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE | - A3XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); - } else { - OUT_RING(ring, 0x00000000); - } + OUT_RING(ring, + COND(fp->writes_pos, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | + A3XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid) | + A3XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr) - 1)); OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4); - OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(color_regid) | - COND(fp->key.half_precision, A3XX_SP_FS_MRT_REG_HALF_PRECISION)); - OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); + for (i = 0; i < 4; i++) { + uint32_t mrt_reg = A3XX_SP_FS_MRT_REG_REGID(color_regid[i]) | + COND(fp->key.half_precision, A3XX_SP_FS_MRT_REG_HALF_PRECISION); + + if (i < nr) { + enum pipe_format fmt = pipe_surface_format(bufs[i]); + mrt_reg |= COND(util_format_is_pure_uint(fmt), A3XX_SP_FS_MRT_REG_UINT) | + COND(util_format_is_pure_sint(fmt), A3XX_SP_FS_MRT_REG_SINT); + } + OUT_RING(ring, mrt_reg); + } - if (key.binning_pass) { + if (emit->key.binning_pass) { OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) | A3XX_VPC_ATTR_LMSIZE(1) | COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE)); OUT_RING(ring, 0x00000000); } else { + uint32_t vinterp[4], flatshade[2], vpsrepl[4]; + + memset(vinterp, 0, sizeof(vinterp)); + memset(flatshade, 0, sizeof(flatshade)); + memset(vpsrepl, 0, sizeof(vpsrepl)); + + /* figure out VARYING_INTERP / FLAT_SHAD register values: */ + for (j = -1; (j = ir3_next_varying(fp, j)) < (int)fp->inputs_count; ) { + /* NOTE: varyings are packed, so if compmask is 0xb + * then first, third, and fourth component occupy + * three consecutive varying slots: + */ + unsigned compmask = fp->inputs[j].compmask; + + uint32_t inloc = fp->inputs[j].inloc; + + if ((fp->inputs[j].interpolate == INTERP_MODE_FLAT) || + (fp->inputs[j].rasterflat && emit->rasterflat)) { + uint32_t loc = inloc; + + for (i = 0; i < 4; i++) { + if (compmask & (1 << i)) { + vinterp[loc / 16] |= FLAT << ((loc % 16) * 2); + flatshade[loc / 32] |= 1 << (loc % 32); + loc++; + } + } + } + + gl_varying_slot slot = fp->inputs[j].slot; + + /* since we don't enable PIPE_CAP_TGSI_TEXCOORD: */ + if (slot >= VARYING_SLOT_VAR0) { + unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0); + /* Replace the .xy coordinates with S/T from the point sprite. Set + * interpolation bits for .zw such that they become .01 + */ + if (emit->sprite_coord_enable & texmask) { + /* mask is two 2-bit fields, where: + * '01' -> S + * '10' -> T + * '11' -> 1 - T (flip mode) + */ + unsigned mask = emit->sprite_coord_mode ? 0b1101 : 0b1001; + uint32_t loc = inloc; + if (compmask & 0x1) { + vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x2) { + vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x4) { + /* .z <- 0.0f */ + vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x8) { + /* .w <- 1.0f */ + vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); + loc++; + } + } + } + } + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | A3XX_VPC_ATTR_THRDASSIGN(1) | @@ -522,48 +444,37 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); - OUT_RING(ring, fp->so->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ - OUT_RING(ring, fp->so->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ - OUT_RING(ring, fp->so->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ - OUT_RING(ring, fp->so->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ + OUT_RING(ring, vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ + OUT_RING(ring, vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ + OUT_RING(ring, vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ + OUT_RING(ring, vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); - OUT_RING(ring, fp->so->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ - OUT_RING(ring, fp->so->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ - OUT_RING(ring, fp->so->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ - OUT_RING(ring, fp->so->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + OUT_RING(ring, vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ + OUT_RING(ring, vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ + OUT_RING(ring, vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ + OUT_RING(ring, vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + + OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2); + OUT_RING(ring, flatshade[0]); /* SP_FS_FLAT_SHAD_MODE_REG_0 */ + OUT_RING(ring, flatshade[1]); /* SP_FS_FLAT_SHAD_MODE_REG_1 */ } - OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1); - OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) | - A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252)); - - emit_shader(ring, vp); + if (vpbuffer == BUFFER) + emit_shader(ring, vp); OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ - if (!key.binning_pass) { - emit_shader(ring, fp); + if (!emit->key.binning_pass) { + if (fpbuffer == BUFFER) + emit_shader(ring, fp); OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ } } -/* hack.. until we figure out how to deal w/ vpsrepl properly.. */ -static void -fix_blit_fp(struct pipe_context *pctx) -{ - struct fd_context *ctx = fd_context(pctx); - struct fd3_shader_stateobj *so = ctx->blit_prog.fp; - - so->vpsrepl[0] = 0x99999999; - so->vpsrepl[1] = 0x99999999; - so->vpsrepl[2] = 0x99999999; - so->vpsrepl[3] = 0x99999999; -} - void fd3_prog_init(struct pipe_context *pctx) { @@ -574,6 +485,4 @@ fd3_prog_init(struct pipe_context *pctx) pctx->delete_vs_state = fd3_vp_state_delete; fd_prog_init(pctx); - - fix_blit_fp(pctx); }