From: Rob Clark Date: Tue, 25 Feb 2014 13:51:30 +0000 (-0500) Subject: freedreno/a3xx: add support for frag coord/face X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=664045752f55bd137110efa8926a8b7ee5ca400b;p=mesa.git freedreno/a3xx: add support for frag coord/face Fixes anything that tries to use gl_FrontFacing/gl_FragCoord. Also, face support is needed to emulate two sided color. Signed-off-by: Rob Clark --- diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index 818d5611dd9..54b36265ddf 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -95,8 +95,12 @@ struct fd3_compile_context { * But TGSI doesn't know that, it still declares things as * IN[] registers. So we do all the input tracking normally * and fix things up after compile_instructions() + * + * NOTE that frag_pos is the hardware position (possibly it + * is actually an index or tag or some such.. it is *not* + * values that can be directly used for gl_FragCoord..) 
*/ - struct ir3_instruction *frag_pos; + struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4]; struct tgsi_parse_context parser; unsigned type; @@ -180,6 +184,10 @@ compile_init(struct fd3_compile_context *ctx, struct fd3_shader_variant *so, ctx->current_instr = NULL; ctx->num_output_updates = 0; ctx->atomic = false; + ctx->frag_pos = NULL; + ctx->frag_face = NULL; + + memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord)); #define FM(x) (1 << TGSI_FILE_##x) /* optimize can't deal with relative addressing: */ @@ -309,7 +317,12 @@ push_block(struct fd3_compile_context *ctx) * position) */ if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - nin = MAX2(2, nin); + int n = 2; + if (ctx->info.reads_position) + n += 4; + if (ctx->info.uses_frontface) + n += 4; + nin = MAX2(n, nin); nout += ARRAY_SIZE(ctx->kill); } } else { @@ -1753,11 +1766,164 @@ decl_semantic(const struct tgsi_declaration_semantic *sem) return fd3_semantic_name(sem->Name, sem->Index); } +static struct ir3_instruction * +decl_in_frag_bary(struct fd3_compile_context *ctx, unsigned regid, + unsigned j, unsigned inloc) +{ + struct ir3_instruction *instr; + struct ir3_register *src; + + /* bary.f dst, #inloc, r0.x */ + instr = instr_create(ctx, 2, OPC_BARY_F); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc; + src = ir3_reg_create(instr, 0, IR3_REG_SSA); + src->wrmask = 0x3; + src->instr = ctx->frag_pos; + + return instr; +} + +/* TGSI_SEMANTIC_POSITION + * """""""""""""""""""""" + * + * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that + * fragment shader input contains the fragment's window position. The X + * component starts at zero and always increases from left to right. + * The Y component starts at zero and always increases but Y=0 may either + * indicate the top of the window or the bottom depending on the fragment + * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN). 
+ * The Z coordinate ranges from 0 to 1 to represent depth from the front + * to the back of the Z buffer. The W component contains the reciprocal + * of the interpolated vertex position W component. + */ +static struct ir3_instruction * +decl_in_frag_coord(struct fd3_compile_context *ctx, unsigned regid, + unsigned j) +{ + struct ir3_instruction *instr, *src; + + compile_assert(ctx, !ctx->frag_coord[j]); + + ctx->frag_coord[j] = create_input(ctx->block, NULL, 0); + + + switch (j) { + case 0: /* .x */ + case 1: /* .y */ + /* for frag_coord, we get unsigned values.. we need + * to subtract (integer) 8 and divide by 16 (right- + * shift by 4) then convert to float: + */ + + /* add.s tmp, src, -8 */ + instr = instr_create(ctx, 2, OPC_ADD_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j]; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8; + src = instr; + + /* shr.b tmp, tmp, 4 */ + instr = instr_create(ctx, 2, OPC_SHR_B); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4; + src = instr; + + /* mov.u32f32 dst, tmp */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = TYPE_U32; + instr->cat1.dst_type = TYPE_F32; + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + + break; + case 2: /* .z */ + case 3: /* .w */ + /* seems that we can use these as-is: */ + instr = ctx->frag_coord[j]; + break; + default: + compile_error(ctx, "invalid channel\n"); + instr = create_immed(ctx, 0.0); + break; + } + + return instr; +} + +/* TGSI_SEMANTIC_FACE + * """""""""""""""""" + * + * This label applies to fragment shader inputs only and indicates that + * the register contains front/back-face information of the form (F, 0, + * 0, 1). 
The first component will be positive when the fragment belongs + * to a front-facing polygon, and negative when the fragment belongs to a + * back-facing polygon. + */ +static struct ir3_instruction * +decl_in_frag_face(struct fd3_compile_context *ctx, unsigned regid, + unsigned j) +{ + struct ir3_instruction *instr, *src; + + switch (j) { + case 0: /* .x */ + compile_assert(ctx, !ctx->frag_face); + + ctx->frag_face = create_input(ctx->block, NULL, 0); + + /* for faceness, we always get -1 or 0 (int).. but TGSI expects + * positive vs negative float.. and piglit further seems to + * expect -1.0 or 1.0: + * + * mul.s tmp, hr0.x, 2 + * add.s tmp, tmp, 1 + * mov.s16f32, dst, tmp + * + */ + + instr = instr_create(ctx, 2, OPC_MUL_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + src = instr; + + instr = instr_create(ctx, 2, OPC_ADD_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1; + src = instr; + + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = TYPE_S32; + instr->cat1.dst_type = TYPE_F32; + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + + break; + case 1: /* .y */ + case 2: /* .z */ + instr = create_immed(ctx, 0.0); + break; + case 3: /* .w */ + instr = create_immed(ctx, 1.0); + break; + default: + compile_error(ctx, "invalid channel\n"); + instr = create_immed(ctx, 0.0); + break; + } + + return instr; +} + static void decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) { struct fd3_shader_variant *so = ctx->so; - unsigned i, flags = 0; + unsigned name = decl->Semantic.Name; + unsigned i; /* I don't think we should get frag shader input without * semantic info? 
Otherwise how do inputs get linked to @@ -1771,7 +1937,7 @@ decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) unsigned r = regid(i, 0); unsigned ncomp, j; - /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */ + /* we'll figure out the actual components used after scheduling */ ncomp = 4; DBG("decl in -> r%d", i); @@ -1780,36 +1946,38 @@ decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) so->inputs[n].compmask = (1 << ncomp) - 1; so->inputs[n].regid = r; so->inputs[n].inloc = ctx->next_inloc; - ctx->next_inloc += ncomp; - - so->total_in += ncomp; for (j = 0; j < ncomp; j++) { - struct ir3_instruction *instr; + struct ir3_instruction *instr = NULL; if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - struct ir3_register *src; - - instr = instr_create(ctx, 2, OPC_BARY_F); - - /* dst register: */ - ir3_reg_create(instr, r + j, flags); - - /* input position: */ - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = - so->inputs[n].inloc + j - 8; - - /* input base (always r0.xy): */ - src = ir3_reg_create(instr, regid(0,0), IR3_REG_SSA); - src->wrmask = 0x3; - src->instr = ctx->frag_pos; - + /* for fragment shaders, POSITION and FACE are handled + * specially, not using normal varying / bary.f + */ + if (name == TGSI_SEMANTIC_POSITION) { + so->inputs[n].bary = false; + so->frag_coord = true; + instr = decl_in_frag_coord(ctx, r + j, j); + } else if (name == TGSI_SEMANTIC_FACE) { + so->inputs[n].bary = false; + so->frag_face = true; + instr = decl_in_frag_face(ctx, r + j, j); + } else { + so->inputs[n].bary = true; + instr = decl_in_frag_bary(ctx, r + j, j, + so->inputs[n].inloc + j - 8); + } } else { instr = create_input(ctx->block, NULL, (i * 4) + j); } ctx->block->inputs[(i * 4) + j] = instr; } + + if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) { + ctx->next_inloc += ncomp; + so->total_in += ncomp; + } } } @@ -1878,13 +2046,76 @@ decl_samp(struct fd3_compile_context *ctx, struct 
tgsi_full_declaration *decl) ctx->so->samplers_count++; } +/* from TGSI perspective, we actually have inputs. But most of the "inputs" + * for a fragment shader are just bary.f instructions. The *actual* inputs + * from the hw perspective are the frag_pos and optionally frag_coord and + * frag_face. + */ +static void +fixup_frag_inputs(struct fd3_compile_context *ctx) +{ + struct fd3_shader_variant *so = ctx->so; + struct ir3_block *block = ctx->block; + struct ir3_instruction *instr; + int regid = 0; + + block->ninputs = 0; + + if (so->frag_face) { + /* this ultimately gets assigned to hr0.x so doesn't conflict + * with frag_coord/frag_pos.. + */ + block->inputs[block->ninputs++] = ctx->frag_face; + ctx->frag_face->regs[0]->num = 0; + + /* remaining channels not used, but let's avoid confusing + * other parts that expect inputs to come in groups of vec4 + */ + block->inputs[block->ninputs++] = NULL; + block->inputs[block->ninputs++] = NULL; + block->inputs[block->ninputs++] = NULL; + } + + /* since we don't know where to set the regid for frag_coord, + * we have to use r0.x for it. 
But we don't want to *always* + * use r1.x for frag_pos as that could increase the register + * footprint on simple shaders: + */ + if (so->frag_coord) { + ctx->frag_coord[0]->regs[0]->num = regid++; + ctx->frag_coord[1]->regs[0]->num = regid++; + ctx->frag_coord[2]->regs[0]->num = regid++; + ctx->frag_coord[3]->regs[0]->num = regid++; + + block->inputs[block->ninputs++] = ctx->frag_coord[0]; + block->inputs[block->ninputs++] = ctx->frag_coord[1]; + block->inputs[block->ninputs++] = ctx->frag_coord[2]; + block->inputs[block->ninputs++] = ctx->frag_coord[3]; + } + + /* we always have frag_pos: */ + so->pos_regid = regid; + + /* r0.x */ + instr = create_input(block, NULL, block->ninputs); + instr->regs[0]->num = regid++; + block->inputs[block->ninputs++] = instr; + ctx->frag_pos->regs[1]->instr = instr; + + /* r0.y */ + instr = create_input(block, NULL, block->ninputs); + instr->regs[0]->num = regid++; + block->inputs[block->ninputs++] = instr; + ctx->frag_pos->regs[2]->instr = instr; +} + static void compile_instructions(struct fd3_compile_context *ctx) { push_block(ctx); - /* for fragment shader, we have a single input register (r0.xy) - * which is used as the base for bary.f varying fetch instrs: + /* for fragment shader, we have a single input register (usually + * r0.xy) which is used as the base for bary.f varying fetch instrs: */ if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { struct ir3_instruction *instr; @@ -1957,21 +2188,8 @@ compile_instructions(struct fd3_compile_context *ctx) } /* fixup actual inputs for frag shader: */ - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - struct ir3_instruction *instr; - - ctx->block->ninputs = 2; - - /* r0.x */ - instr = create_input(ctx->block, NULL, 0); - ctx->block->inputs[0] = instr; - ctx->frag_pos->regs[1]->instr = instr; - - /* r0.y */ - instr = create_input(ctx->block, NULL, 1); - ctx->block->inputs[1] = instr; - ctx->frag_pos->regs[2]->instr = instr; - } + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) + 
fixup_frag_inputs(ctx); } static void @@ -1995,6 +2213,7 @@ fd3_compile_shader(struct fd3_shader_variant *so, const struct tgsi_token *tokens, struct fd3_shader_key key) { struct fd3_compile_context ctx; + struct ir3_block *block; unsigned i, actual_in; int ret = 0; @@ -2011,12 +2230,13 @@ fd3_compile_shader(struct fd3_shader_variant *so, compile_instructions(&ctx); + block = ctx.block; + /* at this point, we want the kill's in the outputs array too, * so that they get scheduled (since they have no dst).. we've * already ensured that the array is big enough in push_block(): */ if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { - struct ir3_block *block = ctx.block; for (i = 0; i < ctx.kill_count; i++) block->outputs[block->noutputs++] = ctx.kill[i]; } @@ -2024,43 +2244,44 @@ fd3_compile_shader(struct fd3_shader_variant *so, if (fd_mesa_debug & FD_DBG_OPTDUMP) compile_dump(&ctx); - ret = ir3_block_flatten(ctx.block); + ret = ir3_block_flatten(block); if (ret < 0) goto out; if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP)) compile_dump(&ctx); - ir3_block_cp(ctx.block); + ir3_block_cp(block); if (fd_mesa_debug & FD_DBG_OPTDUMP) compile_dump(&ctx); - ir3_block_depth(ctx.block); + ir3_block_depth(block); if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("AFTER DEPTH:\n"); - ir3_dump_instr_list(ctx.block->head); + ir3_dump_instr_list(block->head); } - ir3_block_sched(ctx.block); + ir3_block_sched(block); if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("AFTER SCHED:\n"); - ir3_dump_instr_list(ctx.block->head); + ir3_dump_instr_list(block->head); } - ret = ir3_block_ra(ctx.block, so->type, key.half_precision); + ret = ir3_block_ra(block, so->type, key.half_precision, + so->frag_coord, so->frag_face); if (ret) goto out; if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("AFTER RA:\n"); - ir3_dump_instr_list(ctx.block->head); + ir3_dump_instr_list(block->head); } /* fixup input/outputs: */ for (i = 0; i < so->outputs_count; i++) { - so->outputs[i].regid = ctx.block->outputs[i*4]->regs[0]->num; + 
so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; /* preserve hack for depth output.. tgsi writes depth to .z, * but what we give the hw is the scalar register: */ @@ -2073,7 +2294,7 @@ fd3_compile_shader(struct fd3_shader_variant *so, for (i = 0; i < so->inputs_count; i++) { unsigned j, regid = ~0, compmask = 0; for (j = 0; j < 4; j++) { - struct ir3_instruction *in = ctx.block->inputs[(i*4) + j]; + struct ir3_instruction *in = block->inputs[(i*4) + j]; if (in) { compmask |= (1 << j); regid = in->regs[0]->num - j; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c index 31e415c6a70..998b8e9cf7c 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c @@ -1326,6 +1326,7 @@ decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) so->inputs[n].compmask = (1 << ncomp) - 1; so->inputs[n].regid = r; so->inputs[n].inloc = ctx->next_inloc; + so->inputs[n].bary = true; /* all that is supported */ ctx->next_inloc += ncomp; so->total_in += ncomp; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 0d8d3c5e52c..5bfd976170c 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -296,33 +296,41 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_shader_variant *vp, struct fd3_vertex_buf *vbufs, uint32_t n) { - uint32_t i; + uint32_t i, j, last = 0; n = MIN2(n, vp->inputs_count); - for (i = 0; i < n; i++) { - struct pipe_resource *prsc = vbufs[i].prsc; - struct fd_resource *rsc = fd_resource(prsc); - enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(vbufs[i].format); - bool switchnext = (i != (n - 1)); - uint32_t fs = util_format_get_blocksize(vbufs[i].format); - - OUT_PKT0(ring, REG_A3XX_VFD_FETCH(i), 2); - OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) | - 
A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vbufs[i].stride) | - COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) | - A3XX_VFD_FETCH_INSTR_0_INDEXCODE(i) | - A3XX_VFD_FETCH_INSTR_0_STEPRATE(1)); - OUT_RELOC(ring, rsc->bo, vbufs[i].offset, 0, 0); - - OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(i), 1); - OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL | - A3XX_VFD_DECODE_INSTR_WRITEMASK(vp->inputs[i].compmask) | - A3XX_VFD_DECODE_INSTR_FORMAT(fmt) | - A3XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) | - A3XX_VFD_DECODE_INSTR_SHIFTCNT(fs) | - A3XX_VFD_DECODE_INSTR_LASTCOMPVALID | - COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT)); + for (i = 0; i < n; i++) + if (vp->inputs[i].compmask) + last = i; + + for (i = 0, j = 0; i <= last; i++) { + if (vp->inputs[i].compmask) { + struct pipe_resource *prsc = vbufs[i].prsc; + struct fd_resource *rsc = fd_resource(prsc); + enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(vbufs[i].format); + bool switchnext = (i != last); + uint32_t fs = util_format_get_blocksize(vbufs[i].format); + + OUT_PKT0(ring, REG_A3XX_VFD_FETCH(j), 2); + OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) | + A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vbufs[i].stride) | + COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) | + A3XX_VFD_FETCH_INSTR_0_INDEXCODE(j) | + A3XX_VFD_FETCH_INSTR_0_STEPRATE(1)); + OUT_RELOC(ring, rsc->bo, vbufs[i].offset, 0, 0); + + OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(j), 1); + OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL | + A3XX_VFD_DECODE_INSTR_WRITEMASK(vp->inputs[i].compmask) | + A3XX_VFD_DECODE_INSTR_FORMAT(fmt) | + A3XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) | + A3XX_VFD_DECODE_INSTR_SHIFTCNT(fs) | + A3XX_VFD_DECODE_INSTR_LASTCOMPVALID | + COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT)); + + j++; + } } } @@ -346,23 +354,28 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(ctx->sample_mask)); } - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) { - struct fd3_zsa_stateobj *zsa = 
fd3_zsa_stateobj(ctx->zsa); - struct pipe_stencil_ref *sr = &ctx->stencil_ref; + if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !key.binning_pass) { + uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_render_control; - if (!key.binning_pass) { - struct fd3_context *fd3_ctx = fd3_context(ctx); + val |= COND(fp->frag_face, A3XX_RB_RENDER_CONTROL_FACENESS); + val |= COND(fp->frag_coord, A3XX_RB_RENDER_CONTROL_XCOORD | + A3XX_RB_RENDER_CONTROL_YCOORD | + A3XX_RB_RENDER_CONTROL_ZCOORD | + A3XX_RB_RENDER_CONTROL_WCOORD); - /* I suppose if we needed to (which I don't *think* we need - * to), we could emit this for binning pass too. But we - * would need to keep a different patch-list for binning - * vs render pass. - */ + /* I suppose if we needed to (which I don't *think* we need + * to), we could emit this for binning pass too. But we + * would need to keep a different patch-list for binning + * vs render pass. + */ - OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); - OUT_RINGP(ring, zsa->rb_render_control, - &fd3_ctx->rbrc_patches); - } + OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); + OUT_RINGP(ring, val, &fd3_context(ctx)->rbrc_patches); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) { + struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa); + struct pipe_stencil_ref *sr = &ctx->stencil_ref; OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1); OUT_RING(ring, zsa->rb_alpha_ref); @@ -406,9 +419,9 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { uint32_t val = fd3_rasterizer_stateobj(ctx->rasterizer) ->gras_cl_clip_cntl; - if (fp->writes_pos) { - val |= A3XX_GRAS_CL_CLIP_CNTL_ZCLIP_DISABLE; - } + val |= COND(fp->writes_pos, A3XX_GRAS_CL_CLIP_CNTL_ZCLIP_DISABLE); + val |= COND(fp->frag_coord, A3XX_GRAS_CL_CLIP_CNTL_ZCOORD | + A3XX_GRAS_CL_CLIP_CNTL_WCOORD); OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); OUT_RING(ring, val); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c 
b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index 9cc09889f98..01502ce955e 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -290,6 +290,15 @@ find_output(const struct fd3_shader_variant *so, fd3_semantic semantic) return 0; } +static int +next_varying(const struct fd3_shader_variant *so, int i) +{ + while (++i < so->inputs_count) + if (so->inputs[i].compmask && so->inputs[i].bary) + break; + return i; +} + static uint32_t find_output_regid(const struct fd3_shader_variant *so, fd3_semantic semantic) { @@ -307,7 +316,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, const struct fd3_shader_variant *vp, *fp; const struct ir3_shader_info *vsi, *fsi; uint32_t pos_regid, posz_regid, psize_regid, color_regid; - int i; + int i, j, k; vp = fd3_shader_variant(prog->vp, key); @@ -344,9 +353,10 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART | A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | - A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE); + A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE | + COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_ZWCOORD)); OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); - OUT_RING(ring, 0x00000000); /* HLSQ_CONTROL_3_REG */ + OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid)); OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) | A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) | A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vp->instrlen)); @@ -379,36 +389,47 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vsi->max_const, 0))); OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) | A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) | - A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->inputs_count)); + A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(fp->total_in, 4) / 4)); - for (i = 0; i < fp->inputs_count; ) { + for (i = 0, j = -1; j 
< (int)fp->inputs_count; i++) { uint32_t reg = 0; - int j; - OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i/2), 1); + OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i), 1); - j = find_output(vp, fp->inputs[i].semantic); - reg |= A3XX_SP_VS_OUT_REG_A_REGID(vp->outputs[j].regid); - reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(fp->inputs[i].compmask); - i++; + j = next_varying(fp, j); + if (j < fp->inputs_count) { + k = find_output(vp, fp->inputs[j].semantic); + reg |= A3XX_SP_VS_OUT_REG_A_REGID(vp->outputs[k].regid); + reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(fp->inputs[j].compmask); + } - j = find_output(vp, fp->inputs[i].semantic); - reg |= A3XX_SP_VS_OUT_REG_B_REGID(vp->outputs[j].regid); - reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(fp->inputs[i].compmask); - i++; + j = next_varying(fp, j); + if (j < fp->inputs_count) { + k = find_output(vp, fp->inputs[j].semantic); + reg |= A3XX_SP_VS_OUT_REG_B_REGID(vp->outputs[k].regid); + reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(fp->inputs[j].compmask); + } OUT_RING(ring, reg); } - for (i = 0; i < fp->inputs_count; ) { + for (i = 0, j = -1; j < (int)fp->inputs_count; i++) { uint32_t reg = 0; - OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i/4), 1); - - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(fp->inputs[i++].inloc); - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(fp->inputs[i++].inloc); - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(fp->inputs[i++].inloc); - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(fp->inputs[i++].inloc); + OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1); + + j = next_varying(fp, j); + if (j < fp->inputs_count) + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(fp->inputs[j].inloc); + j = next_varying(fp, j); + if (j < fp->inputs_count) + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(fp->inputs[j].inloc); + j = next_varying(fp, j); + if (j < fp->inputs_count) + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(fp->inputs[j].inloc); + j = next_varying(fp, j); + if (j < fp->inputs_count) + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(fp->inputs[j].inloc); OUT_RING(ring, reg); } diff --git 
a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h index 26aa9f34de7..8f491b05e4d 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h @@ -80,6 +80,12 @@ struct fd3_shader_variant { * + From the vert shader, we only need the output regid */ + /* for frag shader, pos_regid holds the frag_pos, ie. what is passed + * to bary.f instructions + */ + uint8_t pos_regid; + bool frag_coord, frag_face; + /* varyings/outputs: */ unsigned outputs_count; struct { @@ -96,6 +102,7 @@ struct fd3_shader_variant { uint8_t compmask; /* in theory inloc of fs should match outloc of vs: */ uint8_t inloc; + uint8_t bary; } inputs[16]; unsigned total_in; /* sum of inputs (scalar) */ diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h index 9327fbdca72..09052346992 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3.h +++ b/src/gallium/drivers/freedreno/a3xx/ir3.h @@ -312,6 +312,11 @@ static inline bool is_flow(struct ir3_instruction *instr) return (instr->category == 0); } +static inline bool is_kill(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_KILL); +} + static inline bool is_nop(struct ir3_instruction *instr) { return is_flow(instr) && (instr->opc == OPC_NOP); @@ -380,7 +385,7 @@ void ir3_block_sched(struct ir3_block *block); /* register assignment: */ int ir3_block_ra(struct ir3_block *block, enum shader_t type, - bool half_precision); + bool half_precision, bool frag_coord, bool frag_face); #ifndef ARRAY_SIZE diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c index 904727a7d70..9d3a7783494 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c @@ -54,6 +54,8 @@ struct ir3_ra_ctx { struct ir3_block *block; enum shader_t type; bool half_precision; + bool frag_coord; + bool frag_face; int cnt; 
bool error; }; @@ -100,8 +102,11 @@ static int output_base(struct ir3_ra_ctx *ctx) * see how because the blob driver always uses r0.x (ie. * all zeros) */ - if ((ctx->type == SHADER_FRAGMENT) && !ctx->half_precision) - return 2; + if (ctx->type == SHADER_FRAGMENT) { + if (ctx->half_precision) + return ctx->frag_face ? 1 : 0; + return ctx->frag_coord ? 6 : 2; + } return 0; } @@ -429,6 +434,10 @@ static void ra_assign_reg(struct ir3_visitor *v, struct ir3_instruction *instr, struct ir3_register *reg) { struct ra_assign_visitor *a = ra_assign_visitor(v); + + if (is_flow(instr) && (instr->opc == OPC_KILL)) + return; + reg->flags &= ~IR3_REG_SSA; reg->num = a->num & ~REG_HALF; if (a->num & REG_HALF) { @@ -673,7 +682,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) struct ir3_instruction *n; if (!block->parent) { - unsigned i; + unsigned i, j; int base, off = output_base(ctx); base = alloc_block(ctx, NULL, block->noutputs + off); @@ -682,13 +691,19 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) base |= REG_HALF; for (i = 0; i < block->noutputs; i++) - if (block->outputs[i]) + if (block->outputs[i] && !is_kill(block->outputs[i])) ra_assign(ctx, block->outputs[i], base + i + off); if (ctx->type == SHADER_FRAGMENT) { - for (i = 0; i < block->ninputs; i++) + i = 0; + if (ctx->frag_face) { + /* if we have frag_face, it gets hr0.x */ + ra_assign(ctx, block->inputs[i], REG_HALF | 0); + i += 4; + } + for (j = 0; i < block->ninputs; i++, j++) if (block->inputs[i]) - ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + i); + ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j); } else { for (i = 0; i < block->ninputs; i++) if (block->inputs[i]) @@ -712,12 +727,14 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) } int ir3_block_ra(struct ir3_block *block, enum shader_t type, - bool half_precision) + bool half_precision, bool frag_coord, bool frag_face) { struct ir3_ra_ctx ctx = { .block = block, .type = 
type, .half_precision = half_precision, + .frag_coord = frag_coord, + .frag_face = frag_face, }; ir3_shader_clear_mark(block->shader); return block_ra(&ctx, block);