From 792d1c92df6f58f219eb8b77e668424cdcc9c9af Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 26 Jun 2014 23:07:39 +0100 Subject: [PATCH] vc4: Switch to actually generating vertex and fragment shader code from TGSI. This introduces an IR (QIR, for QPU IR) to do optimization on. It's a scalar, SSA IR in general. It looks like optimization is pretty easy this way, though I haven't figured out if it's going to be good for our weird register allocation or not (or if I want to reduce to basically QPU instructions first), and I've got some problems with it having some multi-QPU-instruction opcodes (SEQ and CMP, for example) which I probably want to break down. Of course, this commit mostly doesn't work, since many other things are still hardwired, like the VBO data. v2: Rewrite to use a bunch of helpers (qir_OPCODE) for emitting QIR instructions into temporary values, and make qir_inst4 take the 4 args separately instead of an array (all later callers wanted individual args). --- src/gallium/drivers/vc4/Makefile.sources | 2 + src/gallium/drivers/vc4/vc4_context.h | 16 + src/gallium/drivers/vc4/vc4_draw.c | 33 +- src/gallium/drivers/vc4/vc4_program.c | 749 +++++++++++++++------- src/gallium/drivers/vc4/vc4_qir.c | 192 ++++++ src/gallium/drivers/vc4/vc4_qir.h | 182 ++++++ src/gallium/drivers/vc4/vc4_qpu.c | 8 + src/gallium/drivers/vc4/vc4_qpu.h | 5 +- src/gallium/drivers/vc4/vc4_qpu_defines.h | 4 +- src/gallium/drivers/vc4/vc4_qpu_disasm.c | 4 +- src/gallium/drivers/vc4/vc4_qpu_emit.c | 292 +++++++++ src/gallium/drivers/vc4/vc4_state.c | 3 +- 12 files changed, 1243 insertions(+), 247 deletions(-) create mode 100644 src/gallium/drivers/vc4/vc4_qir.c create mode 100644 src/gallium/drivers/vc4/vc4_qir.h create mode 100644 src/gallium/drivers/vc4/vc4_qpu_emit.c diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 294869fe99a..cf464b06315 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ 
b/src/gallium/drivers/vc4/Makefile.sources @@ -5,6 +5,8 @@ C_SOURCES := \ vc4_draw.c \ vc4_emit.c \ vc4_program.c \ + vc4_qir.c \ + vc4_qpu_emit.c \ vc4_qpu.c \ vc4_qpu_disasm.c \ vc4_qpu_validate.c \ diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index 0c906488756..36ad1bd2c8d 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -32,6 +32,7 @@ #include "vc4_bufmgr.h" #include "vc4_resource.h" #include "vc4_cl.h" +#include "vc4_qir.h" #define VC4_DIRTY_BLEND (1 << 0) #define VC4_DIRTY_RASTERIZER (1 << 1) @@ -63,10 +64,18 @@ struct vc4_texture_stateobj { unsigned dirty_samplers; }; +struct vc4_shader_uniform_info { + enum quniform_contents *contents; + uint32_t *data; + uint32_t count; +}; + struct vc4_shader_state { struct pipe_shader_state base; struct vc4_bo *bo; + struct vc4_shader_uniform_info uniforms[2]; + uint32_t coord_shader_offset; }; @@ -173,7 +182,14 @@ void vc4_simulator_flush(struct vc4_context *vc4, struct vc4_surface *color_surf); void *vc4_simulator_alloc(struct vc4_screen *screen, uint32_t size); +void vc4_get_uniform_bo(struct vc4_context *vc4, + struct vc4_shader_state *shader, + struct vc4_constbuf_stateobj *cb, + int shader_index, struct vc4_bo **out_bo, + uint32_t *out_offset); + void vc4_flush(struct pipe_context *pctx); void vc4_emit_state(struct pipe_context *pctx); +void vc4_generate_code(struct qcompile *c); #endif /* VC4_CONTEXT_H */ diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 34977bb6938..f3283cda432 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -90,21 +90,6 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) struct vc4_bo *ibo = get_ibo(vc4); struct vc4_bo *vbo = get_vbo(vc4, width, height); - static const uint32_t fs_uni[] = { 0 }; - uint32_t vs_uni[] = { - fui(vc4->framebuffer.width * 16.0f / 2.0f), - fui(vc4->framebuffer.height 
* 16.0f / 2.0f), - }; - uint32_t cs_uni[] = { - fui(vc4->framebuffer.width * 16.0f / 2.0f), - fui(vc4->framebuffer.height * 16.0f / 2.0f), - }; - struct vc4_bo *fs_ubo = vc4_bo_alloc_mem(vc4->screen, fs_uni, - sizeof(fs_uni), "fs_ubo"); - struct vc4_bo *vs_ubo = vc4_bo_alloc_mem(vc4->screen, vs_uni, - sizeof(vs_uni), "vs_ubo"); - struct vc4_bo *cs_ubo = vc4_bo_alloc_mem(vc4->screen, cs_uni, - sizeof(cs_uni), "cs_ubo"); vc4->needs_flush = true; @@ -149,25 +134,37 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) // Shader Record + struct vc4_bo *fs_ubo, *vs_ubo, *cs_ubo; + uint32_t fs_ubo_offset, vs_ubo_offset, cs_ubo_offset; + vc4_get_uniform_bo(vc4, vc4->prog.fs, + &vc4->constbuf[PIPE_SHADER_FRAGMENT], + 0, &fs_ubo, &fs_ubo_offset); + vc4_get_uniform_bo(vc4, vc4->prog.vs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + 0, &vs_ubo, &vs_ubo_offset); + vc4_get_uniform_bo(vc4, vc4->prog.vs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + 1, &cs_ubo, &cs_ubo_offset); + cl_start_shader_reloc(&vc4->shader_rec, 7); cl_u16(&vc4->shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING); cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */ cl_u8(&vc4->shader_rec, 0); /* fs num varyings */ cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0); - cl_reloc(vc4, &vc4->shader_rec, fs_ubo, 0); + cl_reloc(vc4, &vc4->shader_rec, fs_ubo, fs_ubo_offset); cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */ cl_u8(&vc4->shader_rec, 1); /* vs attribute array bitfield */ cl_u8(&vc4->shader_rec, 16); /* vs total attribute size */ cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0); - cl_reloc(vc4, &vc4->shader_rec, vs_ubo, 0); + cl_reloc(vc4, &vc4->shader_rec, vs_ubo, vs_ubo_offset); cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */ cl_u8(&vc4->shader_rec, 1); /* cs attribute array bitfield */ cl_u8(&vc4->shader_rec, 16); /* vs total attribute size */ cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, vc4->prog.vs->coord_shader_offset); - cl_reloc(vc4, &vc4->shader_rec, cs_ubo, 0); + 
cl_reloc(vc4, &vc4->shader_rec, cs_ubo, cs_ubo_offset); cl_reloc(vc4, &vc4->shader_rec, vbo, 0); cl_u8(&vc4->shader_rec, 15); /* bytes - 1 in the attribute*/ diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 56fe37c5f5f..8a937359472 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -27,261 +27,513 @@ #include "pipe/p_state.h" #include "util/u_memory.h" #include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" #include "vc4_context.h" #include "vc4_qpu.h" +#include "vc4_qir.h" + +struct tgsi_to_qir { + struct tgsi_parse_context parser; + struct qcompile *c; + struct qreg *temps; + struct qreg *inputs; + struct qreg *outputs; + struct qreg *uniforms; + struct qreg *consts; + uint32_t num_consts; + + uint32_t *uniform_data; + enum quniform_contents *uniform_contents; + uint32_t num_uniforms; +}; + +static struct qreg +get_temp_for_uniform(struct tgsi_to_qir *trans, uint32_t uniform) +{ + struct qcompile *c = trans->c; + struct qreg u = { QFILE_UNIF, uniform }; + + struct qreg t = qir_MOV(c, u); + trans->uniforms[uniform] = t; + return t; +} + +static struct qreg +qir_uniform_ui(struct tgsi_to_qir *trans, uint32_t ui) +{ + for (int i = 0; i < trans->num_uniforms; i++) { + if (trans->uniform_contents[i] == QUNIFORM_CONSTANT && + trans->uniform_data[i] == ui) + return trans->uniforms[i]; + } + + trans->uniform_contents[trans->num_uniforms] = QUNIFORM_CONSTANT; + trans->uniform_data[trans->num_uniforms] = ui; + return get_temp_for_uniform(trans, trans->num_uniforms++); +} + +static struct qreg +qir_uniform_f(struct tgsi_to_qir *trans, float f) +{ + return qir_uniform_ui(trans, fui(f)); +} + +static struct qreg +qir_uniform(struct tgsi_to_qir *trans, uint32_t index) +{ + for (int i = 0; i < trans->num_uniforms; i++) { + if (trans->uniform_contents[i] == QUNIFORM_UNIFORM && + trans->uniform_data[i] == index) + return trans->uniforms[i]; + } + + 
trans->uniform_contents[trans->num_uniforms] = QUNIFORM_UNIFORM; + trans->uniform_data[trans->num_uniforms] = index; + return get_temp_for_uniform(trans, trans->num_uniforms++); +} + +static struct qreg +get_src(struct tgsi_to_qir *trans, struct tgsi_src_register *src, int i) +{ + struct qcompile *c = trans->c; + struct qreg r = c->undef; + + uint32_t s = i; + switch (i) { + case TGSI_SWIZZLE_X: + s = src->SwizzleX; + break; + case TGSI_SWIZZLE_Y: + s = src->SwizzleY; + break; + case TGSI_SWIZZLE_Z: + s = src->SwizzleZ; + break; + case TGSI_SWIZZLE_W: + s = src->SwizzleW; + break; + default: + abort(); + } + + assert(!src->Indirect); + + switch (src->File) { + case TGSI_FILE_NULL: + return r; + case TGSI_FILE_TEMPORARY: + r = trans->temps[src->Index * 4 + s]; + break; + case TGSI_FILE_IMMEDIATE: + r = trans->consts[src->Index * 4 + s]; + break; + case TGSI_FILE_CONSTANT: + r = qir_uniform(trans, src->Index * 4 + s); + break; + case TGSI_FILE_INPUT: + r = trans->inputs[src->Index * 4 + s]; + break; + default: + fprintf(stderr, "unknown src file %d\n", src->File); + abort(); + } + + if (src->Absolute) + r = qir_FMAXABS(c, r, r); + + if (src->Negate) + r = qir_FSUB(c, qir_uniform_f(trans, 0), r); + + return r; +}; + static void -vc4_dump_program(const uint64_t *insts, uint count) +update_dst(struct tgsi_to_qir *trans, struct tgsi_full_instruction *tgsi_inst, + int i, struct qreg val) { - for (int i = 0; i < count; i++) { - fprintf(stderr, "0x%016"PRIx64" ", insts[i]); - vc4_qpu_disasm(&insts[i], 1); - fprintf(stderr, "\n"); + struct tgsi_dst_register *tgsi_dst = &tgsi_inst->Dst[0].Register; + + assert(!tgsi_dst->Indirect); + + switch (tgsi_dst->File) { + case TGSI_FILE_TEMPORARY: + trans->temps[tgsi_dst->Index * 4 + i] = val; + break; + case TGSI_FILE_OUTPUT: + trans->outputs[tgsi_dst->Index * 4 + i] = val; + break; + default: + fprintf(stderr, "unknown dst file %d\n", tgsi_dst->File); + abort(); } +}; + +static struct qreg +tgsi_to_qir_alu(struct tgsi_to_qir *trans, 
+ struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) +{ + struct qcompile *c = trans->c; + struct qreg dst = qir_get_temp(c); + qir_emit(c, qir_inst(op, dst, src[0 * 4 + i], src[1 * 4 + i])); + return dst; } -static struct vc4_shader_state * -vc4_shader_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) +static struct qreg +tgsi_to_qir_mad(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) { - struct vc4_shader_state *so = CALLOC_STRUCT(vc4_shader_state); - if (!so) - return NULL; + struct qcompile *c = trans->c; + return qir_FADD(c, + qir_FMUL(c, + src[0 * 4 + i], + src[1 * 4 + i]), + src[2 * 4 + i]); +} - so->base.tokens = tgsi_dup_tokens(cso->tokens); +static struct qreg +tgsi_to_qir_dp(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + int num, struct qreg *src, int i) +{ + struct qcompile *c = trans->c; - return so; + struct qreg sum = qir_FMUL(c, src[0 * 4 + 0], src[1 * 4 + 0]); + for (int j = 1; j < num; j++) { + sum = qir_FADD(c, sum, qir_FMUL(c, + src[0 * 4 + j], + src[1 * 4 + j])); + } + return sum; } -static void * -vc4_fs_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) +static struct qreg +tgsi_to_qir_dp2(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) { - struct vc4_context *vc4 = vc4_context(pctx); - struct vc4_shader_state *so = vc4_shader_state_create(pctx, cso); - if (!so) - return NULL; + return tgsi_to_qir_dp(trans, tgsi_inst, 2, src, i); +} - uint64_t gen_fsc[100]; - uint64_t cur_inst; - int gen_fsc_len = 0; -#if 0 - cur_inst = qpu_load_imm_f(qpu_r5(), 0.0f); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_MOV(qpu_r0(), qpu_vary()), - qpu_m_MOV(qpu_r3(), qpu_r5())); - cur_inst |= QPU_PM; - cur_inst |= QPU_SET_FIELD(QPU_PACK_MUL_8D, QPU_PACK); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = 
qpu_inst(qpu_a_FADD(qpu_r0(), qpu_r0(), qpu_r5()), - qpu_m_MOV(qpu_r1(), qpu_vary())); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_FADD(qpu_r1(), qpu_r1(), qpu_r5()), - qpu_m_MOV(qpu_r2(), qpu_vary())); - cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_WAIT_FOR_SCOREBOARD, QPU_SIG); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_FADD(qpu_r2(), qpu_r2(), qpu_r5()), - qpu_m_MOV(qpu_r3(), qpu_r0())); - cur_inst |= QPU_PM; - cur_inst |= QPU_SET_FIELD(QPU_PACK_MUL_8A, QPU_PACK); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_NOP(), - qpu_m_MOV(qpu_r3(), qpu_r1())); - cur_inst |= QPU_PM; - cur_inst |= QPU_SET_FIELD(QPU_PACK_MUL_8B, QPU_PACK); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_NOP(), - qpu_m_MOV(qpu_r3(), qpu_r2())); - cur_inst |= QPU_PM; - cur_inst |= QPU_SET_FIELD(QPU_PACK_MUL_8C, QPU_PACK); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_MOV(qpu_tlbc(), qpu_r3()), - qpu_m_NOP()); - cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_PROG_END, QPU_SIG); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_SCOREBOARD_UNLOCK, QPU_SIG); - gen_fsc[gen_fsc_len++] = cur_inst; - -#else - - /* drain the varyings. 
*/ - for (int i = 0; i < 3; i++) { - cur_inst = qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_rb(QPU_R_NOP)), - qpu_m_NOP()); - if (i == 1) - cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_WAIT_FOR_SCOREBOARD, QPU_SIG); - gen_fsc[gen_fsc_len++] = cur_inst; - - cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - gen_fsc[gen_fsc_len++] = cur_inst; +static struct qreg +tgsi_to_qir_dp3(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) +{ + return tgsi_to_qir_dp(trans, tgsi_inst, 3, src, i); +} + +static struct qreg +tgsi_to_qir_dp4(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) +{ + return tgsi_to_qir_dp(trans, tgsi_inst, 4, src, i); +} + +static void +emit_tgsi_instruction(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst) +{ + struct qcompile *c = trans->c; + struct { + enum qop op; + struct qreg (*func)(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, + struct qreg *src, int i); + } op_trans[] = { + [TGSI_OPCODE_MOV] = { QOP_MOV, tgsi_to_qir_alu }, + [TGSI_OPCODE_ABS] = { QOP_FMAXABS, tgsi_to_qir_alu }, + [TGSI_OPCODE_MUL] = { QOP_FMUL, tgsi_to_qir_alu }, + [TGSI_OPCODE_ADD] = { QOP_FADD, tgsi_to_qir_alu }, + [TGSI_OPCODE_SUB] = { QOP_FSUB, tgsi_to_qir_alu }, + [TGSI_OPCODE_MIN] = { QOP_FMIN, tgsi_to_qir_alu }, + [TGSI_OPCODE_MAX] = { QOP_FMAX, tgsi_to_qir_alu }, + [TGSI_OPCODE_RSQ] = { QOP_RSQ, tgsi_to_qir_alu }, + [TGSI_OPCODE_MAD] = { 0, tgsi_to_qir_mad }, + [TGSI_OPCODE_DP2] = { 0, tgsi_to_qir_dp2 }, + [TGSI_OPCODE_DP3] = { 0, tgsi_to_qir_dp3 }, + [TGSI_OPCODE_DP4] = { 0, tgsi_to_qir_dp4 }, + [TGSI_OPCODE_LIT] = { QOP_MOV, tgsi_to_qir_alu }, /* XXX */ + }; + static int asdf = 0; + uint32_t tgsi_op = tgsi_inst->Instruction.Opcode; + + if (tgsi_op == TGSI_OPCODE_END) + return; + + tgsi_dump_instruction(tgsi_inst, asdf++); + + if (tgsi_op > ARRAY_SIZE(op_trans) || 
!op_trans[tgsi_op].func) { + fprintf(stderr, "unknown tgsi inst: "); + tgsi_dump_instruction(tgsi_inst, asdf++); + fprintf(stderr, "\n"); + abort(); + } + + struct qreg src_regs[12]; + for (int s = 0; s < 3; s++) { + for (int i = 0; i < 4; i++) { + src_regs[4 * s + i] = + get_src(trans, &tgsi_inst->Src[s].Register, i); + } } - /* some colors */ -#if 1 for (int i = 0; i < 4; i++) { - cur_inst = qpu_load_imm_f(qpu_rn(i), .2 + i / 4.0); - gen_fsc[gen_fsc_len++] = cur_inst; + if (!(tgsi_inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + + struct qreg result; + + result = op_trans[tgsi_op].func(trans, tgsi_inst, + op_trans[tgsi_op].op, + src_regs, i); + + if (tgsi_inst->Instruction.Saturate) { + float low = (tgsi_inst->Instruction.Saturate == + TGSI_SAT_MINUS_PLUS_ONE ? -1.0 : 0.0); + result = qir_FMAX(c, + qir_FMIN(c, + result, + qir_uniform_f(trans, 1.0)), + qir_uniform_f(trans, low)); + } + + update_dst(trans, tgsi_inst, i, result); } +} +static void +parse_tgsi_immediate(struct tgsi_to_qir *trans, struct tgsi_full_immediate *imm) +{ for (int i = 0; i < 4; i++) { - cur_inst = qpu_inst(qpu_a_NOP(), - qpu_m_FMUL(qpu_ra(1), - qpu_rn(i), qpu_rn(i))); - cur_inst |= QPU_PM; - cur_inst |= QPU_SET_FIELD(QPU_PACK_A_8A + i, QPU_PACK); - gen_fsc[gen_fsc_len++] = cur_inst; + unsigned n = trans->num_consts++; + trans->consts[n] = qir_uniform_ui(trans, imm->u[i].Uint); } -#else - cur_inst = qpu_load_imm_ui(qpu_ra(1), 0x22446688); - gen_fsc[gen_fsc_len++] = cur_inst; -#endif +} - cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - gen_fsc[gen_fsc_len++] = cur_inst; +static void +emit_frag_init(struct tgsi_to_qir *trans, struct vc4_shader_state *so) +{ + /* XXX: lols */ + for (int i = 0; i < 4; i++) { + trans->inputs[i] = qir_uniform_ui(trans, fui(i / 4.0)); + } - cur_inst = qpu_inst(qpu_a_MOV(qpu_tlbc(), qpu_ra(1)), - qpu_m_NOP()); - cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_PROG_END, QPU_SIG); - gen_fsc[gen_fsc_len++] = cur_inst; +} - cur_inst = 
qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - gen_fsc[gen_fsc_len++] = cur_inst; +static void +emit_vert_init(struct tgsi_to_qir *trans, struct vc4_shader_state *so) +{ + struct qcompile *c = trans->c; - cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_SCOREBOARD_UNLOCK, QPU_SIG); - gen_fsc[gen_fsc_len++] = cur_inst; -#endif + /* XXX: attribute type/size/count */ + for (int i = 0; i < 4; i++) { + trans->inputs[i] = qir_get_temp(c); + qir_emit(c, qir_inst(QOP_VPM_READ, trans->inputs[i], + c->undef, c->undef)); + } +} +static void +emit_coord_init(struct tgsi_to_qir *trans, struct vc4_shader_state *so) +{ + struct qcompile *c = trans->c; - if (1) - vc4_dump_program(gen_fsc, gen_fsc_len); - vc4_qpu_validate(gen_fsc, gen_fsc_len); + /* XXX: attribute type/size/count */ + for (int i = 0; i < 4; i++) { + trans->inputs[i] = qir_get_temp(c); + qir_emit(c, qir_inst(QOP_VPM_READ, trans->inputs[i], + c->undef, c->undef)); + } +} - so->bo = vc4_bo_alloc_mem(vc4->screen, gen_fsc, - gen_fsc_len * sizeof(uint64_t), "fs_code"); +static void +emit_frag_end(struct tgsi_to_qir *trans, struct vc4_shader_state *so) +{ + struct qcompile *c = trans->c; + + struct qreg t = qir_get_temp(c); + qir_emit(c, qir_inst4(QOP_PACK_COLORS, t, + trans->outputs[0], + trans->outputs[1], + trans->outputs[2], + trans->outputs[3])); + qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef, + t, c->undef)); +} - return so; +static void +emit_scaled_viewport_write(struct tgsi_to_qir *trans) +{ + struct qcompile *c = trans->c; + struct qreg xyi[2]; + + for (int i = 0; i < 2; i++) { + trans->uniform_contents[trans->num_uniforms] = + QUNIFORM_VIEWPORT_X_SCALE + i; + struct qreg scale = { QFILE_UNIF, trans->num_uniforms++ }; + + xyi[i] = qir_FTOI(c, qir_FMUL(c, trans->outputs[i], scale)); + } + + qir_VPM_WRITE(c, qir_PACK_SCALED(c, xyi[0], xyi[1])); } -static int -gen_vs_cs_code(uint64_t *gen, bool is_vs) +static void +emit_zs_write(struct tgsi_to_qir *trans) { 
- uint32_t count = 0; - uint64_t cur_inst; - struct qpu_reg x = qpu_ra(0); - struct qpu_reg y = qpu_ra(1); - struct qpu_reg z = qpu_ra(2); - struct qpu_reg w = qpu_ra(3); - struct qpu_reg xy = qpu_ra(10); - struct qpu_reg xs = qpu_ra(12); - struct qpu_reg ys = qpu_ra(13); - struct qpu_reg vpmread = qpu_ra(QPU_R_VPM); - struct qpu_reg vpm = qpu_ra(QPU_W_VPM); + struct qcompile *c = trans->c; - gen[count++] = qpu_load_imm_ui(qpu_vrsetup(), 0x00401a00); - gen[count++] = qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00); + /* XXX: rescale */ + qir_VPM_WRITE(c, trans->outputs[2]); +} -#if 1 - gen[count++] = qpu_inst(qpu_a_MOV(x, vpmread), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(y, vpmread), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(z, vpmread), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(w, vpmread), qpu_m_NOP()); - - - gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_FMUL(xs, x, - qpu_rb(QPU_R_UNIF))); - gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_FMUL(ys, y, - qpu_rb(QPU_R_UNIF))); - - cur_inst = qpu_inst(qpu_a_FTOI(xy, xs), qpu_m_NOP()); - cur_inst |= QPU_SET_FIELD(QPU_PACK_A_16A, QPU_PACK); - gen[count++] = cur_inst; - cur_inst = qpu_inst(qpu_a_FTOI(xy, ys), qpu_m_NOP()); - cur_inst |= QPU_SET_FIELD(QPU_PACK_A_16B, QPU_PACK); - gen[count++] = cur_inst; - -#else - - struct qpu_reg t = qpu_ra(20); - struct qpu_reg hundred = qpu_rb(21); - gen[count++] = qpu_inst(qpu_a_NOP(), - qpu_m_MUL24(t, - qpu_ra(QPU_R_ELEM_QPU), - qpu_ra(QPU_R_ELEM_QPU))); - gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - - gen[count++] = qpu_load_imm_ui(hundred, 400); - gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - - struct qpu_reg xm = qpu_ra(22), ym = qpu_ra(23); - gen[count++] = qpu_inst(qpu_a_NOP(), - qpu_m_MUL24(xm, hundred, qpu_ra(QPU_R_ELEM_QPU))); - gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - - gen[count++] = qpu_inst(qpu_a_NOP(), - qpu_m_MUL24(ym, hundred, t)); - gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - - cur_inst = qpu_inst(qpu_a_MOV(xy, xm), 
qpu_m_NOP()); - cur_inst |= QPU_SET_FIELD(QPU_PACK_A_16A, QPU_PACK); - gen[count++] = cur_inst; - cur_inst = qpu_inst(qpu_a_MOV(xy, ym), qpu_m_NOP()); - cur_inst |= QPU_SET_FIELD(QPU_PACK_A_16B, QPU_PACK); - gen[count++] = cur_inst; -#endif +static void +emit_1_wc_write(struct tgsi_to_qir *trans) +{ + struct qcompile *c = trans->c; + + /* XXX: RCP */ + qir_VPM_WRITE(c, trans->outputs[3]); +} - gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); +static void +emit_vert_end(struct tgsi_to_qir *trans, struct vc4_shader_state *so) +{ + emit_scaled_viewport_write(trans); + emit_zs_write(trans); + emit_1_wc_write(trans); + /* XXX: write varyings */ +} - if (is_vs) { - gen[count++] = qpu_inst(qpu_a_MOV(vpm, xy), qpu_m_NOP()); +static void +emit_coord_end(struct tgsi_to_qir *trans, struct vc4_shader_state *so) +{ + struct qcompile *c = trans->c; - /* XXX */ - gen[count++] = qpu_inst(qpu_a_MOV(vpm, z), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(vpm, w), qpu_m_NOP()); + for (int i = 0; i < 4; i++) + qir_VPM_WRITE(c, trans->outputs[i]); - } else { - gen[count++] = qpu_inst(qpu_a_MOV(vpm, x), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(vpm, y), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(vpm, z), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(vpm, w), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(vpm, xy), qpu_m_NOP()); + emit_scaled_viewport_write(trans); + emit_zs_write(trans); + emit_1_wc_write(trans); +} - /* XXX */ - gen[count++] = qpu_inst(qpu_a_MOV(vpm, z), qpu_m_NOP()); - gen[count++] = qpu_inst(qpu_a_MOV(vpm, w), qpu_m_NOP()); +static struct tgsi_to_qir * +vc4_shader_tgsi_to_qir(struct vc4_shader_state *so, enum qstage stage) +{ + struct tgsi_to_qir *trans = CALLOC_STRUCT(tgsi_to_qir); + struct qcompile *c; + int ret; + + c = qir_compile_init(); + c->stage = stage; + + memset(trans, 0, sizeof(*trans)); + /* XXX sizing */ + trans->temps = calloc(sizeof(struct qreg), 1024); + trans->inputs = calloc(sizeof(struct qreg), 8 * 4); + trans->outputs = 
calloc(sizeof(struct qreg), 1024); + trans->uniforms = calloc(sizeof(struct qreg), 1024); + trans->consts = calloc(sizeof(struct qreg), 1024); + + trans->uniform_data = calloc(sizeof(uint32_t), 1024); + trans->uniform_contents = calloc(sizeof(enum quniform_contents), 1024); + + trans->c = c; + ret = tgsi_parse_init(&trans->parser, so->base.tokens); + assert(ret == TGSI_PARSE_OK); + + fprintf(stderr, "TGSI:\n"); + tgsi_dump(so->base.tokens, 0); + + switch (stage) { + case QSTAGE_FRAG: + emit_frag_init(trans, so); + break; + case QSTAGE_VERT: + emit_vert_init(trans, so); + break; + case QSTAGE_COORD: + emit_coord_init(trans, so); + break; } - /* PROGRAM END */ - cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_PROG_END, QPU_SIG); - gen[count++] = cur_inst; + while (!tgsi_parse_end_of_tokens(&trans->parser)) { + tgsi_parse_token(&trans->parser); - cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - gen[count++] = cur_inst; + switch (trans->parser.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + emit_tgsi_instruction(trans, + &trans->parser.FullToken.FullInstruction); + break; - cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); - gen[count++] = cur_inst; + case TGSI_TOKEN_TYPE_IMMEDIATE: + parse_tgsi_immediate(trans, + &trans->parser.FullToken.FullImmediate); + break; + } + } + + switch (stage) { + case QSTAGE_FRAG: + emit_frag_end(trans, so); + break; + case QSTAGE_VERT: + emit_vert_end(trans, so); + break; + case QSTAGE_COORD: + emit_coord_end(trans, so); + break; + } + + qir_dump(c); + + tgsi_parse_free(&trans->parser); + free(trans->temps); - vc4_qpu_validate(gen, count); + vc4_generate_code(c); - return count; + return trans; +} + +static struct vc4_shader_state * +vc4_shader_state_create(struct pipe_context *pctx, + const struct pipe_shader_state *cso) +{ + struct vc4_shader_state *so = CALLOC_STRUCT(vc4_shader_state); + if (!so) + return NULL; + + so->base.tokens = tgsi_dup_tokens(cso->tokens); 
+ + return so; +} + +static void +copy_uniform_state_to_shader(struct vc4_shader_state *so, + int shader_index, + struct tgsi_to_qir *trans) +{ + int count = trans->num_uniforms; + struct vc4_shader_uniform_info *uinfo = &so->uniforms[shader_index]; + + uinfo->count = count; + uinfo->data = malloc(count * sizeof(*uinfo->data)); + memcpy(uinfo->data, trans->uniform_data, + count * sizeof(*uinfo->data)); + uinfo->contents = malloc(count * sizeof(*uinfo->contents)); + memcpy(uinfo->contents, trans->uniform_contents, + count * sizeof(*uinfo->contents)); } static void * -vc4_vs_state_create(struct pipe_context *pctx, +vc4_fs_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) { struct vc4_context *vc4 = vc4_context(pctx); @@ -289,27 +541,47 @@ vc4_vs_state_create(struct pipe_context *pctx, if (!so) return NULL; - uint64_t gen[100]; - uint64_t count = 0; - uint64_t *vsc = gen; + struct tgsi_to_qir *trans = vc4_shader_tgsi_to_qir(so, QSTAGE_FRAG); + copy_uniform_state_to_shader(so, 0, trans); - /* VS */ - count += gen_vs_cs_code(gen + count, true); - fprintf(stderr, "VS:\n"); - vc4_dump_program(vsc, count); + so->bo = vc4_bo_alloc_mem(vc4->screen, trans->c->qpu_insts, + trans->c->num_qpu_insts * sizeof(uint64_t), + "fs_code"); - /* CS */ + qir_compile_destroy(trans->c); + free(trans); - /* XXX alignment? 
*/ - uint64_t *csc = gen + count; - so->coord_shader_offset = count * sizeof(uint64_t); - count += gen_vs_cs_code(gen + count, false); + return so; +} - fprintf(stderr, "CS:\n"); - vc4_dump_program(csc, count - (csc - gen)); +static void * +vc4_vs_state_create(struct pipe_context *pctx, + const struct pipe_shader_state *cso) +{ + struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_shader_state *so = vc4_shader_state_create(pctx, cso); + if (!so) + return NULL; + + struct tgsi_to_qir *vs_trans = vc4_shader_tgsi_to_qir(so, QSTAGE_VERT); + copy_uniform_state_to_shader(so, 0, vs_trans); + + struct tgsi_to_qir *cs_trans = vc4_shader_tgsi_to_qir(so, QSTAGE_COORD); + copy_uniform_state_to_shader(so, 1, cs_trans); + + uint32_t vs_size = vs_trans->c->num_qpu_insts * sizeof(uint64_t); + uint32_t cs_size = cs_trans->c->num_qpu_insts * sizeof(uint64_t); + so->coord_shader_offset = vs_size; /* XXX: alignment? */ + so->bo = vc4_bo_alloc(vc4->screen, + so->coord_shader_offset + cs_size, + "vs_code"); - so->bo = vc4_bo_alloc_mem(vc4->screen, gen, count * sizeof(uint64_t), - "vs_code"); + void *map = vc4_bo_map(so->bo); + memcpy(map, vs_trans->c->qpu_insts, vs_size); + memcpy(map + so->coord_shader_offset, cs_trans->c->qpu_insts, cs_size); + + qir_compile_destroy(vs_trans->c); + qir_compile_destroy(cs_trans->c); return so; } @@ -323,6 +595,41 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) free(so); } +void +vc4_get_uniform_bo(struct vc4_context *vc4, struct vc4_shader_state *shader, + struct vc4_constbuf_stateobj *cb, + int shader_index, struct vc4_bo **out_bo, + uint32_t *out_offset) +{ + struct vc4_shader_uniform_info *uinfo = &shader->uniforms[shader_index]; + struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, uinfo->count * 4, "ubo"); + uint32_t *map = vc4_bo_map(ubo); + + for (int i = 0; i < uinfo->count; i++) { + switch (uinfo->contents[i]) { + case QUNIFORM_CONSTANT: + map[i] = uinfo->data[i]; + break; + case QUNIFORM_UNIFORM: + map[i] = ((uint32_t 
*)cb->cb[0].user_buffer)[uinfo->data[i]]; + break; + case QUNIFORM_VIEWPORT_X_SCALE: + map[i] = fui(vc4->framebuffer.width * 16.0f / 2.0f); + break; + case QUNIFORM_VIEWPORT_Y_SCALE: + map[i] = fui(vc4->framebuffer.height * -16.0f / 2.0f); + break; + } +#if 1 + fprintf(stderr, "%p/%d: %d: 0x%08x (%f)\n", + shader, shader_index, i, map[i], uif(map[i])); +#endif + } + + *out_bo = ubo; + *out_offset = 0; +} + static void vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso) { diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c new file mode 100644 index 00000000000..a4bb6cd1fd1 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -0,0 +1,192 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include + +#include "util/u_memory.h" +#include "util/u_simple_list.h" + +#include "vc4_qir.h" +#include "vc4_qpu.h" + +struct qir_op_info { + const char *name; + uint8_t ndst, nsrc; +}; + +static const struct qir_op_info qir_op_info[] = { + [QOP_MOV] = { "mov", 1, 1 }, + [QOP_FADD] = { "fadd", 1, 2 }, + [QOP_FSUB] = { "fsub", 1, 2 }, + [QOP_FMUL] = { "fmul", 1, 2 }, + [QOP_FMIN] = { "fmin", 1, 2 }, + [QOP_FMAX] = { "fmax", 1, 2 }, + [QOP_FMINABS] = { "fminabs", 1, 2 }, + [QOP_FMAXABS] = { "fmaxabs", 1, 2 }, + [QOP_FTOI] = { "ftoi", 1, 1 }, + [QOP_RCP] = { "rcp", 1, 1 }, + [QOP_RSQ] = { "rsq", 1, 1 }, + [QOP_EXP2] = { "exp2", 1, 2 }, + [QOP_LOG2] = { "log2", 1, 2 }, + [QOP_PACK_COLORS] = { "pack_colors", 1, 4 }, + [QOP_PACK_SCALED] = { "pack_scaled", 1, 2 }, + [QOP_VPM_WRITE] = { "vpm_write", 0, 1 }, + [QOP_VPM_READ] = { "vpm_read", 0, 1 }, + [QOP_TLB_COLOR_WRITE] = { "tlb_color", 0, 1 }, +}; + +static const char * +qir_get_op_name(enum qop qop) +{ + if (qop < ARRAY_SIZE(qir_op_info) && qir_op_info[qop].name) + return qir_op_info[qop].name; + else + return "???"; +} + +int +qir_get_op_nsrc(enum qop qop) +{ + if (qop < ARRAY_SIZE(qir_op_info) && qir_op_info[qop].name) + return qir_op_info[qop].nsrc; + else + abort(); +} + +static void +qir_print_reg(struct qreg reg) +{ + const char *files[] = { + [QFILE_TEMP] = "t", + [QFILE_VARY] = "v", + [QFILE_UNIF] = "u", + }; + + if (reg.file == QFILE_NULL) + fprintf(stderr, "null"); + else + fprintf(stderr, "%s%d", files[reg.file], reg.index); +} + +void +qir_dump_inst(struct qinst *inst) +{ + fprintf(stderr, "%s ", qir_get_op_name(inst->op)); + + qir_print_reg(inst->dst); + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + fprintf(stderr, ", "); + qir_print_reg(inst->src[i]); + } +} + +void +qir_dump(struct qcompile *c) +{ + struct simple_node *node; + + foreach(node, &c->instructions) { + struct qinst *inst = (struct qinst *)node; + qir_dump_inst(inst); + fprintf(stderr, "\n"); + } +} + +struct qreg 
+qir_get_temp(struct qcompile *c) +{ + struct qreg reg; + + reg.file = QFILE_TEMP; + reg.index = c->num_temps++; + + return reg; +} + +struct qinst * +qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1) +{ + struct qinst *inst = CALLOC_STRUCT(qinst); + + inst->op = op; + inst->dst = dst; + inst->src = calloc(2, sizeof(inst->src[0])); + inst->src[0] = src0; + inst->src[1] = src1; + + return inst; +} + +struct qinst * +qir_inst4(enum qop op, struct qreg dst, + struct qreg a, + struct qreg b, + struct qreg c, + struct qreg d) +{ + struct qinst *inst = CALLOC_STRUCT(qinst); + + inst->op = op; + inst->dst = dst; + inst->src = calloc(4, sizeof(*inst->src)); + inst->src[0] = a; + inst->src[1] = b; + inst->src[2] = c; + inst->src[3] = d; + + return inst; +} + +void +qir_emit(struct qcompile *c, struct qinst *inst) +{ + insert_at_tail(&c->instructions, &inst->link); +} + +struct qcompile * +qir_compile_init(void) +{ + struct qcompile *c = CALLOC_STRUCT(qcompile); + + make_empty_list(&c->instructions); + + return c; +} + +void +qir_compile_destroy(struct qcompile *c) +{ + free(c); +} + +const char * +qir_get_stage_name(enum qstage stage) +{ + static const char *names[] = { + [QSTAGE_FRAG] = "FS", + [QSTAGE_VERT] = "VS", + [QSTAGE_COORD] = "CS", + }; + + return names[stage]; +} diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h new file mode 100644 index 00000000000..ae9e1796b90 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -0,0 +1,182 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef VC4_QIR_H +#define VC4_QIR_H + +#include <stdint.h> + +#include "util/u_simple_list.h" + +enum qfile { + QFILE_NULL, + QFILE_TEMP, + QFILE_VARY, + QFILE_UNIF, +}; + +struct qreg { + enum qfile file; + uint32_t index; +}; + +enum qop { + QOP_UNDEF, + QOP_MOV, + QOP_FADD, + QOP_FSUB, + QOP_FMUL, + QOP_FMIN, + QOP_FMAX, + QOP_FMINABS, + QOP_FMAXABS, + QOP_FTOI, + QOP_RCP, + QOP_RSQ, + QOP_EXP2, + QOP_LOG2, + QOP_VW_SETUP, + QOP_VR_SETUP, + QOP_PACK_SCALED, + QOP_PACK_COLORS, + QOP_VPM_WRITE, + QOP_VPM_READ, + QOP_TLB_COLOR_WRITE, +}; + +struct simple_node { + struct simple_node *next; + struct simple_node *prev; +}; + +struct qinst { + struct simple_node link; + + enum qop op; + struct qreg dst; + struct qreg *src; +}; + +enum qstage { + /** + * Coordinate shader, runs during binning, before the VS, and just + * outputs position. + */ + QSTAGE_COORD, + QSTAGE_VERT, + QSTAGE_FRAG, +}; + +enum quniform_contents { + /** + * Indicates that a constant 32-bit value is copied from the program's + * uniform contents. + */ + QUNIFORM_CONSTANT, + /** + * Indicates that the program's uniform contents are used as an index + * into the GL uniform storage. + */ + QUNIFORM_UNIFORM, + + /** @{ + * Scaling factors from clip coordinates to relative to the viewport + * center.
+ * + * This is used by the coordinate and vertex shaders to produce the + * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed + * point offsets from the viewport center. + */ + QUNIFORM_VIEWPORT_X_SCALE, + QUNIFORM_VIEWPORT_Y_SCALE, + /** @} */ +}; + +struct qcompile { + struct qreg undef; + enum qstage stage; + uint32_t num_temps; + struct simple_node instructions; + uint32_t immediates[1024]; + + uint64_t *qpu_insts; + uint32_t num_qpu_insts; +}; + +struct qcompile *qir_compile_init(void); +void qir_compile_destroy(struct qcompile *c); +struct qinst *qir_inst(enum qop op, struct qreg dst, + struct qreg src0, struct qreg src1); +struct qinst *qir_inst4(enum qop op, struct qreg dst, + struct qreg a, + struct qreg b, + struct qreg c, + struct qreg d); +void qir_emit(struct qcompile *c, struct qinst *inst); +struct qreg qir_get_temp(struct qcompile *c); +int qir_get_op_nsrc(enum qop qop); + +void qir_dump(struct qcompile *c); +void qir_dump_inst(struct qinst *inst); +const char *qir_get_stage_name(enum qstage stage); + +#define QIR_ALU1(name) \ +static inline struct qreg \ +qir_##name(struct qcompile *c, struct qreg a) \ +{ \ + struct qreg t = qir_get_temp(c); \ + qir_emit(c, qir_inst(QOP_##name, t, a, c->undef)); \ + return t; \ +} + +#define QIR_ALU2(name) \ +static inline struct qreg \ +qir_##name(struct qcompile *c, struct qreg a, struct qreg b) \ +{ \ + struct qreg t = qir_get_temp(c); \ + qir_emit(c, qir_inst(QOP_##name, t, a, b)); \ + return t; \ +} + +QIR_ALU1(MOV) +QIR_ALU2(FADD) +QIR_ALU2(FSUB) +QIR_ALU2(FMUL) +QIR_ALU2(FMIN) +QIR_ALU2(FMAX) +QIR_ALU2(FMINABS) +QIR_ALU2(FMAXABS) +QIR_ALU1(FTOI) +QIR_ALU1(RCP) +QIR_ALU1(RSQ) +QIR_ALU1(EXP2) +QIR_ALU1(LOG2) +QIR_ALU2(PACK_SCALED) + +static inline void +qir_VPM_WRITE(struct qcompile *c, struct qreg a) +{ + qir_emit(c, qir_inst(QOP_VPM_WRITE, c->undef, a, c->undef)); +} + +#endif /* VC4_QIR_H */ diff --git a/src/gallium/drivers/vc4/vc4_qpu.c b/src/gallium/drivers/vc4/vc4_qpu.c index
18863f7eac1..de07f72bdd6 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.c +++ b/src/gallium/drivers/vc4/vc4_qpu.c @@ -208,3 +208,11 @@ qpu_inst(uint64_t add, uint64_t mul) return merge; } + +uint64_t +qpu_set_sig(uint64_t inst, uint32_t sig) +{ + assert(QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_NONE); + return (inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(sig, QPU_SIG); +} + diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index 00aebf0a706..45aac0e135b 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -130,6 +130,7 @@ uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst, struct qpu_reg src0, struct qpu_reg src1); uint64_t qpu_inst(uint64_t add, uint64_t mul); uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val); +uint64_t qpu_set_sig(uint64_t inst, uint32_t sig); static inline uint64_t qpu_load_imm_f(struct qpu_reg dst, float val) @@ -163,8 +164,8 @@ A_ALU2(FADD) A_ALU2(FSUB) A_ALU2(FMIN) A_ALU2(FMAX) -A_ALU2(MINABS) -A_ALU2(MAXABS) +A_ALU2(FMINABS) +A_ALU2(FMAXABS) A_ALU1(FTOI) A_ALU1(ITOF) A_ALU2(ADD) diff --git a/src/gallium/drivers/vc4/vc4_qpu_defines.h b/src/gallium/drivers/vc4/vc4_qpu_defines.h index d066f278ab3..13c940c0f8e 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_defines.h +++ b/src/gallium/drivers/vc4/vc4_qpu_defines.h @@ -34,8 +34,8 @@ enum qpu_op_add { QPU_A_FSUB, QPU_A_FMIN, QPU_A_FMAX, - QPU_A_MINABS, - QPU_A_MAXABS, + QPU_A_FMINABS, + QPU_A_FMAXABS, QPU_A_FTOI, QPU_A_ITOF, QPU_A_ADD = 12, diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c index cf90cb2e768..0aea2970f68 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -33,8 +33,8 @@ static const char *qpu_add_opcodes[] = { [QPU_A_FSUB] = "fsub", [QPU_A_FMIN] = "fmin", [QPU_A_FMAX] = "fmax", - [QPU_A_MINABS] = "minabs", - [QPU_A_MAXABS] = "maxabs", + [QPU_A_FMINABS] = "fminabs", + [QPU_A_FMAXABS] = "fmaxabs", [QPU_A_FTOI] = 
"ftoi", [QPU_A_ITOF] = "itof", [QPU_A_ADD] = "add", diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c new file mode 100644 index 00000000000..0f6f2c171a4 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -0,0 +1,292 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <inttypes.h> +#include <stdio.h> + +#include "vc4_context.h" +#include "vc4_qir.h" +#include "vc4_qpu.h" + +static void +vc4_dump_program(struct qcompile *c) +{ + fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage)); + + for (int i = 0; i < c->num_qpu_insts; i++) { + fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]); + vc4_qpu_disasm(&c->qpu_insts[i], 1); + fprintf(stderr, "\n"); + } +} + +void +vc4_generate_code(struct qcompile *c) +{ + uint64_t *insts = malloc(sizeof(uint64_t) * 1024); /* XXX: sizing */ + uint32_t ni = 0; + struct qpu_reg allocate_to_qpu_reg[4 + 32 + 32]; + bool reg_in_use[ARRAY_SIZE(allocate_to_qpu_reg)]; + int *reg_allocated = calloc(c->num_temps, sizeof(*reg_allocated)); + int *reg_uses_remaining = + calloc(c->num_temps, sizeof(*reg_uses_remaining)); + + for (int i = 0; i < ARRAY_SIZE(reg_in_use); i++) + reg_in_use[i] = false; + for (int i = 0; i < c->num_temps; i++) + reg_allocated[i] = -1; + for (int i = 0; i < 4; i++) + allocate_to_qpu_reg[i] = qpu_rn(i); + for (int i = 0; i < 32; i++) + allocate_to_qpu_reg[i + 4] = qpu_ra(i); + for (int i = 0; i < 32; i++) + allocate_to_qpu_reg[i + 4 + 32] = qpu_rb(i); + + struct simple_node *node; + foreach(node, &c->instructions) { + struct qinst *qinst = (struct qinst *)node; + + if (qinst->dst.file == QFILE_TEMP) + reg_uses_remaining[qinst->dst.index]++; + for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { + if (qinst->src[i].file == QFILE_TEMP) + reg_uses_remaining[qinst->src[i].index]++; + } + } + + switch (c->stage) { + case QSTAGE_VERT: + case QSTAGE_COORD: + insts[ni++] = qpu_load_imm_ui(qpu_vrsetup(), 0x00401a00); + insts[ni++] = qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00); + break; + case QSTAGE_FRAG: + break; + } + + foreach(node, &c->instructions) { + struct qinst *qinst = (struct qinst *)node; + +#if 0 + fprintf(stderr, "translating qinst to qpu: "); + qir_dump_inst(qinst); + fprintf(stderr, "\n"); +#endif + + static const struct { + uint32_t op; + bool is_mul; + } translate[] = { +#define
A(name) [QOP_##name] = {QPU_A_##name, false} +#define M(name) [QOP_##name] = {QPU_M_##name, true} + A(FADD), + A(FSUB), + A(FMIN), + A(FMAX), + A(FMINABS), + A(FMAXABS), + A(FTOI), + + M(FMUL), + }; + + struct qpu_reg src[4]; + for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { + int index = qinst->src[i].index; + switch (qinst->src[i].file) { + case QFILE_NULL: + src[i] = qpu_rn(0); + break; + case QFILE_TEMP: + assert(reg_allocated[index] != -1); + src[i] = allocate_to_qpu_reg[reg_allocated[index]]; + reg_uses_remaining[index]--; + if (reg_uses_remaining[index] == 0) + reg_in_use[reg_allocated[index]] = false; + break; + case QFILE_UNIF: + src[i] = qpu_unif(); + break; + case QFILE_VARY: + src[i] = qpu_vary(); + break; + } + } + + struct qpu_reg dst; + switch (qinst->dst.file) { + case QFILE_NULL: + dst = qpu_ra(QPU_W_NOP); + break; + + case QFILE_TEMP: + if (reg_allocated[qinst->dst.index] == -1) { + int alloc; + for (alloc = 0; + alloc < ARRAY_SIZE(reg_in_use); + alloc++) { + /* The pack flags require an A-file register. */ + if (qinst->op == QOP_PACK_SCALED && + allocate_to_qpu_reg[alloc].mux != QPU_MUX_A) { + continue; + } + + if (!reg_in_use[alloc]) + break; + } + assert(alloc != ARRAY_SIZE(reg_in_use) && "need better reg alloc"); + reg_in_use[alloc] = true; + reg_allocated[qinst->dst.index] = alloc; + } + + dst = allocate_to_qpu_reg[reg_allocated[qinst->dst.index]]; + + reg_uses_remaining[qinst->dst.index]--; + if (reg_uses_remaining[qinst->dst.index] == 0) { + reg_in_use[reg_allocated[qinst->dst.index]] = + false; + } + break; + + case QFILE_VARY: + case QFILE_UNIF: + assert(!"not reached"); + break; + } + + switch (qinst->op) { + case QOP_MOV: + /* Skip emitting the MOV if it's a no-op. 
*/ + if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B || + dst.mux != src[0].mux || dst.addr != src[0].addr) { + insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[0]), + qpu_m_NOP()); + } + break; + + case QOP_VPM_WRITE: + insts[ni++] = qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_VPM), + src[0]), + qpu_m_NOP()); + break; + + case QOP_VPM_READ: + insts[ni++] = qpu_inst(qpu_a_MOV(dst, + qpu_ra(QPU_R_VPM)), + qpu_m_NOP()); + break; + + case QOP_PACK_COLORS: + for (int i = 0; i < 4; i++) { + insts[ni++] = qpu_inst(qpu_a_NOP(), + qpu_m_MOV(qpu_r5(), src[i])); + insts[ni - 1] |= QPU_PM; + insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i, + QPU_PACK); + } + + insts[ni++] = qpu_inst(qpu_a_MOV(dst, qpu_r5()), + qpu_m_NOP()); + + break; + + case QOP_TLB_COLOR_WRITE: + insts[ni++] = qpu_inst(qpu_a_MOV(qpu_tlbc(), + src[0]), + qpu_m_NOP()); + break; + + case QOP_PACK_SCALED: + insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[0]), + qpu_m_NOP()); + insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_A_16A, QPU_PACK); + + insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[1]), + qpu_m_NOP()); + insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_A_16B, QPU_PACK); + + break; + + default: + assert(qinst->op < ARRAY_SIZE(translate)); + assert(translate[qinst->op].op != 0); /* NOPs */ + + /* If we have only one source, put it in the second + * argument slot as well so that we don't take up + * another raddr just to get unused data. 
+ */ + if (qir_get_op_nsrc(qinst->op) == 1) + src[1] = src[0]; + + if ((src[0].mux == QPU_MUX_A || src[0].mux == QPU_MUX_B) && + (src[1].mux == QPU_MUX_A || src[1].mux == QPU_MUX_B) && + src[0].addr != src[1].addr) { + insts[ni++] = qpu_inst(qpu_a_MOV(qpu_r5(), src[1]), + qpu_m_NOP()); + src[1] = qpu_r5(); + } + + if (translate[qinst->op].is_mul) { + insts[ni++] = qpu_inst(qpu_a_NOP(), + qpu_m_alu2(translate[qinst->op].op, + dst, src[0], src[1])); + } else { + insts[ni++] = qpu_inst(qpu_a_alu2(translate[qinst->op].op, + dst, src[0], src[1]), + qpu_m_NOP()); + } + break; + } + + if ((dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B) && + dst.addr < 32) + insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); + } + + /* thread end can't have VPM write */ + if (QPU_GET_FIELD(insts[ni - 1], QPU_WADDR_ADD) == QPU_W_VPM || + QPU_GET_FIELD(insts[ni - 1], QPU_WADDR_MUL) == QPU_W_VPM) + insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); + + insts[ni - 1] = qpu_set_sig(insts[ni - 1], QPU_SIG_PROG_END); + insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); + insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP()); + + switch (c->stage) { + case QSTAGE_VERT: + case QSTAGE_COORD: + break; + case QSTAGE_FRAG: + insts[2] = qpu_set_sig(insts[2], QPU_SIG_WAIT_FOR_SCOREBOARD); + insts[ni - 1] = qpu_set_sig(insts[ni - 1], + QPU_SIG_SCOREBOARD_UNLOCK); + break; + } + + c->qpu_insts = insts; + c->num_qpu_insts = ni; + + vc4_dump_program(c); + vc4_qpu_validate(insts, ni); +} + diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index 6ad7eea8f24..ab1e8be6ee3 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -254,11 +254,10 @@ vc4_set_constant_buffer(struct pipe_context *pctx, uint shader, uint index, if (unlikely(!cb)) { so->enabled_mask &= ~(1 << index); so->dirty_mask &= ~(1 << index); - pipe_resource_reference(&so->cb[index].buffer, NULL); return; } - pipe_resource_reference(&so->cb[index].buffer, cb->buffer); + 
assert(!cb->buffer); so->cb[index].buffer_offset = cb->buffer_offset; so->cb[index].buffer_size = cb->buffer_size; so->cb[index].user_buffer = cb->user_buffer; -- 2.30.2