From f87c7008958cdb095efa1cfb29ca8f3c9b9066e4 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Wed, 1 Oct 2014 18:27:24 -0700
Subject: [PATCH] vc4: Add support for ARL and indirect register access on
 TGSI_FILE_CONSTANT.

Fixes 14 ARB_vp tests (for which no lowering of the indirect addressing
was being done), and should improve performance of indirect uniform
array access in GLSL.
---
 src/gallium/drivers/vc4/kernel/vc4_drv.h | 1 +
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 20 +-
 .../drivers/vc4/kernel/vc4_validate_shaders.c | 188 +++++++++++++++---
 src/gallium/drivers/vc4/vc4_context.h | 23 +++
 src/gallium/drivers/vc4/vc4_opt_dead_code.c | 3 +-
 src/gallium/drivers/vc4/vc4_program.c | 152 +++++++++++++-
 src/gallium/drivers/vc4/vc4_qir.c | 1 +
 src/gallium/drivers/vc4/vc4_qir.h | 45 +++++
 src/gallium/drivers/vc4/vc4_qpu_emit.c | 5 +
 src/gallium/drivers/vc4/vc4_screen.c | 3 +-
 10 files changed, 407 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 45d9c40f97f..b0eb3f031c5 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -128,6 +128,7 @@ struct exec_info {
 * Setup") for definitions of the texture parameters.
 */
 struct vc4_texture_sample_info {
+ bool is_direct;
 uint32_t p_offset[4];
 };

diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index 977e071d22d..8b04eb99195 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -767,6 +767,23 @@ reloc_tex(struct exec_info *exec,
 uint32_t cube_map_stride = 0;
 enum vc4_texture_data_type type;

+ if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
+ return false;
+
+ if (sample->is_direct) {
+ uint32_t remaining_size = tex->base.size - p0;
+ if (p0 > tex->base.size - 4) {
+ DRM_ERROR("UBO offset greater than UBO size\n");
+ return false;
+ }
+ if (p1 > remaining_size - 4) {
+ DRM_ERROR("UBO clamp would allow reads outside of UBO\n");
+ return false;
+ }
+ *validated_p0 = tex->paddr + p0;
+ return true;
+ }
+
 if (width == 0)
 width = 2048;
 if (height == 0)
@@ -832,9 +849,6 @@ reloc_tex(struct exec_info *exec,
 tiling_format = VC4_TILING_FORMAT_T;
 }

- if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
- return false;
-
 if (!check_tex_size(exec, tex, offset + cube_map_stride * 5,
 tiling_format, width, height, cpp)) {
 return false;

diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
index 03c7f23e92d..e797c59a816 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
@@ -51,8 +51,39 @@ struct vc4_shader_validation_state {
 struct vc4_texture_sample_info tmu_setup[2];
 int tmu_write_count[2];
+
+ /* For registers that were last written to by a MIN instruction with
+ * one argument being a uniform, the address of the uniform.
+ * Otherwise, ~0.
+ *
+ * This is used for the validation of direct address memory reads.
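+ *
+ * The index space is: regfile A is 0-31, regfile B is 32-63, and
+ * the accumulators r0-r3 are 64-67 (see waddr_to_live_reg_index()).
+ *
+ * The instruction pattern being validated is roughly (a sketch, not
+ * real disassembly):
+ *
+ *     min r0, ra4, unif    ; clamp an offset to within the UBO
+ *     add tmu0_s, r0, unif ; add the UBO base address and submit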
+ */
+ uint32_t live_clamp_offsets[32 + 32 + 4];
 };

+static uint32_t
+waddr_to_live_reg_index(uint32_t waddr, bool is_b)
+{
+ if (waddr < 32) {
+ if (is_b)
+ return 32 + waddr;
+ else
+ return waddr;
+ } else if (waddr <= QPU_W_ACC3) {
+ return 64 + waddr - QPU_W_ACC0;
+ } else {
+ return ~0;
+ }
+}
+
+static bool
+is_tmu_submit(uint32_t waddr)
+{
+ return (waddr == QPU_W_TMU0_S ||
+ waddr == QPU_W_TMU1_S);
+}
+
 static bool
 is_tmu_write(uint32_t waddr)
 {
@@ -75,24 +106,86 @@ record_validated_texture_sample(struct vc4_validated_shader_info *validated_shad
 if (!temp_samples)
 return false;

- memcpy(temp_samples[s].p_offset,
- validation_state->tmu_setup[tmu].p_offset,
- validation_state->tmu_write_count[tmu] * sizeof(uint32_t));
- for (i = validation_state->tmu_write_count[tmu]; i < 4; i++)
- temp_samples[s].p_offset[i] = ~0;
+ memcpy(&temp_samples[s],
+ &validation_state->tmu_setup[tmu],
+ sizeof(*temp_samples));

 validated_shader->num_texture_samples = s + 1;
 validated_shader->texture_samples = temp_samples;

+ for (i = 0; i < 4; i++)
+ validation_state->tmu_setup[tmu].p_offset[i] = ~0;
+
 return true;
 }

 static bool
-check_tmu_write(struct vc4_validated_shader_info *validated_shader,
+check_tmu_write(uint64_t inst,
+ struct vc4_validated_shader_info *validated_shader,
 struct vc4_shader_validation_state *validation_state,
- uint32_t waddr)
+ bool is_mul)
 {
+ uint32_t waddr = (is_mul ?
+ QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
+ QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+ uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+ uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
 int tmu = waddr > QPU_W_TMU0_B;
+ bool submit = is_tmu_submit(waddr);
+ bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
+
+ if (is_direct) {
+ uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
+ uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+ uint32_t clamp_offset = ~0;
+
+ /* Make sure that this texture load is an add of the base
+ * address of the UBO to a clamped offset within the UBO.
+ */
+ if (is_mul ||
+ QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
+ DRM_ERROR("direct TMU load wasn't an add\n");
+ return false;
+ }
+
+ /* We assert that the clamped address is the first argument,
+ * and the UBO base address is the second argument. This is
+ * arbitrary, but simpler than supporting either order.
+ */
+ if (add_a == QPU_MUX_A) {
+ clamp_offset = validation_state->live_clamp_offsets[raddr_a];
+ } else if (add_a == QPU_MUX_B) {
+ clamp_offset = validation_state->live_clamp_offsets[32 + raddr_b];
+ } else if (add_a <= QPU_MUX_R3) {
+ /* Only the accumulators r0-r3 are tracked. */
+ clamp_offset = validation_state->live_clamp_offsets[64 + add_a];
+ }
+
+ if (clamp_offset == ~0) {
+ DRM_ERROR("direct TMU load wasn't clamped\n");
+ return false;
+ }
+
+ /* Store the clamp value's offset in p1 (see reloc_tex() in
+ * vc4_validate.c).
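+ *
+ * reloc_tex() checks that clamp value against the UBO's actual
+ * size: for example, for a 64-byte UBO with p0 == 0, the clamp
+ * value may be at most 60, so that the 4-byte read still fits.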
+ */
+ validation_state->tmu_setup[tmu].p_offset[1] =
+ clamp_offset;
+
+ if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+ !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+ DRM_ERROR("direct TMU load didn't add to a uniform\n");
+ return false;
+ }
+
+ validation_state->tmu_setup[tmu].is_direct = true;
+ } else {
+ if (raddr_a == QPU_R_UNIF || raddr_b == QPU_R_UNIF) {
+ DRM_ERROR("uniform read in the same instruction as "
+ "texture setup\n");
+ return false;
+ }
+ }

 if (validation_state->tmu_write_count[tmu] >= 4) {
 DRM_ERROR("TMU%d got too many parameters before dispatch\n",
@@ -102,9 +195,13 @@ check_tmu_write(struct vc4_validated_shader_info *validated_shader,
 validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
 validated_shader->uniforms_size;
 validation_state->tmu_write_count[tmu]++;
- validated_shader->uniforms_size += 4;
+ /* Since the direct case reads the UBO base address as a RADDR
+ * uniform, that uniform gets counted in check_instruction_reads().
+ */
+ if (!is_direct)
+ validated_shader->uniforms_size += 4;

- if (waddr == QPU_W_TMU0_S || waddr == QPU_W_TMU1_S) {
+ if (submit) {
 if (!record_validated_texture_sample(validated_shader,
 validation_state, tmu)) {
 return false;
@@ -117,10 +214,17 @@ check_tmu_write(struct vc4_validated_shader_info *validated_shader,
 }

 static bool
-check_register_write(struct vc4_validated_shader_info *validated_shader,
+check_register_write(uint64_t inst,
+ struct vc4_validated_shader_info *validated_shader,
 struct vc4_shader_validation_state *validation_state,
- uint32_t waddr)
+ bool is_mul)
 {
+ uint32_t waddr = (is_mul ?
+ QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
+ QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+ /* The WS bit swaps which register file the ADD and MUL results are
+ * written to.
+ */
+ bool is_b = is_mul != ((inst & QPU_WS) != 0);
+ uint32_t live_reg_index;
+
 switch (waddr) {
 case QPU_W_UNIFORMS_ADDRESS:
 /* XXX: We'll probably need to support this for reladdr, but
@@ -145,8 +249,8 @@ check_register_write(struct vc4_validated_shader_info *validated_shader,
 case QPU_W_TMU1_T:
 case QPU_W_TMU1_R:
 case QPU_W_TMU1_B:
- return check_tmu_write(validated_shader, validation_state,
- waddr);
+ return check_tmu_write(inst, validated_shader, validation_state,
+ is_mul);

 case QPU_W_HOST_INT:
 case QPU_W_TMU_NOSWAP:
@@ -174,9 +278,44 @@ check_register_write(struct vc4_validated_shader_info *validated_shader,
 return true;
 }

+ /* Clear out the live offset clamp tracking for the written register.
+ * If this particular instruction is setting up an offset clamp, it'll
+ * get tracked immediately after we return.
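+ *
+ * (The tracking happens in track_live_clamps(), which
+ * check_instruction_writes() calls after both register writes have
+ * been validated.)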
+ */
+ live_reg_index = waddr_to_live_reg_index(waddr, is_b);
+ if (live_reg_index != ~0)
+ validation_state->live_clamp_offsets[live_reg_index] = ~0;
+
 return true;
 }

+static void
+track_live_clamps(uint64_t inst,
+ struct vc4_validated_shader_info *validated_shader,
+ struct vc4_shader_validation_state *validation_state)
+{
+ uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+ uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+ uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+ uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+ bool ws = inst & QPU_WS;
+ uint32_t live_reg_index;
+
+ if (QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_MIN)
+ return;
+
+ if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+ !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+ return;
+ }
+
+ live_reg_index = waddr_to_live_reg_index(waddr_add, ws);
+ if (live_reg_index != ~0) {
+ validation_state->live_clamp_offsets[live_reg_index] =
+ validated_shader->uniforms_size;
+ }
+}
+
 static bool
 check_instruction_writes(uint64_t inst,
 struct vc4_validated_shader_info *validated_shader,
@@ -184,33 +323,30 @@ check_instruction_writes(uint64_t inst,
 {
 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+ bool ok;

 if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
 DRM_ERROR("ADD and MUL both set up textures\n");
 return false;
 }

- return (check_register_write(validated_shader, validation_state, waddr_add) &&
- check_register_write(validated_shader, validation_state, waddr_mul));
+ ok = (check_register_write(inst, validated_shader, validation_state, false) &&
+ check_register_write(inst, validated_shader, validation_state, true));
+
+ track_live_clamps(inst, validated_shader, validation_state);
+
+ return ok;
 }

 static bool
 check_instruction_reads(uint64_t inst,
 struct vc4_validated_shader_info *validated_shader)
 {
- uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
- uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);

 if (raddr_a == QPU_R_UNIF || raddr_b == QPU_R_UNIF) {
- if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul)) {
- DRM_ERROR("uniform read in the same instruction as "
- "texture setup\n");
- return false;
- }
-
 /* This can't overflow the uint32_t, because we're reading 8
 * bytes of instruction to increment by 4 here, so we'd
 * already be OOM.
@@ -231,9 +367,15 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj,
 uint64_t *shader;
 struct vc4_validated_shader_info *validated_shader;
 struct vc4_shader_validation_state validation_state;
+ int i;

 memset(&validation_state, 0, sizeof(validation_state));

+ for (i = 0; i < 8; i++)
+ validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
+ for (i = 0; i < ARRAY_SIZE(validation_state.live_clamp_offsets); i++)
+ validation_state.live_clamp_offsets[i] = ~0;
+
 if (start_offset + sizeof(uint64_t) > shader_obj->base.size) {
 DRM_ERROR("shader starting at %d outside of BO sized %d\n",
 start_offset,
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 9eaff8f7178..6a82d8fe5a4 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -87,12 +87,35 @@ struct vc4_uncompiled_shader {
 const struct tgsi_token *twoside_tokens;
 };

+struct vc4_ubo_range {
+ /**
+ * offset in bytes from the start of the ubo where this range is
+ * uploaded.
+ *
+ * Only ranges that were actually used by an indirect load get copied
+ * into the compiled shader.
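+ *
+ * The generated code adds dst_offset to the UBO base address, so
+ * this layout has to match what vc4_upload_ubo() writes.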
+ */
+ uint32_t dst_offset;
+
+ /**
+ * offset in bytes from the start of the gallium uniforms where the
+ * data comes from.
+ */
+ uint32_t src_offset;
+
+ /** size in bytes of this ubo range */
+ uint32_t size;
+};
+
 struct vc4_compiled_shader {
 uint64_t program_id;
 struct vc4_bo *bo;

 struct vc4_shader_uniform_info uniforms;

+ struct vc4_ubo_range *ubo_ranges;
+ uint32_t num_ubo_ranges;
+ uint32_t ubo_size;
+
 /** bitmask of which inputs are color inputs, for flat shade handling. */
 uint32_t color_inputs;
diff --git a/src/gallium/drivers/vc4/vc4_opt_dead_code.c b/src/gallium/drivers/vc4/vc4_opt_dead_code.c
index d958dcb1075..408bd4302b4 100644
--- a/src/gallium/drivers/vc4/vc4_opt_dead_code.c
+++ b/src/gallium/drivers/vc4/vc4_opt_dead_code.c
@@ -92,7 +92,8 @@ qir_opt_dead_code(struct vc4_compile *c)
 if (dce_tex && (inst->op == QOP_TEX_S ||
 inst->op == QOP_TEX_T ||
 inst->op == QOP_TEX_R ||
- inst->op == QOP_TEX_B)) {
+ inst->op == QOP_TEX_B ||
+ inst->op == QOP_TEX_DIRECT)) {
 dce(c, inst);
 progress = true;
 continue;
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index c6b7edb9aef..72bbcd865bc 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -163,10 +163,42 @@ qir_uniform_f(struct vc4_compile *c, float f)
 return qir_uniform_ui(c, fui(f));
 }

+static struct qreg
+indirect_uniform_load(struct vc4_compile *c,
+ struct tgsi_full_src_register *src, int swiz)
+{
+ struct tgsi_ind_register *indirect = &src->Indirect;
+ struct vc4_compiler_ubo_range *range = &c->ubo_ranges[indirect->ArrayID];
+
+ assert(src->Register.Indirect);
+ assert(indirect->File == TGSI_FILE_ADDRESS);
+
+ if (!range->used) {
+ range->used = true;
+ range->dst_offset = c->next_ubo_dst_offset;
+ c->next_ubo_dst_offset += range->size;
+ c->num_ubo_ranges++;
+ }
+
+ struct qreg addr_val = c->addr[indirect->Swizzle];
+ struct qreg indirect_offset =
+ qir_ADD(c, addr_val, qir_uniform_ui(c,
+ range->dst_offset +
+ (src->Register.Index * 16) +
+ swiz * 4));
+ indirect_offset = qir_MIN(c, indirect_offset,
+ qir_uniform_ui(c, (range->dst_offset +
+ range->size - 4)));
+
+ qir_TEX_DIRECT(c, indirect_offset, add_uniform(c, QUNIFORM_UBO_ADDR, 0));
+ struct qreg r4 = qir_TEX_RESULT(c);
+ c->num_texture_samples++;
+ return qir_MOV(c, r4);
+}
+
 static struct qreg
 get_src(struct vc4_compile *c, unsigned tgsi_op,
- struct tgsi_src_register *src, int i)
+ struct tgsi_full_src_register *full_src, int i)
 {
+ struct tgsi_src_register *src = &full_src->Register;
 struct qreg r = c->undef;
 uint32_t s = i;
@@ -187,8 +219,6 @@ get_src(struct vc4_compile *c, unsigned tgsi_op,
 abort();
 }

- assert(!src->Indirect);
-
 switch (src->File) {
 case TGSI_FILE_NULL:
 return r;
@@ -199,8 +229,12 @@ get_src(struct vc4_compile *c, unsigned tgsi_op,
 r = c->consts[src->Index * 4 + s];
 break;
 case TGSI_FILE_CONSTANT:
- r = get_temp_for_uniform(c, QUNIFORM_UNIFORM,
- src->Index * 4 + s);
+ if (src->Indirect) {
+ r = indirect_uniform_load(c, full_src, s);
+ } else {
+ r = get_temp_for_uniform(c, QUNIFORM_UNIFORM,
+ src->Index * 4 + s);
+ }
 break;
 case TGSI_FILE_INPUT:
 r = c->inputs[src->Index * 4 + s];
@@ -250,6 +284,10 @@ update_dst(struct vc4_compile *c, struct tgsi_full_instruction *tgsi_inst,
 c->num_outputs = MAX2(c->num_outputs, tgsi_dst->Index * 4 + i + 1);
 break;
+ case TGSI_FILE_ADDRESS:
+ assert(tgsi_dst->Index == 0);
+ c->addr[i] = val;
+ break;
 default:
 fprintf(stderr, "unknown dst file %d\n", tgsi_dst->File);
 abort();
@@ -906,6 +944,29 @@ tgsi_to_qir_ssg(struct vc4_compile *c,
 qir_uniform_f(c, -1.0));
 }

+/* Compare to tgsi_to_qir_flr() for the floor logic. */
+static struct qreg
+tgsi_to_qir_arl(struct vc4_compile *c,
+ struct tgsi_full_instruction *tgsi_inst,
+ enum qop op, struct qreg *src, int i)
+{
+ struct qreg trunc = qir_FTOI(c, src[0 * 4 + i]);
+ struct qreg scaled = qir_SHL(c, trunc, qir_uniform_ui(c, 4));
+
+ qir_SF(c, qir_FSUB(c, src[0 * 4 + i], qir_ITOF(c, trunc)));
+
+ /* For negative values with a fractional part, floor is trunc - 1,
+ * and the result has already been scaled by 16, so subtract 16.
+ */
+ return qir_SEL_X_Y_NS(c, qir_SUB(c, scaled, qir_uniform_ui(c, 16)),
+ scaled);
+}
+
+static struct qreg
+tgsi_to_qir_uarl(struct vc4_compile *c,
+ struct tgsi_full_instruction *tgsi_inst,
+ enum qop op, struct qreg *src, int i)
+{
+ return qir_SHL(c, src[0 * 4 + i], qir_uniform_ui(c, 4));
+}
+
 static void
 emit_vertex_input(struct vc4_compile *c, int attr)
 {
@@ -1086,6 +1147,24 @@ add_output(struct vc4_compile *c,
 c->output_semantics[decl_offset].swizzle = semantic_swizzle;
 }

+static void
+add_array_info(struct vc4_compile *c, uint32_t array_id,
+ uint32_t start, uint32_t size)
+{
+ if (array_id >= c->ubo_ranges_array_size) {
+ c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
+ array_id + 1);
+ c->ubo_ranges = reralloc(c, c->ubo_ranges,
+ struct vc4_compiler_ubo_range,
+ c->ubo_ranges_array_size);
+ }
+
+ c->ubo_ranges[array_id].dst_offset = 0;
+ c->ubo_ranges[array_id].src_offset = start;
+ c->ubo_ranges[array_id].size = size;
+ c->ubo_ranges[array_id].used = false;
+}
+
 static void
 emit_tgsi_declaration(struct vc4_compile *c,
 struct tgsi_full_declaration *decl)
@@ -1152,6 +1231,14 @@ emit_tgsi_declaration(struct vc4_compile *c,
 }
 break;
+
+ case TGSI_FILE_CONSTANT:
+ add_array_info(c,
+ decl->Array.ArrayID,
+ decl->Range.First * 16,
+ (decl->Range.Last -
+ decl->Range.First + 1) * 16);
+ break;
 }
 }
 }
@@ -1219,6 +1306,8 @@ emit_tgsi_instruction(struct vc4_compile *c,
 [TGSI_OPCODE_COS] = { 0, tgsi_to_qir_cos },
 [TGSI_OPCODE_CLAMP] = { 0, tgsi_to_qir_clamp },
 [TGSI_OPCODE_SSG] = { 0, tgsi_to_qir_ssg },
+ [TGSI_OPCODE_ARL] = { 0, tgsi_to_qir_arl },
+ [TGSI_OPCODE_UARL] = { 0, tgsi_to_qir_uarl },
 };
 static int asdf = 0;
 uint32_t tgsi_op = tgsi_inst->Instruction.Opcode;
@@ -1231,7 +1320,7 @@ emit_tgsi_instruction(struct vc4_compile *c,
 for (int i = 0; i < 4; i++) {
 src_regs[4 * s + i] =
 get_src(c, tgsi_inst->Instruction.Opcode,
- &tgsi_inst->Src[s].Register, i);
+ &tgsi_inst->Src[s], i);
 }
 }

@@ -1833,6 +1922,9 @@ vc4_shader_tgsi_to_qir(struct vc4_context *vc4, enum qstage stage,
 int ret;

 c->stage = stage;
+ for (int i = 0; i < 4; i++)
+ c->addr[i] = qir_uniform_f(c, 0.0);
+
 c->shader_state = &key->shader_state->base;
 c->program_id = key->shader_state->program_id;
 c->variant_id = key->shader_state->compiled_variant_count++;
@@ -2065,6 +2157,31 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
 c->qpu_inst_count * sizeof(uint64_t),
 "code");

+ /* Copy the compiler UBO range state to the compiled shader, dropping
+ * the arrays that were never referenced by an indirect load.
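+ * Only the surviving ranges get space in the UBO that
+ * vc4_upload_ubo() uploads at draw time.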
+ *
+ * (Note that QIR dead code elimination of an array access still
+ * leaves that array alive, though.)
+ */
+ if (c->num_ubo_ranges) {
+ shader->num_ubo_ranges = c->num_ubo_ranges;
+ shader->ubo_ranges = ralloc_array(shader, struct vc4_ubo_range,
+ c->num_ubo_ranges);
+ uint32_t j = 0;
+ for (int i = 0; i < c->ubo_ranges_array_size; i++) {
+ struct vc4_compiler_ubo_range *range =
+ &c->ubo_ranges[i];
+ if (!range->used)
+ continue;
+
+ shader->ubo_ranges[j].dst_offset = range->dst_offset;
+ shader->ubo_ranges[j].src_offset = range->src_offset;
+ shader->ubo_ranges[j].size = range->size;
+ shader->ubo_size += range->size;
+ j++;
+ }
+ }
+
 qir_compile_destroy(c);

 struct vc4_key *dup_key;
@@ -2461,6 +2578,24 @@ get_texrect_scale(struct vc4_texture_stateobj *texstate,
 return fui(1.0f / dim);
 }

+static struct vc4_bo *
+vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
+ const uint32_t *gallium_uniforms)
+{
+ if (!shader->ubo_size)
+ return NULL;
+
+ struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
+ uint8_t *data = vc4_bo_map(ubo);
+ for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
+ /* dst_offset and src_offset are in bytes, so use byte-based
+ * pointer math for the copy.
+ */
+ memcpy(data + shader->ubo_ranges[i].dst_offset,
+ (const uint8_t *)gallium_uniforms +
+ shader->ubo_ranges[i].src_offset,
+ shader->ubo_ranges[i].size);
+ }
+
+ return ubo;
+}
+
 void
 vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
 struct vc4_constbuf_stateobj *cb,
@@ -2468,6 +2603,7 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
 {
 struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
 const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
+ struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);

 cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples);

@@ -2512,6 +2648,10 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
 write_texture_p2(vc4, texstate, uinfo->data[i]);
 break;

+ case QUNIFORM_UBO_ADDR:
+ cl_reloc(vc4, &vc4->uniforms, ubo, 0);
+ break;
+
 case QUNIFORM_TEXTURE_BORDER_COLOR:
 write_texture_border_color(vc4, texstate, uinfo->data[i]);
 break;
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index a7a4d96f758..cd731bc5639 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -93,6 +93,7 @@ static const struct qir_op_info qir_op_info[] = {
 [QOP_TEX_T] = { "tex_t", 0, 2 },
 [QOP_TEX_R] = { "tex_r", 0, 2 },
 [QOP_TEX_B] = { "tex_b", 0, 2 },
+ [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 },
 [QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
 [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 },
 [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 },
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 077a55ad6fc..cb02db5272c 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -122,6 +122,16 @@ enum qop {
 QOP_TEX_R,
 /** Texture LOD bias parameter write */
 QOP_TEX_B,
+
+ /**
+ * Texture-unit 4-byte read with the address provided directly in the
+ * S coordinate.
+ *
+ * The first operand is the offset from the start of the UBO, and the
+ * second is the uniform that has the UBO's base pointer.
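+ *
+ * At QPU emit time this becomes a single ADD that writes the sum of
+ * the two operands to TMU0_S (see vc4_qpu_emit.c).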
+ */
+ QOP_TEX_DIRECT,
+
 /**
 * Signal of texture read being necessary and then reading r4 into
 * the destination
@@ -207,6 +217,8 @@ enum quniform_contents {
 /** A reference to a texture config parameter 2 cubemap stride uniform */
 QUNIFORM_TEXTURE_CONFIG_P2,

+ QUNIFORM_UBO_ADDR,
+
 QUNIFORM_TEXRECT_SCALE_X,
 QUNIFORM_TEXRECT_SCALE_Y,

@@ -224,6 +236,31 @@ struct vc4_varying_semantic {
 uint8_t swizzle;
 };

+struct vc4_compiler_ubo_range {
+ /**
+ * offset in bytes from the start of the ubo where this range is
+ * uploaded.
+ *
+ * Only set once used is set.
+ */
+ uint32_t dst_offset;
+
+ /**
+ * offset in bytes from the start of the gallium uniforms where the
+ * data comes from.
+ */
+ uint32_t src_offset;
+
+ /** size in bytes of this ubo range */
+ uint32_t size;
+
+ /**
+ * Set if this range is used by the shader for indirect uniform
+ * access.
+ */
+ bool used;
+};
+
 struct vc4_compile {
 struct vc4_context *vc4;
 struct tgsi_parse_context parser;
@@ -236,12 +273,19 @@ struct vc4_compile {
 struct qreg *inputs;
 struct qreg *outputs;
 struct qreg *consts;
+ struct qreg addr[4]; /* TGSI ARL destination. */
 uint32_t temps_array_size;
 uint32_t inputs_array_size;
 uint32_t outputs_array_size;
 uint32_t uniforms_array_size;
 uint32_t consts_array_size;
 uint32_t num_consts;
+
+ struct vc4_compiler_ubo_range *ubo_ranges;
+ uint32_t ubo_ranges_array_size;
+ uint32_t num_ubo_ranges;
+ uint32_t next_ubo_dst_offset;
+
 struct qreg line_x, point_x, point_y;
 struct qreg discard;
@@ -409,6 +453,7 @@ QIR_NODST_2(TEX_S)
 QIR_NODST_2(TEX_T)
 QIR_NODST_2(TEX_R)
 QIR_NODST_2(TEX_B)
+QIR_NODST_2(TEX_DIRECT)
 QIR_ALU0(FRAG_X)
 QIR_ALU0(FRAG_Y)
 QIR_ALU0(FRAG_Z)
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 1d9bff39fe0..1d12d11b942 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -517,6 +517,11 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 src[0]));
 break;

+ case QOP_TEX_DIRECT:
+ fixup_raddr_conflict(c, &src[0], &src[1]);
+ queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
+ break;
+
 case QOP_TEX_RESULT:
 queue(c, qpu_NOP());
 *last_inst(c) = qpu_set_sig(*last_inst(c),
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index b84b6b014f5..c18760ccf3b 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -299,8 +299,9 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
 case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
 case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
- case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
 return 0;
+ case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+ return 1;
 case PIPE_SHADER_CAP_SUBROUTINES:
 return 0;
 case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
-- 
2.30.2