From 2841bb1fac81c32b736f593507d46c46e7488f68 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 23 Jun 2020 17:09:10 +0200 Subject: [PATCH] ir3, freedreno: Round up constlen earlier Prevents problems when calculating whether we overflow the shared limit. Note that on a6xx, the macros handle the assert for us. Part-of: --- src/freedreno/ir3/ir3_shader.c | 8 ++++++++ src/freedreno/vulkan/tu_clear_blit.c | 6 +++--- src/freedreno/vulkan/tu_pipeline.c | 2 +- src/gallium/drivers/freedreno/a4xx/fd4_program.c | 3 ++- src/gallium/drivers/freedreno/a5xx/fd5_compute.c | 3 ++- src/gallium/drivers/freedreno/a5xx/fd5_program.c | 3 ++- src/gallium/drivers/freedreno/a6xx/fd6_compute.c | 3 +-- src/gallium/drivers/freedreno/a6xx/fd6_program.c | 10 +++++----- 8 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c index 0fe51e9378a..5afb5992851 100644 --- a/src/freedreno/ir3/ir3_shader.c +++ b/src/freedreno/ir3/ir3_shader.c @@ -26,6 +26,7 @@ #include "util/u_atomic.h" #include "util/u_string.h" +#include "util/u_math.h" #include "util/u_memory.h" #include "util/format/u_format.h" @@ -140,6 +141,13 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v) */ v->constlen = MAX2(v->constlen, v->info.max_const + 1); + /* On a4xx and newer, constlen must be a multiple of 16 dwords even though + * uploads are in units of 4 dwords. Round it up here to make calculations + * regarding the shared constlen simpler. + */ + if (gpu_id >= 400) + v->constlen = align(v->constlen, 4); + fixup_regfootprint(v); return bin; diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index be2222b407d..e93ef73c141 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -327,7 +327,7 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_ struct ir3_shader_variant vs = { .type = MESA_SHADER_VERTEX, .instrlen = 1, - .constlen = 2, + .constlen = 4, .info.max_reg = 1, .inputs_count = 1, .inputs[0] = { @@ -360,7 +360,7 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_ struct ir3_shader_variant fs = { .type = MESA_SHADER_FRAGMENT, .instrlen = 1, /* max of 9 instructions with num_rts = 8 */ - .constlen = num_rts, + .constlen = align(num_rts, 4), .info.max_reg = MAX2(num_rts, 1) - 1, .total_in = blit ? 2 : 0, .num_samp = blit ? 1 : 0, @@ -389,7 +389,7 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_ struct ir3_shader_variant gs_shader = { .type = MESA_SHADER_GEOMETRY, .instrlen = 1, - .constlen = 2, + .constlen = 4, .info.max_reg = 1, .inputs_count = 1, .inputs[0] = { diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index 964d7438ab6..811c777b3e0 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -412,7 +412,7 @@ tu6_emit_xs_config(struct tu_cs *cs, tu_cs_emit(cs, xs->instrlen); tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); - tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(align(xs->constlen, 4)) | + tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) | A6XX_HLSQ_VS_CNTL_ENABLED); /* emit program binary diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 29d4dad70ed..9484a219eb0 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -112,7 +112,8 @@ setup_stages(struct fd4_emit *emit, struct stage *s) if (s[i].v) { s[i].i = &s[i].v->info; /* constlen is in units of 4 * vec4: */ - s[i].constlen = align(s[i].v->constlen, 4) / 4; + assert(s[i].v->constlen % 4 == 0); + s[i].constlen = s[i].v->constlen / 4; /* instrlen is already in units of 16 instr.. although * probably we should ditch that and not make the compiler * care about instruction group size of a3xx vs a4xx diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c index d30188d1e69..e12f5dfdff3 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c @@ -122,7 +122,8 @@ cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v, A5XX_SP_CS_CONFIG_SHADEROBJOFFSET(0) | A5XX_SP_CS_CONFIG_ENABLED); - unsigned constlen = align(v->constlen, 4) / 4; + assert(v->constlen % 4 == 0); + unsigned constlen = v->constlen / 4; OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONSTLEN, 2); OUT_RING(ring, constlen); /* HLSQ_CS_CONSTLEN */ OUT_RING(ring, instrlen); /* HLSQ_CS_INSTRLEN */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c index a491cf10b04..258a6431443 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c @@ -232,7 +232,8 @@ setup_stages(struct fd5_emit *emit, struct stage *s) if (s[i].v) { s[i].i = &s[i].v->info; /* constlen is in units of 4 * vec4: */ - s[i].constlen = align(s[i].v->constlen, 4) / 4; + assert(s[i].v->constlen % 4 == 0); + s[i].constlen = s[i].v->constlen / 4; /* instrlen is already in units of 16 instr.. although * probably we should ditch that and not make the compiler * care about instruction group size of a3xx vs a5xx diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c index 27ce7e2a29a..438557600f2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c @@ -81,9 +81,8 @@ cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v) OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1); OUT_RING(ring, 0xff); - unsigned constlen = align(v->constlen, 4); OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL, 1); - OUT_RING(ring, A6XX_HLSQ_CS_CNTL_CONSTLEN(constlen) | + OUT_RING(ring, A6XX_HLSQ_CS_CNTL_CONSTLEN(v->constlen) | A6XX_HLSQ_CS_CNTL_ENABLED); OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index 90bca744cc3..dbf145359df 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -234,19 +234,19 @@ setup_config_stateobj(struct fd_ringbuffer *ring, struct fd6_program_state *stat debug_assert(state->vs->constlen >= state->bs->constlen); OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4); - OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(align(state->vs->constlen, 4)) | + OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(state->vs->constlen) | A6XX_HLSQ_VS_CNTL_ENABLED); OUT_RING(ring, COND(state->hs, A6XX_HLSQ_HS_CNTL_ENABLED | - A6XX_HLSQ_HS_CNTL_CONSTLEN(align(state->hs->constlen, 4)))); + A6XX_HLSQ_HS_CNTL_CONSTLEN(state->hs->constlen))); OUT_RING(ring, COND(state->ds, A6XX_HLSQ_DS_CNTL_ENABLED | - A6XX_HLSQ_DS_CNTL_CONSTLEN(align(state->ds->constlen, 4)))); + A6XX_HLSQ_DS_CNTL_CONSTLEN(state->ds->constlen))); OUT_RING(ring, COND(state->gs, A6XX_HLSQ_GS_CNTL_ENABLED | - A6XX_HLSQ_GS_CNTL_CONSTLEN(align(state->gs->constlen, 4)))); + A6XX_HLSQ_GS_CNTL_CONSTLEN(state->gs->constlen))); OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1); - OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(state->fs->constlen, 4)) | + OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(state->fs->constlen) | A6XX_HLSQ_FS_CNTL_ENABLED); OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1); -- 2.30.2