From cfa1fb895ac5a752772f4d0748c1c2bce0c2e653 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Mon, 16 Dec 2019 17:17:38 +0100 Subject: [PATCH] a6xx: Add more CP packets And add fields uncovered by looking at the firmware. I think this covers all the memory, register, and scratch manipulation opcodes that exist on A6xx, plus one additional nice find for Vulkan and describing a previously unknown opcode and documenting CP_WAIT_REG_MEM. Note that the bits for the CP_REG_TO_MEM count, as well as the formula for computing the actual count for both CP_REG_TO_MEM and CP_MEM_TO_REG, are changed because the A630 SQE firmware actually does something different. I haven't investigated older microcodes to see whether this extends back to A5xx and A4xx, but the only non-A6xx uses of this field result in the same bit-pattern when using the A6xx bit range and formula, so it should be safe to change the definition universally. Reviewed-by: Kristian H. Kristensen Reviewed-by: Rob Clark Reviewed-by: Eric Anholt Part-of: --- src/freedreno/registers/adreno_pm4.xml | 291 +++++++++++++++++- src/freedreno/vulkan/tu_cmd_buffer.c | 10 +- .../drivers/freedreno/a4xx/fd4_query.c | 4 +- src/gallium/drivers/freedreno/a6xx/fd6_emit.c | 4 +- src/gallium/drivers/freedreno/a6xx/fd6_emit.h | 2 +- src/gallium/drivers/freedreno/a6xx/fd6_gmem.c | 10 +- .../drivers/freedreno/a6xx/fd6_query.c | 4 +- 7 files changed, 294 insertions(+), 31 deletions(-) diff --git a/src/freedreno/registers/adreno_pm4.xml b/src/freedreno/registers/adreno_pm4.xml index 3a7865b489d..533dcf0fb38 100644 --- a/src/freedreno/registers/adreno_pm4.xml +++ b/src/freedreno/registers/adreno_pm4.xml @@ -219,7 +219,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> load sequencer instruction memory (code embedded in packet) load constants from a location in memory - + selective invalidation of state pointers dynamically changes shader instruction memory partition @@ -266,7 +266,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> Load a buffer with pre-fetch enabled Set bin (?) - + test 2 memory locations to dword values specified @@ -310,7 +310,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> for A4xx Write to register with address that does not fit into type-0 pkt - + copy from ME scratch RAM to a register @@ -413,6 +413,15 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + Modifies DST_REG using two sources that can either be registers + or immediates. If SRC1_ADD is set, then do the following: + + $dst = (($dst & $src0) rot $rotate) + $src1 + + Otherwise: + + $dst = (($dst & $src0) rot $rotate) | $src1 + + Here "rot" means rotate left. + + + + + + + + + + + + + + + + - - + + @@ -849,13 +909,62 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + Like CP_REG_TO_MEM, but the memory address to write to can be + offsetted using either one or two registers or scratch + registers. + + + + + + + + + + + + + + + + + + + + + + + + Like CP_REG_TO_MEM, but the memory address to write to can be + offsetted using a DWORD in memory. + + + + + + + + + + + + + + + + + + + + + + - + @@ -880,6 +989,10 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -927,7 +1095,10 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + @@ -953,6 +1124,71 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + Wait until a memory value is greater than or equal to the + reference, using signed comparison. + + + + + + + + + + + + + + + + + + + This uses the same internal comparison as CP_COND_WRITE, + but waits until the comparison is true instead. It busy-loops in + the CP for the given number of cycles before trying again. + + + + + + + + + + + + + + + + + + + + + + + + + + + + Waits for REG0 to not be 0 or REG1 to not equal REF + + + + + + + + + + + + @@ -1201,7 +1437,8 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - + + @@ -1215,5 +1452,31 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + Executes the following DWORDs of commands if the dword at ADDR0 + is not equal to 0 and the dword at ADDR1 is less than REF + (signed comparison). + + + + + + + + + + + + + + + + + + + + + diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 49ea11acfaa..caa1a54af14 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -738,7 +738,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); tu_cs_emit(cs, 0x10000000); @@ -1124,7 +1124,7 @@ tu6_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs) seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true); - tu_cs_emit_pkt7(cs, CP_UNK_A6XX_14, 4); + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_GTE, 4); tu_cs_emit(cs, 0x00000000); tu_cs_emit_qw(cs, cmd->scratch_bo.iova); tu_cs_emit(cs, seqno); @@ -1217,7 +1217,7 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); tu_cs_emit(cs, 0x10000000); @@ -1231,7 +1231,7 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) */ tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) | - CP_REG_TO_MEM_0_CNT(1 - 1)); + CP_REG_TO_MEM_0_CNT(0)); tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_OVERFLOW); tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1); @@ -1401,7 +1401,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); tu_cs_emit(cs, 0x10000000); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c index bea63f4c28b..1f1ce8e8771 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c @@ -166,7 +166,7 @@ time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) OUT_PKT3(ring, CP_REG_TO_MEM, 2); OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */ + CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */ OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0); /* ok... here we really *would* like to use the CP_SET_CONSTANT @@ -188,7 +188,7 @@ time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) OUT_PKT3(ring, CP_REG_TO_MEM, 2); OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | CP_REG_TO_MEM_0_ACCUMULATE | - CP_REG_TO_MEM_0_CNT(1-1)); /* readback 1 regs */ + CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */ OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0); /* now copy that back to CP_ME_NRT_ADDR: */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c index 0f9b68b1c4b..9e4cbf0a978 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c @@ -754,7 +754,7 @@ fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit, struct ir3 OUT_PKT7(ring, CP_MEM_TO_REG, 3); OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) | CP_MEM_TO_REG_0_64B | CP_MEM_TO_REG_0_ACCUMULATE | - CP_MEM_TO_REG_0_CNT(1 - 1)); + CP_MEM_TO_REG_0_CNT(0)); OUT_RELOC(ring, control_ptr(fd6_context(ctx), flush_base[i].offset)); } @@ -1457,7 +1457,7 @@ fd6_framebuffer_barrier(struct fd_context *ctx) fd6_event_write(batch, ring, 0x31, false); - OUT_PKT7(ring, CP_UNK_A6XX_14, 4); + OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); OUT_RING(ring, 0x00000000); OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); OUT_RING(ring, seqno); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h index 7ca42adc372..8e2134aec36 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h @@ -186,7 +186,7 @@ fd6_cache_flush(struct fd_batch *batch, struct fd_ringbuffer *ring) seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); - OUT_PKT7(ring, CP_UNK_A6XX_14, 4); + OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); OUT_RING(ring, 0x00000000); OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); OUT_RING(ring, seqno); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c index c12c06905d5..a97f4742828 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c @@ -442,7 +442,7 @@ emit_vsc_overflow_test(struct fd_batch *batch) OUT_PKT7(ring, CP_MEM_TO_REG, 3); OUT_RING(ring, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) | - CP_MEM_TO_REG_0_CNT(1 - 1)); + CP_MEM_TO_REG_0_CNT(0)); OUT_RELOC(ring, control_ptr(fd6_ctx, vsc_scratch)); /* SRC_LO/HI */ /* @@ -461,7 +461,7 @@ emit_vsc_overflow_test(struct fd_batch *batch) OUT_PKT7(ring, CP_REG_TEST, 1); OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); OUT_PKT7(ring, CP_COND_REG_EXEC, 2); OUT_RING(ring, 0x10000000); @@ -568,7 +568,7 @@ emit_conditional_ib(struct fd_batch *batch, struct fd_tile *tile, OUT_PKT7(ring, CP_REG_TEST, 1); OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(tile->p)) | A6XX_CP_REG_TEST_0_BIT(tile->n) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); OUT_PKT7(ring, CP_COND_REG_EXEC, 2); OUT_RING(ring, 0x10000000); @@ -856,7 +856,7 @@ fd6_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) OUT_PKT7(ring, CP_REG_TEST, 1); OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); OUT_PKT7(ring, CP_COND_REG_EXEC, 2); OUT_RING(ring, 0x10000000); @@ -1332,7 +1332,7 @@ fd6_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile) OUT_PKT7(ring, CP_REG_TEST, 1); OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); OUT_PKT7(ring, CP_COND_REG_EXEC, 2); OUT_RING(ring, 0x10000000); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_query.c b/src/gallium/drivers/freedreno/a6xx/fd6_query.c index 29ec167f543..f58fff7b4ba 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_query.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_query.c @@ -325,7 +325,7 @@ primitives_generated_resume(struct fd_acc_query *aq, struct fd_batch *batch) OUT_PKT7(ring, CP_REG_TO_MEM, 3); OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(counter_count - 1) | + CP_REG_TO_MEM_0_CNT(counter_count) | CP_REG_TO_MEM_0_REG(counter_base)); primitives_relocw(ring, aq, prim_start); @@ -342,7 +342,7 @@ primitives_generated_pause(struct fd_acc_query *aq, struct fd_batch *batch) /* snapshot the end values: */ OUT_PKT7(ring, CP_REG_TO_MEM, 3); OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(counter_count - 1) | + CP_REG_TO_MEM_0_CNT(counter_count) | CP_REG_TO_MEM_0_REG(counter_base)); primitives_relocw(ring, aq, prim_stop); -- 2.30.2