X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeon%2Fr600_query.c;h=429ea6a956528ef8f551dc8915fc8bae1c843cbb;hb=e4b585f00954538a64d55daa6170507ee4f30126;hp=e6fda0c3bd6c28a5b90e90f18cb38e919c15e5c6;hpb=e607a6be2bd2507f704dd8f10013fe96f4b1b8da;p=mesa.git diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index e6fda0c3bd6..429ea6a9565 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -25,6 +25,17 @@ #include "r600_query.h" #include "r600_cs.h" #include "util/u_memory.h" +#include "util/u_upload_mgr.h" + +#include "tgsi/tgsi_text.h" + +struct r600_hw_query_params { + unsigned start_offset; + unsigned end_offset; + unsigned fence_offset; + unsigned pair_stride; + unsigned pair_count; +}; /* Queries without buffer handling or suspend/resume. */ struct r600_query_sw { @@ -51,9 +62,12 @@ static enum radeon_value_id winsys_id_from_type(unsigned type) switch (type) { case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY; case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY; + case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM; + case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT; case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS; - case R600_QUERY_NUM_CS_FLUSHES: return RADEON_NUM_CS_FLUSHES; + case R600_QUERY_NUM_CTX_FLUSHES: return RADEON_NUM_CS_FLUSHES; case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED; + case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS; case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE; case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE; case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE; @@ -87,18 +101,31 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx, case R600_QUERY_DMA_CALLS: query->begin_result = rctx->num_dma_calls; break; + case R600_QUERY_NUM_VS_FLUSHES: + query->begin_result = rctx->num_vs_flushes; + break; + case R600_QUERY_NUM_PS_FLUSHES: + query->begin_result = rctx->num_ps_flushes; + break; + case R600_QUERY_NUM_CS_FLUSHES: + query->begin_result = rctx->num_cs_flushes; + break; case R600_QUERY_REQUESTED_VRAM: case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_MAPPED_VRAM: + case R600_QUERY_MAPPED_GTT: case R600_QUERY_VRAM_USAGE: case R600_QUERY_GTT_USAGE: case R600_QUERY_GPU_TEMPERATURE: case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO: query->begin_result = 0; break; case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CS_FLUSHES: - case R600_QUERY_NUM_BYTES_MOVED: { + case R600_QUERY_NUM_CTX_FLUSHES: + case R600_QUERY_NUM_BYTES_MOVED: + case R600_QUERY_NUM_EVICTIONS: { enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); break; @@ -134,7 +161,7 @@ static bool r600_query_sw_end(struct r600_common_context *rctx, case PIPE_QUERY_TIMESTAMP_DISJOINT: break; case PIPE_QUERY_GPU_FINISHED: - rctx->b.flush(&rctx->b, &query->fence, 0); + rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED); break; case R600_QUERY_DRAW_CALLS: query->end_result = rctx->num_draw_calls; @@ -151,16 +178,28 @@ static bool r600_query_sw_end(struct r600_common_context *rctx, case R600_QUERY_DMA_CALLS: query->end_result = rctx->num_dma_calls; break; + case R600_QUERY_NUM_VS_FLUSHES: + query->end_result = rctx->num_vs_flushes; + break; + case R600_QUERY_NUM_PS_FLUSHES: + query->end_result = rctx->num_ps_flushes; + 
break; + case R600_QUERY_NUM_CS_FLUSHES: + query->end_result = rctx->num_cs_flushes; + break; case R600_QUERY_REQUESTED_VRAM: case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_MAPPED_VRAM: + case R600_QUERY_MAPPED_GTT: case R600_QUERY_VRAM_USAGE: case R600_QUERY_GTT_USAGE: case R600_QUERY_GPU_TEMPERATURE: case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CS_FLUSHES: - case R600_QUERY_NUM_BYTES_MOVED: { + case R600_QUERY_NUM_CTX_FLUSHES: + case R600_QUERY_NUM_BYTES_MOVED: + case R600_QUERY_NUM_EVICTIONS: { enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); query->end_result = rctx->ws->query_value(rctx->ws, ws_id); break; @@ -176,6 +215,9 @@ static bool r600_query_sw_end(struct r600_common_context *rctx, case R600_QUERY_NUM_SHADERS_CREATED: query->end_result = p_atomic_read(&rctx->screen->num_shaders_created); break; + case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + query->end_result = rctx->last_tex_ps_draw_ratio; + break; case R600_QUERY_GPIN_ASIC_ID: case R600_QUERY_GPIN_NUM_SIMD: case R600_QUERY_GPIN_NUM_RB: @@ -205,7 +247,7 @@ static bool r600_query_sw_get_result(struct r600_common_context *rctx, return true; case PIPE_QUERY_GPU_FINISHED: { struct pipe_screen *screen = rctx->b.screen; - result->b = screen->fence_finish(screen, query->fence, + result->b = screen->fence_finish(screen, &rctx->b, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0); return result->b; } @@ -243,11 +285,13 @@ static bool r600_query_sw_get_result(struct r600_common_context *rctx, return true; } + static struct r600_query_ops sw_query_ops = { .destroy = r600_query_sw_destroy, .begin = r600_query_sw_begin, .end = r600_query_sw_end, - .get_result = r600_query_sw_get_result + .get_result = r600_query_sw_get_result, + .get_result_resource = NULL }; static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx, @@ -287,7 +331,7 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c struct r600_query_hw *query) { unsigned buf_size = MAX2(query->result_size, - ctx->screen->info.gart_page_size); + ctx->screen->info.min_alloc_size); /* Queries are normally read by the CPU after * being written by the gpu, hence staging is probably a good @@ -299,11 +343,9 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c if (!buf) return NULL; - if (query->flags & R600_QUERY_HW_FLAG_PREDICATE) { - if (!query->ops->prepare_buffer(ctx, query, buf)) { - r600_resource_reference(&buf, NULL); - return NULL; - } + if (!query->ops->prepare_buffer(ctx, query, buf)) { + r600_resource_reference(&buf, NULL); + return NULL; } return buf; @@ -328,7 +370,7 @@ static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx, unsigned i, j; /* Set top bits for unused backends. 
*/
-	num_results = buffer->b.b.width0 / (16 * ctx->max_db);
+	num_results = buffer->b.b.width0 / query->result_size;
 	for (j = 0; j < num_results; j++) {
 		for (i = 0; i < ctx->max_db; i++) {
 			if (!(ctx->backend_mask & (1<<i))) {
@@ -343,12 +385,21 @@ static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
 	return true;
 }
 
+static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
+					      struct r600_query *rquery,
+					      bool wait,
+					      enum pipe_query_value_type result_type,
+					      int index,
+					      struct pipe_resource *resource,
+					      unsigned offset);
+
 static struct r600_query_ops query_hw_ops = {
 	.destroy = r600_query_hw_destroy,
 	.begin = r600_query_hw_begin,
 	.end = r600_query_hw_end,
-	.get_result = r600_query_hw_get_result
+	.get_result = r600_query_hw_get_result,
+	.get_result_resource = r600_query_hw_get_result_resource,
 };
 
 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
@@ -398,18 +449,18 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
 	case PIPE_QUERY_OCCLUSION_COUNTER:
 	case PIPE_QUERY_OCCLUSION_PREDICATE:
 		query->result_size = 16 * rctx->max_db;
+		query->result_size += 16; /* for the fence + alignment */
 		query->num_cs_dw_begin = 6;
-		query->num_cs_dw_end = 6;
-		query->flags |= R600_QUERY_HW_FLAG_PREDICATE;
+		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
-		query->result_size = 16;
+		query->result_size = 24;
 		query->num_cs_dw_begin = 8;
-		query->num_cs_dw_end = 8;
+		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
 		break;
 	case PIPE_QUERY_TIMESTAMP:
-		query->result_size = 8;
-		query->num_cs_dw_end = 8;
+		query->result_size = 16;
+		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
 		query->flags = R600_QUERY_HW_FLAG_NO_START;
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
@@ -421,13 +472,13 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
 		query->num_cs_dw_begin = 6;
 		query->num_cs_dw_end = 6;
 		query->stream = index;
-		query->flags |= R600_QUERY_HW_FLAG_PREDICATE;
 		break;
 	case PIPE_QUERY_PIPELINE_STATISTICS:
 		/* 11 values on EG, 8 on R600. */
 		query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
+		query->result_size += 8; /* for the fence + alignment */
 		query->num_cs_dw_begin = 6;
-		query->num_cs_dw_end = 6;
+		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
 		break;
 	default:
 		assert(0);
@@ -574,6 +625,9 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
 		radeon_emit(cs, va);
 		radeon_emit(cs, (va >> 32) & 0xFFFF);
+
+		va += ctx->max_db * 16 - 8;
+		r600_gfx_write_fence(ctx, va, 0, 0x80000000);
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
 	case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -586,7 +640,7 @@
 		radeon_emit(cs, (va >> 32) & 0xFFFF);
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
-		va += query->result_size/2;
+		va += 8;
 		/* fall through */
 	case PIPE_QUERY_TIMESTAMP:
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
@@ -595,14 +649,23 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
 		radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF));
 		radeon_emit(cs, 0);
 		radeon_emit(cs, 0);
+
+		va += 8;
+		r600_gfx_write_fence(ctx, va, 0, 0x80000000);
 		break;
-	case PIPE_QUERY_PIPELINE_STATISTICS:
-		va += query->result_size/2;
+	case PIPE_QUERY_PIPELINE_STATISTICS: {
+		unsigned sample_size = (query->result_size - 8) / 2;
+
+		va += sample_size;
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
 		radeon_emit(cs, va);
 		radeon_emit(cs, (va >> 32) & 0xFFFF);
+
+		va += sample_size;
+		r600_gfx_write_fence(ctx, va, 0, 0x80000000);
 		break;
+	}
 	default:
 		assert(0);
 	}
@@ -724,8 +787,8 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 	return rquery->ops->begin(rctx, rquery);
 }
 
-static void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
-					struct r600_query_hw *query)
+void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
+				 struct r600_query_hw *query)
 {
 	struct r600_query_buffer *prev = query->buffer.previous;
 
@@ -740,16 +803,14 @@ static void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
 	query->buffer.results_end = 0;
 	query->buffer.previous = NULL;
 
-	if (query->flags & 
R600_QUERY_HW_FLAG_PREDICATE) { - /* Obtain a new buffer if the current one can't be mapped without a stall. */ - if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) || - !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Obtain a new buffer if the current one can't be mapped without a stall. */ + if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { + r600_resource_reference(&query->buffer.buf, NULL); + query->buffer.buf = r600_new_query_buffer(rctx, query); + } else { + if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf)) r600_resource_reference(&query->buffer.buf, NULL); - query->buffer.buf = r600_new_query_buffer(rctx, query); - } else { - if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf)) - r600_resource_reference(&query->buffer.buf, NULL); - } } } @@ -801,6 +862,61 @@ bool r600_query_hw_end(struct r600_common_context *rctx, return true; } +static void r600_get_hw_query_params(struct r600_common_context *rctx, + struct r600_query_hw *rquery, int index, + struct r600_hw_query_params *params) +{ + params->pair_stride = 0; + params->pair_count = 1; + + switch (rquery->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = rctx->max_db * 16; + params->pair_stride = 16; + params->pair_count = rctx->max_db; + break; + case PIPE_QUERY_TIME_ELAPSED: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = 16; + break; + case PIPE_QUERY_TIMESTAMP: + params->start_offset = 0; + params->end_offset = 0; + params->fence_offset = 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + params->start_offset = 8; + params->end_offset = 24; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + params->start_offset = 0; + params->end_offset = 16; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_SO_STATISTICS: + params->start_offset = 8 - index * 8; + params->end_offset = 24 - index * 8; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + { + /* Offsets apply to EG+ */ + static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; + params->start_offset = offsets[index]; + params->end_offset = 88 + offsets[index]; + params->fence_offset = 2 * 88; + break; + } + default: + unreachable("r600_get_hw_query_params unsupported"); + } +} + static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index, bool test_status_bit) { @@ -826,20 +942,18 @@ static void r600_query_hw_add_result(struct r600_common_context *ctx, { switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: { - unsigned results_base = 0; - while (results_base != query->result_size) { + for (unsigned i = 0; i < ctx->max_db; ++i) { + unsigned results_base = i * 16; result->u64 += r600_query_read_result(buffer + results_base, 0, 2, true); - results_base += 16; } break; } case PIPE_QUERY_OCCLUSION_PREDICATE: { - unsigned results_base = 0; - while (results_base != query->result_size) { + for (unsigned i = 0; i < ctx->max_db; ++i) { + unsigned results_base = i * 16; result->b = result->b || r600_query_read_result(buffer + results_base, 0, 2, true) != 0; - results_base += 16; } break; } @@ -847,12 +961,8 @@ static void r600_query_hw_add_result(struct 
r600_common_context *ctx, result->u64 += r600_query_read_result(buffer, 0, 2, false); break; case PIPE_QUERY_TIMESTAMP: - { - uint32_t *current_result = (uint32_t*)buffer; - result->u64 = (uint64_t)current_result[0] | - (uint64_t)current_result[1] << 32; + result->u64 = *(uint64_t*)buffer; break; - } case PIPE_QUERY_PRIMITIVES_EMITTED: /* SAMPLE_STREAMOUTSTATS stores this structure: * { @@ -951,6 +1061,21 @@ static boolean r600_get_query_result(struct pipe_context *ctx, return rquery->ops->get_result(rctx, rquery, wait, result); } +static void r600_get_query_result_resource(struct pipe_context *ctx, + struct pipe_query *query, + boolean wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index, + resource, offset); +} + static void r600_query_hw_clear_result(struct r600_query_hw *query, union pipe_query_result *result) { @@ -991,6 +1116,365 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx, return true; } +/* Create the compute shader that is used to collect the results. + * + * One compute grid with a single thread is launched for every query result + * buffer. The thread (optionally) reads a previous summary buffer, then + * accumulates data from the query result buffer, and writes the result either + * to a summary buffer to be consumed by the next grid invocation or to the + * user-supplied buffer. + * + * Data layout: + * + * CONST + * 0.x = end_offset + * 0.y = result_stride + * 0.z = result_count + * 0.w = bit field: + * 1: read previously accumulated values + * 2: write accumulated values for chaining + * 4: write result available + * 8: convert result to boolean (0/1) + * 16: only read one dword and use that as result + * 32: apply timestamp conversion + * 64: store full 64 bits result + * 128: store signed 32 bits result + * 1.x = fence_offset + * 1.y = pair_stride + * 1.z = pair_count + * + * BUFFER[0] = query result buffer + * BUFFER[1] = previous summary buffer + * BUFFER[2] = next summary buffer or user-supplied buffer + */ +static void r600_create_query_result_shader(struct r600_common_context *rctx) +{ + /* TEMP[0].xy = accumulated result so far + * TEMP[0].z = result not available + * + * TEMP[1].x = current result index + * TEMP[1].y = current pair index + */ + static const char text_tmpl[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0..1]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ + + "AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n" + "UIF TEMP[5]\n" + /* Check result availability. */ + "LOAD TEMP[1].x, BUFFER[0], CONST[1].xxxx\n" + "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" + "MOV TEMP[1], TEMP[0].zzzz\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + + /* Load result if available. */ + "UIF TEMP[1]\n" + "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Load previously accumulated result if requested. 
*/ + "MOV TEMP[0], IMM[0].xxxx\n" + "AND TEMP[4], CONST[0].wwww, IMM[1].xxxx\n" + "UIF TEMP[4]\n" + "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + "MOV TEMP[1].x, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Break if accumulated result so far is not available. */ + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + /* Break if result_index >= result_count. */ + "USGE TEMP[5], TEMP[1].xxxx, CONST[0].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + + /* Load fence and check result availability */ + "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy, CONST[1].xxxx\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Load start and end. */ + "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy\n" + "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[1].yyyy, TEMP[5].xxxx\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + + "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].xxxx\n" + + "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" + "U64ADD TEMP[0].xy, TEMP[0], TEMP[3]\n" + + /* Increment pair index */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" + "USGE TEMP[5], TEMP[1].yyyy, CONST[1].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "ENDLOOP\n" + + /* Increment result index */ + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" + "ENDLOOP\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[1].yyyy\n" + "UIF TEMP[4]\n" + /* Store accumulated data for chaining. */ + "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + "AND TEMP[4], CONST[0].wwww, IMM[1].zzzz\n" + "UIF TEMP[4]\n" + /* Store result availability. */ + "NOT TEMP[0].z, TEMP[0]\n" + "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Store result if it is available. */ + "NOT TEMP[4], TEMP[0].zzzz\n" + "UIF TEMP[4]\n" + /* Apply timestamp conversion */ + "AND TEMP[4], CONST[0].wwww, IMM[2].yyyy\n" + "UIF TEMP[4]\n" + "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" + "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" + "ENDIF\n" + + /* Convert to boolean */ + "AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n" + "UIF TEMP[4]\n" + "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n" + "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" + "ELSE\n" + /* Clamping */ + "UIF TEMP[0].yyyy\n" + "MOV TEMP[0].x, IMM[0].wwww\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].wwww\n" + "UIF TEMP[4]\n" + "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" + "ENDIF\n" + + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + char text[sizeof(text_tmpl) + 32]; + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + /* Hard code the frequency into the shader so that the backend can + * use the full range of optimizations for divide-by-constant. 
+ */ + snprintf(text, sizeof(text), text_tmpl, + rctx->screen->info.clock_crystal_freq); + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state); +} + +static void r600_restore_qbo_state(struct r600_common_context *rctx, + struct r600_qbo_state *st) +{ + rctx->b.bind_compute_state(&rctx->b, st->saved_compute); + + rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + pipe_resource_reference(&st->saved_const0.buffer, NULL); + + rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); + for (unsigned i = 0; i < 3; ++i) + pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); +} + +static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + struct r600_query_buffer *qbuf; + struct r600_query_buffer *qbuf_prev; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + struct r600_qbo_state saved_state = {}; + struct pipe_grid_info grid = {}; + struct pipe_constant_buffer constant_buffer = {}; + struct pipe_shader_buffer ssbo[3]; + struct r600_hw_query_params params; + struct { + uint32_t end_offset; + uint32_t result_stride; + uint32_t result_count; + uint32_t config; + uint32_t fence_offset; + uint32_t pair_stride; + uint32_t pair_count; + } consts; + + if (!rctx->query_result_shader) { + r600_create_query_result_shader(rctx); + if (!rctx->query_result_shader) + return; + } + + if (query->buffer.previous) { + u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16, + &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } + + rctx->save_qbo_state(&rctx->b, &saved_state); + + r600_get_hw_query_params(rctx, query, index >= 0 ? 
index : 0, &params);
+	consts.end_offset = params.end_offset - params.start_offset;
+	consts.fence_offset = params.fence_offset - params.start_offset;
+	consts.result_stride = query->result_size;
+	consts.pair_stride = params.pair_stride;
+	consts.pair_count = params.pair_count;
+
+	constant_buffer.buffer_size = sizeof(consts);
+	constant_buffer.user_buffer = &consts;
+
+	ssbo[1].buffer = tmp_buffer;
+	ssbo[1].buffer_offset = tmp_buffer_offset;
+	ssbo[1].buffer_size = 16;
+
+	ssbo[2] = ssbo[1];
+
+	rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);
+
+	grid.block[0] = 1;
+	grid.block[1] = 1;
+	grid.block[2] = 1;
+	grid.grid[0] = 1;
+	grid.grid[1] = 1;
+	grid.grid[2] = 1;
+
+	consts.config = 0;
+	if (index < 0)
+		consts.config |= 4;
+	if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+	    query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
+		consts.config |= 8;
+	else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
+		 query->b.type == PIPE_QUERY_TIME_ELAPSED)
+		consts.config |= 32;
+
+	switch (result_type) {
+	case PIPE_QUERY_TYPE_U64:
+	case PIPE_QUERY_TYPE_I64:
+		consts.config |= 64;
+		break;
+	case PIPE_QUERY_TYPE_I32:
+		consts.config |= 128;
+		break;
+	case PIPE_QUERY_TYPE_U32:
+		break;
+	}
+
+	rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;
+
+	for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
+		if (query->b.type != PIPE_QUERY_TIMESTAMP) {
+			qbuf_prev = qbuf->previous;
+			consts.result_count = qbuf->results_end / query->result_size;
+			consts.config &= ~3;
+			if (qbuf != &query->buffer)
+				consts.config |= 1;
+			if (qbuf->previous)
+				consts.config |= 2;
+		} else {
+			/* Only read the last timestamp. */
+			qbuf_prev = NULL;
+			consts.result_count = 0;
+			consts.config |= 16;
+			params.start_offset += qbuf->results_end - query->result_size;
+		}
+
+		rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+
+		ssbo[0].buffer = &qbuf->buf->b.b;
+		ssbo[0].buffer_offset = params.start_offset;
+		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
+
+		if (!qbuf->previous) {
+			ssbo[2].buffer = resource;
+			ssbo[2].buffer_offset = offset;
+			ssbo[2].buffer_size = 8;
+
+			((struct r600_resource *)resource)->TC_L2_dirty = true;
+		}
+
+		rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
+
+		if (wait && qbuf == &query->buffer) {
+			uint64_t va;
+
+			/* Wait for result availability. Wait only for readiness
+			 * of the last entry, since the fence writes should be
+			 * serialized in the CP. 
+ */ + va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; + va += params.fence_offset; + + r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000); + } + + rctx->b.launch_grid(&rctx->b, &grid); + rctx->flags |= rctx->screen->barrier_flags.compute_to_L2; + } + + r600_restore_qbo_state(rctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); +} + static void r600_render_condition(struct pipe_context *ctx, struct pipe_query *query, boolean condition, @@ -1169,13 +1653,20 @@ static struct pipe_driver_query_info r600_driver_query_list[] = { X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), X("dma-calls", DMA_CALLS, UINT64, AVERAGE), + X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), + X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), + X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), + X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), + X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), - X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), + X("num-ctx-flushes", NUM_CTX_FLUSHES, UINT64, AVERAGE), X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), + X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE), X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), + X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), /* GPIN queries are for the benefit of old versions of GPUPerfStudio, * which use it as a fallback path to detect the GPU type. @@ -1232,10 +1723,12 @@ static int r600_get_driver_query_info(struct pipe_screen *screen, switch (info->query_type) { case R600_QUERY_REQUESTED_VRAM: case R600_QUERY_VRAM_USAGE: + case R600_QUERY_MAPPED_VRAM: info->max_value.u64 = rscreen->info.vram_size; break; case R600_QUERY_REQUESTED_GTT: case R600_QUERY_GTT_USAGE: + case R600_QUERY_MAPPED_GTT: info->max_value.u64 = rscreen->info.gart_size; break; case R600_QUERY_GPU_TEMPERATURE: @@ -1287,6 +1780,7 @@ void r600_query_init(struct r600_common_context *rctx) rctx->b.begin_query = r600_begin_query; rctx->b.end_query = r600_end_query; rctx->b.get_query_result = r600_get_query_result; + rctx->b.get_query_result_resource = r600_get_query_result_resource; rctx->render_cond_atom.emit = r600_emit_query_predication; if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)