#include "os/os_time.h"
#include "tgsi/tgsi_text.h"
+#define R600_MAX_STREAMS 4
+
struct r600_hw_query_params {
unsigned start_offset;
unsigned end_offset;
case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
+ case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
+ case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
case R600_QUERY_DRAW_CALLS:
query->begin_result = rctx->num_draw_calls;
break;
+ case R600_QUERY_DECOMPRESS_CALLS:
+ query->begin_result = rctx->num_decompress_calls;
+ break;
+ case R600_QUERY_MRT_DRAW_CALLS:
+ query->begin_result = rctx->num_mrt_draw_calls;
+ break;
case R600_QUERY_PRIM_RESTART_CALLS:
query->begin_result = rctx->num_prim_restart_calls;
break;
case R600_QUERY_NUM_CS_FLUSHES:
query->begin_result = rctx->num_cs_flushes;
break;
- case R600_QUERY_NUM_FB_CACHE_FLUSHES:
- query->begin_result = rctx->num_fb_cache_flushes;
+ case R600_QUERY_NUM_CB_CACHE_FLUSHES:
+ query->begin_result = rctx->num_cb_cache_flushes;
+ break;
+ case R600_QUERY_NUM_DB_CACHE_FLUSHES:
+ query->begin_result = rctx->num_db_cache_flushes;
break;
case R600_QUERY_NUM_L2_INVALIDATES:
query->begin_result = rctx->num_L2_invalidates;
break;
case R600_QUERY_NUM_L2_WRITEBACKS:
query->begin_result = rctx->num_L2_writebacks;
break;
+ case R600_QUERY_NUM_RESIDENT_HANDLES:
+ query->begin_result = rctx->num_resident_handles;
+ break;
case R600_QUERY_TC_OFFLOADED_SLOTS:
query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
break;
case R600_QUERY_NUM_GFX_IBS:
case R600_QUERY_NUM_SDMA_IBS:
case R600_QUERY_NUM_BYTES_MOVED:
- case R600_QUERY_NUM_EVICTIONS: {
+ case R600_QUERY_NUM_EVICTIONS:
+ case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
break;
}
+ case R600_QUERY_GFX_BO_LIST_SIZE:
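+ /* begin_time is reused here to store the current GFX IB count. */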
+ ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
+ query->begin_time = rctx->ws->query_value(rctx->ws,
+ RADEON_NUM_GFX_IBS);
+ break;
case R600_QUERY_CS_THREAD_BUSY:
ws_id = winsys_id_from_type(query->b.type);
query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
query->begin_time = os_time_get_nano();
break;
+ case R600_QUERY_GALLIUM_THREAD_BUSY:
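+ /* Busy time of thread 0 of the threaded-context queue. */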
+ query->begin_result =
+ rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
+ query->begin_time = os_time_get_nano();
+ break;
case R600_QUERY_GPU_LOAD:
case R600_QUERY_GPU_SHADERS_BUSY:
case R600_QUERY_GPU_TA_BUSY:
case R600_QUERY_GPU_MEQ_BUSY:
case R600_QUERY_GPU_ME_BUSY:
case R600_QUERY_GPU_SURF_SYNC_BUSY:
- case R600_QUERY_GPU_DMA_BUSY:
+ case R600_QUERY_GPU_CP_DMA_BUSY:
case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
- case R600_QUERY_GPU_CE_BUSY:
query->begin_result = r600_begin_counter(rctx->screen,
query->b.type);
break;
case R600_QUERY_DRAW_CALLS:
query->end_result = rctx->num_draw_calls;
break;
+ case R600_QUERY_DECOMPRESS_CALLS:
+ query->end_result = rctx->num_decompress_calls;
+ break;
+ case R600_QUERY_MRT_DRAW_CALLS:
+ query->end_result = rctx->num_mrt_draw_calls;
+ break;
case R600_QUERY_PRIM_RESTART_CALLS:
query->end_result = rctx->num_prim_restart_calls;
break;
case R600_QUERY_NUM_CS_FLUSHES:
query->end_result = rctx->num_cs_flushes;
break;
- case R600_QUERY_NUM_FB_CACHE_FLUSHES:
- query->end_result = rctx->num_fb_cache_flushes;
+ case R600_QUERY_NUM_CB_CACHE_FLUSHES:
+ query->end_result = rctx->num_cb_cache_flushes;
+ break;
+ case R600_QUERY_NUM_DB_CACHE_FLUSHES:
+ query->end_result = rctx->num_db_cache_flushes;
break;
case R600_QUERY_NUM_L2_INVALIDATES:
query->end_result = rctx->num_L2_invalidates;
break;
case R600_QUERY_NUM_L2_WRITEBACKS:
query->end_result = rctx->num_L2_writebacks;
break;
+ case R600_QUERY_NUM_RESIDENT_HANDLES:
+ query->end_result = rctx->num_resident_handles;
+ break;
case R600_QUERY_TC_OFFLOADED_SLOTS:
query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
break;
case R600_QUERY_NUM_GFX_IBS:
case R600_QUERY_NUM_SDMA_IBS:
case R600_QUERY_NUM_BYTES_MOVED:
- case R600_QUERY_NUM_EVICTIONS: {
+ case R600_QUERY_NUM_EVICTIONS:
+ case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
break;
}
+ case R600_QUERY_GFX_BO_LIST_SIZE:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
+ query->end_time = rctx->ws->query_value(rctx->ws,
+ RADEON_NUM_GFX_IBS);
+ break;
case R600_QUERY_CS_THREAD_BUSY:
ws_id = winsys_id_from_type(query->b.type);
query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
query->end_time = os_time_get_nano();
break;
+ case R600_QUERY_GALLIUM_THREAD_BUSY:
+ query->end_result =
+ rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
+ query->end_time = os_time_get_nano();
+ break;
case R600_QUERY_GPU_LOAD:
case R600_QUERY_GPU_SHADERS_BUSY:
case R600_QUERY_GPU_TA_BUSY:
case R600_QUERY_GPU_MEQ_BUSY:
case R600_QUERY_GPU_ME_BUSY:
case R600_QUERY_GPU_SURF_SYNC_BUSY:
- case R600_QUERY_GPU_DMA_BUSY:
+ case R600_QUERY_GPU_CP_DMA_BUSY:
case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
- case R600_QUERY_GPU_CE_BUSY:
query->end_result = r600_end_counter(rctx->screen,
query->b.type,
query->begin_result);
return result->b;
}
+ case R600_QUERY_GFX_BO_LIST_SIZE:
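+ /* begin/end_time hold GFX IB counts, so this is the average BO list
+ * size per IB over the sampled interval. */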
+ result->u64 = (query->end_result - query->begin_result) /
+ (query->end_time - query->begin_time);
+ return true;
case R600_QUERY_CS_THREAD_BUSY:
+ case R600_QUERY_GALLIUM_THREAD_BUSY:
result->u64 = (query->end_result - query->begin_result) * 100 /
(query->end_time - query->begin_time);
return true;
}
r600_resource_reference(&query->buffer.buf, NULL);
+ r600_resource_reference(&query->workaround_buf, NULL);
FREE(rquery);
}
query->num_cs_dw_end = 6;
query->stream = index;
break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ /* NumPrimitivesWritten, PrimitiveStorageNeeded: one 32-byte
+ * begin/end pair per stream. */
+ query->result_size = 32 * R600_MAX_STREAMS;
+ query->num_cs_dw_begin = 6 * R600_MAX_STREAMS;
+ query->num_cs_dw_end = 6 * R600_MAX_STREAMS;
+ break;
case PIPE_QUERY_PIPELINE_STATISTICS:
/* 11 values on EG, 8 on R600. */
query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16;
}
}
-static unsigned event_type_for_stream(struct r600_query_hw *query)
+static unsigned event_type_for_stream(unsigned stream)
{
- switch (query->stream) {
+ switch (stream) {
default:
case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
}
}
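+/* Emit an EVENT_WRITE packet that samples the streamout statistics of
+ * the given stream into memory at va. */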
+static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va,
+ unsigned stream)
+{
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+}
+
static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
struct r600_query_hw *query,
struct r600_resource *buffer,
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
- radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
+ emit_sample_streamout(cs, va, query->stream);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
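+ /* Sample every stream; each one occupies 32 bytes of the result. */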
+ for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
+ emit_sample_streamout(cs, va + 32 * stream, stream);
break;
case PIPE_QUERY_TIME_ELAPSED:
if (ctx->chip_class >= SI) {
* (bottom-of-pipe)
*/
r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
- 0, 3, NULL, va, 0, 0);
+ 0, EOP_DATA_SEL_TIMESTAMP,
+ NULL, va, 0, query->b.type);
}
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- va += query->result_size/2;
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
- radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
+ va += 16;
+ emit_sample_streamout(cs, va, query->stream);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ va += 16;
+ for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
+ emit_sample_streamout(cs, va + 32 * stream, stream);
break;
case PIPE_QUERY_TIME_ELAPSED:
va += 8;
/* fall through */
case PIPE_QUERY_TIMESTAMP:
r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
- 0, 3, NULL, va, 0, 0);
+ 0, EOP_DATA_SEL_TIMESTAMP, NULL, va,
+ 0, query->b.type);
fence_va = va + 8;
break;
case PIPE_QUERY_PIPELINE_STATISTICS: {
RADEON_PRIO_QUERY);
if (fence_va)
- r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
- query->buffer.buf, fence_va, 0, 0x80000000);
+ r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DATA_SEL_VALUE_32BIT,
+ query->buffer.buf, fence_va, 0x80000000,
+ query->b.type);
}
static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
r600_update_prims_generated_query_state(ctx, query->b.type, -1);
}
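+/* Emit a single SET_PREDICATION packet. GFX9 uses a wider encoding that
+ * carries the operation in its own dword; older chips pack it into the
+ * high VA dword. */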
+static void emit_set_predicate(struct r600_common_context *ctx,
+ struct r600_resource *buf, uint64_t va,
+ uint32_t op)
+{
+ struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+ if (ctx->chip_class >= GFX9) {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+ radeon_emit(cs, va);
+ radeon_emit(cs, op | ((va >> 32) & 0xFF));
+ }
+ r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_READ,
+ RADEON_PRIO_QUERY);
+}
+
static void r600_emit_query_predication(struct r600_common_context *ctx,
struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = ctx->gfx.cs;
struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
struct r600_query_buffer *qbuf;
uint32_t op;
- bool flag_wait;
+ bool flag_wait, invert;
if (!query)
return;
+ invert = ctx->render_cond_invert;
flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
- switch (query->b.type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- op = PRED_OP(PREDICATION_OP_ZPASS);
- break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- case PIPE_QUERY_SO_STATISTICS:
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
- break;
- default:
- assert(0);
- return;
+ if (query->workaround_buf) {
+ op = PRED_OP(PREDICATION_OP_BOOL64);
+ } else {
+ switch (query->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ op = PRED_OP(PREDICATION_OP_ZPASS);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
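+ /* PRIMCOUNT predication draws when no overflow occurred, which is
+ * the inverse of these queries, so flip the inversion. */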
+ op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+ invert = !invert;
+ break;
+ default:
+ assert(0);
+ return;
+ }
}
/* if true then invert, see GL_ARB_conditional_render_inverted */
- if (ctx->render_cond_invert)
- op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */
+ if (invert)
+ op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
else
- op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */
+ op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+
+ /* Use the value written by the compute shader as a workaround. Note that
+ * the wait flag does not apply in this predication mode.
+ *
+ * The shader outputs the result value to L2. The workaround only applies
+ * to VI and later, where the CP reads data from L2, so no additional
+ * flush is needed.
+ */
+ if (query->workaround_buf) {
+ uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
+ emit_set_predicate(ctx, query->workaround_buf, va, op);
+ return;
+ }
op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
/* emit predicate packets for all data blocks */
for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
unsigned results_base = 0;
while (results_base < qbuf->results_end) {
uint64_t va = va_base + results_base;
- if (ctx->chip_class >= GFX9) {
- radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
- radeon_emit(cs, op);
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
+ if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+ for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
+ emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+
+ /* set CONTINUE bit for all packets except the first */
+ op |= PREDICATION_CONTINUE;
+ }
} else {
- radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
- radeon_emit(cs, va);
- radeon_emit(cs, op | ((va >> 32) & 0xFF));
+ emit_set_predicate(ctx, qbuf->buf, va, op);
+ op |= PREDICATION_CONTINUE;
}
- r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
- RADEON_PRIO_QUERY);
- results_base += query->result_size;
- /* set CONTINUE bit for all packets except the first */
- op |= PREDICATION_CONTINUE;
+ results_base += query->result_size;
}
}
}
if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
r600_query_hw_reset_buffers(rctx, query);
+ r600_resource_reference(&query->workaround_buf, NULL);
+
r600_query_hw_emit_start(rctx, query);
if (!query->buffer.buf)
return false;
params->end_offset = 24 - index * 8;
params->fence_offset = params->end_offset + 4;
break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ params->pair_count = R600_MAX_STREAMS;
+ params->pair_stride = 32;
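+ /* fall through */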
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ params->start_offset = 0;
+ params->end_offset = 16;
+
+ /* We can re-use the high dword of the last 64-bit value as a
+ * fence: it is initialized as 0, and the high bit is set by
+ * the write of the streamout stats event.
+ */
+ params->fence_offset = rquery->result_size - 4;
+ break;
case PIPE_QUERY_PIPELINE_STATISTICS:
{
/* Offsets apply to EG+ */
r600_query_read_result(buffer, 2, 6, true) !=
r600_query_read_result(buffer, 0, 4, true);
break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
+ result->b = result->b ||
+ r600_query_read_result(buffer, 2, 6, true) !=
+ r600_query_read_result(buffer, 0, 4, true);
+ buffer = (char *)buffer + 32;
+ }
+ break;
case PIPE_QUERY_PIPELINE_STATISTICS:
if (rscreen->chip_class >= EVERGREEN) {
result->pipeline_statistics.ps_invocations +=
* 32: apply timestamp conversion
* 64: store full 64 bits result
* 128: store signed 32 bits result
+ * 256: SO_OVERFLOW mode: take the difference of two successive half-pairs
* 1.x = fence_offset
* 1.y = pair_stride
* 1.z = pair_count
"DCL BUFFER[0]\n"
"DCL BUFFER[1]\n"
"DCL BUFFER[2]\n"
- "DCL CONST[0..1]\n"
+ "DCL CONST[0][0..1]\n"
"DCL TEMP[0..5]\n"
"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
"IMM[1] UINT32 {1, 2, 4, 8}\n"
"IMM[2] UINT32 {16, 32, 64, 128}\n"
"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
+ "IMM[4] UINT32 {256, 0, 0, 0}\n"
- "AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n"
+ "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
"UIF TEMP[5]\n"
/* Check result availability. */
- "LOAD TEMP[1].x, BUFFER[0], CONST[1].xxxx\n"
+ "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
"MOV TEMP[1], TEMP[0].zzzz\n"
"NOT TEMP[0].z, TEMP[0].zzzz\n"
"ELSE\n"
/* Load previously accumulated result if requested. */
"MOV TEMP[0], IMM[0].xxxx\n"
- "AND TEMP[4], CONST[0].wwww, IMM[1].xxxx\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
"UIF TEMP[4]\n"
"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
"ENDIF\n"
"ENDIF\n"
/* Break if result_index >= result_count. */
- "USGE TEMP[5], TEMP[1].xxxx, CONST[0].zzzz\n"
+ "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
"UIF TEMP[5]\n"
"BRK\n"
"ENDIF\n"
/* Load fence and check result availability */
- "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy, CONST[1].xxxx\n"
+ "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
"NOT TEMP[0].z, TEMP[0].zzzz\n"
"MOV TEMP[1].y, IMM[0].xxxx\n"
"BGNLOOP\n"
/* Load start and end. */
- "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy\n"
- "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[1].yyyy, TEMP[5].xxxx\n"
+ "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
+ "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
- "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0].xxxx\n"
- "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].xxxx\n"
+ "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
+ "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
- "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
- "U64ADD TEMP[0].xy, TEMP[0], TEMP[3]\n"
+ "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
+
+ "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
+ "UIF TEMP[5].zzzz\n"
+ /* Load second start/end half-pair and
+ * take the difference
+ */
+ "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
+ "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+ "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+ "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
+ "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
+ "ENDIF\n"
+
+ "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
/* Increment pair index */
"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
- "USGE TEMP[5], TEMP[1].yyyy, CONST[1].zzzz\n"
+ "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
"UIF TEMP[5]\n"
"BRK\n"
"ENDIF\n"
"ENDLOOP\n"
"ENDIF\n"
- "AND TEMP[4], CONST[0].wwww, IMM[1].yyyy\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
"UIF TEMP[4]\n"
/* Store accumulated data for chaining. */
"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
"ELSE\n"
- "AND TEMP[4], CONST[0].wwww, IMM[1].zzzz\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
"UIF TEMP[4]\n"
/* Store result availability. */
"NOT TEMP[0].z, TEMP[0]\n"
"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
- "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
"UIF TEMP[4]\n"
"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
"ENDIF\n"
"NOT TEMP[4], TEMP[0].zzzz\n"
"UIF TEMP[4]\n"
/* Apply timestamp conversion */
- "AND TEMP[4], CONST[0].wwww, IMM[2].yyyy\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
"UIF TEMP[4]\n"
"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
"ENDIF\n"
/* Convert to boolean */
- "AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
"UIF TEMP[4]\n"
- "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n"
+ "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
"MOV TEMP[0].y, IMM[0].xxxx\n"
"ENDIF\n"
- "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
"UIF TEMP[4]\n"
"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
"ELSE\n"
"MOV TEMP[0].x, IMM[0].wwww\n"
"ENDIF\n"
- "AND TEMP[4], CONST[0].wwww, IMM[2].wwww\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
"UIF TEMP[4]\n"
"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
"ENDIF\n"
consts.config = 0;
if (index < 0)
consts.config |= 4;
- if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
- query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
+ if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE)
consts.config |= 8;
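+ /* 8 = convert to boolean; 256 = subtract the PrimitiveStorageNeeded
+ * half-pair from the NumPrimitivesWritten half-pair, which is nonzero
+ * on overflow. */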
+ else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ consts.config |= 8 | 256;
else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
query->b.type == PIPE_QUERY_TIME_ELAPSED)
consts.config |= 32;
struct r600_query_buffer *qbuf;
struct r600_atom *atom = &rctx->render_cond_atom;
- rctx->render_cond = query;
- rctx->render_cond_invert = condition;
- rctx->render_cond_mode = mode;
-
/* Compute the size of SET_PREDICATION packets. */
atom->num_dw = 0;
if (query) {
- for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
- atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
+ bool needs_workaround = false;
+
+ /* There was a firmware regression in VI which causes successive
+ * SET_PREDICATION packets to give the wrong answer for
+ * non-inverted stream overflow predication.
+ */
+ if (((rctx->chip_class == VI && rctx->screen->info.pfp_fw_feature < 49) ||
+ (rctx->chip_class == GFX9 && rctx->screen->info.pfp_fw_feature < 38)) &&
+ !condition &&
+ (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
+ (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
+ (rquery->buffer.previous ||
+ rquery->buffer.results_end > rquery->result_size)))) {
+ needs_workaround = true;
+ }
+
+ if (needs_workaround && !rquery->workaround_buf) {
+ bool old_force_off = rctx->render_cond_force_off;
+ rctx->render_cond_force_off = true;
+
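+ /* Get an 8-byte zeroed slot; the compute shader stores the query
+ * result there as a 64-bit boolean for PREDICATION_OP_BOOL64. */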
+ u_suballocator_alloc(
+ rctx->allocator_zeroed_memory, 8, 8,
+ &rquery->workaround_offset,
+ (struct pipe_resource **)&rquery->workaround_buf);
+
+ /* Reset to NULL to avoid a redundant SET_PREDICATION
+ * from launching the compute grid.
+ */
+ rctx->render_cond = NULL;
+
+ ctx->get_query_result_resource(
+ ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
+ &rquery->workaround_buf->b.b, rquery->workaround_offset);
+
+ /* Setting this in the render cond atom is too late,
+ * so set it here. */
+ rctx->flags |= rctx->screen->barrier_flags.L2_to_cp |
+ R600_CONTEXT_FLUSH_FOR_RENDER_COND;
+
+ rctx->render_cond_force_off = old_force_off;
+ }
+
+ if (needs_workaround) {
+ atom->num_dw = 5;
+ } else {
+ for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
+ atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
+
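+ /* SO_OVERFLOW_ANY emits one SET_PREDICATION packet per stream. */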
+ if (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ atom->num_dw *= R600_MAX_STREAMS;
+ }
}
+ rctx->render_cond = query;
+ rctx->render_cond_invert = condition;
+ rctx->render_cond_mode = mode;
+
rctx->set_atom_dirty(rctx, atom, query != NULL);
}
X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE),
X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
+ X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
+ X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),
X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
- X("num-fb-cache-flushes", NUM_FB_CACHE_FLUSHES, UINT64, AVERAGE),
+ X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
+ X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
+ X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
+ X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
+ X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
+ X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
- X("GPU-dma-busy", GPU_DMA_BUSY, UINT64, AVERAGE),
+ X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
- X("GPU-ce-busy", GPU_CE_BUSY, UINT64, AVERAGE),
};
#undef X