gallium/radeon: use a top-of-pipe timestamp for the start of TIME_ELAPSED
[mesa.git] / src / gallium / drivers / radeon / r600_query.c
index 5712cbe63fb240acf635bb12cf218203c34e03ac..61fd6626b97ab7db693aaf6406c3ff05fe81edae 100644 (file)
@@ -26,7 +26,7 @@
 #include "r600_cs.h"
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
-
+#include "os/os_time.h"
 #include "tgsi/tgsi_text.h"
 
 struct r600_hw_query_params {
@@ -43,17 +43,20 @@ struct r600_query_sw {
 
        uint64_t begin_result;
        uint64_t end_result;
+
+       uint64_t begin_time;
+       uint64_t end_time;
+
        /* Fence for GPU_FINISHED. */
        struct pipe_fence_handle *fence;
 };
 
-static void r600_query_sw_destroy(struct r600_common_context *rctx,
+static void r600_query_sw_destroy(struct r600_common_screen *rscreen,
                                  struct r600_query *rquery)
 {
-       struct pipe_screen *screen = rctx->b.screen;
        struct r600_query_sw *query = (struct r600_query_sw *)rquery;
 
-       screen->fence_reference(screen, &query->fence, NULL);
+       rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL);
        FREE(query);
 }
 
@@ -65,15 +68,18 @@ static enum radeon_value_id winsys_id_from_type(unsigned type)
        case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
        case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
        case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
+       case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
        case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
        case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
        case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
        case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
        case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
+       case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
        case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
        case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
        case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
        case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
+       case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
        default: unreachable("query type does not correspond to winsys id");
        }
 }
@@ -82,6 +88,7 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx,
                                struct r600_query *rquery)
 {
        struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+       enum radeon_value_id ws_id;
 
        switch(query->b.type) {
        case PIPE_QUERY_TIMESTAMP_DISJOINT:
@@ -90,6 +97,9 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx,
        case R600_QUERY_DRAW_CALLS:
                query->begin_result = rctx->num_draw_calls;
                break;
+       case R600_QUERY_PRIM_RESTART_CALLS:
+               query->begin_result = rctx->num_prim_restart_calls;
+               break;
        case R600_QUERY_SPILL_DRAW_CALLS:
                query->begin_result = rctx->num_spill_draw_calls;
                break;
@@ -123,16 +133,27 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx,
        case R600_QUERY_NUM_L2_WRITEBACKS:
                query->begin_result = rctx->num_L2_writebacks;
                break;
+       case R600_QUERY_TC_OFFLOADED_SLOTS:
+               query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
+               break;
+       case R600_QUERY_TC_DIRECT_SLOTS:
+               query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
+               break;
+       case R600_QUERY_TC_NUM_SYNCS:
+               query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;
+               break;
        case R600_QUERY_REQUESTED_VRAM:
        case R600_QUERY_REQUESTED_GTT:
        case R600_QUERY_MAPPED_VRAM:
        case R600_QUERY_MAPPED_GTT:
        case R600_QUERY_VRAM_USAGE:
+       case R600_QUERY_VRAM_VIS_USAGE:
        case R600_QUERY_GTT_USAGE:
        case R600_QUERY_GPU_TEMPERATURE:
        case R600_QUERY_CURRENT_GPU_SCLK:
        case R600_QUERY_CURRENT_GPU_MCLK:
        case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+       case R600_QUERY_NUM_MAPPED_BUFFERS:
                query->begin_result = 0;
                break;
        case R600_QUERY_BUFFER_WAIT_TIME:
@@ -144,11 +165,35 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx,
                query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
                break;
        }
-       case R600_QUERY_GPU_LOAD:
-               query->begin_result = r600_begin_counter_gui(rctx->screen);
+       case R600_QUERY_CS_THREAD_BUSY:
+               ws_id = winsys_id_from_type(query->b.type);
+               query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
+               query->begin_time = os_time_get_nano();
                break;
+       case R600_QUERY_GPU_LOAD:
        case R600_QUERY_GPU_SHADERS_BUSY:
-               query->begin_result = r600_begin_counter_spi(rctx->screen);
+       case R600_QUERY_GPU_TA_BUSY:
+       case R600_QUERY_GPU_GDS_BUSY:
+       case R600_QUERY_GPU_VGT_BUSY:
+       case R600_QUERY_GPU_IA_BUSY:
+       case R600_QUERY_GPU_SX_BUSY:
+       case R600_QUERY_GPU_WD_BUSY:
+       case R600_QUERY_GPU_BCI_BUSY:
+       case R600_QUERY_GPU_SC_BUSY:
+       case R600_QUERY_GPU_PA_BUSY:
+       case R600_QUERY_GPU_DB_BUSY:
+       case R600_QUERY_GPU_CP_BUSY:
+       case R600_QUERY_GPU_CB_BUSY:
+       case R600_QUERY_GPU_SDMA_BUSY:
+       case R600_QUERY_GPU_PFP_BUSY:
+       case R600_QUERY_GPU_MEQ_BUSY:
+       case R600_QUERY_GPU_ME_BUSY:
+       case R600_QUERY_GPU_SURF_SYNC_BUSY:
+       case R600_QUERY_GPU_DMA_BUSY:
+       case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
+       case R600_QUERY_GPU_CE_BUSY:
+               query->begin_result = r600_begin_counter(rctx->screen,
+                                                        query->b.type);
                break;
        case R600_QUERY_NUM_COMPILATIONS:
                query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
@@ -177,6 +222,7 @@ static bool r600_query_sw_end(struct r600_common_context *rctx,
                              struct r600_query *rquery)
 {
        struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+       enum radeon_value_id ws_id;
 
        switch(query->b.type) {
        case PIPE_QUERY_TIMESTAMP_DISJOINT:
@@ -187,6 +233,9 @@ static bool r600_query_sw_end(struct r600_common_context *rctx,
        case R600_QUERY_DRAW_CALLS:
                query->end_result = rctx->num_draw_calls;
                break;
+       case R600_QUERY_PRIM_RESTART_CALLS:
+               query->end_result = rctx->num_prim_restart_calls;
+               break;
        case R600_QUERY_SPILL_DRAW_CALLS:
                query->end_result = rctx->num_spill_draw_calls;
                break;
@@ -220,16 +269,27 @@ static bool r600_query_sw_end(struct r600_common_context *rctx,
        case R600_QUERY_NUM_L2_WRITEBACKS:
                query->end_result = rctx->num_L2_writebacks;
                break;
+       case R600_QUERY_TC_OFFLOADED_SLOTS:
+               query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
+               break;
+       case R600_QUERY_TC_DIRECT_SLOTS:
+               query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
+               break;
+       case R600_QUERY_TC_NUM_SYNCS:
+               query->end_result = rctx->tc ? rctx->tc->num_syncs : 0;
+               break;
        case R600_QUERY_REQUESTED_VRAM:
        case R600_QUERY_REQUESTED_GTT:
        case R600_QUERY_MAPPED_VRAM:
        case R600_QUERY_MAPPED_GTT:
        case R600_QUERY_VRAM_USAGE:
+       case R600_QUERY_VRAM_VIS_USAGE:
        case R600_QUERY_GTT_USAGE:
        case R600_QUERY_GPU_TEMPERATURE:
        case R600_QUERY_CURRENT_GPU_SCLK:
        case R600_QUERY_CURRENT_GPU_MCLK:
        case R600_QUERY_BUFFER_WAIT_TIME:
+       case R600_QUERY_NUM_MAPPED_BUFFERS:
        case R600_QUERY_NUM_GFX_IBS:
        case R600_QUERY_NUM_SDMA_IBS:
        case R600_QUERY_NUM_BYTES_MOVED:
@@ -238,14 +298,36 @@ static bool r600_query_sw_end(struct r600_common_context *rctx,
                query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
                break;
        }
-       case R600_QUERY_GPU_LOAD:
-               query->end_result = r600_end_counter_gui(rctx->screen,
-                                                        query->begin_result);
-               query->begin_result = 0;
+       case R600_QUERY_CS_THREAD_BUSY:
+               ws_id = winsys_id_from_type(query->b.type);
+               query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
+               query->end_time = os_time_get_nano();
                break;
+       case R600_QUERY_GPU_LOAD:
        case R600_QUERY_GPU_SHADERS_BUSY:
-               query->end_result = r600_end_counter_spi(rctx->screen,
-                                                        query->begin_result);
+       case R600_QUERY_GPU_TA_BUSY:
+       case R600_QUERY_GPU_GDS_BUSY:
+       case R600_QUERY_GPU_VGT_BUSY:
+       case R600_QUERY_GPU_IA_BUSY:
+       case R600_QUERY_GPU_SX_BUSY:
+       case R600_QUERY_GPU_WD_BUSY:
+       case R600_QUERY_GPU_BCI_BUSY:
+       case R600_QUERY_GPU_SC_BUSY:
+       case R600_QUERY_GPU_PA_BUSY:
+       case R600_QUERY_GPU_DB_BUSY:
+       case R600_QUERY_GPU_CP_BUSY:
+       case R600_QUERY_GPU_CB_BUSY:
+       case R600_QUERY_GPU_SDMA_BUSY:
+       case R600_QUERY_GPU_PFP_BUSY:
+       case R600_QUERY_GPU_MEQ_BUSY:
+       case R600_QUERY_GPU_ME_BUSY:
+       case R600_QUERY_GPU_SURF_SYNC_BUSY:
+       case R600_QUERY_GPU_DMA_BUSY:
+       case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
+       case R600_QUERY_GPU_CE_BUSY:
+               query->end_result = r600_end_counter(rctx->screen,
+                                                    query->b.type,
+                                                    query->begin_result);
                query->begin_result = 0;
                break;
        case R600_QUERY_NUM_COMPILATIONS:
@@ -290,11 +372,17 @@ static bool r600_query_sw_get_result(struct r600_common_context *rctx,
                return true;
        case PIPE_QUERY_GPU_FINISHED: {
                struct pipe_screen *screen = rctx->b.screen;
-               result->b = screen->fence_finish(screen, &rctx->b, query->fence,
+               struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b;
+
+               result->b = screen->fence_finish(screen, ctx, query->fence,
                                                 wait ? PIPE_TIMEOUT_INFINITE : 0);
                return result->b;
        }
 
+       case R600_QUERY_CS_THREAD_BUSY:
+               result->u64 = (query->end_result - query->begin_result) * 100 /
+                             (query->end_time - query->begin_time);
+               return true;
        case R600_QUERY_GPIN_ASIC_ID:
                result->u32 = 0;
                return true;
@@ -337,8 +425,7 @@ static struct r600_query_ops sw_query_ops = {
        .get_result_resource = NULL
 };
 
-static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx,
-                                              unsigned query_type)
+static struct pipe_query *r600_query_sw_create(unsigned query_type)
 {
        struct r600_query_sw *query;
 
@@ -352,7 +439,7 @@ static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx,
        return (struct pipe_query *)query;
 }
 
-void r600_query_hw_destroy(struct r600_common_context *rctx,
+void r600_query_hw_destroy(struct r600_common_screen *rscreen,
                           struct r600_query *rquery)
 {
        struct r600_query_hw *query = (struct r600_query_hw *)rquery;
@@ -370,23 +457,23 @@ void r600_query_hw_destroy(struct r600_common_context *rctx,
        FREE(rquery);
 }
 
-static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx,
+static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen,
                                                   struct r600_query_hw *query)
 {
        unsigned buf_size = MAX2(query->result_size,
-                                ctx->screen->info.min_alloc_size);
+                                rscreen->info.min_alloc_size);
 
        /* Queries are normally read by the CPU after
         * being written by the gpu, hence staging is probably a good
         * usage pattern.
         */
        struct r600_resource *buf = (struct r600_resource*)
-               pipe_buffer_create(ctx->b.screen, 0,
+               pipe_buffer_create(&rscreen->b, 0,
                                   PIPE_USAGE_STAGING, buf_size);
        if (!buf)
                return NULL;
 
-       if (!query->ops->prepare_buffer(ctx, query, buf)) {
+       if (!query->ops->prepare_buffer(rscreen, query, buf)) {
                r600_resource_reference(&buf, NULL);
                return NULL;
        }
@@ -394,14 +481,14 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
        return buf;
 }
 
-static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
+static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,
                                         struct r600_query_hw *query,
                                         struct r600_resource *buffer)
 {
        /* Callers ensure that the buffer is currently unused by the GPU. */
-       uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL,
-                                               PIPE_TRANSFER_WRITE |
-                                               PIPE_TRANSFER_UNSYNCHRONIZED);
+       uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL,
+                                                  PIPE_TRANSFER_WRITE |
+                                                  PIPE_TRANSFER_UNSYNCHRONIZED);
        if (!results)
                return false;
 
@@ -409,19 +496,21 @@ static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
 
        if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
            query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+               unsigned max_rbs = rscreen->info.num_render_backends;
+               unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;
                unsigned num_results;
                unsigned i, j;
 
                /* Set top bits for unused backends. */
                num_results = buffer->b.b.width0 / query->result_size;
                for (j = 0; j < num_results; j++) {
-                       for (i = 0; i < ctx->max_db; i++) {
-                               if (!(ctx->backend_mask & (1<<i))) {
+                       for (i = 0; i < max_rbs; i++) {
+                               if (!(enabled_rb_mask & (1<<i))) {
                                        results[(i * 4)+1] = 0x80000000;
                                        results[(i * 4)+3] = 0x80000000;
                                }
                        }
-                       results += 4 * ctx->max_db;
+                       results += 4 * max_rbs;
                }
        }
 
@@ -452,7 +541,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
                                       struct r600_query_hw *query,
                                       struct r600_resource *buffer,
                                       uint64_t va);
-static void r600_query_hw_add_result(struct r600_common_context *ctx,
+static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
                                     struct r600_query_hw *, void *buffer,
                                     union pipe_query_result *result);
 static void r600_query_hw_clear_result(struct r600_query_hw *,
@@ -466,17 +555,17 @@ static struct r600_query_hw_ops query_hw_default_hw_ops = {
        .add_result = r600_query_hw_add_result,
 };
 
-bool r600_query_hw_init(struct r600_common_context *rctx,
+bool r600_query_hw_init(struct r600_common_screen *rscreen,
                        struct r600_query_hw *query)
 {
-       query->buffer.buf = r600_new_query_buffer(rctx, query);
+       query->buffer.buf = r600_new_query_buffer(rscreen, query);
        if (!query->buffer.buf)
                return false;
 
        return true;
 }
 
-static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
+static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen,
                                               unsigned query_type,
                                               unsigned index)
 {
@@ -491,19 +580,19 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
        switch (query_type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
-               query->result_size = 16 * rctx->max_db;
+               query->result_size = 16 * rscreen->info.num_render_backends;
                query->result_size += 16; /* for the fence + alignment */
                query->num_cs_dw_begin = 6;
-               query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
+               query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                query->result_size = 24;
                query->num_cs_dw_begin = 8;
-               query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
+               query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
                break;
        case PIPE_QUERY_TIMESTAMP:
                query->result_size = 16;
-               query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
+               query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
                query->flags = R600_QUERY_HW_FLAG_NO_START;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
@@ -518,10 +607,10 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
                break;
        case PIPE_QUERY_PIPELINE_STATISTICS:
                /* 11 values on EG, 8 on R600. */
-               query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
+               query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16;
                query->result_size += 8; /* for the fence + alignment */
                query->num_cs_dw_begin = 6;
-               query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
+               query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
                break;
        default:
                assert(0);
@@ -529,7 +618,7 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
                return NULL;
        }
 
-       if (!r600_query_hw_init(rctx, query)) {
+       if (!r600_query_hw_init(rscreen, query)) {
                FREE(query);
                return NULL;
        }
@@ -588,7 +677,7 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
                radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
                radeon_emit(cs, va);
-               radeon_emit(cs, (va >> 32) & 0xFFFF);
+               radeon_emit(cs, va >> 32);
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -597,17 +686,34 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
                radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
                radeon_emit(cs, va);
-               radeon_emit(cs, (va >> 32) & 0xFFFF);
+               radeon_emit(cs, va >> 32);
                break;
        case PIPE_QUERY_TIME_ELAPSED:
-               r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
-                                        0, 3, NULL, va, 0, 0);
+               if (ctx->chip_class >= SI) {
+                       /* Write the timestamp from the CP not waiting for
+                        * outstanding draws (top-of-pipe).
+                        */
+                       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+                       radeon_emit(cs, COPY_DATA_COUNT_SEL |
+                                       COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
+                                       COPY_DATA_DST_SEL(COPY_DATA_MEM_ASYNC));
+                       radeon_emit(cs, 0);
+                       radeon_emit(cs, 0);
+                       radeon_emit(cs, va);
+                       radeon_emit(cs, va >> 32);
+               } else {
+                       /* Write the timestamp after the last draw is done.
+                        * (bottom-of-pipe)
+                        */
+                       r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
+                                                0, 3, NULL, va, 0, 0);
+               }
                break;
        case PIPE_QUERY_PIPELINE_STATISTICS:
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
                radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
                radeon_emit(cs, va);
-               radeon_emit(cs, (va >> 32) & 0xFFFF);
+               radeon_emit(cs, va >> 32);
                break;
        default:
                assert(0);
@@ -636,7 +742,7 @@ static void r600_query_hw_emit_start(struct r600_common_context *ctx,
                *qbuf = query->buffer;
                query->buffer.results_end = 0;
                query->buffer.previous = qbuf;
-               query->buffer.buf = r600_new_query_buffer(ctx, query);
+               query->buffer.buf = r600_new_query_buffer(ctx->screen, query);
                if (!query->buffer.buf)
                        return;
        }
@@ -664,9 +770,9 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
                radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
                radeon_emit(cs, va);
-               radeon_emit(cs, (va >> 32) & 0xFFFF);
+               radeon_emit(cs, va >> 32);
 
-               fence_va = va + ctx->max_db * 16 - 8;
+               fence_va = va + ctx->screen->info.num_render_backends * 16 - 8;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -676,7 +782,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
                radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
                radeon_emit(cs, va);
-               radeon_emit(cs, (va >> 32) & 0xFFFF);
+               radeon_emit(cs, va >> 32);
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                va += 8;
@@ -693,7 +799,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
                radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
                radeon_emit(cs, va);
-               radeon_emit(cs, (va >> 32) & 0xFFFF);
+               radeon_emit(cs, va >> 32);
 
                fence_va = va + sample_size;
                break;
@@ -778,12 +884,21 @@ static void r600_emit_query_predication(struct r600_common_context *ctx,
        /* emit predicate packets for all data blocks */
        for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
                unsigned results_base = 0;
-               uint64_t va = qbuf->buf->gpu_address;
+               uint64_t va_base = qbuf->buf->gpu_address;
 
                while (results_base < qbuf->results_end) {
-                       radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
-                       radeon_emit(cs, va + results_base);
-                       radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
+                       uint64_t va = va_base + results_base;
+
+                       if (ctx->chip_class >= GFX9) {
+                               radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
+                               radeon_emit(cs, op);
+                               radeon_emit(cs, va);
+                               radeon_emit(cs, va >> 32);
+                       } else {
+                               radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+                               radeon_emit(cs, va);
+                               radeon_emit(cs, op | ((va >> 32) & 0xFF));
+                       }
                        r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
                                        RADEON_PRIO_QUERY);
                        results_base += query->result_size;
@@ -796,14 +911,15 @@ static void r600_emit_query_predication(struct r600_common_context *ctx,
 
 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
 {
-       struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+       struct r600_common_screen *rscreen =
+               (struct r600_common_screen *)ctx->screen;
 
        if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
            query_type == PIPE_QUERY_GPU_FINISHED ||
            query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
-               return r600_query_sw_create(ctx, query_type);
+               return r600_query_sw_create(query_type);
 
-       return r600_query_hw_create(rctx, query_type, index);
+       return r600_query_hw_create(rscreen, query_type, index);
 }
 
 static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
@@ -811,7 +927,7 @@ static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *quer
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;
        struct r600_query *rquery = (struct r600_query *)query;
 
-       rquery->ops->destroy(rctx, rquery);
+       rquery->ops->destroy(rctx->screen, rquery);
 }
 
 static boolean r600_begin_query(struct pipe_context *ctx,
@@ -843,9 +959,9 @@ void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
        if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
            !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
                r600_resource_reference(&query->buffer.buf, NULL);
-               query->buffer.buf = r600_new_query_buffer(rctx, query);
+               query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
        } else {
-               if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf))
+               if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))
                        r600_resource_reference(&query->buffer.buf, NULL);
        }
 }
@@ -902,6 +1018,8 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx,
                                     struct r600_query_hw *rquery, int index,
                                     struct r600_hw_query_params *params)
 {
+       unsigned max_rbs = rctx->screen->info.num_render_backends;
+
        params->pair_stride = 0;
        params->pair_count = 1;
 
@@ -910,9 +1028,9 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx,
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                params->start_offset = 0;
                params->end_offset = 8;
-               params->fence_offset = rctx->max_db * 16;
+               params->fence_offset = max_rbs * 16;
                params->pair_stride = 16;
-               params->pair_count = rctx->max_db;
+               params->pair_count = max_rbs;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                params->start_offset = 0;
@@ -971,14 +1089,16 @@ static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned
        return 0;
 }
 
-static void r600_query_hw_add_result(struct r600_common_context *ctx,
+static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
                                     struct r600_query_hw *query,
                                     void *buffer,
                                     union pipe_query_result *result)
 {
+       unsigned max_rbs = rscreen->info.num_render_backends;
+
        switch (query->b.type) {
        case PIPE_QUERY_OCCLUSION_COUNTER: {
-               for (unsigned i = 0; i < ctx->max_db; ++i) {
+               for (unsigned i = 0; i < max_rbs; ++i) {
                        unsigned results_base = i * 16;
                        result->u64 +=
                                r600_query_read_result(buffer + results_base, 0, 2, true);
@@ -986,7 +1106,7 @@ static void r600_query_hw_add_result(struct r600_common_context *ctx,
                break;
        }
        case PIPE_QUERY_OCCLUSION_PREDICATE: {
-               for (unsigned i = 0; i < ctx->max_db; ++i) {
+               for (unsigned i = 0; i < max_rbs; ++i) {
                        unsigned results_base = i * 16;
                        result->b = result->b ||
                                r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
@@ -1024,7 +1144,7 @@ static void r600_query_hw_add_result(struct r600_common_context *ctx,
                        r600_query_read_result(buffer, 0, 4, true);
                break;
        case PIPE_QUERY_PIPELINE_STATISTICS:
-               if (ctx->chip_class >= EVERGREEN) {
+               if (rscreen->chip_class >= EVERGREEN) {
                        result->pipeline_statistics.ps_invocations +=
                                r600_query_read_result(buffer, 0, 22, false);
                        result->pipeline_statistics.c_primitives +=
@@ -1122,23 +1242,28 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx,
                              struct r600_query *rquery,
                              bool wait, union pipe_query_result *result)
 {
+       struct r600_common_screen *rscreen = rctx->screen;
        struct r600_query_hw *query = (struct r600_query_hw *)rquery;
        struct r600_query_buffer *qbuf;
 
        query->ops->clear_result(query, result);
 
        for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+               unsigned usage = PIPE_TRANSFER_READ |
+                                (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
                unsigned results_base = 0;
                void *map;
 
-               map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf,
-                                                     PIPE_TRANSFER_READ |
-                                                     (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
+               if (rquery->b.flushed)
+                       map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+               else
+                       map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);
+
                if (!map)
                        return false;
 
                while (results_base != qbuf->results_end) {
-                       query->ops->add_result(rctx, query, map + results_base,
+                       query->ops->add_result(rscreen, query, map + results_base,
                                               result);
                        results_base += query->result_size;
                }
@@ -1147,7 +1272,7 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx,
        /* Convert the time to expected units. */
        if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
            rquery->type == PIPE_QUERY_TIMESTAMP) {
-               result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq;
+               result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq;
        }
        return true;
 }
@@ -1514,7 +1639,7 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
 static void r600_render_condition(struct pipe_context *ctx,
                                  struct pipe_query *query,
                                  boolean condition,
-                                 uint mode)
+                                 enum pipe_render_cond_flag mode)
 {
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;
        struct r600_query_hw *rquery = (struct r600_query_hw *)query;
@@ -1585,19 +1710,23 @@ void r600_resume_queries(struct r600_common_context *ctx)
        }
 }
 
-/* Get backends mask */
-void r600_query_init_backend_mask(struct r600_common_context *ctx)
+/* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */
+void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
 {
+       struct r600_common_context *ctx =
+               (struct r600_common_context*)rscreen->aux_context;
        struct radeon_winsys_cs *cs = ctx->gfx.cs;
        struct r600_resource *buffer;
        uint32_t *results;
-       unsigned num_backends = ctx->screen->info.num_render_backends;
        unsigned i, mask = 0;
+       unsigned max_rbs = ctx->screen->info.num_render_backends;
+
+       assert(rscreen->chip_class <= CAYMAN);
 
        /* if backend_map query is supported by the kernel */
-       if (ctx->screen->info.r600_gb_backend_map_valid) {
-               unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes;
-               unsigned backend_map = ctx->screen->info.r600_gb_backend_map;
+       if (rscreen->info.r600_gb_backend_map_valid) {
+               unsigned num_tile_pipes = rscreen->info.num_tile_pipes;
+               unsigned backend_map = rscreen->info.r600_gb_backend_map;
                unsigned item_width, item_mask;
 
                if (ctx->chip_class >= EVERGREEN) {
@@ -1614,7 +1743,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
                        backend_map >>= item_width;
                }
                if (mask != 0) {
-                       ctx->backend_mask = mask;
+                       rscreen->info.enabled_rb_mask = mask;
                        return;
                }
        }
@@ -1624,14 +1753,14 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
        /* create buffer for event data */
        buffer = (struct r600_resource*)
                pipe_buffer_create(ctx->b.screen, 0,
-                                  PIPE_USAGE_STAGING, ctx->max_db*16);
+                                  PIPE_USAGE_STAGING, max_rbs * 16);
        if (!buffer)
-               goto err;
+               return;
 
        /* initialize buffer with zeroes */
        results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
        if (results) {
-               memset(results, 0, ctx->max_db * 4 * 4);
+               memset(results, 0, max_rbs * 4 * 4);
 
                /* emit EVENT_WRITE for ZPASS_DONE */
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -1645,7 +1774,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
                /* analyze results */
                results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
                if (results) {
-                       for(i = 0; i < ctx->max_db; i++) {
+                       for(i = 0; i < max_rbs; i++) {
                                /* at least highest bit will be set if backend is used */
                                if (results[i*4 + 1])
                                        mask |= (1<<i);
@@ -1655,15 +1784,8 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 
        r600_resource_reference(&buffer, NULL);
 
-       if (mask != 0) {
-               ctx->backend_mask = mask;
-               return;
-       }
-
-err:
-       /* fallback to old method - set num_backends lower bits to 1 */
-       ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
-       return;
+       if (mask)
+               rscreen->info.enabled_rb_mask = mask;
 }
 
 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
@@ -1686,6 +1808,7 @@ static struct pipe_driver_query_info r600_driver_query_list[] = {
        X("num-shaders-created",        NUM_SHADERS_CREATED,    UINT64, CUMULATIVE),
        X("num-shader-cache-hits",      NUM_SHADER_CACHE_HITS,  UINT64, CUMULATIVE),
        X("draw-calls",                 DRAW_CALLS,             UINT64, AVERAGE),
+       X("prim-restart-calls",         PRIM_RESTART_CALLS,     UINT64, AVERAGE),
        X("spill-draw-calls",           SPILL_DRAW_CALLS,       UINT64, AVERAGE),
        X("compute-calls",              COMPUTE_CALLS,          UINT64, AVERAGE),
        X("spill-compute-calls",        SPILL_COMPUTE_CALLS,    UINT64, AVERAGE),
@@ -1697,16 +1820,22 @@ static struct pipe_driver_query_info r600_driver_query_list[] = {
        X("num-fb-cache-flushes",       NUM_FB_CACHE_FLUSHES,   UINT64, AVERAGE),
        X("num-L2-invalidates",         NUM_L2_INVALIDATES,     UINT64, AVERAGE),
        X("num-L2-writebacks",          NUM_L2_WRITEBACKS,      UINT64, AVERAGE),
+       X("tc-offloaded-slots",         TC_OFFLOADED_SLOTS,     UINT64, AVERAGE),
+       X("tc-direct-slots",            TC_DIRECT_SLOTS,        UINT64, AVERAGE),
+       X("tc-num-syncs",               TC_NUM_SYNCS,           UINT64, AVERAGE),
+       X("CS-thread-busy",             CS_THREAD_BUSY,         UINT64, AVERAGE),
        X("requested-VRAM",             REQUESTED_VRAM,         BYTES, AVERAGE),
        X("requested-GTT",              REQUESTED_GTT,          BYTES, AVERAGE),
        X("mapped-VRAM",                MAPPED_VRAM,            BYTES, AVERAGE),
        X("mapped-GTT",                 MAPPED_GTT,             BYTES, AVERAGE),
        X("buffer-wait-time",           BUFFER_WAIT_TIME,       MICROSECONDS, CUMULATIVE),
+       X("num-mapped-buffers",         NUM_MAPPED_BUFFERS,     UINT64, AVERAGE),
        X("num-GFX-IBs",                NUM_GFX_IBS,            UINT64, AVERAGE),
        X("num-SDMA-IBs",               NUM_SDMA_IBS,           UINT64, AVERAGE),
        X("num-bytes-moved",            NUM_BYTES_MOVED,        BYTES, CUMULATIVE),
        X("num-evictions",              NUM_EVICTIONS,          UINT64, CUMULATIVE),
        X("VRAM-usage",                 VRAM_USAGE,             BYTES, AVERAGE),
+       X("VRAM-vis-usage",             VRAM_VIS_USAGE,         BYTES, AVERAGE),
        X("GTT-usage",                  GTT_USAGE,              BYTES, AVERAGE),
        X("back-buffer-ps-draw-ratio",  BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
 
@@ -1721,13 +1850,34 @@ static struct pipe_driver_query_info r600_driver_query_list[] = {
        XG(GPIN, "GPIN_003",            GPIN_NUM_SPI,           UINT, AVERAGE),
        XG(GPIN, "GPIN_004",            GPIN_NUM_SE,            UINT, AVERAGE),
 
+       X("temperature",                GPU_TEMPERATURE,        UINT64, AVERAGE),
+       X("shader-clock",               CURRENT_GPU_SCLK,       HZ, AVERAGE),
+       X("memory-clock",               CURRENT_GPU_MCLK,       HZ, AVERAGE),
+
        /* The following queries must be at the end of the list because their
         * availability is adjusted dynamically based on the DRM version. */
        X("GPU-load",                   GPU_LOAD,               UINT64, AVERAGE),
        X("GPU-shaders-busy",           GPU_SHADERS_BUSY,       UINT64, AVERAGE),
-       X("temperature",                GPU_TEMPERATURE,        UINT64, AVERAGE),
-       X("shader-clock",               CURRENT_GPU_SCLK,       HZ, AVERAGE),
-       X("memory-clock",               CURRENT_GPU_MCLK,       HZ, AVERAGE),
+       X("GPU-ta-busy",                GPU_TA_BUSY,            UINT64, AVERAGE),
+       X("GPU-gds-busy",               GPU_GDS_BUSY,           UINT64, AVERAGE),
+       X("GPU-vgt-busy",               GPU_VGT_BUSY,           UINT64, AVERAGE),
+       X("GPU-ia-busy",                GPU_IA_BUSY,            UINT64, AVERAGE),
+       X("GPU-sx-busy",                GPU_SX_BUSY,            UINT64, AVERAGE),
+       X("GPU-wd-busy",                GPU_WD_BUSY,            UINT64, AVERAGE),
+       X("GPU-bci-busy",               GPU_BCI_BUSY,           UINT64, AVERAGE),
+       X("GPU-sc-busy",                GPU_SC_BUSY,            UINT64, AVERAGE),
+       X("GPU-pa-busy",                GPU_PA_BUSY,            UINT64, AVERAGE),
+       X("GPU-db-busy",                GPU_DB_BUSY,            UINT64, AVERAGE),
+       X("GPU-cp-busy",                GPU_CP_BUSY,            UINT64, AVERAGE),
+       X("GPU-cb-busy",                GPU_CB_BUSY,            UINT64, AVERAGE),
+       X("GPU-sdma-busy",              GPU_SDMA_BUSY,          UINT64, AVERAGE),
+       X("GPU-pfp-busy",               GPU_PFP_BUSY,           UINT64, AVERAGE),
+       X("GPU-meq-busy",               GPU_MEQ_BUSY,           UINT64, AVERAGE),
+       X("GPU-me-busy",                GPU_ME_BUSY,            UINT64, AVERAGE),
+       X("GPU-surf-sync-busy",         GPU_SURF_SYNC_BUSY,     UINT64, AVERAGE),
+       X("GPU-dma-busy",               GPU_DMA_BUSY,           UINT64, AVERAGE),
+       X("GPU-scratch-ram-busy",       GPU_SCRATCH_RAM_BUSY,   UINT64, AVERAGE),
+       X("GPU-ce-busy",                GPU_CE_BUSY,            UINT64, AVERAGE),
 };
 
 #undef X
@@ -1738,10 +1888,14 @@ static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
 {
        if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
                return ARRAY_SIZE(r600_driver_query_list);
-       else if (rscreen->info.drm_major == 3)
-               return ARRAY_SIZE(r600_driver_query_list) - 3;
+       else if (rscreen->info.drm_major == 3) {
+               if (rscreen->chip_class >= VI)
+                       return ARRAY_SIZE(r600_driver_query_list);
+               else
+                       return ARRAY_SIZE(r600_driver_query_list) - 7;
+       }
        else
-               return ARRAY_SIZE(r600_driver_query_list) - 5;
+               return ARRAY_SIZE(r600_driver_query_list) - 25;
 }
 
 static int r600_get_driver_query_info(struct pipe_screen *screen,
@@ -1777,6 +1931,9 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
        case R600_QUERY_GPU_TEMPERATURE:
                info->max_value.u64 = 125;
                break;
+       case R600_QUERY_VRAM_VIS_USAGE:
+               info->max_value.u64 = rscreen->info.vram_vis_size;
+               break;
        }
 
        if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)