From: Rob Clark <robdclark@gmail.com>
Date: Mon, 11 Jul 2016 21:36:45 +0000 (-0400)
Subject: freedreno: re-order support for hw queries
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=7f8fd02dc7cad1ddcfb610db10ffbb41e3e34e7d;p=mesa.git

freedreno: re-order support for hw queries

Push query state down to batch, and use the resource tracking to figure
out which batch(es) need to be flushed to get the query result.

This means we actually need to allocate the query buffer resource (prsc)
up front, before we know its size.  So we have to add a special way to
allocate an un-backed resource, and then later allocate the backing
storage.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index eef5b52f12c..7e83157e38e 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -757,8 +757,9 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
  * state, there could have been a context switch between ioctls):
  */
 void
-fd3_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring)
+fd3_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
+	struct fd_context *ctx = batch->ctx;
 	struct fd3_context *fd3_ctx = fd3_context(ctx);
 	int i;
 
@@ -894,7 +895,7 @@ fd3_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring)
 
 	fd_wfi(ctx, ring);
 
-	fd_hw_query_enable(ctx, ring);
+	fd_hw_query_enable(batch, ring);
 
 	ctx->needs_rb_fbd = true;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
index 110f30e89be..dfe77589542 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -93,7 +93,7 @@ void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit);
 void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		struct fd3_emit *emit);
 
-void fd3_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring);
+void fd3_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring);
 
 void fd3_emit_init(struct pipe_context *pctx);
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index b9af45683f9..1788c0c7384 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -734,7 +734,7 @@ fd3_emit_sysmem_prep(struct fd_batch *batch)
 		pitch = fd_resource(psurf->texture)->slices[psurf->u.tex.level].pitch;
 	}
 
-	fd3_emit_restore(batch->ctx, ring);
+	fd3_emit_restore(batch, ring);
 
 	OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1);
 	OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
@@ -927,7 +927,7 @@ fd3_emit_tile_init(struct fd_batch *batch)
 	struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
 	uint32_t rb_render_control;
 
-	fd3_emit_restore(batch->ctx, ring);
+	fd3_emit_restore(batch, ring);
 
 	/* note: use gmem->bin_w/h, the bin_w/h parameters may be truncated
 	 * at the right and bottom edge tiles
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_query.c b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
index 8fc0a0d4229..ec034fc127d 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_query.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
@@ -46,10 +46,10 @@ struct fd_rb_samp_ctrs {
  */
 
 static struct fd_hw_sample *
-occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
+occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
 	struct fd_hw_sample *samp =
-			fd_hw_sample_init(ctx, sizeof(struct fd_rb_samp_ctrs));
+			fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));
 
 	/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
 	 * HW_QUERY_BASE_REG register:
@@ -68,7 +68,7 @@ occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
 						INDEX_SIZE_IGN, USE_VISIBILITY, 0));
 	OUT_RING(ring, 0);             /* NumIndices */
 
-	fd_event_write(ctx, ring, ZPASS_DONE);
+	fd_event_write(batch->ctx, ring, ZPASS_DONE);
 
 	OUT_PKT0(ring, REG_A3XX_RBBM_PERFCTR_CTL, 1);
 	OUT_RING(ring, A3XX_RBBM_PERFCTR_CTL_ENABLE);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 88e1a40ec90..9ce93f6e33f 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -736,8 +736,9 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
  * state, there could have been a context switch between ioctls):
  */
 void
-fd4_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring)
+fd4_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
+	struct fd_context *ctx = batch->ctx;
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
 
 	OUT_PKT0(ring, REG_A4XX_RBBM_PERFCTR_CTL, 1);
@@ -885,7 +886,7 @@ fd4_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring)
 	OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
 	OUT_RING(ring, 0x0);
 
-	fd_hw_query_enable(ctx, ring);
+	fd_hw_query_enable(batch, ring);
 
 	ctx->needs_rb_fbd = true;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index 89dc51ad1ee..42e0e5e645a 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -102,7 +102,7 @@ void fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit);
 void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		struct fd4_emit *emit);
 
-void fd4_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring);
+void fd4_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring);
 
 void fd4_emit_init(struct pipe_context *pctx);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index afd37a88f43..3f3847c2a28 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -527,7 +527,7 @@ fd4_emit_sysmem_prep(struct fd_batch *batch)
 	struct pipe_framebuffer_state *pfb = &batch->framebuffer;
 	struct fd_ringbuffer *ring = batch->gmem;
 
-	fd4_emit_restore(batch->ctx, ring);
+	fd4_emit_restore(batch, ring);
 
 	OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1);
 	OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
@@ -666,7 +666,7 @@ fd4_emit_tile_init(struct fd_batch *batch)
 	struct fd_ringbuffer *ring = batch->gmem;
 	struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
 
-	fd4_emit_restore(batch->ctx, ring);
+	fd4_emit_restore(batch, ring);
 
 	OUT_PKT0(ring, REG_A4XX_VSC_BIN_SIZE, 1);
 	OUT_RING(ring, A4XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) |
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
index 41e3e6506bd..921384c1911 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
@@ -48,10 +48,10 @@ struct fd_rb_samp_ctrs {
  */
 
 static struct fd_hw_sample *
-occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
+occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
 	struct fd_hw_sample *samp =
-			fd_hw_sample_init(ctx, sizeof(struct fd_rb_samp_ctrs));
+			fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));
 
 	/* low bits of sample addr should be zero (since they are control
 	 * flags in RB_SAMPLE_COUNT_CONTROL):
@@ -73,7 +73,7 @@ occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
 	OUT_RING(ring, 1);             /* NumInstances */
 	OUT_RING(ring, 0);             /* NumIndices */
 
-	fd_event_write(ctx, ring, ZPASS_DONE);
+	fd_event_write(batch->ctx, ring, ZPASS_DONE);
 
 	return samp;
 }
@@ -123,18 +123,18 @@ time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
 }
 
 static struct fd_hw_sample *
-time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
+time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
-	struct fd_hw_sample *samp = fd_hw_sample_init(ctx, sizeof(uint64_t));
+	struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));
 
 	/* use unused part of vsc_size_mem as scratch space, to avoid
 	 * extra allocation:
 	 */
-	struct fd_bo *scratch_bo = fd4_context(ctx)->vsc_size_mem;
+	struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
 	const int sample_off = 128;
 	const int addr_off = sample_off + 8;
 
-	debug_assert(ctx->screen->max_freq > 0);
+	debug_assert(batch->ctx->screen->max_freq > 0);
 
 	/* Basic issue is that we need to read counter value to a relative
 	 * destination (with per-tile offset) rather than absolute dest
@@ -161,7 +161,7 @@ time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
 	 * shot, but that's really just polishing a turd..
 	 */
 
-	fd_wfi(ctx, ring);
+	fd_wfi(batch->ctx, ring);
 
 	/* copy sample counter _LO and _HI to scratch: */
 	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c
index 2dd7eda72ad..5008f5dbe56 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.c
+++ b/src/gallium/drivers/freedreno/freedreno_batch.c
@@ -32,6 +32,7 @@
 #include "freedreno_batch.h"
 #include "freedreno_context.h"
 #include "freedreno_resource.h"
+#include "freedreno_query_hw.h"
 
 static void
 batch_init(struct fd_batch *batch)
@@ -61,6 +62,7 @@ batch_init(struct fd_batch *batch)
 	batch->needs_flush = false;
 	batch->gmem_reason = 0;
 	batch->num_draws = 0;
+	batch->stage = FD_STAGE_NULL;
 
 	/* reset maximal bounds: */
 	batch->max_scissor.minx = batch->max_scissor.miny = ~0;
@@ -72,6 +74,8 @@ batch_init(struct fd_batch *batch)
 		util_dynarray_init(&batch->rbrc_patches);
 
 	assert(batch->resources->entries == 0);
+
+	util_dynarray_init(&batch->samples);
 }
 
 struct fd_batch *
@@ -98,6 +102,8 @@ fd_batch_create(struct fd_context *ctx)
 static void
 batch_fini(struct fd_batch *batch)
 {
+	pipe_resource_reference(&batch->query_buf, NULL);
+
 	fd_ringbuffer_del(batch->draw);
 	fd_ringbuffer_del(batch->binning);
 	fd_ringbuffer_del(batch->gmem);
@@ -106,6 +112,13 @@ batch_fini(struct fd_batch *batch)
 
 	if (is_a3xx(batch->ctx->screen))
 		util_dynarray_fini(&batch->rbrc_patches);
+
+	while (batch->samples.size > 0) {
+		struct fd_hw_sample *samp =
+			util_dynarray_pop(&batch->samples, struct fd_hw_sample *);
+		fd_hw_sample_reference(batch->ctx, &samp, NULL);
+	}
+	util_dynarray_fini(&batch->samples);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h
index 89d1d9fea7b..228a1b72bf6 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.h
+++ b/src/gallium/drivers/freedreno/freedreno_batch.h
@@ -28,6 +28,7 @@
 #define FREEDRENO_BATCH_H_
 
 #include "util/u_inlines.h"
+#include "util/list.h"
 
 #include "freedreno_util.h"
 
@@ -35,6 +36,35 @@ struct fd_context;
 struct fd_resource;
 enum fd_resource_status;
 
+/* Bitmask of stages in rendering that a particular query is
+ * active.  Queries will be automatically started/stopped (generating
+ * additional fd_hw_sample_period's) on entrance/exit from stages that
+ * are applicable to the query.
+ *
+ * NOTE: set the stage to NULL at end of IB to ensure no query is still
+ * active.  Things aren't going to work out the way you want if a query
+ * is active across IB's (or between tile IB and draw IB)
+ */
+enum fd_render_stage {
+	FD_STAGE_NULL     = 0x01,
+	FD_STAGE_DRAW     = 0x02,
+	FD_STAGE_CLEAR    = 0x04,
+	/* TODO before queries which include MEM2GMEM or GMEM2MEM will
+	 * work we will need to call fd_hw_query_prepare() from somewhere
+	 * appropriate so that queries in the tiling IB get backed with
+	 * memory to write results to.
+	 */
+	FD_STAGE_MEM2GMEM = 0x08,
+	FD_STAGE_GMEM2MEM = 0x10,
+	/* used for driver internal draws (ie. util_blitter_blit()): */
+	FD_STAGE_BLIT     = 0x20,
+	FD_STAGE_ALL      = 0xff,
+};
+
+#define MAX_HW_SAMPLE_PROVIDERS 4
+struct fd_hw_sample_provider;
+struct fd_hw_sample;
+
 /* A batch tracks everything about a cmdstream batch/submit, including the
  * ringbuffers used for binning, draw, and gmem cmds, list of associated
  * fd_resource-s, etc.
@@ -118,6 +148,37 @@ struct fd_batch {
 	/** tiling/gmem (IB0) cmdstream: */
 	struct fd_ringbuffer *gmem;
 
+	/**
+	 * hw query related state:
+	 */
+	/*@{*/
+	/* next sample offset.. incremented for each sample in the batch/
+	 * submit, reset to zero on next submit.
+	 */
+	uint32_t next_sample_offset;
+
+	/* cached samples (in case multiple queries need to reference
+	 * the same sample snapshot)
+	 */
+	struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS];
+
+	/* which sample providers were active in the current batch: */
+	uint32_t active_providers;
+
+	/* tracking for current stage, to know when to start/stop
+	 * any active queries:
+	 */
+	enum fd_render_stage stage;
+
+	/* list of samples in current batch: */
+	struct util_dynarray samples;
+
+	/* current query result bo and tile stride: */
+	struct pipe_resource *query_buf;
+	uint32_t query_tile_stride;
+	/*@}*/
+
+
 	/* Set of resources used by currently-unsubmitted batch (read or
 	 * write).. does not hold a reference to the resource.
 	 */
diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 13a17e2a78e..1c32cd9ae92 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -168,8 +168,6 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen,
 	 */
 	ctx->sample_mask = 0xffff;
 
-	ctx->stage = FD_STAGE_NULL;
-
 	pctx = &ctx->base;
 	pctx->screen = pscreen;
 	pctx->priv = priv;
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 74f53ee554f..45876259fd8 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -107,34 +107,6 @@ struct fd_vertex_state {
 	struct fd_vertexbuf_stateobj vertexbuf;
 };
 
-/* Bitmask of stages in rendering that a particular query query is
- * active.  Queries will be automatically started/stopped (generating
- * additional fd_hw_sample_period's) on entrance/exit from stages that
- * are applicable to the query.
- *
- * NOTE: set the stage to NULL at end of IB to ensure no query is still
- * active.  Things aren't going to work out the way you want if a query
- * is active across IB's (or between tile IB and draw IB)
- */
-enum fd_render_stage {
-	FD_STAGE_NULL     = 0x01,
-	FD_STAGE_DRAW     = 0x02,
-	FD_STAGE_CLEAR    = 0x04,
-	/* TODO before queries which include MEM2GMEM or GMEM2MEM will
-	 * work we will need to call fd_hw_query_prepare() from somewhere
-	 * appropriate so that queries in the tiling IB get backed with
-	 * memory to write results to.
-	 */
-	FD_STAGE_MEM2GMEM = 0x08,
-	FD_STAGE_GMEM2MEM = 0x10,
-	/* used for driver internal draws (ie. util_blitter_blit()): */
-	FD_STAGE_BLIT     = 0x20,
-	FD_STAGE_ALL      = 0xff,
-};
-
-#define MAX_HW_SAMPLE_PROVIDERS 4
-struct fd_hw_sample_provider;
-struct fd_hw_sample;
 
 struct fd_context {
 	struct pipe_context base;
@@ -152,39 +124,12 @@ struct fd_context {
 	struct util_slab_mempool sample_pool;
 	struct util_slab_mempool sample_period_pool;
 
-	/* next sample offset.. incremented for each sample in the batch/
-	 * submit, reset to zero on next submit.
-	 */
-	uint32_t next_sample_offset;
-
 	/* sample-providers for hw queries: */
 	const struct fd_hw_sample_provider *sample_providers[MAX_HW_SAMPLE_PROVIDERS];
 
-	/* cached samples (in case multiple queries need to reference
-	 * the same sample snapshot)
-	 */
-	struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS];
-
-	/* which sample providers were active in the current batch: */
-	uint32_t active_providers;
-
-	/* tracking for current stage, to know when to start/stop
-	 * any active queries:
-	 */
-	enum fd_render_stage stage;
-
 	/* list of active queries: */
 	struct list_head active_queries;
 
-	/* list of queries that are not active, but were active in the
-	 * current submit:
-	 */
-	struct list_head current_queries;
-
-	/* current query result bo and tile stride: */
-	struct pipe_resource *query_buf;
-	uint32_t query_tile_stride;
-
 	/* table with PIPE_PRIM_MAX entries mapping PIPE_PRIM_x to
 	 * DI_PT_x value to use for draw initiator.  There are some
 	 * slight differences between generation:
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index 112bf5cb624..fd3da1f20e5 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -89,6 +89,10 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 		ctx->discard = false;
 	}
 
+	/* NOTE: needs to be before resource_written(batch->query_buf), otherwise
+	 * query_buf may not be created yet.
+	 */
+	fd_hw_query_set_stage(batch, batch->draw, FD_STAGE_DRAW);
 	/*
 	 * Figure out the buffers/features we need:
 	 */
@@ -154,6 +158,8 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 		if (ctx->streamout.targets[i])
 			resource_written(batch, ctx->streamout.targets[i]->buffer);
 
+	resource_written(batch, batch->query_buf);
+
 	batch->num_draws++;
 
 	prims = u_reduced_prims_for_vertices(info->mode, info->count);
@@ -180,7 +186,6 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 		util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
 		util_format_short_name(pipe_surface_format(pfb->zsbuf)));
 
-	fd_hw_query_set_stage(ctx, batch->draw, FD_STAGE_DRAW);
 	if (ctx->draw_vbo(ctx, info))
 		batch->needs_flush = true;
 
@@ -253,12 +258,14 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
 		batch->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL;
 	}
 
+	resource_written(batch, batch->query_buf);
+
 	DBG("%p: %x %ux%u depth=%f, stencil=%u (%s/%s)", batch, buffers,
 		pfb->width, pfb->height, depth, stencil,
 		util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
 		util_format_short_name(pipe_surface_format(pfb->zsbuf)));
 
-	fd_hw_query_set_stage(ctx, batch->draw, FD_STAGE_CLEAR);
+	fd_hw_query_set_stage(batch, batch->draw, FD_STAGE_CLEAR);
 
 	ctx->clear(ctx, buffers, color, depth, stencil);
 
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index a075a8b5c95..d57b6a36d8b 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -323,23 +323,23 @@ render_tiles(struct fd_batch *batch)
 		ctx->emit_tile_prep(batch, tile);
 
 		if (batch->restore) {
-			fd_hw_query_set_stage(ctx, batch->gmem, FD_STAGE_MEM2GMEM);
+			fd_hw_query_set_stage(batch, batch->gmem, FD_STAGE_MEM2GMEM);
 			ctx->emit_tile_mem2gmem(batch, tile);
-			fd_hw_query_set_stage(ctx, batch->gmem, FD_STAGE_NULL);
+			fd_hw_query_set_stage(batch, batch->gmem, FD_STAGE_NULL);
 		}
 
 		ctx->emit_tile_renderprep(batch, tile);
 
-		fd_hw_query_prepare_tile(ctx, i, batch->gmem);
+		fd_hw_query_prepare_tile(batch, i, batch->gmem);
 
 		/* emit IB to drawcmds: */
 		ctx->emit_ib(batch->gmem, batch->draw);
 		fd_reset_wfi(ctx);
 
 		/* emit gmem2mem to transfer tile back to system memory: */
-		fd_hw_query_set_stage(ctx, batch->gmem, FD_STAGE_GMEM2MEM);
+		fd_hw_query_set_stage(batch, batch->gmem, FD_STAGE_GMEM2MEM);
 		ctx->emit_tile_gmem2mem(batch, tile);
-		fd_hw_query_set_stage(ctx, batch->gmem, FD_STAGE_NULL);
+		fd_hw_query_set_stage(batch, batch->gmem, FD_STAGE_NULL);
 	}
 }
 
@@ -350,7 +350,7 @@ render_sysmem(struct fd_batch *batch)
 
 	ctx->emit_sysmem_prep(batch);
 
-	fd_hw_query_prepare_tile(ctx, 0, batch->gmem);
+	fd_hw_query_prepare_tile(batch, 0, batch->gmem);
 
 	/* emit IB to drawcmds: */
 	ctx->emit_ib(batch->gmem, batch->draw);
@@ -376,7 +376,7 @@ fd_gmem_render_tiles(struct fd_batch *batch)
 	/* close out the draw cmds by making sure any active queries are
 	 * paused:
 	 */
-	fd_hw_query_set_stage(ctx, batch->draw, FD_STAGE_NULL);
+	fd_hw_query_set_stage(batch, batch->draw, FD_STAGE_NULL);
 
 	fd_reset_wfi(ctx);
 
@@ -387,7 +387,7 @@ fd_gmem_render_tiles(struct fd_batch *batch)
 			batch, pfb->width, pfb->height,
 			util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
 			util_format_short_name(pipe_surface_format(pfb->zsbuf)));
-		fd_hw_query_prepare(ctx, 1);
+		fd_hw_query_prepare(batch, 1);
 		render_sysmem(batch);
 		ctx->stats.batch_sysmem++;
 	} else {
@@ -397,7 +397,7 @@ fd_gmem_render_tiles(struct fd_batch *batch)
 			batch, pfb->width, pfb->height, gmem->nbins_x, gmem->nbins_y,
 			util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
 			util_format_short_name(pipe_surface_format(pfb->zsbuf)));
-		fd_hw_query_prepare(ctx, gmem->nbins_x * gmem->nbins_y);
+		fd_hw_query_prepare(batch, gmem->nbins_x * gmem->nbins_y);
 		render_tiles(batch);
 		ctx->stats.batch_gmem++;
 	}
diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.c b/src/gallium/drivers/freedreno/freedreno_query_hw.c
index 808bcefc2ad..12d40d04cda 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_hw.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_hw.c
@@ -61,32 +61,35 @@ static int pidx(unsigned query_type)
 }
 
 static struct fd_hw_sample *
-get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring,
+get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		unsigned query_type)
 {
+	struct fd_context *ctx = batch->ctx;
 	struct fd_hw_sample *samp = NULL;
 	int idx = pidx(query_type);
 
 	assume(idx >= 0);   /* query never would have been created otherwise */
 
-	if (!ctx->sample_cache[idx]) {
-		ctx->sample_cache[idx] =
-			ctx->sample_providers[idx]->get_sample(ctx, ring);
-		ctx->batch->needs_flush = true;
+	if (!batch->sample_cache[idx]) {
+		struct fd_hw_sample *new_samp =
+			ctx->sample_providers[idx]->get_sample(batch, ring);
+		fd_hw_sample_reference(ctx, &batch->sample_cache[idx], new_samp);
+		util_dynarray_append(&batch->samples, struct fd_hw_sample *, new_samp);
+		batch->needs_flush = true;
 	}
 
-	fd_hw_sample_reference(ctx, &samp, ctx->sample_cache[idx]);
+	fd_hw_sample_reference(ctx, &samp, batch->sample_cache[idx]);
 
 	return samp;
 }
 
 static void
-clear_sample_cache(struct fd_context *ctx)
+clear_sample_cache(struct fd_batch *batch)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(ctx->sample_cache); i++)
-		fd_hw_sample_reference(ctx, &ctx->sample_cache[i], NULL);
+	for (i = 0; i < ARRAY_SIZE(batch->sample_cache); i++)
+		fd_hw_sample_reference(batch->ctx, &batch->sample_cache[i], NULL);
 }
 
 static bool
@@ -97,38 +100,38 @@ is_active(struct fd_hw_query *hq, enum fd_render_stage stage)
 
 
 static void
-resume_query(struct fd_context *ctx, struct fd_hw_query *hq,
+resume_query(struct fd_batch *batch, struct fd_hw_query *hq,
 		struct fd_ringbuffer *ring)
 {
 	int idx = pidx(hq->provider->query_type);
 	assert(idx >= 0);   /* query never would have been created otherwise */
 	assert(!hq->period);
-	ctx->active_providers |= (1 << idx);
-	hq->period = util_slab_alloc(&ctx->sample_period_pool);
+	batch->active_providers |= (1 << idx);
+	hq->period = util_slab_alloc(&batch->ctx->sample_period_pool);
 	list_inithead(&hq->period->list);
-	hq->period->start = get_sample(ctx, ring, hq->base.type);
+	hq->period->start = get_sample(batch, ring, hq->base.type);
 	/* NOTE: util_slab_alloc() does not zero out the buffer: */
 	hq->period->end = NULL;
 }
 
 static void
-pause_query(struct fd_context *ctx, struct fd_hw_query *hq,
+pause_query(struct fd_batch *batch, struct fd_hw_query *hq,
 		struct fd_ringbuffer *ring)
 {
 	int idx = pidx(hq->provider->query_type);
 	assert(idx >= 0);   /* query never would have been created otherwise */
 	assert(hq->period && !hq->period->end);
-	assert(ctx->active_providers & (1 << idx));
-	hq->period->end = get_sample(ctx, ring, hq->base.type);
-	list_addtail(&hq->period->list, &hq->current_periods);
+	assert(batch->active_providers & (1 << idx));
+	hq->period->end = get_sample(batch, ring, hq->base.type);
+	list_addtail(&hq->period->list, &hq->periods);
 	hq->period = NULL;
 }
 
 static void
-destroy_periods(struct fd_context *ctx, struct list_head *list)
+destroy_periods(struct fd_context *ctx, struct fd_hw_query *hq)
 {
 	struct fd_hw_sample_period *period, *s;
-	LIST_FOR_EACH_ENTRY_SAFE(period, s, list, list) {
+	LIST_FOR_EACH_ENTRY_SAFE(period, s, &hq->periods, list) {
 		fd_hw_sample_reference(ctx, &period->start, NULL);
 		fd_hw_sample_reference(ctx, &period->end, NULL);
 		list_del(&period->list);
@@ -141,8 +144,7 @@ fd_hw_destroy_query(struct fd_context *ctx, struct fd_query *q)
 {
 	struct fd_hw_query *hq = fd_hw_query(q);
 
-	destroy_periods(ctx, &hq->periods);
-	destroy_periods(ctx, &hq->current_periods);
+	destroy_periods(ctx, hq);
 	list_del(&hq->list);
 
 	free(hq);
@@ -151,27 +153,31 @@ fd_hw_destroy_query(struct fd_context *ctx, struct fd_query *q)
 static boolean
 fd_hw_begin_query(struct fd_context *ctx, struct fd_query *q)
 {
+	struct fd_batch *batch = ctx->batch;
 	struct fd_hw_query *hq = fd_hw_query(q);
+
 	if (q->active)
 		return false;
 
 	/* begin_query() should clear previous results: */
-	destroy_periods(ctx, &hq->periods);
+	destroy_periods(ctx, hq);
 
-	if (is_active(hq, ctx->stage))
-		resume_query(ctx, hq, ctx->batch->draw);
+	if (batch && is_active(hq, batch->stage))
+		resume_query(batch, hq, batch->draw);
 
 	q->active = true;
 
 	/* add to active list: */
-	list_del(&hq->list);
+	assert(list_empty(&hq->list));
 	list_addtail(&hq->list, &ctx->active_queries);
-   return true;
+
+	return true;
 }
 
 static void
 fd_hw_end_query(struct fd_context *ctx, struct fd_query *q)
 {
+	struct fd_batch *batch = ctx->batch;
 	struct fd_hw_query *hq = fd_hw_query(q);
 	/* there are a couple special cases, which don't have
 	 * a matching ->begin_query():
@@ -181,12 +187,11 @@ fd_hw_end_query(struct fd_context *ctx, struct fd_query *q)
 	}
 	if (!q->active)
 		return;
-	if (is_active(hq, ctx->stage))
-		pause_query(ctx, hq, ctx->batch->draw);
+	if (batch && is_active(hq, batch->stage))
+		pause_query(batch, hq, batch->draw);
 	q->active = false;
-	/* move to current list: */
-	list_del(&hq->list);
-	list_addtail(&hq->list, &ctx->current_queries);
+	/* remove from active list: */
+	list_delinit(&hq->list);
 }
 
 /* helper to get ptr to specified sample: */
@@ -206,27 +211,12 @@ fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q,
 	if (q->active)
 		return false;
 
-	/* if the app tries to read back the query result before the
-	 * batch is submitted, that forces us to flush so that there
-	 * are actually results to wait for:
-	 */
-	if (!LIST_IS_EMPTY(&hq->list)) {
-		/* if app didn't actually trigger any cmdstream, then
-		 * we have nothing to do:
-		 */
-		if (!ctx->batch->needs_flush)
-			return true;
-		DBG("reading query result forces flush!");
-		fd_batch_flush(ctx->batch);
-	}
-
 	util_query_clear_result(result, q->type);
 
 	if (LIST_IS_EMPTY(&hq->periods))
 		return true;
 
 	assert(LIST_IS_EMPTY(&hq->list));
-	assert(LIST_IS_EMPTY(&hq->current_periods));
 	assert(!hq->period);
 
 	/* if !wait, then check the last sample (the one most likely to
@@ -240,6 +230,21 @@ fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q,
 
 		struct fd_resource *rsc = fd_resource(period->end->prsc);
 
+		if (pending(rsc, false)) {
+			/* piglit spec@arb_occlusion_query@occlusion_query_conform
+			 * test, and silly apps perhaps, get stuck in a loop trying
+			 * to get the query result forever with wait==false..  we don't
+			 * want to flush unnecessarily but we also don't want to
+			 * spin forever:
+			 */
+			if (hq->no_wait_cnt++ > 5)
+				fd_batch_flush(rsc->write_batch);
+			return false;
+		}
+
+		if (!rsc->bo)
+			return false;
+
 		ret = fd_bo_cpu_prep(rsc->bo, ctx->screen->pipe,
 				DRM_FREEDRENO_PREP_READ | DRM_FREEDRENO_PREP_NOSYNC);
 		if (ret)
@@ -260,6 +265,13 @@ fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q,
 
 		struct fd_resource *rsc = fd_resource(start->prsc);
 
+		if (rsc->write_batch)
+			fd_batch_flush(rsc->write_batch);
+
+		/* some piglit tests at least do query with no draws, I guess: */
+		if (!rsc->bo)
+			continue;
+
 		fd_bo_cpu_prep(rsc->bo, ctx->screen->pipe, DRM_FREEDRENO_PREP_READ);
 
 		void *ptr = fd_bo_map(rsc->bo);
@@ -299,7 +311,6 @@ fd_hw_create_query(struct fd_context *ctx, unsigned query_type)
 	hq->provider = ctx->sample_providers[idx];
 
 	list_inithead(&hq->periods);
-	list_inithead(&hq->current_periods);
 	list_inithead(&hq->list);
 
 	q = &hq->base;
@@ -310,19 +321,38 @@ fd_hw_create_query(struct fd_context *ctx, unsigned query_type)
 }
 
 struct fd_hw_sample *
-fd_hw_sample_init(struct fd_context *ctx, uint32_t size)
+fd_hw_sample_init(struct fd_batch *batch, uint32_t size)
 {
-	struct fd_hw_sample *samp = util_slab_alloc(&ctx->sample_pool);
+	struct fd_hw_sample *samp = util_slab_alloc(&batch->ctx->sample_pool);
 	pipe_reference_init(&samp->reference, 1);
 	samp->size = size;
 	debug_assert(util_is_power_of_two(size));
-	ctx->next_sample_offset = align(ctx->next_sample_offset, size);
-	samp->offset = ctx->next_sample_offset;
+	batch->next_sample_offset = align(batch->next_sample_offset, size);
+	samp->offset = batch->next_sample_offset;
 	/* NOTE: util_slab_alloc() does not zero out the buffer: */
 	samp->prsc = NULL;
 	samp->num_tiles = 0;
 	samp->tile_stride = 0;
-	ctx->next_sample_offset += size;
+	batch->next_sample_offset += size;
+
+	if (!batch->query_buf) {
+		struct pipe_screen *pscreen = &batch->ctx->screen->base;
+		struct pipe_resource templ = {
+			.target  = PIPE_BUFFER,
+			.format  = PIPE_FORMAT_R8_UNORM,
+			.bind    = PIPE_BIND_QUERY_BUFFER,
+			.width0  = 0,    /* create initially zero size buffer */
+			.height0 = 1,
+			.depth0  = 1,
+			.array_size = 1,
+			.last_level = 0,
+			.nr_samples = 1,
+		};
+		batch->query_buf = pscreen->resource_create(pscreen, &templ);
+	}
+
+	pipe_resource_reference(&samp->prsc, batch->query_buf);
+
 	return samp;
 }
 
@@ -333,110 +363,49 @@ __fd_hw_sample_destroy(struct fd_context *ctx, struct fd_hw_sample *samp)
 	util_slab_free(&ctx->sample_pool, samp);
 }
 
-static void
-prepare_sample(struct fd_hw_sample *samp, struct pipe_resource *prsc,
-		uint32_t num_tiles, uint32_t tile_stride)
-{
-	if (samp->prsc) {
-		assert(samp->prsc == prsc);
-		assert(samp->num_tiles == num_tiles);
-		assert(samp->tile_stride == tile_stride);
-		return;
-	}
-	pipe_resource_reference(&samp->prsc, prsc);
-	samp->num_tiles = num_tiles;
-	samp->tile_stride = tile_stride;
-}
-
-static void
-prepare_query(struct fd_hw_query *hq, struct pipe_resource *prsc,
-		uint32_t num_tiles, uint32_t tile_stride)
-{
-	struct fd_hw_sample_period *period, *s;
-
-	/* prepare all the samples in the query: */
-	LIST_FOR_EACH_ENTRY_SAFE(period, s, &hq->current_periods, list) {
-		prepare_sample(period->start, prsc, num_tiles, tile_stride);
-		prepare_sample(period->end, prsc, num_tiles, tile_stride);
-
-		/* move from current_periods list to periods list: */
-		list_del(&period->list);
-		list_addtail(&period->list, &hq->periods);
-	}
-}
-
-static void
-prepare_queries(struct fd_context *ctx, struct pipe_resource *prsc,
-		uint32_t num_tiles, uint32_t tile_stride,
-		struct list_head *list, bool remove)
-{
-	struct fd_hw_query *hq, *s;
-	LIST_FOR_EACH_ENTRY_SAFE(hq, s, list, list) {
-		prepare_query(hq, prsc, num_tiles, tile_stride);
-		if (remove)
-			list_delinit(&hq->list);
-	}
-}
-
 /* called from gmem code once total storage requirements are known (ie.
  * number of samples times number of tiles)
  */
 void
-fd_hw_query_prepare(struct fd_context *ctx, uint32_t num_tiles)
+fd_hw_query_prepare(struct fd_batch *batch, uint32_t num_tiles)
 {
-	uint32_t tile_stride = ctx->next_sample_offset;
-	struct pipe_resource *prsc;
+	uint32_t tile_stride = batch->next_sample_offset;
 
-	pipe_resource_reference(&ctx->query_buf, NULL);
+	if (tile_stride > 0)
+		fd_resource_resize(batch->query_buf, tile_stride * num_tiles);
 
-	if (tile_stride > 0) {
-		struct pipe_screen *pscreen = &ctx->screen->base;
-		struct pipe_resource templ = {
-			.target  = PIPE_BUFFER,
-			.format  = PIPE_FORMAT_R8_UNORM,
-			.bind    = PIPE_BIND_QUERY_BUFFER,
-			.width0  = tile_stride * num_tiles,
-			.height0 = 1,
-			.depth0  = 1,
-			.array_size = 1,
-			.last_level = 0,
-			.nr_samples = 1,
-		};
-		prsc = pscreen->resource_create(pscreen, &templ);
-	} else {
-		prsc = NULL;
-	}
-
-	ctx->query_buf = prsc;
-	ctx->query_tile_stride = tile_stride;
+	batch->query_tile_stride = tile_stride;
 
-	prepare_queries(ctx, prsc, num_tiles, tile_stride,
-			&ctx->active_queries, false);
-	prepare_queries(ctx, prsc, num_tiles, tile_stride,
-			&ctx->current_queries, true);
+	while (batch->samples.size > 0) {
+		struct fd_hw_sample *samp =
+			util_dynarray_pop(&batch->samples, struct fd_hw_sample *);
+		samp->num_tiles = num_tiles;
+		samp->tile_stride = tile_stride;
+		fd_hw_sample_reference(batch->ctx, &samp, NULL);
+	}
 
 	/* reset things for next batch: */
-	ctx->next_sample_offset = 0;
+	batch->next_sample_offset = 0;
 }
 
 void
-fd_hw_query_prepare_tile(struct fd_context *ctx, uint32_t n,
+fd_hw_query_prepare_tile(struct fd_batch *batch, uint32_t n,
 		struct fd_ringbuffer *ring)
 {
-	uint32_t tile_stride = ctx->query_tile_stride;
+	uint32_t tile_stride = batch->query_tile_stride;
 	uint32_t offset = tile_stride * n;
 
 	/* bail if no queries: */
 	if (tile_stride == 0)
 		return;
 
-	fd_wfi(ctx, ring);
+	fd_wfi(batch->ctx, ring);
 	OUT_PKT0 (ring, HW_QUERY_BASE_REG, 1);
-	OUT_RELOCW(ring, fd_resource(ctx->query_buf)->bo, offset, 0, 0);
+	OUT_RELOCW(ring, fd_resource(batch->query_buf)->bo, offset, 0, 0);
 }
 
 void
-fd_hw_query_set_stage(struct fd_context *ctx, struct fd_ringbuffer *ring,
+fd_hw_query_set_stage(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		enum fd_render_stage stage)
 {
 	/* special case: internal blits (like mipmap level generation)
@@ -445,24 +414,24 @@ fd_hw_query_set_stage(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	 * don't enable queries which should be paused during internal
 	 * blits:
 	 */
-	if ((ctx->stage == FD_STAGE_BLIT) &&
+	if ((batch->stage == FD_STAGE_BLIT) &&
 			(stage != FD_STAGE_NULL))
 		return;
 
-	if (stage != ctx->stage) {
+	if (stage != batch->stage) {
 		struct fd_hw_query *hq;
-		LIST_FOR_EACH_ENTRY(hq, &ctx->active_queries, list) {
-			bool was_active = is_active(hq, ctx->stage);
+		LIST_FOR_EACH_ENTRY(hq, &batch->ctx->active_queries, list) {
+			bool was_active = is_active(hq, batch->stage);
 			bool now_active = is_active(hq, stage);
 
 			if (now_active && !was_active)
-				resume_query(ctx, hq, ring);
+				resume_query(batch, hq, ring);
 			else if (was_active && !now_active)
-				pause_query(ctx, hq, ring);
+				pause_query(batch, hq, ring);
 		}
 	}
-	clear_sample_cache(ctx);
-	ctx->stage = stage;
+	clear_sample_cache(batch);
+	batch->stage = stage;
 }
 
 /* call the provider->enable() for all the hw queries that were active
@@ -470,16 +439,17 @@ fd_hw_query_set_stage(struct fd_context *ctx, struct fd_ringbuffer *ring,
  * for the duration of the batch.
  */
 void
-fd_hw_query_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
+fd_hw_query_enable(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
+	struct fd_context *ctx = batch->ctx;
 	for (int idx = 0; idx < MAX_HW_SAMPLE_PROVIDERS; idx++) {
-		if (ctx->active_providers & (1 << idx)) {
+		if (batch->active_providers & (1 << idx)) {
 			assert(ctx->sample_providers[idx]);
 			if (ctx->sample_providers[idx]->enable)
 				ctx->sample_providers[idx]->enable(ctx, ring);
 		}
 	}
-	ctx->active_providers = 0;  /* clear it for next frame */
+	batch->active_providers = 0;  /* clear it for next frame */
 }
 
 void
@@ -505,7 +475,6 @@ fd_hw_query_init(struct pipe_context *pctx)
 	util_slab_create(&ctx->sample_period_pool, sizeof(struct fd_hw_sample_period),
 			16, UTIL_SLAB_SINGLETHREADED);
 	list_inithead(&ctx->active_queries);
-	list_inithead(&ctx->current_queries);
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.h b/src/gallium/drivers/freedreno/freedreno_query_hw.h
index 0afece3495f..abd86682a9f 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_hw.h
+++ b/src/gallium/drivers/freedreno/freedreno_query_hw.h
@@ -84,7 +84,7 @@ struct fd_hw_sample_provider {
 	/* when a new sample is required, emit appropriate cmdstream
 	 * and return a sample object:
 	 */
-	struct fd_hw_sample *(*get_sample)(struct fd_context *ctx,
+	struct fd_hw_sample *(*get_sample)(struct fd_batch *batch,
 			struct fd_ringbuffer *ring);
 
 	/* accumulate the results from specified sample period: */
@@ -119,18 +119,17 @@ struct fd_hw_query {
 
 	const struct fd_hw_sample_provider *provider;
 
-	/* list of fd_hw_sample_period in previous submits: */
+	/* list of fd_hw_sample_periods: */
 	struct list_head periods;
 
-	/* list of fd_hw_sample_period's in current submit: */
-	struct list_head current_periods;
-
 	/* if active and not paused, the current sample period (not
 	 * yet added to current_periods):
 	 */
 	struct fd_hw_sample_period *period;
 
-	struct list_head list;  /* list-node in ctx->active_queries */
+	struct list_head list;   /* list-node in ctx->active_queries */
+
+	int no_wait_cnt;         /* see fd_hw_get_query_result */
 };
 
 static inline struct fd_hw_query *
@@ -141,15 +140,15 @@ fd_hw_query(struct fd_query *q)
 
 struct fd_query * fd_hw_create_query(struct fd_context *ctx, unsigned query_type);
 /* helper for sample providers: */
-struct fd_hw_sample * fd_hw_sample_init(struct fd_context *ctx, uint32_t size);
+struct fd_hw_sample * fd_hw_sample_init(struct fd_batch *batch, uint32_t size);
 /* don't call directly, use fd_hw_sample_reference() */
 void __fd_hw_sample_destroy(struct fd_context *ctx, struct fd_hw_sample *samp);
-void fd_hw_query_prepare(struct fd_context *ctx, uint32_t num_tiles);
-void fd_hw_query_prepare_tile(struct fd_context *ctx, uint32_t n,
+void fd_hw_query_prepare(struct fd_batch *batch, uint32_t num_tiles);
+void fd_hw_query_prepare_tile(struct fd_batch *batch, uint32_t n,
 		struct fd_ringbuffer *ring);
-void fd_hw_query_set_stage(struct fd_context *ctx,
+void fd_hw_query_set_stage(struct fd_batch *batch,
 		struct fd_ringbuffer *ring, enum fd_render_stage stage);
-void fd_hw_query_enable(struct fd_context *ctx, struct fd_ringbuffer *ring);
+void fd_hw_query_enable(struct fd_batch *batch, struct fd_ringbuffer *ring);
 void fd_hw_query_register_provider(struct pipe_context *pctx,
 		const struct fd_hw_sample_provider *provider);
 void fd_hw_query_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index a9b94610e46..b6c9488ec65 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -48,23 +48,6 @@
 /* XXX this should go away, needed for 'struct winsys_handle' */
 #include "state_tracker/drm_driver.h"
 
-static bool
-pending(struct fd_resource *rsc, bool write)
-{
-	/* if we have a pending GPU write, we are busy in any case: */
-	if (rsc->write_batch)
-		return true;
-
-	/* if CPU wants to write, but we are pending a GPU read, we are busy: */
-	if (write && rsc->batch_mask)
-		return true;
-
-	if (rsc->stencil && pending(rsc->stencil, write))
-		return true;
-
-	return false;
-}
-
 static void
 fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc)
 {
@@ -755,6 +738,20 @@ slice_alignment(struct pipe_screen *pscreen, const struct pipe_resource *tmpl)
 	}
 }
 
+/* special case to resize query buf after it has been allocated.. */
+void
+fd_resource_resize(struct pipe_resource *prsc, uint32_t sz)
+{
+	struct fd_resource *rsc = fd_resource(prsc);
+
+	debug_assert(prsc->width0 == 0);
+	debug_assert(prsc->target == PIPE_BUFFER);
+	debug_assert(prsc->bind == PIPE_BIND_QUERY_BUFFER);
+
+	prsc->width0 = sz;
+	realloc_bo(rsc, setup_slices(rsc, 1, prsc->format));
+}
+
 /**
  * Create a new texture object, using the given template info.
  */
@@ -812,6 +809,15 @@ fd_resource_create(struct pipe_screen *pscreen,
 
 	size = setup_slices(rsc, alignment, format);
 
+	/* special case for hw-query buffer, which we need to allocate before we
+	 * know the size:
+	 */
+	if (size == 0) {
+		/* note, semi-intentional == instead of & */
+		debug_assert(prsc->bind == PIPE_BIND_QUERY_BUFFER);
+		return prsc;
+	}
+
 	if (rsc->layer_first) {
 		rsc->layer_size = align(size, 4096);
 		size = rsc->layer_size * prsc->array_size;
@@ -1048,7 +1054,8 @@ fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond, bool discard)
 		util_blitter_save_render_condition(ctx->blitter,
 			ctx->cond_query, ctx->cond_cond, ctx->cond_mode);
 
-	fd_hw_query_set_stage(ctx, ctx->batch->draw, FD_STAGE_BLIT);
+	if (ctx->batch)
+		fd_hw_query_set_stage(ctx->batch, ctx->batch->draw, FD_STAGE_BLIT);
 
 	ctx->discard = discard;
 }
@@ -1056,7 +1063,8 @@ fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond, bool discard)
 static void
 fd_blitter_pipe_end(struct fd_context *ctx)
 {
-	fd_hw_query_set_stage(ctx, ctx->batch->draw, FD_STAGE_NULL);
+	if (ctx->batch)
+		fd_hw_query_set_stage(ctx->batch, ctx->batch->draw, FD_STAGE_NULL);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h
index fcdb4c1e364..8caab6b8a5a 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -104,6 +104,23 @@ fd_resource(struct pipe_resource *ptex)
 	return (struct fd_resource *)ptex;
 }
 
+static inline bool
+pending(struct fd_resource *rsc, bool write)
+{
+	/* if we have a pending GPU write, we are busy in any case: */
+	if (rsc->write_batch)
+		return true;
+
+	/* if CPU wants to write, but we are pending a GPU read, we are busy: */
+	if (write && rsc->batch_mask)
+		return true;
+
+	if (rsc->stencil && pending(rsc->stencil, write))
+		return true;
+
+	return false;
+}
+
 struct fd_transfer {
 	struct pipe_transfer base;
 	void *staging;
@@ -140,6 +157,8 @@ fd_resource_offset(struct fd_resource *rsc, unsigned level, unsigned layer)
 void fd_resource_screen_init(struct pipe_screen *pscreen);
 void fd_resource_context_init(struct pipe_context *pctx);
 
+void fd_resource_resize(struct pipe_resource *prsc, uint32_t sz);
+
 bool fd_render_condition_check(struct pipe_context *pctx);
 
 #endif /* FREEDRENO_RESOURCE_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index 8ac41d29077..849ea08037d 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -37,6 +37,7 @@
 #include "freedreno_resource.h"
 #include "freedreno_texture.h"
 #include "freedreno_gmem.h"
+#include "freedreno_query_hw.h"
 #include "freedreno_util.h"
 
 /* All the generic state handling.. In case of CSO's that are specific
@@ -118,8 +119,10 @@ fd_set_framebuffer_state(struct pipe_context *pctx,
 	struct pipe_framebuffer_state *cso;
 
 	if (ctx->screen->reorder) {
-		struct fd_batch *batch =
-			fd_batch_from_fb(&ctx->screen->batch_cache, ctx, framebuffer);
+		struct fd_batch *batch;
+		if (likely(ctx->batch))
+			fd_hw_query_set_stage(ctx->batch, ctx->batch->draw, FD_STAGE_NULL);
+		batch = fd_batch_from_fb(&ctx->screen->batch_cache, ctx, framebuffer);
 		fd_batch_reference(&ctx->batch, NULL);
 		ctx->batch = batch;
 		ctx->dirty = ~0;