X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fwinsys%2Fr600%2Fdrm%2Fr600_hw_context.c;h=5d415ae6348ad72959a80b59d21f99aa1e709420;hb=4682e706012fe26627a2f827db01b5068cc62814;hp=e60bf75c5b1675130278435bdc569c0816f2a986;hpb=d79a4a612bae66581caf21a74bd745ec51a18e80;p=mesa.git

diff --git a/src/gallium/winsys/r600/drm/r600_hw_context.c b/src/gallium/winsys/r600/drm/r600_hw_context.c
index e60bf75c5b1..5d415ae6348 100644
--- a/src/gallium/winsys/r600/drm/r600_hw_context.c
+++ b/src/gallium/winsys/r600/drm/r600_hw_context.c
@@ -23,47 +23,192 @@
  * Authors:
  *      Jerome Glisse
  */
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include "xf86drm.h"
-#include "radeon_drm.h"
 #include "r600_priv.h"
-#include "bof.h"
 #include "r600d.h"
+#include "util/u_memory.h"
+#include 

 #define GROUP_FORCE_NEW_BLOCK	0

-static void INLINE r600_context_update_fenced_list(struct r600_context *ctx)
+/* Get backends mask */
+void r600_get_backend_mask(struct r600_context *ctx)
 {
-	for (int i = 0; i < ctx->creloc; i++) {
-		if (!LIST_IS_EMPTY(&ctx->bo[i]->fencedlist))
-			LIST_DELINIT(&ctx->bo[i]->fencedlist);
-		LIST_ADDTAIL(&ctx->bo[i]->fencedlist, &ctx->fenced_bo);
-		ctx->bo[i]->fence = ctx->radeon->fence;
-		ctx->bo[i]->ctx = ctx;
+	struct r600_bo * buffer;
+	u32 * results;
+	unsigned num_backends = r600_get_num_backends(ctx->radeon);
+	unsigned i, mask = 0;
+
+	/* if backend_map query is supported by the kernel */
+	if (ctx->radeon->info.r600_backend_map_valid) {
+		unsigned num_tile_pipes = r600_get_num_tile_pipes(ctx->radeon);
+		unsigned backend_map = r600_get_backend_map(ctx->radeon);
+		unsigned item_width, item_mask;
+
+		if (ctx->radeon->chip_class >= EVERGREEN) {
+			item_width = 4;
+			item_mask = 0x7;
+		} else {
+			item_width = 2;
+			item_mask = 0x3;
+		}
+
+		while(num_tile_pipes--) {
+			i = backend_map & item_mask;
+			mask |= (1<<i);
+			backend_map >>= item_width;
+		}
+		if (mask != 0) {
+			ctx->backend_mask = mask;
+			return;
+		}
+	}
+
+	/* otherwise backup path for older kernels */
+
+	/* create buffer for event data */
+	buffer = r600_bo(ctx->radeon, ctx->max_db*16, 1, 0,
+			 PIPE_USAGE_STAGING);
+	if (!buffer)
+		goto err;
+
+	/* initialize buffer with zeroes */
+	results = r600_bo_map(ctx->radeon, buffer, ctx->cs, PIPE_TRANSFER_WRITE);
+	if (results) {
+		memset(results, 0, ctx->max_db * 4 * 4);
+		r600_bo_unmap(ctx->radeon, buffer);
+
+		/* emit EVENT_WRITE for ZPASS_DONE */
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
+		ctx->pm4[ctx->pm4_cdwords++] = 0;
+		ctx->pm4[ctx->pm4_cdwords++] = 0;
+
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);
+
+		/* execute */
+		r600_context_flush(ctx, 0);
+
+		/* analyze results */
+		results = r600_bo_map(ctx->radeon, buffer, ctx->cs, PIPE_TRANSFER_READ);
+		if (results) {
+			for(i = 0; i < ctx->max_db; i++) {
+				/* at least highest bit will be set if backend is used */
+				if (results[i*4 + 1])
+					mask |= (1<<i);
+			}
+			r600_bo_unmap(ctx->radeon, buffer);
+		}
+	}
+
+	r600_bo_reference(&buffer, NULL);
+
+	if (mask != 0) {
+		ctx->backend_mask = mask;
+		return;
+	}
+
+err:
+	/* fallback to old method - set num_backends lower bits to 1 */
+	ctx->backend_mask = (~((u32)0))>>(32-num_backends);
+	return;
 }

-static void INLINE r600_context_fence_wraparound(struct r600_context *ctx, unsigned fence)
+static inline void r600_context_ps_partial_flush(struct r600_context *ctx)
 {
-	struct radeon_bo *bo = NULL;
- struct radeon_bo *tmp; + if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING)) + return; - LIST_FOR_EACH_ENTRY_SAFE(bo, tmp, &ctx->fenced_bo, fencedlist) { - if (bo->fence <= *ctx->radeon->cfence) { - LIST_DELINIT(&bo->fencedlist); - bo->fence = 0; - } else { - bo->fence = fence; + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4); + + ctx->flags &= ~R600_CONTEXT_DRAW_PENDING; +} + +void r600_init_cs(struct r600_context *ctx) +{ + /* R6xx requires this packet at the start of each command buffer */ + if (ctx->radeon->family < CHIP_RV770) { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_START_3D_CMDBUF, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = 0x00000000; + } + /* All asics require this one */ + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_CONTEXT_CONTROL, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = 0x80000000; + ctx->pm4[ctx->pm4_cdwords++] = 0x80000000; + + ctx->init_dwords = ctx->pm4_cdwords; +} + +static void r600_init_block(struct r600_context *ctx, + struct r600_block *block, + const struct r600_reg *reg, int index, int nreg, + unsigned opcode, unsigned offset_base) +{ + int i = index; + int j, n = nreg; + + /* initialize block */ + if (opcode == PKT3_SET_RESOURCE) { + block->flags = BLOCK_FLAG_RESOURCE; + block->status |= R600_BLOCK_STATUS_RESOURCE_DIRTY; /* dirty all blocks at start */ + } else { + block->flags = 0; + block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */ + } + block->start_offset = reg[i].offset; + block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0); + block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2; + block->reg = &block->pm4[block->pm4_ndwords]; + block->pm4_ndwords += n; + block->nreg = n; + block->nreg_dirty = n; + LIST_INITHEAD(&block->list); + LIST_INITHEAD(&block->enable_list); + + for (j = 0; j < n; j++) { + if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) { + block->flags |= REG_FLAG_DIRTY_ALWAYS; + } + if (reg[i+j].flags & REG_FLAG_ENABLE_ALWAYS) { + if (!(block->status & R600_BLOCK_STATUS_ENABLED)) { + block->status |= R600_BLOCK_STATUS_ENABLED; + LIST_ADDTAIL(&block->enable_list, &ctx->enable_list); + LIST_ADDTAIL(&block->list,&ctx->dirty); + } + } + if (reg[i+j].flags & REG_FLAG_FLUSH_CHANGE) { + block->flags |= REG_FLAG_FLUSH_CHANGE; + } + + if (reg[i+j].flags & REG_FLAG_NEED_BO) { + block->nbo++; + assert(block->nbo < R600_BLOCK_MAX_BO); + block->pm4_bo_index[j] = block->nbo; + block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0); + block->pm4[block->pm4_ndwords++] = 0x00000000; + if (reg[i+j].flags & REG_FLAG_RV6XX_SBU) { + block->reloc[block->nbo].flush_flags = 0; + block->reloc[block->nbo].flush_mask = 0; + } else { + block->reloc[block->nbo].flush_flags = reg[i+j].flush_flags; + block->reloc[block->nbo].flush_mask = reg[i+j].flush_mask; + } + block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1; + } + if ((ctx->radeon->family > CHIP_R600) && + (ctx->radeon->family < CHIP_RV770) && reg[i+j].flags & REG_FLAG_RV6XX_SBU) { + block->pm4[block->pm4_ndwords++] = PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0); + block->pm4[block->pm4_ndwords++] = reg[i+j].flush_flags; } } + for (j = 0; j < n; j++) { + if (reg[i+j].flush_flags) { + block->pm4_flush_ndwords += 7; + } + } + /* check that we stay in limit */ + assert(block->pm4_ndwords < R600_BLOCK_MAX_REG); } int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg, @@ -74,14 +219,18 @@ int r600_context_add_block(struct r600_context *ctx, 
const struct r600_reg *reg, int offset; for (unsigned i = 0, n = 0; i < nreg; i += n) { - u32 j; - /* ignore new block balise */ if (reg[i].offset == GROUP_FORCE_NEW_BLOCK) { n = 1; continue; } + /* ignore regs not on R600 on R600 */ + if ((reg[i].flags & REG_FLAG_NOT_R600) && ctx->radeon->family == CHIP_R600) { + n = 1; + continue; + } + /* register that need relocation are in their own group */ /* find number of consecutive registers */ n = 0; @@ -102,54 +251,18 @@ int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, } ctx->nblocks++; for (int j = 0; j < n; j++) { - range = &ctx->range[CTX_RANGE_ID(ctx, reg[i + j].offset)]; - range->blocks[CTX_BLOCK_ID(ctx, reg[i + j].offset)] = block; + range = &ctx->range[CTX_RANGE_ID(reg[i + j].offset)]; + /* create block table if it doesn't exist */ + if (!range->blocks) + range->blocks = calloc(1 << HASH_SHIFT, sizeof(void *)); + if (!range->blocks) + return -1; + + range->blocks[CTX_BLOCK_ID(reg[i + j].offset)] = block; } - /* initialize block */ - block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */ - block->start_offset = reg[i].offset; - block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0); - block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2; - block->reg = &block->pm4[block->pm4_ndwords]; - block->pm4_ndwords += n; - block->nreg = n; - block->nreg_dirty = n; - block->flags = 0; - LIST_INITHEAD(&block->list); + r600_init_block(ctx, block, reg, i, n, opcode, offset_base); - for (j = 0; j < n; j++) { - if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) { - block->flags |= REG_FLAG_DIRTY_ALWAYS; - } - if (reg[i+j].flags & REG_FLAG_NEED_BO) { - block->nbo++; - assert(block->nbo < R600_BLOCK_MAX_BO); - block->pm4_bo_index[j] = block->nbo; - block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0); - block->pm4[block->pm4_ndwords++] = 0x00000000; - if (reg[i+j].flags & REG_FLAG_RV6XX_SBU) { - block->reloc[block->nbo].flush_flags = 0; - block->reloc[block->nbo].flush_mask = 0; - } else { - block->reloc[block->nbo].flush_flags = reg[i+j].flush_flags; - block->reloc[block->nbo].flush_mask = reg[i+j].flush_mask; - } - block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1; - } - if ((ctx->radeon->family > CHIP_R600) && - (ctx->radeon->family < CHIP_RV770) && reg[i+j].flags & REG_FLAG_RV6XX_SBU) { - block->pm4[block->pm4_ndwords++] = PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0); - block->pm4[block->pm4_ndwords++] = reg[i+j].flush_flags; - } - } - for (j = 0; j < n; j++) { - if (reg[i+j].flush_flags) { - block->pm4_flush_ndwords += 7; - } - } - /* check that we stay in limit */ - assert(block->pm4_ndwords < R600_BLOCK_MAX_REG); } return 0; } @@ -157,17 +270,17 @@ int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, /* R600/R700 configuration */ static const struct r600_reg r600_config_reg_list[] = { {R_008958_VGT_PRIMITIVE_TYPE, 0, 0, 0}, - {R_008C00_SQ_CONFIG, 0, 0, 0}, - {R_008C04_SQ_GPR_RESOURCE_MGMT_1, 0, 0, 0}, - {R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0, 0, 0}, - {R_008C0C_SQ_THREAD_RESOURCE_MGMT, 0, 0, 0}, - {R_008C10_SQ_STACK_RESOURCE_MGMT_1, 0, 0, 0}, - {R_008C14_SQ_STACK_RESOURCE_MGMT_2, 0, 0, 0}, - {R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, 0, 0, 0}, - {R_009508_TA_CNTL_AUX, 0, 0, 0}, - {R_009714_VC_ENHANCE, 0, 0, 0}, - {R_009830_DB_DEBUG, 0, 0, 0}, - {R_009838_DB_WATERMARKS, 0, 0, 0}, + {R_008C00_SQ_CONFIG, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_008C04_SQ_GPR_RESOURCE_MGMT_1, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + 
{R_008C08_SQ_GPR_RESOURCE_MGMT_2, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_008C0C_SQ_THREAD_RESOURCE_MGMT, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_008C10_SQ_STACK_RESOURCE_MGMT_1, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_008C14_SQ_STACK_RESOURCE_MGMT_2, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_009508_TA_CNTL_AUX, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_009714_VC_ENHANCE, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_009830_DB_DEBUG, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, + {R_009838_DB_WATERMARKS, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0, 0}, }; static const struct r600_reg r600_ctl_const_list[] = { @@ -315,14 +428,14 @@ static const struct r600_reg r600_context_reg_list[] = { {R_0286DC_SPI_FOG_CNTL, 0, 0, 0}, {R_0286E0_SPI_FOG_FUNC_SCALE, 0, 0, 0}, {R_0286E4_SPI_FOG_FUNC_BIAS, 0, 0, 0}, - {R_028780_CB_BLEND0_CONTROL, 0, 0, 0}, - {R_028784_CB_BLEND1_CONTROL, 0, 0, 0}, - {R_028788_CB_BLEND2_CONTROL, 0, 0, 0}, - {R_02878C_CB_BLEND3_CONTROL, 0, 0, 0}, - {R_028790_CB_BLEND4_CONTROL, 0, 0, 0}, - {R_028794_CB_BLEND5_CONTROL, 0, 0, 0}, - {R_028798_CB_BLEND6_CONTROL, 0, 0, 0}, - {R_02879C_CB_BLEND7_CONTROL, 0, 0, 0}, + {R_028780_CB_BLEND0_CONTROL, REG_FLAG_NOT_R600, 0, 0}, + {R_028784_CB_BLEND1_CONTROL, REG_FLAG_NOT_R600, 0, 0}, + {R_028788_CB_BLEND2_CONTROL, REG_FLAG_NOT_R600, 0, 0}, + {R_02878C_CB_BLEND3_CONTROL, REG_FLAG_NOT_R600, 0, 0}, + {R_028790_CB_BLEND4_CONTROL, REG_FLAG_NOT_R600, 0, 0}, + {R_028794_CB_BLEND5_CONTROL, REG_FLAG_NOT_R600, 0, 0}, + {R_028798_CB_BLEND6_CONTROL, REG_FLAG_NOT_R600, 0, 0}, + {R_02879C_CB_BLEND7_CONTROL, REG_FLAG_NOT_R600, 0, 0}, {R_0287A0_CB_SHADER_CONTROL, 0, 0, 0}, {R_028800_DB_DEPTH_CONTROL, 0, 0, 0}, {R_028804_CB_BLEND_CONTROL, 0, 0, 0}, @@ -527,23 +640,44 @@ static const struct r600_reg r600_context_reg_list[] = { }; /* SHADER RESOURCE R600/R700 */ -static int r600_state_resource_init(struct r600_context *ctx, u32 offset) +int r600_resource_init(struct r600_context *ctx, struct r600_range *range, unsigned offset, unsigned nblocks, unsigned stride, struct r600_reg *reg, int nreg, unsigned offset_base) +{ + int i; + struct r600_block *block; + range->blocks = calloc(nblocks, sizeof(struct r600_block *)); + if (range->blocks == NULL) + return -ENOMEM; + + reg[0].offset += offset; + for (i = 0; i < nblocks; i++) { + block = calloc(1, sizeof(struct r600_block)); + if (block == NULL) { + return -ENOMEM; + } + ctx->nblocks++; + range->blocks[i] = block; + r600_init_block(ctx, block, reg, 0, nreg, PKT3_SET_RESOURCE, offset_base); + + reg[0].offset += stride; + } + return 0; +} + + +static int r600_resource_range_init(struct r600_context *ctx, struct r600_range *range, unsigned offset, unsigned nblocks, unsigned stride) { struct r600_reg r600_shader_resource[] = { - {R_038000_RESOURCE0_WORD0, 0, 0, 0}, - {R_038004_RESOURCE0_WORD1, 0, 0, 0}, - {R_038008_RESOURCE0_WORD2, REG_FLAG_NEED_BO, S_0085F0_TC_ACTION_ENA(1) | S_0085F0_VC_ACTION_ENA(1), 0xFFFFFFFF}, - {R_03800C_RESOURCE0_WORD3, REG_FLAG_NEED_BO, S_0085F0_TC_ACTION_ENA(1) | S_0085F0_VC_ACTION_ENA(1), 0xFFFFFFFF}, + {R_038000_RESOURCE0_WORD0, REG_FLAG_NEED_BO, S_0085F0_TC_ACTION_ENA(1) | S_0085F0_VC_ACTION_ENA(1), 0xFFFFFFFF}, + {R_038004_RESOURCE0_WORD1, REG_FLAG_NEED_BO, S_0085F0_TC_ACTION_ENA(1) | S_0085F0_VC_ACTION_ENA(1), 0xFFFFFFFF}, + {R_038008_RESOURCE0_WORD2, 0, 0, 0}, + 
{R_03800C_RESOURCE0_WORD3, 0, 0, 0}, {R_038010_RESOURCE0_WORD4, 0, 0, 0}, {R_038014_RESOURCE0_WORD5, 0, 0, 0}, {R_038018_RESOURCE0_WORD6, 0, 0, 0}, }; unsigned nreg = Elements(r600_shader_resource); - for (int i = 0; i < nreg; i++) { - r600_shader_resource[i].offset += offset; - } - return r600_context_add_block(ctx, r600_shader_resource, nreg, PKT3_SET_RESOURCE, R600_RESOURCE_OFFSET); + return r600_resource_init(ctx, range, offset, nblocks, stride, r600_shader_resource, nreg, R600_RESOURCE_OFFSET); } /* SHADER SAMPLER R600/R700 */ @@ -594,15 +728,20 @@ static int r600_loop_const_init(struct r600_context *ctx, u32 offset) return r600_context_add_block(ctx, r600_loop_consts, nreg, PKT3_SET_LOOP_CONST, R600_LOOP_CONST_OFFSET); } -static void r600_context_clear_fenced_bo(struct r600_context *ctx) +static void r600_free_resource_range(struct r600_context *ctx, struct r600_range *range, int nblocks) { - struct radeon_bo *bo, *tmp; - - LIST_FOR_EACH_ENTRY_SAFE(bo, tmp, &ctx->fenced_bo, fencedlist) { - LIST_DELINIT(&bo->fencedlist); - bo->fence = 0; - bo->ctx = NULL; + struct r600_block *block; + int i; + for (i = 0; i < nblocks; i++) { + block = range->blocks[i]; + if (block) { + for (int k = 1; k <= block->nbo; k++) + r600_bo_reference(&block->reloc[k].bo, NULL); + free(block); + } } + free(range->blocks); + } /* initialize */ @@ -612,54 +751,101 @@ void r600_context_fini(struct r600_context *ctx) struct r600_range *range; for (int i = 0; i < NUM_RANGES; i++) { + if (!ctx->range[i].blocks) + continue; for (int j = 0; j < (1 << HASH_SHIFT); j++) { block = ctx->range[i].blocks[j]; if (block) { for (int k = 0, offset = block->start_offset; k < block->nreg; k++, offset += 4) { - range = &ctx->range[CTX_RANGE_ID(ctx, offset)]; - range->blocks[CTX_BLOCK_ID(ctx, offset)] = NULL; + range = &ctx->range[CTX_RANGE_ID(offset)]; + range->blocks[CTX_BLOCK_ID(offset)] = NULL; } for (int k = 1; k <= block->nbo; k++) { - r600_bo_reference(ctx->radeon, &block->reloc[k].bo, NULL); + r600_bo_reference(&block->reloc[k].bo, NULL); } free(block); } } free(ctx->range[i].blocks); } + r600_free_resource_range(ctx, &ctx->ps_resources, ctx->num_ps_resources); + r600_free_resource_range(ctx, &ctx->vs_resources, ctx->num_vs_resources); + r600_free_resource_range(ctx, &ctx->fs_resources, ctx->num_fs_resources); free(ctx->range); free(ctx->blocks); - free(ctx->reloc); free(ctx->bo); - free(ctx->pm4); + ctx->radeon->ws->cs_destroy(ctx->cs); - r600_context_clear_fenced_bo(ctx); memset(ctx, 0, sizeof(struct r600_context)); } +static void r600_add_resource_block(struct r600_context *ctx, struct r600_range *range, int num_blocks, int *index) +{ + int c = *index; + for (int j = 0; j < num_blocks; j++) { + if (!range->blocks[j]) + continue; + + ctx->blocks[c++] = range->blocks[j]; + } + *index = c; +} + +int r600_setup_block_table(struct r600_context *ctx) +{ + /* setup block table */ + int c = 0; + ctx->blocks = calloc(ctx->nblocks, sizeof(void*)); + if (!ctx->blocks) + return -ENOMEM; + for (int i = 0; i < NUM_RANGES; i++) { + if (!ctx->range[i].blocks) + continue; + for (int j = 0, add; j < (1 << HASH_SHIFT); j++) { + if (!ctx->range[i].blocks[j]) + continue; + + add = 1; + for (int k = 0; k < c; k++) { + if (ctx->blocks[k] == ctx->range[i].blocks[j]) { + add = 0; + break; + } + } + if (add) { + assert(c < ctx->nblocks); + ctx->blocks[c++] = ctx->range[i].blocks[j]; + j += (ctx->range[i].blocks[j]->nreg) - 1; + } + } + } + + r600_add_resource_block(ctx, &ctx->ps_resources, ctx->num_ps_resources, &c); + 
r600_add_resource_block(ctx, &ctx->vs_resources, ctx->num_vs_resources, &c); + r600_add_resource_block(ctx, &ctx->fs_resources, ctx->num_fs_resources, &c); + return 0; +} + int r600_context_init(struct r600_context *ctx, struct radeon *radeon) { int r; memset(ctx, 0, sizeof(struct r600_context)); ctx->radeon = radeon; + LIST_INITHEAD(&ctx->query_list); + /* init dirty list */ + LIST_INITHEAD(&ctx->dirty); + LIST_INITHEAD(&ctx->resource_dirty); + LIST_INITHEAD(&ctx->enable_list); + ctx->range = calloc(NUM_RANGES, sizeof(struct r600_range)); if (!ctx->range) { r = -ENOMEM; goto out_err; } - /* initialize hash */ - for (int i = 0; i < NUM_RANGES; i++) { - ctx->range[i].blocks = calloc(1 << HASH_SHIFT, sizeof(void*)); - if (ctx->range[i].blocks == NULL) { - r = -ENOMEM; - goto out_err; - } - } - /* add blocks */ r = r600_context_add_block(ctx, r600_config_reg_list, Elements(r600_config_reg_list), PKT3_SET_CONFIG_REG, R600_CONFIG_REG_OFFSET); @@ -699,79 +885,46 @@ int r600_context_init(struct r600_context *ctx, struct radeon *radeon) if (r) goto out_err; } - /* PS RESOURCE */ - for (int j = 0, offset = 0; j < 160; j++, offset += 0x1C) { - r = r600_state_resource_init(ctx, offset); - if (r) - goto out_err; - } - /* VS RESOURCE */ - for (int j = 0, offset = 0x1180; j < 160; j++, offset += 0x1C) { - r = r600_state_resource_init(ctx, offset); - if (r) - goto out_err; - } - /* FS RESOURCE */ - for (int j = 0, offset = 0x2300; j < 16; j++, offset += 0x1C) { - r = r600_state_resource_init(ctx, offset); - if (r) - goto out_err; - } + + ctx->num_ps_resources = 160; + ctx->num_vs_resources = 160; + ctx->num_fs_resources = 16; + r = r600_resource_range_init(ctx, &ctx->ps_resources, 0, 160, 0x1c); + if (r) + goto out_err; + r = r600_resource_range_init(ctx, &ctx->vs_resources, 0x1180, 160, 0x1c); + if (r) + goto out_err; + r = r600_resource_range_init(ctx, &ctx->fs_resources, 0x2300, 16, 0x1c); + if (r) + goto out_err; /* PS loop const */ r600_loop_const_init(ctx, 0); /* VS loop const */ r600_loop_const_init(ctx, 32); - /* setup block table */ - ctx->blocks = calloc(ctx->nblocks, sizeof(void*)); - for (int i = 0, c = 0; i < NUM_RANGES; i++) { - for (int j = 0, add; j < (1 << HASH_SHIFT); j++) { - if (ctx->range[i].blocks[j]) { - add = 1; - for (int k = 0; k < c; k++) { - if (ctx->blocks[k] == ctx->range[i].blocks[j]) { - add = 0; - break; - } - } - if (add) { - assert(c < ctx->nblocks); - ctx->blocks[c++] = ctx->range[i].blocks[j]; - j += (ctx->range[i].blocks[j]->nreg) - 1; - } - } - } - } + r = r600_setup_block_table(ctx); + if (r) + goto out_err; + + ctx->cs = radeon->ws->cs_create(radeon->ws); /* allocate cs variables */ - ctx->nreloc = RADEON_CTX_MAX_PM4; - ctx->reloc = calloc(ctx->nreloc, sizeof(struct r600_reloc)); - if (ctx->reloc == NULL) { - r = -ENOMEM; - goto out_err; - } - ctx->bo = calloc(ctx->nreloc, sizeof(void *)); + ctx->bo = calloc(RADEON_MAX_CMDBUF_DWORDS, sizeof(void *)); if (ctx->bo == NULL) { r = -ENOMEM; goto out_err; } - ctx->pm4_ndwords = RADEON_CTX_MAX_PM4; - ctx->pm4 = calloc(ctx->pm4_ndwords, 4); - if (ctx->pm4 == NULL) { - r = -ENOMEM; - goto out_err; - } + ctx->pm4_ndwords = RADEON_MAX_CMDBUF_DWORDS; + ctx->pm4 = ctx->cs->buf; + + r600_init_cs(ctx); /* save 16dwords space for fence mecanism */ ctx->pm4_ndwords -= 16; - - LIST_INITHEAD(&ctx->fenced_bo); - - /* init dirty list */ - LIST_INITHEAD(&ctx->dirty); - ctx->max_db = 4; + r600_get_backend_mask(ctx); return 0; out_err: r600_context_fini(ctx); @@ -785,7 +938,7 @@ void r600_context_flush_all(struct r600_context *ctx, 
unsigned flush_flags) if ((ctx->pm4_dirty_cdwords + ndwords + ctx->pm4_cdwords) > ctx->pm4_ndwords) { /* need to flush */ - r600_context_flush(ctx); + r600_context_flush(ctx, RADEON_FLUSH_ASYNC); } ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_SYNC, 3, ctx->predicate_drawing); @@ -796,11 +949,8 @@ void r600_context_flush_all(struct r600_context *ctx, unsigned flush_flags) } void r600_context_bo_flush(struct r600_context *ctx, unsigned flush_flags, - unsigned flush_mask, struct r600_bo *rbo) + unsigned flush_mask, struct r600_bo *bo) { - struct radeon_bo *bo; - - bo = rbo->bo; /* if bo has already been flushed */ if (!(~bo->last_flush & flush_flags)) { bo->last_flush &= flush_mask; @@ -812,14 +962,17 @@ void r600_context_bo_flush(struct r600_context *ctx, unsigned flush_flags, G_0085F0_DB_ACTION_ENA(flush_flags))) { if (ctx->flags & R600_CONTEXT_CHECK_EVENT_FLUSH) { /* the rv670 seems to fail fbo-generatemipmap unless we flush the CB1 dest base ena */ - if ((ctx->radeon->family == CHIP_RV670) || - (ctx->radeon->family == CHIP_RS780) || - (ctx->radeon->family == CHIP_RS880)) { - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_SYNC, 3, ctx->predicate_drawing); - ctx->pm4[ctx->pm4_cdwords++] = S_0085F0_CB1_DEST_BASE_ENA(1); /* CP_COHER_CNTL */ - ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff; /* CP_COHER_SIZE */ - ctx->pm4[ctx->pm4_cdwords++] = 0; /* CP_COHER_BASE */ - ctx->pm4[ctx->pm4_cdwords++] = 0x0000000A; /* POLL_INTERVAL */ + if ((bo->binding & BO_BOUND_TEXTURE) && + (flush_flags & S_0085F0_CB_ACTION_ENA(1))) { + if ((ctx->radeon->family == CHIP_RV670) || + (ctx->radeon->family == CHIP_RS780) || + (ctx->radeon->family == CHIP_RS880)) { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_SYNC, 3, ctx->predicate_drawing); + ctx->pm4[ctx->pm4_cdwords++] = S_0085F0_CB1_DEST_BASE_ENA(1); /* CP_COHER_CNTL */ + ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff; /* CP_COHER_SIZE */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* CP_COHER_BASE */ + ctx->pm4[ctx->pm4_cdwords++] = 0x0000000A; /* POLL_INTERVAL */ + } } ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, ctx->predicate_drawing); @@ -829,38 +982,15 @@ void r600_context_bo_flush(struct r600_context *ctx, unsigned flush_flags, } else { ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_SYNC, 3, ctx->predicate_drawing); ctx->pm4[ctx->pm4_cdwords++] = flush_flags; - ctx->pm4[ctx->pm4_cdwords++] = (bo->size + 255) >> 8; + ctx->pm4[ctx->pm4_cdwords++] = (bo->buf->size + 255) >> 8; ctx->pm4[ctx->pm4_cdwords++] = 0x00000000; ctx->pm4[ctx->pm4_cdwords++] = 0x0000000A; ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, ctx->predicate_drawing); - ctx->pm4[ctx->pm4_cdwords++] = bo->reloc_id; + ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, bo, RADEON_USAGE_WRITE); } bo->last_flush = (bo->last_flush | flush_flags) & flush_mask; } -void r600_context_bo_reloc(struct r600_context *ctx, u32 *pm4, struct r600_bo *rbo) -{ - struct radeon_bo *bo; - - bo = rbo->bo; - assert(bo != NULL); - if (bo->reloc) { - *pm4 = bo->reloc_id; - return; - } - bo->reloc = &ctx->reloc[ctx->creloc]; - bo->reloc_id = ctx->creloc * sizeof(struct r600_reloc) / 4; - ctx->reloc[ctx->creloc].handle = bo->handle; - ctx->reloc[ctx->creloc].read_domain = rbo->domains & (RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM); - ctx->reloc[ctx->creloc].write_domain = rbo->domains & (RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM); - ctx->reloc[ctx->creloc].flags = 0; - radeon_bo_reference(ctx->radeon, &ctx->bo[ctx->creloc], bo); - rbo->fence = ctx->radeon->fence; - ctx->creloc++; - /* set PKT3 to 
point to proper reloc */ - *pm4 = bo->reloc_id; -} - void r600_context_reg(struct r600_context *ctx, unsigned offset, unsigned value, unsigned mask) @@ -871,8 +1001,8 @@ void r600_context_reg(struct r600_context *ctx, unsigned new_val; int dirty; - range = &ctx->range[CTX_RANGE_ID(ctx, offset)]; - block = range->blocks[CTX_BLOCK_ID(ctx, offset)]; + range = &ctx->range[CTX_RANGE_ID(offset)]; + block = range->blocks[CTX_BLOCK_ID(offset)]; id = (offset - block->start_offset) >> 2; dirty = block->status & R600_BLOCK_STATUS_DIRTY; @@ -884,42 +1014,49 @@ void r600_context_reg(struct r600_context *ctx, dirty |= R600_BLOCK_STATUS_DIRTY; block->reg[id] = new_val; } - r600_context_dirty_block(ctx, block, dirty, id); + if (dirty) + r600_context_dirty_block(ctx, block, dirty, id); } -void r600_context_dirty_block(struct r600_context *ctx, struct r600_block *block, +void r600_context_dirty_block(struct r600_context *ctx, + struct r600_block *block, int dirty, int index) { - if (dirty && (index + 1) > block->nreg_dirty) + if ((index + 1) > block->nreg_dirty) block->nreg_dirty = index + 1; if ((dirty != (block->status & R600_BLOCK_STATUS_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) { - - block->status |= R600_BLOCK_STATUS_ENABLED; block->status |= R600_BLOCK_STATUS_DIRTY; ctx->pm4_dirty_cdwords += block->pm4_ndwords + block->pm4_flush_ndwords; + if (!(block->status & R600_BLOCK_STATUS_ENABLED)) { + block->status |= R600_BLOCK_STATUS_ENABLED; + LIST_ADDTAIL(&block->enable_list, &ctx->enable_list); + } LIST_ADDTAIL(&block->list,&ctx->dirty); + + if (block->flags & REG_FLAG_FLUSH_CHANGE) { + r600_context_ps_partial_flush(ctx); + } } } void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state) { - struct r600_range *range; struct r600_block *block; unsigned new_val; int dirty; for (int i = 0; i < state->nregs; i++) { unsigned id, reloc_id; + struct r600_pipe_reg *reg = &state->regs[i]; - range = &ctx->range[CTX_RANGE_ID(ctx, state->regs[i].offset)]; - block = range->blocks[CTX_BLOCK_ID(ctx, state->regs[i].offset)]; - id = (state->regs[i].offset - block->start_offset) >> 2; + block = reg->block; + id = reg->id; dirty = block->status & R600_BLOCK_STATUS_DIRTY; new_val = block->reg[id]; - new_val &= ~state->regs[i].mask; - new_val |= state->regs[i].value; + new_val &= ~reg->mask; + new_val |= reg->value; if (new_val != block->reg[id]) { block->reg[id] = new_val; dirty |= R600_BLOCK_STATUS_DIRTY; @@ -929,105 +1066,120 @@ void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_stat if (block->pm4_bo_index[id]) { /* find relocation */ reloc_id = block->pm4_bo_index[id]; - r600_bo_reference(ctx->radeon, &block->reloc[reloc_id].bo, state->regs[i].bo); - state->regs[i].bo->fence = ctx->radeon->fence; + r600_bo_reference(&block->reloc[reloc_id].bo, reg->bo); + block->reloc[reloc_id].bo_usage = reg->bo_usage; /* always force dirty for relocs for now */ dirty |= R600_BLOCK_STATUS_DIRTY; } - r600_context_dirty_block(ctx, block, dirty, id); + if (dirty) + r600_context_dirty_block(ctx, block, dirty, id); } } -void r600_context_pipe_state_set_resource(struct r600_context *ctx, struct r600_pipe_state *state, unsigned offset) +static void r600_context_dirty_resource_block(struct r600_context *ctx, + struct r600_block *block, + int dirty, int index) +{ + block->nreg_dirty = index + 1; + + if ((dirty != (block->status & R600_BLOCK_STATUS_RESOURCE_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) { + block->status |= R600_BLOCK_STATUS_RESOURCE_DIRTY; + 
ctx->pm4_dirty_cdwords += block->pm4_ndwords + block->pm4_flush_ndwords; + if (!(block->status & R600_BLOCK_STATUS_ENABLED)) { + block->status |= R600_BLOCK_STATUS_ENABLED; + LIST_ADDTAIL(&block->enable_list, &ctx->enable_list); + } + LIST_ADDTAIL(&block->list,&ctx->resource_dirty); + } +} + +void r600_context_pipe_state_set_resource(struct r600_context *ctx, struct r600_pipe_resource_state *state, struct r600_block *block) { - struct r600_range *range; - struct r600_block *block; - int i; int dirty; int num_regs = ctx->radeon->chip_class >= EVERGREEN ? 8 : 7; + boolean is_vertex; - range = &ctx->range[CTX_RANGE_ID(ctx, offset)]; - block = range->blocks[CTX_BLOCK_ID(ctx, offset)]; if (state == NULL) { - block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); - r600_bo_reference(ctx->radeon, &block->reloc[1].bo, NULL); - r600_bo_reference(ctx->radeon , &block->reloc[2].bo, NULL); + block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_RESOURCE_DIRTY); + if (block->reloc[1].bo) + block->reloc[1].bo->binding &= ~BO_BOUND_TEXTURE; + + r600_bo_reference(&block->reloc[1].bo, NULL); + r600_bo_reference(&block->reloc[2].bo, NULL); LIST_DELINIT(&block->list); + LIST_DELINIT(&block->enable_list); return; } - dirty = block->status & R600_BLOCK_STATUS_DIRTY; + is_vertex = ((state->val[num_regs-1] & 0xc0000000) == 0xc0000000); + dirty = block->status & R600_BLOCK_STATUS_RESOURCE_DIRTY; - for (i = 0; i < num_regs; i++) { - if (block->reg[i] != state->regs[i].value) { - dirty |= R600_BLOCK_STATUS_DIRTY; - block->reg[i] = state->regs[i].value; - } + if (memcmp(block->reg, state->val, num_regs*4)) { + memcpy(block->reg, state->val, num_regs * 4); + dirty |= R600_BLOCK_STATUS_RESOURCE_DIRTY; } /* if no BOs on block, force dirty */ if (!block->reloc[1].bo || !block->reloc[2].bo) - dirty |= R600_BLOCK_STATUS_DIRTY; + dirty |= R600_BLOCK_STATUS_RESOURCE_DIRTY; if (!dirty) { - if (state->regs[0].bo) { - if ((block->reloc[1].bo->bo->handle != state->regs[0].bo->bo->handle) || - (block->reloc[2].bo->bo->handle != state->regs[0].bo->bo->handle)) - dirty |= R600_BLOCK_STATUS_DIRTY; + if (is_vertex) { + if (block->reloc[1].bo->buf != state->bo[0]->buf) + dirty |= R600_BLOCK_STATUS_RESOURCE_DIRTY; } else { - if ((block->reloc[1].bo->bo->handle != state->regs[2].bo->bo->handle) || - (block->reloc[2].bo->bo->handle != state->regs[3].bo->bo->handle)) - dirty |= R600_BLOCK_STATUS_DIRTY; + if ((block->reloc[1].bo->buf != state->bo[0]->buf) || + (block->reloc[2].bo->buf != state->bo[1]->buf)) + dirty |= R600_BLOCK_STATUS_RESOURCE_DIRTY; } } - if (!dirty) { - if (state->regs[0].bo) - state->regs[0].bo->fence = ctx->radeon->fence; - else { - state->regs[2].bo->fence = ctx->radeon->fence; - state->regs[3].bo->fence = ctx->radeon->fence; - } - } else { - r600_bo_reference(ctx->radeon, &block->reloc[1].bo, NULL); - r600_bo_reference(ctx->radeon, &block->reloc[2].bo, NULL); - if (state->regs[0].bo) { + + if (dirty) { + if (is_vertex) { /* VERTEX RESOURCE, we preted there is 2 bo to relocate so * we have single case btw VERTEX & TEXTURE resource */ - r600_bo_reference(ctx->radeon, &block->reloc[1].bo, state->regs[0].bo); - r600_bo_reference(ctx->radeon, &block->reloc[2].bo, state->regs[0].bo); - state->regs[0].bo->fence = ctx->radeon->fence; + r600_bo_reference(&block->reloc[1].bo, state->bo[0]); + block->reloc[1].bo_usage = state->bo_usage[0]; + r600_bo_reference(&block->reloc[2].bo, NULL); } else { /* TEXTURE RESOURCE */ - r600_bo_reference(ctx->radeon, &block->reloc[1].bo, state->regs[2].bo); - 
r600_bo_reference(ctx->radeon, &block->reloc[2].bo, state->regs[3].bo); - state->regs[2].bo->fence = ctx->radeon->fence; - state->regs[3].bo->fence = ctx->radeon->fence; + r600_bo_reference(&block->reloc[1].bo, state->bo[0]); + block->reloc[1].bo_usage = state->bo_usage[0]; + r600_bo_reference(&block->reloc[2].bo, state->bo[1]); + block->reloc[2].bo_usage = state->bo_usage[1]; + state->bo[0]->binding |= BO_BOUND_TEXTURE; } + + if (is_vertex) + block->status |= R600_BLOCK_STATUS_RESOURCE_VERTEX; + else + block->status &= ~R600_BLOCK_STATUS_RESOURCE_VERTEX; + + r600_context_dirty_resource_block(ctx, block, dirty, num_regs - 1); } - r600_context_dirty_block(ctx, block, dirty, num_regs - 1); } -void r600_context_pipe_state_set_ps_resource(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid) +void r600_context_pipe_state_set_ps_resource(struct r600_context *ctx, struct r600_pipe_resource_state *state, unsigned rid) { - unsigned offset = R_038000_SQ_TEX_RESOURCE_WORD0_0 + 0x1C * rid; + struct r600_block *block = ctx->ps_resources.blocks[rid]; - r600_context_pipe_state_set_resource(ctx, state, offset); + r600_context_pipe_state_set_resource(ctx, state, block); } -void r600_context_pipe_state_set_vs_resource(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid) +void r600_context_pipe_state_set_vs_resource(struct r600_context *ctx, struct r600_pipe_resource_state *state, unsigned rid) { - unsigned offset = R_038000_SQ_TEX_RESOURCE_WORD0_0 + 0x1180 + 0x1C * rid; + struct r600_block *block = ctx->vs_resources.blocks[rid]; - r600_context_pipe_state_set_resource(ctx, state, offset); + r600_context_pipe_state_set_resource(ctx, state, block); } -void r600_context_pipe_state_set_fs_resource(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid) +void r600_context_pipe_state_set_fs_resource(struct r600_context *ctx, struct r600_pipe_resource_state *state, unsigned rid) { - unsigned offset = R_038000_SQ_TEX_RESOURCE_WORD0_0 + 0x2300 + 0x1C * rid; + struct r600_block *block = ctx->fs_resources.blocks[rid]; - r600_context_pipe_state_set_resource(ctx, state, offset); + r600_context_pipe_state_set_resource(ctx, state, block); } static inline void r600_context_pipe_state_set_sampler(struct r600_context *ctx, struct r600_pipe_state *state, unsigned offset) @@ -1037,11 +1189,12 @@ static inline void r600_context_pipe_state_set_sampler(struct r600_context *ctx, int i; int dirty; - range = &ctx->range[CTX_RANGE_ID(ctx, offset)]; - block = range->blocks[CTX_BLOCK_ID(ctx, offset)]; + range = &ctx->range[CTX_RANGE_ID(offset)]; + block = range->blocks[CTX_BLOCK_ID(offset)]; if (state == NULL) { block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); LIST_DELINIT(&block->list); + LIST_DELINIT(&block->enable_list); return; } dirty = block->status & R600_BLOCK_STATUS_DIRTY; @@ -1052,19 +1205,10 @@ static inline void r600_context_pipe_state_set_sampler(struct r600_context *ctx, } } - r600_context_dirty_block(ctx, block, dirty, 2); + if (dirty) + r600_context_dirty_block(ctx, block, dirty, 2); } -static inline void r600_context_ps_partial_flush(struct r600_context *ctx) -{ - if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING)) - return; - - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0); - ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4); - - ctx->flags &= ~R600_CONTEXT_DRAW_PENDING; -} static inline void r600_context_pipe_state_set_sampler_border(struct r600_context *ctx, struct r600_pipe_state *state, unsigned 
offset) { @@ -1073,11 +1217,12 @@ static inline void r600_context_pipe_state_set_sampler_border(struct r600_contex int i; int dirty; - range = &ctx->range[CTX_RANGE_ID(ctx, offset)]; - block = range->blocks[CTX_BLOCK_ID(ctx, offset)]; + range = &ctx->range[CTX_RANGE_ID(offset)]; + block = range->blocks[CTX_BLOCK_ID(offset)]; if (state == NULL) { block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); LIST_DELINIT(&block->list); + LIST_DELINIT(&block->enable_list); return; } if (state->nregs <= 3) { @@ -1096,8 +1241,8 @@ static inline void r600_context_pipe_state_set_sampler_border(struct r600_contex * will end up using the new border color. */ if (dirty & R600_BLOCK_STATUS_DIRTY) r600_context_ps_partial_flush(ctx); - - r600_context_dirty_block(ctx, block, dirty, 3); + if (dirty) + r600_context_dirty_block(ctx, block, dirty, 3); } void r600_context_pipe_state_set_ps_sampler(struct r600_context *ctx, struct r600_pipe_state *state, unsigned id) @@ -1126,8 +1271,8 @@ struct r600_bo *r600_context_reg_bo(struct r600_context *ctx, unsigned offset) struct r600_block *block; unsigned id; - range = &ctx->range[CTX_RANGE_ID(ctx, offset)]; - block = range->blocks[CTX_BLOCK_ID(ctx, offset)]; + range = &ctx->range[CTX_RANGE_ID(offset)]; + block = range->blocks[CTX_BLOCK_ID(offset)]; offset -= block->start_offset; id = block->pm4_bo_index[offset >> 2]; if (block->reloc[id].bo) { @@ -1138,39 +1283,52 @@ struct r600_bo *r600_context_reg_bo(struct r600_context *ctx, unsigned offset) void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block) { - int id; + int optional = block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS); + int cp_dwords = block->pm4_ndwords, start_dword = 0; + int new_dwords = 0; + int nbo = block->nbo; - if (block->nreg_dirty == 0 && block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS)) { + if (block->nreg_dirty == 0 && optional) { goto out; } - ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH; - for (int j = 0; j < block->nreg; j++) { - if (block->pm4_bo_index[j]) { - /* find relocation */ - id = block->pm4_bo_index[j]; - r600_context_bo_reloc(ctx, - &block->pm4[block->reloc[id].bo_pm4_index], - block->reloc[id].bo); - r600_context_bo_flush(ctx, - block->reloc[id].flush_flags, - block->reloc[id].flush_mask, - block->reloc[id].bo); + if (nbo) { + ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH; + + for (int j = 0; j < block->nreg; j++) { + if (block->pm4_bo_index[j]) { + /* find relocation */ + struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]]; + block->pm4[reloc->bo_pm4_index] = + r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage); + r600_context_bo_flush(ctx, + reloc->flush_flags, + reloc->flush_mask, + reloc->bo); + nbo--; + if (nbo == 0) + break; + } } + ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH; } - ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH; - memcpy(&ctx->pm4[ctx->pm4_cdwords], block->pm4, block->pm4_ndwords * 4); - ctx->pm4_cdwords += block->pm4_ndwords; - - if (block->nreg_dirty != block->nreg && block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS)) { - int new_dwords = block->nreg_dirty; - uint32_t oldword, newword; - ctx->pm4_cdwords -= block->pm4_ndwords; - newword = oldword = ctx->pm4[ctx->pm4_cdwords]; + + optional &= (block->nreg_dirty != block->nreg); + if (optional) { + new_dwords = block->nreg_dirty; + start_dword = ctx->pm4_cdwords; + cp_dwords = new_dwords + 2; + } + memcpy(&ctx->pm4[ctx->pm4_cdwords], block->pm4, cp_dwords * 4); + ctx->pm4_cdwords += cp_dwords; + + if (optional) { 
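+		/* Only the first nreg_dirty registers were copied above, but
+		 * the prebuilt PKT3 header still encodes the block's full
+		 * register count; patch the count field in place so the CP
+		 * consumes just the dirty prefix. */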
+ uint32_t newword; + + newword = ctx->pm4[start_dword]; newword &= PKT_COUNT_C; newword |= PKT_COUNT_S(new_dwords); - ctx->pm4[ctx->pm4_cdwords] = newword; - ctx->pm4_cdwords += new_dwords + 2; + ctx->pm4[start_dword] = newword; } out: block->status ^= R600_BLOCK_STATUS_DIRTY; @@ -1178,6 +1336,40 @@ out: LIST_DELINIT(&block->list); } +void r600_context_block_resource_emit_dirty(struct r600_context *ctx, struct r600_block *block) +{ + int cp_dwords = block->pm4_ndwords; + int nbo = block->nbo; + + ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH; + + if (block->status & R600_BLOCK_STATUS_RESOURCE_VERTEX) { + nbo = 1; + cp_dwords -= 2; /* don't copy the second NOP */ + } + + for (int j = 0; j < nbo; j++) { + if (block->pm4_bo_index[j]) { + /* find relocation */ + struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]]; + block->pm4[reloc->bo_pm4_index] = + r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage); + r600_context_bo_flush(ctx, + reloc->flush_flags, + reloc->flush_mask, + reloc->bo); + } + } + ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH; + + memcpy(&ctx->pm4[ctx->pm4_cdwords], block->pm4, cp_dwords * 4); + ctx->pm4_cdwords += cp_dwords; + + block->status ^= R600_BLOCK_STATUS_RESOURCE_DIRTY; + block->nreg_dirty = 0; + LIST_DELINIT(&block->list); +} + void r600_context_flush_dest_caches(struct r600_context *ctx) { struct r600_bo *cb[8]; @@ -1209,7 +1401,7 @@ void r600_context_flush_dest_caches(struct r600_context *ctx) 0, cb[i]); } if (db) { - r600_context_bo_flush(ctx, S_0085F0_DB_ACTION_ENA(1), 0, db); + r600_context_bo_flush(ctx, S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1), 0, db); } ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH; ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY; @@ -1220,13 +1412,10 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw) unsigned ndwords = 7; struct r600_block *dirty_block = NULL; struct r600_block *next_block; + uint32_t *pm4; if (draw->indices) { ndwords = 11; - /* make sure there is enough relocation space before scheduling draw */ - if (ctx->creloc >= (ctx->nreloc - 1)) { - r600_context_flush(ctx); - } } /* queries need some special values */ @@ -1245,11 +1434,11 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw) /* update the max dword count to make sure we have enough space * reserved for flushing the destination caches */ - ctx->pm4_ndwords = RADEON_CTX_MAX_PM4 - ctx->num_dest_buffers * 7 - 16; + ctx->pm4_ndwords = RADEON_MAX_CMDBUF_DWORDS - ctx->num_dest_buffers * 7 - 16; if ((ctx->pm4_dirty_cdwords + ndwords + ctx->pm4_cdwords) > ctx->pm4_ndwords) { /* need to flush */ - r600_context_flush(ctx); + r600_context_flush(ctx, RADEON_FLUSH_ASYNC); } /* at that point everythings is flushed and ctx->pm4_cdwords = 0 */ if ((ctx->pm4_dirty_cdwords + ndwords) > ctx->pm4_ndwords) { @@ -1261,25 +1450,31 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw) r600_context_block_emit_dirty(ctx, dirty_block); } + LIST_FOR_EACH_ENTRY_SAFE(dirty_block, next_block, &ctx->resource_dirty, list) { + r600_context_block_resource_emit_dirty(ctx, dirty_block); + } + /* draw packet */ - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_INDEX_TYPE, 0, ctx->predicate_drawing); - ctx->pm4[ctx->pm4_cdwords++] = draw->vgt_index_type; - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NUM_INSTANCES, 0, ctx->predicate_drawing); - ctx->pm4[ctx->pm4_cdwords++] = draw->vgt_num_instances; + pm4 = &ctx->pm4[ctx->pm4_cdwords]; + + pm4[0] = PKT3(PKT3_INDEX_TYPE, 0, ctx->predicate_drawing); 
+ pm4[1] = draw->vgt_index_type; + pm4[2] = PKT3(PKT3_NUM_INSTANCES, 0, ctx->predicate_drawing); + pm4[3] = draw->vgt_num_instances; if (draw->indices) { - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_DRAW_INDEX, 3, ctx->predicate_drawing); - ctx->pm4[ctx->pm4_cdwords++] = draw->indices_bo_offset + r600_bo_offset(draw->indices); - ctx->pm4[ctx->pm4_cdwords++] = 0; - ctx->pm4[ctx->pm4_cdwords++] = draw->vgt_num_indices; - ctx->pm4[ctx->pm4_cdwords++] = draw->vgt_draw_initiator; - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, ctx->predicate_drawing); - ctx->pm4[ctx->pm4_cdwords++] = 0; - r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], draw->indices); + pm4[4] = PKT3(PKT3_DRAW_INDEX, 3, ctx->predicate_drawing); + pm4[5] = draw->indices_bo_offset; + pm4[6] = 0; + pm4[7] = draw->vgt_num_indices; + pm4[8] = draw->vgt_draw_initiator; + pm4[9] = PKT3(PKT3_NOP, 0, ctx->predicate_drawing); + pm4[10] = r600_context_bo_reloc(ctx, draw->indices, RADEON_USAGE_READ); } else { - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, ctx->predicate_drawing); - ctx->pm4[ctx->pm4_cdwords++] = draw->vgt_num_indices; - ctx->pm4[ctx->pm4_cdwords++] = draw->vgt_draw_initiator; + pm4[4] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, ctx->predicate_drawing); + pm4[5] = draw->vgt_num_indices; + pm4[6] = draw->vgt_draw_initiator; } + ctx->pm4_cdwords += ndwords; ctx->flags |= (R600_CONTEXT_DST_CACHES_DIRTY | R600_CONTEXT_DRAW_PENDING); @@ -1287,21 +1482,17 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw) ctx->pm4_dirty_cdwords = 0; } -void r600_context_flush(struct r600_context *ctx) +void r600_context_flush(struct r600_context *ctx, unsigned flags) { - struct drm_radeon_cs drmib = {}; - struct drm_radeon_cs_chunk chunks[2]; - uint64_t chunk_array[2]; - unsigned fence; - int r; + struct r600_block *enable_block = NULL; - if (!ctx->pm4_cdwords) + if (ctx->pm4_cdwords == ctx->init_dwords) return; /* suspend queries */ r600_context_queries_suspend(ctx); - if (ctx->radeon->family >= CHIP_CEDAR) + if (ctx->radeon->chip_class >= EVERGREEN) evergreen_context_flush_dest_caches(ctx); else r600_context_flush_dest_caches(ctx); @@ -1309,71 +1500,48 @@ void r600_context_flush(struct r600_context *ctx) /* partial flush is needed to avoid lockups on some chips with user fences */ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0); ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4); - /* emit fence */ - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0); - ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5); - ctx->pm4[ctx->pm4_cdwords++] = 0; - ctx->pm4[ctx->pm4_cdwords++] = (1 << 29) | (0 << 24); - ctx->pm4[ctx->pm4_cdwords++] = ctx->radeon->fence; - ctx->pm4[ctx->pm4_cdwords++] = 0; - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); - ctx->pm4[ctx->pm4_cdwords++] = 0; - r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], ctx->radeon->fence_bo); - -#if 1 - /* emit cs */ - drmib.num_chunks = 2; - drmib.chunks = (uint64_t)(uintptr_t)chunk_array; - chunks[0].chunk_id = RADEON_CHUNK_ID_IB; - chunks[0].length_dw = ctx->pm4_cdwords; - chunks[0].chunk_data = (uint64_t)(uintptr_t)ctx->pm4; - chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS; - chunks[1].length_dw = ctx->creloc * sizeof(struct r600_reloc) / 4; - chunks[1].chunk_data = (uint64_t)(uintptr_t)ctx->reloc; - chunk_array[0] = (uint64_t)(uintptr_t)&chunks[0]; - chunk_array[1] = (uint64_t)(uintptr_t)&chunks[1]; - r = 
drmCommandWriteRead(ctx->radeon->fd, DRM_RADEON_CS, &drmib, - sizeof(struct drm_radeon_cs)); -#else - *ctx->radeon->cfence = ctx->radeon->fence; -#endif - - r600_context_update_fenced_list(ctx); - - fence = ctx->radeon->fence + 1; - if (fence < ctx->radeon->fence) { - /* wrap around */ - fence = 1; - r600_context_fence_wraparound(ctx, fence); - } - ctx->radeon->fence = fence; + + /* Flush the CS. */ + ctx->cs->cdw = ctx->pm4_cdwords; + ctx->radeon->ws->cs_flush(ctx->cs, flags); + + /* We need to get the pointer to the other CS, + * the command streams are double-buffered. */ + ctx->pm4 = ctx->cs->buf; /* restart */ for (int i = 0; i < ctx->creloc; i++) { - ctx->bo[i]->reloc = NULL; ctx->bo[i]->last_flush = 0; - radeon_bo_reference(ctx->radeon, &ctx->bo[i], NULL); + r600_bo_reference(&ctx->bo[i], NULL); } ctx->creloc = 0; ctx->pm4_dirty_cdwords = 0; ctx->pm4_cdwords = 0; ctx->flags = 0; + r600_init_cs(ctx); + /* resume queries */ - r600_context_queries_resume(ctx); + r600_context_queries_resume(ctx, TRUE); /* set all valid group as dirty so they get reemited on * next draw command */ - for (int i = 0; i < ctx->nblocks; i++) { - if (ctx->blocks[i]->status & R600_BLOCK_STATUS_ENABLED) { - if(!(ctx->blocks[i]->status & R600_BLOCK_STATUS_DIRTY)) { - LIST_ADDTAIL(&ctx->blocks[i]->list,&ctx->dirty); + LIST_FOR_EACH_ENTRY(enable_block, &ctx->enable_list, enable_list) { + if (!(enable_block->flags & BLOCK_FLAG_RESOURCE)) { + if(!(enable_block->status & R600_BLOCK_STATUS_DIRTY)) { + LIST_ADDTAIL(&enable_block->list,&ctx->dirty); + enable_block->status |= R600_BLOCK_STATUS_DIRTY; + } + } else { + if(!(enable_block->status & R600_BLOCK_STATUS_RESOURCE_DIRTY)) { + LIST_ADDTAIL(&enable_block->list,&ctx->resource_dirty); + enable_block->status |= R600_BLOCK_STATUS_RESOURCE_DIRTY; } - ctx->pm4_dirty_cdwords += ctx->blocks[i]->pm4_ndwords + ctx->blocks[i]->pm4_flush_ndwords; - ctx->blocks[i]->status |= R600_BLOCK_STATUS_DIRTY; - ctx->blocks[i]->nreg_dirty = ctx->blocks[i]->nreg; } + ctx->pm4_dirty_cdwords += enable_block->pm4_ndwords + + enable_block->pm4_flush_ndwords; + enable_block->nreg_dirty = enable_block->nreg; } } @@ -1381,10 +1549,9 @@ void r600_context_emit_fence(struct r600_context *ctx, struct r600_bo *fence_bo, { unsigned ndwords = 10; - if (((ctx->pm4_dirty_cdwords + ndwords + ctx->pm4_cdwords) > ctx->pm4_ndwords) || - (ctx->creloc >= (ctx->nreloc - 1))) { + if ((ctx->pm4_dirty_cdwords + ndwords + ctx->pm4_cdwords) > ctx->pm4_ndwords) { /* need to flush */ - r600_context_flush(ctx); + r600_context_flush(ctx, RADEON_FLUSH_ASYNC); } ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0); @@ -1396,125 +1563,47 @@ void r600_context_emit_fence(struct r600_context *ctx, struct r600_bo *fence_bo, ctx->pm4[ctx->pm4_cdwords++] = value; /* DATA_LO */ ctx->pm4[ctx->pm4_cdwords++] = 0; /* DATA_HI */ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); - ctx->pm4[ctx->pm4_cdwords++] = 0; - r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], fence_bo); -} - -void r600_context_dump_bof(struct r600_context *ctx, const char *file) -{ - bof_t *bcs, *blob, *array, *bo, *size, *handle, *device_id, *root; - unsigned i; - - root = device_id = bcs = blob = array = bo = size = handle = NULL; - root = bof_object(); - if (root == NULL) - goto out_err; - device_id = bof_int32(ctx->radeon->device); - if (device_id == NULL) - goto out_err; - if (bof_object_set(root, "device_id", device_id)) - goto out_err; - bof_decref(device_id); - device_id = NULL; - /* dump relocs */ - blob = bof_blob(ctx->creloc * 16, 
ctx->reloc); - if (blob == NULL) - goto out_err; - if (bof_object_set(root, "reloc", blob)) - goto out_err; - bof_decref(blob); - blob = NULL; - /* dump cs */ - blob = bof_blob(ctx->pm4_cdwords * 4, ctx->pm4); - if (blob == NULL) - goto out_err; - if (bof_object_set(root, "pm4", blob)) - goto out_err; - bof_decref(blob); - blob = NULL; - /* dump bo */ - array = bof_array(); - if (array == NULL) - goto out_err; - for (i = 0; i < ctx->creloc; i++) { - struct radeon_bo *rbo = ctx->bo[i]; - bo = bof_object(); - if (bo == NULL) - goto out_err; - size = bof_int32(rbo->size); - if (size == NULL) - goto out_err; - if (bof_object_set(bo, "size", size)) - goto out_err; - bof_decref(size); - size = NULL; - handle = bof_int32(rbo->handle); - if (handle == NULL) - goto out_err; - if (bof_object_set(bo, "handle", handle)) - goto out_err; - bof_decref(handle); - handle = NULL; - radeon_bo_map(ctx->radeon, rbo); - blob = bof_blob(rbo->size, rbo->data); - radeon_bo_unmap(ctx->radeon, rbo); - if (blob == NULL) - goto out_err; - if (bof_object_set(bo, "data", blob)) - goto out_err; - bof_decref(blob); - blob = NULL; - if (bof_array_append(array, bo)) - goto out_err; - bof_decref(bo); - bo = NULL; - } - if (bof_object_set(root, "bo", array)) - goto out_err; - bof_dump_file(root, file); -out_err: - bof_decref(blob); - bof_decref(array); - bof_decref(bo); - bof_decref(size); - bof_decref(handle); - bof_decref(device_id); - bof_decref(root); + ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE); } static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait) { + unsigned results_base = query->results_start; u64 start, end; - u32 *results; - int i; - int size; + u32 *results, *current_result; if (wait) - results = r600_bo_map(ctx->radeon, query->buffer, PB_USAGE_CPU_READ, NULL); + results = r600_bo_map(ctx->radeon, query->buffer, ctx->cs, PIPE_TRANSFER_READ); else - results = r600_bo_map(ctx->radeon, query->buffer, PB_USAGE_DONTBLOCK | PB_USAGE_CPU_READ, NULL); + results = r600_bo_map(ctx->radeon, query->buffer, ctx->cs, PIPE_TRANSFER_DONTBLOCK | PIPE_TRANSFER_READ); if (!results) return FALSE; - size = query->num_results * (query->type == PIPE_QUERY_OCCLUSION_COUNTER ? 
ctx->max_db : 1);
-	for (i = 0; i < size; i += 4) {
-		start = (u64)results[i] | (u64)results[i + 1] << 32;
-		end = (u64)results[i + 2] | (u64)results[i + 3] << 32;
+
+	/* count all results across all data blocks */
+	while (results_base != query->results_end) {
+		current_result = (u32*)((char*)results + results_base);
+
+		start = (u64)current_result[0] | (u64)current_result[1] << 32;
+		end = (u64)current_result[2] | (u64)current_result[3] << 32;
 		if (((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))
 		    || query->type == PIPE_QUERY_TIME_ELAPSED) {
 			query->result += end - start;
 		}
+
+		results_base += 4 * 4;
+		if (results_base >= query->buffer_size)
+			results_base = 0;
 	}
-	r600_bo_unmap(ctx->radeon, query->buffer);
-	query->num_results = 0;
+	query->results_start = query->results_end;
+	r600_bo_unmap(ctx->radeon, query->buffer);

 	return TRUE;
 }

 void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
 {
-	unsigned required_space;
-	int num_backends = r600_get_num_backends(ctx->radeon);
+	unsigned required_space, new_results_end;

 	/* query request needs 6/8 dwords for begin + 6/8 dwords for end */
 	if (query->type == PIPE_QUERY_TIME_ELAPSED)
@@ -1524,50 +1613,68 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query)

 	if ((required_space + ctx->pm4_cdwords) > ctx->pm4_ndwords) {
 		/* need to flush */
-		r600_context_flush(ctx);
+		r600_context_flush(ctx, RADEON_FLUSH_ASYNC);
+	}
+
+	if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) {
+		/* Count queries emitted without flushes, and flush if more than
+		 * half of buffer used, to avoid overwriting results which may be
+		 * still in use. */
+		if (query->state & R600_QUERY_STATE_FLUSHED) {
+			query->queries_emitted = 1;
+		} else {
+			if (++query->queries_emitted > query->buffer_size / query->result_size / 2)
+				r600_context_flush(ctx, RADEON_FLUSH_ASYNC);
+		}
 	}

-	/* if query buffer is full force a flush */
-	if (query->num_results*4 >= query->buffer_size - 16) {
-		r600_context_flush(ctx);
+	new_results_end = query->results_end + query->result_size;
+	if (new_results_end >= query->buffer_size)
+		new_results_end = 0;
+
+	/* collect current results if query buffer is full */
+	if (new_results_end == query->results_start) {
+		if (!(query->state & R600_QUERY_STATE_FLUSHED))
+			r600_context_flush(ctx, 0);
 		r600_query_result(ctx, query, TRUE);
 	}

-	if (query->type == PIPE_QUERY_OCCLUSION_COUNTER &&
-	    num_backends > 0 && num_backends < ctx->max_db) {
-		/* as per info on ZPASS the driver must set the unusued DB top bits */
+	if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) {
 		u32 *results;
 		int i;

-		results = r600_bo_map(ctx->radeon, query->buffer, PB_USAGE_DONTBLOCK | PB_USAGE_CPU_WRITE, NULL);
+		results = r600_bo_map(ctx->radeon, query->buffer, ctx->cs, PIPE_TRANSFER_WRITE);
 		if (results) {
-			memset(results + (query->num_results * 4), 0, ctx->max_db * 4 * 4);
-
-			for (i = num_backends; i < ctx->max_db; i++) {
-				results[(i * 4)+1] = 0x80000000;
-				results[(i * 4)+3] = 0x80000000;
+			results = (u32*)((char*)results + query->results_end);
+			memset(results, 0, query->result_size);
+
+			/* Set top bits for unused backends */
+			for (i = 0; i < ctx->max_db; i++) {
+				if (!(ctx->backend_mask & (1<<i))) {
+					results[(i * 4)+1] = 0x80000000;
+					results[(i * 4)+3] = 0x80000000;
+				}
 			}
+			r600_bo_unmap(ctx->radeon, query->buffer);
 		}
 	}
-
+
 	/* emit begin query */
 	if (query->type == PIPE_QUERY_TIME_ELAPSED) {
 		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
 		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
-		ctx->pm4[ctx->pm4_cdwords++] = query->num_results*4 + 
r600_bo_offset(query->buffer); + ctx->pm4[ctx->pm4_cdwords++] = query->results_end; ctx->pm4[ctx->pm4_cdwords++] = (3 << 29); ctx->pm4[ctx->pm4_cdwords++] = 0; ctx->pm4[ctx->pm4_cdwords++] = 0; } else { ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0); ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1); - ctx->pm4[ctx->pm4_cdwords++] = query->num_results*4 + r600_bo_offset(query->buffer); + ctx->pm4[ctx->pm4_cdwords++] = query->results_end; ctx->pm4[ctx->pm4_cdwords++] = 0; } ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); - ctx->pm4[ctx->pm4_cdwords++] = 0; - r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], query->buffer); + ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE); query->state |= R600_QUERY_STATE_STARTED; query->state ^= R600_QUERY_STATE_ENDED; @@ -1576,49 +1683,74 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query) void r600_query_end(struct r600_context *ctx, struct r600_query *query) { - /* emit begin query */ + /* emit end query */ if (query->type == PIPE_QUERY_TIME_ELAPSED) { ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0); ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5); - ctx->pm4[ctx->pm4_cdwords++] = query->num_results*4 + 8 + r600_bo_offset(query->buffer); + ctx->pm4[ctx->pm4_cdwords++] = query->results_end + 8; ctx->pm4[ctx->pm4_cdwords++] = (3 << 29); ctx->pm4[ctx->pm4_cdwords++] = 0; ctx->pm4[ctx->pm4_cdwords++] = 0; } else { ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0); ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1); - ctx->pm4[ctx->pm4_cdwords++] = query->num_results*4 + 8 + r600_bo_offset(query->buffer); + ctx->pm4[ctx->pm4_cdwords++] = query->results_end + 8; ctx->pm4[ctx->pm4_cdwords++] = 0; } ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); - ctx->pm4[ctx->pm4_cdwords++] = 0; - r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], query->buffer); + ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE); + + query->results_end += query->result_size; + if (query->results_end >= query->buffer_size) + query->results_end = 0; - query->num_results += 4 * (query->type == PIPE_QUERY_OCCLUSION_COUNTER ? ctx->max_db : 1); query->state ^= R600_QUERY_STATE_STARTED; query->state |= R600_QUERY_STATE_ENDED; + query->state &= ~R600_QUERY_STATE_FLUSHED; + ctx->num_query_running--; } void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation, int flag_wait) { - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_PREDICATION, 1, 0); - if (operation == PREDICATION_OP_CLEAR) { + if (ctx->pm4_cdwords + 3 > ctx->pm4_ndwords) + r600_context_flush(ctx, RADEON_FLUSH_ASYNC); + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_PREDICATION, 1, 0); ctx->pm4[ctx->pm4_cdwords++] = 0; ctx->pm4[ctx->pm4_cdwords++] = PRED_OP(PREDICATION_OP_CLEAR); } else { - int results_base = query->num_results - (4 * ctx->max_db); - - if (results_base < 0) - results_base = 0; - - ctx->pm4[ctx->pm4_cdwords++] = results_base*4 + r600_bo_offset(query->buffer); - ctx->pm4[ctx->pm4_cdwords++] = PRED_OP(operation) | (flag_wait ? 
PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW) | PREDICATION_DRAW_VISIBLE; - ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); - ctx->pm4[ctx->pm4_cdwords++] = 0; - r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], query->buffer); + unsigned results_base = query->results_start; + unsigned count; + u32 op; + + /* find count of the query data blocks */ + count = query->buffer_size + query->results_end - query->results_start; + if (count >= query->buffer_size) count-=query->buffer_size; + count /= query->result_size; + + if (ctx->pm4_cdwords + 5 * count > ctx->pm4_ndwords) + r600_context_flush(ctx, RADEON_FLUSH_ASYNC); + + op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE | + (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW); + + /* emit predicate packets for all data blocks */ + while (results_base != query->results_end) { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_PREDICATION, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = results_base; + ctx->pm4[ctx->pm4_cdwords++] = op; + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, query->buffer, + RADEON_USAGE_READ); + results_base += query->result_size; + if (results_base >= query->buffer_size) + results_base = 0; + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; + } } } @@ -1636,6 +1768,14 @@ struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query->type = query_type; query->buffer_size = 4096; + if (query_type == PIPE_QUERY_OCCLUSION_COUNTER) + query->result_size = 4 * 4 * ctx->max_db; + else + query->result_size = 4 * 4; + + /* adjust buffer size to simplify offsets wrapping math */ + query->buffer_size -= query->buffer_size % query->result_size; + /* As of GL4, query buffers are normally read by the CPU after * being written by the gpu, hence staging is probably a good * usage pattern. @@ -1654,7 +1794,7 @@ struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query) { - r600_bo_reference(ctx->radeon, &query->buffer, NULL); + r600_bo_reference(&query->buffer, NULL); LIST_DELINIT(&query->list); free(query); } @@ -1665,8 +1805,8 @@ boolean r600_context_query_result(struct r600_context *ctx, { uint64_t *result = (uint64_t*)vresult; - if (query->num_results) { - r600_context_flush(ctx); + if (!(query->state & R600_QUERY_STATE_FLUSHED)) { + r600_context_flush(ctx, 0); } if (!r600_query_result(ctx, query, wait)) return FALSE; @@ -1690,11 +1830,14 @@ void r600_context_queries_suspend(struct r600_context *ctx) } } -void r600_context_queries_resume(struct r600_context *ctx) +void r600_context_queries_resume(struct r600_context *ctx, boolean flushed) { struct r600_query *query; LIST_FOR_EACH_ENTRY(query, &ctx->query_list, list) { + if (flushed) + query->state |= R600_QUERY_STATE_FLUSHED; + if (query->state & R600_QUERY_STATE_SUSPENDED) { r600_query_begin(ctx, query); query->state ^= R600_QUERY_STATE_SUSPENDED;
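
Two standalone sketches follow, as review annotations of the arithmetic this patch introduces. Both are illustrative only: decode_backend_mask(), struct query_ring, and their helpers are names invented for the sketch rather than driver code; only the constants and expressions mirror the diff above.

The first mirrors the kernel-assisted path of r600_get_backend_mask(): backend_map packs, per tile pipe, the id of the backend that serves it (4-bit fields on Evergreen, 2-bit fields on R600/R700); the loop folds those ids into a bitmask, and the err: fallback sets the num_backends lowest bits instead.

#include <assert.h>

/* Sketch of the backend_map decode in r600_get_backend_mask(); not driver
 * code. Each field of backend_map holds the id of the backend serving one
 * tile pipe: 4 bits wide on Evergreen (mask 0x7), 2 bits wide on R600/R700
 * (mask 0x3). */
static unsigned decode_backend_mask(unsigned backend_map,
				    unsigned num_tile_pipes,
				    int is_evergreen)
{
	unsigned item_width = is_evergreen ? 4 : 2;
	unsigned item_mask = is_evergreen ? 0x7 : 0x3;
	unsigned mask = 0;

	while (num_tile_pipes--) {
		mask |= 1u << (backend_map & item_mask);
		backend_map >>= item_width;
	}
	return mask;
}

int main(void)
{
	/* R700-style map 0x44 = 2-bit fields 00,01,00,01: four pipes
	 * alternating between backends 0 and 1, hence mask 0x3. */
	assert(decode_backend_mask(0x44, 4, 0) == 0x3);

	/* The err: fallback - a mask with the num_backends lowest bits set. */
	unsigned num_backends = 2;
	assert(~0u >> (32 - num_backends) == 0x3);
	return 0;
}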
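The second mirrors the wrap-around bookkeeping that replaces num_results: r600_query_begin()/r600_query_end() advance results_end through a buffer trimmed to a multiple of result_size, r600_query_result() drains from results_start, and r600_query_predication() derives the number of pending result blocks from the two offsets.

#include <assert.h>

/* Sketch of the query result ring used by the new query code; the struct
 * and helpers are invented for illustration, the arithmetic mirrors the
 * patch. */
struct query_ring {
	unsigned buffer_size;	/* bytes, trimmed to a multiple of result_size */
	unsigned result_size;	/* bytes per begin/end result pair */
	unsigned results_start;	/* oldest result not yet read back */
	unsigned results_end;	/* where the next result will be written */
};

/* Offset the next result would take, as in r600_query_begin(). */
static unsigned ring_next_end(const struct query_ring *q)
{
	unsigned next = q->results_end + q->result_size;
	return next >= q->buffer_size ? 0 : next;
}

/* Number of pending result blocks, as in r600_query_predication(). */
static unsigned ring_count(const struct query_ring *q)
{
	unsigned count = q->buffer_size + q->results_end - q->results_start;
	if (count >= q->buffer_size)
		count -= q->buffer_size;
	return count / q->result_size;
}

int main(void)
{
	struct query_ring q = { 4096, 16, 4080, 0 }; /* end has wrapped to 0 */

	assert(ring_count(&q) == 1);	/* one block pending, at offset 4080 */
	assert(ring_next_end(&q) == 16);

	q.results_end = ring_next_end(&q);
	assert(ring_count(&q) == 2);	/* blocks at offsets 4080 and 0 */
	return 0;
}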