winsys/radeon: simplify buffer map/unmap functions
[mesa.git] src/gallium/drivers/radeonsi/r600_hw_context.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Jerome Glisse
25 */
26 #include "r600_hw_context_priv.h"
27 #include "radeonsi_pipe.h"
28 #include "sid.h"
29 #include "util/u_memory.h"
30 #include <errno.h>
31
32 #define GROUP_FORCE_NEW_BLOCK 0
33
34 /* Get backends mask */
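/* The backend (DB) mask is consumed by the occlusion-query code in
 * r600_query_begin(), which pre-fills the result slots of disabled
 * backends so they always read back as valid (zero) results. */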
35 void r600_get_backend_mask(struct r600_context *ctx)
36 {
37 struct radeon_winsys_cs *cs = ctx->cs;
38 struct r600_resource *buffer;
39 uint32_t *results;
40 unsigned num_backends = ctx->screen->info.r600_num_backends;
41 unsigned i, mask = 0;
42
43 /* if backend_map query is supported by the kernel */
44 if (ctx->screen->info.r600_backend_map_valid) {
45 unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
46 unsigned backend_map = ctx->screen->info.r600_backend_map;
47 unsigned item_width, item_mask;
48 
49 if (ctx->chip_class >= CAYMAN) {
50 item_width = 4; /* CAYMAN+ packs 4 bits per backend-map entry */
51 item_mask = 0x7;
52 } else {
/* radeonsi never drives pre-CAYMAN chips, but don't leave the
 * variables uninitialized if that ever changes. */
item_width = 2; /* R600-class layout: 2 bits per entry */
item_mask = 0x3;
}
53
54 while(num_tile_pipes--) {
55 i = backend_map & item_mask;
56 mask |= (1<<i);
57 backend_map >>= item_width;
58 }
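/* Example: with num_tile_pipes = 2 and backend_map = 0x10 the loop
 * extracts the fields 0x0 and 0x1, giving mask = 0x3. */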
59 if (mask != 0) {
60 ctx->backend_mask = mask;
61 return;
62 }
63 }
64
65 /* otherwise, use the fallback path for older kernels */
66
67 /* create buffer for event data */
68 buffer = (struct r600_resource*)
69 pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM,
70 PIPE_USAGE_STAGING, ctx->max_db*16);
71 if (!buffer)
72 goto err;
73
74 /* initialize buffer with zeroes */
75 results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
76 if (results) {
77 uint64_t va = 0;
78
79 memset(results, 0, ctx->max_db * 4 * 4);
80 ctx->ws->buffer_unmap(buffer->cs_buf);
81
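/* A single ZPASS_DONE event makes every enabled DB write its 64-bit
 * Z-pass counter (with the "valid" bit set in the high dword) into its
 * slot; slots of missing backends stay zero, which is what the readback
 * below keys on. */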
82 /* emit EVENT_WRITE for ZPASS_DONE */
83 va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
84 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
85 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
86 cs->buf[cs->cdw++] = va;
87 cs->buf[cs->cdw++] = va >> 32;
88
89 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
90 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);
91
92 /* analyze results */
93 results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
94 if (results) {
95 for(i = 0; i < ctx->max_db; i++) {
96 /* at least the highest bit will be set if the backend is used */
97 if (results[i*4 + 1])
98 mask |= (1<<i);
99 }
100 ctx->ws->buffer_unmap(buffer->cs_buf);
101 }
102 }
103
104 pipe_resource_reference((struct pipe_resource**)&buffer, NULL);
105
106 if (mask != 0) {
107 ctx->backend_mask = mask;
108 return;
109 }
110
111 err:
112 /* fall back to the old method - set the num_backends lowest bits to 1 */
113 ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
114 return;
115 }
116
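/* Emit a PS_PARTIAL_FLUSH so in-flight pixel work retires before a
 * register flagged REG_FLAG_FLUSH_CHANGE is rewritten (see
 * r600_context_dirty_block()).  Skipped when no draw is pending. */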
117 static inline void r600_context_ps_partial_flush(struct r600_context *ctx)
118 {
119 struct radeon_winsys_cs *cs = ctx->cs;
120
121 if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING))
122 return;
123
124 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
125 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
126
127 ctx->flags &= ~R600_CONTEXT_DRAW_PENDING;
128 }
129
130 void r600_init_cs(struct r600_context *ctx)
131 {
132 struct radeon_winsys_cs *cs = ctx->cs;
133
134 /* All ASICs require this one */
135 cs->buf[cs->cdw++] = PKT3(PKT3_CONTEXT_CONTROL, 1, 0);
136 cs->buf[cs->cdw++] = 0x80000000;
137 cs->buf[cs->cdw++] = 0x80000000;
138
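/* Remember how large this preamble is, so r600_context_flush() can
 * recognize a CS that contains nothing but the preamble and skip the
 * flush entirely. */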
139 ctx->init_dwords = cs->cdw;
140 }
141
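/* A block caches the PM4 packet for one run of consecutive registers:
 * a SET_*_REG header, the register values, and one NOP+reloc slot per
 * register that references a buffer.  Dirty blocks are later copied
 * verbatim (or truncated) into the CS by r600_context_block_emit_dirty(). */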
142 static void r600_init_block(struct r600_context *ctx,
143 struct r600_block *block,
144 const struct r600_reg *reg, int index, int nreg,
145 unsigned opcode, unsigned offset_base)
146 {
147 int i = index;
148 int j, n = nreg;
149
150 /* initialize block */
151 block->flags = 0;
152 block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */
153 block->start_offset = reg[i].offset;
154 block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0);
155 block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2;
156 block->reg = &block->pm4[block->pm4_ndwords];
157 block->pm4_ndwords += n;
158 block->nreg = n;
159 block->nreg_dirty = n;
160 LIST_INITHEAD(&block->list);
161 LIST_INITHEAD(&block->enable_list);
162
163 for (j = 0; j < n; j++) {
164 if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) {
165 block->flags |= REG_FLAG_DIRTY_ALWAYS;
166 }
167 if (reg[i+j].flags & REG_FLAG_ENABLE_ALWAYS) {
168 if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
169 block->status |= R600_BLOCK_STATUS_ENABLED;
170 LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
171 LIST_ADDTAIL(&block->list,&ctx->dirty);
172 }
173 }
174 if (reg[i+j].flags & REG_FLAG_FLUSH_CHANGE) {
175 block->flags |= REG_FLAG_FLUSH_CHANGE;
176 }
177
178 if (reg[i+j].flags & REG_FLAG_NEED_BO) {
179 block->nbo++;
180 assert(block->nbo < R600_BLOCK_MAX_BO);
181 block->pm4_bo_index[j] = block->nbo;
182 block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0);
183 block->pm4[block->pm4_ndwords++] = 0x00000000;
184 block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1;
185 }
186 }
187 /* check that we stay within the limit */
188 assert(block->pm4_ndwords < R600_BLOCK_MAX_REG);
189 }
190
191 int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg,
192 unsigned opcode, unsigned offset_base)
193 {
194 struct r600_block *block;
195 struct r600_range *range;
196 int offset;
197
198 for (unsigned i = 0, n = 0; i < nreg; i += n) {
199 /* skip the forced new-block marker */
200 if (reg[i].offset == GROUP_FORCE_NEW_BLOCK) {
201 n = 1;
202 continue;
203 }
204
205 /* registers that need relocation are in their own group */
206 /* find the number of consecutive registers */
207 n = 0;
208 offset = reg[i].offset;
209 while (reg[i + n].offset == offset) {
210 n++;
211 offset += 4;
212 if ((n + i) >= nreg)
213 break;
214 if (n >= (R600_BLOCK_MAX_REG - 2))
215 break;
216 }
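/* e.g. registers at offsets 0x100, 0x104 and 0x108 end up grouped in a
 * single block with n = 3 */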
217
218 /* allocate new block */
219 block = calloc(1, sizeof(struct r600_block));
220 if (block == NULL) {
221 return -ENOMEM;
222 }
223 ctx->nblocks++;
224 for (int j = 0; j < n; j++) {
225 range = &ctx->range[CTX_RANGE_ID(reg[i + j].offset)];
226 /* create block table if it doesn't exist */
227 if (!range->blocks)
228 range->blocks = calloc(1 << HASH_SHIFT, sizeof(void *));
229 if (!range->blocks)
230 return -1;
231
232 range->blocks[CTX_BLOCK_ID(reg[i + j].offset)] = block;
233 }
234
235 r600_init_block(ctx, block, reg, i, n, opcode, offset_base);
236
237 }
238 return 0;
239 }
240
241
242 /* tear down the block tables and destroy the command stream */
243 void r600_context_fini(struct r600_context *ctx)
244 {
245 struct r600_block *block;
246 struct r600_range *range;
247
248 for (int i = 0; i < NUM_RANGES; i++) {
249 if (!ctx->range[i].blocks)
250 continue;
251 for (int j = 0; j < (1 << HASH_SHIFT); j++) {
252 block = ctx->range[i].blocks[j];
253 if (block) {
254 for (int k = 0, offset = block->start_offset; k < block->nreg; k++, offset += 4) {
255 range = &ctx->range[CTX_RANGE_ID(offset)];
256 range->blocks[CTX_BLOCK_ID(offset)] = NULL;
257 }
258 for (int k = 1; k <= block->nbo; k++) {
259 pipe_resource_reference((struct pipe_resource**)&block->reloc[k].bo, NULL);
260 }
261 free(block);
262 }
263 }
264 free(ctx->range[i].blocks);
265 }
266 free(ctx->range);
267 free(ctx->blocks);
268 ctx->ws->cs_destroy(ctx->cs);
269 }
270
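/* Flatten the per-range hash tables into the linear ctx->blocks[] array,
 * skipping duplicates: a block that spans several registers is reachable
 * from every offset it covers. */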
271 int r600_setup_block_table(struct r600_context *ctx)
272 {
273 /* setup block table */
274 int c = 0;
275 ctx->blocks = calloc(ctx->nblocks, sizeof(void*));
276 if (!ctx->blocks)
277 return -ENOMEM;
278 for (int i = 0; i < NUM_RANGES; i++) {
279 if (!ctx->range[i].blocks)
280 continue;
281 for (int j = 0, add; j < (1 << HASH_SHIFT); j++) {
282 if (!ctx->range[i].blocks[j])
283 continue;
284
285 add = 1;
286 for (int k = 0; k < c; k++) {
287 if (ctx->blocks[k] == ctx->range[i].blocks[j]) {
288 add = 0;
289 break;
290 }
291 }
292 if (add) {
293 assert(c < ctx->nblocks);
294 ctx->blocks[c++] = ctx->range[i].blocks[j];
295 j += (ctx->range[i].blocks[j]->nreg) - 1;
296 }
297 }
298 }
299
300 return 0;
301 }
302
303 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
304 boolean count_draw_in)
305 {
306 struct r600_atom *state;
307
308 /* The number of dwords we already used in the CS so far. */
309 num_dw += ctx->cs->cdw;
310
311 if (count_draw_in) {
312 /* The number of dwords all the dirty states would take. */
313 LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
314 num_dw += state->num_dw;
315 }
316
317 num_dw += ctx->pm4_dirty_cdwords;
318
319 /* The upper-bound of how much a draw command would take. */
320 num_dw += R600_MAX_DRAW_CS_DWORDS;
321 }
322
323 /* Count in queries_suspend. */
324 num_dw += ctx->num_cs_dw_queries_suspend;
325
326 /* Count in streamout_end at the end of CS. */
327 num_dw += ctx->num_cs_dw_streamout_end;
328
329 /* Count in render_condition(NULL) at the end of CS. */
330 if (ctx->predicate_drawing) {
331 num_dw += 3;
332 }
333
334 /* Count in framebuffer cache flushes at the end of CS. */
335 num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */
336
337 /* Save 16 dwords for the fence mechanism. */
338 num_dw += 16;
339
340 /* Flush if there's not enough space. */
341 if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
342 radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
343 }
344 }
345
346 void r600_context_dirty_block(struct r600_context *ctx,
347 struct r600_block *block,
348 int dirty, int index)
349 {
350 if ((index + 1) > block->nreg_dirty)
351 block->nreg_dirty = index + 1;
352
353 if ((dirty != (block->status & R600_BLOCK_STATUS_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) {
354 block->status |= R600_BLOCK_STATUS_DIRTY;
355 ctx->pm4_dirty_cdwords += block->pm4_ndwords;
356 if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
357 block->status |= R600_BLOCK_STATUS_ENABLED;
358 LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
359 }
360 LIST_ADDTAIL(&block->list,&ctx->dirty);
361
362 if (block->flags & REG_FLAG_FLUSH_CHANGE) {
363 r600_context_ps_partial_flush(ctx);
364 }
365 }
366 }
367
368 void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state)
369 {
370 struct r600_block *block;
371 int dirty;
372 for (int i = 0; i < state->nregs; i++) {
373 unsigned id, reloc_id;
374 struct r600_pipe_reg *reg = &state->regs[i];
375
376 block = reg->block;
377 id = reg->id;
378
379 dirty = block->status & R600_BLOCK_STATUS_DIRTY;
380
381 if (reg->value != block->reg[id]) {
382 block->reg[id] = reg->value;
383 dirty |= R600_BLOCK_STATUS_DIRTY;
384 }
385 if (block->flags & REG_FLAG_DIRTY_ALWAYS)
386 dirty |= R600_BLOCK_STATUS_DIRTY;
387 if (block->pm4_bo_index[id]) {
388 /* find relocation */
389 reloc_id = block->pm4_bo_index[id];
390 pipe_resource_reference((struct pipe_resource**)&block->reloc[reloc_id].bo, &reg->bo->b.b);
391 block->reloc[reloc_id].bo_usage = reg->bo_usage;
392 /* always force dirty for relocs for now */
393 dirty |= R600_BLOCK_STATUS_DIRTY;
394 }
395
396 if (dirty)
397 r600_context_dirty_block(ctx, block, dirty, id);
398 }
399 }
400
401 struct r600_resource *r600_context_reg_bo(struct r600_context *ctx, unsigned offset)
402 {
403 struct r600_range *range;
404 struct r600_block *block;
405 unsigned id;
406
407 range = &ctx->range[CTX_RANGE_ID(offset)];
408 block = range->blocks[CTX_BLOCK_ID(offset)];
409 offset -= block->start_offset;
410 id = block->pm4_bo_index[offset >> 2];
411 if (block->reloc[id].bo) {
412 return block->reloc[id].bo;
413 }
414 return NULL;
415 }
416
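/* Copy a dirty block's PM4 packet into the CS.  When the block has no
 * relocations, is not flagged DIRTY_ALWAYS and only its first nreg_dirty
 * registers changed, the copy is shortened by patching the packet's
 * count field afterwards. */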
417 void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block)
418 {
419 struct radeon_winsys_cs *cs = ctx->cs;
420 int optional = block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS);
421 int cp_dwords = block->pm4_ndwords, start_dword = 0;
422 int new_dwords = 0;
423 int nbo = block->nbo;
424
425 if (block->nreg_dirty == 0 && optional) {
426 goto out;
427 }
428
429 if (nbo) {
430 ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH;
431
432 for (int j = 0; j < block->nreg; j++) {
433 if (block->pm4_bo_index[j]) {
434 /* find relocation */
435 struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]];
436 block->pm4[reloc->bo_pm4_index] =
437 r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
438 nbo--;
439 if (nbo == 0)
440 break;
441 }
442 }
443 ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH;
444 }
445
446 optional &= (block->nreg_dirty != block->nreg);
447 if (optional) {
448 new_dwords = block->nreg_dirty;
449 start_dword = cs->cdw;
450 cp_dwords = new_dwords + 2;
451 }
452 memcpy(&cs->buf[cs->cdw], block->pm4, cp_dwords * 4);
453 cs->cdw += cp_dwords;
454
455 if (optional) {
456 uint32_t newword;
457
458 newword = cs->buf[start_dword];
459 newword &= PKT_COUNT_C;
460 newword |= PKT_COUNT_S(new_dwords);
461 cs->buf[start_dword] = newword;
462 }
463 out:
464 block->status ^= R600_BLOCK_STATUS_DIRTY;
465 block->nreg_dirty = 0;
466 LIST_DELINIT(&block->list);
467 }
468
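/* The cache-invalidation helpers below only accumulate SURFACE_SYNC flush
 * bits in the surface-sync atom; the actual packet is emitted when the
 * atom is processed (or immediately in r600_flush_framebuffer() when
 * flush_now is set). */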
469 void r600_inval_shader_cache(struct r600_context *ctx)
470 {
471 ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
472 ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
473 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
474 }
475
476 void r600_inval_texture_cache(struct r600_context *ctx)
477 {
478 ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
479 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
480 }
481
482 void r600_inval_vertex_cache(struct r600_context *ctx)
483 {
484 /* Some GPUs don't have the vertex cache and must use the texture cache instead. */
485 ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
486 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
487 }
488
489 void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
490 {
491 if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
492 return;
493
494 ctx->atom_surface_sync.flush_flags |=
495 r600_get_cb_flush_flags(ctx) |
496 (ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);
497
498 if (flush_now) {
499 r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
500 } else {
501 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
502 }
503
504 ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
505 }
506
507 void r600_context_flush(struct r600_context *ctx, unsigned flags)
508 {
509 struct radeon_winsys_cs *cs = ctx->cs;
510 struct r600_block *enable_block = NULL;
511 bool queries_suspended = false;
512 bool streamout_suspended = false;
513
514 if (cs->cdw == ctx->init_dwords)
515 return;
516
517 /* suspend queries */
518 if (ctx->num_cs_dw_queries_suspend) {
519 r600_context_queries_suspend(ctx);
520 queries_suspended = true;
521 }
522
523 if (ctx->num_cs_dw_streamout_end) {
524 r600_context_streamout_end(ctx);
525 streamout_suspended = true;
526 }
527
528 r600_flush_framebuffer(ctx, true);
529
530 /* partial flush is needed to avoid lockups on some chips with user fences */
531 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
532 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
533
534 /* Flush the CS. */
535 ctx->ws->cs_flush(ctx->cs, flags);
536
537 ctx->pm4_dirty_cdwords = 0;
538 ctx->flags = 0;
539
540 r600_init_cs(ctx);
541
542 if (streamout_suspended) {
543 ctx->streamout_start = TRUE;
544 ctx->streamout_append_bitmask = ~0;
545 }
546
547 /* resume queries */
548 if (queries_suspended) {
549 r600_context_queries_resume(ctx);
550 }
551
552 /* set all valid groups as dirty so they get re-emitted on the
553 * next draw command
554 */
555 LIST_FOR_EACH_ENTRY(enable_block, &ctx->enable_list, enable_list) {
556 if(!(enable_block->status & R600_BLOCK_STATUS_DIRTY)) {
557 LIST_ADDTAIL(&enable_block->list,&ctx->dirty);
558 enable_block->status |= R600_BLOCK_STATUS_DIRTY;
559 }
560 ctx->pm4_dirty_cdwords += enable_block->pm4_ndwords;
561 enable_block->nreg_dirty = enable_block->nreg;
562 }
563 }
564
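/* Write a 32-bit fence value to fence_bo + offset*4 once all preceding
 * work has completed: a PS partial flush followed by an EVENT_WRITE_EOP
 * with DATA_SEL = 1 (write 32-bit data) and no interrupt. */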
565 void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence_bo, unsigned offset, unsigned value)
566 {
567 struct radeon_winsys_cs *cs = ctx->cs;
568 uint64_t va;
569
570 r600_need_cs_space(ctx, 10, FALSE);
571
572 va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
573 va = va + (offset << 2);
574
575 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
576 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
577 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
578 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
579 cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
580 /* DATA_SEL | INT_EN | ADDRESS_HI */
581 cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
582 cs->buf[cs->cdw++] = value; /* DATA_LO */
583 cs->buf[cs->cdw++] = 0; /* DATA_HI */
584 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
585 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
586 }
587
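/* Each result slot holds a (start, end) pair of 64-bit counters.  When
 * test_status_bit is set, bit 63 (the hardware's "result valid" flag)
 * must be set in both values before the difference is trusted. */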
588 static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
589 bool test_status_bit)
590 {
591 uint32_t *current_result = (uint32_t*)map;
592 uint64_t start, end;
593
594 start = (uint64_t)current_result[start_index] |
595 (uint64_t)current_result[start_index+1] << 32;
596 end = (uint64_t)current_result[end_index] |
597 (uint64_t)current_result[end_index+1] << 32;
598
599 if (!test_status_bit ||
600 ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
601 return end - start;
602 }
603 return 0;
604 }
605
606 static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
607 {
608 unsigned results_base = query->results_start;
609 char *map;
610
611 map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
612 PIPE_TRANSFER_READ |
613 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
614 if (!map)
615 return FALSE;
616
617 /* count all results across all data blocks */
618 switch (query->type) {
619 case PIPE_QUERY_OCCLUSION_COUNTER:
620 while (results_base != query->results_end) {
621 query->result.u64 +=
622 r600_query_read_result(map + results_base, 0, 2, true);
623 results_base = (results_base + 16) % query->buffer->b.b.width0;
624 }
625 break;
626 case PIPE_QUERY_OCCLUSION_PREDICATE:
627 while (results_base != query->results_end) {
628 query->result.b = query->result.b ||
629 r600_query_read_result(map + results_base, 0, 2, true) != 0;
630 results_base = (results_base + 16) % query->buffer->b.b.width0;
631 }
632 break;
633 case PIPE_QUERY_TIME_ELAPSED:
634 while (results_base != query->results_end) {
635 query->result.u64 +=
636 r600_query_read_result(map + results_base, 0, 2, false);
637 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
638 }
639 break;
640 case PIPE_QUERY_PRIMITIVES_EMITTED:
641 /* SAMPLE_STREAMOUTSTATS stores this structure:
642 * {
643 * u64 NumPrimitivesWritten;
644 * u64 PrimitiveStorageNeeded;
645 * }
646 * We only need NumPrimitivesWritten here. */
647 while (results_base != query->results_end) {
648 query->result.u64 +=
649 r600_query_read_result(map + results_base, 2, 6, true);
650 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
651 }
652 break;
653 case PIPE_QUERY_PRIMITIVES_GENERATED:
654 /* Here we read PrimitiveStorageNeeded. */
655 while (results_base != query->results_end) {
656 query->result.u64 +=
657 r600_query_read_result(map + results_base, 0, 4, true);
658 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
659 }
660 break;
661 case PIPE_QUERY_SO_STATISTICS:
662 while (results_base != query->results_end) {
663 query->result.so.num_primitives_written +=
664 r600_query_read_result(map + results_base, 2, 6, true);
665 query->result.so.primitives_storage_needed +=
666 r600_query_read_result(map + results_base, 0, 4, true);
667 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
668 }
669 break;
670 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
671 while (results_base != query->results_end) {
672 query->result.b = query->result.b ||
673 r600_query_read_result(map + results_base, 2, 6, true) !=
674 r600_query_read_result(map + results_base, 0, 4, true);
675 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
676 }
677 break;
678 default:
679 assert(0);
680 }
681
682 query->results_start = query->results_end;
683 ctx->ws->buffer_unmap(query->buffer->cs_buf);
684 return TRUE;
685 }
686
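/* The query buffer is used as a ring: the blocks written since the last
 * readback live in [results_start, results_end), and all offsets wrap at
 * width0, which was trimmed to a multiple of result_size at creation
 * time. */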
687 void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
688 {
689 struct radeon_winsys_cs *cs = ctx->cs;
690 unsigned new_results_end, i;
691 uint32_t *results;
692 uint64_t va;
693
694 r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);
695
696 new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
697
698 /* collect current results if query buffer is full */
699 if (new_results_end == query->results_start) {
700 r600_query_result(ctx, query, TRUE);
701 }
702
703 switch (query->type) {
704 case PIPE_QUERY_OCCLUSION_COUNTER:
705 case PIPE_QUERY_OCCLUSION_PREDICATE:
706 results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
707 if (results) {
708 results = (uint32_t*)((char*)results + query->results_end);
709 memset(results, 0, query->result_size);
710
711 /* Set top bits for unused backends */
712 for (i = 0; i < ctx->max_db; i++) {
713 if (!(ctx->backend_mask & (1<<i))) {
714 results[(i * 4)+1] = 0x80000000;
715 results[(i * 4)+3] = 0x80000000;
716 }
717 }
718 ctx->ws->buffer_unmap(query->buffer->cs_buf);
719 }
720 break;
721 case PIPE_QUERY_TIME_ELAPSED:
722 break;
723 case PIPE_QUERY_PRIMITIVES_EMITTED:
724 case PIPE_QUERY_PRIMITIVES_GENERATED:
725 case PIPE_QUERY_SO_STATISTICS:
726 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
727 results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
/* zero the slot we are about to use; tolerate a failed map like the
 * occlusion path above */
if (results) {
728 results = (uint32_t*)((char*)results + query->results_end);
729 memset(results, 0, query->result_size);
730 ctx->ws->buffer_unmap(query->buffer->cs_buf);
}
731 break;
732 default:
733 assert(0);
734 }
735
736 /* emit begin query */
737 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
738 va += query->results_end;
739
740 switch (query->type) {
741 case PIPE_QUERY_OCCLUSION_COUNTER:
742 case PIPE_QUERY_OCCLUSION_PREDICATE:
743 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
744 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
745 cs->buf[cs->cdw++] = va;
746 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
747 break;
748 case PIPE_QUERY_PRIMITIVES_EMITTED:
749 case PIPE_QUERY_PRIMITIVES_GENERATED:
750 case PIPE_QUERY_SO_STATISTICS:
751 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
752 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
753 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
754 cs->buf[cs->cdw++] = query->results_end;
755 cs->buf[cs->cdw++] = 0;
756 break;
757 case PIPE_QUERY_TIME_ELAPSED:
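/* DATA_SEL = 3 in the EOP packet below asks the CP to write the 64-bit
 * GPU timestamp instead of a fence value. */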
758 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
759 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
760 cs->buf[cs->cdw++] = va;
761 cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
762 cs->buf[cs->cdw++] = 0;
763 cs->buf[cs->cdw++] = 0;
764 break;
765 default:
766 assert(0);
767 }
768 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
769 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);
770
771 ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
772 }
773
774 void r600_query_end(struct r600_context *ctx, struct r600_query *query)
775 {
776 struct radeon_winsys_cs *cs = ctx->cs;
777 uint64_t va;
778
779 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
780 /* emit end query */
781 switch (query->type) {
782 case PIPE_QUERY_OCCLUSION_COUNTER:
783 case PIPE_QUERY_OCCLUSION_PREDICATE:
784 va += query->results_end + 8;
785 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
786 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
787 cs->buf[cs->cdw++] = va;
788 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
789 break;
790 case PIPE_QUERY_PRIMITIVES_EMITTED:
791 case PIPE_QUERY_PRIMITIVES_GENERATED:
792 case PIPE_QUERY_SO_STATISTICS:
793 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
794 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
795 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
796 cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
797 cs->buf[cs->cdw++] = 0;
798 break;
799 case PIPE_QUERY_TIME_ELAPSED:
800 va += query->results_end + query->result_size/2;
801 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
802 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
803 cs->buf[cs->cdw++] = va;
804 cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
805 cs->buf[cs->cdw++] = 0;
806 cs->buf[cs->cdw++] = 0;
807 break;
808 default:
809 assert(0);
810 }
811 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
812 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);
813
814 query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
815 ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
816 }
817
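/* Program conditional rendering from a query's result blocks: one
 * SET_PREDICATION packet per block, chained with the CONTINUE bit so the
 * hardware combines the outcome of all of them. */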
818 void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
819 int flag_wait)
820 {
821 struct radeon_winsys_cs *cs = ctx->cs;
822 uint64_t va;
823
824 if (operation == PREDICATION_OP_CLEAR) {
825 r600_need_cs_space(ctx, 3, FALSE);
826
827 cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
828 cs->buf[cs->cdw++] = 0;
829 cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
830 } else {
831 unsigned results_base = query->results_start;
832 unsigned count;
833 uint32_t op;
834
835 /* find the number of query data blocks */
836 count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
837 count /= query->result_size;
838
839 r600_need_cs_space(ctx, 5 * count, TRUE);
840
841 op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
842 (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
843 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
844
845 /* emit predicate packets for all data blocks */
846 while (results_base != query->results_end) {
847 cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
848 cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
849 cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
850 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
851 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
852 RADEON_USAGE_READ);
853 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
854
855 /* set CONTINUE bit for all packets except the first */
856 op |= PREDICATION_CONTINUE;
857 }
858 }
859 }
860
861 struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
862 {
863 struct r600_query *query;
864 unsigned buffer_size = 4096;
865
866 query = CALLOC_STRUCT(r600_query);
867 if (query == NULL)
868 return NULL;
869
870 query->type = query_type;
871
872 switch (query_type) {
873 case PIPE_QUERY_OCCLUSION_COUNTER:
874 case PIPE_QUERY_OCCLUSION_PREDICATE:
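/* A begin/end pair of 64-bit Z-pass counters for every DB. */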
875 query->result_size = 16 * ctx->max_db;
876 query->num_cs_dw = 6;
877 break;
878 case PIPE_QUERY_TIME_ELAPSED:
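/* Begin and end 64-bit timestamps. */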
879 query->result_size = 16;
880 query->num_cs_dw = 8;
881 break;
882 case PIPE_QUERY_PRIMITIVES_EMITTED:
883 case PIPE_QUERY_PRIMITIVES_GENERATED:
884 case PIPE_QUERY_SO_STATISTICS:
885 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
886 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
887 query->result_size = 32;
888 query->num_cs_dw = 6;
889 break;
890 default:
891 assert(0);
892 FREE(query);
893 return NULL;
894 }
895
896 /* adjust the buffer size to simplify the offset wrapping math */
897 buffer_size -= buffer_size % query->result_size;
898
899 /* Queries are normally read by the CPU after
900 * being written by the GPU, hence staging is probably a good
901 * usage pattern.
902 */
903 query->buffer = (struct r600_resource*)
904 pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, buffer_size);
905 if (!query->buffer) {
906 FREE(query);
907 return NULL;
908 }
909 return query;
910 }
911
912 void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
913 {
914 pipe_resource_reference((struct pipe_resource**)&query->buffer, NULL);
915 free(query);
916 }
917
918 boolean r600_context_query_result(struct r600_context *ctx,
919 struct r600_query *query,
920 boolean wait, void *vresult)
921 {
922 boolean *result_b = (boolean*)vresult;
923 uint64_t *result_u64 = (uint64_t*)vresult;
924 struct pipe_query_data_so_statistics *result_so =
925 (struct pipe_query_data_so_statistics*)vresult;
926
927 if (!r600_query_result(ctx, query, wait))
928 return FALSE;
929
930 switch (query->type) {
931 case PIPE_QUERY_OCCLUSION_COUNTER:
932 case PIPE_QUERY_PRIMITIVES_EMITTED:
933 case PIPE_QUERY_PRIMITIVES_GENERATED:
934 *result_u64 = query->result.u64;
935 break;
936 case PIPE_QUERY_OCCLUSION_PREDICATE:
937 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
938 *result_b = query->result.b;
939 break;
940 case PIPE_QUERY_TIME_ELAPSED:
941 *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
942 break;
943 case PIPE_QUERY_SO_STATISTICS:
944 *result_so = query->result.so;
945 break;
946 default:
947 assert(0);
948 }
949 return TRUE;
950 }
951
952 void r600_context_queries_suspend(struct r600_context *ctx)
953 {
954 struct r600_query *query;
955
956 LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
957 r600_query_end(ctx, query);
958 }
959 assert(ctx->num_cs_dw_queries_suspend == 0);
960 }
961
962 void r600_context_queries_resume(struct r600_context *ctx)
963 {
964 struct r600_query *query;
965
966 assert(ctx->num_cs_dw_queries_suspend == 0);
967
968 LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
969 r600_query_begin(ctx, query);
970 }
971 }
972
973 void r600_context_streamout_begin(struct r600_context *ctx)
974 {
975 struct radeon_winsys_cs *cs = ctx->cs;
976 struct r600_so_target **t = ctx->so_targets;
977 unsigned *strides = ctx->vs_shader_so_strides;
978 unsigned buffer_en, i;
979
980 buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
981 (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
982 (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
983 (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);
984
985 ctx->num_cs_dw_streamout_end =
986 12 + /* flush_vgt_streamout */
987 util_bitcount(buffer_en) * 8 +
988 3;
989
990 r600_need_cs_space(ctx,
991 12 + /* flush_vgt_streamout */
992 6 + /* enables */
993 util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
994 util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
995 ctx->num_cs_dw_streamout_end, TRUE);
996
997 if (ctx->chip_class >= CAYMAN) {
998 evergreen_flush_vgt_streamout(ctx);
999 evergreen_set_streamout_enable(ctx, buffer_en);
1000 }
1001
1002 for (i = 0; i < ctx->num_so_targets; i++) {
1003 #if 0
1004 if (t[i]) {
1005 t[i]->stride = strides[i];
1006 t[i]->so_index = i;
1007
1008 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
1009 cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
1010 16*i - SI_CONTEXT_REG_OFFSET) >> 2;
1011 cs->buf[cs->cdw++] = (t[i]->b.buffer_offset +
1012 t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
1013 cs->buf[cs->cdw++] = strides[i] >> 2; /* VTX_STRIDE (in DW) */
1014 cs->buf[cs->cdw++] = 0; /* BUFFER_BASE */
1015
1016 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1017 cs->buf[cs->cdw++] =
1018 r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
1019 RADEON_USAGE_WRITE);
1020
1021 if (ctx->streamout_append_bitmask & (1 << i)) {
1022 /* Append. */
1023 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
1024 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
1025 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
1026 cs->buf[cs->cdw++] = 0; /* unused */
1027 cs->buf[cs->cdw++] = 0; /* unused */
1028 cs->buf[cs->cdw++] = 0; /* src address lo */
1029 cs->buf[cs->cdw++] = 0; /* src address hi */
1030
1031 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1032 cs->buf[cs->cdw++] =
1033 r600_context_bo_reloc(ctx, t[i]->filled_size,
1034 RADEON_USAGE_READ);
1035 } else {
1036 /* Start from the beginning. */
1037 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
1038 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
1039 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
1040 cs->buf[cs->cdw++] = 0; /* unused */
1041 cs->buf[cs->cdw++] = 0; /* unused */
1042 cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
1043 cs->buf[cs->cdw++] = 0; /* unused */
1044 }
1045 }
1046 #endif
1047 }
1048 }
1049
1050 void r600_context_streamout_end(struct r600_context *ctx)
1051 {
1052 struct radeon_winsys_cs *cs = ctx->cs;
1053 struct r600_so_target **t = ctx->so_targets;
1054 unsigned i, flush_flags = 0;
1055
1056 evergreen_flush_vgt_streamout(ctx);
1057
1058 for (i = 0; i < ctx->num_so_targets; i++) {
1059 #if 0
1060 if (t[i]) {
1061 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
1062 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
1063 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
1064 STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
1065 cs->buf[cs->cdw++] = 0; /* dst address lo */
1066 cs->buf[cs->cdw++] = 0; /* dst address hi */
1067 cs->buf[cs->cdw++] = 0; /* unused */
1068 cs->buf[cs->cdw++] = 0; /* unused */
1069
1070 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1071 cs->buf[cs->cdw++] =
1072 r600_context_bo_reloc(ctx, t[i]->filled_size,
1073 RADEON_USAGE_WRITE);
1074
1075 flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
1076 }
1077 #endif
1078 }
1079
1080 evergreen_set_streamout_enable(ctx, 0);
1081
1082 ctx->atom_surface_sync.flush_flags |= flush_flags;
1083 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
1084
1085 ctx->num_cs_dw_streamout_end = 0;
1086
1087 /* XXX print some debug info */
1088 for (i = 0; i < ctx->num_so_targets; i++) {
1089 if (!t[i])
1090 continue;
1091
1092 uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->cs_buf, ctx->cs, PIPE_TRANSFER_READ); /* buffer_map takes PIPE_TRANSFER_* flags */
1093 printf("FILLED_SIZE%i: %u\n", i, *ptr);
1094 ctx->ws->buffer_unmap(t[i]->filled_size->cs_buf);
1095 }
1096 }
1097
1098 void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
1099 {
1100 struct radeon_winsys_cs *cs = ctx->cs;
1101 r600_need_cs_space(ctx, 14 + 21, TRUE);
1102
1103 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
1104 cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
1105 cs->buf[cs->cdw++] = 0;
1106
1107 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
1108 cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
1109 cs->buf[cs->cdw++] = t->stride >> 2;
1110
1111 #if 0
1112 cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
1113 cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
1114 cs->buf[cs->cdw++] = 0; /* src address lo */
1115 cs->buf[cs->cdw++] = 0; /* src address hi */
1116 cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
1117 cs->buf[cs->cdw++] = 0; /* unused */
1118 #endif
1119
1120 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1121 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);
1122
1123 #if 0 /* I have not found this useful yet. */
1124 cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
1125 cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
1126 cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
1127 cs->buf[cs->cdw++] = 0; /* unused */
1128 cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
1129 cs->buf[cs->cdw++] = 0; /* unused */
1130
1131 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
1132 cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
1133 cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;
1134
1135 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
1136 cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
1137 cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;
1138
1139 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1140 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer,
1141 RADEON_USAGE_WRITE);
1142
1143 cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
1144 cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
1145 cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
1146 cs->buf[cs->cdw++] = 0;
1147 cs->buf[cs->cdw++] = 0; /* reference value */
1148 cs->buf[cs->cdw++] = 0xffffffff; /* mask */
1149 cs->buf[cs->cdw++] = 4; /* poll interval */
1150 #endif
1151 }