winsys/radeon: simplify buffer map/unmap functions
[mesa.git] src/gallium/drivers/radeonsi/r600_hw_context.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Jerome Glisse
25 */
26 #include "r600_hw_context_priv.h"
27 #include "radeonsi_pipe.h"
28 #include "sid.h"
29 #include "util/u_memory.h"
30 #include <errno.h>
31
32 #define GROUP_FORCE_NEW_BLOCK 0
33
34 /* Get backends mask */
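/* The backend (DB) mask is consumed by the occlusion-query code in
 * r600_query_begin(), which pre-fills the result slots of disabled
 * backends so they always read back as valid (zero) results. */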
35 void r600_get_backend_mask(struct r600_context *ctx)
36 {
37 struct radeon_winsys_cs *cs = ctx->cs;
38 struct r600_resource *buffer;
39 uint32_t *results;
40 unsigned num_backends = ctx->screen->info.r600_num_backends;
41 unsigned i, mask = 0;
42
43 /* if backend_map query is supported by the kernel */
44 if (ctx->screen->info.r600_backend_map_valid) {
45 unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
46 unsigned backend_map = ctx->screen->info.r600_backend_map;
47 unsigned item_width, item_mask;
48 
49 if (ctx->chip_class >= CAYMAN) {
50 item_width = 4; /* CAYMAN+ packs 4 bits per backend-map entry */
51 item_mask = 0x7;
52 } else {
/* radeonsi never drives pre-CAYMAN chips, but don't leave the
 * variables uninitialized if that ever changes. */
item_width = 2; /* R600-class layout: 2 bits per entry */
item_mask = 0x3;
}
53
54 while(num_tile_pipes--) {
55 i = backend_map & item_mask;
56 mask |= (1<<i);
57 backend_map >>= item_width;
58 }
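/* Example: with num_tile_pipes = 2 and backend_map = 0x10 the loop
 * extracts the fields 0x0 and 0x1, giving mask = 0x3. */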
59 if (mask != 0) {
60 ctx->backend_mask = mask;
61 return;
62 }
63 }
64
65 /* otherwise, use the fallback path for older kernels */
66
67 /* create buffer for event data */
68 buffer = (struct r600_resource*)
69 pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM,
70 PIPE_USAGE_STAGING, ctx->max_db*16);
71 if (!buffer)
72 goto err;
73
74 /* initialize buffer with zeroes */
75 results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
76 if (results) {
77 uint64_t va = 0;
78
79 memset(results, 0, ctx->max_db * 4 * 4);
80 ctx->ws->buffer_unmap(buffer->cs_buf);
81
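/* A single ZPASS_DONE event makes every enabled DB write its 64-bit
 * Z-pass counter (with the "valid" bit set in the high dword) into its
 * slot; slots of missing backends stay zero, which is what the readback
 * below keys on. */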
82 /* emit EVENT_WRITE for ZPASS_DONE */
83 va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
84 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
85 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
86 cs->buf[cs->cdw++] = va;
87 cs->buf[cs->cdw++] = va >> 32;
88
89 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
90 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);
91
92 /* analyze results */
93 results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
94 if (results) {
95 for(i = 0; i < ctx->max_db; i++) {
96 /* at least the highest bit will be set if the backend is used */
97 if (results[i*4 + 1])
98 mask |= (1<<i);
99 }
100 ctx->ws->buffer_unmap(buffer->cs_buf);
101 }
102 }
103
104 pipe_resource_reference((struct pipe_resource**)&buffer, NULL);
105
106 if (mask != 0) {
107 ctx->backend_mask = mask;
108 return;
109 }
110
111 err:
112 /* fall back to the old method - set the num_backends lowest bits to 1 */
113 ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
114 return;
115 }
116
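/* Emit a PS_PARTIAL_FLUSH so in-flight pixel work retires before a
 * register flagged REG_FLAG_FLUSH_CHANGE is rewritten (see
 * r600_context_dirty_block()).  Skipped when no draw is pending. */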
117 static inline void r600_context_ps_partial_flush(struct r600_context *ctx)
118 {
119 struct radeon_winsys_cs *cs = ctx->cs;
120
121 if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING))
122 return;
123
124 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
125 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
126
127 ctx->flags &= ~R600_CONTEXT_DRAW_PENDING;
128 }
129
130 void r600_init_cs(struct r600_context *ctx)
131 {
132 struct radeon_winsys_cs *cs = ctx->cs;
133
134 /* All ASICs require this one */
135 cs->buf[cs->cdw++] = PKT3(PKT3_CONTEXT_CONTROL, 1, 0);
136 cs->buf[cs->cdw++] = 0x80000000;
137 cs->buf[cs->cdw++] = 0x80000000;
138
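/* Remember how large this preamble is, so r600_context_flush() can
 * recognize a CS that contains nothing but the preamble and skip the
 * flush entirely. */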
139 ctx->init_dwords = cs->cdw;
140 }
141
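/* A block caches the PM4 packet for one run of consecutive registers:
 * a SET_*_REG header, the register values, and one NOP+reloc slot per
 * register that references a buffer.  Dirty blocks are later copied
 * verbatim (or truncated) into the CS by r600_context_block_emit_dirty(). */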
142 static void r600_init_block(struct r600_context *ctx,
143 struct r600_block *block,
144 const struct r600_reg *reg, int index, int nreg,
145 unsigned opcode, unsigned offset_base)
146 {
147 int i = index;
148 int j, n = nreg;
149
150 /* initialize block */
151 block->flags = 0;
152 block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */
153 block->start_offset = reg[i].offset;
154 block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0);
155 block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2;
156 block->reg = &block->pm4[block->pm4_ndwords];
157 block->pm4_ndwords += n;
158 block->nreg = n;
159 block->nreg_dirty = n;
160 LIST_INITHEAD(&block->list);
161 LIST_INITHEAD(&block->enable_list);
162
163 for (j = 0; j < n; j++) {
164 if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) {
165 block->flags |= REG_FLAG_DIRTY_ALWAYS;
166 }
167 if (reg[i+j].flags & REG_FLAG_ENABLE_ALWAYS) {
168 if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
169 block->status |= R600_BLOCK_STATUS_ENABLED;
170 LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
171 LIST_ADDTAIL(&block->list,&ctx->dirty);
172 }
173 }
174 if (reg[i+j].flags & REG_FLAG_FLUSH_CHANGE) {
175 block->flags |= REG_FLAG_FLUSH_CHANGE;
176 }
177
178 if (reg[i+j].flags & REG_FLAG_NEED_BO) {
179 block->nbo++;
180 assert(block->nbo < R600_BLOCK_MAX_BO);
181 block->pm4_bo_index[j] = block->nbo;
182 block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0);
183 block->pm4[block->pm4_ndwords++] = 0x00000000;
184 block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1;
185 }
186 }
187 /* check that we stay within the limit */
188 assert(block->pm4_ndwords < R600_BLOCK_MAX_REG);
189 }
190
191 int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg,
192 unsigned opcode, unsigned offset_base)
193 {
194 struct r600_block *block;
195 struct r600_range *range;
196 int offset;
197
198 for (unsigned i = 0, n = 0; i < nreg; i += n) {
199 /* skip the forced new-block marker */
200 if (reg[i].offset == GROUP_FORCE_NEW_BLOCK) {
201 n = 1;
202 continue;
203 }
204
205 /* registers that need relocation are in their own group */
206 /* find the number of consecutive registers */
207 n = 0;
208 offset = reg[i].offset;
209 while (reg[i + n].offset == offset) {
210 n++;
211 offset += 4;
212 if ((n + i) >= nreg)
213 break;
214 if (n >= (R600_BLOCK_MAX_REG - 2))
215 break;
216 }
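/* e.g. registers at offsets 0x100, 0x104 and 0x108 end up grouped in a
 * single block with n = 3 */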
217
218 /* allocate new block */
219 block = calloc(1, sizeof(struct r600_block));
220 if (block == NULL) {
221 return -ENOMEM;
222 }
223 ctx->nblocks++;
224 for (int j = 0; j < n; j++) {
225 range = &ctx->range[CTX_RANGE_ID(reg[i + j].offset)];
226 /* create block table if it doesn't exist */
227 if (!range->blocks)
228 range->blocks = calloc(1 << HASH_SHIFT, sizeof(void *));
229 if (!range->blocks)
230 return -1;
231
232 range->blocks[CTX_BLOCK_ID(reg[i + j].offset)] = block;
233 }
234
235 r600_init_block(ctx, block, reg, i, n, opcode, offset_base);
236
237 }
238 return 0;
239 }
240
241
242 /* tear down the block tables and destroy the command stream */
243 void r600_context_fini(struct r600_context *ctx)
244 {
245 struct r600_block *block;
246 struct r600_range *range;
247
248 for (int i = 0; i < NUM_RANGES; i++) {
249 if (!ctx->range[i].blocks)
250 continue;
251 for (int j = 0; j < (1 << HASH_SHIFT); j++) {
252 block = ctx->range[i].blocks[j];
253 if (block) {
254 for (int k = 0, offset = block->start_offset; k < block->nreg; k++, offset += 4) {
255 range = &ctx->range[CTX_RANGE_ID(offset)];
256 range->blocks[CTX_BLOCK_ID(offset)] = NULL;
257 }
258 for (int k = 1; k <= block->nbo; k++) {
259 pipe_resource_reference((struct pipe_resource**)&block->reloc[k].bo, NULL);
260 }
261 free(block);
262 }
263 }
264 free(ctx->range[i].blocks);
265 }
266 free(ctx->range);
267 free(ctx->blocks);
268 ctx->ws->cs_destroy(ctx->cs);
269 }
270
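/* Flatten the per-range hash tables into the linear ctx->blocks[] array,
 * skipping duplicates: a block that spans several registers is reachable
 * from every offset it covers. */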
271 int r600_setup_block_table(struct r600_context *ctx)
272 {
273 /* setup block table */
274 int c = 0;
275 ctx->blocks = calloc(ctx->nblocks, sizeof(void*));
276 if (!ctx->blocks)
277 return -ENOMEM;
278 for (int i = 0; i < NUM_RANGES; i++) {
279 if (!ctx->range[i].blocks)
280 continue;
281 for (int j = 0, add; j < (1 << HASH_SHIFT); j++) {
282 if (!ctx->range[i].blocks[j])
283 continue;
284
285 add = 1;
286 for (int k = 0; k < c; k++) {
287 if (ctx->blocks[k] == ctx->range[i].blocks[j]) {
288 add = 0;
289 break;
290 }
291 }
292 if (add) {
293 assert(c < ctx->nblocks);
294 ctx->blocks[c++] = ctx->range[i].blocks[j];
295 j += (ctx->range[i].blocks[j]->nreg) - 1;
296 }
297 }
298 }
299
300 return 0;
301 }
302
303 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
304 boolean count_draw_in)
305 {
306 struct r600_atom *state;
307
308 /* The number of dwords we already used in the CS so far. */
309 num_dw += ctx->cs->cdw;
310
311 if (count_draw_in) {
312 /* The number of dwords all the dirty states would take. */
313 LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
314 num_dw += state->num_dw;
315 }
316
317 num_dw += ctx->pm4_dirty_cdwords;
318
319 /* The upper-bound of how much a draw command would take. */
320 num_dw += R600_MAX_DRAW_CS_DWORDS;
321 }
322
323 /* Count in queries_suspend. */
324 num_dw += ctx->num_cs_dw_queries_suspend;
325
326 /* Count in streamout_end at the end of CS. */
327 num_dw += ctx->num_cs_dw_streamout_end;
328
329 /* Count in render_condition(NULL) at the end of CS. */
330 if (ctx->predicate_drawing) {
331 num_dw += 3;
332 }
333
334 /* Count in framebuffer cache flushes at the end of CS. */
335 num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */
336
337 /* Save 16 dwords for the fence mechanism. */
338 num_dw += 16;
339
340 /* Flush if there's not enough space. */
341 if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
342 radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
343 }
344 }
345
346 void r600_context_dirty_block(struct r600_context *ctx,
347 struct r600_block *block,
348 int dirty, int index)
349 {
350 if ((index + 1) > block->nreg_dirty)
351 block->nreg_dirty = index + 1;
352
353 if ((dirty != (block->status & R600_BLOCK_STATUS_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) {
354 block->status |= R600_BLOCK_STATUS_DIRTY;
355 ctx->pm4_dirty_cdwords += block->pm4_ndwords;
356 if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
357 block->status |= R600_BLOCK_STATUS_ENABLED;
358 LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
359 }
360 LIST_ADDTAIL(&block->list,&ctx->dirty);
361
362 if (block->flags & REG_FLAG_FLUSH_CHANGE) {
363 r600_context_ps_partial_flush(ctx);
364 }
365 }
366 }
367
368 void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state)
369 {
370 struct r600_block *block;
371 int dirty;
372 for (int i = 0; i < state->nregs; i++) {
373 unsigned id, reloc_id;
374 struct r600_pipe_reg *reg = &state->regs[i];
375
376 block = reg->block;
377 id = reg->id;
378
379 dirty = block->status & R600_BLOCK_STATUS_DIRTY;
380
381 if (reg->value != block->reg[id]) {
382 block->reg[id] = reg->value;
383 dirty |= R600_BLOCK_STATUS_DIRTY;
384 }
385 if (block->flags & REG_FLAG_DIRTY_ALWAYS)
386 dirty |= R600_BLOCK_STATUS_DIRTY;
387 if (block->pm4_bo_index[id]) {
388 /* find relocation */
389 reloc_id = block->pm4_bo_index[id];
390 pipe_resource_reference((struct pipe_resource**)&block->reloc[reloc_id].bo, &reg->bo->b.b);
391 block->reloc[reloc_id].bo_usage = reg->bo_usage;
392 /* always force dirty for relocs for now */
393 dirty |= R600_BLOCK_STATUS_DIRTY;
394 }
395
396 if (dirty)
397 r600_context_dirty_block(ctx, block, dirty, id);
398 }
399 }
400
401 struct r600_resource *r600_context_reg_bo(struct r600_context *ctx, unsigned offset)
402 {
403 struct r600_range *range;
404 struct r600_block *block;
405 unsigned id;
406
407 range = &ctx->range[CTX_RANGE_ID(offset)];
408 block = range->blocks[CTX_BLOCK_ID(offset)];
409 offset -= block->start_offset;
410 id = block->pm4_bo_index[offset >> 2];
411 if (block->reloc[id].bo) {
412 return block->reloc[id].bo;
413 }
414 return NULL;
415 }
416
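/* Copy a dirty block's PM4 packet into the CS.  When the block has no
 * relocations, is not flagged DIRTY_ALWAYS and only its first nreg_dirty
 * registers changed, the copy is shortened by patching the packet's
 * count field afterwards. */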
417 void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block)
418 {
419 struct radeon_winsys_cs *cs = ctx->cs;
420 int optional = block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS);
421 int cp_dwords = block->pm4_ndwords, start_dword = 0;
422 int new_dwords = 0;
423 int nbo = block->nbo;
424
425 if (block->nreg_dirty == 0 && optional) {
426 goto out;
427 }
428
429 if (nbo) {
430 ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH;
431
432 for (int j = 0; j < block->nreg; j++) {
433 if (block->pm4_bo_index[j]) {
434 /* find relocation */
435 struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]];
436 block->pm4[reloc->bo_pm4_index] =
437 r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
438 nbo--;
439 if (nbo == 0)
440 break;
441 }
442 }
443 ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH;
444 }
445
446 optional &= (block->nreg_dirty != block->nreg);
447 if (optional) {
448 new_dwords = block->nreg_dirty;
449 start_dword = cs->cdw;
450 cp_dwords = new_dwords + 2;
451 }
452 memcpy(&cs->buf[cs->cdw], block->pm4, cp_dwords * 4);
453 cs->cdw += cp_dwords;
454
455 if (optional) {
456 uint32_t newword;
457
458 newword = cs->buf[start_dword];
459 newword &= PKT_COUNT_C;
460 newword |= PKT_COUNT_S(new_dwords);
461 cs->buf[start_dword] = newword;
462 }
463 out:
464 block->status ^= R600_BLOCK_STATUS_DIRTY;
465 block->nreg_dirty = 0;
466 LIST_DELINIT(&block->list);
467 }
468
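/* The cache-invalidation helpers below only accumulate SURFACE_SYNC flush
 * bits in the surface-sync atom; the actual packet is emitted when the
 * atom is processed (or immediately in r600_flush_framebuffer() when
 * flush_now is set). */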
469 void r600_inval_shader_cache(struct r600_context *ctx)
470 {
471 ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
472 ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
473 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
474 }
475
476 void r600_inval_texture_cache(struct r600_context *ctx)
477 {
478 ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
479 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
480 }
481
482 void r600_inval_vertex_cache(struct r600_context *ctx)
483 {
484 /* Some GPUs don't have the vertex cache and must use the texture cache instead. */
485 ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
486 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
487 }
488
489 void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
490 {
491 if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
492 return;
493
494 ctx->atom_surface_sync.flush_flags |=
495 r600_get_cb_flush_flags(ctx) |
496 (ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);
497
498 if (flush_now) {
499 r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
500 } else {
501 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
502 }
503
504 ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
505 }
506
507 void r600_context_flush(struct r600_context *ctx, unsigned flags)
508 {
509 struct radeon_winsys_cs *cs = ctx->cs;
510 struct r600_block *enable_block = NULL;
511 bool queries_suspended = false;
512 bool streamout_suspended = false;
513
514 if (cs->cdw == ctx->init_dwords)
515 return;
516
517 /* suspend queries */
518 if (ctx->num_cs_dw_queries_suspend) {
519 r600_context_queries_suspend(ctx);
520 queries_suspended = true;
521 }
522
523 if (ctx->num_cs_dw_streamout_end) {
524 r600_context_streamout_end(ctx);
525 streamout_suspended = true;
526 }
527
528 r600_flush_framebuffer(ctx, true);
529
530 /* partial flush is needed to avoid lockups on some chips with user fences */
531 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
532 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
533
534 /* Flush the CS. */
535 ctx->ws->cs_flush(ctx->cs, flags);
536
537 ctx->pm4_dirty_cdwords = 0;
538 ctx->flags = 0;
539
540 r600_init_cs(ctx);
541
542 if (streamout_suspended) {
543 ctx->streamout_start = TRUE;
544 ctx->streamout_append_bitmask = ~0;
545 }
546
547 /* resume queries */
548 if (queries_suspended) {
549 r600_context_queries_resume(ctx);
550 }
551
552 /* set all valid groups as dirty so they get re-emitted on the
553 * next draw command
554 */
555 LIST_FOR_EACH_ENTRY(enable_block, &ctx->enable_list, enable_list) {
556 if(!(enable_block->status & R600_BLOCK_STATUS_DIRTY)) {
557 LIST_ADDTAIL(&enable_block->list,&ctx->dirty);
558 enable_block->status |= R600_BLOCK_STATUS_DIRTY;
559 }
560 ctx->pm4_dirty_cdwords += enable_block->pm4_ndwords;
561 enable_block->nreg_dirty = enable_block->nreg;
562 }
563 }
564
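/* Write a 32-bit fence value to fence_bo + offset*4 once all preceding
 * work has completed: a PS partial flush followed by an EVENT_WRITE_EOP
 * with DATA_SEL = 1 (write 32-bit data) and no interrupt. */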
565 void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence_bo, unsigned offset, unsigned value)
566 {
567 struct radeon_winsys_cs *cs = ctx->cs;
568 uint64_t va;
569
570 r600_need_cs_space(ctx, 10, FALSE);
571
572 va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
573 va = va + (offset << 2);
574
575 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
576 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
577 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
578 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
579 cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
580 /* DATA_SEL | INT_EN | ADDRESS_HI */
581 cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
582 cs->buf[cs->cdw++] = value; /* DATA_LO */
583 cs->buf[cs->cdw++] = 0; /* DATA_HI */
584 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
585 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
586 }
587
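/* Each result slot holds a (start, end) pair of 64-bit counters.  When
 * test_status_bit is set, bit 63 (the hardware's "result valid" flag)
 * must be set in both values before the difference is trusted. */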
588 static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
589 bool test_status_bit)
590 {
591 uint32_t *current_result = (uint32_t*)map;
592 uint64_t start, end;
593
594 start = (uint64_t)current_result[start_index] |
595 (uint64_t)current_result[start_index+1] << 32;
596 end = (uint64_t)current_result[end_index] |
597 (uint64_t)current_result[end_index+1] << 32;
598
599 if (!test_status_bit ||
600 ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
601 return end - start;
602 }
603 return 0;
604 }
605
606 static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
607 {
608 unsigned results_base = query->results_start;
609 char *map;
610
611 map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
612 PIPE_TRANSFER_READ |
613 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
614 if (!map)
615 return FALSE;
616
617 /* count all results across all data blocks */
618 switch (query->type) {
619 case PIPE_QUERY_OCCLUSION_COUNTER:
620 while (results_base != query->results_end) {
621 query->result.u64 +=
622 r600_query_read_result(map + results_base, 0, 2, true);
623 results_base = (results_base + 16) % query->buffer->b.b.width0;
624 }
625 break;
626 case PIPE_QUERY_OCCLUSION_PREDICATE:
627 while (results_base != query->results_end) {
628 query->result.b = query->result.b ||
629 r600_query_read_result(map + results_base, 0, 2, true) != 0;
630 results_base = (results_base + 16) % query->buffer->b.b.width0;
631 }
632 break;
633 case PIPE_QUERY_TIME_ELAPSED:
634 while (results_base != query->results_end) {
635 query->result.u64 +=
636 r600_query_read_result(map + results_base, 0, 2, false);
637 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
638 }
639 break;
640 case PIPE_QUERY_PRIMITIVES_EMITTED:
641 /* SAMPLE_STREAMOUTSTATS stores this structure:
642 * {
643 * u64 NumPrimitivesWritten;
644 * u64 PrimitiveStorageNeeded;
645 * }
646 * We only need NumPrimitivesWritten here. */
647 while (results_base != query->results_end) {
648 query->result.u64 +=
649 r600_query_read_result(map + results_base, 2, 6, true);
650 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
651 }
652 break;
653 case PIPE_QUERY_PRIMITIVES_GENERATED:
654 /* Here we read PrimitiveStorageNeeded. */
655 while (results_base != query->results_end) {
656 query->result.u64 +=
657 r600_query_read_result(map + results_base, 0, 4, true);
658 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
659 }
660 break;
661 case PIPE_QUERY_SO_STATISTICS:
662 while (results_base != query->results_end) {
663 query->result.so.num_primitives_written +=
664 r600_query_read_result(map + results_base, 2, 6, true);
665 query->result.so.primitives_storage_needed +=
666 r600_query_read_result(map + results_base, 0, 4, true);
667 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
668 }
669 break;
670 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
671 while (results_base != query->results_end) {
672 query->result.b = query->result.b ||
673 r600_query_read_result(map + results_base, 2, 6, true) !=
674 r600_query_read_result(map + results_base, 0, 4, true);
675 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
676 }
677 break;
678 default:
679 assert(0);
680 }
681
682 query->results_start = query->results_end;
683 ctx->ws->buffer_unmap(query->buffer->cs_buf);
684 return TRUE;
685 }
686
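/* The query buffer is used as a ring: the blocks written since the last
 * readback live in [results_start, results_end), and all offsets wrap at
 * width0, which was trimmed to a multiple of result_size at creation
 * time. */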
687 void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
688 {
689 struct radeon_winsys_cs *cs = ctx->cs;
690 unsigned new_results_end, i;
691 uint32_t *results;
692 uint64_t va;
693
694 r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);
695
696 new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
697
698 /* collect current results if query buffer is full */
699 if (new_results_end == query->results_start) {
700 r600_query_result(ctx, query, TRUE);
701 }
702
703 switch (query->type) {
704 case PIPE_QUERY_OCCLUSION_COUNTER:
705 case PIPE_QUERY_OCCLUSION_PREDICATE:
706 results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
707 if (results) {
708 results = (uint32_t*)((char*)results + query->results_end);
709 memset(results, 0, query->result_size);
710
711 /* Set top bits for unused backends */
712 for (i = 0; i < ctx->max_db; i++) {
713 if (!(ctx->backend_mask & (1<<i))) {
714 results[(i * 4)+1] = 0x80000000;
715 results[(i * 4)+3] = 0x80000000;
716 }
717 }
718 ctx->ws->buffer_unmap(query->buffer->cs_buf);
719 }
720 break;
721 case PIPE_QUERY_TIME_ELAPSED:
722 break;
723 case PIPE_QUERY_PRIMITIVES_EMITTED:
724 case PIPE_QUERY_PRIMITIVES_GENERATED:
725 case PIPE_QUERY_SO_STATISTICS:
726 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
727 results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
/* zero the slot we are about to use; tolerate a failed map like the
 * occlusion path above */
if (results) {
728 results = (uint32_t*)((char*)results + query->results_end);
729 memset(results, 0, query->result_size);
730 ctx->ws->buffer_unmap(query->buffer->cs_buf);
}
731 break;
732 default:
733 assert(0);
734 }
735
736 /* emit begin query */
737 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
738 va += query->results_end;
739
740 switch (query->type) {
741 case PIPE_QUERY_OCCLUSION_COUNTER:
742 case PIPE_QUERY_OCCLUSION_PREDICATE:
743 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
744 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
745 cs->buf[cs->cdw++] = va;
746 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
747 break;
748 case PIPE_QUERY_PRIMITIVES_EMITTED:
749 case PIPE_QUERY_PRIMITIVES_GENERATED:
750 case PIPE_QUERY_SO_STATISTICS:
751 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
752 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
753 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
754 cs->buf[cs->cdw++] = query->results_end;
755 cs->buf[cs->cdw++] = 0;
756 break;
757 case PIPE_QUERY_TIME_ELAPSED:
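/* DATA_SEL = 3 in the EOP packet below asks the CP to write the 64-bit
 * GPU timestamp instead of a fence value. */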
758 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
759 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
760 cs->buf[cs->cdw++] = va;
761 cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
762 cs->buf[cs->cdw++] = 0;
763 cs->buf[cs->cdw++] = 0;
764 break;
765 default:
766 assert(0);
767 }
768 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
769 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);
770
771 ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
772 }
773
774 void r600_query_end(struct r600_context *ctx, struct r600_query *query)
775 {
776 struct radeon_winsys_cs *cs = ctx->cs;
777 uint64_t va;
778
779 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
780 /* emit end query */
781 switch (query->type) {
782 case PIPE_QUERY_OCCLUSION_COUNTER:
783 case PIPE_QUERY_OCCLUSION_PREDICATE:
784 va += query->results_end + 8;
785 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
786 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
787 cs->buf[cs->cdw++] = va;
788 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
789 break;
790 case PIPE_QUERY_PRIMITIVES_EMITTED:
791 case PIPE_QUERY_PRIMITIVES_GENERATED:
792 case PIPE_QUERY_SO_STATISTICS:
793 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
794 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
795 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
796 cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
797 cs->buf[cs->cdw++] = 0;
798 break;
799 case PIPE_QUERY_TIME_ELAPSED:
800 va += query->results_end + query->result_size/2;
801 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
802 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
803 cs->buf[cs->cdw++] = va;
804 cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
805 cs->buf[cs->cdw++] = 0;
806 cs->buf[cs->cdw++] = 0;
807 break;
808 default:
809 assert(0);
810 }
811 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
812 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);
813
814 query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
815 ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
816 }
817
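/* Program conditional rendering from a query's result blocks: one
 * SET_PREDICATION packet per block, chained with the CONTINUE bit so the
 * hardware combines the outcome of all of them. */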
818 void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
819 int flag_wait)
820 {
821 struct radeon_winsys_cs *cs = ctx->cs;
822 uint64_t va;
823
824 if (operation == PREDICATION_OP_CLEAR) {
825 r600_need_cs_space(ctx, 3, FALSE);
826
827 cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
828 cs->buf[cs->cdw++] = 0;
829 cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
830 } else {
831 unsigned results_base = query->results_start;
832 unsigned count;
833 uint32_t op;
834
835 /* find the number of query data blocks */
836 count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
837 count /= query->result_size;
838
839 r600_need_cs_space(ctx, 5 * count, TRUE);
840
841 op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
842 (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
843 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
844
845 /* emit predicate packets for all data blocks */
846 while (results_base != query->results_end) {
847 cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
848 cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
849 cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
850 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
851 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
852 RADEON_USAGE_READ);
853 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
854
855 /* set CONTINUE bit for all packets except the first */
856 op |= PREDICATION_CONTINUE;
857 }
858 }
859 }
860
861 struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
862 {
863 struct r600_query *query;
864 unsigned buffer_size = 4096;
865
866 query = CALLOC_STRUCT(r600_query);
867 if (query == NULL)
868 return NULL;
869
870 query->type = query_type;
871
872 switch (query_type) {
873 case PIPE_QUERY_OCCLUSION_COUNTER:
874 case PIPE_QUERY_OCCLUSION_PREDICATE:
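/* A begin/end pair of 64-bit Z-pass counters for every DB. */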
875 query->result_size = 16 * ctx->max_db;
876 query->num_cs_dw = 6;
877 break;
878 case PIPE_QUERY_TIME_ELAPSED:
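/* Begin and end 64-bit timestamps. */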
879 query->result_size = 16;
880 query->num_cs_dw = 8;
881 break;
882 case PIPE_QUERY_PRIMITIVES_EMITTED:
883 case PIPE_QUERY_PRIMITIVES_GENERATED:
884 case PIPE_QUERY_SO_STATISTICS:
885 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
886 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
887 query->result_size = 32;
888 query->num_cs_dw = 6;
889 break;
890 default:
891 assert(0);
892 FREE(query);
893 return NULL;
894 }
895
896 /* adjust the buffer size to simplify the offset wrapping math */
897 buffer_size -= buffer_size % query->result_size;
898
899 /* Queries are normally read by the CPU after
900 * being written by the GPU, hence staging is probably a good
901 * usage pattern.
902 */
903 query->buffer = (struct r600_resource*)
904 pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, buffer_size);
905 if (!query->buffer) {
906 FREE(query);
907 return NULL;
908 }
909 return query;
910 }
911
912 void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
913 {
914 pipe_resource_reference((struct pipe_resource**)&query->buffer, NULL);
915 free(query);
916 }
917
918 boolean r600_context_query_result(struct r600_context *ctx,
919 struct r600_query *query,
920 boolean wait, void *vresult)
921 {
922 boolean *result_b = (boolean*)vresult;
923 uint64_t *result_u64 = (uint64_t*)vresult;
924 struct pipe_query_data_so_statistics *result_so =
925 (struct pipe_query_data_so_statistics*)vresult;
926
927 if (!r600_query_result(ctx, query, wait))
928 return FALSE;
929
930 switch (query->type) {
931 case PIPE_QUERY_OCCLUSION_COUNTER:
932 case PIPE_QUERY_PRIMITIVES_EMITTED:
933 case PIPE_QUERY_PRIMITIVES_GENERATED:
934 *result_u64 = query->result.u64;
935 break;
936 case PIPE_QUERY_OCCLUSION_PREDICATE:
937 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
938 *result_b = query->result.b;
939 break;
940 case PIPE_QUERY_TIME_ELAPSED:
941 *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
942 break;
943 case PIPE_QUERY_SO_STATISTICS:
944 *result_so = query->result.so;
945 break;
946 default:
947 assert(0);
948 }
949 return TRUE;
950 }
951
952 void r600_context_queries_suspend(struct r600_context *ctx)
953 {
954 struct r600_query *query;
955
956 LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
957 r600_query_end(ctx, query);
958 }
959 assert(ctx->num_cs_dw_queries_suspend == 0);
960 }
961
962 void r600_context_queries_resume(struct r600_context *ctx)
963 {
964 struct r600_query *query;
965
966 assert(ctx->num_cs_dw_queries_suspend == 0);
967
968 LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
969 r600_query_begin(ctx, query);
970 }
971 }
972
973 void r600_context_streamout_begin(struct r600_context *ctx)
974 {
975 struct radeon_winsys_cs *cs = ctx->cs;
976 struct r600_so_target **t = ctx->so_targets;
977 unsigned *strides = ctx->vs_shader_so_strides;
978 unsigned buffer_en, i;
979
980 buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
981 (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
982 (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
983 (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);
984
985 ctx->num_cs_dw_streamout_end =
986 12 + /* flush_vgt_streamout */
987 util_bitcount(buffer_en) * 8 +
988 3;
989
990 r600_need_cs_space(ctx,
991 12 + /* flush_vgt_streamout */
992 6 + /* enables */
993 util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
994 util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
995 ctx->num_cs_dw_streamout_end, TRUE);
996
997 if (ctx->chip_class >= CAYMAN) {
998 evergreen_flush_vgt_streamout(ctx);
999 evergreen_set_streamout_enable(ctx, buffer_en);
1000 }
1001
1002 for (i = 0; i < ctx->num_so_targets; i++) {
1003 #if 0
1004 if (t[i]) {
1005 t[i]->stride = strides[i];
1006 t[i]->so_index = i;
1007
1008 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
1009 cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
1010 16*i - SI_CONTEXT_REG_OFFSET) >> 2;
1011 cs->buf[cs->cdw++] = (t[i]->b.buffer_offset +
1012 t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
1013 cs->buf[cs->cdw++] = strides[i] >> 2; /* VTX_STRIDE (in DW) */
1014 cs->buf[cs->cdw++] = 0; /* BUFFER_BASE */
1015
1016 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1017 cs->buf[cs->cdw++] =
1018 r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
1019 RADEON_USAGE_WRITE);
1020
1021 if (ctx->streamout_append_bitmask & (1 << i)) {
1022 /* Append. */
1023 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
1024 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
1025 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
1026 cs->buf[cs->cdw++] = 0; /* unused */
1027 cs->buf[cs->cdw++] = 0; /* unused */
1028 cs->buf[cs->cdw++] = 0; /* src address lo */
1029 cs->buf[cs->cdw++] = 0; /* src address hi */
1030
1031 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1032 cs->buf[cs->cdw++] =
1033 r600_context_bo_reloc(ctx, t[i]->filled_size,
1034 RADEON_USAGE_READ);
1035 } else {
1036 /* Start from the beginning. */
1037 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
1038 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
1039 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
1040 cs->buf[cs->cdw++] = 0; /* unused */
1041 cs->buf[cs->cdw++] = 0; /* unused */
1042 cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
1043 cs->buf[cs->cdw++] = 0; /* unused */
1044 }
1045 }
1046 #endif
1047 }
1048 }
1049
1050 void r600_context_streamout_end(struct r600_context *ctx)
1051 {
1052 struct radeon_winsys_cs *cs = ctx->cs;
1053 struct r600_so_target **t = ctx->so_targets;
1054 unsigned i, flush_flags = 0;
1055
1056 evergreen_flush_vgt_streamout(ctx);
1057
1058 for (i = 0; i < ctx->num_so_targets; i++) {
1059 #if 0
1060 if (t[i]) {
1061 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
1062 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
1063 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
1064 STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
1065 cs->buf[cs->cdw++] = 0; /* dst address lo */
1066 cs->buf[cs->cdw++] = 0; /* dst address hi */
1067 cs->buf[cs->cdw++] = 0; /* unused */
1068 cs->buf[cs->cdw++] = 0; /* unused */
1069
1070 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1071 cs->buf[cs->cdw++] =
1072 r600_context_bo_reloc(ctx, t[i]->filled_size,
1073 RADEON_USAGE_WRITE);
1074
1075 flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
1076 }
1077 #endif
1078 }
1079
1080 evergreen_set_streamout_enable(ctx, 0);
1081
1082 ctx->atom_surface_sync.flush_flags |= flush_flags;
1083 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
1084
1085 ctx->num_cs_dw_streamout_end = 0;
1086
1087 /* XXX print some debug info */
1088 for (i = 0; i < ctx->num_so_targets; i++) {
1089 if (!t[i])
1090 continue;
1091
1092 uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->cs_buf, ctx->cs, PIPE_TRANSFER_READ); /* buffer_map takes PIPE_TRANSFER_* flags */
1093 printf("FILLED_SIZE%i: %u\n", i, *ptr);
1094 ctx->ws->buffer_unmap(t[i]->filled_size->cs_buf);
1095 }
1096 }
1097
1098 void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
1099 {
1100 struct radeon_winsys_cs *cs = ctx->cs;
1101 r600_need_cs_space(ctx, 14 + 21, TRUE);
1102
1103 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
1104 cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
1105 cs->buf[cs->cdw++] = 0;
1106
1107 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
1108 cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
1109 cs->buf[cs->cdw++] = t->stride >> 2;
1110
1111 #if 0
1112 cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
1113 cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
1114 cs->buf[cs->cdw++] = 0; /* src address lo */
1115 cs->buf[cs->cdw++] = 0; /* src address hi */
1116 cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
1117 cs->buf[cs->cdw++] = 0; /* unused */
1118 #endif
1119
1120 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1121 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);
1122
1123 #if 0 /* I have not found this useful yet. */
1124 cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
1125 cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
1126 cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
1127 cs->buf[cs->cdw++] = 0; /* unused */
1128 cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
1129 cs->buf[cs->cdw++] = 0; /* unused */
1130
1131 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
1132 cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
1133 cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;
1134
1135 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
1136 cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
1137 cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;
1138
1139 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1140 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer,
1141 RADEON_USAGE_WRITE);
1142
1143 cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
1144 cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
1145 cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
1146 cs->buf[cs->cdw++] = 0;
1147 cs->buf[cs->cdw++] = 0; /* reference value */
1148 cs->buf[cs->cdw++] = 0xffffffff; /* mask */
1149 cs->buf[cs->cdw++] = 4; /* poll interval */
1150 #endif
1151 }