r600g: fix compile warnings in r600_cp_dma_copy_buffer on 32-bit gcc
[mesa.git] src/gallium/drivers/r600/r600_hw_context.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Jerome Glisse
25 */
26 #include "r600_hw_context_priv.h"
27 #include "r600d.h"
28 #include "util/u_memory.h"
29 #include <errno.h>
30 #include <unistd.h>
31
32 /* Get the mask of enabled render backends */
33 void r600_get_backend_mask(struct r600_context *ctx)
34 {
35 struct radeon_winsys_cs *cs = ctx->cs;
36 struct r600_resource *buffer;
37 uint32_t *results;
38 unsigned num_backends = ctx->screen->info.r600_num_backends;
39 unsigned i, mask = 0;
40 uint64_t va;
41
42 /* if backend_map query is supported by the kernel */
43 if (ctx->screen->info.r600_backend_map_valid) {
44 unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
45 unsigned backend_map = ctx->screen->info.r600_backend_map;
46 unsigned item_width, item_mask;
47
48 if (ctx->chip_class >= EVERGREEN) {
49 item_width = 4;
50 item_mask = 0x7;
51 } else {
52 item_width = 2;
53 item_mask = 0x3;
54 }
55
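/* Illustrative decode (hypothetical values): on a pre-Evergreen part with
 * num_tile_pipes = 4 and backend_map = 0xE4 (binary 11 10 01 00), the 2-bit
 * items decode to backends 0, 1, 2 and 3, so mask ends up as 0xF. */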
56 while(num_tile_pipes--) {
57 i = backend_map & item_mask;
58 mask |= (1<<i);
59 backend_map >>= item_width;
60 }
61 if (mask != 0) {
62 ctx->backend_mask = mask;
63 return;
64 }
65 }
66
67 /* otherwise, fall back to the path for older kernels */
68
69 /* create buffer for event data */
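/* Assumption based on the code below: ZPASS_DONE writes 16 bytes per DB
 * (a pair of 64-bit counters), hence the max_db * 16 allocation; only the
 * upper dword of the first counter (results[i*4 + 1]) is inspected later. */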
70 buffer = (struct r600_resource*)
71 pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM,
72 PIPE_USAGE_STAGING, ctx->max_db*16);
73 if (!buffer)
74 goto err;
75
76 va = r600_resource_va(&ctx->screen->screen, (void*)buffer);
77
78 /* initialize buffer with zeroes */
79 results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
80 if (results) {
81 memset(results, 0, ctx->max_db * 4 * 4);
82 ctx->ws->buffer_unmap(buffer->cs_buf);
83
84 /* emit EVENT_WRITE for ZPASS_DONE */
85 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
86 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
87 cs->buf[cs->cdw++] = va;
88 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
89
90 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
91 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);
92
93 /* analyze results */
94 results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
95 if (results) {
96 for(i = 0; i < ctx->max_db; i++) {
97 /* at least the highest bit will be set if the backend is used */
98 if (results[i*4 + 1])
99 mask |= (1<<i);
100 }
101 ctx->ws->buffer_unmap(buffer->cs_buf);
102 }
103 }
104
105 pipe_resource_reference((struct pipe_resource**)&buffer, NULL);
106
107 if (mask != 0) {
108 ctx->backend_mask = mask;
109 return;
110 }
111
112 err:
113 /* fallback to old method - set the num_backends lowest bits to 1 */
114 ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
115 return;
116 }
117
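/* Layout of the PM4 snippet that r600_init_block() builds for a register
 * block (summarized from the code below):
 *
 *   pm4[0]                PKT3(opcode, nreg, 0)
 *   pm4[1]                (start_offset - offset_base) >> 2
 *   pm4[2 .. nreg+1]      shadowed register values (block->reg points here)
 *   ...                   one PKT3_NOP + reloc slot appended per register
 *                         flagged REG_FLAG_NEED_BO
 */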
118 static void r600_init_block(struct r600_context *ctx,
119 struct r600_block *block,
120 const struct r600_reg *reg, int index, int nreg,
121 unsigned opcode, unsigned offset_base)
122 {
123 int i = index;
124 int j, n = nreg;
125
126 /* initialize block */
127 block->flags = 0;
128 block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */
129 block->start_offset = reg[i].offset;
130 block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0);
131 block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2;
132 block->reg = &block->pm4[block->pm4_ndwords];
133 block->pm4_ndwords += n;
134 block->nreg = n;
135 block->nreg_dirty = n;
136 LIST_INITHEAD(&block->list);
137 LIST_INITHEAD(&block->enable_list);
138
139 for (j = 0; j < n; j++) {
140 if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) {
141 block->flags |= REG_FLAG_DIRTY_ALWAYS;
142 }
143 if (reg[i+j].flags & REG_FLAG_ENABLE_ALWAYS) {
144 if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
145 block->status |= R600_BLOCK_STATUS_ENABLED;
146 LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
147 LIST_ADDTAIL(&block->list,&ctx->dirty);
148 }
149 }
150 if (reg[i+j].flags & REG_FLAG_FLUSH_CHANGE) {
151 block->flags |= REG_FLAG_FLUSH_CHANGE;
152 }
153
154 if (reg[i+j].flags & REG_FLAG_NEED_BO) {
155 block->nbo++;
156 assert(block->nbo < R600_BLOCK_MAX_BO);
157 block->pm4_bo_index[j] = block->nbo;
158 block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0);
159 block->pm4[block->pm4_ndwords++] = 0x00000000;
160 block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1;
161 }
162 }
163 /* check that we stay within the limit */
164 assert(block->pm4_ndwords < R600_BLOCK_MAX_REG);
165 }
166
167 int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg,
168 unsigned opcode, unsigned offset_base)
169 {
170 struct r600_block *block;
171 struct r600_range *range;
172 int offset;
173
174 for (unsigned i = 0, n = 0; i < nreg; i += n) {
175 /* ignore the new-block marker */
176 if (reg[i].offset == GROUP_FORCE_NEW_BLOCK) {
177 n = 1;
178 continue;
179 }
180
181 /* registers that need relocation are in their own group */
182 /* find number of consecutive registers */
183 n = 0;
184 offset = reg[i].offset;
185 while (reg[i + n].offset == offset) {
186 n++;
187 offset += 4;
188 if ((n + i) >= nreg)
189 break;
190 if (n >= (R600_BLOCK_MAX_REG - 2))
191 break;
192 }
193
194 /* allocate new block */
195 block = calloc(1, sizeof(struct r600_block));
196 if (block == NULL) {
197 return -ENOMEM;
198 }
199 ctx->nblocks++;
200 for (int j = 0; j < n; j++) {
201 range = &ctx->range[CTX_RANGE_ID(reg[i + j].offset)];
202 /* create block table if it doesn't exist */
203 if (!range->blocks)
204 range->blocks = calloc(1 << HASH_SHIFT, sizeof(void *));
205 if (!range->blocks)
206 return -1;
207
208 range->blocks[CTX_BLOCK_ID(reg[i + j].offset)] = block;
209 }
210
211 r600_init_block(ctx, block, reg, i, n, opcode, offset_base);
212
213 }
214 return 0;
215 }
216
217 static const struct r600_reg r600_context_reg_list[] = {
218 {R_028D24_DB_HTILE_SURFACE, 0, 0},
219 {R_028614_SPI_VS_OUT_ID_0, 0, 0},
220 {R_028618_SPI_VS_OUT_ID_1, 0, 0},
221 {R_02861C_SPI_VS_OUT_ID_2, 0, 0},
222 {R_028620_SPI_VS_OUT_ID_3, 0, 0},
223 {R_028624_SPI_VS_OUT_ID_4, 0, 0},
224 {R_028628_SPI_VS_OUT_ID_5, 0, 0},
225 {R_02862C_SPI_VS_OUT_ID_6, 0, 0},
226 {R_028630_SPI_VS_OUT_ID_7, 0, 0},
227 {R_028634_SPI_VS_OUT_ID_8, 0, 0},
228 {R_028638_SPI_VS_OUT_ID_9, 0, 0},
229 {R_0286C4_SPI_VS_OUT_CONFIG, 0, 0},
230 {GROUP_FORCE_NEW_BLOCK, 0, 0},
231 {R_028858_SQ_PGM_START_VS, REG_FLAG_NEED_BO, 0},
232 {GROUP_FORCE_NEW_BLOCK, 0, 0},
233 {R_028868_SQ_PGM_RESOURCES_VS, 0, 0},
234 {GROUP_FORCE_NEW_BLOCK, 0, 0},
235 {R_0288A4_SQ_PGM_RESOURCES_FS, 0, 0},
236 {R_0288DC_SQ_PGM_CF_OFFSET_FS, 0, 0},
237 {R_028644_SPI_PS_INPUT_CNTL_0, 0, 0},
238 {R_028648_SPI_PS_INPUT_CNTL_1, 0, 0},
239 {R_02864C_SPI_PS_INPUT_CNTL_2, 0, 0},
240 {R_028650_SPI_PS_INPUT_CNTL_3, 0, 0},
241 {R_028654_SPI_PS_INPUT_CNTL_4, 0, 0},
242 {R_028658_SPI_PS_INPUT_CNTL_5, 0, 0},
243 {R_02865C_SPI_PS_INPUT_CNTL_6, 0, 0},
244 {R_028660_SPI_PS_INPUT_CNTL_7, 0, 0},
245 {R_028664_SPI_PS_INPUT_CNTL_8, 0, 0},
246 {R_028668_SPI_PS_INPUT_CNTL_9, 0, 0},
247 {R_02866C_SPI_PS_INPUT_CNTL_10, 0, 0},
248 {R_028670_SPI_PS_INPUT_CNTL_11, 0, 0},
249 {R_028674_SPI_PS_INPUT_CNTL_12, 0, 0},
250 {R_028678_SPI_PS_INPUT_CNTL_13, 0, 0},
251 {R_02867C_SPI_PS_INPUT_CNTL_14, 0, 0},
252 {R_028680_SPI_PS_INPUT_CNTL_15, 0, 0},
253 {R_028684_SPI_PS_INPUT_CNTL_16, 0, 0},
254 {R_028688_SPI_PS_INPUT_CNTL_17, 0, 0},
255 {R_02868C_SPI_PS_INPUT_CNTL_18, 0, 0},
256 {R_028690_SPI_PS_INPUT_CNTL_19, 0, 0},
257 {R_028694_SPI_PS_INPUT_CNTL_20, 0, 0},
258 {R_028698_SPI_PS_INPUT_CNTL_21, 0, 0},
259 {R_02869C_SPI_PS_INPUT_CNTL_22, 0, 0},
260 {R_0286A0_SPI_PS_INPUT_CNTL_23, 0, 0},
261 {R_0286A4_SPI_PS_INPUT_CNTL_24, 0, 0},
262 {R_0286A8_SPI_PS_INPUT_CNTL_25, 0, 0},
263 {R_0286AC_SPI_PS_INPUT_CNTL_26, 0, 0},
264 {R_0286B0_SPI_PS_INPUT_CNTL_27, 0, 0},
265 {R_0286B4_SPI_PS_INPUT_CNTL_28, 0, 0},
266 {R_0286B8_SPI_PS_INPUT_CNTL_29, 0, 0},
267 {R_0286BC_SPI_PS_INPUT_CNTL_30, 0, 0},
268 {R_0286C0_SPI_PS_INPUT_CNTL_31, 0, 0},
269 {R_0286CC_SPI_PS_IN_CONTROL_0, 0, 0},
270 {R_0286D0_SPI_PS_IN_CONTROL_1, 0, 0},
271 {R_0286D8_SPI_INPUT_Z, 0, 0},
272 {GROUP_FORCE_NEW_BLOCK, 0, 0},
273 {R_028840_SQ_PGM_START_PS, REG_FLAG_NEED_BO, 0},
274 {GROUP_FORCE_NEW_BLOCK, 0, 0},
275 {R_028850_SQ_PGM_RESOURCES_PS, 0, 0},
276 {R_028854_SQ_PGM_EXPORTS_PS, 0, 0},
277 };
278
279 /* initialization / teardown */
280 void r600_context_fini(struct r600_context *ctx)
281 {
282 struct r600_block *block;
283 struct r600_range *range;
284
285 if (ctx->range) {
286 for (int i = 0; i < NUM_RANGES; i++) {
287 if (!ctx->range[i].blocks)
288 continue;
289 for (int j = 0; j < (1 << HASH_SHIFT); j++) {
290 block = ctx->range[i].blocks[j];
291 if (block) {
292 for (int k = 0, offset = block->start_offset; k < block->nreg; k++, offset += 4) {
293 range = &ctx->range[CTX_RANGE_ID(offset)];
294 range->blocks[CTX_BLOCK_ID(offset)] = NULL;
295 }
296 for (int k = 1; k <= block->nbo; k++) {
297 pipe_resource_reference((struct pipe_resource**)&block->reloc[k].bo, NULL);
298 }
299 free(block);
300 }
301 }
302 free(ctx->range[i].blocks);
303 }
304 }
305 free(ctx->blocks);
306 }
307
308 int r600_setup_block_table(struct r600_context *ctx)
309 {
310 /* setup block table */
311 int c = 0;
312 ctx->blocks = calloc(ctx->nblocks, sizeof(void*));
313 if (!ctx->blocks)
314 return -ENOMEM;
315 for (int i = 0; i < NUM_RANGES; i++) {
316 if (!ctx->range[i].blocks)
317 continue;
318 for (int j = 0, add; j < (1 << HASH_SHIFT); j++) {
319 if (!ctx->range[i].blocks[j])
320 continue;
321
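/* a block is referenced from one hash slot per register it covers, so only
 * add it to the flat list once and skip its remaining slots below */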
322 add = 1;
323 for (int k = 0; k < c; k++) {
324 if (ctx->blocks[k] == ctx->range[i].blocks[j]) {
325 add = 0;
326 break;
327 }
328 }
329 if (add) {
330 assert(c < ctx->nblocks);
331 ctx->blocks[c++] = ctx->range[i].blocks[j];
332 j += (ctx->range[i].blocks[j]->nreg) - 1;
333 }
334 }
335 }
336 return 0;
337 }
338
339 int r600_context_init(struct r600_context *ctx)
340 {
341 int r;
342
343 /* add blocks */
344 r = r600_context_add_block(ctx, r600_context_reg_list,
345 Elements(r600_context_reg_list), PKT3_SET_CONTEXT_REG, R600_CONTEXT_REG_OFFSET);
346 if (r)
347 goto out_err;
348
349 r = r600_setup_block_table(ctx);
350 if (r)
351 goto out_err;
352
353 ctx->max_db = 4;
354 return 0;
355 out_err:
356 r600_context_fini(ctx);
357 return r;
358 }
359
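/* Make sure at least num_dw more dwords can be emitted into the current CS,
 * flushing it first if the worst-case estimate below would overflow
 * RADEON_MAX_CMDBUF_DWORDS. Callers reserve space before writing packets,
 * e.g. r600_context_emit_fence() below reserves 10 dwords. */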
360 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
361 boolean count_draw_in)
362 {
363 /* The number of dwords we already used in the CS so far. */
364 num_dw += ctx->cs->cdw;
365
366 if (count_draw_in) {
367 unsigned i;
368
369 /* The number of dwords all the dirty states would take. */
370 for (i = 0; i < R600_NUM_ATOMS; i++) {
371 if (ctx->atoms[i] && ctx->atoms[i]->dirty) {
372 num_dw += ctx->atoms[i]->num_dw;
373 #if R600_TRACE_CS
374 if (ctx->screen->trace_bo) {
375 num_dw += R600_TRACE_CS_DWORDS;
376 }
377 #endif
378 }
379 }
380
381 num_dw += ctx->pm4_dirty_cdwords;
382
383 /* The upper-bound of how much space a draw command would take. */
384 num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
385 #if R600_TRACE_CS
386 if (ctx->screen->trace_bo) {
387 num_dw += R600_TRACE_CS_DWORDS;
388 }
389 #endif
390 }
391
392 /* Count in queries_suspend. */
393 num_dw += ctx->num_cs_dw_nontimer_queries_suspend;
394
395 /* Count in streamout_end at the end of CS. */
396 num_dw += ctx->num_cs_dw_streamout_end;
397
398 /* Count in render_condition(NULL) at the end of CS. */
399 if (ctx->predicate_drawing) {
400 num_dw += 3;
401 }
402
403 /* SX_MISC */
404 if (ctx->chip_class <= R700) {
405 num_dw += 3;
406 }
407
408 /* Count in framebuffer cache flushes at the end of CS. */
409 num_dw += R600_MAX_FLUSH_CS_DWORDS;
410
411 /* The fence at the end of CS. */
412 num_dw += 10;
413
414 /* Flush if there's not enough space. */
415 if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
416 r600_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
417 }
418 }
419
420 void r600_context_dirty_block(struct r600_context *ctx,
421 struct r600_block *block,
422 int dirty, int index)
423 {
424 if ((index + 1) > block->nreg_dirty)
425 block->nreg_dirty = index + 1;
426
427 if ((dirty != (block->status & R600_BLOCK_STATUS_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) {
428 block->status |= R600_BLOCK_STATUS_DIRTY;
429 ctx->pm4_dirty_cdwords += block->pm4_ndwords;
430 if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
431 block->status |= R600_BLOCK_STATUS_ENABLED;
432 LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
433 }
434 LIST_ADDTAIL(&block->list,&ctx->dirty);
435
436 if (block->flags & REG_FLAG_FLUSH_CHANGE) {
437 ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE;
438 }
439 }
440 }
441
442 /**
443 * If reg needs a reloc, this function will add it to its block's reloc list.
444 * @return true if reg needs a reloc, false otherwise
445 */
446 static bool r600_reg_set_block_reloc(struct r600_pipe_reg *reg)
447 {
448 unsigned reloc_id;
449
450 if (!reg->block->pm4_bo_index[reg->id]) {
451 return false;
452 }
453 /* find relocation */
454 reloc_id = reg->block->pm4_bo_index[reg->id];
455 pipe_resource_reference(
456 (struct pipe_resource**)&reg->block->reloc[reloc_id].bo,
457 &reg->bo->b.b);
458 reg->block->reloc[reloc_id].bo_usage = reg->bo_usage;
459 return true;
460 }
461
462 /**
463 * This function will emit all the registers in state directly to the command
464 * stream allowing you to bypass the r600_context dirty list.
465 *
466 * This is used for dispatching compute shaders to avoid mixing compute and
467 * 3D states in the context's dirty list.
468 *
469 * @param pkt_flags Should be either 0 or RADEON_CP_PACKET3_COMPUTE_MODE. This
470 * value will be passed on to r600_context_block_emit_dirty and OR'd against
471 * the PKT3 headers.
472 */
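/* A hedged usage sketch (the state variable name is hypothetical, the flag is
 * the one documented above):
 *
 *     r600_context_pipe_state_emit(ctx, cs_shader_state,
 *                                  RADEON_CP_PACKET3_COMPUTE_MODE);
 */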
473 void r600_context_pipe_state_emit(struct r600_context *ctx,
474 struct r600_pipe_state *state,
475 unsigned pkt_flags)
476 {
477 unsigned i;
478
479 /* Mark all blocks as dirty:
480 * Since two registers can be in the same block, we need to make sure
481 * we mark all the blocks dirty before we emit any of them. If we were
482 * to mark blocks dirty and emit them in the same loop, like this:
483 *
484 * foreach (reg in state->regs) {
485 * mark_dirty(reg->block)
486 * emit_block(reg->block)
487 * }
488 *
489 * Then if we have two registers in this state that are in the same
490 * block, we would end up emitting that block twice.
491 */
492 for (i = 0; i < state->nregs; i++) {
493 struct r600_pipe_reg *reg = &state->regs[i];
494 /* Mark all the registers in the block as dirty */
495 reg->block->nreg_dirty = reg->block->nreg;
496 reg->block->status |= R600_BLOCK_STATUS_DIRTY;
497 /* Update the reloc for this register if necessary. */
498 r600_reg_set_block_reloc(reg);
499 }
500
501 /* Emit the register writes */
502 for (i = 0; i < state->nregs; i++) {
503 struct r600_pipe_reg *reg = &state->regs[i];
504 if (reg->block->status & R600_BLOCK_STATUS_DIRTY) {
505 r600_context_block_emit_dirty(ctx, reg->block, pkt_flags);
506 }
507 }
508 }
509
510 void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state)
511 {
512 struct r600_block *block;
513 int dirty;
514 for (int i = 0; i < state->nregs; i++) {
515 unsigned id;
516 struct r600_pipe_reg *reg = &state->regs[i];
517
518 block = reg->block;
519 id = reg->id;
520
521 dirty = block->status & R600_BLOCK_STATUS_DIRTY;
522
523 if (reg->value != block->reg[id]) {
524 block->reg[id] = reg->value;
525 dirty |= R600_BLOCK_STATUS_DIRTY;
526 }
527 if (block->flags & REG_FLAG_DIRTY_ALWAYS)
528 dirty |= R600_BLOCK_STATUS_DIRTY;
529 if (r600_reg_set_block_reloc(reg)) {
530 /* always force dirty for relocs for now */
531 dirty |= R600_BLOCK_STATUS_DIRTY;
532 }
533
534 if (dirty)
535 r600_context_dirty_block(ctx, block, dirty, id);
536 }
537 }
538
539 /**
540 * @param pkt_flags should be set to RADEON_CP_PACKET3_COMPUTE_MODE if this
541 * block will be used for compute shaders.
542 */
543 void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block,
544 unsigned pkt_flags)
545 {
546 struct radeon_winsys_cs *cs = ctx->cs;
547 int optional = block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS);
548 int cp_dwords = block->pm4_ndwords, start_dword = 0;
549 int new_dwords = 0;
550 int nbo = block->nbo;
551
552 if (block->nreg_dirty == 0 && optional) {
553 goto out;
554 }
555
556 if (nbo) {
557 for (int j = 0; j < block->nreg; j++) {
558 if (block->pm4_bo_index[j]) {
559 /* find relocation */
560 struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]];
561 if (reloc->bo) {
562 block->pm4[reloc->bo_pm4_index] =
563 r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
564 } else {
565 block->pm4[reloc->bo_pm4_index] = 0;
566 }
567 nbo--;
568 if (nbo == 0)
569 break;
570
571 }
572 }
573 }
574
575 optional &= (block->nreg_dirty != block->nreg);
576 if (optional) {
577 new_dwords = block->nreg_dirty;
578 start_dword = cs->cdw;
579 cp_dwords = new_dwords + 2;
580 }
581 memcpy(&cs->buf[cs->cdw], block->pm4, cp_dwords * 4);
582
583 /* We are applying the pkt_flags after copying the register block to
584 * the command stream, because it is possible this block will be
585 * emitted with a different pkt_flags, and we don't want to store the
586 * pkt_flags in the block.
587 */
588 cs->buf[cs->cdw] |= pkt_flags;
589 cs->cdw += cp_dwords;
590
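/* If only a prefix of the registers was dirty (and the block is "optional"),
 * fewer dwords were copied above; patch the PKT3 count field of the copied
 * header to match. */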
591 if (optional) {
592 uint32_t newword;
593
594 newword = cs->buf[start_dword];
595 newword &= PKT_COUNT_C;
596 newword |= PKT_COUNT_S(new_dwords);
597 cs->buf[start_dword] = newword;
598 }
599 out:
600 block->status ^= R600_BLOCK_STATUS_DIRTY;
601 block->nreg_dirty = 0;
602 LIST_DELINIT(&block->list);
603 }
604
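/* Translate the flush flags accumulated in rctx->flags into EVENT_WRITE,
 * SURFACE_SYNC and WAIT_UNTIL packets, then clear the flags. */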
605 void r600_flush_emit(struct r600_context *rctx)
606 {
607 struct radeon_winsys_cs *cs = rctx->cs;
608 unsigned cp_coher_cntl = 0;
609 unsigned wait_until = 0;
610 unsigned emit_flush = 0;
611
612 if (!rctx->flags) {
613 return;
614 }
615
616 if (rctx->chip_class >= R700 &&
617 (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_CB_META)) {
618 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
619 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0);
620 }
621
622 if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV) {
623 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
624 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
625 if (rctx->chip_class >= EVERGREEN) {
626 cp_coher_cntl = S_0085F0_CB0_DEST_BASE_ENA(1) |
627 S_0085F0_CB1_DEST_BASE_ENA(1) |
628 S_0085F0_CB2_DEST_BASE_ENA(1) |
629 S_0085F0_CB3_DEST_BASE_ENA(1) |
630 S_0085F0_CB4_DEST_BASE_ENA(1) |
631 S_0085F0_CB5_DEST_BASE_ENA(1) |
632 S_0085F0_CB6_DEST_BASE_ENA(1) |
633 S_0085F0_CB7_DEST_BASE_ENA(1) |
634 S_0085F0_CB8_DEST_BASE_ENA(1) |
635 S_0085F0_CB9_DEST_BASE_ENA(1) |
636 S_0085F0_CB10_DEST_BASE_ENA(1) |
637 S_0085F0_CB11_DEST_BASE_ENA(1) |
638 S_0085F0_DB_DEST_BASE_ENA(1) |
639 S_0085F0_TC_ACTION_ENA(1) |
640 S_0085F0_CB_ACTION_ENA(1) |
641 S_0085F0_DB_ACTION_ENA(1) |
642 S_0085F0_SH_ACTION_ENA(1) |
643 S_0085F0_SMX_ACTION_ENA(1) |
644 S_0085F0_FULL_CACHE_ENA(1);
645 } else {
646 cp_coher_cntl = S_0085F0_SMX_ACTION_ENA(1) |
647 S_0085F0_SH_ACTION_ENA(1) |
648 S_0085F0_VC_ACTION_ENA(1) |
649 S_0085F0_TC_ACTION_ENA(1) |
650 S_0085F0_FULL_CACHE_ENA(1);
651 }
652 }
653
654 if (rctx->flags & R600_CONTEXT_INVAL_READ_CACHES) {
655 cp_coher_cntl |= S_0085F0_VC_ACTION_ENA(1) |
656 S_0085F0_TC_ACTION_ENA(1) |
657 S_0085F0_FULL_CACHE_ENA(1);
658 emit_flush = 1;
659 }
660
661 if (rctx->family >= CHIP_RV770 && rctx->flags & R600_CONTEXT_STREAMOUT_FLUSH) {
662 cp_coher_cntl |= S_0085F0_SO0_DEST_BASE_ENA(1) |
663 S_0085F0_SO1_DEST_BASE_ENA(1) |
664 S_0085F0_SO2_DEST_BASE_ENA(1) |
665 S_0085F0_SO3_DEST_BASE_ENA(1) |
666 S_0085F0_SMX_ACTION_ENA(1);
667 emit_flush = 1;
668 }
669
670 if (emit_flush) {
671 cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
672 cs->buf[cs->cdw++] = cp_coher_cntl; /* CP_COHER_CNTL */
673 cs->buf[cs->cdw++] = 0xffffffff; /* CP_COHER_SIZE */
674 cs->buf[cs->cdw++] = 0; /* CP_COHER_BASE */
675 cs->buf[cs->cdw++] = 0x0000000A; /* POLL_INTERVAL */
676 }
677
678 if (rctx->flags & R600_CONTEXT_WAIT_3D_IDLE) {
679 wait_until |= S_008040_WAIT_3D_IDLE(1);
680 }
681 if (rctx->flags & R600_CONTEXT_WAIT_CP_DMA_IDLE) {
682 wait_until |= S_008040_WAIT_CP_DMA_IDLE(1);
683 }
684 if (wait_until) {
685 /* wait for things to settle */
686 r600_write_config_reg(cs, R_008040_WAIT_UNTIL, wait_until);
687 }
688
689 /* everything is properly flushed */
690 rctx->flags = 0;
691 }
692
693 void r600_context_flush(struct r600_context *ctx, unsigned flags)
694 {
695 struct radeon_winsys_cs *cs = ctx->cs;
696
697 if (cs->cdw == ctx->start_cs_cmd.num_dw)
698 return;
699
700 ctx->nontimer_queries_suspended = false;
701 ctx->streamout_suspended = false;
702
703 /* suspend queries */
704 if (ctx->num_cs_dw_nontimer_queries_suspend) {
705 r600_suspend_nontimer_queries(ctx);
706 ctx->nontimer_queries_suspended = true;
707 }
708
709 if (ctx->num_cs_dw_streamout_end) {
710 r600_context_streamout_end(ctx);
711 ctx->streamout_suspended = true;
712 }
713
714 /* flush is needed to avoid lockups on some chips with user fences
715 * this will also flush the framebuffer cache
716 */
717 ctx->flags |= R600_CONTEXT_FLUSH_AND_INV |
718 R600_CONTEXT_FLUSH_AND_INV_CB_META |
719 R600_CONTEXT_WAIT_3D_IDLE |
720 R600_CONTEXT_WAIT_CP_DMA_IDLE;
721
722 r600_flush_emit(ctx);
723
724 /* old kernels and userspace don't set SX_MISC, so we must reset it to 0 here */
725 if (ctx->chip_class <= R700) {
726 r600_write_context_reg(cs, R_028350_SX_MISC, 0);
727 }
728
729 /* force keeping the tiling flags */
730 if (ctx->keep_tiling_flags) {
731 flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
732 }
733
734 /* Flush the CS. */
735 #if R600_TRACE_CS
736 if (ctx->screen->trace_bo) {
737 struct r600_screen *rscreen = ctx->screen;
738 unsigned i;
739
740 for (i = 0; i < cs->cdw; i++) {
741 fprintf(stderr, "[%4d] [%5d] 0x%08x\n", rscreen->cs_count, i, cs->buf[i]);
742 }
743 rscreen->cs_count++;
744 }
745 #endif
746 ctx->ws->cs_flush(ctx->cs, flags);
747 #if R600_TRACE_CS
748 if (ctx->screen->trace_bo) {
749 struct r600_screen *rscreen = ctx->screen;
750 unsigned i;
751
752 for (i = 0; i < 10; i++) {
753 usleep(5);
754 if (!ctx->ws->buffer_is_busy(rscreen->trace_bo->buf, RADEON_USAGE_READWRITE)) {
755 break;
756 }
757 }
758 if (i == 10) {
759 fprintf(stderr, "timeout on cs lockup likely happen at cs %d dw %d\n",
760 rscreen->trace_ptr[1], rscreen->trace_ptr[0]);
761 } else {
762 fprintf(stderr, "cs %d executed in %dms\n", rscreen->trace_ptr[1], i * 5);
763 }
764 }
765 #endif
766
767 r600_begin_new_cs(ctx);
768 }
769
770 void r600_begin_new_cs(struct r600_context *ctx)
771 {
772 struct r600_block *enable_block = NULL;
773 unsigned shader;
774
775 ctx->pm4_dirty_cdwords = 0;
776 ctx->flags = 0;
777
778 /* Begin a new CS. */
779 r600_emit_command_buffer(ctx->cs, &ctx->start_cs_cmd);
780
781 /* Re-emit states. */
782 ctx->alphatest_state.atom.dirty = true;
783 ctx->blend_color.atom.dirty = true;
784 ctx->cb_misc_state.atom.dirty = true;
785 ctx->clip_misc_state.atom.dirty = true;
786 ctx->clip_state.atom.dirty = true;
787 ctx->db_misc_state.atom.dirty = true;
788 ctx->db_state.atom.dirty = true;
789 ctx->framebuffer.atom.dirty = true;
790 ctx->poly_offset_state.atom.dirty = true;
791 ctx->vgt_state.atom.dirty = true;
792 ctx->vgt2_state.atom.dirty = true;
793 ctx->sample_mask.atom.dirty = true;
794 ctx->scissor.atom.dirty = true;
795 ctx->config_state.atom.dirty = true;
796 ctx->stencil_ref.atom.dirty = true;
797 ctx->vertex_fetch_shader.atom.dirty = true;
798 ctx->viewport.atom.dirty = true;
799
800 if (ctx->blend_state.cso)
801 ctx->blend_state.atom.dirty = true;
802 if (ctx->dsa_state.cso)
803 ctx->dsa_state.atom.dirty = true;
804 if (ctx->rasterizer_state.cso)
805 ctx->rasterizer_state.atom.dirty = true;
806
807 if (ctx->chip_class <= R700) {
808 ctx->seamless_cube_map.atom.dirty = true;
809 }
810
811 ctx->vertex_buffer_state.dirty_mask = ctx->vertex_buffer_state.enabled_mask;
812 r600_vertex_buffers_dirty(ctx);
813
814 /* Re-emit shader resources. */
815 for (shader = 0; shader < PIPE_SHADER_TYPES; shader++) {
816 struct r600_constbuf_state *constbuf = &ctx->constbuf_state[shader];
817 struct r600_textures_info *samplers = &ctx->samplers[shader];
818
819 constbuf->dirty_mask = constbuf->enabled_mask;
820 samplers->views.dirty_mask = samplers->views.enabled_mask;
821 samplers->states.dirty_mask = samplers->states.enabled_mask;
822
823 r600_constant_buffers_dirty(ctx, constbuf);
824 r600_sampler_views_dirty(ctx, &samplers->views);
825 r600_sampler_states_dirty(ctx, &samplers->states);
826 }
827
828 if (ctx->streamout_suspended) {
829 ctx->streamout_start = TRUE;
830 ctx->streamout_append_bitmask = ~0;
831 }
832
833 /* resume queries */
834 if (ctx->nontimer_queries_suspended) {
835 r600_resume_nontimer_queries(ctx);
836 }
837
838 /* set all valid groups as dirty so they get re-emitted on
839 * the next draw command
840 */
841 LIST_FOR_EACH_ENTRY(enable_block, &ctx->enable_list, enable_list) {
842 if(!(enable_block->status & R600_BLOCK_STATUS_DIRTY)) {
843 LIST_ADDTAIL(&enable_block->list,&ctx->dirty);
844 enable_block->status |= R600_BLOCK_STATUS_DIRTY;
845 }
846 ctx->pm4_dirty_cdwords += enable_block->pm4_ndwords;
847 enable_block->nreg_dirty = enable_block->nreg;
848 }
849
850 /* Re-emit the draw state. */
851 ctx->last_primitive_type = -1;
852 ctx->last_start_instance = -1;
853 }
854
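/* Emit an EOP event that makes the GPU write 'value' to fence_bo at the given
 * dword offset once prior rendering has been flushed (offset is in dwords,
 * hence the << 2 below). */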
855 void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence_bo, unsigned offset, unsigned value)
856 {
857 struct radeon_winsys_cs *cs = ctx->cs;
858 uint64_t va;
859
860 r600_need_cs_space(ctx, 10, FALSE);
861
862 va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
863 va = va + (offset << 2);
864
865 r600_write_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1));
866
867 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
868 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
869 cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
870 /* DATA_SEL | INT_EN | ADDRESS_HI */
871 cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
872 cs->buf[cs->cdw++] = value; /* DATA_LO */
873 cs->buf[cs->cdw++] = 0; /* DATA_HI */
874 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
875 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
876 }
877
878 static void r600_flush_vgt_streamout(struct r600_context *ctx)
879 {
880 struct radeon_winsys_cs *cs = ctx->cs;
881
882 r600_write_config_reg(cs, R_008490_CP_STRMOUT_CNTL, 0);
883
884 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
885 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0);
886
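/* wait until the CP sets OFFSET_UPDATE_DONE in CP_STRMOUT_CNTL, signalling
 * that the streamout flush requested above has completed */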
887 cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
888 cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
889 cs->buf[cs->cdw++] = R_008490_CP_STRMOUT_CNTL >> 2; /* register */
890 cs->buf[cs->cdw++] = 0;
891 cs->buf[cs->cdw++] = S_008490_OFFSET_UPDATE_DONE(1); /* reference value */
892 cs->buf[cs->cdw++] = S_008490_OFFSET_UPDATE_DONE(1); /* mask */
893 cs->buf[cs->cdw++] = 4; /* poll interval */
894 }
895
896 static void r600_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit)
897 {
898 struct radeon_winsys_cs *cs = ctx->cs;
899
900 if (buffer_enable_bit) {
901 r600_write_context_reg(cs, R_028AB0_VGT_STRMOUT_EN, S_028AB0_STREAMOUT(1));
902 r600_write_context_reg(cs, R_028B20_VGT_STRMOUT_BUFFER_EN, buffer_enable_bit);
903 } else {
904 r600_write_context_reg(cs, R_028AB0_VGT_STRMOUT_EN, S_028AB0_STREAMOUT(0));
905 }
906 }
907
908 void r600_context_streamout_begin(struct r600_context *ctx)
909 {
910 struct radeon_winsys_cs *cs = ctx->cs;
911 struct r600_so_target **t = ctx->so_targets;
912 unsigned *stride_in_dw = ctx->vs_shader->so.stride;
913 unsigned buffer_en, i, update_flags = 0;
914 uint64_t va;
915 unsigned num_cs_dw_streamout_end;
916
917 buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
918 (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
919 (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
920 (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);
921
922 num_cs_dw_streamout_end =
923 12 + /* flush_vgt_streamout */
924 util_bitcount(buffer_en) * 8 + /* STRMOUT_BUFFER_UPDATE */
925 3 /* set_streamout_enable(0) */;
926
927 r600_need_cs_space(ctx,
928 12 + /* flush_vgt_streamout */
929 6 + /* set_streamout_enable */
930 util_bitcount(buffer_en) * 7 + /* SET_CONTEXT_REG */
931 (ctx->family >= CHIP_RS780 &&
932 ctx->family <= CHIP_RV740 ? util_bitcount(buffer_en) * 5 : 0) + /* STRMOUT_BASE_UPDATE */
933 util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 + /* STRMOUT_BUFFER_UPDATE */
934 util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 + /* STRMOUT_BUFFER_UPDATE */
935 (ctx->family > CHIP_R600 && ctx->family < CHIP_RS780 ? 2 : 0) + /* SURFACE_BASE_UPDATE */
936 num_cs_dw_streamout_end, TRUE);
937
938 /* This must be set after r600_need_cs_space. */
939 ctx->num_cs_dw_streamout_end = num_cs_dw_streamout_end;
940
941 if (ctx->chip_class >= EVERGREEN) {
942 evergreen_flush_vgt_streamout(ctx);
943 evergreen_set_streamout_enable(ctx, buffer_en);
944 } else {
945 r600_flush_vgt_streamout(ctx);
946 r600_set_streamout_enable(ctx, buffer_en);
947 }
948
949 for (i = 0; i < ctx->num_so_targets; i++) {
950 if (t[i]) {
951 t[i]->stride_in_dw = stride_in_dw[i];
952 t[i]->so_index = i;
953 va = r600_resource_va(&ctx->screen->screen,
954 (void*)t[i]->b.buffer);
955
956 update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);
957
958 r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3);
959 r600_write_value(cs, (t[i]->b.buffer_offset +
960 t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
961 r600_write_value(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
962 r600_write_value(cs, va >> 8); /* BUFFER_BASE */
963
964 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
965 cs->buf[cs->cdw++] =
966 r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
967 RADEON_USAGE_WRITE);
968
969 /* R7xx requires this packet after updating BUFFER_BASE.
970 * Without this, R7xx locks up. */
971 if (ctx->family >= CHIP_RS780 && ctx->family <= CHIP_RV740) {
972 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BASE_UPDATE, 1, 0);
973 cs->buf[cs->cdw++] = i;
974 cs->buf[cs->cdw++] = va >> 8;
975
976 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
977 cs->buf[cs->cdw++] =
978 r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
979 RADEON_USAGE_WRITE);
980 }
981
982 if (ctx->streamout_append_bitmask & (1 << i)) {
983 va = r600_resource_va(&ctx->screen->screen,
984 (void*)t[i]->buf_filled_size) + t[i]->buf_filled_size_offset;
985 /* Append. */
986 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
987 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
988 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
989 cs->buf[cs->cdw++] = 0; /* unused */
990 cs->buf[cs->cdw++] = 0; /* unused */
991 cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* src address lo */
992 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFUL; /* src address hi */
993
994 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
995 cs->buf[cs->cdw++] =
996 r600_context_bo_reloc(ctx, t[i]->buf_filled_size,
997 RADEON_USAGE_READ);
998 } else {
999 /* Start from the beginning. */
1000 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
1001 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
1002 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
1003 cs->buf[cs->cdw++] = 0; /* unused */
1004 cs->buf[cs->cdw++] = 0; /* unused */
1005 cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
1006 cs->buf[cs->cdw++] = 0; /* unused */
1007 }
1008 }
1009 }
1010
1011 if (ctx->family > CHIP_R600 && ctx->family < CHIP_RS780) {
1012 cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0);
1013 cs->buf[cs->cdw++] = update_flags;
1014 }
1015 }
1016
1017 void r600_context_streamout_end(struct r600_context *ctx)
1018 {
1019 struct radeon_winsys_cs *cs = ctx->cs;
1020 struct r600_so_target **t = ctx->so_targets;
1021 unsigned i;
1022 uint64_t va;
1023
1024 if (ctx->chip_class >= EVERGREEN) {
1025 evergreen_flush_vgt_streamout(ctx);
1026 } else {
1027 r600_flush_vgt_streamout(ctx);
1028 }
1029
1030 for (i = 0; i < ctx->num_so_targets; i++) {
1031 if (t[i]) {
1032 va = r600_resource_va(&ctx->screen->screen,
1033 (void*)t[i]->buf_filled_size) + t[i]->buf_filled_size_offset;
1034 cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
1035 cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
1036 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
1037 STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
1038 cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* dst address lo */
1039 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFUL; /* dst address hi */
1040 cs->buf[cs->cdw++] = 0; /* unused */
1041 cs->buf[cs->cdw++] = 0; /* unused */
1042
1043 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
1044 cs->buf[cs->cdw++] =
1045 r600_context_bo_reloc(ctx, t[i]->buf_filled_size,
1046 RADEON_USAGE_WRITE);
1047
1048 }
1049 }
1050
1051 if (ctx->chip_class >= EVERGREEN) {
1052 ctx->flags |= R600_CONTEXT_STREAMOUT_FLUSH;
1053 evergreen_set_streamout_enable(ctx, 0);
1054 } else {
1055 if (ctx->chip_class >= R700) {
1056 ctx->flags |= R600_CONTEXT_STREAMOUT_FLUSH;
1057 }
1058 r600_set_streamout_enable(ctx, 0);
1059 }
1060 ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
1061 ctx->num_cs_dw_streamout_end = 0;
1062 }
1063
1064 /* The max number of bytes to copy per packet. */
1065 #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
1066
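/* Copy 'size' bytes between two buffers with CP DMA. The copy is split into
 * packets of at most CP_DMA_MAX_BYTE_COUNT bytes (the byte-count field of the
 * packet is 21 bits wide, see the COMMAND/BYTE_COUNT dword below; keeping a
 * small margin below the field maximum is assumed to be a safety margin),
 * and only the last packet sets CP_SYNC. A hypothetical call:
 *
 *     r600_cp_dma_copy_buffer(rctx, dst, 0, src, 0, 4096);
 */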
1067 void r600_cp_dma_copy_buffer(struct r600_context *rctx,
1068 struct pipe_resource *dst, uint64_t dst_offset,
1069 struct pipe_resource *src, uint64_t src_offset,
1070 unsigned size)
1071 {
1072 struct radeon_winsys_cs *cs = rctx->cs;
1073
1074 assert(size);
1075 assert(rctx->chip_class != R600);
1076
1077 /* CP DMA doesn't work on R600 (flushing seems to be unreliable). */
1078 if (rctx->chip_class == R600) {
1079 return;
1080 }
1081
1082 dst_offset += r600_resource_va(&rctx->screen->screen, dst);
1083 src_offset += r600_resource_va(&rctx->screen->screen, src);
1084
1085 /* We flush the caches, because we might read from or write
1086 * to resources which are bound right now. */
1087 rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES |
1088 R600_CONTEXT_FLUSH_AND_INV |
1089 R600_CONTEXT_FLUSH_AND_INV_CB_META |
1090 R600_CONTEXT_STREAMOUT_FLUSH |
1091 R600_CONTEXT_WAIT_3D_IDLE;
1092
1093 /* There are differences between R700 and EG in CP DMA,
1094 * but we only use the common bits here. */
1095 while (size) {
1096 unsigned sync = 0;
1097 unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
1098 unsigned src_reloc, dst_reloc;
1099
1100 r600_need_cs_space(rctx, 10 + (rctx->flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE);
1101
1102 /* Flush the caches for the first copy only. */
1103 if (rctx->flags) {
1104 r600_flush_emit(rctx);
1105 }
1106
1107 /* Do the synchronization after the last copy, so that all data is written to memory. */
1108 if (size == byte_count) {
1109 sync = PKT3_CP_DMA_CP_SYNC;
1110 }
1111
1112 /* This must be done after r600_need_cs_space. */
1113 src_reloc = r600_context_bo_reloc(rctx, (struct r600_resource*)src, RADEON_USAGE_READ);
1114 dst_reloc = r600_context_bo_reloc(rctx, (struct r600_resource*)dst, RADEON_USAGE_WRITE);
1115
1116 r600_write_value(cs, PKT3(PKT3_CP_DMA, 4, 0));
1117 r600_write_value(cs, src_offset); /* SRC_ADDR_LO [31:0] */
1118 r600_write_value(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
1119 r600_write_value(cs, dst_offset); /* DST_ADDR_LO [31:0] */
1120 r600_write_value(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */
1121 r600_write_value(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
1122
1123 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0));
1124 r600_write_value(cs, src_reloc);
1125 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0));
1126 r600_write_value(cs, dst_reloc);
1127
1128 size -= byte_count;
1129 src_offset += byte_count;
1130 dst_offset += byte_count;
1131 }
1132 }