/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Marek Olšák <marek.olsak@amd.com>
 */
/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
 * live in memory on SI.
 *
 * This file is responsible for managing lists of resources and sampler states
 * in memory and binding them, which means updating those structures in memory.
 *
 * There is also code for updating shader pointers to resources and sampler
 * states. CP DMA functions are here too.
 */
#include "radeon/r600_cs.h"
#include "si_pipe.h"
#include "si_shader.h"
#include "sid.h"

#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#define SI_NUM_CONTEXTS 16
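/* Each descriptor list is stored as SI_NUM_CONTEXTS copies ("context
 * slots") in one buffer. Updates are written to the next slot in
 * round-robin order (see si_emit_descriptors below), so in-flight draws
 * can keep reading the previous slot without a synchronization.
 */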
static uint32_t null_desc[8]; /* zeros */
/* Set this if you want the 3D engine to wait until CP DMA is done.
 * It should be set on the last CP DMA packet. */
#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */

/* Set this if the source data was used as a destination in a previous CP DMA
 * packet. It's for preventing a read-after-write (RAW) hazard between two
 * CP DMA packets. */
#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
#define CIK_CP_DMA_USE_L2	(1 << 2)
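/* Example of how the flags combine when a large copy or clear is split
 * into several packets (see si_copy_buffer and si_clear_buffer below):
 * every packet carries CIK_CP_DMA_USE_L2 if the data should go through
 * TC L2, the packet emitted right after a cache flush adds
 * SI_CP_DMA_RAW_WAIT, and only the last packet adds R600_CP_DMA_SYNC.
 */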
/* Emit a CP DMA packet to do a copy from one buffer to another.
 * The size must fit in bits [20:0].
 */
static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
					uint64_t dst_va, uint64_t src_va,
					unsigned size, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
			   PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	}
}
/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
					uint64_t dst_va, unsigned size,
					uint32_t clear_value, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, 0);			/* unused */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	}
}
static void si_init_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				unsigned shader_userdata_reg,
				unsigned element_dw_size,
				unsigned num_elements,
				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
{
	assert(num_elements <= sizeof(desc->enabled_mask)*8);
	assert(num_elements <= sizeof(desc->dirty_mask)*8);

	desc->atom.emit = (void*)emit_func;
	desc->shader_userdata_reg = shader_userdata_reg;
	desc->element_dw_size = element_dw_size;
	desc->num_elements = num_elements;
	desc->context_size = num_elements * element_dw_size * 4;

	desc->buffer = (struct r600_resource*)
		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
				   PIPE_USAGE_DEFAULT,
				   SI_NUM_CONTEXTS * desc->context_size);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	/* We don't check for CS space here, because this should be called
	 * only once at context initialization. */
	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
				    desc->buffer->b.b.width0, 0,
				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
}
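/* Sizing example, derived from the arithmetic above: sampler view lists
 * use element_dw_size = 8 (see si_init_sampler_views), so if a list had
 * e.g. 16 elements, one context slot would take 16 * 8 * 4 = 512 bytes
 * and the whole buffer SI_NUM_CONTEXTS * 512 = 8192 bytes.
 */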
static void si_release_descriptors(struct si_descriptors *desc)
{
	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
}
static void si_update_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc)
{
	if (desc->dirty_mask) {
		desc->atom.num_dw =
			7 + /* copy */
			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask) + /* update */
			4; /* pointer update */

		if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
		    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0)
			desc->atom.num_dw += 4; /* second pointer update */

		desc->atom.dirty = true;

		/* TODO: Investigate if these flushes can be removed after
		 * adding CE support. */

		/* The descriptors are read with the K cache. */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;

		/* Since SI uses uncached CP DMA to update descriptors,
		 * we have to flush TC L2, which is used to fetch constants
		 * along with KCACHE. */
		if (sctx->b.chip_class == SI)
			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
	} else {
		desc->atom.dirty = false;
	}
}
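/* Example of the accounting above: with element_dw_size = 8 and three
 * dirty elements, num_dw = 7 + 3 * (4 + 8) + 4 = 47 dwords, plus 4 more
 * when the second (ES) pointer has to be emitted.
 */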
static void si_emit_shader_pointer(struct si_context *sctx,
				   struct r600_atom *atom)
{
	struct si_descriptors *desc = (struct si_descriptors*)atom;
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va = desc->buffer->gpu_address +
		      desc->current_context_id * desc->context_size +
		      desc->buffer_offset;

	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);

	if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
	    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) {
		radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
		radeon_emit(cs, (desc->shader_userdata_reg +
				 (R_00B330_SPI_SHADER_USER_DATA_ES_0 -
				  R_00B130_SPI_SHADER_USER_DATA_VS_0) -
				 SI_SH_REG_OFFSET) >> 2);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
	}
}
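/* The VS user data pointer is mirrored into the ES range above,
 * presumably because the same descriptors must stay reachable when the
 * vertex shader runs as an ES (with a geometry shader bound), whose
 * user SGPRs live at R_00B330_SPI_SHADER_USER_DATA_ES_0.
 */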
static void si_emit_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				uint32_t **descriptors)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va_base;
	int packet_start = 0;
	int packet_size = 0;
	int last_index = desc->num_elements; /* point to a non-existing element */
	unsigned dirty_mask = desc->dirty_mask;
	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;

	assert(dirty_mask);

	va_base = desc->buffer->gpu_address;

	/* Copy the descriptors to a new context slot. */
	si_emit_cp_dma_copy_buffer(sctx,
				   va_base + new_context_id * desc->context_size,
				   va_base + desc->current_context_id * desc->context_size,
				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);

	va_base += new_context_id * desc->context_size;

	/* Update the descriptors.
	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
	 *
	 * XXX When unbinding lots of resources, consider clearing the memory
	 *     with CP DMA instead of emitting zeros.
	 */
	while (dirty_mask) {
		int i = u_bit_scan(&dirty_mask);

		assert(i < desc->num_elements);

		if (last_index+1 == i && packet_size) {
			/* Append new data at the end of the last packet. */
			packet_size += desc->element_dw_size;
			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
		} else {
			/* Start a new packet. */
			uint64_t va = va_base + i * desc->element_dw_size * 4;

			packet_start = cs->cdw;
			packet_size = 2 + desc->element_dw_size;

			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
							PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
							PKT3_WRITE_DATA_DST_SEL_TC_L2) |
					PKT3_WRITE_DATA_WR_CONFIRM |
					PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
			radeon_emit(cs, va & 0xFFFFFFFFUL);
			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
		}

		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);

		last_index = i;
	}

	desc->dirty_mask = 0;
	desc->current_context_id = new_context_id;

	/* Now update the shader userdata pointer. */
	si_emit_shader_pointer(sctx, &desc->atom);
}
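/* Merging example: if elements 3, 4 and 5 are dirty, element 3 starts a
 * WRITE_DATA packet and elements 4 and 5 only extend packet_size by
 * patching the packet header in place, so all three descriptors are
 * written by a single packet instead of three.
 */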
static unsigned si_get_shader_user_data_base(unsigned shader)
{
	switch (shader) {
	case PIPE_SHADER_VERTEX:
		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
	case PIPE_SHADER_GEOMETRY:
		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
	case PIPE_SHADER_FRAGMENT:
		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
	default:
		assert(0);
		return 0;
	}
}
/* SAMPLER VIEWS */

static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_views *views = (struct si_sampler_views*)atom;

	si_emit_descriptors(sctx, &views->desc, views->desc_data);
}
static void si_init_sampler_views(struct si_context *sctx,
				  struct si_sampler_views *views,
				  unsigned shader)
{
	si_init_descriptors(sctx, &views->desc,
			    si_get_shader_user_data_base(shader) +
			    SI_SGPR_RESOURCE * 4,
			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
}
static void si_release_sampler_views(struct si_sampler_views *views)
{
	int i;

	for (i = 0; i < Elements(views->views); i++) {
		pipe_sampler_view_reference(&views->views[i], NULL);
	}
	si_release_descriptors(&views->desc);
}
static enum radeon_bo_priority si_get_resource_ro_priority(struct r600_resource *res)
{
	if (res->b.b.target == PIPE_BUFFER)
		return RADEON_PRIO_SHADER_BUFFER_RO;

	if (res->b.b.nr_samples > 1)
		return RADEON_PRIO_SHADER_TEXTURE_MSAA;

	return RADEON_PRIO_SHADER_TEXTURE_RO;
}
static void si_sampler_views_begin_new_cs(struct si_context *sctx,
					  struct si_sampler_views *views)
{
	unsigned mask = views->desc.enabled_mask;

	/* Add relocations to the CS. */
	while (mask) {
		int i = u_bit_scan(&mask);
		struct si_sampler_view *rview =
			(struct si_sampler_view*)views->views[i];

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      rview->resource, RADEON_USAGE_READ,
				      si_get_resource_ro_priority(rview->resource));
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &views->desc.atom);
}
static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
				unsigned slot, struct pipe_sampler_view *view,
				unsigned *view_desc)
{
	struct si_sampler_views *views = &sctx->samplers[shader].views;

	if (views->views[slot] == view)
		return;

	if (view) {
		struct si_sampler_view *rview =
			(struct si_sampler_view*)view;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      rview->resource, RADEON_USAGE_READ,
				      si_get_resource_ro_priority(rview->resource));

		pipe_sampler_view_reference(&views->views[slot], view);
		views->desc_data[slot] = view_desc;
		views->desc.enabled_mask |= 1 << slot;
	} else {
		pipe_sampler_view_reference(&views->views[slot], NULL);
		views->desc_data[slot] = null_desc;
		views->desc.enabled_mask &= ~(1 << slot);
	}

	views->desc.dirty_mask |= 1 << slot;
}
static void si_set_sampler_views(struct pipe_context *ctx,
				 unsigned shader, unsigned start,
				 unsigned count,
				 struct pipe_sampler_view **views)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_textures_info *samplers = &sctx->samplers[shader];
	struct si_sampler_view **rviews = (struct si_sampler_view **)views;
	int i;

	if (!count || shader >= SI_NUM_SHADERS)
		return;

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!views[i]) {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, shader, slot, NULL, NULL);
			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
					    NULL, NULL);
			continue;
		}

		si_set_sampler_view(sctx, shader, slot, views[i], rviews[i]->state);

		if (views[i]->texture->target != PIPE_BUFFER) {
			struct r600_texture *rtex =
				(struct r600_texture*)views[i]->texture;

			if (rtex->is_depth && !rtex->is_flushing_texture) {
				samplers->depth_texture_mask |= 1 << slot;
			} else {
				samplers->depth_texture_mask &= ~(1 << slot);
			}
			if (rtex->cmask.size || rtex->fmask.size) {
				samplers->compressed_colortex_mask |= 1 << slot;
			} else {
				samplers->compressed_colortex_mask &= ~(1 << slot);
			}

			if (rtex->fmask.size) {
				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
						    views[i], rviews[i]->fmask_state);
			} else {
				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
						    NULL, NULL);
			}
		} else {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
					    NULL, NULL);
		}
	}

	si_update_descriptors(sctx, &samplers->views.desc);
}
/* SAMPLER STATES */

static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_states *states = (struct si_sampler_states*)atom;

	si_emit_descriptors(sctx, &states->desc, states->desc_data);
}
static void si_sampler_states_begin_new_cs(struct si_context *sctx,
					   struct si_sampler_states *states)
{
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
	si_emit_shader_pointer(sctx, &states->desc.atom);
}
void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
				unsigned start, unsigned count, void **states)
{
	struct si_sampler_states *samplers = &sctx->samplers[shader].states;
	struct si_sampler_state **sstates = (struct si_sampler_state**)states;
	int i;

	if (start == 0)
		samplers->saved_states[0] = states[0];
	if (start == 1)
		samplers->saved_states[1] = states[0];
	else if (start == 0 && count >= 2)
		samplers->saved_states[1] = states[1];

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!sstates[i]) {
			samplers->desc.dirty_mask &= ~(1 << slot);
			continue;
		}

		samplers->desc_data[slot] = sstates[i]->val;
		samplers->desc.dirty_mask |= 1 << slot;
	}

	si_update_descriptors(sctx, &samplers->desc);
}
/* BUFFER RESOURCES */
static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;

	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
}
static void si_init_buffer_resources(struct si_context *sctx,
				     struct si_buffer_resources *buffers,
				     unsigned num_buffers, unsigned shader,
				     unsigned shader_userdata_index,
				     enum radeon_bo_usage shader_usage,
				     enum radeon_bo_priority priority)
{
	int i;

	buffers->num_buffers = num_buffers;
	buffers->shader_usage = shader_usage;
	buffers->priority = priority;
	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);

	/* si_emit_descriptors only accepts an array of arrays.
	 * This adds such an array. */
	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
	for (i = 0; i < num_buffers; i++) {
		buffers->desc_data[i] = &buffers->desc_storage[i*4];
	}

	si_init_descriptors(sctx, &buffers->desc,
			    si_get_shader_user_data_base(shader) +
			    shader_userdata_index*4, 4, num_buffers,
			    si_emit_buffer_resources);
}
static void si_release_buffer_resources(struct si_buffer_resources *buffers)
{
	int i;

	for (i = 0; i < buffers->num_buffers; i++) {
		pipe_resource_reference(&buffers->buffers[i], NULL);
	}

	FREE(buffers->buffers);
	FREE(buffers->desc_storage);
	FREE(buffers->desc_data);
	si_release_descriptors(&buffers->desc);
}
static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
					     struct si_buffer_resources *buffers)
{
	unsigned mask = buffers->desc.enabled_mask;

	/* Add relocations to the CS. */
	while (mask) {
		int i = u_bit_scan(&mask);

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffers->buffers[i],
				      buffers->shader_usage, buffers->priority);
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
			      RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &buffers->desc.atom);
}

/* VERTEX BUFFERS */
static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
	int i;

	for (i = 0; i < count; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
				      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &desc->atom);
}
void si_update_vertex_buffers(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
	unsigned i, count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
	uint64_t va;
	uint32_t *ptr;

	if (!count)
		return;

	/* Vertex buffer descriptors are the only ones which are uploaded
	 * directly through a staging buffer and don't go through
	 * the fine-grained upload path.
	 */
	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	assert(count <= SI_NUM_VERTEX_BUFFERS);
	assert(desc->current_context_id == 0);

	for (i = 0; i < count; i++) {
		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
		struct pipe_vertex_buffer *vb;
		struct r600_resource *rbuffer;
		unsigned offset;
		uint32_t *desc = &ptr[i*4];

		if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
			memset(desc, 0, 16);
			continue;
		}

		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
		rbuffer = (struct r600_resource*)vb->buffer;
		if (rbuffer == NULL) {
			memset(desc, 0, 16);
			continue;
		}

		offset = vb->buffer_offset + ve->src_offset;
		va = rbuffer->gpu_address + offset;

		/* Fill in T# buffer resource description */
		desc[0] = va & 0xFFFFFFFF;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(vb->stride);
		if (vb->stride)
			/* Round up by rounding down and adding 1 */
			desc[2] = (vb->buffer->width0 - offset -
				   sctx->vertex_elements->format_size[i]) /
				  vb->stride + 1;
		else
			desc[2] = vb->buffer->width0 - offset;
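		/* Worked example for the round-up above: width0 = 100,
		 * offset = 0, stride = 16, format_size = 8 gives
		 * (100 - 0 - 8) / 16 + 1 = 6 records. The last of those
		 * starts at byte 80 and ends at byte 87, inside the buffer,
		 * while a 7th record starting at byte 96 would overflow it.
		 */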
		desc[3] = sctx->vertex_elements->rsrc_word3[i];

		if (!bound[ve->vertex_buffer_index]) {
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)vb->buffer,
					      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
			bound[ve->vertex_buffer_index] = true;
		}
	}

	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
	desc->atom.dirty = true;

	/* Don't flush the const cache. It would have a very negative effect
	 * on performance (confirmed by testing). New descriptors are always
	 * uploaded to a fresh new buffer, so I don't think flushing the const
	 * cache is needed. */
}
/* CONSTANT BUFFERS */
void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
			    const uint8_t *ptr, unsigned size, uint32_t *const_offset)
{
	void *tmp;

	u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
		       (struct pipe_resource**)rbuffer, &tmp);
	util_memcpy_cpu_to_le32(tmp, ptr, size);
}
static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
				   struct pipe_constant_buffer *input)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->const_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	assert(slot < buffers->num_buffers);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
	 * with a NULL buffer). We need to use a dummy buffer instead. */
	if (sctx->b.chip_class == CIK &&
	    (!input || (!input->buffer && !input->user_buffer)))
		input = &sctx->null_const_buf;

	if (input && (input->buffer || input->user_buffer)) {
		struct pipe_resource *buffer = NULL;
		uint64_t va;

		/* Upload the user buffer if needed. */
		if (input->user_buffer) {
			unsigned buffer_offset;

			si_upload_const_buffer(sctx,
					       (struct r600_resource**)&buffer, input->user_buffer,
					       input->buffer_size, &buffer_offset);
			va = r600_resource(buffer)->gpu_address + buffer_offset;
		} else {
			pipe_resource_reference(&buffer, input->buffer);
			va = r600_resource(buffer)->gpu_address + input->buffer_offset;
		}

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc_data[slot];
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(0);
		desc[2] = input->buffer_size;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

		buffers->buffers[slot] = buffer;
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1 << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1 << slot);
	}

	buffers->desc.dirty_mask |= 1 << slot;
	si_update_descriptors(sctx, &buffers->desc);
}
/* RING BUFFERS */

void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
			struct pipe_resource *buffer,
			unsigned stride, unsigned num_records,
			bool add_tid, bool swizzle,
			unsigned element_size, unsigned index_stride)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	/* The stride field in the resource descriptor has 14 bits */
	assert(stride < (1 << 14));

	assert(slot < buffers->num_buffers);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	if (buffer) {
		uint64_t va;

		va = r600_resource(buffer)->gpu_address;

		switch (element_size) {
		default:
			assert(!"Unsupported ring buffer element size");
		case 0:
		case 2:
			element_size = 0;
			break;
		case 4:
			element_size = 1;
			break;
		case 8:
			element_size = 2;
			break;
		case 16:
			element_size = 3;
			break;
		}

		switch (index_stride) {
		default:
			assert(!"Unsupported ring buffer index stride");
		case 0:
		case 8:
			index_stride = 0;
			break;
		case 16:
			index_stride = 1;
			break;
		case 32:
			index_stride = 2;
			break;
		case 64:
			index_stride = 3;
			break;
		}

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc_data[slot];
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(stride) |
			  S_008F04_SWIZZLE_ENABLE(swizzle);
		desc[2] = num_records;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
			  S_008F0C_ELEMENT_SIZE(element_size) |
			  S_008F0C_INDEX_STRIDE(index_stride) |
			  S_008F0C_ADD_TID_ENABLE(add_tid);

		pipe_resource_reference(&buffers->buffers[slot], buffer);
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1 << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1 << slot);
	}

	buffers->desc.dirty_mask |= 1 << slot;
	si_update_descriptors(sctx, &buffers->desc);
}
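/* Note: the two switches above translate the caller's element_size and
 * index_stride values into the 2-bit encodings that the
 * S_008F0C_ELEMENT_SIZE and S_008F0C_INDEX_STRIDE descriptor fields
 * expect; e.g. element_size 16 and index_stride 64 both encode as 3.
 */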
/* STREAMOUT BUFFERS */
static void si_set_streamout_targets(struct pipe_context *ctx,
				     unsigned num_targets,
				     struct pipe_stream_output_target **targets,
				     const unsigned *offsets)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
	unsigned old_num_targets = sctx->b.streamout.num_targets;
	unsigned i, bufidx;

	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
	if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
		/* Since streamout uses vector writes which go through TC L2
		 * and most other clients can use TC L2 as well, we don't need
		 * to flush it.
		 *
		 * The only case which requires flushing it is VGT DMA index
		 * fetching, which is a rare case. Thus, flag the TC L2
		 * dirtiness in the resource and handle it when index fetching
		 * is used.
		 */
		for (i = 0; i < sctx->b.streamout.num_targets; i++)
			if (sctx->b.streamout.targets[i])
				r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

		/* Invalidate the scalar cache in case a streamout buffer is
		 * going to be used as a constant buffer.
		 *
		 * Invalidate TC L1, because streamout bypasses it (done by
		 * setting GLC=1 in the store instruction), but it can contain
		 * outdated data of streamout buffers.
		 *
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
				 SI_CONTEXT_INV_TC_L1 |
				 SI_CONTEXT_VS_PARTIAL_FLUSH;
	}

	/* Streamout buffers must be bound in 2 places:
	 * 1) in VGT by setting the VGT_STRMOUT registers
	 * 2) as shader resources
	 */

	/* Set the VGT regs. */
	r600_set_streamout_targets(ctx, num_targets, targets, offsets);

	/* Set the shader resources. */
	for (i = 0; i < num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;

		if (targets[i]) {
			struct pipe_resource *buffer = targets[i]->buffer;
			uint64_t va = r600_resource(buffer)->gpu_address;

			/* Set the descriptor. */
			uint32_t *desc = buffers->desc_data[bufidx];
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			desc[2] = 0xffffffff;
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

			/* Set the resource. */
			pipe_resource_reference(&buffers->buffers[bufidx],
						buffer);
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)buffer,
					      buffers->shader_usage, buffers->priority);
			buffers->desc.enabled_mask |= 1 << bufidx;
		} else {
			/* Clear the descriptor and unset the resource. */
			memset(buffers->desc_data[bufidx], 0,
			       sizeof(uint32_t) * 4);
			pipe_resource_reference(&buffers->buffers[bufidx],
						NULL);
			buffers->desc.enabled_mask &= ~(1 << bufidx);
		}
		buffers->desc.dirty_mask |= 1 << bufidx;
	}
	for (; i < old_num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;
		/* Clear the descriptor and unset the resource. */
		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
		buffers->desc.enabled_mask &= ~(1 << bufidx);
		buffers->desc.dirty_mask |= 1 << bufidx;
	}

	si_update_descriptors(sctx, &buffers->desc);
}
static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
					uint32_t *desc, uint64_t old_buf_va,
					struct pipe_resource *new_buf)
{
	/* Retrieve the buffer offset from the descriptor. */
	uint64_t old_desc_va =
		desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);

	assert(old_buf_va <= old_desc_va);
	uint64_t offset_within_buffer = old_desc_va - old_buf_va;

	/* Update the descriptor. */
	uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;

	desc[0] = va;
	desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
		  S_008F04_BASE_ADDRESS_HI(va >> 32);
}
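/* Example: if the old buffer lived at VA 0x10000 and the descriptor
 * pointed at 0x10040, offset_within_buffer is 0x40, so the rewritten
 * descriptor points 0x40 bytes into the reallocated buffer.
 */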
/* BUFFER DISCARD/INVALIDATION */
/* Reallocate a buffer and update all resource bindings where the buffer is
 * bound.
 *
 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
 * idle by discarding its contents. Apps usually tell us when to do this using
 * map_buffer flags, for example.
 */
static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
{
	struct si_context *sctx = (struct si_context*)ctx;
	struct r600_resource *rbuffer = r600_resource(buf);
	unsigned i, shader, alignment = rbuffer->buf->alignment;
	uint64_t old_va = rbuffer->gpu_address;
	unsigned num_elems = sctx->vertex_elements ?
				     sctx->vertex_elements->count : 0;
	struct si_sampler_view *view;

	/* Reallocate the buffer in the same pipe_resource. */
	r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
			   alignment, TRUE);

	/* We changed the buffer, now we need to bind it where the old one
	 * was bound. This consists of 2 things:
	 *   1) Updating the resource descriptor and dirtying it.
	 *   2) Adding a relocation to the CS, so that it's usable.
	 */

	/* Vertex buffers. */
	for (i = 0; i < num_elems; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		if (sctx->vertex_buffer[vb].buffer == buf) {
			sctx->vertex_buffers_dirty = true;
			break;
		}
	}

	/* Read/Write buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
		bool found = false;
		uint32_t mask = buffers->desc.enabled_mask;

		while (mask) {
			i = u_bit_scan(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
							    old_va, buf);

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1 << i;
				found = true;

				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
					/* Update the streamout state. */
					if (sctx->b.streamout.begin_emitted) {
						r600_emit_streamout_end(&sctx->b);
					}
					sctx->b.streamout.append_bitmask =
						sctx->b.streamout.enabled_mask;
					r600_streamout_buffers_dirty(&sctx->b);
				}
			}
		}

		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Constant buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
		bool found = false;
		uint32_t mask = buffers->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
							    old_va, buf);

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1 << i;
				found = true;
			}
		}

		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Texture buffers - update virtual addresses in sampler view descriptors. */
	LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
		if (view->base.texture == buf) {
			si_desc_reset_buffer_offset(ctx, view->state, old_va, buf);
		}
	}
	/* Texture buffers - update bindings. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_sampler_views *views = &sctx->samplers[shader].views;
		bool found = false;
		uint32_t mask = views->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan(&mask);
			if (views->views[i]->texture == buf) {
				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, RADEON_USAGE_READ,
						      RADEON_PRIO_SHADER_BUFFER_RO);

				views->desc.dirty_mask |= 1 << i;
				found = true;
			}
		}

		if (found) {
			si_update_descriptors(sctx, &views->desc);
		}
	}
}
/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
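/* The value is (1 << 21) - 8 rather than the full 21-bit maximum,
 * presumably so that the sizes and addresses of the intermediate chunks
 * of a split copy or clear keep 8-byte alignment.
 */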
static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
			    unsigned offset, unsigned size, unsigned value,
			    bool is_framebuffer)
{
	struct si_context *sctx = (struct si_context*)ctx;
	unsigned flush_flags, tc_l2_flag;

	if (!size)
		return;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, offset,
		       offset + size);

	/* Fallback for unaligned clears. */
	if (offset % 4 != 0 || size % 4 != 0) {
		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
						       sctx->b.rings.gfx.cs,
						       PIPE_TRANSFER_WRITE);
		size /= 4;
		for (unsigned i = 0; i < size; i++)
			*map++ = value;
		return;
	}

	uint64_t va = r600_resource(dst)->gpu_address + offset;

	/* Flush the caches where the resource is bound. */
	if (is_framebuffer) {
		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
		tc_l2_flag = 0;
	} else {
		flush_flags = SI_CONTEXT_INV_TC_L1 |
			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
			      SI_CONTEXT_INV_KCACHE;
		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
	}

	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			 flush_flags;

	while (size) {
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned dma_flags = tc_l2_flag;

		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
				 FALSE);

		/* This must be done after need_cs_space. */
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
				      RADEON_PRIO_MIN);

		/* Flush the caches for the first copy only.
		 * Also wait for the previous CP DMA operations. */
		if (sctx->b.flags) {
			si_emit_cache_flush(&sctx->b, NULL);
			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count)
			dma_flags |= R600_CP_DMA_SYNC;

		/* Emit the clear packet. */
		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);

		size -= byte_count;
		va += byte_count;
	}

	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
	sctx->b.flags |= flush_flags;

	if (tc_l2_flag)
		r600_resource(dst)->TC_L2_dirty = true;
}
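/* Chunking example: clearing 4 MiB (4194304 bytes) emits two packets of
 * CP_DMA_MAX_BYTE_COUNT (2097144) bytes followed by one 16-byte packet;
 * the cache flush and SI_CP_DMA_RAW_WAIT apply to the first packet and
 * R600_CP_DMA_SYNC only to the last.
 */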
void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
		    bool is_framebuffer)
{
	unsigned flush_flags, tc_l2_flag;

	if (!size)
		return;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	/* Flush the caches where the resource is bound. */
	if (is_framebuffer) {
		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
		tc_l2_flag = 0;
	} else {
		flush_flags = SI_CONTEXT_INV_TC_L1 |
			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
			      SI_CONTEXT_INV_KCACHE;
		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
	}

	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			 flush_flags;

	while (size) {
		unsigned sync_flags = tc_l2_flag;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);

		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);

		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
		if (sctx->b.flags) {
			si_emit_cache_flush(&sctx->b, NULL);
			sync_flags |= SI_CP_DMA_RAW_WAIT;
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync_flags |= R600_CP_DMA_SYNC;
		}

		/* This must be done after r600_need_cs_space. */
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
				      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);

		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);

		size -= byte_count;
		src_offset += byte_count;
		dst_offset += byte_count;
	}

	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
	sctx->b.flags |= flush_flags;

	if (tc_l2_flag)
		r600_resource(dst)->TC_L2_dirty = true;
}
/* INIT/DEINIT */

void si_init_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
					 SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
					 i == PIPE_SHADER_VERTEX ?
					 SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS,
					 i, SI_SGPR_RW_BUFFERS,
					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);

		si_init_sampler_views(sctx, &sctx->samplers[i].views, i);

		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
				    si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4,
				    4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states);

		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
	}

	si_init_descriptors(sctx, &sctx->vertex_buffers,
			    si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
			    SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
			    si_emit_shader_pointer);
	sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;

	/* Set pipe_context functions. */
	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
	sctx->b.b.set_sampler_views = si_set_sampler_views;
	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
	sctx->b.clear_buffer = si_clear_buffer;
	sctx->b.invalidate_buffer = si_invalidate_buffer;
}
void si_release_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_release_buffer_resources(&sctx->const_buffers[i]);
		si_release_buffer_resources(&sctx->rw_buffers[i]);
		si_release_sampler_views(&sctx->samplers[i].views);
		si_release_descriptors(&sctx->samplers[i].states.desc);
	}
	si_release_descriptors(&sctx->vertex_buffers);
}
void si_all_descriptors_begin_new_cs(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
		si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
	}
	si_vertex_buffers_begin_new_cs(sctx);
}