/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Marek Olšák <marek.olsak@amd.com>
 */

/* Resource binding slots and sampler states (each described with 8 or
 * 4 dwords) are stored in lists in memory which is accessed by shaders
 * using scalar load instructions.
 *
 * This file is responsible for managing such lists. It keeps a copy of all
 * descriptors in CPU memory and re-uploads a whole list if some slots have
 * been changed.
 *
 * This code is also responsible for updating shader pointers to those lists.
 *
 * Note that CP DMA can't be used for updating the lists, because a GPU hang
 * could leave the list in a mid-IB state and the next IB would get wrong
 * descriptors and the whole context would be unusable at that point.
 * (Note: The register shadowing can't be used due to the same reason)
 *
 * Also, uploading descriptors to newly allocated memory doesn't require
 * a KCACHE flush.
 *
 *
 * Possible scenarios for one 16 dword image+sampler slot:
 *
 *       | Image        | w/ FMASK   | Buffer      | NULL
 * [ 0: 3] Image[0:3]   | Image[0:3] | Null[0:3]   | Null[0:3]
 * [ 4: 7] Image[4:7]   | Image[4:7] | Buffer[0:3] | 0
 * [ 8:11] Null[0:3]    | Fmask[0:3] | Null[0:3]   | Null[0:3]
 * [12:15] Sampler[0:3] | Fmask[4:7] | Sampler[0:3]| Sampler[0:3]
 *
 * FMASK implies MSAA, therefore no sampler state.
 * Sampler states are never unbound except when FMASK is bound.
 */

#include "radeon/r600_cs.h"
#include "si_pipe.h"
#include "si_shader.h"
#include "sid.h"

#include "util/u_memory.h"
#include "util/u_upload_mgr.h"

/* NULL image and buffer descriptor.
 *
 * For images, all fields must be zero except for the swizzle, which
 * supports arbitrary combinations of 0s and 1s. The texture type must be
 * any valid type (e.g. 1D). If the texture type isn't set, the hw hangs.
 *
 * For buffers, all fields must be zero. If they are not, the hw hangs.
 *
 * This is the only reason why the buffer descriptor must be in words [4:7].
 */
static uint32_t null_descriptor[8] = {
	0,
	0,
	0,
	S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
	/* the rest must contain zeros, which is also used by the buffer
	 * descriptor */
};
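
/* Initialize a descriptor list: allocate the CPU copy and, when the element
 * size is a multiple of 8 dwords, pre-fill every slot with the NULL
 * descriptor. */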
static void si_init_descriptors(struct si_descriptors *desc,
				unsigned shader_userdata_index,
				unsigned element_dw_size,
				unsigned num_elements)
{
	int i;

	assert(num_elements <= sizeof(desc->enabled_mask)*8);

	desc->list = CALLOC(num_elements, element_dw_size * 4);
	desc->element_dw_size = element_dw_size;
	desc->num_elements = num_elements;
	desc->list_dirty = true; /* upload the list before the next draw */
	desc->shader_userdata_offset = shader_userdata_index * 4;

	/* Initialize the array to NULL descriptors if the element size is 8. */
	if (element_dw_size % 8 == 0)
		for (i = 0; i < num_elements * element_dw_size / 8; i++)
			memcpy(desc->list + i*8, null_descriptor,
			       sizeof(null_descriptor));
}

static void si_release_descriptors(struct si_descriptors *desc)
{
	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
	FREE(desc->list);
}
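
/* Re-upload the CPU copy of the descriptor list to a fresh GPU buffer if
 * some slots changed, and mark the shader pointer to it as dirty.
 * Returns false if the upload allocation failed, in which case the draw
 * call should be skipped. */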
static bool si_upload_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc)
{
	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
	void *ptr;

	if (!desc->list_dirty)
		return true;

	u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
		       &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, &ptr);
	if (!desc->buffer)
		return false; /* skip the draw call */

	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
				  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);

	desc->list_dirty = false;
	desc->pointer_dirty = true;
	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
	return true;
}

/* SAMPLER VIEWS */

static void si_release_sampler_views(struct si_sampler_views *views)
{
	int i;

	for (i = 0; i < Elements(views->views); i++) {
		pipe_sampler_view_reference(&views->views[i], NULL);
	}
	si_release_descriptors(&views->desc);
}
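
/* Add the texture backing a sampler view, and its DCC buffer if it is
 * separate, to the buffer list of the current CS. */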
static void si_sampler_view_add_buffers(struct si_context *sctx,
					struct si_sampler_view *rview)
{
	if (rview->resource) {
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  rview->resource, RADEON_USAGE_READ,
					  r600_get_sampler_view_priority(rview->resource));
	}

	if (rview->dcc_buffer && rview->dcc_buffer != rview->resource) {
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  rview->dcc_buffer, RADEON_USAGE_READ,
					  RADEON_PRIO_DCC);
	}
}
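
/* Re-add the resources of all enabled sampler views, plus the descriptor
 * buffer itself, to the buffer list after a new CS has started. */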
static void si_sampler_views_begin_new_cs(struct si_context *sctx,
					  struct si_sampler_views *views)
{
	uint64_t mask = views->desc.enabled_mask;

	/* Add buffers to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);
		struct si_sampler_view *rview =
			(struct si_sampler_view*)views->views[i];

		si_sampler_view_add_buffers(sctx, rview);
	}

	if (!views->desc.buffer)
		return;
	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
				  RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
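
/* Bind one sampler view to a 16-dword image+sampler slot, following the
 * slot layout described at the top of this file (FMASK in [8:11] when
 * present, sampler state in [12:15] otherwise). */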
static void si_set_sampler_view(struct si_context *sctx,
				struct si_sampler_views *views,
				unsigned slot, struct pipe_sampler_view *view)
{
	if (views->views[slot] == view)
		return;

	if (view) {
		struct si_sampler_view *rview =
			(struct si_sampler_view*)view;
		struct r600_texture *rtex = (struct r600_texture*)view->texture;

		si_sampler_view_add_buffers(sctx, rview);

		pipe_sampler_view_reference(&views->views[slot], view);
		memcpy(views->desc.list + slot * 16, rview->state, 8*4);

		if (view->texture && view->texture->target != PIPE_BUFFER &&
		    rtex->fmask.size) {
			memcpy(views->desc.list + slot*16 + 8,
			       rview->fmask_state, 8*4);
		} else {
			/* Disable FMASK and bind sampler state in [12:15]. */
			memcpy(views->desc.list + slot*16 + 8,
			       null_descriptor, 4*4);

			if (views->sampler_states[slot])
				memcpy(views->desc.list + slot*16 + 12,
				       views->sampler_states[slot], 4*4);
		}

		views->desc.enabled_mask |= 1llu << slot;
	} else {
		pipe_sampler_view_reference(&views->views[slot], NULL);
		memcpy(views->desc.list + slot*16, null_descriptor, 8*4);
		/* Only clear the lower dwords of FMASK. */
		memcpy(views->desc.list + slot*16 + 8, null_descriptor, 4*4);
		views->desc.enabled_mask &= ~(1llu << slot);
	}

	views->desc.list_dirty = true;
}

static void si_set_sampler_views(struct pipe_context *ctx,
				 unsigned shader, unsigned start,
				 unsigned count,
				 struct pipe_sampler_view **views)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_textures_info *samplers = &sctx->samplers[shader];
	int i;

	if (!count || shader >= SI_NUM_SHADERS)
		return;

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!views || !views[i]) {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, &samplers->views, slot, NULL);
			continue;
		}

		si_set_sampler_view(sctx, &samplers->views, slot, views[i]);

		if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
			struct r600_texture *rtex =
				(struct r600_texture*)views[i]->texture;

			if (rtex->is_depth && !rtex->is_flushing_texture) {
				samplers->depth_texture_mask |= 1 << slot;
			} else {
				samplers->depth_texture_mask &= ~(1 << slot);
			}

			if (rtex->cmask.size || rtex->fmask.size ||
			    (rtex->dcc_buffer && rtex->dirty_level_mask)) {
				samplers->compressed_colortex_mask |= 1 << slot;
			} else {
				samplers->compressed_colortex_mask &= ~(1 << slot);
			}
		} else {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
		}
	}
}
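
/* Copy the given sampler states into dwords [12:15] of their slots, unless
 * FMASK is currently bound there; in that case the state is applied later,
 * when the FMASK descriptor is unbound. */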
static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
				   unsigned start, unsigned count, void **states)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_textures_info *samplers = &sctx->samplers[shader];
	struct si_descriptors *desc = &samplers->views.desc;
	struct si_sampler_state **sstates = (struct si_sampler_state**)states;
	int i;

	if (!count || shader >= SI_NUM_SHADERS)
		return;

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!sstates[i] ||
		    sstates[i] == samplers->views.sampler_states[slot])
			continue;

		samplers->views.sampler_states[slot] = sstates[i];

		/* If FMASK is bound, don't overwrite it.
		 * The sampler state will be set after FMASK is unbound.
		 */
		if (samplers->views.views[slot] &&
		    samplers->views.views[slot]->texture &&
		    ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size)
			continue;

		memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
		desc->list_dirty = true;
	}
}

/* BUFFER RESOURCES */

static void si_init_buffer_resources(struct si_buffer_resources *buffers,
				     unsigned num_buffers,
				     unsigned shader_userdata_index,
				     enum radeon_bo_usage shader_usage,
				     enum radeon_bo_priority priority)
{
	buffers->shader_usage = shader_usage;
	buffers->priority = priority;
	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));

	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
			    num_buffers);
}

static void si_release_buffer_resources(struct si_buffer_resources *buffers)
{
	int i;

	for (i = 0; i < buffers->desc.num_elements; i++) {
		pipe_resource_reference(&buffers->buffers[i], NULL);
	}

	FREE(buffers->buffers);
	si_release_descriptors(&buffers->desc);
}
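
/* Re-add all enabled buffers, plus the descriptor buffer itself, to the
 * buffer list after a new CS has started. */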
static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
					     struct si_buffer_resources *buffers)
{
	uint64_t mask = buffers->desc.enabled_mask;

	/* Add buffers to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  (struct r600_resource*)buffers->buffers[i],
					  buffers->shader_usage, buffers->priority);
	}

	if (!buffers->desc.buffer)
		return;
	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				  buffers->desc.buffer, RADEON_USAGE_READWRITE,
				  RADEON_PRIO_DESCRIPTORS);
}

/* VERTEX BUFFERS */

static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
	int i;

	for (i = 0; i < count; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
					  RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
	}

	if (!desc->buffer)
		return;
	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				  desc->buffer, RADEON_USAGE_READ,
				  RADEON_PRIO_DESCRIPTORS);
}
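
/* Build one T# buffer resource descriptor for each vertex element and
 * upload them through a staging buffer. Returns false if the upload
 * allocation failed, in which case the draw call should be skipped. */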
static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
	unsigned i, count;
	uint64_t va;
	uint32_t *ptr;

	if (!sctx->vertex_buffers_dirty)
		return true;
	if (!sctx->vertex_elements)
		return true;

	count = sctx->vertex_elements->count;
	if (!count)
		return true;

	/* Vertex buffer descriptors are the only ones which are uploaded
	 * directly through a staging buffer and don't go through
	 * the fine-grained upload path.
	 */
	u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
	if (!desc->buffer)
		return false;

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				  desc->buffer, RADEON_USAGE_READ,
				  RADEON_PRIO_DESCRIPTORS);

	assert(count <= SI_NUM_VERTEX_BUFFERS);

	for (i = 0; i < count; i++) {
		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
		struct pipe_vertex_buffer *vb;
		struct r600_resource *rbuffer;
		unsigned offset;
		uint32_t *desc = &ptr[i*4];

		if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
			memset(desc, 0, 16);
			continue;
		}

		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
		rbuffer = (struct r600_resource*)vb->buffer;
		if (!rbuffer) {
			memset(desc, 0, 16);
			continue;
		}

		offset = vb->buffer_offset + ve->src_offset;
		va = rbuffer->gpu_address + offset;

		/* Fill in T# buffer resource description */
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(vb->stride);

		if (sctx->b.chip_class <= CIK && vb->stride)
			/* Round up by rounding down and adding 1 */
			desc[2] = (vb->buffer->width0 - offset -
				   sctx->vertex_elements->format_size[i]) /
				  vb->stride + 1;
		else
			desc[2] = vb->buffer->width0 - offset;

		desc[3] = sctx->vertex_elements->rsrc_word3[i];

		if (!bound[ve->vertex_buffer_index]) {
			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
						  (struct r600_resource*)vb->buffer,
						  RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
			bound[ve->vertex_buffer_index] = true;
		}
	}

	/* Don't flush the const cache. It would have a very negative effect
	 * on performance (confirmed by testing). New descriptors are always
	 * uploaded to a fresh new buffer, so I don't think flushing the const
	 * cache is needed. */
	desc->pointer_dirty = true;
	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
	sctx->vertex_buffers_dirty = false;
	return true;
}

/* CONSTANT BUFFERS */

void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
			    const uint8_t *ptr, unsigned size, uint32_t *const_offset)
{
	void *tmp;

	u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
		       (struct pipe_resource**)rbuffer, &tmp);
	if (*rbuffer)
		util_memcpy_cpu_to_le32(tmp, ptr, size);
}

static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
				   struct pipe_constant_buffer *input)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->const_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	assert(slot < buffers->desc.num_elements);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
	 * with a NULL buffer). We need to use a dummy buffer instead. */
	if (sctx->b.chip_class == CIK &&
	    (!input || (!input->buffer && !input->user_buffer)))
		input = &sctx->null_const_buf;

	if (input && (input->buffer || input->user_buffer)) {
		struct pipe_resource *buffer = NULL;
		uint64_t va;

		/* Upload the user buffer if needed. */
		if (input->user_buffer) {
			unsigned buffer_offset;

			si_upload_const_buffer(sctx,
					       (struct r600_resource**)&buffer, input->user_buffer,
					       input->buffer_size, &buffer_offset);
			if (!buffer) {
				/* Just unbind on failure. */
				si_set_constant_buffer(ctx, shader, slot, NULL);
				return;
			}
			va = r600_resource(buffer)->gpu_address + buffer_offset;
		} else {
			pipe_resource_reference(&buffer, input->buffer);
			va = r600_resource(buffer)->gpu_address + input->buffer_offset;
		}

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc.list + slot*4;
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(0);
		desc[2] = input->buffer_size;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

		buffers->buffers[slot] = buffer;
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  (struct r600_resource*)buffer,
					  buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.list_dirty = true;
}

/* RING BUFFERS */
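
/* Bind an internal ring buffer to a RW buffer slot, encoding its stride,
 * swizzle, element size, and index stride into the buffer descriptor. */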
void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
			struct pipe_resource *buffer,
			unsigned stride, unsigned num_records,
			bool add_tid, bool swizzle,
			unsigned element_size, unsigned index_stride, uint64_t offset)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	/* The stride field in the resource descriptor has 14 bits */
	assert(stride < (1 << 14));

	assert(slot < buffers->desc.num_elements);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	if (buffer) {
		uint64_t va;

		va = r600_resource(buffer)->gpu_address + offset;

		switch (element_size) {
		default:
			assert(!"Unsupported ring buffer element size");
		case 0:
		case 2:
			element_size = 0;
			break;
		case 4:
			element_size = 1;
			break;
		case 8:
			element_size = 2;
			break;
		case 16:
			element_size = 3;
			break;
		}

		switch (index_stride) {
		default:
			assert(!"Unsupported ring buffer index stride");
		case 0:
		case 8:
			index_stride = 0;
			break;
		case 16:
			index_stride = 1;
			break;
		case 32:
			index_stride = 2;
			break;
		case 64:
			index_stride = 3;
			break;
		}

		if (sctx->b.chip_class >= VI && stride)
			num_records *= stride;

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc.list + slot*4;
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(stride) |
			  S_008F04_SWIZZLE_ENABLE(swizzle);
		desc[2] = num_records;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
			  S_008F0C_ELEMENT_SIZE(element_size) |
			  S_008F0C_INDEX_STRIDE(index_stride) |
			  S_008F0C_ADD_TID_ENABLE(add_tid);

		pipe_resource_reference(&buffers->buffers[slot], buffer);
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  (struct r600_resource*)buffer,
					  buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.list_dirty = true;
}

/* STREAMOUT BUFFERS */

static void si_set_streamout_targets(struct pipe_context *ctx,
				     unsigned num_targets,
				     struct pipe_stream_output_target **targets,
				     const unsigned *offsets)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
	unsigned old_num_targets = sctx->b.streamout.num_targets;
	unsigned i, bufidx;

	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
	if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
		/* Since streamout uses vector writes which go through TC L2
		 * and most other clients can use TC L2 as well, we don't need
		 * to flush it.
		 *
		 * The only case which requires flushing it is VGT DMA index
		 * fetching, which is a rare case. Thus, flag the TC L2
		 * dirtiness in the resource and handle it when index fetching
		 * is used.
		 */
		for (i = 0; i < sctx->b.streamout.num_targets; i++)
			if (sctx->b.streamout.targets[i])
				r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

		/* Invalidate the scalar cache in case a streamout buffer is
		 * going to be used as a constant buffer.
		 *
		 * Invalidate TC L1, because streamout bypasses it (done by
		 * setting GLC=1 in the store instruction), but it can contain
		 * outdated data of streamout buffers.
		 *
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
				 SI_CONTEXT_INV_VMEM_L1 |
				 SI_CONTEXT_VS_PARTIAL_FLUSH;
	}

	/* Streamout buffers must be bound in 2 places:
	 * 1) in VGT by setting the VGT_STRMOUT registers
	 * 2) as shader resources
	 */

	/* Set the VGT regs. */
	r600_set_streamout_targets(ctx, num_targets, targets, offsets);

	/* Set the shader resources. */
	for (i = 0; i < num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;

		if (targets[i]) {
			struct pipe_resource *buffer = targets[i]->buffer;
			uint64_t va = r600_resource(buffer)->gpu_address;

			/* Set the descriptor.
			 *
			 * On VI, the format must be non-INVALID, otherwise
			 * the buffer will be considered not bound and store
			 * instructions will be no-ops.
			 */
			uint32_t *desc = buffers->desc.list + bufidx*4;
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			desc[2] = 0xffffffff;
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

			/* Set the resource. */
			pipe_resource_reference(&buffers->buffers[bufidx],
						buffer);
			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
						  (struct r600_resource*)buffer,
						  buffers->shader_usage, buffers->priority);
			buffers->desc.enabled_mask |= 1llu << bufidx;
		} else {
			/* Clear the descriptor and unset the resource. */
			memset(buffers->desc.list + bufidx*4, 0,
			       sizeof(uint32_t) * 4);
			pipe_resource_reference(&buffers->buffers[bufidx],
						NULL);
			buffers->desc.enabled_mask &= ~(1llu << bufidx);
		}
	}
	for (; i < old_num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;
		/* Clear the descriptor and unset the resource. */
		memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
		buffers->desc.enabled_mask &= ~(1llu << bufidx);
	}

	buffers->desc.list_dirty = true;
}
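
/* Rewrite the base address stored in a buffer descriptor so that it points
 * into a newly allocated buffer, preserving the offset that was encoded
 * relative to the old buffer. */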
static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
					uint32_t *desc, uint64_t old_buf_va,
					struct pipe_resource *new_buf)
{
	/* Retrieve the buffer offset from the descriptor. */
	uint64_t old_desc_va =
		desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);

	assert(old_buf_va <= old_desc_va);
	uint64_t offset_within_buffer = old_desc_va - old_buf_va;

	/* Update the descriptor. */
	uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;

	desc[0] = va;
	desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
		  S_008F04_BASE_ADDRESS_HI(va >> 32);
}

/* BUFFER DISCARD/INVALIDATION */

/* Reallocate a buffer and update all resource bindings where the buffer is
 * bound.
 *
 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
 * idle by discarding its contents. Apps usually tell us when to do this using
 * map_buffer flags, for example.
 */
static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
{
	struct si_context *sctx = (struct si_context*)ctx;
	struct r600_resource *rbuffer = r600_resource(buf);
	unsigned i, shader, alignment = rbuffer->buf->alignment;
	uint64_t old_va = rbuffer->gpu_address;
	unsigned num_elems = sctx->vertex_elements ?
				       sctx->vertex_elements->count : 0;
	struct si_sampler_view *view;

	/* Reallocate the buffer in the same pipe_resource. */
	r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
			   alignment, TRUE);

	/* We changed the buffer, now we need to bind it where the old one
	 * was bound. This consists of 2 things:
	 *   1) Updating the resource descriptor and dirtying it.
	 *   2) Adding a relocation to the CS, so that it's usable.
	 */

	/* Vertex buffers. */
	for (i = 0; i < num_elems; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		if (sctx->vertex_buffer[vb].buffer == buf) {
			sctx->vertex_buffers_dirty = true;
			break;
		}
	}

	/* Read/Write buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
							    old_va, buf);
				buffers->desc.list_dirty = true;

				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
							  rbuffer, buffers->shader_usage,
							  buffers->priority);

				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
					/* Update the streamout state. */
					if (sctx->b.streamout.begin_emitted) {
						r600_emit_streamout_end(&sctx->b);
					}
					sctx->b.streamout.append_bitmask =
							sctx->b.streamout.enabled_mask;
					r600_streamout_buffers_dirty(&sctx->b);
				}
			}
		}
	}

	/* Constant buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
							    old_va, buf);
				buffers->desc.list_dirty = true;

				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
							  rbuffer, buffers->shader_usage,
							  buffers->priority);
			}
		}
	}

	/* Texture buffers - update virtual addresses in sampler view descriptors. */
	LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
		if (view->base.texture == buf) {
			si_desc_reset_buffer_offset(ctx, &view->state[4], old_va, buf);
		}
	}
	/* Texture buffers - update bindings. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_sampler_views *views = &sctx->samplers[shader].views;
		uint64_t mask = views->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (views->views[i]->texture == buf) {
				si_desc_reset_buffer_offset(ctx,
							    views->desc.list +
							    i * 16 + 4,
							    old_va, buf);
				views->desc.list_dirty = true;

				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
							  rbuffer, RADEON_USAGE_READ,
							  RADEON_PRIO_SAMPLER_BUFFER);
			}
		}
	}
}

/* SHADER USER DATA */

static void si_mark_shader_pointers_dirty(struct si_context *sctx,
					  unsigned shader)
{
	sctx->const_buffers[shader].desc.pointer_dirty = true;
	sctx->rw_buffers[shader].desc.pointer_dirty = true;
	sctx->samplers[shader].views.desc.pointer_dirty = true;

	if (shader == PIPE_SHADER_VERTEX)
		sctx->vertex_buffers.pointer_dirty = true;

	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
}

static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_mark_shader_pointers_dirty(sctx, i);
	}
}

/* Set a base register address for user data constants in the given shader.
 * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
 */
static void si_set_user_data_base(struct si_context *sctx,
				  unsigned shader, uint32_t new_base)
{
	uint32_t *base = &sctx->shader_userdata.sh_base[shader];

	if (*base != new_base) {
		*base = new_base;

		if (new_base)
			si_mark_shader_pointers_dirty(sctx, shader);
	}
}

/* This must be called when these shaders are changed from non-NULL to NULL
 * and vice versa:
 * - geometry shader
 * - tessellation control shader
 * - tessellation evaluation shader
 */
void si_shader_change_notify(struct si_context *sctx)
{
	/* VS can be bound as VS, ES, or LS. */
	if (sctx->tes_shader.cso)
		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
				      R_00B530_SPI_SHADER_USER_DATA_LS_0);
	else if (sctx->gs_shader.cso)
		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
	else
		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
				      R_00B130_SPI_SHADER_USER_DATA_VS_0);

	/* TES can be bound as ES, VS, or not bound. */
	if (sctx->tes_shader.cso) {
		if (sctx->gs_shader.cso)
			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
					      R_00B330_SPI_SHADER_USER_DATA_ES_0);
		else
			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
					      R_00B130_SPI_SHADER_USER_DATA_VS_0);
	} else {
		si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
	}
}
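
/* Emit a SET_SH_REG packet that writes the 64-bit GPU address of a
 * descriptor list into the shader's user data SGPRs. */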
static void si_emit_shader_pointer(struct si_context *sctx,
				   struct si_descriptors *desc,
				   unsigned sh_base, bool keep_dirty)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	uint64_t va;

	if (!desc->pointer_dirty || !desc->buffer)
		return;

	va = desc->buffer->gpu_address +
	     desc->buffer_offset;

	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
	radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);

	desc->pointer_dirty = keep_dirty;
}
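
/* Emit all dirty descriptor pointers. The VS copy shader (when a GS is
 * bound) and the TESSEVAL shader need extra pointers, because they can run
 * on the VS or ES hardware stage. */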
void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
{
	unsigned i;
	uint32_t *sh_base = sctx->shader_userdata.sh_base;

	if (sctx->gs_shader.cso) {
		/* The VS copy shader needs these for clipping, streamout, and rings. */
		unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
		unsigned i = PIPE_SHADER_VERTEX;

		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, vs_base, true);
		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, vs_base, true);

		if (sctx->tes_shader.cso) {
			/* The TESSEVAL shader needs this for the ESGS ring buffer. */
			si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc,
					       R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
		}
	} else if (sctx->tes_shader.cso) {
		/* The TESSEVAL shader needs this for streamout. */
		si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc,
				       R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
	}

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		unsigned base = sh_base[i];

		if (!base)
			continue;

		if (i != PIPE_SHADER_TESS_EVAL)
			si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);

		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
	}
	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
}

/* INIT/DEINIT/UPLOAD */

void si_init_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_init_buffer_resources(&sctx->const_buffers[i],
					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
					 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
		si_init_buffer_resources(&sctx->rw_buffers[i],
					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
					 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);

		si_init_descriptors(&sctx->samplers[i].views.desc,
				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS);
	}

	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
			    4, SI_NUM_VERTEX_BUFFERS);

	/* Set pipe_context functions. */
	sctx->b.b.bind_sampler_states = si_bind_sampler_states;
	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
	sctx->b.b.set_sampler_views = si_set_sampler_views;
	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
	sctx->b.invalidate_buffer = si_invalidate_buffer;

	/* Shader user data. */
	si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
		     si_emit_shader_userdata);

	/* Set default and immutable mappings. */
	si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
	si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
	si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
}

bool si_upload_shader_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc))
			return false;
	}
	return si_upload_vertex_buffer_descriptors(sctx);
}

void si_release_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_release_buffer_resources(&sctx->const_buffers[i]);
		si_release_buffer_resources(&sctx->rw_buffers[i]);
		si_release_sampler_views(&sctx->samplers[i].views);
	}
	si_release_descriptors(&sctx->vertex_buffers);
}

void si_all_descriptors_begin_new_cs(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
	}
	si_vertex_buffers_begin_new_cs(sctx);
	si_shader_userdata_begin_new_cs(sctx);
}