radeonsi: support thread-safe shaders shared by multiple contexts
[mesa.git] / src / gallium / drivers / radeonsi / si_descriptors.c
1 /*
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Marek Olšák <marek.olsak@amd.com>
25 */
26
27 /* Resource binding slots and sampler states (each described with 8 or
28 * 4 dwords) are stored in lists in memory which is accessed by shaders
29 * using scalar load instructions.
30 *
31 * This file is responsible for managing such lists. It keeps a copy of all
32 * descriptors in CPU memory and re-uploads a whole list if some slots have
33 * been changed.
34 *
35 * This code is also responsible for updating shader pointers to those lists.
36 *
37 * Note that CP DMA can't be used for updating the lists, because a GPU hang
38 * could leave the list in a mid-IB state and the next IB would get wrong
39 * descriptors and the whole context would be unusable at that point.
40 * (Note: register shadowing can't be used for the same reason.)
41 *
42 * Also, uploading descriptors to newly allocated memory doesn't require
43 * a KCACHE flush.
44 */
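/* Rough flow, as implemented below: the si_set_* binding functions only
 * update the CPU copy (desc->list) and set list_dirty; before a draw,
 * si_upload_shader_descriptors copies each dirty list into a fresh
 * u_upload_mgr allocation and sets pointer_dirty; the shader_userdata
 * atom (si_emit_shader_userdata) then writes the new GPU addresses into
 * the user data SGPRs with SET_SH_REG packets. */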
45
46 #include "radeon/r600_cs.h"
47 #include "si_pipe.h"
48 #include "si_shader.h"
49 #include "sid.h"
50
51 #include "util/u_memory.h"
52 #include "util/u_upload_mgr.h"
53
54
55 /* NULL image and buffer descriptor.
56 *
57 * For images, all fields must be zero except for the swizzle, which
58 * supports arbitrary combinations of 0s and 1s, and the texture type,
59 * which must be set to any valid type (e.g. 1D); otherwise the hw hangs.
60 *
61 * For buffers, all fields must be zero. If they are not, the hw hangs.
62 *
63 * This is the only reason why the buffer descriptor must be in words [4:7].
64 */
65 static uint32_t null_descriptor[8] = {
66 0,
67 0,
68 0,
69 S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
70 S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
71 /* the remaining dwords must be zero; they double as the NULL buffer
72 * descriptor in words [4:7] */
73 };
74
75 static void si_init_descriptors(struct si_descriptors *desc,
76 unsigned shader_userdata_index,
77 unsigned element_dw_size,
78 unsigned num_elements)
79 {
80 int i;
81
82 assert(num_elements <= sizeof(desc->enabled_mask)*8);
83
84 desc->list = CALLOC(num_elements, element_dw_size * 4);
85 desc->element_dw_size = element_dw_size;
86 desc->num_elements = num_elements;
87 desc->list_dirty = true; /* upload the list before the next draw */
88 desc->shader_userdata_offset = shader_userdata_index * 4;
89
90 /* Initialize the array to NULL descriptors if the element size is 8. */
91 if (element_dw_size == 8)
92 for (i = 0; i < num_elements; i++)
93 memcpy(desc->list + i*element_dw_size, null_descriptor,
94 sizeof(null_descriptor));
95 }
96
97 static void si_release_descriptors(struct si_descriptors *desc)
98 {
99 pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
100 FREE(desc->list);
101 }
102
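/* Upload the whole descriptor list to new GPU memory whenever any slot has
 * changed. Because the list always goes into a fresh suballocation from the
 * uploader, no shader cache flush is needed (see the comment at the top of
 * this file); only the pointer atom has to be re-emitted. */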
103 static bool si_upload_descriptors(struct si_context *sctx,
104 struct si_descriptors *desc)
105 {
106 unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
107 void *ptr;
108
109 if (!desc->list_dirty)
110 return true;
111
112 u_upload_alloc(sctx->b.uploader, 0, list_size,
113 &desc->buffer_offset,
114 (struct pipe_resource**)&desc->buffer, &ptr);
115 if (!desc->buffer)
116 return false; /* skip the draw call */
117
118 util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
119
120 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
121 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
122
123 desc->list_dirty = false;
124 desc->pointer_dirty = true;
125 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
126 return true;
127 }
128
129 /* SAMPLER VIEWS */
130
131 static void si_release_sampler_views(struct si_sampler_views *views)
132 {
133 int i;
134
135 for (i = 0; i < Elements(views->views); i++) {
136 pipe_sampler_view_reference(&views->views[i], NULL);
137 }
138 si_release_descriptors(&views->desc);
139 }
140
141 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
142 struct si_sampler_views *views)
143 {
144 uint64_t mask = views->desc.enabled_mask;
145
146 /* Add buffers to the CS. */
147 while (mask) {
148 int i = u_bit_scan64(&mask);
149 struct si_sampler_view *rview =
150 (struct si_sampler_view*)views->views[i];
151
152 if (!rview->resource)
153 continue;
154
155 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
156 rview->resource, RADEON_USAGE_READ,
157 r600_get_sampler_view_priority(rview->resource));
158 }
159
160 if (!views->desc.buffer)
161 return;
162 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
163 RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
164 }
165
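/* Bind a single sampler view: copy its 8-dword image descriptor (or the
 * NULL descriptor when unbinding) into the CPU list, update the 64-bit
 * enabled_mask, and mark the list dirty so it is re-uploaded. */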
166 static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
167 unsigned slot, struct pipe_sampler_view *view,
168 unsigned *view_desc)
169 {
170 struct si_sampler_views *views = &sctx->samplers[shader].views;
171
172 if (views->views[slot] == view)
173 return;
174
175 if (view) {
176 struct si_sampler_view *rview =
177 (struct si_sampler_view*)view;
178
179 if (rview->resource)
180 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
181 rview->resource, RADEON_USAGE_READ,
182 r600_get_sampler_view_priority(rview->resource));
183
184 pipe_sampler_view_reference(&views->views[slot], view);
185 memcpy(views->desc.list + slot*8, view_desc, 8*4);
186 views->desc.enabled_mask |= 1llu << slot;
187 } else {
188 pipe_sampler_view_reference(&views->views[slot], NULL);
189 memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
190 views->desc.enabled_mask &= ~(1llu << slot);
191 }
192
193 views->desc.list_dirty = true;
194 }
195
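/* Each texture slot has a companion slot at SI_FMASK_TEX_OFFSET + slot that
 * holds the FMASK descriptor of MSAA color textures. depth_texture_mask and
 * compressed_colortex_mask record which bound slots are depth textures or
 * have CMASK/FMASK, so the rest of the driver can decompress them before
 * they are sampled. */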
196 static void si_set_sampler_views(struct pipe_context *ctx,
197 unsigned shader, unsigned start,
198 unsigned count,
199 struct pipe_sampler_view **views)
200 {
201 struct si_context *sctx = (struct si_context *)ctx;
202 struct si_textures_info *samplers = &sctx->samplers[shader];
203 struct si_sampler_view **rviews = (struct si_sampler_view **)views;
204 int i;
205
206 if (!count || shader >= SI_NUM_SHADERS)
207 return;
208
209 for (i = 0; i < count; i++) {
210 unsigned slot = start + i;
211
212 if (!views || !views[i]) {
213 samplers->depth_texture_mask &= ~(1 << slot);
214 samplers->compressed_colortex_mask &= ~(1 << slot);
215 si_set_sampler_view(sctx, shader, slot, NULL, NULL);
216 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
217 NULL, NULL);
218 continue;
219 }
220
221 si_set_sampler_view(sctx, shader, slot, views[i], rviews[i]->state);
222
223 if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
224 struct r600_texture *rtex =
225 (struct r600_texture*)views[i]->texture;
226
227 if (rtex->is_depth && !rtex->is_flushing_texture) {
228 samplers->depth_texture_mask |= 1 << slot;
229 } else {
230 samplers->depth_texture_mask &= ~(1 << slot);
231 }
232 if (rtex->cmask.size || rtex->fmask.size) {
233 samplers->compressed_colortex_mask |= 1 << slot;
234 } else {
235 samplers->compressed_colortex_mask &= ~(1 << slot);
236 }
237
238 if (rtex->fmask.size) {
239 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
240 views[i], rviews[i]->fmask_state);
241 } else {
242 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
243 NULL, NULL);
244 }
245 } else {
246 samplers->depth_texture_mask &= ~(1 << slot);
247 samplers->compressed_colortex_mask &= ~(1 << slot);
248 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
249 NULL, NULL);
250 }
251 }
252 }
253
254 /* SAMPLER STATES */
255
256 static void si_sampler_states_begin_new_cs(struct si_context *sctx,
257 struct si_sampler_states *states)
258 {
259 if (!states->desc.buffer)
260 return;
261 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
262 RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
263 }
264
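/* Copy the 4 sampler state dwords of each bound state into the CPU list.
 * The first two states are also remembered in saved_states[], presumably so
 * that callers which temporarily override them can restore them later. */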
265 static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
266 unsigned start, unsigned count, void **states)
267 {
268 struct si_context *sctx = (struct si_context *)ctx;
269 struct si_sampler_states *samplers = &sctx->samplers[shader].states;
270 struct si_sampler_state **sstates = (struct si_sampler_state**)states;
271 int i;
272
273 if (!count || shader >= SI_NUM_SHADERS)
274 return;
275
276 if (start == 0)
277 samplers->saved_states[0] = states[0];
278 if (start == 1)
279 samplers->saved_states[1] = states[0];
280 else if (start == 0 && count >= 2)
281 samplers->saved_states[1] = states[1];
282
283 for (i = 0; i < count; i++) {
284 unsigned slot = start + i;
285
286 if (!sstates[i])
287 continue;
288
289 memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4);
290 samplers->desc.list_dirty = true;
291 }
292 }
293
294 /* BUFFER RESOURCES */
295
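/* Generic buffer bindings (constant buffers and the read/write ring and
 * streamout buffers) all use 4-dword buffer descriptors, hence the
 * element_dw_size of 4 below. */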
296 static void si_init_buffer_resources(struct si_buffer_resources *buffers,
297 unsigned num_buffers,
298 unsigned shader_userdata_index,
299 enum radeon_bo_usage shader_usage,
300 enum radeon_bo_priority priority)
301 {
302 buffers->shader_usage = shader_usage;
303 buffers->priority = priority;
304 buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
305
306 si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
307 num_buffers);
308 }
309
310 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
311 {
312 int i;
313
314 for (i = 0; i < buffers->desc.num_elements; i++) {
315 pipe_resource_reference(&buffers->buffers[i], NULL);
316 }
317
318 FREE(buffers->buffers);
319 si_release_descriptors(&buffers->desc);
320 }
321
322 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
323 struct si_buffer_resources *buffers)
324 {
325 uint64_t mask = buffers->desc.enabled_mask;
326
327 /* Add buffers to the CS. */
328 while (mask) {
329 int i = u_bit_scan64(&mask);
330
331 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
332 (struct r600_resource*)buffers->buffers[i],
333 buffers->shader_usage, buffers->priority);
334 }
335
336 if (!buffers->desc.buffer)
337 return;
338 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
339 buffers->desc.buffer, RADEON_USAGE_READWRITE,
340 RADEON_PRIO_DESCRIPTORS);
341 }
342
343 /* VERTEX BUFFERS */
344
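/* Vertex buffer descriptors don't go through si_upload_descriptors; they are
 * rebuilt from the current vertex elements and vertex buffers and uploaded
 * as a whole by si_upload_vertex_buffer_descriptors. */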
345 static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
346 {
347 struct si_descriptors *desc = &sctx->vertex_buffers;
348 int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
349 int i;
350
351 for (i = 0; i < count; i++) {
352 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
353
354 if (vb >= Elements(sctx->vertex_buffer))
355 continue;
356 if (!sctx->vertex_buffer[vb].buffer)
357 continue;
358
359 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
360 (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
361 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
362 }
363
364 if (!desc->buffer)
365 return;
366 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
367 desc->buffer, RADEON_USAGE_READ,
368 RADEON_PRIO_DESCRIPTORS);
369 }
370
371 static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
372 {
373 struct si_descriptors *desc = &sctx->vertex_buffers;
374 bool bound[SI_NUM_VERTEX_BUFFERS] = {};
375 unsigned i, count;
376 uint64_t va;
377 uint32_t *ptr;
378
379 if (!sctx->vertex_buffers_dirty || !sctx->vertex_elements)
380 return true;
381 count = sctx->vertex_elements->count;
382 if (!count)
383 return true;
384 /* Vertex buffer descriptors are the only ones which are uploaded
385 * directly through a staging buffer and don't go through
386 * the fine-grained upload path.
387 */
388 u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
389 (struct pipe_resource**)&desc->buffer, (void**)&ptr);
390 if (!desc->buffer)
391 return false;
392
393 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
394 desc->buffer, RADEON_USAGE_READ,
395 RADEON_PRIO_DESCRIPTORS);
396
397 assert(count <= SI_NUM_VERTEX_BUFFERS);
398
399 for (i = 0; i < count; i++) {
400 struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
401 struct pipe_vertex_buffer *vb;
402 struct r600_resource *rbuffer;
403 unsigned offset;
404 uint32_t *desc = &ptr[i*4];
405
406 if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
407 memset(desc, 0, 16);
408 continue;
409 }
410
411 vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
412 rbuffer = (struct r600_resource*)vb->buffer;
413 if (rbuffer == NULL) {
414 memset(desc, 0, 16);
415 continue;
416 }
417
418 offset = vb->buffer_offset + ve->src_offset;
419 va = rbuffer->gpu_address + offset;
420
421 /* Fill in T# buffer resource description */
422 desc[0] = va;
423 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
424 S_008F04_STRIDE(vb->stride);
425
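/* On SI/CIK with a non-zero stride, NUM_RECORDS is the number of
 * stride-sized records that may be fetched, so it's clamped such that
 * reading format_size bytes from the last record stays inside the buffer.
 * Purely illustrative numbers: width0=100, offset=0, stride=16,
 * format_size=8 gives (100 - 0 - 8) / 16 + 1 = 6 records, and the last
 * fetch reads bytes 80..87. */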
426 if (sctx->b.chip_class <= CIK && vb->stride)
427 /* Round up by rounding down and adding 1 */
428 desc[2] = (vb->buffer->width0 - offset -
429 sctx->vertex_elements->format_size[i]) /
430 vb->stride + 1;
431 else
432 desc[2] = vb->buffer->width0 - offset;
433
434 desc[3] = sctx->vertex_elements->rsrc_word3[i];
435
436 if (!bound[ve->vertex_buffer_index]) {
437 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
438 (struct r600_resource*)vb->buffer,
439 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
440 bound[ve->vertex_buffer_index] = true;
441 }
442 }
443
444 /* Don't flush the const cache. It would have a very negative effect
445 * on performance (confirmed by testing). New descriptors are always
446 * uploaded to a fresh new buffer, so I don't think flushing the const
447 * cache is needed. */
448 desc->pointer_dirty = true;
449 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
450 sctx->vertex_buffers_dirty = false;
451 return true;
452 }
453
454
455 /* CONSTANT BUFFERS */
456
457 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
458 const uint8_t *ptr, unsigned size, uint32_t *const_offset)
459 {
460 void *tmp;
461
462 u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
463 (struct pipe_resource**)rbuffer, &tmp);
464 if (*rbuffer)
465 util_memcpy_cpu_to_le32(tmp, ptr, size);
466 }
467
468 static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
469 struct pipe_constant_buffer *input)
470 {
471 struct si_context *sctx = (struct si_context *)ctx;
472 struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
473
474 if (shader >= SI_NUM_SHADERS)
475 return;
476
477 assert(slot < buffers->desc.num_elements);
478 pipe_resource_reference(&buffers->buffers[slot], NULL);
479
480 /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
481 * with a NULL buffer). We need to use a dummy buffer instead. */
482 if (sctx->b.chip_class == CIK &&
483 (!input || (!input->buffer && !input->user_buffer)))
484 input = &sctx->null_const_buf;
485
486 if (input && (input->buffer || input->user_buffer)) {
487 struct pipe_resource *buffer = NULL;
488 uint64_t va;
489
490 /* Upload the user buffer if needed. */
491 if (input->user_buffer) {
492 unsigned buffer_offset;
493
494 si_upload_const_buffer(sctx,
495 (struct r600_resource**)&buffer, input->user_buffer,
496 input->buffer_size, &buffer_offset);
497 if (!buffer) {
498 /* Just unbind on failure. */
499 si_set_constant_buffer(ctx, shader, slot, NULL);
500 return;
501 }
502 va = r600_resource(buffer)->gpu_address + buffer_offset;
503 } else {
504 pipe_resource_reference(&buffer, input->buffer);
505 va = r600_resource(buffer)->gpu_address + input->buffer_offset;
506 }
507
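/* Buffer descriptor layout written below: dword 0 = low 32 bits of the VA,
 * dword 1 = high VA bits and stride (0 for constant buffers), dword 2 =
 * size in bytes, dword 3 = dst_sel/num_format/data_format. */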
508 /* Set the descriptor. */
509 uint32_t *desc = buffers->desc.list + slot*4;
510 desc[0] = va;
511 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
512 S_008F04_STRIDE(0);
513 desc[2] = input->buffer_size;
514 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
515 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
516 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
517 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
518 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
519 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
520
521 buffers->buffers[slot] = buffer;
522 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
523 (struct r600_resource*)buffer,
524 buffers->shader_usage, buffers->priority);
525 buffers->desc.enabled_mask |= 1llu << slot;
526 } else {
527 /* Clear the descriptor. */
528 memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
529 buffers->desc.enabled_mask &= ~(1llu << slot);
530 }
531
532 buffers->desc.list_dirty = true;
533 }
534
535 /* RING BUFFERS */
536
537 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
538 struct pipe_resource *buffer,
539 unsigned stride, unsigned num_records,
540 bool add_tid, bool swizzle,
541 unsigned element_size, unsigned index_stride, uint64_t offset)
542 {
543 struct si_context *sctx = (struct si_context *)ctx;
544 struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
545
546 if (shader >= SI_NUM_SHADERS)
547 return;
548
549 /* The stride field in the resource descriptor has 14 bits */
550 assert(stride < (1 << 14));
551
552 assert(slot < buffers->desc.num_elements);
553 pipe_resource_reference(&buffers->buffers[slot], NULL);
554
555 if (buffer) {
556 uint64_t va;
557
558 va = r600_resource(buffer)->gpu_address + offset;
559
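/* element_size and index_stride are passed in bytes and converted here to
 * the 2-bit hardware encodings: element sizes {2,4,8,16} map to {0,1,2,3}
 * and index strides {8,16,32,64} map to {0,1,2,3}. */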
560 switch (element_size) {
561 default:
562 assert(!"Unsupported ring buffer element size");
563 case 0:
564 case 2:
565 element_size = 0;
566 break;
567 case 4:
568 element_size = 1;
569 break;
570 case 8:
571 element_size = 2;
572 break;
573 case 16:
574 element_size = 3;
575 break;
576 }
577
578 switch (index_stride) {
579 default:
580 assert(!"Unsupported ring buffer index stride");
581 case 0:
582 case 8:
583 index_stride = 0;
584 break;
585 case 16:
586 index_stride = 1;
587 break;
588 case 32:
589 index_stride = 2;
590 break;
591 case 64:
592 index_stride = 3;
593 break;
594 }
595
596 if (sctx->b.chip_class >= VI && stride)
597 num_records *= stride;
598
599 /* Set the descriptor. */
600 uint32_t *desc = buffers->desc.list + slot*4;
601 desc[0] = va;
602 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
603 S_008F04_STRIDE(stride) |
604 S_008F04_SWIZZLE_ENABLE(swizzle);
605 desc[2] = num_records;
606 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
607 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
608 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
609 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
610 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
611 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
612 S_008F0C_ELEMENT_SIZE(element_size) |
613 S_008F0C_INDEX_STRIDE(index_stride) |
614 S_008F0C_ADD_TID_ENABLE(add_tid);
615
616 pipe_resource_reference(&buffers->buffers[slot], buffer);
617 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
618 (struct r600_resource*)buffer,
619 buffers->shader_usage, buffers->priority);
620 buffers->desc.enabled_mask |= 1llu << slot;
621 } else {
622 /* Clear the descriptor. */
623 memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
624 buffers->desc.enabled_mask &= ~(1llu << slot);
625 }
626
627 buffers->desc.list_dirty = true;
628 }
629
630 /* STREAMOUT BUFFERS */
631
632 static void si_set_streamout_targets(struct pipe_context *ctx,
633 unsigned num_targets,
634 struct pipe_stream_output_target **targets,
635 const unsigned *offsets)
636 {
637 struct si_context *sctx = (struct si_context *)ctx;
638 struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
639 unsigned old_num_targets = sctx->b.streamout.num_targets;
640 unsigned i, bufidx;
641
642 /* We are going to unbind the buffers. Mark which caches need to be flushed. */
643 if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
644 /* Since streamout uses vector writes which go through TC L2
645 * and most other clients can use TC L2 as well, we don't need
646 * to flush it.
647 *
648 * The only case which requires flushing it is VGT DMA index
649 * fetching, which is a rare case. Thus, flag the TC L2
650 * dirtiness in the resource and handle it when index fetching
651 * is used.
652 */
653 for (i = 0; i < sctx->b.streamout.num_targets; i++)
654 if (sctx->b.streamout.targets[i])
655 r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
656
657 /* Invalidate the scalar cache in case a streamout buffer is
658 * going to be used as a constant buffer.
659 *
660 * Invalidate TC L1, because streamout bypasses it (done by
661 * setting GLC=1 in the store instruction), but it can contain
662 * outdated data of streamout buffers.
663 *
664 * VS_PARTIAL_FLUSH is required if the buffers are going to be
665 * used as an input immediately.
666 */
667 sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
668 SI_CONTEXT_INV_TC_L1 |
669 SI_CONTEXT_VS_PARTIAL_FLUSH;
670 }
671
672 /* Streamout buffers must be bound in 2 places:
673 * 1) in VGT by setting the VGT_STRMOUT registers
674 * 2) as shader resources
675 */
676
677 /* Set the VGT regs. */
678 r600_set_streamout_targets(ctx, num_targets, targets, offsets);
679
680 /* Set the shader resources. */
681 for (i = 0; i < num_targets; i++) {
682 bufidx = SI_SO_BUF_OFFSET + i;
683
684 if (targets[i]) {
685 struct pipe_resource *buffer = targets[i]->buffer;
686 uint64_t va = r600_resource(buffer)->gpu_address;
687
688 /* Set the descriptor.
689 *
690 * On VI, the format must be non-INVALID, otherwise
691 * the buffer will be considered not bound and store
692 * instructions will be no-ops.
693 */
694 uint32_t *desc = buffers->desc.list + bufidx*4;
695 desc[0] = va;
696 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
697 desc[2] = 0xffffffff;
698 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
699 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
700 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
701 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
702 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
703
704 /* Set the resource. */
705 pipe_resource_reference(&buffers->buffers[bufidx],
706 buffer);
707 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
708 (struct r600_resource*)buffer,
709 buffers->shader_usage, buffers->priority);
710 buffers->desc.enabled_mask |= 1llu << bufidx;
711 } else {
712 /* Clear the descriptor and unset the resource. */
713 memset(buffers->desc.list + bufidx*4, 0,
714 sizeof(uint32_t) * 4);
715 pipe_resource_reference(&buffers->buffers[bufidx],
716 NULL);
717 buffers->desc.enabled_mask &= ~(1llu << bufidx);
718 }
719 }
720 for (; i < old_num_targets; i++) {
721 bufidx = SI_SO_BUF_OFFSET + i;
722 /* Clear the descriptor and unset the resource. */
723 memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
724 pipe_resource_reference(&buffers->buffers[bufidx], NULL);
725 buffers->desc.enabled_mask &= ~(1llu << bufidx);
726 }
727
728 buffers->desc.list_dirty = true;
729 }
730
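/* Patch the address in an existing buffer descriptor after its backing
 * storage has been reallocated: recover the binding's offset within the old
 * buffer from dwords 0-1, then rebase it onto the new buffer's GPU address. */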
731 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
732 uint32_t *desc, uint64_t old_buf_va,
733 struct pipe_resource *new_buf)
734 {
735 /* Retrieve the buffer offset from the descriptor. */
736 uint64_t old_desc_va =
737 desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
738
739 assert(old_buf_va <= old_desc_va);
740 uint64_t offset_within_buffer = old_desc_va - old_buf_va;
741
742 /* Update the descriptor. */
743 uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;
744
745 desc[0] = va;
746 desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
747 S_008F04_BASE_ADDRESS_HI(va >> 32);
748 }
749
750 /* BUFFER DISCARD/INVALIDATION */
751
752 /* Reallocate a buffer and update all resource bindings where the buffer is
753 * bound.
754 *
755 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
756 * idle by discarding its contents. Apps usually tell us when to do this using
757 * map_buffer flags, for example.
758 */
759 static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
760 {
761 struct si_context *sctx = (struct si_context*)ctx;
762 struct r600_resource *rbuffer = r600_resource(buf);
763 unsigned i, shader, alignment = rbuffer->buf->alignment;
764 uint64_t old_va = rbuffer->gpu_address;
765 unsigned num_elems = sctx->vertex_elements ?
766 sctx->vertex_elements->count : 0;
767 struct si_sampler_view *view;
768
769 /* Reallocate the buffer in the same pipe_resource. */
770 r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
771 alignment, TRUE);
772
773 /* We changed the buffer, now we need to bind it where the old one
774 * was bound. This consists of 2 things:
775 * 1) Updating the resource descriptor and dirtying it.
776 * 2) Adding a relocation to the CS, so that it's usable.
777 */
778
779 /* Vertex buffers. */
780 for (i = 0; i < num_elems; i++) {
781 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
782
783 if (vb >= Elements(sctx->vertex_buffer))
784 continue;
785 if (!sctx->vertex_buffer[vb].buffer)
786 continue;
787
788 if (sctx->vertex_buffer[vb].buffer == buf) {
789 sctx->vertex_buffers_dirty = true;
790 break;
791 }
792 }
793
794 /* Read/Write buffers. */
795 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
796 struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
797 uint64_t mask = buffers->desc.enabled_mask;
798
799 while (mask) {
800 i = u_bit_scan64(&mask);
801 if (buffers->buffers[i] == buf) {
802 si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
803 old_va, buf);
804 buffers->desc.list_dirty = true;
805
806 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
807 rbuffer, buffers->shader_usage,
808 buffers->priority);
809
810 if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
811 /* Update the streamout state. */
812 if (sctx->b.streamout.begin_emitted) {
813 r600_emit_streamout_end(&sctx->b);
814 }
815 sctx->b.streamout.append_bitmask =
816 sctx->b.streamout.enabled_mask;
817 r600_streamout_buffers_dirty(&sctx->b);
818 }
819 }
820 }
821 }
822
823 /* Constant buffers. */
824 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
825 struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
826 uint64_t mask = buffers->desc.enabled_mask;
827
828 while (mask) {
829 unsigned i = u_bit_scan64(&mask);
830 if (buffers->buffers[i] == buf) {
831 si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
832 old_va, buf);
833 buffers->desc.list_dirty = true;
834
835 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
836 rbuffer, buffers->shader_usage,
837 buffers->priority);
838 }
839 }
840 }
841
842 /* Texture buffers - update virtual addresses in sampler view descriptors. */
843 LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
844 if (view->base.texture == buf) {
845 si_desc_reset_buffer_offset(ctx, &view->state[4], old_va, buf);
846 }
847 }
848 /* Texture buffers - update bindings. */
849 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
850 struct si_sampler_views *views = &sctx->samplers[shader].views;
851 uint64_t mask = views->desc.enabled_mask;
852
853 while (mask) {
854 unsigned i = u_bit_scan64(&mask);
855 if (views->views[i]->texture == buf) {
856 si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4,
857 old_va, buf);
858 views->desc.list_dirty = true;
859
860 radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
861 rbuffer, RADEON_USAGE_READ,
862 RADEON_PRIO_SAMPLER_BUFFER);
863 }
864 }
865 }
866 }
867
868 /* SHADER USER DATA */
869
870 static void si_mark_shader_pointers_dirty(struct si_context *sctx,
871 unsigned shader)
872 {
873 sctx->const_buffers[shader].desc.pointer_dirty = true;
874 sctx->rw_buffers[shader].desc.pointer_dirty = true;
875 sctx->samplers[shader].views.desc.pointer_dirty = true;
876 sctx->samplers[shader].states.desc.pointer_dirty = true;
877
878 if (shader == PIPE_SHADER_VERTEX)
879 sctx->vertex_buffers.pointer_dirty = true;
880
881 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
882 }
883
884 static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
885 {
886 int i;
887
888 for (i = 0; i < SI_NUM_SHADERS; i++) {
889 si_mark_shader_pointers_dirty(sctx, i);
890 }
891 }
892
893 /* Set a base register address for user data constants in the given shader.
894 * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
895 */
896 static void si_set_user_data_base(struct si_context *sctx,
897 unsigned shader, uint32_t new_base)
898 {
899 uint32_t *base = &sctx->shader_userdata.sh_base[shader];
900
901 if (*base != new_base) {
902 *base = new_base;
903
904 if (new_base)
905 si_mark_shader_pointers_dirty(sctx, shader);
906 }
907 }
908
909 /* This must be called when these shaders are changed from non-NULL to NULL
910 * and vice versa:
911 * - geometry shader
912 * - tessellation control shader
913 * - tessellation evaluation shader
914 */
915 void si_shader_change_notify(struct si_context *sctx)
916 {
917 /* VS can be bound as VS, ES, or LS. */
918 if (sctx->tes_shader.cso)
919 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
920 R_00B530_SPI_SHADER_USER_DATA_LS_0);
921 else if (sctx->gs_shader.cso)
922 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
923 R_00B330_SPI_SHADER_USER_DATA_ES_0);
924 else
925 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
926 R_00B130_SPI_SHADER_USER_DATA_VS_0);
927
928 /* TES can be bound as ES, VS, or not bound. */
929 if (sctx->tes_shader.cso) {
930 if (sctx->gs_shader.cso)
931 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
932 R_00B330_SPI_SHADER_USER_DATA_ES_0);
933 else
934 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
935 R_00B130_SPI_SHADER_USER_DATA_VS_0);
936 } else {
937 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
938 }
939 }
940
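/* Emit a SET_SH_REG packet writing the 64-bit GPU address of a descriptor
 * list into two consecutive user data SGPRs at sh_base +
 * shader_userdata_offset. keep_dirty is set when the same list still has to
 * be emitted at another user-data base in this pass. */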
941 static void si_emit_shader_pointer(struct si_context *sctx,
942 struct si_descriptors *desc,
943 unsigned sh_base, bool keep_dirty)
944 {
945 struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
946 uint64_t va;
947
948 if (!desc->pointer_dirty || !desc->buffer)
949 return;
950
951 va = desc->buffer->gpu_address +
952 desc->buffer_offset;
953
954 radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
955 radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
956 radeon_emit(cs, va);
957 radeon_emit(cs, va >> 32);
958
959 desc->pointer_dirty = keep_dirty;
960 }
961
962 void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
963 {
964 unsigned i;
965 uint32_t *sh_base = sctx->shader_userdata.sh_base;
966
967 if (sctx->gs_shader.cso) {
968 /* The VS copy shader needs these for clipping, streamout, and rings. */
969 unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
970 unsigned i = PIPE_SHADER_VERTEX;
971
972 si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, vs_base, true);
973 si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, vs_base, true);
974
975 /* The TESSEVAL shader needs this for the ESGS ring buffer. */
976 si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc,
977 R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
978 } else if (sctx->tes_shader.cso) {
979 /* The TESSEVAL shader needs this for streamout. */
980 si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc,
981 R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
982 }
983
984 for (i = 0; i < SI_NUM_SHADERS; i++) {
985 unsigned base = sh_base[i];
986
987 if (!base)
988 continue;
989
990 if (i != PIPE_SHADER_TESS_EVAL)
991 si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);
992
993 si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
994 si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
995 si_emit_shader_pointer(sctx, &sctx->samplers[i].states.desc, base, false);
996 }
997 si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
998 }
999
1000 /* INIT/DEINIT/UPLOAD */
1001
1002 void si_init_all_descriptors(struct si_context *sctx)
1003 {
1004 int i;
1005
1006 for (i = 0; i < SI_NUM_SHADERS; i++) {
1007 si_init_buffer_resources(&sctx->const_buffers[i],
1008 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
1009 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
1010 si_init_buffer_resources(&sctx->rw_buffers[i],
1011 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
1012 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
1013
1014 si_init_descriptors(&sctx->samplers[i].views.desc,
1015 SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
1016 si_init_descriptors(&sctx->samplers[i].states.desc,
1017 SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
1018 }
1019
1020 si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
1021 4, SI_NUM_VERTEX_BUFFERS);
1022
1023 /* Set pipe_context functions. */
1024 sctx->b.b.bind_sampler_states = si_bind_sampler_states;
1025 sctx->b.b.set_constant_buffer = si_set_constant_buffer;
1026 sctx->b.b.set_sampler_views = si_set_sampler_views;
1027 sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
1028 sctx->b.invalidate_buffer = si_invalidate_buffer;
1029
1030 /* Shader user data. */
1031 si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
1032 si_emit_shader_userdata);
1033
1034 /* Set default and immutable mappings. */
1035 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
1036 si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
1037 si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
1038 si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
1039 }
1040
1041 bool si_upload_shader_descriptors(struct si_context *sctx)
1042 {
1043 int i;
1044
1045 for (i = 0; i < SI_NUM_SHADERS; i++) {
1046 if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
1047 !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
1048 !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
1049 !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc))
1050 return false;
1051 }
1052 return si_upload_vertex_buffer_descriptors(sctx);
1053 }
1054
1055 void si_release_all_descriptors(struct si_context *sctx)
1056 {
1057 int i;
1058
1059 for (i = 0; i < SI_NUM_SHADERS; i++) {
1060 si_release_buffer_resources(&sctx->const_buffers[i]);
1061 si_release_buffer_resources(&sctx->rw_buffers[i]);
1062 si_release_sampler_views(&sctx->samplers[i].views);
1063 si_release_descriptors(&sctx->samplers[i].states.desc);
1064 }
1065 si_release_descriptors(&sctx->vertex_buffers);
1066 }
1067
1068 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
1069 {
1070 int i;
1071
1072 for (i = 0; i < SI_NUM_SHADERS; i++) {
1073 si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
1074 si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
1075 si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
1076 si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
1077 }
1078 si_vertex_buffers_begin_new_cs(sctx);
1079 si_shader_userdata_begin_new_cs(sctx);
1080 }