gallium/radeon: inline the r600_rings structure
[mesa.git] src/gallium/drivers/radeonsi/si_descriptors.c
1 /*
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Marek Olšák <marek.olsak@amd.com>
25 */
26
27 /* Resource binding slots and sampler states (each described with 8 or
28 * 4 dwords) are stored in lists in memory which is accessed by shaders
29 * using scalar load instructions.
30 *
31 * This file is responsible for managing such lists. It keeps a copy of all
32 * descriptors in CPU memory and re-uploads a whole list if some slots have
33 * been changed.
34 *
35 * This code is also responsible for updating shader pointers to those lists.
36 *
37 * Note that CP DMA can't be used for updating the lists, because a GPU hang
38 * could leave the list in a mid-IB state, the next IB would get wrong
39 * descriptors, and the whole context would be unusable at that point.
40 * (Register shadowing can't be used for the same reason.)
41 *
42 * Also, uploading descriptors to newly allocated memory doesn't require
43 * a KCACHE flush.
44 */
45
46 #include "radeon/r600_cs.h"
47 #include "si_pipe.h"
48 #include "si_shader.h"
49 #include "sid.h"
50
51 #include "util/u_memory.h"
52 #include "util/u_upload_mgr.h"
53
54
55 /* NULL image and buffer descriptor.
56 *
57 * For images, all fields must be zero except for the swizzle, which
58 * supports arbitrary combinations of 0s and 1s. The texture type must be
59 * set to any valid type (e.g. 1D); if it is left unset, the hw hangs.
60 *
61 * For buffers, all fields must be zero. If they are not, the hw hangs.
62 *
63 * This is the only reason why the buffer descriptor must be in words [4:7].
64 */
65 static uint32_t null_descriptor[8] = {
66 0,
67 0,
68 0,
69 S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
70 S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
71 /* the rest must contain zeros, which is also used by the buffer
72 * descriptor */
73 };
74
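/* Initialize a descriptor list: allocate the CPU copy, record the element
 * size and count, and remember which user data SGPR will hold the pointer.
 * Lists with 8-dword elements (image descriptors) start out filled with
 * null descriptors.
 */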
75 static void si_init_descriptors(struct si_descriptors *desc,
76 unsigned shader_userdata_index,
77 unsigned element_dw_size,
78 unsigned num_elements)
79 {
80 int i;
81
82 assert(num_elements <= sizeof(desc->enabled_mask)*8);
83
84 desc->list = CALLOC(num_elements, element_dw_size * 4);
85 desc->element_dw_size = element_dw_size;
86 desc->num_elements = num_elements;
87 desc->list_dirty = true; /* upload the list before the next draw */
88 desc->shader_userdata_offset = shader_userdata_index * 4;
89
90 /* Initialize the array to NULL descriptors if the element size is 8. */
91 if (element_dw_size == 8)
92 for (i = 0; i < num_elements; i++)
93 memcpy(desc->list + i*element_dw_size, null_descriptor,
94 sizeof(null_descriptor));
95 }
96
97 static void si_release_descriptors(struct si_descriptors *desc)
98 {
99 pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
100 FREE(desc->list);
101 }
102
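/* If any slot changed, upload the whole CPU copy of the list to a fresh
 * GPU buffer, add that buffer to the CS, and mark the shader pointer atom
 * dirty so the new address gets emitted. Returns false if the upload
 * allocation failed (the draw call is then skipped).
 */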
103 static bool si_upload_descriptors(struct si_context *sctx,
104 struct si_descriptors *desc)
105 {
106 unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
107 void *ptr;
108
109 if (!desc->list_dirty)
110 return true;
111
112 u_upload_alloc(sctx->b.uploader, 0, list_size,
113 &desc->buffer_offset,
114 (struct pipe_resource**)&desc->buffer, &ptr);
115 if (!desc->buffer)
116 return false; /* skip the draw call */
117
118 util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
119
120 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
121 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
122
123 desc->list_dirty = false;
124 desc->pointer_dirty = true;
125 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
126 return true;
127 }
128
129 /* SAMPLER VIEWS */
130
131 static void si_release_sampler_views(struct si_sampler_views *views)
132 {
133 int i;
134
135 for (i = 0; i < Elements(views->views); i++) {
136 pipe_sampler_view_reference(&views->views[i], NULL);
137 }
138 si_release_descriptors(&views->desc);
139 }
140
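/* Re-add all bound sampler view resources and the descriptor buffer to
 * a new command stream.
 */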
141 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
142 struct si_sampler_views *views)
143 {
144 uint64_t mask = views->desc.enabled_mask;
145
146 /* Add buffers to the CS. */
147 while (mask) {
148 int i = u_bit_scan64(&mask);
149 struct si_sampler_view *rview =
150 (struct si_sampler_view*)views->views[i];
151
152 if (!rview->resource)
153 continue;
154
155 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
156 rview->resource, RADEON_USAGE_READ,
157 r600_get_sampler_view_priority(rview->resource));
158 }
159
160 if (!views->desc.buffer)
161 return;
162 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
163 RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
164 }
165
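/* Bind or unbind a single sampler view slot: add its resources to the CS,
 * copy the 8-dword image descriptor (or the null descriptor) into the list,
 * and update the enabled mask.
 */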
166 static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
167 unsigned slot, struct pipe_sampler_view *view,
168 unsigned *view_desc)
169 {
170 struct si_sampler_views *views = &sctx->samplers[shader].views;
171
172 if (views->views[slot] == view)
173 return;
174
175 if (view) {
176 struct si_sampler_view *rview =
177 (struct si_sampler_view*)view;
178
179 if (rview->resource)
180 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
181 rview->resource, RADEON_USAGE_READ,
182 r600_get_sampler_view_priority(rview->resource));
183
184 if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
185 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
186 rview->dcc_buffer, RADEON_USAGE_READ,
187 RADEON_PRIO_DCC);
188
189 pipe_sampler_view_reference(&views->views[slot], view);
190 memcpy(views->desc.list + slot*8, view_desc, 8*4);
191 views->desc.enabled_mask |= 1llu << slot;
192 } else {
193 pipe_sampler_view_reference(&views->views[slot], NULL);
194 memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
195 views->desc.enabled_mask &= ~(1llu << slot);
196 }
197
198 views->desc.list_dirty = true;
199 }
200
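/* pipe_context::set_sampler_views. Besides binding the views, this tracks
 * which slots contain depth textures and compressed color textures and
 * binds or unbinds the corresponding FMASK descriptors.
 */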
201 static void si_set_sampler_views(struct pipe_context *ctx,
202 unsigned shader, unsigned start,
203 unsigned count,
204 struct pipe_sampler_view **views)
205 {
206 struct si_context *sctx = (struct si_context *)ctx;
207 struct si_textures_info *samplers = &sctx->samplers[shader];
208 struct si_sampler_view **rviews = (struct si_sampler_view **)views;
209 int i;
210
211 if (!count || shader >= SI_NUM_SHADERS)
212 return;
213
214 for (i = 0; i < count; i++) {
215 unsigned slot = start + i;
216
217 if (!views || !views[i]) {
218 samplers->depth_texture_mask &= ~(1 << slot);
219 samplers->compressed_colortex_mask &= ~(1 << slot);
220 si_set_sampler_view(sctx, shader, slot, NULL, NULL);
221 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
222 NULL, NULL);
223 continue;
224 }
225
226 si_set_sampler_view(sctx, shader, slot, views[i], rviews[i]->state);
227
228 if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
229 struct r600_texture *rtex =
230 (struct r600_texture*)views[i]->texture;
231
232 if (rtex->is_depth && !rtex->is_flushing_texture) {
233 samplers->depth_texture_mask |= 1 << slot;
234 } else {
235 samplers->depth_texture_mask &= ~(1 << slot);
236 }
237 if (rtex->cmask.size || rtex->fmask.size ||
238 (rtex->dcc_buffer && rtex->dirty_level_mask)) {
239 samplers->compressed_colortex_mask |= 1 << slot;
240 } else {
241 samplers->compressed_colortex_mask &= ~(1 << slot);
242 }
243
244 if (rtex->fmask.size) {
245 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
246 views[i], rviews[i]->fmask_state);
247 } else {
248 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
249 NULL, NULL);
250 }
251 } else {
252 samplers->depth_texture_mask &= ~(1 << slot);
253 samplers->compressed_colortex_mask &= ~(1 << slot);
254 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
255 NULL, NULL);
256 }
257 }
258 }
259
260 /* SAMPLER STATES */
261
262 static void si_sampler_states_begin_new_cs(struct si_context *sctx,
263 struct si_sampler_states *states)
264 {
265 if (!states->desc.buffer)
266 return;
267 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer,
268 RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
269 }
270
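/* pipe_context::bind_sampler_states. Copies each 4-dword sampler state
 * into the descriptor list; NULL states leave the slot unchanged.
 */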
271 static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
272 unsigned start, unsigned count, void **states)
273 {
274 struct si_context *sctx = (struct si_context *)ctx;
275 struct si_sampler_states *samplers = &sctx->samplers[shader].states;
276 struct si_sampler_state **sstates = (struct si_sampler_state**)states;
277 int i;
278
279 if (!count || shader >= SI_NUM_SHADERS)
280 return;
281
282 if (start == 0)
283 samplers->saved_states[0] = states[0];
284 if (start == 1)
285 samplers->saved_states[1] = states[0];
286 else if (start == 0 && count >= 2)
287 samplers->saved_states[1] = states[1];
288
289 for (i = 0; i < count; i++) {
290 unsigned slot = start + i;
291
292 if (!sstates[i])
293 continue;
294
295 memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4);
296 samplers->desc.list_dirty = true;
297 }
298 }
299
300 /* BUFFER RESOURCES */
301
302 static void si_init_buffer_resources(struct si_buffer_resources *buffers,
303 unsigned num_buffers,
304 unsigned shader_userdata_index,
305 enum radeon_bo_usage shader_usage,
306 enum radeon_bo_priority priority)
307 {
308 buffers->shader_usage = shader_usage;
309 buffers->priority = priority;
310 buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
311
312 si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
313 num_buffers);
314 }
315
316 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
317 {
318 int i;
319
320 for (i = 0; i < buffers->desc.num_elements; i++) {
321 pipe_resource_reference(&buffers->buffers[i], NULL);
322 }
323
324 FREE(buffers->buffers);
325 si_release_descriptors(&buffers->desc);
326 }
327
328 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
329 struct si_buffer_resources *buffers)
330 {
331 uint64_t mask = buffers->desc.enabled_mask;
332
333 /* Add buffers to the CS. */
334 while (mask) {
335 int i = u_bit_scan64(&mask);
336
337 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
338 (struct r600_resource*)buffers->buffers[i],
339 buffers->shader_usage, buffers->priority);
340 }
341
342 if (!buffers->desc.buffer)
343 return;
344 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
345 buffers->desc.buffer, RADEON_USAGE_READWRITE,
346 RADEON_PRIO_DESCRIPTORS);
347 }
348
349 /* VERTEX BUFFERS */
350
351 static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
352 {
353 struct si_descriptors *desc = &sctx->vertex_buffers;
354 int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
355 int i;
356
357 for (i = 0; i < count; i++) {
358 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
359
360 if (vb >= Elements(sctx->vertex_buffer))
361 continue;
362 if (!sctx->vertex_buffer[vb].buffer)
363 continue;
364
365 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
366 (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
367 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
368 }
369
370 if (!desc->buffer)
371 return;
372 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
373 desc->buffer, RADEON_USAGE_READ,
374 RADEON_PRIO_DESCRIPTORS);
375 }
376
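/* Build the 4-dword buffer descriptors for all bound vertex buffers and
 * upload them. Returns false if the upload allocation failed, which skips
 * the draw call.
 */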
377 static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
378 {
379 struct si_descriptors *desc = &sctx->vertex_buffers;
380 bool bound[SI_NUM_VERTEX_BUFFERS] = {};
381 unsigned i, count;
382 uint64_t va;
383 uint32_t *ptr;
384
385 if (!sctx->vertex_buffers_dirty || !sctx->vertex_elements)
386 return true;
387 count = sctx->vertex_elements->count;
388 if (!count)
    return true;
389
390 /* Vertex buffer descriptors are the only ones which are uploaded
391 * directly through a staging buffer and don't go through
392 * the fine-grained upload path.
393 */
394 u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
395 (struct pipe_resource**)&desc->buffer, (void**)&ptr);
396 if (!desc->buffer)
397 return false;
398
399 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
400 desc->buffer, RADEON_USAGE_READ,
401 RADEON_PRIO_DESCRIPTORS);
402
403 assert(count <= SI_NUM_VERTEX_BUFFERS);
404
405 for (i = 0; i < count; i++) {
406 struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
407 struct pipe_vertex_buffer *vb;
408 struct r600_resource *rbuffer;
409 unsigned offset;
410 uint32_t *desc = &ptr[i*4];
411
412 if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
413 memset(desc, 0, 16);
414 continue;
415 }
416
417 vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
418 rbuffer = (struct r600_resource*)vb->buffer;
419 if (rbuffer == NULL) {
420 memset(desc, 0, 16);
421 continue;
422 }
423
424 offset = vb->buffer_offset + ve->src_offset;
425 va = rbuffer->gpu_address + offset;
426
427 /* Fill in T# buffer resource description */
428 desc[0] = va;
429 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
430 S_008F04_STRIDE(vb->stride);
431
432 if (sctx->b.chip_class <= CIK && vb->stride)
433 /* Round up by rounding down and adding 1 */
434 desc[2] = (vb->buffer->width0 - offset -
435 sctx->vertex_elements->format_size[i]) /
436 vb->stride + 1;
437 else
438 desc[2] = vb->buffer->width0 - offset;
439
440 desc[3] = sctx->vertex_elements->rsrc_word3[i];
441
442 if (!bound[ve->vertex_buffer_index]) {
443 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
444 (struct r600_resource*)vb->buffer,
445 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
446 bound[ve->vertex_buffer_index] = true;
447 }
448 }
449
450 /* Don't flush the const cache. It would have a very negative effect
451 * on performance (confirmed by testing). New descriptors are always
452 * uploaded to a fresh new buffer, so I don't think flushing the const
453 * cache is needed. */
454 desc->pointer_dirty = true;
455 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
456 sctx->vertex_buffers_dirty = false;
457 return true;
458 }
459
460
461 /* CONSTANT BUFFERS */
462
463 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
464 const uint8_t *ptr, unsigned size, uint32_t *const_offset)
465 {
466 void *tmp;
467
468 u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
469 (struct pipe_resource**)rbuffer, &tmp);
470 if (*rbuffer)
471 util_memcpy_cpu_to_le32(tmp, ptr, size);
472 }
473
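/* pipe_context::set_constant_buffer. User buffers are uploaded to GPU
 * memory first; the resulting address and size are then written as a
 * 4-dword buffer descriptor.
 */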
474 static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
475 struct pipe_constant_buffer *input)
476 {
477 struct si_context *sctx = (struct si_context *)ctx;
478 struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
479
480 if (shader >= SI_NUM_SHADERS)
481 return;
482
483 assert(slot < buffers->desc.num_elements);
484 pipe_resource_reference(&buffers->buffers[slot], NULL);
485
486 /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
487 * with a NULL buffer). We need to use a dummy buffer instead. */
488 if (sctx->b.chip_class == CIK &&
489 (!input || (!input->buffer && !input->user_buffer)))
490 input = &sctx->null_const_buf;
491
492 if (input && (input->buffer || input->user_buffer)) {
493 struct pipe_resource *buffer = NULL;
494 uint64_t va;
495
496 /* Upload the user buffer if needed. */
497 if (input->user_buffer) {
498 unsigned buffer_offset;
499
500 si_upload_const_buffer(sctx,
501 (struct r600_resource**)&buffer, input->user_buffer,
502 input->buffer_size, &buffer_offset);
503 if (!buffer) {
504 /* Just unbind on failure. */
505 si_set_constant_buffer(ctx, shader, slot, NULL);
506 return;
507 }
508 va = r600_resource(buffer)->gpu_address + buffer_offset;
509 } else {
510 pipe_resource_reference(&buffer, input->buffer);
511 va = r600_resource(buffer)->gpu_address + input->buffer_offset;
512 }
513
514 /* Set the descriptor. */
515 uint32_t *desc = buffers->desc.list + slot*4;
516 desc[0] = va;
517 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
518 S_008F04_STRIDE(0);
519 desc[2] = input->buffer_size;
520 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
521 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
522 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
523 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
524 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
525 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
526
527 buffers->buffers[slot] = buffer;
528 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
529 (struct r600_resource*)buffer,
530 buffers->shader_usage, buffers->priority);
531 buffers->desc.enabled_mask |= 1llu << slot;
532 } else {
533 /* Clear the descriptor. */
534 memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
535 buffers->desc.enabled_mask &= ~(1llu << slot);
536 }
537
538 buffers->desc.list_dirty = true;
539 }
540
541 /* RING BUFFERS */
542
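/* Bind an internal ring buffer (e.g. the ESGS/GSVS rings) to a RW buffer
 * slot, encoding the element size, index stride, swizzle and ADD_TID
 * settings into the buffer descriptor.
 */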
543 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
544 struct pipe_resource *buffer,
545 unsigned stride, unsigned num_records,
546 bool add_tid, bool swizzle,
547 unsigned element_size, unsigned index_stride, uint64_t offset)
548 {
549 struct si_context *sctx = (struct si_context *)ctx;
550 struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
551
552 if (shader >= SI_NUM_SHADERS)
553 return;
554
555 /* The stride field in the resource descriptor has 14 bits */
556 assert(stride < (1 << 14));
557
558 assert(slot < buffers->desc.num_elements);
559 pipe_resource_reference(&buffers->buffers[slot], NULL);
560
561 if (buffer) {
562 uint64_t va;
563
564 va = r600_resource(buffer)->gpu_address + offset;
565
566 switch (element_size) {
567 default:
568 assert(!"Unsupported ring buffer element size");
569 case 0:
570 case 2:
571 element_size = 0;
572 break;
573 case 4:
574 element_size = 1;
575 break;
576 case 8:
577 element_size = 2;
578 break;
579 case 16:
580 element_size = 3;
581 break;
582 }
583
584 switch (index_stride) {
585 default:
586 assert(!"Unsupported ring buffer index stride");
587 case 0:
588 case 8:
589 index_stride = 0;
590 break;
591 case 16:
592 index_stride = 1;
593 break;
594 case 32:
595 index_stride = 2;
596 break;
597 case 64:
598 index_stride = 3;
599 break;
600 }
601
602 if (sctx->b.chip_class >= VI && stride)
603 num_records *= stride;
604
605 /* Set the descriptor. */
606 uint32_t *desc = buffers->desc.list + slot*4;
607 desc[0] = va;
608 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
609 S_008F04_STRIDE(stride) |
610 S_008F04_SWIZZLE_ENABLE(swizzle);
611 desc[2] = num_records;
612 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
613 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
614 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
615 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
616 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
617 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
618 S_008F0C_ELEMENT_SIZE(element_size) |
619 S_008F0C_INDEX_STRIDE(index_stride) |
620 S_008F0C_ADD_TID_ENABLE(add_tid);
621
622 pipe_resource_reference(&buffers->buffers[slot], buffer);
623 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
624 (struct r600_resource*)buffer,
625 buffers->shader_usage, buffers->priority);
626 buffers->desc.enabled_mask |= 1llu << slot;
627 } else {
628 /* Clear the descriptor. */
629 memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
630 buffers->desc.enabled_mask &= ~(1llu << slot);
631 }
632
633 buffers->desc.list_dirty = true;
634 }
635
636 /* STREAMOUT BUFFERS */
637
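/* pipe_context::set_stream_output_targets. Streamout buffers are bound
 * both in VGT and as RW buffer resources of the vertex shader; unbinding
 * also flags the required cache flushes (see the comments below).
 */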
638 static void si_set_streamout_targets(struct pipe_context *ctx,
639 unsigned num_targets,
640 struct pipe_stream_output_target **targets,
641 const unsigned *offsets)
642 {
643 struct si_context *sctx = (struct si_context *)ctx;
644 struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
645 unsigned old_num_targets = sctx->b.streamout.num_targets;
646 unsigned i, bufidx;
647
648 /* We are going to unbind the buffers. Mark which caches need to be flushed. */
649 if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
650 /* Since streamout uses vector writes which go through TC L2
651 * and most other clients can use TC L2 as well, we don't need
652 * to flush it.
653 *
654 * The only case which requires flushing it is VGT DMA index
655 * fetching, which is a rare case. Thus, flag the TC L2
656 * dirtiness in the resource and handle it when index fetching
657 * is used.
658 */
659 for (i = 0; i < sctx->b.streamout.num_targets; i++)
660 if (sctx->b.streamout.targets[i])
661 r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
662
663 /* Invalidate the scalar cache in case a streamout buffer is
664 * going to be used as a constant buffer.
665 *
666 * Invalidate TC L1, because streamout bypasses it (done by
667 * setting GLC=1 in the store instruction), but it can contain
668 * outdated data of streamout buffers.
669 *
670 * VS_PARTIAL_FLUSH is required if the buffers are going to be
671 * used as an input immediately.
672 */
673 sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
674 SI_CONTEXT_INV_VMEM_L1 |
675 SI_CONTEXT_VS_PARTIAL_FLUSH;
676 }
677
678 /* Streamout buffers must be bound in 2 places:
679 * 1) in VGT by setting the VGT_STRMOUT registers
680 * 2) as shader resources
681 */
682
683 /* Set the VGT regs. */
684 r600_set_streamout_targets(ctx, num_targets, targets, offsets);
685
686 /* Set the shader resources. */
687 for (i = 0; i < num_targets; i++) {
688 bufidx = SI_SO_BUF_OFFSET + i;
689
690 if (targets[i]) {
691 struct pipe_resource *buffer = targets[i]->buffer;
692 uint64_t va = r600_resource(buffer)->gpu_address;
693
694 /* Set the descriptor.
695 *
696 * On VI, the format must be non-INVALID, otherwise
697 * the buffer will be considered not bound and store
698 * instructions will be no-ops.
699 */
700 uint32_t *desc = buffers->desc.list + bufidx*4;
701 desc[0] = va;
702 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
703 desc[2] = 0xffffffff;
704 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
705 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
706 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
707 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
708 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
709
710 /* Set the resource. */
711 pipe_resource_reference(&buffers->buffers[bufidx],
712 buffer);
713 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
714 (struct r600_resource*)buffer,
715 buffers->shader_usage, buffers->priority);
716 buffers->desc.enabled_mask |= 1llu << bufidx;
717 } else {
718 /* Clear the descriptor and unset the resource. */
719 memset(buffers->desc.list + bufidx*4, 0,
720 sizeof(uint32_t) * 4);
721 pipe_resource_reference(&buffers->buffers[bufidx],
722 NULL);
723 buffers->desc.enabled_mask &= ~(1llu << bufidx);
724 }
725 }
726 for (; i < old_num_targets; i++) {
727 bufidx = SI_SO_BUF_OFFSET + i;
728 /* Clear the descriptor and unset the resource. */
729 memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
730 pipe_resource_reference(&buffers->buffers[bufidx], NULL);
731 buffers->desc.enabled_mask &= ~(1llu << bufidx);
732 }
733
734 buffers->desc.list_dirty = true;
735 }
736
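/* Patch a buffer descriptor after the underlying buffer has been
 * reallocated: the offset within the old buffer is recovered from the
 * descriptor itself and re-applied on top of the new GPU address.
 */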
737 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
738 uint32_t *desc, uint64_t old_buf_va,
739 struct pipe_resource *new_buf)
740 {
741 /* Retrieve the buffer offset from the descriptor. */
742 uint64_t old_desc_va =
743 desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
744
745 assert(old_buf_va <= old_desc_va);
746 uint64_t offset_within_buffer = old_desc_va - old_buf_va;
747
748 /* Update the descriptor. */
749 uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;
750
751 desc[0] = va;
752 desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
753 S_008F04_BASE_ADDRESS_HI(va >> 32);
754 }
755
756 /* BUFFER DISCARD/INVALIDATION */
757
758 /* Reallocate a buffer and update all resource bindings where the buffer is
759 * bound.
760 *
761 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
762 * idle by discarding its contents. Apps usually tell us when to do this using
763 * map_buffer flags, for example.
764 */
765 static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
766 {
767 struct si_context *sctx = (struct si_context*)ctx;
768 struct r600_resource *rbuffer = r600_resource(buf);
769 unsigned i, shader, alignment = rbuffer->buf->alignment;
770 uint64_t old_va = rbuffer->gpu_address;
771 unsigned num_elems = sctx->vertex_elements ?
772 sctx->vertex_elements->count : 0;
773 struct si_sampler_view *view;
774
775 /* Reallocate the buffer in the same pipe_resource. */
776 r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
777 alignment, TRUE);
778
779 /* We changed the buffer, so now we need to bind it where the old one
780 * was bound. This consists of 2 things:
781 * 1) Updating the resource descriptor and dirtying it.
782 * 2) Adding a relocation to the CS, so that it's usable.
783 */
784
785 /* Vertex buffers. */
786 for (i = 0; i < num_elems; i++) {
787 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
788
789 if (vb >= Elements(sctx->vertex_buffer))
790 continue;
791 if (!sctx->vertex_buffer[vb].buffer)
792 continue;
793
794 if (sctx->vertex_buffer[vb].buffer == buf) {
795 sctx->vertex_buffers_dirty = true;
796 break;
797 }
798 }
799
800 /* Read/Write buffers. */
801 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
802 struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
803 uint64_t mask = buffers->desc.enabled_mask;
804
805 while (mask) {
806 i = u_bit_scan64(&mask);
807 if (buffers->buffers[i] == buf) {
808 si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
809 old_va, buf);
810 buffers->desc.list_dirty = true;
811
812 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
813 rbuffer, buffers->shader_usage,
814 buffers->priority);
815
816 if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
817 /* Update the streamout state. */
818 if (sctx->b.streamout.begin_emitted) {
819 r600_emit_streamout_end(&sctx->b);
820 }
821 sctx->b.streamout.append_bitmask =
822 sctx->b.streamout.enabled_mask;
823 r600_streamout_buffers_dirty(&sctx->b);
824 }
825 }
826 }
827 }
828
829 /* Constant buffers. */
830 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
831 struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
832 uint64_t mask = buffers->desc.enabled_mask;
833
834 while (mask) {
835 unsigned i = u_bit_scan64(&mask);
836 if (buffers->buffers[i] == buf) {
837 si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
838 old_va, buf);
839 buffers->desc.list_dirty = true;
840
841 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
842 rbuffer, buffers->shader_usage,
843 buffers->priority);
844 }
845 }
846 }
847
848 /* Texture buffers - update virtual addresses in sampler view descriptors. */
849 LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
850 if (view->base.texture == buf) {
851 si_desc_reset_buffer_offset(ctx, &view->state[4], old_va, buf);
852 }
853 }
854 /* Texture buffers - update bindings. */
855 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
856 struct si_sampler_views *views = &sctx->samplers[shader].views;
857 uint64_t mask = views->desc.enabled_mask;
858
859 while (mask) {
860 unsigned i = u_bit_scan64(&mask);
861 if (views->views[i]->texture == buf) {
862 si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4,
863 old_va, buf);
864 views->desc.list_dirty = true;
865
866 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
867 rbuffer, RADEON_USAGE_READ,
868 RADEON_PRIO_SAMPLER_BUFFER);
869 }
870 }
871 }
872 }
873
874 /* SHADER USER DATA */
875
876 static void si_mark_shader_pointers_dirty(struct si_context *sctx,
877 unsigned shader)
878 {
879 sctx->const_buffers[shader].desc.pointer_dirty = true;
880 sctx->rw_buffers[shader].desc.pointer_dirty = true;
881 sctx->samplers[shader].views.desc.pointer_dirty = true;
882 sctx->samplers[shader].states.desc.pointer_dirty = true;
883
884 if (shader == PIPE_SHADER_VERTEX)
885 sctx->vertex_buffers.pointer_dirty = true;
886
887 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
888 }
889
890 static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
891 {
892 int i;
893
894 for (i = 0; i < SI_NUM_SHADERS; i++) {
895 si_mark_shader_pointers_dirty(sctx, i);
896 }
897 }
898
899 /* Set a base register address for user data constants in the given shader.
900 * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
901 */
902 static void si_set_user_data_base(struct si_context *sctx,
903 unsigned shader, uint32_t new_base)
904 {
905 uint32_t *base = &sctx->shader_userdata.sh_base[shader];
906
907 if (*base != new_base) {
908 *base = new_base;
909
910 if (new_base)
911 si_mark_shader_pointers_dirty(sctx, shader);
912 }
913 }
914
915 /* This must be called when these shaders are changed from non-NULL to NULL
916 * and vice versa:
917 * - geometry shader
918 * - tessellation control shader
919 * - tessellation evaluation shader
920 */
921 void si_shader_change_notify(struct si_context *sctx)
922 {
923 /* VS can be bound as VS, ES, or LS. */
924 if (sctx->tes_shader.cso)
925 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
926 R_00B530_SPI_SHADER_USER_DATA_LS_0);
927 else if (sctx->gs_shader.cso)
928 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
929 R_00B330_SPI_SHADER_USER_DATA_ES_0);
930 else
931 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
932 R_00B130_SPI_SHADER_USER_DATA_VS_0);
933
934 /* TES can be bound as ES, VS, or not bound. */
935 if (sctx->tes_shader.cso) {
936 if (sctx->gs_shader.cso)
937 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
938 R_00B330_SPI_SHADER_USER_DATA_ES_0);
939 else
940 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
941 R_00B130_SPI_SHADER_USER_DATA_VS_0);
942 } else {
943 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
944 }
945 }
946
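/* Emit a SET_SH_REG packet writing the 64-bit GPU address of a descriptor
 * list into the user data SGPRs at sh_base + shader_userdata_offset.
 */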
947 static void si_emit_shader_pointer(struct si_context *sctx,
948 struct si_descriptors *desc,
949 unsigned sh_base, bool keep_dirty)
950 {
951 struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
952 uint64_t va;
953
954 if (!desc->pointer_dirty || !desc->buffer)
955 return;
956
957 va = desc->buffer->gpu_address +
958 desc->buffer_offset;
959
960 radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
961 radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
962 radeon_emit(cs, va);
963 radeon_emit(cs, va >> 32);
964
965 desc->pointer_dirty = keep_dirty;
966 }
967
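/* Emit every dirty descriptor list pointer for all shader stages, plus
 * the extra copies needed by the GS copy shader and tessellation (see the
 * comments below).
 */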
968 void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
969 {
970 unsigned i;
971 uint32_t *sh_base = sctx->shader_userdata.sh_base;
972
973 if (sctx->gs_shader.cso) {
974 /* The VS copy shader needs these for clipping, streamout, and rings. */
975 unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
976 unsigned i = PIPE_SHADER_VERTEX;
977
978 si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, vs_base, true);
979 si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, vs_base, true);
980
981 /* The TESSEVAL shader needs this for the ESGS ring buffer. */
982 si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc,
983 R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
984 } else if (sctx->tes_shader.cso) {
985 /* The TESSEVAL shader needs this for streamout. */
986 si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc,
987 R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
988 }
989
990 for (i = 0; i < SI_NUM_SHADERS; i++) {
991 unsigned base = sh_base[i];
992
993 if (!base)
994 continue;
995
996 if (i != PIPE_SHADER_TESS_EVAL)
997 si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);
998
999 si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
1000 si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
1001 si_emit_shader_pointer(sctx, &sctx->samplers[i].states.desc, base, false);
1002 }
1003 si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
1004 }
1005
1006 /* INIT/DEINIT/UPLOAD */
1007
1008 void si_init_all_descriptors(struct si_context *sctx)
1009 {
1010 int i;
1011
1012 for (i = 0; i < SI_NUM_SHADERS; i++) {
1013 si_init_buffer_resources(&sctx->const_buffers[i],
1014 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
1015 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
1016 si_init_buffer_resources(&sctx->rw_buffers[i],
1017 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
1018 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
1019
1020 si_init_descriptors(&sctx->samplers[i].views.desc,
1021 SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
1022 si_init_descriptors(&sctx->samplers[i].states.desc,
1023 SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
1024 }
1025
1026 si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
1027 4, SI_NUM_VERTEX_BUFFERS);
1028
1029 /* Set pipe_context functions. */
1030 sctx->b.b.bind_sampler_states = si_bind_sampler_states;
1031 sctx->b.b.set_constant_buffer = si_set_constant_buffer;
1032 sctx->b.b.set_sampler_views = si_set_sampler_views;
1033 sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
1034 sctx->b.invalidate_buffer = si_invalidate_buffer;
1035
1036 /* Shader user data. */
1037 si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
1038 si_emit_shader_userdata);
1039
1040 /* Set default and immutable mappings. */
1041 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
1042 si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
1043 si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
1044 si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
1045 }
1046
1047 bool si_upload_shader_descriptors(struct si_context *sctx)
1048 {
1049 int i;
1050
1051 for (i = 0; i < SI_NUM_SHADERS; i++) {
1052 if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
1053 !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
1054 !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
1055 !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc))
1056 return false;
1057 }
1058 return si_upload_vertex_buffer_descriptors(sctx);
1059 }
1060
1061 void si_release_all_descriptors(struct si_context *sctx)
1062 {
1063 int i;
1064
1065 for (i = 0; i < SI_NUM_SHADERS; i++) {
1066 si_release_buffer_resources(&sctx->const_buffers[i]);
1067 si_release_buffer_resources(&sctx->rw_buffers[i]);
1068 si_release_sampler_views(&sctx->samplers[i].views);
1069 si_release_descriptors(&sctx->samplers[i].states.desc);
1070 }
1071 si_release_descriptors(&sctx->vertex_buffers);
1072 }
1073
1074 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
1075 {
1076 int i;
1077
1078 for (i = 0; i < SI_NUM_SHADERS; i++) {
1079 si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
1080 si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
1081 si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
1082 si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
1083 }
1084 si_vertex_buffers_begin_new_cs(sctx);
1085 si_shader_userdata_begin_new_cs(sctx);
1086 }