radeonsi: set descriptor dirty mask on shader buffer unbind
[mesa.git] / src / gallium / drivers / radeonsi / si_descriptors.c
1 /*
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Marek Olšák <marek.olsak@amd.com>
25 */
26
27 /* Resource binding slots and sampler states (each described with 8 or
28 * 4 dwords) are stored in lists in memory which is accessed by shaders
29 * using scalar load instructions.
30 *
31 * This file is responsible for managing such lists. It keeps a copy of all
32 * descriptors in CPU memory and re-uploads a whole list if some slots have
33 * been changed.
34 *
35 * This code is also responsible for updating shader pointers to those lists.
36 *
37 * Note that CP DMA can't be used for updating the lists, because a GPU hang
38 * could leave the list in a mid-IB state; the next IB would then get wrong
39 * descriptors and the whole context would be unusable at that point.
40 * (Register shadowing can't be used for the same reason.)
41 *
42 * Also, uploading descriptors to newly allocated memory doesn't require
43 * a KCACHE flush.
44 *
45 *
46 * Possible scenarios for one 16 dword image+sampler slot:
47 *
48 * | Image | w/ FMASK | Buffer | NULL
49 * [ 0: 3] Image[0:3] | Image[0:3] | Null[0:3] | Null[0:3]
50 * [ 4: 7] Image[4:7] | Image[4:7] | Buffer[0:3] | 0
51 * [ 8:11] Null[0:3] | Fmask[0:3] | Null[0:3] | Null[0:3]
52 * [12:15] Sampler[0:3] | Fmask[4:7] | Sampler[0:3] | Sampler[0:3]
53 *
54 * FMASK implies MSAA, therefore no sampler state.
55 * Sampler states are never unbound except when FMASK is bound.
56 */
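/* Example: with the 16-dword image+sampler layout above, slot N occupies
 * list[N*16 .. N*16+15], so rewriting only slot 3's sampler state touches
 * list[3*16+12 .. 3*16+15] and sets bit 3 of dirty_mask; the next call to
 * si_upload_descriptors then re-uploads the list and re-emits the shader
 * pointer.
 */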
57
58 #include "radeon/r600_cs.h"
59 #include "si_pipe.h"
60 #include "si_shader.h"
61 #include "sid.h"
62
63 #include "util/u_format.h"
64 #include "util/u_math.h"
65 #include "util/u_memory.h"
66 #include "util/u_suballoc.h"
67 #include "util/u_upload_mgr.h"
68
69
70 /* NULL image and buffer descriptor for textures (alpha = 1) and images
71 * (alpha = 0).
72 *
73 * For images, all fields must be zero except for the swizzle, which
74 * supports arbitrary combinations of 0s and 1s. The texture type must be
75 * set to any valid type (e.g. 1D); if the texture type isn't set, the hw hangs.
76 *
77 * For buffers, all fields must be zero. If they are not, the hw hangs.
78 *
79 * This is the only reason why the buffer descriptor must be in words [4:7].
80 */
81 static uint32_t null_texture_descriptor[8] = {
82 0,
83 0,
84 0,
85 S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
86 S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
87 /* the rest must contain zeros, which is also used by the buffer
88 * descriptor */
89 };
90
91 static uint32_t null_image_descriptor[8] = {
92 0,
93 0,
94 0,
95 S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
96 /* the rest must contain zeros, which is also used by the buffer
97 * descriptor */
98 };
99
100 static void si_init_descriptors(struct si_descriptors *desc,
101 unsigned shader_userdata_index,
102 unsigned element_dw_size,
103 unsigned num_elements,
104 const uint32_t *null_descriptor,
105 unsigned *ce_offset)
106 {
107 int i;
108
109 assert(num_elements <= sizeof(desc->enabled_mask)*8);
110
111 desc->list = CALLOC(num_elements, element_dw_size * 4);
112 desc->element_dw_size = element_dw_size;
113 desc->num_elements = num_elements;
114 desc->dirty_mask = num_elements == 32 ? ~0u : (1u << num_elements) - 1;
115 desc->shader_userdata_offset = shader_userdata_index * 4;
116
117 if (ce_offset) {
118 desc->ce_offset = *ce_offset;
119
120 /* make sure that ce_offset stays 32 byte aligned */
121 *ce_offset += align(element_dw_size * num_elements * 4, 32);
122 }
123
124 /* Initialize the array to NULL descriptors (the element size must be a multiple of 8 dwords). */
125 if (null_descriptor) {
126 assert(element_dw_size % 8 == 0);
127 for (i = 0; i < num_elements * element_dw_size / 8; i++)
128 memcpy(desc->list + i * 8, null_descriptor,
129 8 * 4);
130 }
131 }
132
133 static void si_release_descriptors(struct si_descriptors *desc)
134 {
135 pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
136 FREE(desc->list);
137 }
138
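/* Copy a descriptor range out of CE RAM into a freshly sub-allocated buffer:
 * DUMP_CONST_RAM writes 'size' bytes starting at 'ce_offset' in CE RAM to the
 * returned buffer/offset, which the shaders then read as the descriptor list.
 */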
139 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
140 unsigned *out_offset, struct r600_resource **out_buf) {
141 uint64_t va;
142
143 u_suballocator_alloc(sctx->ce_suballocator, size, 64, out_offset,
144 (struct pipe_resource**)out_buf);
145 if (!*out_buf)
146 return false;
147
148 va = (*out_buf)->gpu_address + *out_offset;
149
150 radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
151 radeon_emit(sctx->ce_ib, ce_offset);
152 radeon_emit(sctx->ce_ib, size / 4);
153 radeon_emit(sctx->ce_ib, va);
154 radeon_emit(sctx->ce_ib, va >> 32);
155
156 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
157 RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
158
159 sctx->ce_need_synchronization = true;
160 return true;
161 }
162
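/* Re-load the CE RAM range of this descriptor list from its last uploaded
 * copy in memory (LOAD_CONST_RAM). Used when ce_ram_dirty is set, i.e. after
 * a new CS where the CE RAM contents are no longer guaranteed to be valid.
 */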
163 static void si_reinitialize_ce_ram(struct si_context *sctx,
164 struct si_descriptors *desc)
165 {
166 if (desc->buffer) {
167 struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
168 unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
169 uint64_t va = buffer->gpu_address + desc->buffer_offset;
170 struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
171
172 if (!ib)
173 ib = sctx->ce_ib;
174
175 list_size = align(list_size, 32);
176
177 radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
178 radeon_emit(ib, va);
179 radeon_emit(ib, va >> 32);
180 radeon_emit(ib, list_size / 4);
181 radeon_emit(ib, desc->ce_offset);
182
183 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
184 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
185 }
186 desc->ce_ram_dirty = false;
187 }
188
189 void si_ce_enable_loads(struct radeon_winsys_cs *ib)
190 {
191 radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
192 radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
193 CONTEXT_CONTROL_LOAD_CE_RAM(1));
194 radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
195 }
196
197 static bool si_upload_descriptors(struct si_context *sctx,
198 struct si_descriptors *desc,
199 struct r600_atom * atom)
200 {
201 unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
202
203 if (!desc->dirty_mask)
204 return true;
205
206 if (sctx->ce_ib) {
207 uint32_t const* list = (uint32_t const*)desc->list;
208
209 if (desc->ce_ram_dirty)
210 si_reinitialize_ce_ram(sctx, desc);
211
212 while (desc->dirty_mask) {
213 int begin, count;
214 u_bit_scan_consecutive_range(&desc->dirty_mask, &begin,
215 &count);
216
217 begin *= desc->element_dw_size;
218 count *= desc->element_dw_size;
219
220 radeon_emit(sctx->ce_ib,
221 PKT3(PKT3_WRITE_CONST_RAM, count, 0));
222 radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
223 radeon_emit_array(sctx->ce_ib, list + begin, count);
224 }
225
226 if (!si_ce_upload(sctx, desc->ce_offset, list_size,
227 &desc->buffer_offset, &desc->buffer))
228 return false;
229 } else {
230 void *ptr;
231
232 u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
233 &desc->buffer_offset,
234 (struct pipe_resource**)&desc->buffer, &ptr);
235 if (!desc->buffer)
236 return false; /* skip the draw call */
237
238 util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
239
240 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
241 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
242 }
243 desc->pointer_dirty = true;
244 desc->dirty_mask = 0;
245
246 if (atom)
247 si_mark_atom_dirty(sctx, atom);
248
249 return true;
250 }
251
252 /* SAMPLER VIEWS */
253
254 static void si_release_sampler_views(struct si_sampler_views *views)
255 {
256 int i;
257
258 for (i = 0; i < ARRAY_SIZE(views->views); i++) {
259 pipe_sampler_view_reference(&views->views[i], NULL);
260 }
261 si_release_descriptors(&views->desc);
262 }
263
264 static void si_sampler_view_add_buffer(struct si_context *sctx,
265 struct pipe_resource *resource,
266 enum radeon_bo_usage usage)
267 {
268 struct r600_resource *rres = (struct r600_resource*)resource;
269
270 if (!resource)
271 return;
272
273 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rres, usage,
274 r600_get_sampler_view_priority(rres));
275 }
276
277 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
278 struct si_sampler_views *views)
279 {
280 unsigned mask = views->desc.enabled_mask;
281
282 /* Add buffers to the CS. */
283 while (mask) {
284 int i = u_bit_scan(&mask);
285
286 si_sampler_view_add_buffer(sctx, views->views[i]->texture,
287 RADEON_USAGE_READ);
288 }
289
290 views->desc.ce_ram_dirty = true;
291
292 if (!views->desc.buffer)
293 return;
294 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
295 RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
296 }
297
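/* Patch the address-dependent ("mutable") words of an 8-dword image
 * descriptor: base address in state[0..1], tiling index in state[3],
 * pitch in state[4], DCC enable in state[6] and the DCC address in state[7].
 */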
298 void si_set_mutable_tex_desc_fields(struct r600_texture *tex,
299 const struct radeon_surf_level *base_level_info,
300 unsigned base_level, unsigned block_width,
301 bool is_stencil, uint32_t *state)
302 {
303 uint64_t va = tex->resource.gpu_address + base_level_info->offset;
304 unsigned pitch = base_level_info->nblk_x * block_width;
305
306 state[1] &= C_008F14_BASE_ADDRESS_HI;
307 state[3] &= C_008F1C_TILING_INDEX;
308 state[4] &= C_008F20_PITCH;
309 state[6] &= C_008F28_COMPRESSION_EN;
310
311 state[0] = va >> 8;
312 state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
313 state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level,
314 is_stencil));
315 state[4] |= S_008F20_PITCH(pitch - 1);
316
317 if (tex->dcc_offset) {
318 state[6] |= S_008F28_COMPRESSION_EN(1);
319 state[7] = (tex->resource.gpu_address +
320 tex->dcc_offset +
321 base_level_info->dcc_offset) >> 8;
322 }
323 }
324
325 static void si_set_sampler_view(struct si_context *sctx,
326 struct si_sampler_views *views,
327 unsigned slot, struct pipe_sampler_view *view,
328 bool disallow_early_out)
329 {
330 struct si_sampler_view *rview = (struct si_sampler_view*)view;
331
332 if (views->views[slot] == view && !disallow_early_out)
333 return;
334
335 if (view) {
336 struct r600_texture *rtex = (struct r600_texture *)view->texture;
337 uint32_t *desc = views->desc.list + slot * 16;
338
339 si_sampler_view_add_buffer(sctx, view->texture,
340 RADEON_USAGE_READ);
341
342 pipe_sampler_view_reference(&views->views[slot], view);
343 memcpy(desc, rview->state, 8*4);
344
345 if (view->texture && view->texture->target != PIPE_BUFFER) {
346 bool is_separate_stencil =
347 rtex->is_depth && !rtex->is_flushing_texture &&
348 rview->is_stencil_sampler;
349
350 si_set_mutable_tex_desc_fields(rtex,
351 rview->base_level_info,
352 rview->base_level,
353 rview->block_width,
354 is_separate_stencil,
355 desc);
356 }
357
358 if (view->texture && view->texture->target != PIPE_BUFFER &&
359 rtex->fmask.size) {
360 memcpy(desc + 8,
361 rview->fmask_state, 8*4);
362 } else {
363 /* Disable FMASK and bind sampler state in [12:15]. */
364 memcpy(desc + 8,
365 null_texture_descriptor, 4*4);
366
367 if (views->sampler_states[slot])
368 memcpy(desc + 12,
369 views->sampler_states[slot], 4*4);
370 }
371
372 views->desc.enabled_mask |= 1u << slot;
373 } else {
374 pipe_sampler_view_reference(&views->views[slot], NULL);
375 memcpy(views->desc.list + slot*16, null_texture_descriptor, 8*4);
376 /* Only clear the lower dwords of FMASK. */
377 memcpy(views->desc.list + slot*16 + 8, null_texture_descriptor, 4*4);
378 views->desc.enabled_mask &= ~(1u << slot);
379 }
380
381 views->desc.dirty_mask |= 1u << slot;
382 }
383
384 static bool is_compressed_colortex(struct r600_texture *rtex)
385 {
386 return rtex->cmask.size || rtex->fmask.size ||
387 (rtex->dcc_offset && rtex->dirty_level_mask);
388 }
389
390 static void si_set_sampler_views(struct pipe_context *ctx,
391 unsigned shader, unsigned start,
392 unsigned count,
393 struct pipe_sampler_view **views)
394 {
395 struct si_context *sctx = (struct si_context *)ctx;
396 struct si_textures_info *samplers = &sctx->samplers[shader];
397 int i;
398
399 if (!count || shader >= SI_NUM_SHADERS)
400 return;
401
402 for (i = 0; i < count; i++) {
403 unsigned slot = start + i;
404
405 if (!views || !views[i]) {
406 samplers->depth_texture_mask &= ~(1u << slot);
407 samplers->compressed_colortex_mask &= ~(1u << slot);
408 si_set_sampler_view(sctx, &samplers->views, slot, NULL, false);
409 continue;
410 }
411
412 si_set_sampler_view(sctx, &samplers->views, slot, views[i], false);
413
414 if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
415 struct r600_texture *rtex =
416 (struct r600_texture*)views[i]->texture;
417
418 if (rtex->is_depth && !rtex->is_flushing_texture) {
419 samplers->depth_texture_mask |= 1u << slot;
420 } else {
421 samplers->depth_texture_mask &= ~(1u << slot);
422 }
423 if (is_compressed_colortex(rtex)) {
424 samplers->compressed_colortex_mask |= 1u << slot;
425 } else {
426 samplers->compressed_colortex_mask &= ~(1u << slot);
427 }
428
429 if (rtex->dcc_offset &&
430 p_atomic_read(&rtex->framebuffers_bound))
431 sctx->need_check_render_feedback = true;
432 } else {
433 samplers->depth_texture_mask &= ~(1u << slot);
434 samplers->compressed_colortex_mask &= ~(1u << slot);
435 }
436 }
437 }
438
439 static void
440 si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
441 {
442 unsigned mask = samplers->views.desc.enabled_mask;
443
444 while (mask) {
445 int i = u_bit_scan(&mask);
446 struct pipe_resource *res = samplers->views.views[i]->texture;
447
448 if (res && res->target != PIPE_BUFFER) {
449 struct r600_texture *rtex = (struct r600_texture *)res;
450
451 if (is_compressed_colortex(rtex)) {
452 samplers->compressed_colortex_mask |= 1u << i;
453 } else {
454 samplers->compressed_colortex_mask &= ~(1u << i);
455 }
456 }
457 }
458 }
459
460 /* IMAGE VIEWS */
461
462 static void
463 si_release_image_views(struct si_images_info *images)
464 {
465 unsigned i;
466
467 for (i = 0; i < SI_NUM_IMAGES; ++i) {
468 struct pipe_image_view *view = &images->views[i];
469
470 pipe_resource_reference(&view->resource, NULL);
471 }
472
473 si_release_descriptors(&images->desc);
474 }
475
476 static void
477 si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images)
478 {
479 uint mask = images->desc.enabled_mask;
480
481 /* Add buffers to the CS. */
482 while (mask) {
483 int i = u_bit_scan(&mask);
484 struct pipe_image_view *view = &images->views[i];
485
486 assert(view->resource);
487
488 si_sampler_view_add_buffer(sctx, view->resource,
489 RADEON_USAGE_READWRITE);
490 }
491
492 images->desc.ce_ram_dirty = true;
493
494 if (images->desc.buffer) {
495 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
496 images->desc.buffer,
497 RADEON_USAGE_READ,
498 RADEON_PRIO_DESCRIPTORS);
499 }
500 }
501
502 static void
503 si_disable_shader_image(struct si_images_info *images, unsigned slot)
504 {
505 if (images->desc.enabled_mask & (1u << slot)) {
506 pipe_resource_reference(&images->views[slot].resource, NULL);
507 images->compressed_colortex_mask &= ~(1 << slot);
508
509 memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4);
510 images->desc.enabled_mask &= ~(1u << slot);
511 images->desc.dirty_mask |= 1u << slot;
512 }
513 }
514
515 static void
516 si_mark_image_range_valid(struct pipe_image_view *view)
517 {
518 struct r600_resource *res = (struct r600_resource *)view->resource;
519 const struct util_format_description *desc;
520 unsigned stride;
521
522 assert(res && res->b.b.target == PIPE_BUFFER);
523
524 desc = util_format_description(view->format);
525 stride = desc->block.bits / 8;
526
527 util_range_add(&res->valid_buffer_range,
528 stride * (view->u.buf.first_element),
529 stride * (view->u.buf.last_element + 1));
530 }
531
532 static void si_set_shader_image(struct si_context *ctx,
533 struct si_images_info *images,
534 unsigned slot, struct pipe_image_view *view)
535 {
536 struct si_screen *screen = ctx->screen;
537 struct r600_resource *res;
538
539 if (!view || !view->resource) {
540 si_disable_shader_image(images, slot);
541 return;
542 }
543
544 res = (struct r600_resource *)view->resource;
545
546 if (&images->views[slot] != view)
547 util_copy_image_view(&images->views[slot], view);
548
549 si_sampler_view_add_buffer(ctx, &res->b.b,
550 RADEON_USAGE_READWRITE);
551
552 if (res->b.b.target == PIPE_BUFFER) {
553 if (view->access & PIPE_IMAGE_ACCESS_WRITE)
554 si_mark_image_range_valid(view);
555
556 si_make_buffer_descriptor(screen, res,
557 view->format,
558 view->u.buf.first_element,
559 view->u.buf.last_element,
560 images->desc.list + slot * 8);
561 images->compressed_colortex_mask &= ~(1 << slot);
562 } else {
563 static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
564 struct r600_texture *tex = (struct r600_texture *)res;
565 unsigned level;
566 unsigned width, height, depth;
567 uint32_t *desc = images->desc.list + slot * 8;
568
569 assert(!tex->is_depth);
570 assert(tex->fmask.size == 0);
571
572 if (tex->dcc_offset &&
573 view->access & PIPE_IMAGE_ACCESS_WRITE)
574 r600_texture_disable_dcc(&screen->b, tex);
575
576 if (is_compressed_colortex(tex)) {
577 images->compressed_colortex_mask |= 1 << slot;
578 } else {
579 images->compressed_colortex_mask &= ~(1 << slot);
580 }
581
582 if (tex->dcc_offset &&
583 p_atomic_read(&tex->framebuffers_bound))
584 ctx->need_check_render_feedback = true;
585
586 /* Always force the base level to the selected level.
587 *
588 * This is required for 3D textures, where otherwise
589 * selecting a single slice for non-layered bindings
590 * fails. It doesn't hurt the other targets.
591 */
592 level = view->u.tex.level;
593 width = u_minify(res->b.b.width0, level);
594 height = u_minify(res->b.b.height0, level);
595 depth = u_minify(res->b.b.depth0, level);
596
597 si_make_texture_descriptor(screen, tex,
598 false, res->b.b.target,
599 view->format, swizzle,
600 0, 0,
601 view->u.tex.first_layer,
602 view->u.tex.last_layer,
603 width, height, depth,
604 desc, NULL);
605 si_set_mutable_tex_desc_fields(tex, &tex->surface.level[level], level,
606 util_format_get_blockwidth(view->format),
607 false, desc);
608 }
609
610 images->desc.enabled_mask |= 1u << slot;
611 images->desc.dirty_mask |= 1u << slot;
612 }
613
614 static void
615 si_set_shader_images(struct pipe_context *pipe, unsigned shader,
616 unsigned start_slot, unsigned count,
617 struct pipe_image_view *views)
618 {
619 struct si_context *ctx = (struct si_context *)pipe;
620 struct si_images_info *images = &ctx->images[shader];
621 unsigned i, slot;
622
623 assert(shader < SI_NUM_SHADERS);
624
625 if (!count)
626 return;
627
628 assert(start_slot + count <= SI_NUM_IMAGES);
629
630 if (views) {
631 for (i = 0, slot = start_slot; i < count; ++i, ++slot)
632 si_set_shader_image(ctx, images, slot, &views[i]);
633 } else {
634 for (i = 0, slot = start_slot; i < count; ++i, ++slot)
635 si_set_shader_image(ctx, images, slot, NULL);
636 }
637 }
638
639 static void
640 si_images_update_compressed_colortex_mask(struct si_images_info *images)
641 {
642 unsigned mask = images->desc.enabled_mask;
643
644 while (mask) {
645 int i = u_bit_scan(&mask);
646 struct pipe_resource *res = images->views[i].resource;
647
648 if (res && res->target != PIPE_BUFFER) {
649 struct r600_texture *rtex = (struct r600_texture *)res;
650
651 if (is_compressed_colortex(rtex)) {
652 images->compressed_colortex_mask |= 1 << i;
653 } else {
654 images->compressed_colortex_mask &= ~(1 << i);
655 }
656 }
657 }
658 }
659
660 /* SAMPLER STATES */
661
662 static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
663 unsigned start, unsigned count, void **states)
664 {
665 struct si_context *sctx = (struct si_context *)ctx;
666 struct si_textures_info *samplers = &sctx->samplers[shader];
667 struct si_descriptors *desc = &samplers->views.desc;
668 struct si_sampler_state **sstates = (struct si_sampler_state**)states;
669 int i;
670
671 if (!count || shader >= SI_NUM_SHADERS)
672 return;
673
674 for (i = 0; i < count; i++) {
675 unsigned slot = start + i;
676
677 if (!sstates[i] ||
678 sstates[i] == samplers->views.sampler_states[slot])
679 continue;
680
681 samplers->views.sampler_states[slot] = sstates[i];
682
683 /* If FMASK is bound, don't overwrite it.
684 * The sampler state will be set after FMASK is unbound.
685 */
686 if (samplers->views.views[slot] &&
687 samplers->views.views[slot]->texture &&
688 samplers->views.views[slot]->texture->target != PIPE_BUFFER &&
689 ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size)
690 continue;
691
692 memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
693 desc->dirty_mask |= 1u << slot;
694 }
695 }
696
697 /* BUFFER RESOURCES */
698
699 static void si_init_buffer_resources(struct si_buffer_resources *buffers,
700 unsigned num_buffers,
701 unsigned shader_userdata_index,
702 enum radeon_bo_usage shader_usage,
703 enum radeon_bo_priority priority,
704 unsigned *ce_offset)
705 {
706 buffers->shader_usage = shader_usage;
707 buffers->priority = priority;
708 buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
709
710 si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
711 num_buffers, NULL, ce_offset);
712 }
713
714 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
715 {
716 int i;
717
718 for (i = 0; i < buffers->desc.num_elements; i++) {
719 pipe_resource_reference(&buffers->buffers[i], NULL);
720 }
721
722 FREE(buffers->buffers);
723 si_release_descriptors(&buffers->desc);
724 }
725
726 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
727 struct si_buffer_resources *buffers)
728 {
729 unsigned mask = buffers->desc.enabled_mask;
730
731 /* Add buffers to the CS. */
732 while (mask) {
733 int i = u_bit_scan(&mask);
734
735 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
736 (struct r600_resource*)buffers->buffers[i],
737 buffers->shader_usage, buffers->priority);
738 }
739
740 buffers->desc.ce_ram_dirty = true;
741
742 if (!buffers->desc.buffer)
743 return;
744 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
745 buffers->desc.buffer, RADEON_USAGE_READWRITE,
746 RADEON_PRIO_DESCRIPTORS);
747 }
748
749 /* VERTEX BUFFERS */
750
751 static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
752 {
753 struct si_descriptors *desc = &sctx->vertex_buffers;
754 int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
755 int i;
756
757 for (i = 0; i < count; i++) {
758 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
759
760 if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
761 continue;
762 if (!sctx->vertex_buffer[vb].buffer)
763 continue;
764
765 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
766 (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
767 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
768 }
769
770 if (!desc->buffer)
771 return;
772 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
773 desc->buffer, RADEON_USAGE_READ,
774 RADEON_PRIO_DESCRIPTORS);
775 }
776
777 static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
778 {
779 struct si_descriptors *desc = &sctx->vertex_buffers;
780 bool bound[SI_NUM_VERTEX_BUFFERS] = {};
781 unsigned i, count = sctx->vertex_elements->count;
782 uint64_t va;
783 uint32_t *ptr;
784
785 if (!sctx->vertex_buffers_dirty)
786 return true;
787 if (!count || !sctx->vertex_elements)
788 return true;
789
790 /* Vertex buffer descriptors are the only ones which are uploaded
791 * directly through a staging buffer and don't go through
792 * the fine-grained upload path.
793 */
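/* Each vertex buffer descriptor (V#) is 4 dwords, hence count * 16 bytes for
 * the whole list; ptr[i*4] below addresses the descriptor of slot i. */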
794 u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
795 (struct pipe_resource**)&desc->buffer, (void**)&ptr);
796 if (!desc->buffer)
797 return false;
798
799 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
800 desc->buffer, RADEON_USAGE_READ,
801 RADEON_PRIO_DESCRIPTORS);
802
803 assert(count <= SI_NUM_VERTEX_BUFFERS);
804
805 for (i = 0; i < count; i++) {
806 struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
807 struct pipe_vertex_buffer *vb;
808 struct r600_resource *rbuffer;
809 unsigned offset;
810 uint32_t *desc = &ptr[i*4];
811
812 if (ve->vertex_buffer_index >= ARRAY_SIZE(sctx->vertex_buffer)) {
813 memset(desc, 0, 16);
814 continue;
815 }
816
817 vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
818 rbuffer = (struct r600_resource*)vb->buffer;
819 if (!rbuffer) {
820 memset(desc, 0, 16);
821 continue;
822 }
823
824 offset = vb->buffer_offset + ve->src_offset;
825 va = rbuffer->gpu_address + offset;
826
827 /* Fill in T# buffer resource description */
828 desc[0] = va;
829 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
830 S_008F04_STRIDE(vb->stride);
831
832 if (sctx->b.chip_class <= CIK && vb->stride)
833 /* Round up by rounding down and adding 1 */
834 desc[2] = (vb->buffer->width0 - offset -
835 sctx->vertex_elements->format_size[i]) /
836 vb->stride + 1;
837 else
838 desc[2] = vb->buffer->width0 - offset;
839
840 desc[3] = sctx->vertex_elements->rsrc_word3[i];
841
842 if (!bound[ve->vertex_buffer_index]) {
843 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
844 (struct r600_resource*)vb->buffer,
845 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
846 bound[ve->vertex_buffer_index] = true;
847 }
848 }
849
850 /* Don't flush the const cache. It would have a very negative effect
851 * on performance (confirmed by testing). New descriptors are always
852 * uploaded to a fresh buffer, so I don't think flushing the const
853 * cache is needed. */
854 desc->pointer_dirty = true;
855 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
856 sctx->vertex_buffers_dirty = false;
857 return true;
858 }
859
860
861 /* CONSTANT BUFFERS */
862
863 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
864 const uint8_t *ptr, unsigned size, uint32_t *const_offset)
865 {
866 void *tmp;
867
868 u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
869 (struct pipe_resource**)rbuffer, &tmp);
870 if (*rbuffer)
871 util_memcpy_cpu_to_le32(tmp, ptr, size);
872 }
873
874 void si_set_constant_buffer(struct si_context *sctx,
875 struct si_buffer_resources *buffers,
876 uint slot, struct pipe_constant_buffer *input)
877 {
878 assert(slot < buffers->desc.num_elements);
879 pipe_resource_reference(&buffers->buffers[slot], NULL);
880
881 /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
882 * with a NULL buffer). We need to use a dummy buffer instead. */
883 if (sctx->b.chip_class == CIK &&
884 (!input || (!input->buffer && !input->user_buffer)))
885 input = &sctx->null_const_buf;
886
887 if (input && (input->buffer || input->user_buffer)) {
888 struct pipe_resource *buffer = NULL;
889 uint64_t va;
890
891 /* Upload the user buffer if needed. */
892 if (input->user_buffer) {
893 unsigned buffer_offset;
894
895 si_upload_const_buffer(sctx,
896 (struct r600_resource**)&buffer, input->user_buffer,
897 input->buffer_size, &buffer_offset);
898 if (!buffer) {
899 /* Just unbind on failure. */
900 si_set_constant_buffer(sctx, buffers, slot, NULL);
901 return;
902 }
903 va = r600_resource(buffer)->gpu_address + buffer_offset;
904 } else {
905 pipe_resource_reference(&buffer, input->buffer);
906 va = r600_resource(buffer)->gpu_address + input->buffer_offset;
907 }
908
909 /* Set the descriptor. */
910 uint32_t *desc = buffers->desc.list + slot*4;
911 desc[0] = va;
912 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
913 S_008F04_STRIDE(0);
914 desc[2] = input->buffer_size;
915 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
916 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
917 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
918 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
919 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
920 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
921
922 buffers->buffers[slot] = buffer;
923 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
924 (struct r600_resource*)buffer,
925 buffers->shader_usage, buffers->priority);
926 buffers->desc.enabled_mask |= 1u << slot;
927 } else {
928 /* Clear the descriptor. */
929 memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
930 buffers->desc.enabled_mask &= ~(1u << slot);
931 }
932
933 buffers->desc.dirty_mask |= 1u << slot;
934 }
935
936 static void si_pipe_set_constant_buffer(struct pipe_context *ctx,
937 uint shader, uint slot,
938 struct pipe_constant_buffer *input)
939 {
940 struct si_context *sctx = (struct si_context *)ctx;
941
942 if (shader >= SI_NUM_SHADERS)
943 return;
944
945 si_set_constant_buffer(sctx, &sctx->const_buffers[shader], slot, input);
946 }
947
948 /* SHADER BUFFERS */
949
950 static void si_set_shader_buffers(struct pipe_context *ctx, unsigned shader,
951 unsigned start_slot, unsigned count,
952 struct pipe_shader_buffer *sbuffers)
953 {
954 struct si_context *sctx = (struct si_context *)ctx;
955 struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
956 unsigned i;
957
958 assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
959
960 for (i = 0; i < count; ++i) {
961 struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
962 struct r600_resource *buf;
963 unsigned slot = start_slot + i;
964 uint32_t *desc = buffers->desc.list + slot * 4;
965 uint64_t va;
966
967 if (!sbuffer || !sbuffer->buffer) {
968 pipe_resource_reference(&buffers->buffers[slot], NULL);
969 memset(desc, 0, sizeof(uint32_t) * 4);
970 buffers->desc.enabled_mask &= ~(1u << slot);
971 buffers->desc.dirty_mask |= 1u << slot;
972 continue;
973 }
974
975 buf = (struct r600_resource *)sbuffer->buffer;
976 va = buf->gpu_address + sbuffer->buffer_offset;
977
978 desc[0] = va;
979 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
980 S_008F04_STRIDE(0);
981 desc[2] = sbuffer->buffer_size;
982 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
983 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
984 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
985 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
986 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
987 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
988
989 pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
990 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buf,
991 buffers->shader_usage, buffers->priority);
992 buffers->desc.enabled_mask |= 1u << slot;
993 buffers->desc.dirty_mask |= 1u << slot;
994 }
995 }
996
997 /* RING BUFFERS */
998
999 void si_set_ring_buffer(struct pipe_context *ctx, uint slot,
1000 struct pipe_resource *buffer,
1001 unsigned stride, unsigned num_records,
1002 bool add_tid, bool swizzle,
1003 unsigned element_size, unsigned index_stride, uint64_t offset)
1004 {
1005 struct si_context *sctx = (struct si_context *)ctx;
1006 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1007
1008 /* The stride field in the resource descriptor has 14 bits */
1009 assert(stride < (1 << 14));
1010
1011 assert(slot < buffers->desc.num_elements);
1012 pipe_resource_reference(&buffers->buffers[slot], NULL);
1013
1014 if (buffer) {
1015 uint64_t va;
1016
1017 va = r600_resource(buffer)->gpu_address + offset;
1018
1019 switch (element_size) {
1020 default:
1021 assert(!"Unsupported ring buffer element size");
1022 case 0:
1023 case 2:
1024 element_size = 0;
1025 break;
1026 case 4:
1027 element_size = 1;
1028 break;
1029 case 8:
1030 element_size = 2;
1031 break;
1032 case 16:
1033 element_size = 3;
1034 break;
1035 }
1036
1037 switch (index_stride) {
1038 default:
1039 assert(!"Unsupported ring buffer index stride");
1040 case 0:
1041 case 8:
1042 index_stride = 0;
1043 break;
1044 case 16:
1045 index_stride = 1;
1046 break;
1047 case 32:
1048 index_stride = 2;
1049 break;
1050 case 64:
1051 index_stride = 3;
1052 break;
1053 }
1054
1055 if (sctx->b.chip_class >= VI && stride)
1056 num_records *= stride;
1057
1058 /* Set the descriptor. */
1059 uint32_t *desc = buffers->desc.list + slot*4;
1060 desc[0] = va;
1061 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
1062 S_008F04_STRIDE(stride) |
1063 S_008F04_SWIZZLE_ENABLE(swizzle);
1064 desc[2] = num_records;
1065 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1066 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1067 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1068 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1069 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1070 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
1071 S_008F0C_ELEMENT_SIZE(element_size) |
1072 S_008F0C_INDEX_STRIDE(index_stride) |
1073 S_008F0C_ADD_TID_ENABLE(add_tid);
1074
1075 pipe_resource_reference(&buffers->buffers[slot], buffer);
1076 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1077 (struct r600_resource*)buffer,
1078 buffers->shader_usage, buffers->priority);
1079 buffers->desc.enabled_mask |= 1u << slot;
1080 } else {
1081 /* Clear the descriptor. */
1082 memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
1083 buffers->desc.enabled_mask &= ~(1u << slot);
1084 }
1085
1086 buffers->desc.dirty_mask |= 1u << slot;
1087 }
1088
1089 /* STREAMOUT BUFFERS */
1090
1091 static void si_set_streamout_targets(struct pipe_context *ctx,
1092 unsigned num_targets,
1093 struct pipe_stream_output_target **targets,
1094 const unsigned *offsets)
1095 {
1096 struct si_context *sctx = (struct si_context *)ctx;
1097 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1098 unsigned old_num_targets = sctx->b.streamout.num_targets;
1099 unsigned i, bufidx;
1100
1101 /* We are going to unbind the buffers. Mark which caches need to be flushed. */
1102 if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
1103 /* Since streamout uses vector writes which go through TC L2
1104 * and most other clients can use TC L2 as well, we don't need
1105 * to flush it.
1106 *
1107 * The only case which requires flushing it is VGT DMA index
1108 * fetching, which is a rare case. Thus, flag the TC L2
1109 * dirtiness in the resource and handle it when index fetching
1110 * is used.
1111 */
1112 for (i = 0; i < sctx->b.streamout.num_targets; i++)
1113 if (sctx->b.streamout.targets[i])
1114 r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
1115
1116 /* Invalidate the scalar cache in case a streamout buffer is
1117 * going to be used as a constant buffer.
1118 *
1119 * Invalidate TC L1, because streamout bypasses it (done by
1120 * setting GLC=1 in the store instruction), but it can contain
1121 * outdated data of streamout buffers.
1122 *
1123 * VS_PARTIAL_FLUSH is required if the buffers are going to be
1124 * used as an input immediately.
1125 */
1126 sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
1127 SI_CONTEXT_INV_VMEM_L1 |
1128 SI_CONTEXT_VS_PARTIAL_FLUSH;
1129 }
1130
1131 /* All readers of the streamout targets need to be finished before we can
1132 * start writing to the targets.
1133 */
1134 if (num_targets)
1135 sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
1136 SI_CONTEXT_CS_PARTIAL_FLUSH;
1137
1138 /* Streamout buffers must be bound in 2 places:
1139 * 1) in VGT by setting the VGT_STRMOUT registers
1140 * 2) as shader resources
1141 */
1142
1143 /* Set the VGT regs. */
1144 r600_set_streamout_targets(ctx, num_targets, targets, offsets);
1145
1146 /* Set the shader resources. */
1147 for (i = 0; i < num_targets; i++) {
1148 bufidx = SI_VS_STREAMOUT_BUF0 + i;
1149
1150 if (targets[i]) {
1151 struct pipe_resource *buffer = targets[i]->buffer;
1152 uint64_t va = r600_resource(buffer)->gpu_address;
1153
1154 /* Set the descriptor.
1155 *
1156 * On VI, the format must be non-INVALID, otherwise
1157 * the buffer will be considered not bound and store
1158 * instructions will be no-ops.
1159 */
1160 uint32_t *desc = buffers->desc.list + bufidx*4;
1161 desc[0] = va;
1162 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
1163 desc[2] = 0xffffffff;
1164 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1165 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1166 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1167 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1168 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1169
1170 /* Set the resource. */
1171 pipe_resource_reference(&buffers->buffers[bufidx],
1172 buffer);
1173 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1174 (struct r600_resource*)buffer,
1175 buffers->shader_usage, buffers->priority);
1176 buffers->desc.enabled_mask |= 1u << bufidx;
1177 } else {
1178 /* Clear the descriptor and unset the resource. */
1179 memset(buffers->desc.list + bufidx*4, 0,
1180 sizeof(uint32_t) * 4);
1181 pipe_resource_reference(&buffers->buffers[bufidx],
1182 NULL);
1183 buffers->desc.enabled_mask &= ~(1u << bufidx);
1184 }
1185 buffers->desc.dirty_mask |= 1u << bufidx;
1186 }
1187 for (; i < old_num_targets; i++) {
1188 bufidx = SI_VS_STREAMOUT_BUF0 + i;
1189 /* Clear the descriptor and unset the resource. */
1190 memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
1191 pipe_resource_reference(&buffers->buffers[bufidx], NULL);
1192 buffers->desc.enabled_mask &= ~(1u << bufidx);
1193 buffers->desc.dirty_mask |= 1u << bufidx;
1194 }
1195 }
1196
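/* Example (illustrative numbers): if the old buffer's VA was 0x10000 and the
 * descriptor held 0x10040, the offset 0x40 within the buffer is preserved and
 * the descriptor is rewritten to new_buf->gpu_address + 0x40.
 */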
1197 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
1198 uint32_t *desc, uint64_t old_buf_va,
1199 struct pipe_resource *new_buf)
1200 {
1201 /* Retrieve the buffer offset from the descriptor. */
1202 uint64_t old_desc_va =
1203 desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
1204
1205 assert(old_buf_va <= old_desc_va);
1206 uint64_t offset_within_buffer = old_desc_va - old_buf_va;
1207
1208 /* Update the descriptor. */
1209 uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;
1210
1211 desc[0] = va;
1212 desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
1213 S_008F04_BASE_ADDRESS_HI(va >> 32);
1214 }
1215
1216 /* INTERNAL CONST BUFFERS */
1217
1218 static void si_set_polygon_stipple(struct pipe_context *ctx,
1219 const struct pipe_poly_stipple *state)
1220 {
1221 struct si_context *sctx = (struct si_context *)ctx;
1222 struct pipe_constant_buffer cb = {};
1223 unsigned stipple[32];
1224 int i;
1225
1226 for (i = 0; i < 32; i++)
1227 stipple[i] = util_bitreverse(state->stipple[i]);
1228
1229 cb.user_buffer = stipple;
1230 cb.buffer_size = sizeof(stipple);
1231
1232 si_set_constant_buffer(sctx, &sctx->rw_buffers,
1233 SI_PS_CONST_POLY_STIPPLE, &cb);
1234 }
1235
1236 /* TEXTURE METADATA ENABLE/DISABLE */
1237
1238 /* CMASK can be enabled (for fast clear) and disabled (for texture export)
1239 * while the texture is bound, possibly by a different context. In that case,
1240 * call this function to update compressed_colortex_masks.
1241 */
1242 void si_update_compressed_colortex_masks(struct si_context *sctx)
1243 {
1244 for (int i = 0; i < SI_NUM_SHADERS; ++i) {
1245 si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]);
1246 si_images_update_compressed_colortex_mask(&sctx->images[i]);
1247 }
1248 }
1249
1250 /* BUFFER DISCARD/INVALIDATION */
1251
1252 /** Reset descriptors of buffer resources after \p buf has been invalidated. */
1253 static void si_reset_buffer_resources(struct si_context *sctx,
1254 struct si_buffer_resources *buffers,
1255 struct pipe_resource *buf,
1256 uint64_t old_va)
1257 {
1258 unsigned mask = buffers->desc.enabled_mask;
1259
1260 while (mask) {
1261 unsigned i = u_bit_scan(&mask);
1262 if (buffers->buffers[i] == buf) {
1263 si_desc_reset_buffer_offset(&sctx->b.b,
1264 buffers->desc.list + i*4,
1265 old_va, buf);
1266 buffers->desc.dirty_mask |= 1u << i;
1267
1268 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1269 (struct r600_resource *)buf,
1270 buffers->shader_usage,
1271 buffers->priority);
1272 }
1273 }
1274 }
1275
1276 /* Reallocate a buffer and update all resource bindings where the buffer is
1277 * bound.
1278 *
1279 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
1280 * idle by discarding its contents. Apps usually tell us when to do this using
1281 * map_buffer flags, for example.
1282 */
1283 static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
1284 {
1285 struct si_context *sctx = (struct si_context*)ctx;
1286 struct r600_resource *rbuffer = r600_resource(buf);
1287 unsigned i, shader, alignment = rbuffer->buf->alignment;
1288 uint64_t old_va = rbuffer->gpu_address;
1289 unsigned num_elems = sctx->vertex_elements ?
1290 sctx->vertex_elements->count : 0;
1291 struct si_sampler_view *view;
1292
1293 /* Reallocate the buffer in the same pipe_resource. */
1294 r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
1295 alignment);
1296
1297 /* We changed the buffer, now we need to bind it where the old one
1298 * was bound. This consists of 2 things:
1299 * 1) Updating the resource descriptor and dirtying it.
1300 * 2) Adding a relocation to the CS, so that it's usable.
1301 */
1302
1303 /* Vertex buffers. */
1304 for (i = 0; i < num_elems; i++) {
1305 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
1306
1307 if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
1308 continue;
1309 if (!sctx->vertex_buffer[vb].buffer)
1310 continue;
1311
1312 if (sctx->vertex_buffer[vb].buffer == buf) {
1313 sctx->vertex_buffers_dirty = true;
1314 break;
1315 }
1316 }
1317
1318 /* Streamout buffers. (other internal buffers can't be invalidated) */
1319 for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
1320 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1321
1322 if (buffers->buffers[i] != buf)
1323 continue;
1324
1325 si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
1326 old_va, buf);
1327 buffers->desc.dirty_mask |= 1u << i;
1328
1329 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1330 rbuffer, buffers->shader_usage,
1331 buffers->priority);
1332
1333 /* Update the streamout state. */
1334 if (sctx->b.streamout.begin_emitted)
1335 r600_emit_streamout_end(&sctx->b);
1336 sctx->b.streamout.append_bitmask =
1337 sctx->b.streamout.enabled_mask;
1338 r600_streamout_buffers_dirty(&sctx->b);
1339 }
1340
1341 /* Constant and shader buffers. */
1342 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1343 si_reset_buffer_resources(sctx, &sctx->const_buffers[shader],
1344 buf, old_va);
1345 si_reset_buffer_resources(sctx, &sctx->shader_buffers[shader],
1346 buf, old_va);
1347 }
1348
1349 /* Texture buffers - update virtual addresses in sampler view descriptors. */
1350 LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
1351 if (view->base.texture == buf) {
1352 si_desc_reset_buffer_offset(ctx, &view->state[4], old_va, buf);
1353 }
1354 }
1355 /* Texture buffers - update bindings. */
1356 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1357 struct si_sampler_views *views = &sctx->samplers[shader].views;
1358 unsigned mask = views->desc.enabled_mask;
1359
1360 while (mask) {
1361 unsigned i = u_bit_scan(&mask);
1362 if (views->views[i]->texture == buf) {
1363 si_desc_reset_buffer_offset(ctx,
1364 views->desc.list +
1365 i * 16 + 4,
1366 old_va, buf);
1367 views->desc.dirty_mask |= 1u << i;
1368
1369 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1370 rbuffer, RADEON_USAGE_READ,
1371 RADEON_PRIO_SAMPLER_BUFFER);
1372 }
1373 }
1374 }
1375
1376 /* Shader images */
1377 for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
1378 struct si_images_info *images = &sctx->images[shader];
1379 unsigned mask = images->desc.enabled_mask;
1380
1381 while (mask) {
1382 unsigned i = u_bit_scan(&mask);
1383
1384 if (images->views[i].resource == buf) {
1385 if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
1386 si_mark_image_range_valid(&images->views[i]);
1387
1388 si_desc_reset_buffer_offset(
1389 ctx, images->desc.list + i * 8 + 4,
1390 old_va, buf);
1391 images->desc.dirty_mask |= 1u << i;
1392
1393 radeon_add_to_buffer_list(
1394 &sctx->b, &sctx->b.gfx, rbuffer,
1395 RADEON_USAGE_READWRITE,
1396 RADEON_PRIO_SAMPLER_BUFFER);
1397 }
1398 }
1399 }
1400 }
1401
1402 /* Update mutable image descriptor fields of all bound textures. */
1403 void si_update_all_texture_descriptors(struct si_context *sctx)
1404 {
1405 unsigned shader;
1406
1407 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1408 struct si_sampler_views *samplers = &sctx->samplers[shader].views;
1409 struct si_images_info *images = &sctx->images[shader];
1410 unsigned mask;
1411
1412 /* Images. */
1413 mask = images->desc.enabled_mask;
1414 while (mask) {
1415 unsigned i = u_bit_scan(&mask);
1416 struct pipe_image_view *view = &images->views[i];
1417
1418 if (!view->resource ||
1419 view->resource->target == PIPE_BUFFER)
1420 continue;
1421
1422 si_set_shader_image(sctx, images, i, view);
1423 }
1424
1425 /* Sampler views. */
1426 mask = samplers->desc.enabled_mask;
1427 while (mask) {
1428 unsigned i = u_bit_scan(&mask);
1429 struct pipe_sampler_view *view = samplers->views[i];
1430
1431 if (!view ||
1432 !view->texture ||
1433 view->texture->target == PIPE_BUFFER)
1434 continue;
1435
1436 si_set_sampler_view(sctx, samplers, i,
1437 samplers->views[i], true);
1438 }
1439 }
1440 }
1441
1442 /* SHADER USER DATA */
1443
1444 static void si_mark_shader_pointers_dirty(struct si_context *sctx,
1445 unsigned shader)
1446 {
1447 sctx->const_buffers[shader].desc.pointer_dirty = true;
1448 sctx->shader_buffers[shader].desc.pointer_dirty = true;
1449 sctx->samplers[shader].views.desc.pointer_dirty = true;
1450 sctx->images[shader].desc.pointer_dirty = true;
1451
1452 if (shader == PIPE_SHADER_VERTEX)
1453 sctx->vertex_buffers.pointer_dirty = true;
1454
1455 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
1456 }
1457
1458 static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
1459 {
1460 int i;
1461
1462 for (i = 0; i < SI_NUM_SHADERS; i++) {
1463 si_mark_shader_pointers_dirty(sctx, i);
1464 }
1465 sctx->rw_buffers.desc.pointer_dirty = true;
1466 }
1467
1468 /* Set a base register address for user data constants in the given shader.
1469 * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
1470 */
1471 static void si_set_user_data_base(struct si_context *sctx,
1472 unsigned shader, uint32_t new_base)
1473 {
1474 uint32_t *base = &sctx->shader_userdata.sh_base[shader];
1475
1476 if (*base != new_base) {
1477 *base = new_base;
1478
1479 if (new_base)
1480 si_mark_shader_pointers_dirty(sctx, shader);
1481 }
1482 }
1483
1484 /* This must be called when these shaders are changed from non-NULL to NULL
1485 * and vice versa:
1486 * - geometry shader
1487 * - tessellation control shader
1488 * - tessellation evaluation shader
1489 */
1490 void si_shader_change_notify(struct si_context *sctx)
1491 {
1492 /* VS can be bound as VS, ES, or LS. */
1493 if (sctx->tes_shader.cso)
1494 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
1495 R_00B530_SPI_SHADER_USER_DATA_LS_0);
1496 else if (sctx->gs_shader.cso)
1497 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
1498 R_00B330_SPI_SHADER_USER_DATA_ES_0);
1499 else
1500 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
1501 R_00B130_SPI_SHADER_USER_DATA_VS_0);
1502
1503 /* TES can be bound as ES, VS, or not bound. */
1504 if (sctx->tes_shader.cso) {
1505 if (sctx->gs_shader.cso)
1506 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
1507 R_00B330_SPI_SHADER_USER_DATA_ES_0);
1508 else
1509 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
1510 R_00B130_SPI_SHADER_USER_DATA_VS_0);
1511 } else {
1512 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
1513 }
1514 }
1515
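/* Emit a SET_SH_REG packet that writes the 64-bit GPU address of the
 * descriptor list (buffer + offset) into the pair of user-data SGPRs selected
 * by sh_base + shader_userdata_offset.
 */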
1516 static void si_emit_shader_pointer(struct si_context *sctx,
1517 struct si_descriptors *desc,
1518 unsigned sh_base, bool keep_dirty)
1519 {
1520 struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
1521 uint64_t va;
1522
1523 if (!desc->pointer_dirty || !desc->buffer)
1524 return;
1525
1526 va = desc->buffer->gpu_address +
1527 desc->buffer_offset;
1528
1529 radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
1530 radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
1531 radeon_emit(cs, va);
1532 radeon_emit(cs, va >> 32);
1533
1534 desc->pointer_dirty = keep_dirty;
1535 }
1536
1537 void si_emit_graphics_shader_userdata(struct si_context *sctx,
1538 struct r600_atom *atom)
1539 {
1540 unsigned i;
1541 uint32_t *sh_base = sctx->shader_userdata.sh_base;
1542
1543 if (sctx->rw_buffers.desc.pointer_dirty) {
1544 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1545 R_00B030_SPI_SHADER_USER_DATA_PS_0, true);
1546 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1547 R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
1548 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1549 R_00B230_SPI_SHADER_USER_DATA_GS_0, true);
1550 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1551 R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
1552 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1553 R_00B430_SPI_SHADER_USER_DATA_HS_0, true);
1554 sctx->rw_buffers.desc.pointer_dirty = false;
1555 }
1556
1557 for (i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
1558 unsigned base = sh_base[i];
1559
1560 if (!base)
1561 continue;
1562
1563 si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
1564 si_emit_shader_pointer(sctx, &sctx->shader_buffers[i].desc, base, false);
1565 si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
1566 si_emit_shader_pointer(sctx, &sctx->images[i].desc, base, false);
1567 }
1568 si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
1569 }
1570
1571 void si_emit_compute_shader_userdata(struct si_context *sctx)
1572 {
1573 unsigned base = R_00B900_COMPUTE_USER_DATA_0;
1574
1575 si_emit_shader_pointer(sctx, &sctx->const_buffers[PIPE_SHADER_COMPUTE].desc,
1576 base, false);
1577 si_emit_shader_pointer(sctx, &sctx->shader_buffers[PIPE_SHADER_COMPUTE].desc,
1578 base, false);
1579 si_emit_shader_pointer(sctx, &sctx->samplers[PIPE_SHADER_COMPUTE].views.desc,
1580 base, false);
1581 si_emit_shader_pointer(sctx, &sctx->images[PIPE_SHADER_COMPUTE].desc,
1582 base, false);
1583 }
1584
1585 /* INIT/DEINIT/UPLOAD */
1586
1587 void si_init_all_descriptors(struct si_context *sctx)
1588 {
1589 int i;
1590 unsigned ce_offset = 0;
1591
1592 for (i = 0; i < SI_NUM_SHADERS; i++) {
1593 si_init_buffer_resources(&sctx->const_buffers[i],
1594 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
1595 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER,
1596 &ce_offset);
1597 si_init_buffer_resources(&sctx->shader_buffers[i],
1598 SI_NUM_SHADER_BUFFERS, SI_SGPR_SHADER_BUFFERS,
1599 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER,
1600 &ce_offset);
1601
1602 si_init_descriptors(&sctx->samplers[i].views.desc,
1603 SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
1604 null_texture_descriptor, &ce_offset);
1605
1606 si_init_descriptors(&sctx->images[i].desc,
1607 SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
1608 null_image_descriptor, &ce_offset);
1609 }
1610
1611 si_init_buffer_resources(&sctx->rw_buffers,
1612 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
1613 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT,
1614 &ce_offset);
1615 si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
1616 4, SI_NUM_VERTEX_BUFFERS, NULL, NULL);
1617
1618 assert(ce_offset <= 32768);
1619
1620 /* Set pipe_context functions. */
1621 sctx->b.b.bind_sampler_states = si_bind_sampler_states;
1622 sctx->b.b.set_shader_images = si_set_shader_images;
1623 sctx->b.b.set_constant_buffer = si_pipe_set_constant_buffer;
1624 sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
1625 sctx->b.b.set_shader_buffers = si_set_shader_buffers;
1626 sctx->b.b.set_sampler_views = si_set_sampler_views;
1627 sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
1628 sctx->b.invalidate_buffer = si_invalidate_buffer;
1629
1630 /* Shader user data. */
1631 si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
1632 si_emit_graphics_shader_userdata);
1633
1634 /* Set default and immutable mappings. */
1635 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
1636 si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
1637 si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
1638 si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
1639 }
1640
1641 bool si_upload_graphics_shader_descriptors(struct si_context *sctx)
1642 {
1643 int i;
1644
1645 for (i = 0; i < SI_NUM_SHADERS; i++) {
1646 if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc,
1647 &sctx->shader_userdata.atom) ||
1648 !si_upload_descriptors(sctx, &sctx->shader_buffers[i].desc,
1649 &sctx->shader_userdata.atom) ||
1650 !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc,
1651 &sctx->shader_userdata.atom) ||
1652 !si_upload_descriptors(sctx, &sctx->images[i].desc,
1653 &sctx->shader_userdata.atom))
1654 return false;
1655 }
1656 return si_upload_descriptors(sctx, &sctx->rw_buffers.desc,
1657 &sctx->shader_userdata.atom) &&
1658 si_upload_vertex_buffer_descriptors(sctx);
1659 }
1660
1661 bool si_upload_compute_shader_descriptors(struct si_context *sctx)
1662 {
1663 /* Does not update rw_buffers as that is not needed for compute shaders
1664 * and the input buffer uses the same SGPRs anyway.
1665 */
1666 return si_upload_descriptors(sctx,
1667 &sctx->const_buffers[PIPE_SHADER_COMPUTE].desc, NULL) &&
1668 si_upload_descriptors(sctx,
1669 &sctx->shader_buffers[PIPE_SHADER_COMPUTE].desc, NULL) &&
1670 si_upload_descriptors(sctx,
1671 &sctx->samplers[PIPE_SHADER_COMPUTE].views.desc, NULL) &&
1672 si_upload_descriptors(sctx,
1673 &sctx->images[PIPE_SHADER_COMPUTE].desc, NULL);
1674 }
1675
1676 void si_release_all_descriptors(struct si_context *sctx)
1677 {
1678 int i;
1679
1680 for (i = 0; i < SI_NUM_SHADERS; i++) {
1681 si_release_buffer_resources(&sctx->const_buffers[i]);
1682 si_release_buffer_resources(&sctx->shader_buffers[i]);
1683 si_release_sampler_views(&sctx->samplers[i].views);
1684 si_release_image_views(&sctx->images[i]);
1685 }
1686 si_release_buffer_resources(&sctx->rw_buffers);
1687 si_release_descriptors(&sctx->vertex_buffers);
1688 }
1689
1690 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
1691 {
1692 int i;
1693
1694 for (i = 0; i < SI_NUM_SHADERS; i++) {
1695 si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
1696 si_buffer_resources_begin_new_cs(sctx, &sctx->shader_buffers[i]);
1697 si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
1698 si_image_views_begin_new_cs(sctx, &sctx->images[i]);
1699 }
1700 si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
1701 si_vertex_buffers_begin_new_cs(sctx);
1702 si_shader_userdata_begin_new_cs(sctx);
1703 }