radeonsi: access descriptor sets via local variables
mesa.git / src/gallium/drivers/radeonsi/si_descriptors.c
1 /*
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Marek Olšák <marek.olsak@amd.com>
25 */
26
27 /* Resource binding slots and sampler states (each described with 8 or
28 * 4 dwords) are stored in lists in memory which is accessed by shaders
29 * using scalar load instructions.
30 *
31 * This file is responsible for managing such lists. It keeps a copy of all
32 * descriptors in CPU memory and re-uploads a whole list if some slots have
33 * been changed.
34 *
35 * This code is also responsible for updating shader pointers to those lists.
36 *
37 * Note that CP DMA can't be used to update the lists, because a GPU hang
38 * could leave the list in a mid-IB state, the next IB would then read wrong
39 * descriptors, and the whole context would be unusable from that point on.
40 * (Register shadowing can't be used for the same reason.)
41 *
42 * Also, uploading descriptors to newly allocated memory doesn't require
43 * a KCACHE flush.
44 *
45 *
46 * Possible scenarios for one 16 dword image+sampler slot:
47 *
48 * | Image | w/ FMASK | Buffer | NULL
49 * [ 0: 3] Image[0:3] | Image[0:3] | Null[0:3] | Null[0:3]
50 * [ 4: 7] Image[4:7] | Image[4:7] | Buffer[0:3] | 0
51 * [ 8:11] Null[0:3] | Fmask[0:3] | Null[0:3] | Null[0:3]
52 * [12:15] Sampler[0:3] | Fmask[4:7] | Sampler[0:3] | Sampler[0:3]
53 *
54 * FMASK implies MSAA, therefore no sampler state.
55 * Sampler states are never unbound except when FMASK is bound.
56 */
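/* For example, on the CPU side one combined image+sampler slot is addressed
 * roughly as (slot_index is an illustrative name):
 *
 *    uint32_t *slot = desc->list + slot_index * 16;
 *
 * with the sampler state, when present, stored at slot + 12, mirroring the
 * table above (see si_set_sampler_view and si_bind_sampler_states below).
 */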
57
58 #include "radeon/r600_cs.h"
59 #include "si_pipe.h"
60 #include "si_shader.h"
61 #include "sid.h"
62
63 #include "util/u_format.h"
64 #include "util/u_math.h"
65 #include "util/u_memory.h"
66 #include "util/u_suballoc.h"
67 #include "util/u_upload_mgr.h"
68
69
70 /* NULL image and buffer descriptor for textures (alpha = 1) and images
71 * (alpha = 0).
72 *
73 * For images, all fields must be zero except for the swizzle, which
74 * supports arbitrary combinations of 0s and 1s. The texture type must be
75 * a valid type (e.g. 1D); if the type isn't set at all, the hw hangs.
76 *
77 * For buffers, all fields must be zero. If they are not, the hw hangs.
78 *
79 * This is the only reason why the buffer descriptor must be in words [4:7].
80 */
81 static uint32_t null_texture_descriptor[8] = {
82 0,
83 0,
84 0,
85 S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
86 S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
87 /* the rest must contain zeros; the buffer descriptor in dwords [4:7]
88 * relies on this too */
89 };
90
91 static uint32_t null_image_descriptor[8] = {
92 0,
93 0,
94 0,
95 S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
96 /* the rest must contain zeros; the buffer descriptor in dwords [4:7]
97 * relies on this too */
98 };
99
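/* Initialize a descriptor list:
 * - element_dw_size: size of one slot in dwords
 * - num_elements: number of slots in the list
 * - null_descriptor: optional template that all slots are initialized with
 * - ce_offset: if non-NULL, the list is also assigned a range of CE RAM
 *   starting at *ce_offset, and *ce_offset is advanced past that range
 */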
100 static void si_init_descriptors(struct si_descriptors *desc,
101 unsigned shader_userdata_index,
102 unsigned element_dw_size,
103 unsigned num_elements,
104 const uint32_t *null_descriptor,
105 unsigned *ce_offset)
106 {
107 int i;
108
109 assert(num_elements <= sizeof(desc->dirty_mask)*8);
110
111 desc->list = CALLOC(num_elements, element_dw_size * 4);
112 desc->element_dw_size = element_dw_size;
113 desc->num_elements = num_elements;
114 desc->dirty_mask = num_elements == 32 ? ~0u : (1u << num_elements) - 1;
115 desc->shader_userdata_offset = shader_userdata_index * 4;
116
117 if (ce_offset) {
118 desc->ce_offset = *ce_offset;
119
120 /* make sure that ce_offset stays 32 byte aligned */
121 *ce_offset += align(element_dw_size * num_elements * 4, 32);
122 }
123
124 /* Initialize the array to NULL descriptors (the element size must be a multiple of 8 dwords). */
125 if (null_descriptor) {
126 assert(element_dw_size % 8 == 0);
127 for (i = 0; i < num_elements * element_dw_size / 8; i++)
128 memcpy(desc->list + i * 8, null_descriptor,
129 8 * 4);
130 }
131 }
132
133 static void si_release_descriptors(struct si_descriptors *desc)
134 {
135 pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
136 FREE(desc->list);
137 }
138
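/* Dump 'size' bytes of CE RAM starting at 'ce_offset' into a freshly
 * suballocated buffer, so that a shader pointer can be set to point at it.
 * Returns false if the suballocation fails.
 */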
139 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
140 unsigned *out_offset, struct r600_resource **out_buf) {
141 uint64_t va;
142
143 u_suballocator_alloc(sctx->ce_suballocator, size, 64, out_offset,
144 (struct pipe_resource**)out_buf);
145 if (!*out_buf)
146 return false;
147
148 va = (*out_buf)->gpu_address + *out_offset;
149
150 radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
151 radeon_emit(sctx->ce_ib, ce_offset);
152 radeon_emit(sctx->ce_ib, size / 4);
153 radeon_emit(sctx->ce_ib, va);
154 radeon_emit(sctx->ce_ib, va >> 32);
155
156 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
157 RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
158
159 sctx->ce_need_synchronization = true;
160 return true;
161 }
162
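/* Reload a descriptor list from its last uploaded buffer back into CE RAM.
 * This is done lazily after a new command stream has marked the CE RAM
 * contents as stale (see si_descriptors_begin_new_cs).
 */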
163 static void si_reinitialize_ce_ram(struct si_context *sctx,
164 struct si_descriptors *desc)
165 {
166 if (desc->buffer) {
167 struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
168 unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
169 uint64_t va = buffer->gpu_address + desc->buffer_offset;
170 struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
171
172 if (!ib)
173 ib = sctx->ce_ib;
174
175 list_size = align(list_size, 32);
176
177 radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
178 radeon_emit(ib, va);
179 radeon_emit(ib, va >> 32);
180 radeon_emit(ib, list_size / 4);
181 radeon_emit(ib, desc->ce_offset);
182
183 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
184 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
185 }
186 desc->ce_ram_dirty = false;
187 }
188
189 void si_ce_enable_loads(struct radeon_winsys_cs *ib)
190 {
191 radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
192 radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
193 CONTEXT_CONTROL_LOAD_CE_RAM(1));
194 radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
195 }
196
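/* Upload the dirty slots of a descriptor list. With a CE IB, only the dirty
 * ranges are written into CE RAM and the whole list is then dumped to a new
 * buffer; without CE, the whole list is copied through u_upload_mgr. Either
 * way, the shader pointer has to be re-emitted afterwards.
 */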
197 static bool si_upload_descriptors(struct si_context *sctx,
198 struct si_descriptors *desc,
199 struct r600_atom * atom)
200 {
201 unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
202
203 if (!desc->dirty_mask)
204 return true;
205
206 if (sctx->ce_ib) {
207 uint32_t const* list = (uint32_t const*)desc->list;
208
209 if (desc->ce_ram_dirty)
210 si_reinitialize_ce_ram(sctx, desc);
211
212 while (desc->dirty_mask) {
213 int begin, count;
214 u_bit_scan_consecutive_range(&desc->dirty_mask, &begin,
215 &count);
216
217 begin *= desc->element_dw_size;
218 count *= desc->element_dw_size;
219
220 radeon_emit(sctx->ce_ib,
221 PKT3(PKT3_WRITE_CONST_RAM, count, 0));
222 radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
223 radeon_emit_array(sctx->ce_ib, list + begin, count);
224 }
225
226 if (!si_ce_upload(sctx, desc->ce_offset, list_size,
227 &desc->buffer_offset, &desc->buffer))
228 return false;
229 } else {
230 void *ptr;
231
232 u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
233 &desc->buffer_offset,
234 (struct pipe_resource**)&desc->buffer, &ptr);
235 if (!desc->buffer)
236 return false; /* skip the draw call */
237
238 util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
239
240 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
241 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
242 }
243 desc->pointer_dirty = true;
244 desc->dirty_mask = 0;
245
246 if (atom)
247 si_mark_atom_dirty(sctx, atom);
248
249 return true;
250 }
251
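/* Per-list state reset for a new command stream: the CE RAM copy becomes
 * stale and the current upload buffer has to be re-added to the new CS.
 */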
252 static void
253 si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
254 {
255 desc->ce_ram_dirty = true;
256
257 if (!desc->buffer)
258 return;
259
260 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
261 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
262 }
263
264 /* SAMPLER VIEWS */
265
266 static void si_release_sampler_views(struct si_sampler_views *views)
267 {
268 int i;
269
270 for (i = 0; i < ARRAY_SIZE(views->views); i++) {
271 pipe_sampler_view_reference(&views->views[i], NULL);
272 }
273 si_release_descriptors(&views->desc);
274 }
275
276 static void si_sampler_view_add_buffer(struct si_context *sctx,
277 struct pipe_resource *resource,
278 enum radeon_bo_usage usage)
279 {
280 struct r600_resource *rres = (struct r600_resource*)resource;
281
282 if (!resource)
283 return;
284
285 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rres, usage,
286 r600_get_sampler_view_priority(rres));
287 }
288
289 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
290 struct si_sampler_views *views)
291 {
292 unsigned mask = views->enabled_mask;
293
294 /* Add buffers to the CS. */
295 while (mask) {
296 int i = u_bit_scan(&mask);
297
298 si_sampler_view_add_buffer(sctx, views->views[i]->texture,
299 RADEON_USAGE_READ);
300 }
301
302 si_descriptors_begin_new_cs(sctx, &views->desc);
303 }
304
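/* Patch the fields of an image descriptor that depend on where the texture
 * currently lives in memory: base address, tiling index, pitch and the DCC
 * metadata address/enable. All other fields are left untouched.
 */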
305 void si_set_mutable_tex_desc_fields(struct r600_texture *tex,
306 const struct radeon_surf_level *base_level_info,
307 unsigned base_level, unsigned block_width,
308 bool is_stencil, uint32_t *state)
309 {
310 uint64_t va = tex->resource.gpu_address + base_level_info->offset;
311 unsigned pitch = base_level_info->nblk_x * block_width;
312
313 state[1] &= C_008F14_BASE_ADDRESS_HI;
314 state[3] &= C_008F1C_TILING_INDEX;
315 state[4] &= C_008F20_PITCH;
316 state[6] &= C_008F28_COMPRESSION_EN;
317
318 state[0] = va >> 8;
319 state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
320 state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level,
321 is_stencil));
322 state[4] |= S_008F20_PITCH(pitch - 1);
323
324 if (tex->dcc_offset) {
325 state[6] |= S_008F28_COMPRESSION_EN(1);
326 state[7] = (tex->resource.gpu_address +
327 tex->dcc_offset +
328 base_level_info->dcc_offset) >> 8;
329 }
330 }
331
332 static void si_set_sampler_view(struct si_context *sctx,
333 unsigned shader,
334 unsigned slot, struct pipe_sampler_view *view,
335 bool disallow_early_out)
336 {
337 struct si_sampler_views *views = &sctx->samplers[shader].views;
338 struct si_sampler_view *rview = (struct si_sampler_view*)view;
339 struct si_descriptors *descs = &views->desc;
340
341 if (views->views[slot] == view && !disallow_early_out)
342 return;
343
344 if (view) {
345 struct r600_texture *rtex = (struct r600_texture *)view->texture;
346 uint32_t *desc = descs->list + slot * 16;
347
348 si_sampler_view_add_buffer(sctx, view->texture,
349 RADEON_USAGE_READ);
350
351 pipe_sampler_view_reference(&views->views[slot], view);
352 memcpy(desc, rview->state, 8*4);
353
354 if (view->texture && view->texture->target != PIPE_BUFFER) {
355 bool is_separate_stencil =
356 rtex->is_depth && !rtex->is_flushing_texture &&
357 rview->is_stencil_sampler;
358
359 si_set_mutable_tex_desc_fields(rtex,
360 rview->base_level_info,
361 rview->base_level,
362 rview->block_width,
363 is_separate_stencil,
364 desc);
365 }
366
367 if (view->texture && view->texture->target != PIPE_BUFFER &&
368 rtex->fmask.size) {
369 memcpy(desc + 8,
370 rview->fmask_state, 8*4);
371 } else {
372 /* Disable FMASK and bind sampler state in [12:15]. */
373 memcpy(desc + 8,
374 null_texture_descriptor, 4*4);
375
376 if (views->sampler_states[slot])
377 memcpy(desc + 12,
378 views->sampler_states[slot], 4*4);
379 }
380
381 views->enabled_mask |= 1u << slot;
382 } else {
383 pipe_sampler_view_reference(&views->views[slot], NULL);
384 memcpy(descs->list + slot*16, null_texture_descriptor, 8*4);
385 /* Only clear the lower dwords of FMASK. */
386 memcpy(descs->list + slot*16 + 8, null_texture_descriptor, 4*4);
387 views->enabled_mask &= ~(1u << slot);
388 }
389
390 descs->dirty_mask |= 1u << slot;
391 }
392
393 static bool is_compressed_colortex(struct r600_texture *rtex)
394 {
395 return rtex->cmask.size || rtex->fmask.size ||
396 (rtex->dcc_offset && rtex->dirty_level_mask);
397 }
398
399 static void si_set_sampler_views(struct pipe_context *ctx,
400 unsigned shader, unsigned start,
401 unsigned count,
402 struct pipe_sampler_view **views)
403 {
404 struct si_context *sctx = (struct si_context *)ctx;
405 struct si_textures_info *samplers = &sctx->samplers[shader];
406 int i;
407
408 if (!count || shader >= SI_NUM_SHADERS)
409 return;
410
411 for (i = 0; i < count; i++) {
412 unsigned slot = start + i;
413
414 if (!views || !views[i]) {
415 samplers->depth_texture_mask &= ~(1u << slot);
416 samplers->compressed_colortex_mask &= ~(1u << slot);
417 si_set_sampler_view(sctx, shader, slot, NULL, false);
418 continue;
419 }
420
421 si_set_sampler_view(sctx, shader, slot, views[i], false);
422
423 if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
424 struct r600_texture *rtex =
425 (struct r600_texture*)views[i]->texture;
426
427 if (rtex->is_depth && !rtex->is_flushing_texture) {
428 samplers->depth_texture_mask |= 1u << slot;
429 } else {
430 samplers->depth_texture_mask &= ~(1u << slot);
431 }
432 if (is_compressed_colortex(rtex)) {
433 samplers->compressed_colortex_mask |= 1u << slot;
434 } else {
435 samplers->compressed_colortex_mask &= ~(1u << slot);
436 }
437
438 if (rtex->dcc_offset &&
439 p_atomic_read(&rtex->framebuffers_bound))
440 sctx->need_check_render_feedback = true;
441 } else {
442 samplers->depth_texture_mask &= ~(1u << slot);
443 samplers->compressed_colortex_mask &= ~(1u << slot);
444 }
445 }
446 }
447
448 static void
449 si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
450 {
451 unsigned mask = samplers->views.enabled_mask;
452
453 while (mask) {
454 int i = u_bit_scan(&mask);
455 struct pipe_resource *res = samplers->views.views[i]->texture;
456
457 if (res && res->target != PIPE_BUFFER) {
458 struct r600_texture *rtex = (struct r600_texture *)res;
459
460 if (is_compressed_colortex(rtex)) {
461 samplers->compressed_colortex_mask |= 1u << i;
462 } else {
463 samplers->compressed_colortex_mask &= ~(1u << i);
464 }
465 }
466 }
467 }
468
469 /* IMAGE VIEWS */
470
471 static void
472 si_release_image_views(struct si_images_info *images)
473 {
474 unsigned i;
475
476 for (i = 0; i < SI_NUM_IMAGES; ++i) {
477 struct pipe_image_view *view = &images->views[i];
478
479 pipe_resource_reference(&view->resource, NULL);
480 }
481
482 si_release_descriptors(&images->desc);
483 }
484
485 static void
486 si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images)
487 {
488 uint mask = images->enabled_mask;
489
490 /* Add buffers to the CS. */
491 while (mask) {
492 int i = u_bit_scan(&mask);
493 struct pipe_image_view *view = &images->views[i];
494
495 assert(view->resource);
496
497 si_sampler_view_add_buffer(sctx, view->resource,
498 RADEON_USAGE_READWRITE);
499 }
500
501 si_descriptors_begin_new_cs(sctx, &images->desc);
502 }
503
504 static void
505 si_disable_shader_image(struct si_images_info *images, unsigned slot)
506 {
507 if (images->enabled_mask & (1u << slot)) {
508 pipe_resource_reference(&images->views[slot].resource, NULL);
509 images->compressed_colortex_mask &= ~(1 << slot);
510
511 memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4);
512 images->enabled_mask &= ~(1u << slot);
513 images->desc.dirty_mask |= 1u << slot;
514 }
515 }
516
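/* Conservatively mark the bound element range of a writable buffer image as
 * containing valid data, since shaders may write to it.
 */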
517 static void
518 si_mark_image_range_valid(struct pipe_image_view *view)
519 {
520 struct r600_resource *res = (struct r600_resource *)view->resource;
521 const struct util_format_description *desc;
522 unsigned stride;
523
524 assert(res && res->b.b.target == PIPE_BUFFER);
525
526 desc = util_format_description(view->format);
527 stride = desc->block.bits / 8;
528
529 util_range_add(&res->valid_buffer_range,
530 stride * (view->u.buf.first_element),
531 stride * (view->u.buf.last_element + 1));
532 }
533
534 static void si_set_shader_image(struct si_context *ctx,
535 unsigned shader,
536 unsigned slot, struct pipe_image_view *view)
537 {
538 struct si_screen *screen = ctx->screen;
539 struct si_images_info *images = &ctx->images[shader];
540 struct si_descriptors *descs = &images->desc;
541 struct r600_resource *res;
542
543 if (!view || !view->resource) {
544 si_disable_shader_image(images, slot);
545 return;
546 }
547
548 res = (struct r600_resource *)view->resource;
549
550 if (&images->views[slot] != view)
551 util_copy_image_view(&images->views[slot], view);
552
553 si_sampler_view_add_buffer(ctx, &res->b.b,
554 RADEON_USAGE_READWRITE);
555
556 if (res->b.b.target == PIPE_BUFFER) {
557 if (view->access & PIPE_IMAGE_ACCESS_WRITE)
558 si_mark_image_range_valid(view);
559
560 si_make_buffer_descriptor(screen, res,
561 view->format,
562 view->u.buf.first_element,
563 view->u.buf.last_element,
564 descs->list + slot * 8);
565 images->compressed_colortex_mask &= ~(1 << slot);
566 } else {
567 static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
568 struct r600_texture *tex = (struct r600_texture *)res;
569 unsigned level;
570 unsigned width, height, depth;
571 uint32_t *desc = descs->list + slot * 8;
572
573 assert(!tex->is_depth);
574 assert(tex->fmask.size == 0);
575
576 if (tex->dcc_offset &&
577 view->access & PIPE_IMAGE_ACCESS_WRITE) {
578 /* If DCC can't be disabled, at least decompress it.
579 * The decompression is relatively cheap if the surface
580 * has been decompressed already.
581 */
582 if (!r600_texture_disable_dcc(&screen->b, tex))
583 ctx->b.decompress_dcc(&ctx->b.b, tex);
584 }
585
586 if (is_compressed_colortex(tex)) {
587 images->compressed_colortex_mask |= 1 << slot;
588 } else {
589 images->compressed_colortex_mask &= ~(1 << slot);
590 }
591
592 if (tex->dcc_offset &&
593 p_atomic_read(&tex->framebuffers_bound))
594 ctx->need_check_render_feedback = true;
595
596 /* Always force the base level to the selected level.
597 *
598 * This is required for 3D textures, where otherwise
599 * selecting a single slice for non-layered bindings
600 * fails. It doesn't hurt the other targets.
601 */
602 level = view->u.tex.level;
603 width = u_minify(res->b.b.width0, level);
604 height = u_minify(res->b.b.height0, level);
605 depth = u_minify(res->b.b.depth0, level);
606
607 si_make_texture_descriptor(screen, tex,
608 false, res->b.b.target,
609 view->format, swizzle,
610 0, 0,
611 view->u.tex.first_layer,
612 view->u.tex.last_layer,
613 width, height, depth,
614 desc, NULL);
615 si_set_mutable_tex_desc_fields(tex, &tex->surface.level[level], level,
616 util_format_get_blockwidth(view->format),
617 false, desc);
618 }
619
620 images->enabled_mask |= 1u << slot;
621 descs->dirty_mask |= 1u << slot;
622 }
623
624 static void
625 si_set_shader_images(struct pipe_context *pipe, unsigned shader,
626 unsigned start_slot, unsigned count,
627 struct pipe_image_view *views)
628 {
629 struct si_context *ctx = (struct si_context *)pipe;
630 unsigned i, slot;
631
632 assert(shader < SI_NUM_SHADERS);
633
634 if (!count)
635 return;
636
637 assert(start_slot + count <= SI_NUM_IMAGES);
638
639 if (views) {
640 for (i = 0, slot = start_slot; i < count; ++i, ++slot)
641 si_set_shader_image(ctx, shader, slot, &views[i]);
642 } else {
643 for (i = 0, slot = start_slot; i < count; ++i, ++slot)
644 si_set_shader_image(ctx, shader, slot, NULL);
645 }
646 }
647
648 static void
649 si_images_update_compressed_colortex_mask(struct si_images_info *images)
650 {
651 unsigned mask = images->enabled_mask;
652
653 while (mask) {
654 int i = u_bit_scan(&mask);
655 struct pipe_resource *res = images->views[i].resource;
656
657 if (res && res->target != PIPE_BUFFER) {
658 struct r600_texture *rtex = (struct r600_texture *)res;
659
660 if (is_compressed_colortex(rtex)) {
661 images->compressed_colortex_mask |= 1 << i;
662 } else {
663 images->compressed_colortex_mask &= ~(1 << i);
664 }
665 }
666 }
667 }
668
669 /* SAMPLER STATES */
670
671 static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
672 unsigned start, unsigned count, void **states)
673 {
674 struct si_context *sctx = (struct si_context *)ctx;
675 struct si_textures_info *samplers = &sctx->samplers[shader];
676 struct si_descriptors *desc = &samplers->views.desc;
677 struct si_sampler_state **sstates = (struct si_sampler_state**)states;
678 int i;
679
680 if (!count || shader >= SI_NUM_SHADERS)
681 return;
682
683 for (i = 0; i < count; i++) {
684 unsigned slot = start + i;
685
686 if (!sstates[i] ||
687 sstates[i] == samplers->views.sampler_states[slot])
688 continue;
689
690 samplers->views.sampler_states[slot] = sstates[i];
691
692 /* If FMASK is bound, don't overwrite it.
693 * The sampler state will be set after FMASK is unbound.
694 */
695 if (samplers->views.views[slot] &&
696 samplers->views.views[slot]->texture &&
697 samplers->views.views[slot]->texture->target != PIPE_BUFFER &&
698 ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size)
699 continue;
700
701 memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
702 desc->dirty_mask |= 1u << slot;
703 }
704 }
705
706 /* BUFFER RESOURCES */
707
708 static void si_init_buffer_resources(struct si_buffer_resources *buffers,
709 unsigned num_buffers,
710 unsigned shader_userdata_index,
711 enum radeon_bo_usage shader_usage,
712 enum radeon_bo_priority priority,
713 unsigned *ce_offset)
714 {
715 buffers->shader_usage = shader_usage;
716 buffers->priority = priority;
717 buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
718
719 si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
720 num_buffers, NULL, ce_offset);
721 }
722
723 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
724 {
725 int i;
726
727 for (i = 0; i < buffers->desc.num_elements; i++) {
728 pipe_resource_reference(&buffers->buffers[i], NULL);
729 }
730
731 FREE(buffers->buffers);
732 si_release_descriptors(&buffers->desc);
733 }
734
735 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
736 struct si_buffer_resources *buffers)
737 {
738 unsigned mask = buffers->enabled_mask;
739
740 /* Add buffers to the CS. */
741 while (mask) {
742 int i = u_bit_scan(&mask);
743
744 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
745 (struct r600_resource*)buffers->buffers[i],
746 buffers->shader_usage, buffers->priority);
747 }
748
749 si_descriptors_begin_new_cs(sctx, &buffers->desc);
750 }
751
752 /* VERTEX BUFFERS */
753
754 static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
755 {
756 struct si_descriptors *desc = &sctx->vertex_buffers;
757 int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
758 int i;
759
760 for (i = 0; i < count; i++) {
761 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
762
763 if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
764 continue;
765 if (!sctx->vertex_buffer[vb].buffer)
766 continue;
767
768 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
769 (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
770 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
771 }
772
773 if (!desc->buffer)
774 return;
775 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
776 desc->buffer, RADEON_USAGE_READ,
777 RADEON_PRIO_DESCRIPTORS);
778 }
779
780 static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
781 {
782 struct si_descriptors *desc = &sctx->vertex_buffers;
783 bool bound[SI_NUM_VERTEX_BUFFERS] = {};
784 unsigned i, count;
785 uint64_t va;
786 uint32_t *ptr;
787 
788 if (!sctx->vertex_buffers_dirty || !sctx->vertex_elements ||
789 !sctx->vertex_elements->count)
790 return true;
791 count = sctx->vertex_elements->count;
792
793 /* Vertex buffer descriptors are the only ones which are uploaded
794 * directly through a staging buffer and don't go through
795 * the fine-grained upload path.
796 */
797 u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
798 (struct pipe_resource**)&desc->buffer, (void**)&ptr);
799 if (!desc->buffer)
800 return false;
801
802 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
803 desc->buffer, RADEON_USAGE_READ,
804 RADEON_PRIO_DESCRIPTORS);
805
806 assert(count <= SI_NUM_VERTEX_BUFFERS);
807
808 for (i = 0; i < count; i++) {
809 struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
810 struct pipe_vertex_buffer *vb;
811 struct r600_resource *rbuffer;
812 unsigned offset;
813 uint32_t *desc = &ptr[i*4];
814
815 if (ve->vertex_buffer_index >= ARRAY_SIZE(sctx->vertex_buffer)) {
816 memset(desc, 0, 16);
817 continue;
818 }
819
820 vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
821 rbuffer = (struct r600_resource*)vb->buffer;
822 if (!rbuffer) {
823 memset(desc, 0, 16);
824 continue;
825 }
826
827 offset = vb->buffer_offset + ve->src_offset;
828 va = rbuffer->gpu_address + offset;
829
830 /* Fill in T# buffer resource description */
831 desc[0] = va;
832 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
833 S_008F04_STRIDE(vb->stride);
834
835 if (sctx->b.chip_class <= CIK && vb->stride)
836 /* Round up by rounding down and adding 1 */
837 desc[2] = (vb->buffer->width0 - offset -
838 sctx->vertex_elements->format_size[i]) /
839 vb->stride + 1;
840 else
841 desc[2] = vb->buffer->width0 - offset;
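/* E.g. for the CIK path above: with width0 = 100, offset = 0, stride = 12
 * and format_size = 4, elements may start at byte offsets 0, 12, ..., 96,
 * so (100 - 0 - 4) / 12 + 1 = 9 records. */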
842
843 desc[3] = sctx->vertex_elements->rsrc_word3[i];
844
845 if (!bound[ve->vertex_buffer_index]) {
846 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
847 (struct r600_resource*)vb->buffer,
848 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
849 bound[ve->vertex_buffer_index] = true;
850 }
851 }
852
853 /* Don't flush the const cache. It would have a very negative effect
854 * on performance (confirmed by testing). New descriptors are always
855 * uploaded to a fresh new buffer, so I don't think flushing the const
856 * cache is needed. */
857 desc->pointer_dirty = true;
858 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
859 sctx->vertex_buffers_dirty = false;
860 return true;
861 }
862
863
864 /* CONSTANT BUFFERS */
865
866 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
867 const uint8_t *ptr, unsigned size, uint32_t *const_offset)
868 {
869 void *tmp;
870
871 u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
872 (struct pipe_resource**)rbuffer, &tmp);
873 if (*rbuffer)
874 util_memcpy_cpu_to_le32(tmp, ptr, size);
875 }
876
877 static void si_set_constant_buffer(struct si_context *sctx,
878 struct si_buffer_resources *buffers,
879 uint slot, struct pipe_constant_buffer *input)
880 {
881 struct si_descriptors *descs = &buffers->desc;
882 assert(slot < descs->num_elements);
883 pipe_resource_reference(&buffers->buffers[slot], NULL);
884
885 /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
886 * with a NULL buffer). We need to use a dummy buffer instead. */
887 if (sctx->b.chip_class == CIK &&
888 (!input || (!input->buffer && !input->user_buffer)))
889 input = &sctx->null_const_buf;
890
891 if (input && (input->buffer || input->user_buffer)) {
892 struct pipe_resource *buffer = NULL;
893 uint64_t va;
894
895 /* Upload the user buffer if needed. */
896 if (input->user_buffer) {
897 unsigned buffer_offset;
898
899 si_upload_const_buffer(sctx,
900 (struct r600_resource**)&buffer, input->user_buffer,
901 input->buffer_size, &buffer_offset);
902 if (!buffer) {
903 /* Just unbind on failure. */
904 si_set_constant_buffer(sctx, buffers, slot, NULL);
905 return;
906 }
907 va = r600_resource(buffer)->gpu_address + buffer_offset;
908 } else {
909 pipe_resource_reference(&buffer, input->buffer);
910 va = r600_resource(buffer)->gpu_address + input->buffer_offset;
911 }
912
913 /* Set the descriptor. */
914 uint32_t *desc = descs->list + slot*4;
915 desc[0] = va;
916 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
917 S_008F04_STRIDE(0);
918 desc[2] = input->buffer_size;
919 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
920 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
921 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
922 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
923 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
924 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
925
926 buffers->buffers[slot] = buffer;
927 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
928 (struct r600_resource*)buffer,
929 buffers->shader_usage, buffers->priority);
930 buffers->enabled_mask |= 1u << slot;
931 } else {
932 /* Clear the descriptor. */
933 memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
934 buffers->enabled_mask &= ~(1u << slot);
935 }
936
937 descs->dirty_mask |= 1u << slot;
938 }
939
940 void si_set_rw_buffer(struct si_context *sctx,
941 uint slot, struct pipe_constant_buffer *input)
942 {
943 si_set_constant_buffer(sctx, &sctx->rw_buffers, slot, input);
944 }
945
946 static void si_pipe_set_constant_buffer(struct pipe_context *ctx,
947 uint shader, uint slot,
948 struct pipe_constant_buffer *input)
949 {
950 struct si_context *sctx = (struct si_context *)ctx;
951
952 if (shader >= SI_NUM_SHADERS)
953 return;
954
955 si_set_constant_buffer(sctx, &sctx->const_buffers[shader], slot, input);
956 }
957
958 /* SHADER BUFFERS */
959
960 static void si_set_shader_buffers(struct pipe_context *ctx, unsigned shader,
961 unsigned start_slot, unsigned count,
962 struct pipe_shader_buffer *sbuffers)
963 {
964 struct si_context *sctx = (struct si_context *)ctx;
965 struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
966 struct si_descriptors *descs = &buffers->desc;
967 unsigned i;
968
969 assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
970
971 for (i = 0; i < count; ++i) {
972 struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
973 struct r600_resource *buf;
974 unsigned slot = start_slot + i;
975 uint32_t *desc = descs->list + slot * 4;
976 uint64_t va;
977
978 if (!sbuffer || !sbuffer->buffer) {
979 pipe_resource_reference(&buffers->buffers[slot], NULL);
980 memset(desc, 0, sizeof(uint32_t) * 4);
981 buffers->enabled_mask &= ~(1u << slot);
982 descs->dirty_mask |= 1u << slot;
983 continue;
984 }
985
986 buf = (struct r600_resource *)sbuffer->buffer;
987 va = buf->gpu_address + sbuffer->buffer_offset;
988
989 desc[0] = va;
990 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
991 S_008F04_STRIDE(0);
992 desc[2] = sbuffer->buffer_size;
993 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
994 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
995 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
996 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
997 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
998 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
999
1000 pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
1001 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buf,
1002 buffers->shader_usage, buffers->priority);
1003 buffers->enabled_mask |= 1u << slot;
1004 descs->dirty_mask |= 1u << slot;
1005 }
1006 }
1007
1008 /* RING BUFFERS */
1009
1010 void si_set_ring_buffer(struct pipe_context *ctx, uint slot,
1011 struct pipe_resource *buffer,
1012 unsigned stride, unsigned num_records,
1013 bool add_tid, bool swizzle,
1014 unsigned element_size, unsigned index_stride, uint64_t offset)
1015 {
1016 struct si_context *sctx = (struct si_context *)ctx;
1017 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1018 struct si_descriptors *descs = &buffers->desc;
1019
1020 /* The stride field in the resource descriptor has 14 bits */
1021 assert(stride < (1 << 14));
1022
1023 assert(slot < descs->num_elements);
1024 pipe_resource_reference(&buffers->buffers[slot], NULL);
1025
1026 if (buffer) {
1027 uint64_t va;
1028
1029 va = r600_resource(buffer)->gpu_address + offset;
1030
1031 switch (element_size) {
1032 default:
1033 assert(!"Unsupported ring buffer element size");
1034 case 0:
1035 case 2:
1036 element_size = 0;
1037 break;
1038 case 4:
1039 element_size = 1;
1040 break;
1041 case 8:
1042 element_size = 2;
1043 break;
1044 case 16:
1045 element_size = 3;
1046 break;
1047 }
1048
1049 switch (index_stride) {
1050 default:
1051 assert(!"Unsupported ring buffer index stride");
1052 case 0:
1053 case 8:
1054 index_stride = 0;
1055 break;
1056 case 16:
1057 index_stride = 1;
1058 break;
1059 case 32:
1060 index_stride = 2;
1061 break;
1062 case 64:
1063 index_stride = 3;
1064 break;
1065 }
1066
1067 if (sctx->b.chip_class >= VI && stride)
1068 num_records *= stride;
1069
1070 /* Set the descriptor. */
1071 uint32_t *desc = descs->list + slot*4;
1072 desc[0] = va;
1073 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
1074 S_008F04_STRIDE(stride) |
1075 S_008F04_SWIZZLE_ENABLE(swizzle);
1076 desc[2] = num_records;
1077 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1078 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1079 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1080 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1081 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1082 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
1083 S_008F0C_ELEMENT_SIZE(element_size) |
1084 S_008F0C_INDEX_STRIDE(index_stride) |
1085 S_008F0C_ADD_TID_ENABLE(add_tid);
1086
1087 pipe_resource_reference(&buffers->buffers[slot], buffer);
1088 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1089 (struct r600_resource*)buffer,
1090 buffers->shader_usage, buffers->priority);
1091 buffers->enabled_mask |= 1u << slot;
1092 } else {
1093 /* Clear the descriptor. */
1094 memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
1095 buffers->enabled_mask &= ~(1u << slot);
1096 }
1097
1098 descs->dirty_mask |= 1u << slot;
1099 }
1100
1101 /* STREAMOUT BUFFERS */
1102
1103 static void si_set_streamout_targets(struct pipe_context *ctx,
1104 unsigned num_targets,
1105 struct pipe_stream_output_target **targets,
1106 const unsigned *offsets)
1107 {
1108 struct si_context *sctx = (struct si_context *)ctx;
1109 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1110 struct si_descriptors *descs = &buffers->desc;
1111 unsigned old_num_targets = sctx->b.streamout.num_targets;
1112 unsigned i, bufidx;
1113
1114 /* We are going to unbind the buffers. Mark which caches need to be flushed. */
1115 if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
1116 /* Since streamout uses vector writes which go through TC L2
1117 * and most other clients can use TC L2 as well, we don't need
1118 * to flush it.
1119 *
1120 * The only case which requires flushing it is VGT DMA index
1121 * fetching, which is a rare case. Thus, flag the TC L2
1122 * dirtiness in the resource and handle it when index fetching
1123 * is used.
1124 */
1125 for (i = 0; i < sctx->b.streamout.num_targets; i++)
1126 if (sctx->b.streamout.targets[i])
1127 r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
1128
1129 /* Invalidate the scalar cache in case a streamout buffer is
1130 * going to be used as a constant buffer.
1131 *
1132 * Invalidate TC L1, because streamout bypasses it (done by
1133 * setting GLC=1 in the store instruction), but it can contain
1134 * outdated data of streamout buffers.
1135 *
1136 * VS_PARTIAL_FLUSH is required if the buffers are going to be
1137 * used as an input immediately.
1138 */
1139 sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
1140 SI_CONTEXT_INV_VMEM_L1 |
1141 SI_CONTEXT_VS_PARTIAL_FLUSH;
1142 }
1143
1144 /* All readers of the streamout targets need to be finished before we can
1145 * start writing to the targets.
1146 */
1147 if (num_targets)
1148 sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
1149 SI_CONTEXT_CS_PARTIAL_FLUSH;
1150
1151 /* Streamout buffers must be bound in 2 places:
1152 * 1) in VGT by setting the VGT_STRMOUT registers
1153 * 2) as shader resources
1154 */
1155
1156 /* Set the VGT regs. */
1157 r600_set_streamout_targets(ctx, num_targets, targets, offsets);
1158
1159 /* Set the shader resources. */
1160 for (i = 0; i < num_targets; i++) {
1161 bufidx = SI_VS_STREAMOUT_BUF0 + i;
1162
1163 if (targets[i]) {
1164 struct pipe_resource *buffer = targets[i]->buffer;
1165 uint64_t va = r600_resource(buffer)->gpu_address;
1166
1167 /* Set the descriptor.
1168 *
1169 * On VI, the format must be non-INVALID, otherwise
1170 * the buffer will be considered not bound and store
1171 * instructions will be no-ops.
1172 */
1173 uint32_t *desc = descs->list + bufidx*4;
1174 desc[0] = va;
1175 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
1176 desc[2] = 0xffffffff;
1177 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1178 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1179 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1180 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1181 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1182
1183 /* Set the resource. */
1184 pipe_resource_reference(&buffers->buffers[bufidx],
1185 buffer);
1186 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1187 (struct r600_resource*)buffer,
1188 buffers->shader_usage, buffers->priority);
1189 buffers->enabled_mask |= 1u << bufidx;
1190 } else {
1191 /* Clear the descriptor and unset the resource. */
1192 memset(descs->list + bufidx*4, 0,
1193 sizeof(uint32_t) * 4);
1194 pipe_resource_reference(&buffers->buffers[bufidx],
1195 NULL);
1196 buffers->enabled_mask &= ~(1u << bufidx);
1197 }
1198 descs->dirty_mask |= 1u << bufidx;
1199 }
1200 for (; i < old_num_targets; i++) {
1201 bufidx = SI_VS_STREAMOUT_BUF0 + i;
1202 /* Clear the descriptor and unset the resource. */
1203 memset(descs->list + bufidx*4, 0, sizeof(uint32_t) * 4);
1204 pipe_resource_reference(&buffers->buffers[bufidx], NULL);
1205 buffers->enabled_mask &= ~(1u << bufidx);
1206 descs->dirty_mask |= 1u << bufidx;
1207 }
1208 }
1209
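/* Rewrite the base address in a buffer descriptor after the underlying
 * buffer has been reallocated, preserving the offset that the old descriptor
 * had relative to the old buffer address.
 */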
1210 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
1211 uint32_t *desc, uint64_t old_buf_va,
1212 struct pipe_resource *new_buf)
1213 {
1214 /* Retrieve the buffer offset from the descriptor. */
1215 uint64_t old_desc_va =
1216 desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
1217
1218 assert(old_buf_va <= old_desc_va);
1219 uint64_t offset_within_buffer = old_desc_va - old_buf_va;
1220
1221 /* Update the descriptor. */
1222 uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;
1223
1224 desc[0] = va;
1225 desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
1226 S_008F04_BASE_ADDRESS_HI(va >> 32);
1227 }
1228
1229 /* INTERNAL CONST BUFFERS */
1230
1231 static void si_set_polygon_stipple(struct pipe_context *ctx,
1232 const struct pipe_poly_stipple *state)
1233 {
1234 struct si_context *sctx = (struct si_context *)ctx;
1235 struct pipe_constant_buffer cb = {};
1236 unsigned stipple[32];
1237 int i;
1238
1239 for (i = 0; i < 32; i++)
1240 stipple[i] = util_bitreverse(state->stipple[i]);
1241
1242 cb.user_buffer = stipple;
1243 cb.buffer_size = sizeof(stipple);
1244
1245 si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb);
1246 }
1247
1248 /* TEXTURE METADATA ENABLE/DISABLE */
1249
1250 /* CMASK can be enabled (for fast clear) and disabled (for texture export)
1251 * while the texture is bound, possibly by a different context. In that case,
1252 * call this function to update compressed_colortex_masks.
1253 */
1254 void si_update_compressed_colortex_masks(struct si_context *sctx)
1255 {
1256 for (int i = 0; i < SI_NUM_SHADERS; ++i) {
1257 si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]);
1258 si_images_update_compressed_colortex_mask(&sctx->images[i]);
1259 }
1260 }
1261
1262 /* BUFFER DISCARD/INVALIDATION */
1263
1264 /** Reset descriptors of buffer resources after \p buf has been invalidated. */
1265 static void si_reset_buffer_resources(struct si_context *sctx,
1266 struct si_buffer_resources *buffers,
1267 struct pipe_resource *buf,
1268 uint64_t old_va)
1269 {
1270 struct si_descriptors *descs = &buffers->desc;
1271 unsigned mask = buffers->enabled_mask;
1272
1273 while (mask) {
1274 unsigned i = u_bit_scan(&mask);
1275 if (buffers->buffers[i] == buf) {
1276 si_desc_reset_buffer_offset(&sctx->b.b,
1277 descs->list + i*4,
1278 old_va, buf);
1279 descs->dirty_mask |= 1u << i;
1280
1281 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1282 (struct r600_resource *)buf,
1283 buffers->shader_usage,
1284 buffers->priority);
1285 }
1286 }
1287 }
1288
1289 /* Reallocate a buffer and update all resource bindings where the buffer is
1290 * bound.
1291 *
1292 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
1293 * idle by discarding its contents. Apps usually tell us when to do this using
1294 * map_buffer flags, for example.
1295 */
1296 static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
1297 {
1298 struct si_context *sctx = (struct si_context*)ctx;
1299 struct r600_resource *rbuffer = r600_resource(buf);
1300 unsigned i, shader, alignment = rbuffer->buf->alignment;
1301 uint64_t old_va = rbuffer->gpu_address;
1302 unsigned num_elems = sctx->vertex_elements ?
1303 sctx->vertex_elements->count : 0;
1304 struct si_sampler_view *view;
1305
1306 /* Reallocate the buffer in the same pipe_resource. */
1307 r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
1308 alignment);
1309
1310 /* We changed the buffer, now we need to bind it where the old one
1311 * was bound. This consists of 2 things:
1312 * 1) Updating the resource descriptor and dirtying it.
1313 * 2) Adding a relocation to the CS, so that it's usable.
1314 */
1315
1316 /* Vertex buffers. */
1317 for (i = 0; i < num_elems; i++) {
1318 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
1319
1320 if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
1321 continue;
1322 if (!sctx->vertex_buffer[vb].buffer)
1323 continue;
1324
1325 if (sctx->vertex_buffer[vb].buffer == buf) {
1326 sctx->vertex_buffers_dirty = true;
1327 break;
1328 }
1329 }
1330
1331 /* Streamout buffers. (other internal buffers can't be invalidated) */
1332 for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
1333 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1334 struct si_descriptors *descs = &buffers->desc;
1335
1336 if (buffers->buffers[i] != buf)
1337 continue;
1338
1339 si_desc_reset_buffer_offset(ctx, descs->list + i*4,
1340 old_va, buf);
1341 descs->dirty_mask |= 1u << i;
1342
1343 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1344 rbuffer, buffers->shader_usage,
1345 buffers->priority);
1346
1347 /* Update the streamout state. */
1348 if (sctx->b.streamout.begin_emitted)
1349 r600_emit_streamout_end(&sctx->b);
1350 sctx->b.streamout.append_bitmask =
1351 sctx->b.streamout.enabled_mask;
1352 r600_streamout_buffers_dirty(&sctx->b);
1353 }
1354
1355 /* Constant and shader buffers. */
1356 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1357 si_reset_buffer_resources(sctx, &sctx->const_buffers[shader],
1358 buf, old_va);
1359 si_reset_buffer_resources(sctx, &sctx->shader_buffers[shader],
1360 buf, old_va);
1361 }
1362
1363 /* Texture buffers - update virtual addresses in sampler view descriptors. */
1364 LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
1365 if (view->base.texture == buf) {
1366 si_desc_reset_buffer_offset(ctx, &view->state[4], old_va, buf);
1367 }
1368 }
1369 /* Texture buffers - update bindings. */
1370 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1371 struct si_sampler_views *views = &sctx->samplers[shader].views;
1372 struct si_descriptors *descs = &views->desc;
1373 unsigned mask = views->enabled_mask;
1374
1375 while (mask) {
1376 unsigned i = u_bit_scan(&mask);
1377 if (views->views[i]->texture == buf) {
1378 si_desc_reset_buffer_offset(ctx,
1379 descs->list +
1380 i * 16 + 4,
1381 old_va, buf);
1382 descs->dirty_mask |= 1u << i;
1383
1384 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1385 rbuffer, RADEON_USAGE_READ,
1386 RADEON_PRIO_SAMPLER_BUFFER);
1387 }
1388 }
1389 }
1390
1391 /* Shader images */
1392 for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
1393 struct si_images_info *images = &sctx->images[shader];
1394 struct si_descriptors *descs = &images->desc;
1395 unsigned mask = images->enabled_mask;
1396
1397 while (mask) {
1398 unsigned i = u_bit_scan(&mask);
1399
1400 if (images->views[i].resource == buf) {
1401 if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
1402 si_mark_image_range_valid(&images->views[i]);
1403
1404 si_desc_reset_buffer_offset(
1405 ctx, descs->list + i * 8 + 4,
1406 old_va, buf);
1407 descs->dirty_mask |= 1u << i;
1408
1409 radeon_add_to_buffer_list(
1410 &sctx->b, &sctx->b.gfx, rbuffer,
1411 RADEON_USAGE_READWRITE,
1412 RADEON_PRIO_SAMPLER_BUFFER);
1413 }
1414 }
1415 }
1416 }
1417
1418 /* Update mutable image descriptor fields of all bound textures. */
1419 void si_update_all_texture_descriptors(struct si_context *sctx)
1420 {
1421 unsigned shader;
1422
1423 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1424 struct si_sampler_views *samplers = &sctx->samplers[shader].views;
1425 struct si_images_info *images = &sctx->images[shader];
1426 unsigned mask;
1427
1428 /* Images. */
1429 mask = images->enabled_mask;
1430 while (mask) {
1431 unsigned i = u_bit_scan(&mask);
1432 struct pipe_image_view *view = &images->views[i];
1433
1434 if (!view->resource ||
1435 view->resource->target == PIPE_BUFFER)
1436 continue;
1437
1438 si_set_shader_image(sctx, shader, i, view);
1439 }
1440
1441 /* Sampler views. */
1442 mask = samplers->enabled_mask;
1443 while (mask) {
1444 unsigned i = u_bit_scan(&mask);
1445 struct pipe_sampler_view *view = samplers->views[i];
1446
1447 if (!view ||
1448 !view->texture ||
1449 view->texture->target == PIPE_BUFFER)
1450 continue;
1451
1452 si_set_sampler_view(sctx, shader, i,
1453 samplers->views[i], true);
1454 }
1455 }
1456 }
1457
1458 /* SHADER USER DATA */
1459
1460 static void si_mark_shader_pointers_dirty(struct si_context *sctx,
1461 unsigned shader)
1462 {
1463 sctx->const_buffers[shader].desc.pointer_dirty = true;
1464 sctx->shader_buffers[shader].desc.pointer_dirty = true;
1465 sctx->samplers[shader].views.desc.pointer_dirty = true;
1466 sctx->images[shader].desc.pointer_dirty = true;
1467
1468 if (shader == PIPE_SHADER_VERTEX)
1469 sctx->vertex_buffers.pointer_dirty = true;
1470
1471 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
1472 }
1473
1474 static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
1475 {
1476 int i;
1477
1478 for (i = 0; i < SI_NUM_SHADERS; i++) {
1479 si_mark_shader_pointers_dirty(sctx, i);
1480 }
1481 sctx->rw_buffers.desc.pointer_dirty = true;
1482 }
1483
1484 /* Set a base register address for user data constants in the given shader.
1485 * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
1486 */
1487 static void si_set_user_data_base(struct si_context *sctx,
1488 unsigned shader, uint32_t new_base)
1489 {
1490 uint32_t *base = &sctx->shader_userdata.sh_base[shader];
1491
1492 if (*base != new_base) {
1493 *base = new_base;
1494
1495 if (new_base)
1496 si_mark_shader_pointers_dirty(sctx, shader);
1497 }
1498 }
1499
1500 /* This must be called when these shaders are changed from non-NULL to NULL
1501 * and vice versa:
1502 * - geometry shader
1503 * - tessellation control shader
1504 * - tessellation evaluation shader
1505 */
1506 void si_shader_change_notify(struct si_context *sctx)
1507 {
1508 /* VS can be bound as VS, ES, or LS. */
1509 if (sctx->tes_shader.cso)
1510 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
1511 R_00B530_SPI_SHADER_USER_DATA_LS_0);
1512 else if (sctx->gs_shader.cso)
1513 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
1514 R_00B330_SPI_SHADER_USER_DATA_ES_0);
1515 else
1516 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
1517 R_00B130_SPI_SHADER_USER_DATA_VS_0);
1518
1519 /* TES can be bound as ES, VS, or not bound. */
1520 if (sctx->tes_shader.cso) {
1521 if (sctx->gs_shader.cso)
1522 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
1523 R_00B330_SPI_SHADER_USER_DATA_ES_0);
1524 else
1525 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
1526 R_00B130_SPI_SHADER_USER_DATA_VS_0);
1527 } else {
1528 si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
1529 }
1530 }
1531
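/* Emit a SET_SH_REG packet writing the 64-bit GPU address of an uploaded
 * descriptor list into the user data SGPR pair at
 * sh_base + shader_userdata_offset.
 */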
1532 static void si_emit_shader_pointer(struct si_context *sctx,
1533 struct si_descriptors *desc,
1534 unsigned sh_base, bool keep_dirty)
1535 {
1536 struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
1537 uint64_t va;
1538
1539 if (!desc->pointer_dirty || !desc->buffer)
1540 return;
1541
1542 va = desc->buffer->gpu_address +
1543 desc->buffer_offset;
1544
1545 radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
1546 radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
1547 radeon_emit(cs, va);
1548 radeon_emit(cs, va >> 32);
1549
1550 desc->pointer_dirty = keep_dirty;
1551 }
1552
1553 void si_emit_graphics_shader_userdata(struct si_context *sctx,
1554 struct r600_atom *atom)
1555 {
1556 unsigned i;
1557 uint32_t *sh_base = sctx->shader_userdata.sh_base;
1558
1559 if (sctx->rw_buffers.desc.pointer_dirty) {
1560 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1561 R_00B030_SPI_SHADER_USER_DATA_PS_0, true);
1562 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1563 R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
1564 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1565 R_00B230_SPI_SHADER_USER_DATA_GS_0, true);
1566 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1567 R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
1568 si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
1569 R_00B430_SPI_SHADER_USER_DATA_HS_0, true);
1570 sctx->rw_buffers.desc.pointer_dirty = false;
1571 }
1572
1573 for (i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
1574 unsigned base = sh_base[i];
1575
1576 if (!base)
1577 continue;
1578
1579 si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
1580 si_emit_shader_pointer(sctx, &sctx->shader_buffers[i].desc, base, false);
1581 si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
1582 si_emit_shader_pointer(sctx, &sctx->images[i].desc, base, false);
1583 }
1584 si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
1585 }
1586
1587 void si_emit_compute_shader_userdata(struct si_context *sctx)
1588 {
1589 unsigned base = R_00B900_COMPUTE_USER_DATA_0;
1590
1591 si_emit_shader_pointer(sctx, &sctx->const_buffers[PIPE_SHADER_COMPUTE].desc,
1592 base, false);
1593 si_emit_shader_pointer(sctx, &sctx->shader_buffers[PIPE_SHADER_COMPUTE].desc,
1594 base, false);
1595 si_emit_shader_pointer(sctx, &sctx->samplers[PIPE_SHADER_COMPUTE].views.desc,
1596 base, false);
1597 si_emit_shader_pointer(sctx, &sctx->images[PIPE_SHADER_COMPUTE].desc,
1598 base, false);
1599 }
1600
1601 /* INIT/DEINIT/UPLOAD */
1602
1603 void si_init_all_descriptors(struct si_context *sctx)
1604 {
1605 int i;
1606 unsigned ce_offset = 0;
1607
1608 for (i = 0; i < SI_NUM_SHADERS; i++) {
1609 si_init_buffer_resources(&sctx->const_buffers[i],
1610 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
1611 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER,
1612 &ce_offset);
1613 si_init_buffer_resources(&sctx->shader_buffers[i],
1614 SI_NUM_SHADER_BUFFERS, SI_SGPR_SHADER_BUFFERS,
1615 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER,
1616 &ce_offset);
1617
1618 si_init_descriptors(&sctx->samplers[i].views.desc,
1619 SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
1620 null_texture_descriptor, &ce_offset);
1621
1622 si_init_descriptors(&sctx->images[i].desc,
1623 SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
1624 null_image_descriptor, &ce_offset);
1625 }
1626
1627 si_init_buffer_resources(&sctx->rw_buffers,
1628 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
1629 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT,
1630 &ce_offset);
1631 si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
1632 4, SI_NUM_VERTEX_BUFFERS, NULL, NULL);
1633
1634 assert(ce_offset <= 32768);
1635
1636 /* Set pipe_context functions. */
1637 sctx->b.b.bind_sampler_states = si_bind_sampler_states;
1638 sctx->b.b.set_shader_images = si_set_shader_images;
1639 sctx->b.b.set_constant_buffer = si_pipe_set_constant_buffer;
1640 sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
1641 sctx->b.b.set_shader_buffers = si_set_shader_buffers;
1642 sctx->b.b.set_sampler_views = si_set_sampler_views;
1643 sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
1644 sctx->b.invalidate_buffer = si_invalidate_buffer;
1645
1646 /* Shader user data. */
1647 si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
1648 si_emit_graphics_shader_userdata);
1649
1650 /* Set default and immutable mappings. */
1651 si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
1652 si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
1653 si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
1654 si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
1655 }
1656
1657 bool si_upload_graphics_shader_descriptors(struct si_context *sctx)
1658 {
1659 int i;
1660
1661 for (i = 0; i < SI_NUM_SHADERS; i++) {
1662 if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc,
1663 &sctx->shader_userdata.atom) ||
1664 !si_upload_descriptors(sctx, &sctx->shader_buffers[i].desc,
1665 &sctx->shader_userdata.atom) ||
1666 !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc,
1667 &sctx->shader_userdata.atom) ||
1668 !si_upload_descriptors(sctx, &sctx->images[i].desc,
1669 &sctx->shader_userdata.atom))
1670 return false;
1671 }
1672 return si_upload_descriptors(sctx, &sctx->rw_buffers.desc,
1673 &sctx->shader_userdata.atom) &&
1674 si_upload_vertex_buffer_descriptors(sctx);
1675 }
1676
1677 bool si_upload_compute_shader_descriptors(struct si_context *sctx)
1678 {
1679 /* rw_buffers isn't updated here, because compute shaders don't need it
1680 * and the input buffer uses the same SGPRs anyway.
1681 */
1682 return si_upload_descriptors(sctx,
1683 &sctx->const_buffers[PIPE_SHADER_COMPUTE].desc, NULL) &&
1684 si_upload_descriptors(sctx,
1685 &sctx->shader_buffers[PIPE_SHADER_COMPUTE].desc, NULL) &&
1686 si_upload_descriptors(sctx,
1687 &sctx->samplers[PIPE_SHADER_COMPUTE].views.desc, NULL) &&
1688 si_upload_descriptors(sctx,
1689 &sctx->images[PIPE_SHADER_COMPUTE].desc, NULL);
1690 }
1691
1692 void si_release_all_descriptors(struct si_context *sctx)
1693 {
1694 int i;
1695
1696 for (i = 0; i < SI_NUM_SHADERS; i++) {
1697 si_release_buffer_resources(&sctx->const_buffers[i]);
1698 si_release_buffer_resources(&sctx->shader_buffers[i]);
1699 si_release_sampler_views(&sctx->samplers[i].views);
1700 si_release_image_views(&sctx->images[i]);
1701 }
1702 si_release_buffer_resources(&sctx->rw_buffers);
1703 si_release_descriptors(&sctx->vertex_buffers);
1704 }
1705
1706 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
1707 {
1708 int i;
1709
1710 for (i = 0; i < SI_NUM_SHADERS; i++) {
1711 si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
1712 si_buffer_resources_begin_new_cs(sctx, &sctx->shader_buffers[i]);
1713 si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
1714 si_image_views_begin_new_cs(sctx, &sctx->images[i]);
1715 }
1716 si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
1717 si_vertex_buffers_begin_new_cs(sctx);
1718 si_shader_userdata_begin_new_cs(sctx);
1719 }