gallium/radeon: add and use a new helper vi_dcc_enabled
[mesa.git] / src / gallium / drivers / radeonsi / si_descriptors.c
1 /*
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Marek Olšák <marek.olsak@amd.com>
25 */
26
27 /* Resource binding slots and sampler states (each described with 8 or
28 * 4 dwords) are stored in lists in memory which is accessed by shaders
29 * using scalar load instructions.
30 *
31 * This file is responsible for managing such lists. It keeps a copy of all
32 * descriptors in CPU memory and re-uploads a whole list if some slots have
33 * been changed.
34 *
35 * This code is also responsible for updating shader pointers to those lists.
36 *
37 * Note that CP DMA can't be used for updating the lists, because a GPU hang
38 * could leave the list in a mid-IB state and the next IB would get wrong
39 * descriptors and the whole context would be unusable at that point.
40 * (Note: register shadowing can't be used for the same reason.)
41 *
42 * Also, uploading descriptors to newly allocated memory doesn't require
43 * a KCACHE flush.
44 *
45 *
46 * Possible scenarios for one 16 dword image+sampler slot:
47 *
48 * | Image | w/ FMASK | Buffer | NULL
49 * [ 0: 3] Image[0:3] | Image[0:3] | Null[0:3] | Null[0:3]
50 * [ 4: 7] Image[4:7] | Image[4:7] | Buffer[0:3] | 0
51 * [ 8:11] Null[0:3] | Fmask[0:3] | Null[0:3] | Null[0:3]
52 * [12:15] Sampler[0:3] | Fmask[4:7] | Sampler[0:3] | Sampler[0:3]
53 *
54 * FMASK implies MSAA, therefore no sampler state.
55 * Sampler states are never unbound except when FMASK is bound.
56 */
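/* Illustrative sketch only (not driver code): hypothetical helpers showing
 * where each sub-descriptor lives inside one 16-dword image+sampler slot,
 * following the table above. Offsets are in dwords.
 */
#if 0
static inline uint32_t *slot_image_desc(uint32_t *slot)   { return slot + 0;  } /* Image[0:7]            */
static inline uint32_t *slot_buffer_desc(uint32_t *slot)  { return slot + 4;  } /* Buffer[0:3]           */
static inline uint32_t *slot_fmask_desc(uint32_t *slot)   { return slot + 8;  } /* FMASK[0:7] (w/ MSAA)  */
static inline uint32_t *slot_sampler_desc(uint32_t *slot) { return slot + 12; } /* Sampler[0:3], no FMASK */
#endif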
57
58 #include "radeon/r600_cs.h"
59 #include "si_pipe.h"
60 #include "sid.h"
61 #include "gfx9d.h"
62
63 #include "util/u_format.h"
64 #include "util/u_memory.h"
65 #include "util/u_upload_mgr.h"
66
67
68 /* NULL image and buffer descriptor for textures (alpha = 1) and images
69 * (alpha = 0).
70 *
71 * For images, all fields must be zero except for the swizzle, which
72 * supports arbitrary combinations of 0s and 1s. The texture type must be
73 * any valid type (e.g. 1D). If the texture type isn't set, the hw hangs.
74 *
75 * For buffers, all fields must be zero. If they are not, the hw hangs.
76 *
77 * This is the only reason why the buffer descriptor must be in words [4:7].
78 */
79 static uint32_t null_texture_descriptor[8] = {
80 0,
81 0,
82 0,
83 S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
84 S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
85 /* the rest must contain zeros, which is also used by the buffer
86 * descriptor */
87 };
88
89 static uint32_t null_image_descriptor[8] = {
90 0,
91 0,
92 0,
93 S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
94 /* the rest must contain zeros, which is also used by the buffer
95 * descriptor */
96 };
97
98 static void si_init_descriptors(struct si_descriptors *desc,
99 unsigned shader_userdata_index,
100 unsigned element_dw_size,
101 unsigned num_elements,
102 const uint32_t *null_descriptor,
103 unsigned *ce_offset)
104 {
105 int i;
106
107 assert(num_elements <= sizeof(desc->dirty_mask)*8);
108
109 desc->list = CALLOC(num_elements, element_dw_size * 4);
110 desc->element_dw_size = element_dw_size;
111 desc->num_elements = num_elements;
112 desc->dirty_mask = num_elements == 32 ? ~0u : (1u << num_elements) - 1;
113 desc->shader_userdata_offset = shader_userdata_index * 4;
114
115 if (ce_offset) {
116 desc->uses_ce = true;
117 desc->ce_offset = *ce_offset;
118
119 /* make sure that ce_offset stays 32-byte aligned */
120 *ce_offset += align(element_dw_size * num_elements * 4, 32);
121 }
122
123 /* Initialize the array to NULL descriptors if the element size is a multiple of 8 dwords. */
124 if (null_descriptor) {
125 assert(element_dw_size % 8 == 0);
126 for (i = 0; i < num_elements * element_dw_size / 8; i++)
127 memcpy(desc->list + i * 8, null_descriptor,
128 8 * 4);
129 }
130 }
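/* Example only: what the dirty_mask initialization above produces.
 * For num_elements = 20: (1u << 20) - 1 = 0x000fffff (slots 0-19 dirty).
 * For num_elements = 32: ~0u = 0xffffffff is written explicitly, because
 * "1u << 32" would be undefined behavior for a 32-bit shift.
 */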
131
132 static void si_release_descriptors(struct si_descriptors *desc)
133 {
134 r600_resource_reference(&desc->buffer, NULL);
135 FREE(desc->list);
136 }
137
138 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
139 unsigned *out_offset, struct r600_resource **out_buf) {
140 uint64_t va;
141
142 u_suballocator_alloc(sctx->ce_suballocator, size,
143 sctx->screen->b.info.tcc_cache_line_size,
144 out_offset, (struct pipe_resource**)out_buf);
145 if (!*out_buf)
146 return false;
147
148 va = (*out_buf)->gpu_address + *out_offset;
149
150 radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
151 radeon_emit(sctx->ce_ib, ce_offset);
152 radeon_emit(sctx->ce_ib, size / 4);
153 radeon_emit(sctx->ce_ib, va);
154 radeon_emit(sctx->ce_ib, va >> 32);
155
156 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
157 RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
158
159 sctx->ce_need_synchronization = true;
160 return true;
161 }
162
163 static void si_ce_reinitialize_descriptors(struct si_context *sctx,
164 struct si_descriptors *desc)
165 {
166 if (desc->buffer) {
167 struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
168 unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
169 uint64_t va = buffer->gpu_address + desc->buffer_offset;
170 struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
171
172 if (!ib)
173 ib = sctx->ce_ib;
174
175 list_size = align(list_size, 32);
176
177 radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
178 radeon_emit(ib, va);
179 radeon_emit(ib, va >> 32);
180 radeon_emit(ib, list_size / 4);
181 radeon_emit(ib, desc->ce_offset);
182
183 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
184 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
185 }
186 desc->ce_ram_dirty = false;
187 }
188
189 void si_ce_reinitialize_all_descriptors(struct si_context *sctx)
190 {
191 int i;
192
193 for (i = 0; i < SI_NUM_DESCS; ++i)
194 si_ce_reinitialize_descriptors(sctx, &sctx->descriptors[i]);
195 }
196
197 void si_ce_enable_loads(struct radeon_winsys_cs *ib)
198 {
199 radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
200 radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
201 CONTEXT_CONTROL_LOAD_CE_RAM(1));
202 radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
203 }
204
205 static bool si_upload_descriptors(struct si_context *sctx,
206 struct si_descriptors *desc,
207 struct r600_atom * atom)
208 {
209 unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
210
211 if (!desc->dirty_mask)
212 return true;
213
214 if (sctx->ce_ib && desc->uses_ce) {
215 uint32_t const* list = (uint32_t const*)desc->list;
216
217 if (desc->ce_ram_dirty)
218 si_ce_reinitialize_descriptors(sctx, desc);
219
220 while(desc->dirty_mask) {
221 int begin, count;
222 u_bit_scan_consecutive_range(&desc->dirty_mask, &begin,
223 &count);
224
225 begin *= desc->element_dw_size;
226 count *= desc->element_dw_size;
227
228 radeon_emit(sctx->ce_ib,
229 PKT3(PKT3_WRITE_CONST_RAM, count, 0));
230 radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
231 radeon_emit_array(sctx->ce_ib, list + begin, count);
232 }
233
234 if (!si_ce_upload(sctx, desc->ce_offset, list_size,
235 &desc->buffer_offset, &desc->buffer))
236 return false;
237 } else {
238 void *ptr;
239
240 u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
241 sctx->screen->b.info.tcc_cache_line_size,
242 &desc->buffer_offset,
243 (struct pipe_resource**)&desc->buffer, &ptr);
244 if (!desc->buffer)
245 return false; /* skip the draw call */
246
247 util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
248 desc->gpu_list = ptr;
249
250 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
251 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
252 }
253 desc->dirty_mask = 0;
254
255 if (atom)
256 si_mark_atom_dirty(sctx, atom);
257
258 return true;
259 }
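/* Example only: how the CE path above turns a dirty_mask into packets.
 * Assume element_dw_size = 16 and dirty_mask = 0x6 (slots 1 and 2 dirty):
 * u_bit_scan_consecutive_range() yields begin = 1, count = 2, which the
 * loop scales to dwords (begin = 16, count = 32), i.e. one WRITE_CONST_RAM
 * packet of 32 dwords written at byte offset ce_offset + 16 * 4.
 */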
260
261 static void
262 si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
263 {
264 desc->ce_ram_dirty = true;
265
266 if (!desc->buffer)
267 return;
268
269 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
270 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
271 }
272
273 /* SAMPLER VIEWS */
274
275 static unsigned
276 si_sampler_descriptors_idx(unsigned shader)
277 {
278 return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
279 SI_SHADER_DESCS_SAMPLERS;
280 }
281
282 static struct si_descriptors *
283 si_sampler_descriptors(struct si_context *sctx, unsigned shader)
284 {
285 return &sctx->descriptors[si_sampler_descriptors_idx(shader)];
286 }
287
288 static void si_release_sampler_views(struct si_sampler_views *views)
289 {
290 int i;
291
292 for (i = 0; i < ARRAY_SIZE(views->views); i++) {
293 pipe_sampler_view_reference(&views->views[i], NULL);
294 }
295 }
296
297 static void si_sampler_view_add_buffer(struct si_context *sctx,
298 struct pipe_resource *resource,
299 enum radeon_bo_usage usage,
300 bool is_stencil_sampler,
301 bool check_mem)
302 {
303 struct r600_resource *rres;
304 struct r600_texture *rtex;
305 enum radeon_bo_priority priority;
306
307 if (!resource)
308 return;
309
310 if (resource->target != PIPE_BUFFER) {
311 struct r600_texture *tex = (struct r600_texture*)resource;
312
313 if (tex->is_depth && !r600_can_sample_zs(tex, is_stencil_sampler))
314 resource = &tex->flushed_depth_texture->resource.b.b;
315 }
316
317 rres = (struct r600_resource*)resource;
318 priority = r600_get_sampler_view_priority(rres);
319
320 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
321 rres, usage, priority,
322 check_mem);
323
324 if (resource->target == PIPE_BUFFER)
325 return;
326
327 /* Now add separate DCC or HTILE. */
328 rtex = (struct r600_texture*)resource;
329 if (rtex->dcc_separate_buffer) {
330 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
331 rtex->dcc_separate_buffer, usage,
332 RADEON_PRIO_DCC, check_mem);
333 }
334
335 if (rtex->htile_buffer &&
336 rtex->tc_compatible_htile &&
337 !is_stencil_sampler) {
338 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
339 rtex->htile_buffer, usage,
340 RADEON_PRIO_HTILE, check_mem);
341 }
342 }
343
344 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
345 struct si_sampler_views *views)
346 {
347 unsigned mask = views->enabled_mask;
348
349 /* Add buffers to the CS. */
350 while (mask) {
351 int i = u_bit_scan(&mask);
352 struct si_sampler_view *sview = (struct si_sampler_view *)views->views[i];
353
354 si_sampler_view_add_buffer(sctx, sview->base.texture,
355 RADEON_USAGE_READ,
356 sview->is_stencil_sampler, false);
357 }
358 }
359
360 /* Set buffer descriptor fields that can be changed by reallocations. */
361 static void si_set_buf_desc_address(struct r600_resource *buf,
362 uint64_t offset, uint32_t *state)
363 {
364 uint64_t va = buf->gpu_address + offset;
365
366 state[0] = va;
367 state[1] &= C_008F04_BASE_ADDRESS_HI;
368 state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32);
369 }
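/* Example only (hypothetical address): how the 64-bit VA is split above.
 * va = 0x0000001234567890 ->
 *   state[0]              = 0x34567890  (low 32 bits)
 *   BASE_ADDRESS_HI field = 0x00000012  (va >> 32)
 * The other bits of state[1] (stride, swizzle, ...) are preserved by the
 * C_008F04_BASE_ADDRESS_HI mask.
 */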
370
371 /* Set texture descriptor fields that can be changed by reallocations.
372 *
373 * \param tex texture
374 * \param base_level_info information of the level of BASE_ADDRESS
375 * \param base_level the level of BASE_ADDRESS
376 * \param first_level pipe_sampler_view.u.tex.first_level
377 * \param block_width util_format_get_blockwidth()
378 * \param is_stencil select between separate Z & Stencil
379 * \param state descriptor to update
380 */
381 void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
382 struct r600_texture *tex,
383 const struct legacy_surf_level *base_level_info,
384 unsigned base_level, unsigned first_level,
385 unsigned block_width, bool is_stencil,
386 uint32_t *state)
387 {
388 uint64_t va, meta_va = 0;
389
390 if (tex->is_depth && !r600_can_sample_zs(tex, is_stencil)) {
391 tex = tex->flushed_depth_texture;
392 is_stencil = false;
393 }
394
395 va = tex->resource.gpu_address;
396
397 if (sscreen->b.chip_class >= GFX9) {
398 /* Only stencil_offset needs to be added here. */
399 if (is_stencil)
400 va += tex->surface.u.gfx9.stencil_offset;
401 else
402 va += tex->surface.u.gfx9.surf_offset;
403 } else {
404 va += base_level_info->offset;
405 }
406
407 if (vi_dcc_enabled(tex, first_level)) {
408 meta_va = (!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
409 tex->dcc_offset;
410
411 if (sscreen->b.chip_class <= VI)
412 meta_va += base_level_info->dcc_offset;
413 } else if (tex->tc_compatible_htile && !is_stencil) {
414 meta_va = tex->htile_buffer->gpu_address;
415 }
416
417 state[0] = va >> 8;
418 state[1] &= C_008F14_BASE_ADDRESS_HI;
419 state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
420
421 state[6] &= C_008F28_COMPRESSION_EN;
422 state[7] = 0;
423
424 if (meta_va) {
425 state[6] |= S_008F28_COMPRESSION_EN(1);
426 state[7] = meta_va >> 8;
427 }
428
429 if (sscreen->b.chip_class >= GFX9) {
430 state[3] &= C_008F1C_SW_MODE;
431 state[4] &= C_008F20_PITCH_GFX9;
432
433 if (is_stencil) {
434 state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
435 state[4] |= S_008F20_PITCH_GFX9(tex->surface.u.gfx9.stencil.epitch);
436 } else {
437 state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
438 state[4] |= S_008F20_PITCH_GFX9(tex->surface.u.gfx9.surf.epitch);
439 }
440
441 state[5] &= C_008F24_META_DATA_ADDRESS &
442 C_008F24_META_PIPE_ALIGNED &
443 C_008F24_META_RB_ALIGNED;
444 if (meta_va) {
445 struct gfx9_surf_meta_flags meta;
446
447 if (tex->dcc_offset)
448 meta = tex->surface.u.gfx9.dcc;
449 else
450 meta = tex->surface.u.gfx9.htile;
451
452 state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) |
453 S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) |
454 S_008F24_META_RB_ALIGNED(meta.rb_aligned);
455 }
456 } else {
457 /* SI-CI-VI */
458 unsigned pitch = base_level_info->nblk_x * block_width;
459 unsigned index = si_tile_mode_index(tex, base_level, is_stencil);
460
461 state[3] &= C_008F1C_TILING_INDEX;
462 state[3] |= S_008F1C_TILING_INDEX(index);
463 state[4] &= C_008F20_PITCH_GFX6;
464 state[4] |= S_008F20_PITCH_GFX6(pitch - 1);
465 }
466 }
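/* Example only: the SI-CI-VI pitch math above for a block-compressed format.
 * A 512x512 BC1 texture has nblk_x = 128 blocks and block_width = 4 texels,
 * so pitch = 128 * 4 = 512 texels and the descriptor stores PITCH = 511
 * (the field holds pitch minus one).
 */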
467
468 static void si_set_sampler_view(struct si_context *sctx,
469 unsigned shader,
470 unsigned slot, struct pipe_sampler_view *view,
471 bool disallow_early_out)
472 {
473 struct si_sampler_views *views = &sctx->samplers[shader].views;
474 struct si_sampler_view *rview = (struct si_sampler_view*)view;
475 struct si_descriptors *descs = si_sampler_descriptors(sctx, shader);
476 uint32_t *desc = descs->list + slot * 16;
477
478 if (views->views[slot] == view && !disallow_early_out)
479 return;
480
481 if (view) {
482 struct r600_texture *rtex = (struct r600_texture *)view->texture;
483
484 assert(rtex); /* views with texture == NULL aren't supported */
485 pipe_sampler_view_reference(&views->views[slot], view);
486 memcpy(desc, rview->state, 8*4);
487
488 if (rtex->resource.b.b.target == PIPE_BUFFER) {
489 rtex->resource.bind_history |= PIPE_BIND_SAMPLER_VIEW;
490
491 si_set_buf_desc_address(&rtex->resource,
492 view->u.buf.offset,
493 desc + 4);
494 } else {
495 bool is_separate_stencil =
496 rtex->db_compatible &&
497 rview->is_stencil_sampler;
498
499 si_set_mutable_tex_desc_fields(sctx->screen, rtex,
500 rview->base_level_info,
501 rview->base_level,
502 rview->base.u.tex.first_level,
503 rview->block_width,
504 is_separate_stencil,
505 desc);
506 }
507
508 if (rtex->resource.b.b.target != PIPE_BUFFER &&
509 rtex->fmask.size) {
510 memcpy(desc + 8,
511 rview->fmask_state, 8*4);
512 } else {
513 /* Disable FMASK and bind sampler state in [12:15]. */
514 memcpy(desc + 8,
515 null_texture_descriptor, 4*4);
516
517 if (views->sampler_states[slot])
518 memcpy(desc + 12,
519 views->sampler_states[slot]->val, 4*4);
520 }
521
522 views->enabled_mask |= 1u << slot;
523
524 /* Since this can flush, it must be done after enabled_mask is
525 * updated. */
526 si_sampler_view_add_buffer(sctx, view->texture,
527 RADEON_USAGE_READ,
528 rview->is_stencil_sampler, true);
529 } else {
530 pipe_sampler_view_reference(&views->views[slot], NULL);
531 memcpy(desc, null_texture_descriptor, 8*4);
532 /* Only clear the lower dwords of FMASK. */
533 memcpy(desc + 8, null_texture_descriptor, 4*4);
534 /* Re-set the sampler state if we are transitioning from FMASK. */
535 if (views->sampler_states[slot])
536 memcpy(desc + 12,
537 views->sampler_states[slot]->val, 4*4);
538
539 views->enabled_mask &= ~(1u << slot);
540 }
541
542 descs->dirty_mask |= 1u << slot;
543 sctx->descriptors_dirty |= 1u << si_sampler_descriptors_idx(shader);
544 }
545
546 static bool is_compressed_colortex(struct r600_texture *rtex)
547 {
548 return rtex->cmask.size || rtex->fmask.size ||
549 (rtex->dcc_offset && rtex->dirty_level_mask);
550 }
551
552 static void si_update_compressed_tex_shader_mask(struct si_context *sctx,
553 unsigned shader)
554 {
555 struct si_textures_info *samplers = &sctx->samplers[shader];
556 unsigned shader_bit = 1 << shader;
557
558 if (samplers->depth_texture_mask ||
559 samplers->compressed_colortex_mask ||
560 sctx->images[shader].compressed_colortex_mask)
561 sctx->compressed_tex_shader_mask |= shader_bit;
562 else
563 sctx->compressed_tex_shader_mask &= ~shader_bit;
564 }
565
566 static void si_set_sampler_views(struct pipe_context *ctx,
567 enum pipe_shader_type shader, unsigned start,
568 unsigned count,
569 struct pipe_sampler_view **views)
570 {
571 struct si_context *sctx = (struct si_context *)ctx;
572 struct si_textures_info *samplers = &sctx->samplers[shader];
573 int i;
574
575 if (!count || shader >= SI_NUM_SHADERS)
576 return;
577
578 for (i = 0; i < count; i++) {
579 unsigned slot = start + i;
580
581 if (!views || !views[i]) {
582 samplers->depth_texture_mask &= ~(1u << slot);
583 samplers->compressed_colortex_mask &= ~(1u << slot);
584 si_set_sampler_view(sctx, shader, slot, NULL, false);
585 continue;
586 }
587
588 si_set_sampler_view(sctx, shader, slot, views[i], false);
589
590 if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
591 struct r600_texture *rtex =
592 (struct r600_texture*)views[i]->texture;
593 struct si_sampler_view *rview = (struct si_sampler_view *)views[i];
594
595 if (rtex->db_compatible &&
596 (!rtex->tc_compatible_htile || rview->is_stencil_sampler)) {
597 samplers->depth_texture_mask |= 1u << slot;
598 } else {
599 samplers->depth_texture_mask &= ~(1u << slot);
600 }
601 if (is_compressed_colortex(rtex)) {
602 samplers->compressed_colortex_mask |= 1u << slot;
603 } else {
604 samplers->compressed_colortex_mask &= ~(1u << slot);
605 }
606
607 if (rtex->dcc_offset &&
608 p_atomic_read(&rtex->framebuffers_bound))
609 sctx->need_check_render_feedback = true;
610 } else {
611 samplers->depth_texture_mask &= ~(1u << slot);
612 samplers->compressed_colortex_mask &= ~(1u << slot);
613 }
614 }
615
616 si_update_compressed_tex_shader_mask(sctx, shader);
617 }
618
619 static void
620 si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
621 {
622 unsigned mask = samplers->views.enabled_mask;
623
624 while (mask) {
625 int i = u_bit_scan(&mask);
626 struct pipe_resource *res = samplers->views.views[i]->texture;
627
628 if (res && res->target != PIPE_BUFFER) {
629 struct r600_texture *rtex = (struct r600_texture *)res;
630
631 if (is_compressed_colortex(rtex)) {
632 samplers->compressed_colortex_mask |= 1u << i;
633 } else {
634 samplers->compressed_colortex_mask &= ~(1u << i);
635 }
636 }
637 }
638 }
639
640 /* IMAGE VIEWS */
641
642 static unsigned
643 si_image_descriptors_idx(unsigned shader)
644 {
645 return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
646 SI_SHADER_DESCS_IMAGES;
647 }
648
649 static struct si_descriptors*
650 si_image_descriptors(struct si_context *sctx, unsigned shader)
651 {
652 return &sctx->descriptors[si_image_descriptors_idx(shader)];
653 }
654
655 static void
656 si_release_image_views(struct si_images_info *images)
657 {
658 unsigned i;
659
660 for (i = 0; i < SI_NUM_IMAGES; ++i) {
661 struct pipe_image_view *view = &images->views[i];
662
663 pipe_resource_reference(&view->resource, NULL);
664 }
665 }
666
667 static void
668 si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images)
669 {
670 uint mask = images->enabled_mask;
671
672 /* Add buffers to the CS. */
673 while (mask) {
674 int i = u_bit_scan(&mask);
675 struct pipe_image_view *view = &images->views[i];
676
677 assert(view->resource);
678
679 si_sampler_view_add_buffer(sctx, view->resource,
680 RADEON_USAGE_READWRITE, false, false);
681 }
682 }
683
684 static void
685 si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot)
686 {
687 struct si_images_info *images = &ctx->images[shader];
688
689 if (images->enabled_mask & (1u << slot)) {
690 struct si_descriptors *descs = si_image_descriptors(ctx, shader);
691
692 pipe_resource_reference(&images->views[slot].resource, NULL);
693 images->compressed_colortex_mask &= ~(1 << slot);
694
695 memcpy(descs->list + slot*8, null_image_descriptor, 8*4);
696 images->enabled_mask &= ~(1u << slot);
697 descs->dirty_mask |= 1u << slot;
698 ctx->descriptors_dirty |= 1u << si_image_descriptors_idx(shader);
699 }
700 }
701
702 static void
703 si_mark_image_range_valid(const struct pipe_image_view *view)
704 {
705 struct r600_resource *res = (struct r600_resource *)view->resource;
706
707 assert(res && res->b.b.target == PIPE_BUFFER);
708
709 util_range_add(&res->valid_buffer_range,
710 view->u.buf.offset,
711 view->u.buf.offset + view->u.buf.size);
712 }
713
714 static void si_set_shader_image(struct si_context *ctx,
715 unsigned shader,
716 unsigned slot, const struct pipe_image_view *view,
717 bool skip_decompress)
718 {
719 struct si_screen *screen = ctx->screen;
720 struct si_images_info *images = &ctx->images[shader];
721 struct si_descriptors *descs = si_image_descriptors(ctx, shader);
722 struct r600_resource *res;
723 uint32_t *desc = descs->list + slot * 8;
724
725 if (!view || !view->resource) {
726 si_disable_shader_image(ctx, shader, slot);
727 return;
728 }
729
730 res = (struct r600_resource *)view->resource;
731
732 if (&images->views[slot] != view)
733 util_copy_image_view(&images->views[slot], view);
734
735 if (res->b.b.target == PIPE_BUFFER) {
736 if (view->access & PIPE_IMAGE_ACCESS_WRITE)
737 si_mark_image_range_valid(view);
738
739 si_make_buffer_descriptor(screen, res,
740 view->format,
741 view->u.buf.offset,
742 view->u.buf.size,
743 descs->list + slot * 8);
744 si_set_buf_desc_address(res, view->u.buf.offset, desc + 4);
745
746 images->compressed_colortex_mask &= ~(1 << slot);
747 res->bind_history |= PIPE_BIND_SHADER_IMAGE;
748 } else {
749 static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
750 struct r600_texture *tex = (struct r600_texture *)res;
751 unsigned level = view->u.tex.level;
752 unsigned width, height, depth;
753 bool uses_dcc = vi_dcc_enabled(tex, level);
754
755 assert(!tex->is_depth);
756 assert(tex->fmask.size == 0);
757
758 if (uses_dcc && !skip_decompress &&
759 (view->access & PIPE_IMAGE_ACCESS_WRITE ||
760 !vi_dcc_formats_compatible(res->b.b.format, view->format))) {
761 /* If DCC can't be disabled, at least decompress it.
762 * The decompression is relatively cheap if the surface
763 * has been decompressed already.
764 */
765 if (r600_texture_disable_dcc(&ctx->b, tex))
766 uses_dcc = false;
767 else
768 ctx->b.decompress_dcc(&ctx->b.b, tex);
769 }
770
771 if (is_compressed_colortex(tex)) {
772 images->compressed_colortex_mask |= 1 << slot;
773 } else {
774 images->compressed_colortex_mask &= ~(1 << slot);
775 }
776
777 if (uses_dcc &&
778 p_atomic_read(&tex->framebuffers_bound))
779 ctx->need_check_render_feedback = true;
780
781 /* Always force the base level to the selected level.
782 *
783 * This is required for 3D textures, where otherwise
784 * selecting a single slice for non-layered bindings
785 * fails. It doesn't hurt the other targets.
786 */
787 width = u_minify(res->b.b.width0, level);
788 height = u_minify(res->b.b.height0, level);
789 depth = u_minify(res->b.b.depth0, level);
790
791 si_make_texture_descriptor(screen, tex,
792 false, res->b.b.target,
793 view->format, swizzle,
794 0, 0,
795 view->u.tex.first_layer,
796 view->u.tex.last_layer,
797 width, height, depth,
798 desc, NULL);
799 si_set_mutable_tex_desc_fields(screen, tex,
800 &tex->surface.u.legacy.level[level],
801 level, level,
802 util_format_get_blockwidth(view->format),
803 false, desc);
804 }
805
806 images->enabled_mask |= 1u << slot;
807 descs->dirty_mask |= 1u << slot;
808 ctx->descriptors_dirty |= 1u << si_image_descriptors_idx(shader);
809
810 /* Since this can flush, it must be done after enabled_mask is updated. */
811 si_sampler_view_add_buffer(ctx, &res->b.b,
812 RADEON_USAGE_READWRITE, false, true);
813 }
814
815 static void
816 si_set_shader_images(struct pipe_context *pipe,
817 enum pipe_shader_type shader,
818 unsigned start_slot, unsigned count,
819 const struct pipe_image_view *views)
820 {
821 struct si_context *ctx = (struct si_context *)pipe;
822 unsigned i, slot;
823
824 assert(shader < SI_NUM_SHADERS);
825
826 if (!count)
827 return;
828
829 assert(start_slot + count <= SI_NUM_IMAGES);
830
831 if (views) {
832 for (i = 0, slot = start_slot; i < count; ++i, ++slot)
833 si_set_shader_image(ctx, shader, slot, &views[i], false);
834 } else {
835 for (i = 0, slot = start_slot; i < count; ++i, ++slot)
836 si_set_shader_image(ctx, shader, slot, NULL, false);
837 }
838
839 si_update_compressed_tex_shader_mask(ctx, shader);
840 }
841
842 static void
843 si_images_update_compressed_colortex_mask(struct si_images_info *images)
844 {
845 unsigned mask = images->enabled_mask;
846
847 while (mask) {
848 int i = u_bit_scan(&mask);
849 struct pipe_resource *res = images->views[i].resource;
850
851 if (res && res->target != PIPE_BUFFER) {
852 struct r600_texture *rtex = (struct r600_texture *)res;
853
854 if (is_compressed_colortex(rtex)) {
855 images->compressed_colortex_mask |= 1 << i;
856 } else {
857 images->compressed_colortex_mask &= ~(1 << i);
858 }
859 }
860 }
861 }
862
863 /* SAMPLER STATES */
864
865 static void si_bind_sampler_states(struct pipe_context *ctx,
866 enum pipe_shader_type shader,
867 unsigned start, unsigned count, void **states)
868 {
869 struct si_context *sctx = (struct si_context *)ctx;
870 struct si_textures_info *samplers = &sctx->samplers[shader];
871 struct si_descriptors *desc = si_sampler_descriptors(sctx, shader);
872 struct si_sampler_state **sstates = (struct si_sampler_state**)states;
873 int i;
874
875 if (!count || shader >= SI_NUM_SHADERS)
876 return;
877
878 for (i = 0; i < count; i++) {
879 unsigned slot = start + i;
880
881 if (!sstates[i] ||
882 sstates[i] == samplers->views.sampler_states[slot])
883 continue;
884
885 #ifdef DEBUG
886 assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC);
887 #endif
888 samplers->views.sampler_states[slot] = sstates[i];
889
890 /* If FMASK is bound, don't overwrite it.
891 * The sampler state will be set after FMASK is unbound.
892 */
893 if (samplers->views.views[slot] &&
894 samplers->views.views[slot]->texture &&
895 samplers->views.views[slot]->texture->target != PIPE_BUFFER &&
896 ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size)
897 continue;
898
899 memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
900 desc->dirty_mask |= 1u << slot;
901 sctx->descriptors_dirty |= 1u << si_sampler_descriptors_idx(shader);
902 }
903 }
904
905 /* BUFFER RESOURCES */
906
907 static void si_init_buffer_resources(struct si_buffer_resources *buffers,
908 struct si_descriptors *descs,
909 unsigned num_buffers,
910 unsigned shader_userdata_index,
911 enum radeon_bo_usage shader_usage,
912 enum radeon_bo_priority priority,
913 unsigned *ce_offset)
914 {
915 buffers->shader_usage = shader_usage;
916 buffers->priority = priority;
917 buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
918
919 si_init_descriptors(descs, shader_userdata_index, 4,
920 num_buffers, NULL, ce_offset);
921 }
922
923 static void si_release_buffer_resources(struct si_buffer_resources *buffers,
924 struct si_descriptors *descs)
925 {
926 int i;
927
928 for (i = 0; i < descs->num_elements; i++) {
929 pipe_resource_reference(&buffers->buffers[i], NULL);
930 }
931
932 FREE(buffers->buffers);
933 }
934
935 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
936 struct si_buffer_resources *buffers)
937 {
938 unsigned mask = buffers->enabled_mask;
939
940 /* Add buffers to the CS. */
941 while (mask) {
942 int i = u_bit_scan(&mask);
943
944 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
945 (struct r600_resource*)buffers->buffers[i],
946 buffers->shader_usage, buffers->priority);
947 }
948 }
949
950 static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers,
951 struct si_descriptors *descs,
952 unsigned idx, struct pipe_resource **buf,
953 unsigned *offset, unsigned *size)
954 {
955 pipe_resource_reference(buf, buffers->buffers[idx]);
956 if (*buf) {
957 struct r600_resource *res = r600_resource(*buf);
958 const uint32_t *desc = descs->list + idx * 4;
959 uint64_t va;
960
961 *size = desc[2];
962
963 assert(G_008F04_STRIDE(desc[1]) == 0);
964 va = ((uint64_t)desc[1] << 32) | desc[0];
965
966 assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size);
967 *offset = va - res->gpu_address;
968 }
969 }
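/* Example only (hypothetical descriptor words): reversing the packing above.
 * desc = { 0x00001000, 0x00000012, 256, ... } with STRIDE == 0 gives
 *   va      = (0x12ull << 32) | 0x1000 = 0x1200001000
 *   *size   = 256
 *   *offset = va - res->gpu_address
 */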
970
971 /* VERTEX BUFFERS */
972
973 static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
974 {
975 struct si_descriptors *desc = &sctx->vertex_buffers;
976 int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
977 int i;
978
979 for (i = 0; i < count; i++) {
980 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
981
982 if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
983 continue;
984 if (!sctx->vertex_buffer[vb].buffer)
985 continue;
986
987 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
988 (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
989 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
990 }
991
992 if (!desc->buffer)
993 return;
994 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
995 desc->buffer, RADEON_USAGE_READ,
996 RADEON_PRIO_DESCRIPTORS);
997 }
998
999 bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
1000 {
1001 struct si_vertex_element *velems = sctx->vertex_elements;
1002 struct si_descriptors *desc = &sctx->vertex_buffers;
1003 unsigned i, count;
1004 unsigned desc_list_byte_size;
1005 unsigned first_vb_use_mask;
1006 uint64_t va;
1007 uint32_t *ptr;
1008
1009 if (!sctx->vertex_buffers_dirty || !velems)
1010 return true;
1011
1012 count = velems->count;
1013
1014 if (!count)
1015 return true;
1016
1017 desc_list_byte_size = velems->desc_list_byte_size;
1018 first_vb_use_mask = velems->first_vb_use_mask;
1019
1020 /* Vertex buffer descriptors are the only ones which are uploaded
1021 * directly through a staging buffer and don't go through
1022 * the fine-grained upload path.
1023 */
1024 u_upload_alloc(sctx->b.b.const_uploader, 0,
1025 desc_list_byte_size,
1026 si_optimal_tcc_alignment(sctx, desc_list_byte_size),
1027 &desc->buffer_offset,
1028 (struct pipe_resource**)&desc->buffer, (void**)&ptr);
1029 if (!desc->buffer)
1030 return false;
1031
1032 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1033 desc->buffer, RADEON_USAGE_READ,
1034 RADEON_PRIO_DESCRIPTORS);
1035
1036 assert(count <= SI_MAX_ATTRIBS);
1037
1038 for (i = 0; i < count; i++) {
1039 struct pipe_vertex_element *ve = &velems->elements[i];
1040 struct pipe_vertex_buffer *vb;
1041 struct r600_resource *rbuffer;
1042 unsigned offset;
1043 unsigned vbo_index = ve->vertex_buffer_index;
1044 uint32_t *desc = &ptr[i*4];
1045
1046 vb = &sctx->vertex_buffer[vbo_index];
1047 rbuffer = (struct r600_resource*)vb->buffer;
1048 if (!rbuffer) {
1049 memset(desc, 0, 16);
1050 continue;
1051 }
1052
1053 offset = vb->buffer_offset + ve->src_offset;
1054 va = rbuffer->gpu_address + offset;
1055
1056 /* Fill in T# buffer resource description */
1057 desc[0] = va;
1058 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
1059 S_008F04_STRIDE(vb->stride);
1060
1061 if (sctx->b.chip_class != VI && vb->stride) {
1062 /* Round up by rounding down and adding 1 */
1063 desc[2] = (vb->buffer->width0 - offset -
1064 velems->format_size[i]) /
1065 vb->stride + 1;
1066 } else {
1067 desc[2] = vb->buffer->width0 - offset;
1068 }
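/* Example only: the "round down and add 1" above. With width0 = 1000,
 * offset = 4, stride = 16 and format_size[i] = 8:
 *   (1000 - 4 - 8) / 16 + 1 = 61 + 1 = 62 records,
 * i.e. the last record (index 61) still fits entirely inside the buffer.
 */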
1069
1070 desc[3] = velems->rsrc_word3[i];
1071
1072 if (first_vb_use_mask & (1 << i)) {
1073 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1074 (struct r600_resource*)vb->buffer,
1075 RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
1076 }
1077 }
1078
1079 /* Don't flush the const cache. It would have a very negative effect
1080 * on performance (confirmed by testing). New descriptors are always
1081 * uploaded to a fresh new buffer, so I don't think flushing the const
1082 * cache is needed. */
1083 si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
1084 if (sctx->b.chip_class >= CIK)
1085 si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
1086 sctx->vertex_buffers_dirty = false;
1087 sctx->vertex_buffer_pointer_dirty = true;
1088 return true;
1089 }
1090
1091
1092 /* CONSTANT BUFFERS */
1093
1094 static unsigned
1095 si_const_buffer_descriptors_idx(unsigned shader)
1096 {
1097 return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
1098 SI_SHADER_DESCS_CONST_BUFFERS;
1099 }
1100
1101 static struct si_descriptors *
1102 si_const_buffer_descriptors(struct si_context *sctx, unsigned shader)
1103 {
1104 return &sctx->descriptors[si_const_buffer_descriptors_idx(shader)];
1105 }
1106
1107 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
1108 const uint8_t *ptr, unsigned size, uint32_t *const_offset)
1109 {
1110 void *tmp;
1111
1112 u_upload_alloc(sctx->b.b.const_uploader, 0, size,
1113 si_optimal_tcc_alignment(sctx, size),
1114 const_offset,
1115 (struct pipe_resource**)rbuffer, &tmp);
1116 if (*rbuffer)
1117 util_memcpy_cpu_to_le32(tmp, ptr, size);
1118 }
1119
1120 static void si_set_constant_buffer(struct si_context *sctx,
1121 struct si_buffer_resources *buffers,
1122 unsigned descriptors_idx,
1123 uint slot, const struct pipe_constant_buffer *input)
1124 {
1125 struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
1126 assert(slot < descs->num_elements);
1127 pipe_resource_reference(&buffers->buffers[slot], NULL);
1128
1129 /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
1130 * with a NULL buffer). We need to use a dummy buffer instead. */
1131 if (sctx->b.chip_class == CIK &&
1132 (!input || (!input->buffer && !input->user_buffer)))
1133 input = &sctx->null_const_buf;
1134
1135 if (input && (input->buffer || input->user_buffer)) {
1136 struct pipe_resource *buffer = NULL;
1137 uint64_t va;
1138
1139 /* Upload the user buffer if needed. */
1140 if (input->user_buffer) {
1141 unsigned buffer_offset;
1142
1143 si_upload_const_buffer(sctx,
1144 (struct r600_resource**)&buffer, input->user_buffer,
1145 input->buffer_size, &buffer_offset);
1146 if (!buffer) {
1147 /* Just unbind on failure. */
1148 si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL);
1149 return;
1150 }
1151 va = r600_resource(buffer)->gpu_address + buffer_offset;
1152 } else {
1153 pipe_resource_reference(&buffer, input->buffer);
1154 va = r600_resource(buffer)->gpu_address + input->buffer_offset;
1155 /* Only track usage for non-user buffers. */
1156 r600_resource(buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
1157 }
1158
1159 /* Set the descriptor. */
1160 uint32_t *desc = descs->list + slot*4;
1161 desc[0] = va;
1162 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
1163 S_008F04_STRIDE(0);
1164 desc[2] = input->buffer_size;
1165 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1166 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1167 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1168 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1169 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1170 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1171
1172 buffers->buffers[slot] = buffer;
1173 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
1174 (struct r600_resource*)buffer,
1175 buffers->shader_usage,
1176 buffers->priority, true);
1177 buffers->enabled_mask |= 1u << slot;
1178 } else {
1179 /* Clear the descriptor. */
1180 memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
1181 buffers->enabled_mask &= ~(1u << slot);
1182 }
1183
1184 descs->dirty_mask |= 1u << slot;
1185 sctx->descriptors_dirty |= 1u << descriptors_idx;
1186 }
1187
1188 void si_set_rw_buffer(struct si_context *sctx,
1189 uint slot, const struct pipe_constant_buffer *input)
1190 {
1191 si_set_constant_buffer(sctx, &sctx->rw_buffers,
1192 SI_DESCS_RW_BUFFERS, slot, input);
1193 }
1194
1195 static void si_pipe_set_constant_buffer(struct pipe_context *ctx,
1196 enum pipe_shader_type shader, uint slot,
1197 const struct pipe_constant_buffer *input)
1198 {
1199 struct si_context *sctx = (struct si_context *)ctx;
1200
1201 if (shader >= SI_NUM_SHADERS)
1202 return;
1203
1204 si_set_constant_buffer(sctx, &sctx->const_buffers[shader],
1205 si_const_buffer_descriptors_idx(shader),
1206 slot, input);
1207 }
1208
1209 void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
1210 uint slot, struct pipe_constant_buffer *cbuf)
1211 {
1212 cbuf->user_buffer = NULL;
1213 si_get_buffer_from_descriptors(
1214 &sctx->const_buffers[shader],
1215 si_const_buffer_descriptors(sctx, shader),
1216 slot, &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
1217 }
1218
1219 /* SHADER BUFFERS */
1220
1221 static unsigned
1222 si_shader_buffer_descriptors_idx(enum pipe_shader_type shader)
1223 {
1224 return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
1225 SI_SHADER_DESCS_SHADER_BUFFERS;
1226 }
1227
1228 static struct si_descriptors *
1229 si_shader_buffer_descriptors(struct si_context *sctx,
1230 enum pipe_shader_type shader)
1231 {
1232 return &sctx->descriptors[si_shader_buffer_descriptors_idx(shader)];
1233 }
1234
1235 static void si_set_shader_buffers(struct pipe_context *ctx,
1236 enum pipe_shader_type shader,
1237 unsigned start_slot, unsigned count,
1238 const struct pipe_shader_buffer *sbuffers)
1239 {
1240 struct si_context *sctx = (struct si_context *)ctx;
1241 struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
1242 struct si_descriptors *descs = si_shader_buffer_descriptors(sctx, shader);
1243 unsigned i;
1244
1245 assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
1246
1247 for (i = 0; i < count; ++i) {
1248 const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
1249 struct r600_resource *buf;
1250 unsigned slot = start_slot + i;
1251 uint32_t *desc = descs->list + slot * 4;
1252 uint64_t va;
1253
1254 if (!sbuffer || !sbuffer->buffer) {
1255 pipe_resource_reference(&buffers->buffers[slot], NULL);
1256 memset(desc, 0, sizeof(uint32_t) * 4);
1257 buffers->enabled_mask &= ~(1u << slot);
1258 descs->dirty_mask |= 1u << slot;
1259 sctx->descriptors_dirty |=
1260 1u << si_shader_buffer_descriptors_idx(shader);
1261 continue;
1262 }
1263
1264 buf = (struct r600_resource *)sbuffer->buffer;
1265 va = buf->gpu_address + sbuffer->buffer_offset;
1266
1267 desc[0] = va;
1268 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
1269 S_008F04_STRIDE(0);
1270 desc[2] = sbuffer->buffer_size;
1271 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1272 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1273 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1274 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1275 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1276 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1277
1278 pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
1279 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, buf,
1280 buffers->shader_usage,
1281 buffers->priority, true);
1282 buf->bind_history |= PIPE_BIND_SHADER_BUFFER;
1283
1284 buffers->enabled_mask |= 1u << slot;
1285 descs->dirty_mask |= 1u << slot;
1286 sctx->descriptors_dirty |=
1287 1u << si_shader_buffer_descriptors_idx(shader);
1288
1289 util_range_add(&buf->valid_buffer_range, sbuffer->buffer_offset,
1290 sbuffer->buffer_offset + sbuffer->buffer_size);
1291 }
1292 }
1293
1294 void si_get_shader_buffers(struct si_context *sctx,
1295 enum pipe_shader_type shader,
1296 uint start_slot, uint count,
1297 struct pipe_shader_buffer *sbuf)
1298 {
1299 struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
1300 struct si_descriptors *descs = si_shader_buffer_descriptors(sctx, shader);
1301
1302 for (unsigned i = 0; i < count; ++i) {
1303 si_get_buffer_from_descriptors(
1304 buffers, descs, start_slot + i,
1305 &sbuf[i].buffer, &sbuf[i].buffer_offset,
1306 &sbuf[i].buffer_size);
1307 }
1308 }
1309
1310 /* RING BUFFERS */
1311
1312 void si_set_ring_buffer(struct pipe_context *ctx, uint slot,
1313 struct pipe_resource *buffer,
1314 unsigned stride, unsigned num_records,
1315 bool add_tid, bool swizzle,
1316 unsigned element_size, unsigned index_stride, uint64_t offset)
1317 {
1318 struct si_context *sctx = (struct si_context *)ctx;
1319 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1320 struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
1321
1322 /* The stride field in the resource descriptor has 14 bits */
1323 assert(stride < (1 << 14));
1324
1325 assert(slot < descs->num_elements);
1326 pipe_resource_reference(&buffers->buffers[slot], NULL);
1327
1328 if (buffer) {
1329 uint64_t va;
1330
1331 va = r600_resource(buffer)->gpu_address + offset;
1332
1333 switch (element_size) {
1334 default:
1335 assert(!"Unsupported ring buffer element size");
1336 case 0:
1337 case 2:
1338 element_size = 0;
1339 break;
1340 case 4:
1341 element_size = 1;
1342 break;
1343 case 8:
1344 element_size = 2;
1345 break;
1346 case 16:
1347 element_size = 3;
1348 break;
1349 }
1350
1351 switch (index_stride) {
1352 default:
1353 assert(!"Unsupported ring buffer index stride");
1354 case 0:
1355 case 8:
1356 index_stride = 0;
1357 break;
1358 case 16:
1359 index_stride = 1;
1360 break;
1361 case 32:
1362 index_stride = 2;
1363 break;
1364 case 64:
1365 index_stride = 3;
1366 break;
1367 }
1368
1369 if (sctx->b.chip_class >= VI && stride)
1370 num_records *= stride;
1371
1372 /* Set the descriptor. */
1373 uint32_t *desc = descs->list + slot*4;
1374 desc[0] = va;
1375 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
1376 S_008F04_STRIDE(stride) |
1377 S_008F04_SWIZZLE_ENABLE(swizzle);
1378 desc[2] = num_records;
1379 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1380 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1381 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1382 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1383 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1384 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
1385 S_008F0C_INDEX_STRIDE(index_stride) |
1386 S_008F0C_ADD_TID_ENABLE(add_tid);
1387
1388 if (sctx->b.chip_class >= GFX9)
1389 assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */
1390 else
1391 desc[3] |= S_008F0C_ELEMENT_SIZE(element_size);
1392
1393 pipe_resource_reference(&buffers->buffers[slot], buffer);
1394 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
1395 (struct r600_resource*)buffer,
1396 buffers->shader_usage, buffers->priority);
1397 buffers->enabled_mask |= 1u << slot;
1398 } else {
1399 /* Clear the descriptor. */
1400 memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
1401 buffers->enabled_mask &= ~(1u << slot);
1402 }
1403
1404 descs->dirty_mask |= 1u << slot;
1405 sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
1406 }
1407
1408 /* STREAMOUT BUFFERS */
1409
1410 static void si_set_streamout_targets(struct pipe_context *ctx,
1411 unsigned num_targets,
1412 struct pipe_stream_output_target **targets,
1413 const unsigned *offsets)
1414 {
1415 struct si_context *sctx = (struct si_context *)ctx;
1416 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1417 struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
1418 unsigned old_num_targets = sctx->b.streamout.num_targets;
1419 unsigned i, bufidx;
1420
1421 /* We are going to unbind the buffers. Mark which caches need to be flushed. */
1422 if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
1423 /* Since streamout uses vector writes which go through TC L2
1424 * and most other clients can use TC L2 as well, we don't need
1425 * to flush it.
1426 *
1427 * The only cases which require flushing it are VGT DMA index
1428 * fetching (on <= CIK) and indirect draw data, which are rare
1429 * cases. Thus, flag the TC L2 dirtiness in the resource and
1430 * handle it at draw call time.
1431 */
1432 for (i = 0; i < sctx->b.streamout.num_targets; i++)
1433 if (sctx->b.streamout.targets[i])
1434 r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
1435
1436 /* Invalidate the scalar cache in case a streamout buffer is
1437 * going to be used as a constant buffer.
1438 *
1439 * Invalidate TC L1, because streamout bypasses it (done by
1440 * setting GLC=1 in the store instruction), but it can contain
1441 * outdated data of streamout buffers.
1442 *
1443 * VS_PARTIAL_FLUSH is required if the buffers are going to be
1444 * used as an input immediately.
1445 */
1446 sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
1447 SI_CONTEXT_INV_VMEM_L1 |
1448 SI_CONTEXT_VS_PARTIAL_FLUSH;
1449 }
1450
1451 /* All readers of the streamout targets need to be finished before we can
1452 * start writing to the targets.
1453 */
1454 if (num_targets)
1455 sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
1456 SI_CONTEXT_CS_PARTIAL_FLUSH;
1457
1458 /* Streamout buffers must be bound in 2 places:
1459 * 1) in VGT by setting the VGT_STRMOUT registers
1460 * 2) as shader resources
1461 */
1462
1463 /* Set the VGT regs. */
1464 r600_set_streamout_targets(ctx, num_targets, targets, offsets);
1465
1466 /* Set the shader resources. */
1467 for (i = 0; i < num_targets; i++) {
1468 bufidx = SI_VS_STREAMOUT_BUF0 + i;
1469
1470 if (targets[i]) {
1471 struct pipe_resource *buffer = targets[i]->buffer;
1472 uint64_t va = r600_resource(buffer)->gpu_address;
1473
1474 /* Set the descriptor.
1475 *
1476 * On VI, the format must be non-INVALID, otherwise
1477 * the buffer will be considered not bound and store
1478 * instructions will be no-ops.
1479 */
1480 uint32_t *desc = descs->list + bufidx*4;
1481 desc[0] = va;
1482 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
1483 desc[2] = 0xffffffff;
1484 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1485 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1486 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1487 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1488 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1489
1490 /* Set the resource. */
1491 pipe_resource_reference(&buffers->buffers[bufidx],
1492 buffer);
1493 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
1494 (struct r600_resource*)buffer,
1495 buffers->shader_usage,
1496 RADEON_PRIO_SHADER_RW_BUFFER,
1497 true);
1498 r600_resource(buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
1499
1500 buffers->enabled_mask |= 1u << bufidx;
1501 } else {
1502 /* Clear the descriptor and unset the resource. */
1503 memset(descs->list + bufidx*4, 0,
1504 sizeof(uint32_t) * 4);
1505 pipe_resource_reference(&buffers->buffers[bufidx],
1506 NULL);
1507 buffers->enabled_mask &= ~(1u << bufidx);
1508 }
1509 descs->dirty_mask |= 1u << bufidx;
1510 }
1511 for (; i < old_num_targets; i++) {
1512 bufidx = SI_VS_STREAMOUT_BUF0 + i;
1513 /* Clear the descriptor and unset the resource. */
1514 memset(descs->list + bufidx*4, 0, sizeof(uint32_t) * 4);
1515 pipe_resource_reference(&buffers->buffers[bufidx], NULL);
1516 buffers->enabled_mask &= ~(1u << bufidx);
1517 descs->dirty_mask |= 1u << bufidx;
1518 }
1519
1520 sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
1521 }
1522
1523 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
1524 uint32_t *desc, uint64_t old_buf_va,
1525 struct pipe_resource *new_buf)
1526 {
1527 /* Retrieve the buffer offset from the descriptor. */
1528 uint64_t old_desc_va =
1529 desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
1530
1531 assert(old_buf_va <= old_desc_va);
1532 uint64_t offset_within_buffer = old_desc_va - old_buf_va;
1533
1534 /* Update the descriptor. */
1535 si_set_buf_desc_address(r600_resource(new_buf), offset_within_buffer,
1536 desc);
1537 }
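/* Example only (hypothetical addresses): what the helper above preserves.
 * If the old buffer lived at 0x1000 and the descriptor pointed at 0x1040,
 * the binding offset is 0x40; after reallocation to 0x8000 the descriptor
 * is rewritten to 0x8040, keeping the same offset within the buffer.
 */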
1538
1539 /* INTERNAL CONST BUFFERS */
1540
1541 static void si_set_polygon_stipple(struct pipe_context *ctx,
1542 const struct pipe_poly_stipple *state)
1543 {
1544 struct si_context *sctx = (struct si_context *)ctx;
1545 struct pipe_constant_buffer cb = {};
1546 unsigned stipple[32];
1547 int i;
1548
1549 for (i = 0; i < 32; i++)
1550 stipple[i] = util_bitreverse(state->stipple[i]);
1551
1552 cb.user_buffer = stipple;
1553 cb.buffer_size = sizeof(stipple);
1554
1555 si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb);
1556 }
1557
1558 /* TEXTURE METADATA ENABLE/DISABLE */
1559
1560 /* CMASK can be enabled (for fast clear) and disabled (for texture export)
1561 * while the texture is bound, possibly by a different context. In that case,
1562 * call this function to update compressed_colortex_masks.
1563 */
1564 void si_update_compressed_colortex_masks(struct si_context *sctx)
1565 {
1566 for (int i = 0; i < SI_NUM_SHADERS; ++i) {
1567 si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]);
1568 si_images_update_compressed_colortex_mask(&sctx->images[i]);
1569 si_update_compressed_tex_shader_mask(sctx, i);
1570 }
1571 }
1572
1573 /* BUFFER DISCARD/INVALIDATION */
1574
1575 /** Reset descriptors of buffer resources after \p buf has been invalidated. */
1576 static void si_reset_buffer_resources(struct si_context *sctx,
1577 struct si_buffer_resources *buffers,
1578 unsigned descriptors_idx,
1579 struct pipe_resource *buf,
1580 uint64_t old_va)
1581 {
1582 struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
1583 unsigned mask = buffers->enabled_mask;
1584
1585 while (mask) {
1586 unsigned i = u_bit_scan(&mask);
1587 if (buffers->buffers[i] == buf) {
1588 si_desc_reset_buffer_offset(&sctx->b.b,
1589 descs->list + i*4,
1590 old_va, buf);
1591 descs->dirty_mask |= 1u << i;
1592 sctx->descriptors_dirty |= 1u << descriptors_idx;
1593
1594 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
1595 (struct r600_resource *)buf,
1596 buffers->shader_usage,
1597 buffers->priority, true);
1598 }
1599 }
1600 }
1601
1602 /* Reallocate a buffer and update all resource bindings where the buffer is
1603 * bound.
1604 *
1605 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
1606 * idle by discarding its contents. Apps usually tell us when to do this using
1607 * map_buffer flags, for example.
1608 */
1609 static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
1610 {
1611 struct si_context *sctx = (struct si_context*)ctx;
1612 struct r600_resource *rbuffer = r600_resource(buf);
1613 unsigned i, shader;
1614 uint64_t old_va = rbuffer->gpu_address;
1615 unsigned num_elems = sctx->vertex_elements ?
1616 sctx->vertex_elements->count : 0;
1617
1618 /* Reallocate the buffer in the same pipe_resource. */
1619 r600_alloc_resource(&sctx->screen->b, rbuffer);
1620
1621 /* We changed the buffer, now we need to bind it where the old one
1622 * was bound. This consists of 2 things:
1623 * 1) Updating the resource descriptor and dirtying it.
1624 * 2) Adding a relocation to the CS, so that it's usable.
1625 */
1626
1627 /* Vertex buffers. */
1628 if (rbuffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
1629 for (i = 0; i < num_elems; i++) {
1630 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
1631
1632 if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
1633 continue;
1634 if (!sctx->vertex_buffer[vb].buffer)
1635 continue;
1636
1637 if (sctx->vertex_buffer[vb].buffer == buf) {
1638 sctx->vertex_buffers_dirty = true;
1639 break;
1640 }
1641 }
1642 }
1643
1644 /* Streamout buffers. (other internal buffers can't be invalidated) */
1645 if (rbuffer->bind_history & PIPE_BIND_STREAM_OUTPUT) {
1646 for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
1647 struct si_buffer_resources *buffers = &sctx->rw_buffers;
1648 struct si_descriptors *descs =
1649 &sctx->descriptors[SI_DESCS_RW_BUFFERS];
1650
1651 if (buffers->buffers[i] != buf)
1652 continue;
1653
1654 si_desc_reset_buffer_offset(ctx, descs->list + i*4,
1655 old_va, buf);
1656 descs->dirty_mask |= 1u << i;
1657 sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
1658
1659 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
1660 rbuffer, buffers->shader_usage,
1661 RADEON_PRIO_SHADER_RW_BUFFER,
1662 true);
1663
1664 /* Update the streamout state. */
1665 if (sctx->b.streamout.begin_emitted)
1666 r600_emit_streamout_end(&sctx->b);
1667 sctx->b.streamout.append_bitmask =
1668 sctx->b.streamout.enabled_mask;
1669 r600_streamout_buffers_dirty(&sctx->b);
1670 }
1671 }
1672
1673 /* Constant and shader buffers. */
1674 if (rbuffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
1675 for (shader = 0; shader < SI_NUM_SHADERS; shader++)
1676 si_reset_buffer_resources(sctx, &sctx->const_buffers[shader],
1677 si_const_buffer_descriptors_idx(shader),
1678 buf, old_va);
1679 }
1680
1681 if (rbuffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
1682 for (shader = 0; shader < SI_NUM_SHADERS; shader++)
1683 si_reset_buffer_resources(sctx, &sctx->shader_buffers[shader],
1684 si_shader_buffer_descriptors_idx(shader),
1685 buf, old_va);
1686 }
1687
1688 if (rbuffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
1689 /* Texture buffers - update bindings. */
1690 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1691 struct si_sampler_views *views = &sctx->samplers[shader].views;
1692 struct si_descriptors *descs =
1693 si_sampler_descriptors(sctx, shader);
1694 unsigned mask = views->enabled_mask;
1695
1696 while (mask) {
1697 unsigned i = u_bit_scan(&mask);
1698 if (views->views[i]->texture == buf) {
1699 si_desc_reset_buffer_offset(ctx,
1700 descs->list +
1701 i * 16 + 4,
1702 old_va, buf);
1703 descs->dirty_mask |= 1u << i;
1704 sctx->descriptors_dirty |=
1705 1u << si_sampler_descriptors_idx(shader);
1706
1707 radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
1708 rbuffer, RADEON_USAGE_READ,
1709 RADEON_PRIO_SAMPLER_BUFFER,
1710 true);
1711 }
1712 }
1713 }
1714 }
1715
1716 /* Shader images */
1717 if (rbuffer->bind_history & PIPE_BIND_SHADER_IMAGE) {
1718 for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
1719 struct si_images_info *images = &sctx->images[shader];
1720 struct si_descriptors *descs =
1721 si_image_descriptors(sctx, shader);
1722 unsigned mask = images->enabled_mask;
1723
1724 while (mask) {
1725 unsigned i = u_bit_scan(&mask);
1726
1727 if (images->views[i].resource == buf) {
1728 if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
1729 si_mark_image_range_valid(&images->views[i]);
1730
1731 si_desc_reset_buffer_offset(
1732 ctx, descs->list + i * 8 + 4,
1733 old_va, buf);
1734 descs->dirty_mask |= 1u << i;
1735 sctx->descriptors_dirty |=
1736 1u << si_image_descriptors_idx(shader);
1737
1738 radeon_add_to_buffer_list_check_mem(
1739 &sctx->b, &sctx->b.gfx, rbuffer,
1740 RADEON_USAGE_READWRITE,
1741 RADEON_PRIO_SAMPLER_BUFFER, true);
1742 }
1743 }
1744 }
1745 }
1746 }
1747
/* Update mutable image descriptor fields of all bound textures. */
void si_update_all_texture_descriptors(struct si_context *sctx)
{
	unsigned shader;

	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_sampler_views *samplers = &sctx->samplers[shader].views;
		struct si_images_info *images = &sctx->images[shader];
		unsigned mask;

		/* Images. */
		mask = images->enabled_mask;
		while (mask) {
			unsigned i = u_bit_scan(&mask);
			struct pipe_image_view *view = &images->views[i];

			if (!view->resource ||
			    view->resource->target == PIPE_BUFFER)
				continue;

			si_set_shader_image(sctx, shader, i, view, true);
		}

		/* Sampler views. */
		mask = samplers->enabled_mask;
		while (mask) {
			unsigned i = u_bit_scan(&mask);
			struct pipe_sampler_view *view = samplers->views[i];

			if (!view ||
			    !view->texture ||
			    view->texture->target == PIPE_BUFFER)
				continue;

			si_set_sampler_view(sctx, shader, i,
					    samplers->views[i], true);
		}

		si_update_compressed_tex_shader_mask(sctx, shader);
	}
}

/* SHADER USER DATA */

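/* Mark all descriptor pointers of one shader stage as dirty so they are
 * re-emitted, and dirty the vertex buffer pointer as well if the stage is
 * the vertex shader.
 */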
static void si_mark_shader_pointers_dirty(struct si_context *sctx,
					  unsigned shader)
{
	sctx->shader_pointers_dirty |=
		u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS,
				  SI_NUM_SHADER_DESCS);

	if (shader == PIPE_SHADER_VERTEX)
		sctx->vertex_buffer_pointer_dirty = sctx->vertex_buffers.buffer != NULL;

	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
}

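/* A new command stream starts with no packets in it, so every shader
 * pointer (and the vertex buffer pointer, if any) must be re-emitted.
 */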
static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
{
	sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
	sctx->vertex_buffer_pointer_dirty = sctx->vertex_buffers.buffer != NULL;
	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
}

/* Set a base register address for user data constants in the given shader.
 * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
 */
static void si_set_user_data_base(struct si_context *sctx,
				  unsigned shader, uint32_t new_base)
{
	uint32_t *base = &sctx->shader_userdata.sh_base[shader];

	if (*base != new_base) {
		*base = new_base;

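		/* A base of 0 means the stage is currently unused
		 * (see si_shader_change_notify), so there is nothing
		 * to re-emit for it. */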
		if (new_base)
			si_mark_shader_pointers_dirty(sctx, shader);
	}
}

/* This must be called when these shaders are changed from non-NULL to NULL
 * and vice versa:
 * - geometry shader
 * - tessellation control shader
 * - tessellation evaluation shader
 */
void si_shader_change_notify(struct si_context *sctx)
{
	/* VS can be bound as VS, ES, or LS. */
	if (sctx->tes_shader.cso) {
		if (sctx->b.chip_class >= GFX9) {
			si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
					      R_00B430_SPI_SHADER_USER_DATA_LS_0);
		} else {
			si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
					      R_00B530_SPI_SHADER_USER_DATA_LS_0);
		}
	} else if (sctx->gs_shader.cso) {
		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
	} else {
		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
				      R_00B130_SPI_SHADER_USER_DATA_VS_0);
	}

	/* TES can be bound as ES or VS, or not bound at all. */
	if (sctx->tes_shader.cso) {
		if (sctx->gs_shader.cso)
			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
					      R_00B330_SPI_SHADER_USER_DATA_ES_0);
		else
			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
					      R_00B130_SPI_SHADER_USER_DATA_VS_0);
	} else {
		si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
	}
}

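/* Emit one SET_SH_REG packet that writes the 64-bit GPU address of a
 * descriptor list into two consecutive user data SGPRs of the stage
 * selected by sh_base.
 */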
static void si_emit_shader_pointer(struct si_context *sctx,
				   struct si_descriptors *desc,
				   unsigned sh_base)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	uint64_t va;

	assert(desc->buffer);

	va = desc->buffer->gpu_address +
	     desc->buffer_offset;

	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
	radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
}

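/* Emit all dirty graphics descriptor pointers. The RW buffer list is
 * emitted to a fixed set of stage registers; the per-shader lists are
 * only emitted for stages that currently have a user data base set.
 */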
void si_emit_graphics_shader_userdata(struct si_context *sctx,
				      struct r600_atom *atom)
{
	unsigned mask;
	uint32_t *sh_base = sctx->shader_userdata.sh_base;
	struct si_descriptors *descs;

	descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];

	if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) {
		si_emit_shader_pointer(sctx, descs,
				       R_00B030_SPI_SHADER_USER_DATA_PS_0);
		si_emit_shader_pointer(sctx, descs,
				       R_00B130_SPI_SHADER_USER_DATA_VS_0);
		si_emit_shader_pointer(sctx, descs,
				       R_00B330_SPI_SHADER_USER_DATA_ES_0);

		/* GFX9 merged LS-HS and ES-GS. Only set RW_BUFFERS for ES and LS. */
		if (sctx->b.chip_class >= GFX9) {
			si_emit_shader_pointer(sctx, descs,
					       R_00B430_SPI_SHADER_USER_DATA_LS_0);
		} else {
			si_emit_shader_pointer(sctx, descs,
					       R_00B230_SPI_SHADER_USER_DATA_GS_0);
			si_emit_shader_pointer(sctx, descs,
					       R_00B430_SPI_SHADER_USER_DATA_HS_0);
		}
	}

	mask = sctx->shader_pointers_dirty &
	       u_bit_consecutive(SI_DESCS_FIRST_SHADER,
				 SI_DESCS_FIRST_COMPUTE - SI_DESCS_FIRST_SHADER);

	while (mask) {
		unsigned i = u_bit_scan(&mask);
		unsigned shader = (i - SI_DESCS_FIRST_SHADER) / SI_NUM_SHADER_DESCS;
		unsigned base = sh_base[shader];

		if (base)
			si_emit_shader_pointer(sctx, descs + i, base);
	}
	sctx->shader_pointers_dirty &=
		~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE);

	if (sctx->vertex_buffer_pointer_dirty) {
		si_emit_shader_pointer(sctx, &sctx->vertex_buffers,
				       sh_base[PIPE_SHADER_VERTEX]);
		sctx->vertex_buffer_pointer_dirty = false;
	}
}

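/* Emit all dirty compute descriptor pointers. Compute user data always
 * starts at R_00B900_COMPUTE_USER_DATA_0.
 */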
void si_emit_compute_shader_userdata(struct si_context *sctx)
{
	unsigned base = R_00B900_COMPUTE_USER_DATA_0;
	struct si_descriptors *descs = sctx->descriptors;
	unsigned compute_mask =
		u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_SHADER_DESCS);
	unsigned mask = sctx->shader_pointers_dirty & compute_mask;

	while (mask) {
		unsigned i = u_bit_scan(&mask);

		si_emit_shader_pointer(sctx, descs + i, base);
	}
	sctx->shader_pointers_dirty &= ~compute_mask;
}

/* INIT/DEINIT/UPLOAD */

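/* Create all descriptor lists, carve up CE RAM between them, install the
 * pipe_context callbacks that modify descriptors, and set the default
 * SPI_SHADER_USER_DATA mappings.
 */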
void si_init_all_descriptors(struct si_context *sctx)
{
	int i;
	unsigned ce_offset = 0;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		/* GFX9 has only 4KB of CE, while previous chips had 32KB.
		 * Rarely used descriptors don't use CE RAM.
		 */
		bool big_ce = sctx->b.chip_class <= VI;
		bool images_use_ce = big_ce;
		bool shaderbufs_use_ce = big_ce ||
					 i == PIPE_SHADER_COMPUTE;
		bool samplers_use_ce = big_ce ||
				       i == PIPE_SHADER_FRAGMENT;

		si_init_buffer_resources(&sctx->const_buffers[i],
					 si_const_buffer_descriptors(sctx, i),
					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
					 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER,
					 &ce_offset);
		si_init_buffer_resources(&sctx->shader_buffers[i],
					 si_shader_buffer_descriptors(sctx, i),
					 SI_NUM_SHADER_BUFFERS, SI_SGPR_SHADER_BUFFERS,
					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER,
					 shaderbufs_use_ce ? &ce_offset : NULL);

		si_init_descriptors(si_sampler_descriptors(sctx, i),
				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
				    null_texture_descriptor,
				    samplers_use_ce ? &ce_offset : NULL);

		si_init_descriptors(si_image_descriptors(sctx, i),
				    SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
				    null_image_descriptor,
				    images_use_ce ? &ce_offset : NULL);
	}

	si_init_buffer_resources(&sctx->rw_buffers,
				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
				 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS,
				 &ce_offset);
	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
			    4, SI_NUM_VERTEX_BUFFERS, NULL, NULL);

	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);

	if (sctx->b.chip_class >= GFX9)
		assert(ce_offset <= 4096);
	else
		assert(ce_offset <= 32768);

	/* Set pipe_context functions. */
	sctx->b.b.bind_sampler_states = si_bind_sampler_states;
	sctx->b.b.set_shader_images = si_set_shader_images;
	sctx->b.b.set_constant_buffer = si_pipe_set_constant_buffer;
	sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
	sctx->b.b.set_shader_buffers = si_set_shader_buffers;
	sctx->b.b.set_sampler_views = si_set_sampler_views;
	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
	sctx->b.invalidate_buffer = si_invalidate_buffer;

	/* Shader user data. */
	si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
		     si_emit_graphics_shader_userdata);

	/* Set default and immutable mappings. */
	si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);

	if (sctx->b.chip_class >= GFX9) {
		si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL,
				      R_00B430_SPI_SHADER_USER_DATA_LS_0);
		si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY,
				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
	} else {
		si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL,
				      R_00B430_SPI_SHADER_USER_DATA_HS_0);
		si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY,
				      R_00B230_SPI_SHADER_USER_DATA_GS_0);
	}
	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
}

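/* Upload all dirty graphics descriptor lists and mark their shader
 * pointers for re-emission. Returns false on an upload failure.
 */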
bool si_upload_graphics_shader_descriptors(struct si_context *sctx)
{
	const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE);
	unsigned dirty = sctx->descriptors_dirty & mask;

	/* Assume nothing will go wrong: */
	sctx->shader_pointers_dirty |= dirty;

	while (dirty) {
		unsigned i = u_bit_scan(&dirty);

		if (!si_upload_descriptors(sctx, &sctx->descriptors[i],
					   &sctx->shader_userdata.atom))
			return false;
	}

	sctx->descriptors_dirty &= ~mask;
	return true;
}

bool si_upload_compute_shader_descriptors(struct si_context *sctx)
{
	/* This does not update rw_buffers: compute shaders don't need them,
	 * and the compute input buffer uses the same SGPRs anyway.
	 */
	const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE,
						SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE);
	unsigned dirty = sctx->descriptors_dirty & mask;

	/* Assume nothing will go wrong: */
	sctx->shader_pointers_dirty |= dirty;

	while (dirty) {
		unsigned i = u_bit_scan(&dirty);

		if (!si_upload_descriptors(sctx, &sctx->descriptors[i], NULL))
			return false;
	}

	sctx->descriptors_dirty &= ~mask;

	return true;
}

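/* Drop all references held by the descriptor machinery: bound buffers,
 * sampler and image views, and the descriptor buffers themselves.
 */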
void si_release_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_release_buffer_resources(&sctx->const_buffers[i],
					    si_const_buffer_descriptors(sctx, i));
		si_release_buffer_resources(&sctx->shader_buffers[i],
					    si_shader_buffer_descriptors(sctx, i));
		si_release_sampler_views(&sctx->samplers[i].views);
		si_release_image_views(&sctx->images[i]);
	}
	si_release_buffer_resources(&sctx->rw_buffers,
				    &sctx->descriptors[SI_DESCS_RW_BUFFERS]);

	for (i = 0; i < SI_NUM_DESCS; ++i)
		si_release_descriptors(&sctx->descriptors[i]);
	si_release_descriptors(&sctx->vertex_buffers);
}

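/* Re-add all descriptor buffers and currently bound resources to the new
 * command stream's buffer list and mark every shader pointer dirty.
 */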
void si_all_descriptors_begin_new_cs(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
		si_buffer_resources_begin_new_cs(sctx, &sctx->shader_buffers[i]);
		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
	}
	si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
	si_vertex_buffers_begin_new_cs(sctx);

	for (i = 0; i < SI_NUM_DESCS; ++i)
		si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);

	si_shader_userdata_begin_new_cs(sctx);
}