mesa.git: src/gallium/drivers/radeonsi/si_descriptors.c
1 /*
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Marek Olšák <marek.olsak@amd.com>
25 */
26
27 /* Resource binding slots and sampler states (each described with 8 or 4 dwords)
28 * live in memory on SI.
29 *
30 * This file is responsible for managing lists of resources and sampler states
31 * in memory and binding them, which means updating those structures in memory.
32 *
33 * There is also code for updating shader pointers to resources and sampler
34 * states. CP DMA functions are here too.
35 */
36
37 #include "radeon/r600_cs.h"
38 #include "si_pipe.h"
39 #include "si_shader.h"
40 #include "sid.h"
41
42 #include "util/u_memory.h"
43 #include "util/u_upload_mgr.h"
44
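/* Number of copies ("context slots") of each descriptor list kept in its
 * buffer. Updates are written into the next slot and the shader pointer is
 * moved there, so packets already in flight keep reading an unmodified copy. */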
45 #define SI_NUM_CONTEXTS 16
46
47 static uint32_t null_desc[8]; /* zeros */
48
49 /* Set this if you want the 3D engine to wait until CP DMA is done.
50 * It should be set on the last CP DMA packet. */
51 #define R600_CP_DMA_SYNC (1 << 0) /* R600+ */
52
53 /* Set this if the source data was used as a destination in a previous CP DMA
54 * packet. It's for preventing a read-after-write (RAW) hazard between two
55 * CP DMA packets. */
56 #define SI_CP_DMA_RAW_WAIT (1 << 1) /* SI+ */
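/* Set this to route the CP DMA transfer through the TC L2 cache
 * (SRC_SEL/DST_SEL = 3 in the DMA_DATA packet). Only meaningful on CIK+;
 * on SI, CP DMA is always uncached (see si_update_descriptors). */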
57 #define CIK_CP_DMA_USE_L2 (1 << 2)
58
59 /* Emit a CP DMA packet to do a copy from one buffer to another.
60 * The size must fit in bits [20:0].
61 */
62 static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
63 uint64_t dst_va, uint64_t src_va,
64 unsigned size, unsigned flags)
65 {
66 struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
67 uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
68 uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
69 uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
70 PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0;
71
72 assert(size);
73 assert((size & ((1<<21)-1)) == size);
74
75 if (sctx->b.chip_class >= CIK) {
76 radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
77 radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */
78 radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
79 radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */
80 radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
81 radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
82 radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
83 } else {
84 radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
85 radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
86 radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
87 radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
88 radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
89 radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
90 }
91 }
92
93 /* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
94 static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
95 uint64_t dst_va, unsigned size,
96 uint32_t clear_value, unsigned flags)
97 {
98 struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
99 uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
100 uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
101 uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0;
102
103 assert(size);
104 assert((size & ((1<<21)-1)) == size);
105
106 if (sctx->b.chip_class >= CIK) {
107 radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
108 radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
109 radeon_emit(cs, clear_value); /* DATA [31:0] */
110 radeon_emit(cs, 0); /* SRC_ADDR_HI (unused when the source is embedded data) */
111 radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
112 radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
113 radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
114 } else {
115 radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
116 radeon_emit(cs, clear_value); /* DATA [31:0] */
117 radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
118 radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
119 radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
120 radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
121 }
122 }
123
124 static void si_init_descriptors(struct si_context *sctx,
125 struct si_descriptors *desc,
126 unsigned shader_userdata_reg,
127 unsigned element_dw_size,
128 unsigned num_elements,
129 void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
130 {
131 assert(num_elements <= sizeof(desc->enabled_mask)*8);
132 assert(num_elements <= sizeof(desc->dirty_mask)*8);
133
134 desc->atom.emit = (void*)emit_func;
135 desc->shader_userdata_reg = shader_userdata_reg;
136 desc->element_dw_size = element_dw_size;
137 desc->num_elements = num_elements;
138 desc->context_size = num_elements * element_dw_size * 4;
139
140 desc->buffer = (struct r600_resource*)
141 pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
142 PIPE_USAGE_DEFAULT,
143 SI_NUM_CONTEXTS * desc->context_size);
144
145 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
146 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
147
148 /* We don't check for CS space here, because this should be called
149 * only once at context initialization. */
150 si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
151 desc->buffer->b.b.width0, 0,
152 R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
153 }
154
155 static void si_release_descriptors(struct si_descriptors *desc)
156 {
157 pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
158 }
159
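/* Recompute the worst-case command size of the atom: 7 dwords for the CP DMA
 * copy into the next context slot, 4 + element_dw_size dwords per dirty element
 * (WRITE_DATA header, control word and 64-bit address plus the payload), and
 * 4 dwords for each SET_SH_REG pointer update. */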
160 static void si_update_descriptors(struct si_context *sctx,
161 struct si_descriptors *desc)
162 {
163 if (desc->dirty_mask) {
164 desc->atom.num_dw =
165 7 + /* copy */
166 (4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask) + /* update */
167 4; /* pointer update */
168
169 if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
170 desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0)
171 desc->atom.num_dw += 4; /* second pointer update */
172
173 desc->atom.dirty = true;
174
175 /* TODO: Investigate if these flushes can be removed after
176 * adding CE support. */
177
178 /* The descriptors are read with the K cache. */
179 sctx->b.flags |= SI_CONTEXT_INV_KCACHE;
180
181 /* Since SI uses uncached CP DMA to update descriptors,
182 * we have to flush TC L2, which is used to fetch constants
183 * along with KCACHE. */
184 if (sctx->b.chip_class == SI)
185 sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
186 } else {
187 desc->atom.dirty = false;
188 }
189 }
190
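/* Emit the descriptor list address into the user data SGPRs of its shader
 * stage. Pointers in the VS user data range are mirrored to the ES range as
 * well, presumably because the same vertex shader runs as an ES stage when a
 * geometry shader is active. */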
191 static void si_emit_shader_pointer(struct si_context *sctx,
192 struct r600_atom *atom)
193 {
194 struct si_descriptors *desc = (struct si_descriptors*)atom;
195 struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
196 uint64_t va = desc->buffer->gpu_address +
197 desc->current_context_id * desc->context_size +
198 desc->buffer_offset;
199
200 radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
201 radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
202 radeon_emit(cs, va);
203 radeon_emit(cs, va >> 32);
204
205 if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
206 desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) {
207 radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
208 radeon_emit(cs, (desc->shader_userdata_reg +
209 (R_00B330_SPI_SHADER_USER_DATA_ES_0 -
210 R_00B130_SPI_SHADER_USER_DATA_VS_0) -
211 SI_SH_REG_OFFSET) >> 2);
212 radeon_emit(cs, va);
213 radeon_emit(cs, va >> 32);
214 }
215 }
216
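/* Flush a descriptor list to memory: copy the current context slot to the next
 * one with CP DMA, overwrite the dirty elements with WRITE_DATA packets
 * (consecutive dirty elements are merged into a single packet), and finally
 * re-emit the shader user data pointer so it points at the new slot. */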
217 static void si_emit_descriptors(struct si_context *sctx,
218 struct si_descriptors *desc,
219 uint32_t **descriptors)
220 {
221 struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
222 uint64_t va_base;
223 int packet_start = 0;
224 int packet_size = 0;
225 int last_index = desc->num_elements; /* point to a non-existing element */
226 unsigned dirty_mask = desc->dirty_mask;
227 unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
228
229 assert(dirty_mask);
230
231 va_base = desc->buffer->gpu_address;
232
233 /* Copy the descriptors to a new context slot. */
234 si_emit_cp_dma_copy_buffer(sctx,
235 va_base + new_context_id * desc->context_size,
236 va_base + desc->current_context_id * desc->context_size,
237 desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
238
239 va_base += new_context_id * desc->context_size;
240
241 /* Update the descriptors.
242 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
243 *
244 * XXX When unbinding lots of resources, consider clearing the memory
245 * with CP DMA instead of emitting zeros.
246 */
247 while (dirty_mask) {
248 int i = u_bit_scan(&dirty_mask);
249
250 assert(i < desc->num_elements);
251
252 if (last_index+1 == i && packet_size) {
253 /* Append new data at the end of the last packet. */
254 packet_size += desc->element_dw_size;
255 cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
256 } else {
257 /* Start a new packet. */
258 uint64_t va = va_base + i * desc->element_dw_size * 4;
259
260 packet_start = cs->cdw;
261 packet_size = 2 + desc->element_dw_size;
262
263 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
264 radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
265 PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
266 PKT3_WRITE_DATA_DST_SEL_TC_L2) |
267 PKT3_WRITE_DATA_WR_CONFIRM |
268 PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
269 radeon_emit(cs, va & 0xFFFFFFFFUL);
270 radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
271 }
272
273 radeon_emit_array(cs, descriptors[i], desc->element_dw_size);
274
275 last_index = i;
276 }
277
278 desc->dirty_mask = 0;
279 desc->current_context_id = new_context_id;
280
281 /* Now update the shader userdata pointer. */
282 si_emit_shader_pointer(sctx, &desc->atom);
283 }
284
285 static unsigned si_get_shader_user_data_base(unsigned shader)
286 {
287 switch (shader) {
288 case PIPE_SHADER_VERTEX:
289 return R_00B130_SPI_SHADER_USER_DATA_VS_0;
290 case PIPE_SHADER_GEOMETRY:
291 return R_00B230_SPI_SHADER_USER_DATA_GS_0;
292 case PIPE_SHADER_FRAGMENT:
293 return R_00B030_SPI_SHADER_USER_DATA_PS_0;
294 default:
295 assert(0);
296 return 0;
297 }
298 }
299
300 /* SAMPLER VIEWS */
301
302 static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
303 {
304 struct si_sampler_views *views = (struct si_sampler_views*)atom;
305
306 si_emit_descriptors(sctx, &views->desc, views->desc_data);
307 }
308
309 static void si_init_sampler_views(struct si_context *sctx,
310 struct si_sampler_views *views,
311 unsigned shader)
312 {
313 si_init_descriptors(sctx, &views->desc,
314 si_get_shader_user_data_base(shader) +
315 SI_SGPR_RESOURCE * 4,
316 8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
317 }
318
319 static void si_release_sampler_views(struct si_sampler_views *views)
320 {
321 int i;
322
323 for (i = 0; i < Elements(views->views); i++) {
324 pipe_sampler_view_reference(&views->views[i], NULL);
325 }
326 si_release_descriptors(&views->desc);
327 }
328
329 static enum radeon_bo_priority si_get_resource_ro_priority(struct r600_resource *res)
330 {
331 if (res->b.b.target == PIPE_BUFFER)
332 return RADEON_PRIO_SHADER_BUFFER_RO;
333
334 if (res->b.b.nr_samples > 1)
335 return RADEON_PRIO_SHADER_TEXTURE_MSAA;
336
337 return RADEON_PRIO_SHADER_TEXTURE_RO;
338 }
339
340 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
341 struct si_sampler_views *views)
342 {
343 unsigned mask = views->desc.enabled_mask;
344
345 /* Add relocations to the CS. */
346 while (mask) {
347 int i = u_bit_scan(&mask);
348 struct si_sampler_view *rview =
349 (struct si_sampler_view*)views->views[i];
350
351 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
352 rview->resource, RADEON_USAGE_READ,
353 si_get_resource_ro_priority(rview->resource));
354 }
355
356 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
357 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
358
359 si_emit_shader_pointer(sctx, &views->desc.atom);
360 }
361
362 static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
363 unsigned slot, struct pipe_sampler_view *view,
364 unsigned *view_desc)
365 {
366 struct si_sampler_views *views = &sctx->samplers[shader].views;
367
368 if (views->views[slot] == view)
369 return;
370
371 if (view) {
372 struct si_sampler_view *rview =
373 (struct si_sampler_view*)view;
374
375 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
376 rview->resource, RADEON_USAGE_READ,
377 si_get_resource_ro_priority(rview->resource));
378
379 pipe_sampler_view_reference(&views->views[slot], view);
380 views->desc_data[slot] = view_desc;
381 views->desc.enabled_mask |= 1 << slot;
382 } else {
383 pipe_sampler_view_reference(&views->views[slot], NULL);
384 views->desc_data[slot] = null_desc;
385 views->desc.enabled_mask &= ~(1 << slot);
386 }
387
388 views->desc.dirty_mask |= 1 << slot;
389 }
390
391 static void si_set_sampler_views(struct pipe_context *ctx,
392 unsigned shader, unsigned start,
393 unsigned count,
394 struct pipe_sampler_view **views)
395 {
396 struct si_context *sctx = (struct si_context *)ctx;
397 struct si_textures_info *samplers = &sctx->samplers[shader];
398 struct si_sampler_view **rviews = (struct si_sampler_view **)views;
399 int i;
400
401 if (!count || shader >= SI_NUM_SHADERS)
402 return;
403
404 for (i = 0; i < count; i++) {
405 unsigned slot = start + i;
406
407 if (!views[i]) {
408 samplers->depth_texture_mask &= ~(1 << slot);
409 samplers->compressed_colortex_mask &= ~(1 << slot);
410 si_set_sampler_view(sctx, shader, slot, NULL, NULL);
411 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
412 NULL, NULL);
413 continue;
414 }
415
416 si_set_sampler_view(sctx, shader, slot, views[i], rviews[i]->state);
417
418 if (views[i]->texture->target != PIPE_BUFFER) {
419 struct r600_texture *rtex =
420 (struct r600_texture*)views[i]->texture;
421
422 if (rtex->is_depth && !rtex->is_flushing_texture) {
423 samplers->depth_texture_mask |= 1 << slot;
424 } else {
425 samplers->depth_texture_mask &= ~(1 << slot);
426 }
427 if (rtex->cmask.size || rtex->fmask.size) {
428 samplers->compressed_colortex_mask |= 1 << slot;
429 } else {
430 samplers->compressed_colortex_mask &= ~(1 << slot);
431 }
432
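/* MSAA color textures also need their FMASK descriptor bound, which lives
 * in the separate SI_FMASK_TEX_OFFSET slot range. */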
433 if (rtex->fmask.size) {
434 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
435 views[i], rviews[i]->fmask_state);
436 } else {
437 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
438 NULL, NULL);
439 }
440 } else {
441 samplers->depth_texture_mask &= ~(1 << slot);
442 samplers->compressed_colortex_mask &= ~(1 << slot);
443 si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
444 NULL, NULL);
445 }
446 }
447
448 si_update_descriptors(sctx, &samplers->views.desc);
449 }
450
451 /* SAMPLER STATES */
452
453 static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
454 {
455 struct si_sampler_states *states = (struct si_sampler_states*)atom;
456
457 si_emit_descriptors(sctx, &states->desc, states->desc_data);
458 }
459
460 static void si_sampler_states_begin_new_cs(struct si_context *sctx,
461 struct si_sampler_states *states)
462 {
463 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
464 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
465 si_emit_shader_pointer(sctx, &states->desc.atom);
466 }
467
468 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
469 unsigned start, unsigned count, void **states)
470 {
471 struct si_sampler_states *samplers = &sctx->samplers[shader].states;
472 struct si_sampler_state **sstates = (struct si_sampler_state**)states;
473 int i;
474
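/* Remember the first two states so they can be restored later, presumably
 * around u_blitter meta operations. */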
475 if (start == 0)
476 samplers->saved_states[0] = states[0];
477 if (start == 1)
478 samplers->saved_states[1] = states[0];
479 else if (start == 0 && count >= 2)
480 samplers->saved_states[1] = states[1];
481
482 for (i = 0; i < count; i++) {
483 unsigned slot = start + i;
484
485 if (!sstates[i]) {
486 samplers->desc.dirty_mask &= ~(1 << slot);
487 continue;
488 }
489
490 samplers->desc_data[slot] = sstates[i]->val;
491 samplers->desc.dirty_mask |= 1 << slot;
492 }
493
494 si_update_descriptors(sctx, &samplers->desc);
495 }
496
497 /* BUFFER RESOURCES */
498
499 static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
500 {
501 struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;
502
503 si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
504 }
505
506 static void si_init_buffer_resources(struct si_context *sctx,
507 struct si_buffer_resources *buffers,
508 unsigned num_buffers, unsigned shader,
509 unsigned shader_userdata_index,
510 enum radeon_bo_usage shader_usage,
511 enum radeon_bo_priority priority)
512 {
513 int i;
514
515 buffers->num_buffers = num_buffers;
516 buffers->shader_usage = shader_usage;
517 buffers->priority = priority;
518 buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
519 buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);
520
521 /* si_emit_descriptors only accepts an array of pointers to the
522 * individual descriptors, so build one here pointing into desc_storage. */
523 buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
524 for (i = 0; i < num_buffers; i++) {
525 buffers->desc_data[i] = &buffers->desc_storage[i*4];
526 }
527
528 si_init_descriptors(sctx, &buffers->desc,
529 si_get_shader_user_data_base(shader) +
530 shader_userdata_index*4, 4, num_buffers,
531 si_emit_buffer_resources);
532 }
533
534 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
535 {
536 int i;
537
538 for (i = 0; i < buffers->num_buffers; i++) {
539 pipe_resource_reference(&buffers->buffers[i], NULL);
540 }
541
542 FREE(buffers->buffers);
543 FREE(buffers->desc_storage);
544 FREE(buffers->desc_data);
545 si_release_descriptors(&buffers->desc);
546 }
547
548 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
549 struct si_buffer_resources *buffers)
550 {
551 unsigned mask = buffers->desc.enabled_mask;
552
553 /* Add relocations to the CS. */
554 while (mask) {
555 int i = u_bit_scan(&mask);
556
557 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
558 (struct r600_resource*)buffers->buffers[i],
559 buffers->shader_usage, buffers->priority);
560 }
561
562 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
563 buffers->desc.buffer, RADEON_USAGE_READWRITE,
564 RADEON_PRIO_SHADER_DATA);
565
566 si_emit_shader_pointer(sctx, &buffers->desc.atom);
567 }
568
569 /* VERTEX BUFFERS */
570
571 static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
572 {
573 struct si_descriptors *desc = &sctx->vertex_buffers;
574 int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
575 int i;
576
577 for (i = 0; i < count; i++) {
578 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
579
580 if (vb >= Elements(sctx->vertex_buffer))
581 continue;
582 if (!sctx->vertex_buffer[vb].buffer)
583 continue;
584
585 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
586 (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
587 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
588 }
589 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
590 desc->buffer, RADEON_USAGE_READ,
591 RADEON_PRIO_SHADER_DATA);
592
593 si_emit_shader_pointer(sctx, &desc->atom);
594 }
595
596 void si_update_vertex_buffers(struct si_context *sctx)
597 {
598 struct si_descriptors *desc = &sctx->vertex_buffers;
599 bool bound[SI_NUM_VERTEX_BUFFERS] = {};
600 unsigned i, count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
601 uint64_t va;
602 uint32_t *ptr;
603 
604 if (!count)
605 return;
606
607 /* Vertex buffer descriptors are the only ones that are re-uploaded
608 * as a whole through the upload manager every time, instead of being
609 * patched in place through the fine-grained per-element update path.
610 */
611 u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
612 (struct pipe_resource**)&desc->buffer, (void**)&ptr);
613
614 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
615 desc->buffer, RADEON_USAGE_READ,
616 RADEON_PRIO_SHADER_DATA);
617
618 assert(count <= SI_NUM_VERTEX_BUFFERS);
619 assert(desc->current_context_id == 0);
620
621 for (i = 0; i < count; i++) {
622 struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
623 struct pipe_vertex_buffer *vb;
624 struct r600_resource *rbuffer;
625 unsigned offset;
626 uint32_t *desc = &ptr[i*4];
627
628 if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
629 memset(desc, 0, 16);
630 continue;
631 }
632
633 vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
634 rbuffer = (struct r600_resource*)vb->buffer;
635 if (rbuffer == NULL) {
636 memset(desc, 0, 16);
637 continue;
638 }
639
640 offset = vb->buffer_offset + ve->src_offset;
641 va = rbuffer->gpu_address + offset;
642
643 /* Fill in T# buffer resource description */
644 desc[0] = va & 0xFFFFFFFF;
645 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
646 S_008F04_STRIDE(vb->stride);
647 if (vb->stride)
648 /* Number of vertices that fit entirely: (available size - format_size) / stride + 1 */
649 desc[2] = (vb->buffer->width0 - offset -
650 sctx->vertex_elements->format_size[i]) /
651 vb->stride + 1;
652 else
653 desc[2] = vb->buffer->width0 - offset;
654
655 desc[3] = sctx->vertex_elements->rsrc_word3[i];
656
657 if (!bound[ve->vertex_buffer_index]) {
658 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
659 (struct r600_resource*)vb->buffer,
660 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
661 bound[ve->vertex_buffer_index] = true;
662 }
663 }
664
665 desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
666 desc->atom.dirty = true;
667
668 /* Don't flush the const cache. It would have a very negative effect
669 * on performance (confirmed by testing). New descriptors are always
670 * uploaded to a fresh new buffer, so I don't think flushing the const
671 * cache is needed. */
672 }
673
674
675 /* CONSTANT BUFFERS */
676
677 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
678 const uint8_t *ptr, unsigned size, uint32_t *const_offset)
679 {
680 void *tmp;
681
682 u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
683 (struct pipe_resource**)rbuffer, &tmp);
684 util_memcpy_cpu_to_le32(tmp, ptr, size);
685 }
686
687 static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
688 struct pipe_constant_buffer *input)
689 {
690 struct si_context *sctx = (struct si_context *)ctx;
691 struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
692
693 if (shader >= SI_NUM_SHADERS)
694 return;
695
696 assert(slot < buffers->num_buffers);
697 pipe_resource_reference(&buffers->buffers[slot], NULL);
698
699 /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
700 * with a NULL buffer). We need to use a dummy buffer instead. */
701 if (sctx->b.chip_class == CIK &&
702 (!input || (!input->buffer && !input->user_buffer)))
703 input = &sctx->null_const_buf;
704
705 if (input && (input->buffer || input->user_buffer)) {
706 struct pipe_resource *buffer = NULL;
707 uint64_t va;
708
709 /* Upload the user buffer if needed. */
710 if (input->user_buffer) {
711 unsigned buffer_offset;
712
713 si_upload_const_buffer(sctx,
714 (struct r600_resource**)&buffer, input->user_buffer,
715 input->buffer_size, &buffer_offset);
716 va = r600_resource(buffer)->gpu_address + buffer_offset;
717 } else {
718 pipe_resource_reference(&buffer, input->buffer);
719 va = r600_resource(buffer)->gpu_address + input->buffer_offset;
720 }
721
722 /* Set the descriptor. */
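/* Dwords 0-1 hold the 48-bit base address with stride 0, dword 2 the size
 * in bytes (num_records), dword 3 the channel swizzle and buffer format. */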
723 uint32_t *desc = buffers->desc_data[slot];
724 desc[0] = va;
725 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
726 S_008F04_STRIDE(0);
727 desc[2] = input->buffer_size;
728 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
729 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
730 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
731 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
732 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
733 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
734
735 buffers->buffers[slot] = buffer;
736 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
737 (struct r600_resource*)buffer,
738 buffers->shader_usage, buffers->priority);
739 buffers->desc.enabled_mask |= 1 << slot;
740 } else {
741 /* Clear the descriptor. */
742 memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
743 buffers->desc.enabled_mask &= ~(1 << slot);
744 }
745
746 buffers->desc.dirty_mask |= 1 << slot;
747 si_update_descriptors(sctx, &buffers->desc);
748 }
749
750 /* RING BUFFERS */
751
752 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
753 struct pipe_resource *buffer,
754 unsigned stride, unsigned num_records,
755 bool add_tid, bool swizzle,
756 unsigned element_size, unsigned index_stride)
757 {
758 struct si_context *sctx = (struct si_context *)ctx;
759 struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
760
761 if (shader >= SI_NUM_SHADERS)
762 return;
763
764 /* The stride field in the resource descriptor has 14 bits */
765 assert(stride < (1 << 14));
766
767 assert(slot < buffers->num_buffers);
768 pipe_resource_reference(&buffers->buffers[slot], NULL);
769
770 if (buffer) {
771 uint64_t va;
772
773 va = r600_resource(buffer)->gpu_address;
774
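/* Translate the byte values into the hardware encodings used by the
 * ELEMENT_SIZE and INDEX_STRIDE fields of the buffer resource. */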
775 switch (element_size) {
776 default:
777 assert(!"Unsupported ring buffer element size");
778 case 0:
779 case 2:
780 element_size = 0;
781 break;
782 case 4:
783 element_size = 1;
784 break;
785 case 8:
786 element_size = 2;
787 break;
788 case 16:
789 element_size = 3;
790 break;
791 }
792
793 switch (index_stride) {
794 default:
795 assert(!"Unsupported ring buffer index stride");
796 case 0:
797 case 8:
798 index_stride = 0;
799 break;
800 case 16:
801 index_stride = 1;
802 break;
803 case 32:
804 index_stride = 2;
805 break;
806 case 64:
807 index_stride = 3;
808 break;
809 }
810
811 /* Set the descriptor. */
812 uint32_t *desc = buffers->desc_data[slot];
813 desc[0] = va;
814 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
815 S_008F04_STRIDE(stride) |
816 S_008F04_SWIZZLE_ENABLE(swizzle);
817 desc[2] = num_records;
818 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
819 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
820 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
821 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
822 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
823 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
824 S_008F0C_ELEMENT_SIZE(element_size) |
825 S_008F0C_INDEX_STRIDE(index_stride) |
826 S_008F0C_ADD_TID_ENABLE(add_tid);
827
828 pipe_resource_reference(&buffers->buffers[slot], buffer);
829 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
830 (struct r600_resource*)buffer,
831 buffers->shader_usage, buffers->priority);
832 buffers->desc.enabled_mask |= 1 << slot;
833 } else {
834 /* Clear the descriptor. */
835 memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
836 buffers->desc.enabled_mask &= ~(1 << slot);
837 }
838
839 buffers->desc.dirty_mask |= 1 << slot;
840 si_update_descriptors(sctx, &buffers->desc);
841 }
842
843 /* STREAMOUT BUFFERS */
844
845 static void si_set_streamout_targets(struct pipe_context *ctx,
846 unsigned num_targets,
847 struct pipe_stream_output_target **targets,
848 const unsigned *offsets)
849 {
850 struct si_context *sctx = (struct si_context *)ctx;
851 struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
852 unsigned old_num_targets = sctx->b.streamout.num_targets;
853 unsigned i, bufidx;
854
855 /* We are going to unbind the buffers. Mark which caches need to be flushed. */
856 if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
857 /* Since streamout uses vector writes which go through TC L2
858 * and most other clients can use TC L2 as well, we don't need
859 * to flush it.
860 *
861 * The only case which requires flushing it is VGT DMA index
862 * fetching, which is a rare case. Thus, flag the TC L2
863 * dirtiness in the resource and handle it when index fetching
864 * is used.
865 */
866 for (i = 0; i < sctx->b.streamout.num_targets; i++)
867 if (sctx->b.streamout.targets[i])
868 r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
869
870 /* Invalidate the scalar cache in case a streamout buffer is
871 * going to be used as a constant buffer.
872 *
873 * Invalidate TC L1, because streamout bypasses it (done by
874 * setting GLC=1 in the store instruction), but it can contain
875 * outdated data of streamout buffers.
876 *
877 * VS_PARTIAL_FLUSH is required if the buffers are going to be
878 * used as an input immediately.
879 */
880 sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
881 SI_CONTEXT_INV_TC_L1 |
882 SI_CONTEXT_VS_PARTIAL_FLUSH;
883 }
884
885 /* Streamout buffers must be bound in 2 places:
886 * 1) in VGT by setting the VGT_STRMOUT registers
887 * 2) as shader resources
888 */
889
890 /* Set the VGT regs. */
891 r600_set_streamout_targets(ctx, num_targets, targets, offsets);
892
893 /* Set the shader resources. */
894 for (i = 0; i < num_targets; i++) {
895 bufidx = SI_SO_BUF_OFFSET + i;
896
897 if (targets[i]) {
898 struct pipe_resource *buffer = targets[i]->buffer;
899 uint64_t va = r600_resource(buffer)->gpu_address;
900
901 /* Set the descriptor. */
902 uint32_t *desc = buffers->desc_data[bufidx];
903 desc[0] = va;
904 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
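/* num_records is set to the maximum; the writable size is presumably
 * bounded by the VGT_STRMOUT registers programmed above rather than by
 * this resource descriptor. */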
905 desc[2] = 0xffffffff;
906 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
907 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
908 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
909 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
910
911 /* Set the resource. */
912 pipe_resource_reference(&buffers->buffers[bufidx],
913 buffer);
914 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
915 (struct r600_resource*)buffer,
916 buffers->shader_usage, buffers->priority);
917 buffers->desc.enabled_mask |= 1 << bufidx;
918 } else {
919 /* Clear the descriptor and unset the resource. */
920 memset(buffers->desc_data[bufidx], 0,
921 sizeof(uint32_t) * 4);
922 pipe_resource_reference(&buffers->buffers[bufidx],
923 NULL);
924 buffers->desc.enabled_mask &= ~(1 << bufidx);
925 }
926 buffers->desc.dirty_mask |= 1 << bufidx;
927 }
928 for (; i < old_num_targets; i++) {
929 bufidx = SI_SO_BUF_OFFSET + i;
930 /* Clear the descriptor and unset the resource. */
931 memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
932 pipe_resource_reference(&buffers->buffers[bufidx], NULL);
933 buffers->desc.enabled_mask &= ~(1 << bufidx);
934 buffers->desc.dirty_mask |= 1 << bufidx;
935 }
936
937 si_update_descriptors(sctx, &buffers->desc);
938 }
939
940 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
941 uint32_t *desc, uint64_t old_buf_va,
942 struct pipe_resource *new_buf)
943 {
944 /* Retrieve the buffer offset from the descriptor. */
945 uint64_t old_desc_va =
946 desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
947
948 assert(old_buf_va <= old_desc_va);
949 uint64_t offset_within_buffer = old_desc_va - old_buf_va;
950
951 /* Update the descriptor. */
952 uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;
953
954 desc[0] = va;
955 desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
956 S_008F04_BASE_ADDRESS_HI(va >> 32);
957 }
958
959 /* BUFFER DISCARD/INVALIDATION */
960
961 /* Reallocate a buffer and update all resource bindings where the buffer is
962 * bound.
963 *
964 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
965 * idle by discarding its contents. Apps usually tell us when to do this using
966 * map_buffer flags, for example.
967 */
968 static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
969 {
970 struct si_context *sctx = (struct si_context*)ctx;
971 struct r600_resource *rbuffer = r600_resource(buf);
972 unsigned i, shader, alignment = rbuffer->buf->alignment;
973 uint64_t old_va = rbuffer->gpu_address;
974 unsigned num_elems = sctx->vertex_elements ?
975 sctx->vertex_elements->count : 0;
976 struct si_sampler_view *view;
977
978 /* Reallocate the buffer in the same pipe_resource. */
979 r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
980 alignment, TRUE);
981
982 /* We changed the buffer, now we need to bind it where the old one
983 * was bound. This consists of 2 things:
984 * 1) Updating the resource descriptor and dirtying it.
985 * 2) Adding a relocation to the CS, so that it's usable.
986 */
987
988 /* Vertex buffers. */
989 for (i = 0; i < num_elems; i++) {
990 int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
991
992 if (vb >= Elements(sctx->vertex_buffer))
993 continue;
994 if (!sctx->vertex_buffer[vb].buffer)
995 continue;
996
997 if (sctx->vertex_buffer[vb].buffer == buf) {
998 sctx->vertex_buffers_dirty = true;
999 break;
1000 }
1001 }
1002
1003 /* Read/Write buffers. */
1004 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1005 struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
1006 bool found = false;
1007 uint32_t mask = buffers->desc.enabled_mask;
1008
1009 while (mask) {
1010 i = u_bit_scan(&mask);
1011 if (buffers->buffers[i] == buf) {
1012 si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
1013 old_va, buf);
1014
1015 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
1016 rbuffer, buffers->shader_usage,
1017 buffers->priority);
1018
1019 buffers->desc.dirty_mask |= 1 << i;
1020 found = true;
1021
1022 if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
1023 /* Update the streamout state. */
1024 if (sctx->b.streamout.begin_emitted) {
1025 r600_emit_streamout_end(&sctx->b);
1026 }
1027 sctx->b.streamout.append_bitmask =
1028 sctx->b.streamout.enabled_mask;
1029 r600_streamout_buffers_dirty(&sctx->b);
1030 }
1031 }
1032 }
1033 if (found) {
1034 si_update_descriptors(sctx, &buffers->desc);
1035 }
1036 }
1037
1038 /* Constant buffers. */
1039 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1040 struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
1041 bool found = false;
1042 uint32_t mask = buffers->desc.enabled_mask;
1043
1044 while (mask) {
1045 unsigned i = u_bit_scan(&mask);
1046 if (buffers->buffers[i] == buf) {
1047 si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
1048 old_va, buf);
1049
1050 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
1051 rbuffer, buffers->shader_usage,
1052 buffers->priority);
1053
1054 buffers->desc.dirty_mask |= 1 << i;
1055 found = true;
1056 }
1057 }
1058 if (found) {
1059 si_update_descriptors(sctx, &buffers->desc);
1060 }
1061 }
1062
1063 /* Texture buffers - update virtual addresses in sampler view descriptors. */
1064 LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
1065 if (view->base.texture == buf) {
1066 si_desc_reset_buffer_offset(ctx, view->state, old_va, buf);
1067 }
1068 }
1069 /* Texture buffers - update bindings. */
1070 for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
1071 struct si_sampler_views *views = &sctx->samplers[shader].views;
1072 bool found = false;
1073 uint32_t mask = views->desc.enabled_mask;
1074
1075 while (mask) {
1076 unsigned i = u_bit_scan(&mask);
1077 if (views->views[i]->texture == buf) {
1078 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
1079 rbuffer, RADEON_USAGE_READ,
1080 RADEON_PRIO_SHADER_BUFFER_RO);
1081
1082 views->desc.dirty_mask |= 1 << i;
1083 found = true;
1084 }
1085 }
1086 if (found) {
1087 si_update_descriptors(sctx, &views->desc);
1088 }
1089 }
1090 }
1091
1092 /* CP DMA */
1093
1094 /* The max number of bytes to copy per packet. */
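/* (The byte count field is 21 bits wide; 8 bytes are presumably subtracted
 * so that full-size chunks keep an aligned size.) */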
1095 #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
1096
1097 static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
1098 unsigned offset, unsigned size, unsigned value,
1099 bool is_framebuffer)
1100 {
1101 struct si_context *sctx = (struct si_context*)ctx;
1102 unsigned flush_flags, tc_l2_flag;
1103
1104 if (!size)
1105 return;
1106
1107 /* Mark the buffer range of destination as valid (initialized),
1108 * so that transfer_map knows it should wait for the GPU when mapping
1109 * that range. */
1110 util_range_add(&r600_resource(dst)->valid_buffer_range, offset,
1111 offset + size);
1112
1113 /* Fallback for unaligned clears: clear the range with the CPU at dword granularity. */
1114 if (offset % 4 != 0 || size % 4 != 0) {
1115 uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
1116 sctx->b.rings.gfx.cs,
1117 PIPE_TRANSFER_WRITE);
1118 size /= 4;
1119 for (unsigned i = 0; i < size; i++)
1120 map[offset / 4 + i] = value; /* clear at "offset", not at the buffer start */
1121 return;
1122 }
1123
1124 uint64_t va = r600_resource(dst)->gpu_address + offset;
1125
1126 /* Flush the caches where the resource is bound. */
1127 if (is_framebuffer) {
1128 flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
1129 tc_l2_flag = 0;
1130 } else {
1131 flush_flags = SI_CONTEXT_INV_TC_L1 |
1132 (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
1133 SI_CONTEXT_INV_KCACHE;
1134 tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
1135 }
1136
1137 sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
1138 flush_flags;
1139
1140 while (size) {
1141 unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
1142 unsigned dma_flags = tc_l2_flag;
1143
1144 si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
1145 FALSE);
1146
1147 /* This must be done after need_cs_space. */
1148 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
1149 (struct r600_resource*)dst, RADEON_USAGE_WRITE,
1150 RADEON_PRIO_MIN);
1151
1152 /* Flush the caches for the first copy only.
1153 * Also wait for the previous CP DMA operations. */
1154 if (sctx->b.flags) {
1155 si_emit_cache_flush(&sctx->b, NULL);
1156 dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
1157 }
1158
1159 /* Do the synchronization after the last copy, so that all data is written to memory. */
1160 if (size == byte_count)
1161 dma_flags |= R600_CP_DMA_SYNC;
1162
1163 /* Emit the clear packet. */
1164 si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
1165
1166 size -= byte_count;
1167 va += byte_count;
1168 }
1169
1170 /* Flush the caches again in case the 3D engine has been prefetching
1171 * the resource. */
1172 sctx->b.flags |= flush_flags;
1173
1174 if (tc_l2_flag)
1175 r600_resource(dst)->TC_L2_dirty = true;
1176 }
1177
1178 void si_copy_buffer(struct si_context *sctx,
1179 struct pipe_resource *dst, struct pipe_resource *src,
1180 uint64_t dst_offset, uint64_t src_offset, unsigned size,
1181 bool is_framebuffer)
1182 {
1183 unsigned flush_flags, tc_l2_flag;
1184
1185 if (!size)
1186 return;
1187
1188 /* Mark the buffer range of destination as valid (initialized),
1189 * so that transfer_map knows it should wait for the GPU when mapping
1190 * that range. */
1191 util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
1192 dst_offset + size);
1193
1194 dst_offset += r600_resource(dst)->gpu_address;
1195 src_offset += r600_resource(src)->gpu_address;
1196
1197 /* Flush the caches where the resource is bound. */
1198 if (is_framebuffer) {
1199 flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
1200 tc_l2_flag = 0;
1201 } else {
1202 flush_flags = SI_CONTEXT_INV_TC_L1 |
1203 (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
1204 SI_CONTEXT_INV_KCACHE;
1205 tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
1206 }
1207
1208 sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
1209 flush_flags;
1210
1211 while (size) {
1212 unsigned sync_flags = tc_l2_flag;
1213 unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
1214
1215 si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);
1216
1217 /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
1218 if (sctx->b.flags) {
1219 si_emit_cache_flush(&sctx->b, NULL);
1220 sync_flags |= SI_CP_DMA_RAW_WAIT;
1221 }
1222
1223 /* Do the synchronization after the last copy, so that all data is written to memory. */
1224 if (size == byte_count) {
1225 sync_flags |= R600_CP_DMA_SYNC;
1226 }
1227
1228 /* This must be done after si_need_cs_space. */
1229 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
1230 RADEON_USAGE_READ, RADEON_PRIO_MIN);
1231 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
1232 RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
1233
1234 si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
1235
1236 size -= byte_count;
1237 src_offset += byte_count;
1238 dst_offset += byte_count;
1239 }
1240
1241 /* Flush the caches again in case the 3D engine has been prefetching
1242 * the resource. */
1243 sctx->b.flags |= flush_flags;
1244
1245 if (tc_l2_flag)
1246 r600_resource(dst)->TC_L2_dirty = true;
1247 }
1248
1249 /* INIT/DEINIT */
1250
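/* Create every descriptor list: per-shader constant buffers, read/write
 * buffers (the VS list additionally holds the streamout buffers), sampler
 * views and sampler states, plus one global vertex buffer list, and register
 * the corresponding draw-time atoms. */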
1251 void si_init_all_descriptors(struct si_context *sctx)
1252 {
1253 int i;
1254
1255 for (i = 0; i < SI_NUM_SHADERS; i++) {
1256 si_init_buffer_resources(sctx, &sctx->const_buffers[i],
1257 SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
1258 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
1259 si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
1260 i == PIPE_SHADER_VERTEX ?
1261 SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS,
1262 i, SI_SGPR_RW_BUFFERS,
1263 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
1264
1265 si_init_sampler_views(sctx, &sctx->samplers[i].views, i);
1266
1267 si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
1268 si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4,
1269 4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states);
1270
1271 sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
1272 sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
1273 sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
1274 sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
1275 }
1276
1277 si_init_descriptors(sctx, &sctx->vertex_buffers,
1278 si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
1279 SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
1280 si_emit_shader_pointer);
1281 sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;
1282
1283 /* Set pipe_context functions. */
1284 sctx->b.b.set_constant_buffer = si_set_constant_buffer;
1285 sctx->b.b.set_sampler_views = si_set_sampler_views;
1286 sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
1287 sctx->b.clear_buffer = si_clear_buffer;
1288 sctx->b.invalidate_buffer = si_invalidate_buffer;
1289 }
1290
1291 void si_release_all_descriptors(struct si_context *sctx)
1292 {
1293 int i;
1294
1295 for (i = 0; i < SI_NUM_SHADERS; i++) {
1296 si_release_buffer_resources(&sctx->const_buffers[i]);
1297 si_release_buffer_resources(&sctx->rw_buffers[i]);
1298 si_release_sampler_views(&sctx->samplers[i].views);
1299 si_release_descriptors(&sctx->samplers[i].states.desc);
1300 }
1301 si_release_descriptors(&sctx->vertex_buffers);
1302 }
1303
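/* Called at the start of each new command stream: re-add all buffer
 * relocations and re-emit the shader user data pointers, since the new CS
 * starts out empty. */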
1304 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
1305 {
1306 int i;
1307
1308 for (i = 0; i < SI_NUM_SHADERS; i++) {
1309 si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
1310 si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
1311 si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
1312 si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
1313 }
1314 si_vertex_buffers_begin_new_cs(sctx);
1315 }