/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "si_build_pm4.h"

#include "util/u_memory.h"
#include "util/u_suballoc.h"
static void si_set_streamout_enable(struct si_context *sctx, bool enable);
static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}
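
/* Create a streamout target. Besides the gallium state, each target carries
 * a small BUFFER_FILLED_SIZE counter sub-allocated from zeroed memory
 * (8 bytes for NGG streamout, 4 bytes for the legacy VGT path). */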
static struct pipe_stream_output_target *
si_create_so_target(struct pipe_context *ctx,
                    struct pipe_resource *buffer,
                    unsigned buffer_offset,
                    unsigned buffer_size)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
   u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
                        &t->buf_filled_size_offset,
                        (struct pipe_resource **)&t->buf_filled_size);
   if (!t->buf_filled_size) {
      FREE(t);
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
   return &t->b;
}
static void si_so_target_destroy(struct pipe_context *ctx,
                                 struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}
void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}
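
/* Bind a new set of streamout targets: flush/invalidate caches for the old
 * buffers, stop any streamout in progress, reference the new targets, and
 * expose them to the shaders as RW buffers. */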
static void si_set_streamout_targets(struct pipe_context *ctx,
                                     unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
       *
       * The only cases which require flushing it are VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare
       * cases. Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < sctx->streamout.num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->flags |= SI_CONTEXT_INV_SCACHE |
                     SI_CONTEXT_INV_VCACHE;

      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
      if (sctx->screen->use_ngg_streamout) {
         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

         /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
          *
          * Also, the next streamout operation will overwrite GDS,
          * so we need to make sure that it's idle.
          */
         wait_now = true;
      } else {
         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
      }
   }

   /* All readers of the streamout targets need to be finished before we can
    * start writing to the targets.
    */
   if (num_targets) {
      if (sctx->screen->use_ngg_streamout)
         si_allocate_gds(sctx);

      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                     SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */

   /* Stop streamout. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
      si_emit_streamout_end(sctx);

   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
      if (!targets[i])
         continue;

      si_context_add_resource_size(sctx, targets[i]->buffer);
      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;
   }

   for (; i < sctx->streamout.num_targets; i++)
      si_so_target_reference(&sctx->streamout.targets[i], NULL);

   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.num_targets = num_targets;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }

   /* Set the shader resources. */
   for (i = 0; i < num_targets; i++) {
      if (targets[i]) {
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = targets[i]->buffer;

         if (sctx->screen->use_ngg_streamout) {
            sbuf.buffer_offset = targets[i]->buffer_offset;
            sbuf.buffer_size = targets[i]->buffer_size;
         } else {
            sbuf.buffer_offset = 0;
            sbuf.buffer_size = targets[i]->buffer_offset +
                               targets[i]->buffer_size;
         }

         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
      } else {
         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
      }
   }
   for (; i < old_num_targets; i++)
      si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

   if (wait_now)
      sctx->emit_cache_flush(sctx);
}
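
/* NGG streamout (GFX10): the buffer-filled sizes live in GDS. Upload the
 * saved counters (or zeros when not appending) into GDS with DMA_DATA
 * packets before streamout starts. */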
static void gfx10_emit_streamout_begin(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned last_target = 0;

   /* Only the packet for the last enabled target syncs (CP_SYNC) and
    * confirms the write. */
   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (t[i])
         last_target = i;
   }

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

      bool append = sctx->streamout.append_bitmask & (1 << i);
      uint64_t va = 0;

      if (append) {
         radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                   t[i]->buf_filled_size,
                                   RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);

         va = t[i]->buf_filled_size->gpu_address +
              t[i]->buf_filled_size_offset;
      }

      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                      S_411_DST_SEL(V_411_GDS) |
                      S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
                      S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   sctx->streamout.begin_emitted = true;
}
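
/* NGG streamout: copy the final counters from GDS back into each target's
 * BUFFER_FILLED_SIZE allocation with a PS_DONE release_mem event. */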
static void gfx10_emit_streamout_end(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0,
                        EOP_DST_SEL_TC_L2,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
                        EOP_DATA_SEL_GDS,
                        t[i]->buf_filled_size, va,
                        EOP_DATA_GDS(i, 1), 0);

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
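
/* Make the CP wait until VGT has finished updating the streamout offsets
 * (CP_STRMOUT_CNTL.OFFSET_UPDATE_DONE) before the VGT_STRMOUT registers
 * are changed. */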
static void si_flush_vgt_streamout(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned reg_strmout_cntl;

   /* The register is at different places on different ASICs. */
   if (sctx->chip_class >= GFX7) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
   } else {
      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
      radeon_set_config_reg(cs, reg_strmout_cntl, 0);
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
   radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
   radeon_emit(cs, 0);
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
   radeon_emit(cs, 4); /* poll interval */
}
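
/* Legacy (non-NGG) streamout begin: program the VGT_STRMOUT registers for
 * each bound target and either load the saved buffer offset from memory
 * (append) or start from offset 0. */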
static void si_emit_streamout_begin(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
   unsigned i;

   si_flush_vgt_streamout(sctx);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = stride_in_dw[i];

      /* AMD GCN binds streamout buffers as shader resources.
       * VGT only counts primitives and tells the shader
       * through SGPRs what to do. */
      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
      radeon_emit(cs, (t[i]->b.buffer_offset +
                       t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
      radeon_emit(cs, stride_in_dw[i]);            /* VTX_STRIDE (in DW) */

      if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
         uint64_t va = t[i]->buf_filled_size->gpu_address +
                       t[i]->buf_filled_size_offset;

         /* Append. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                         STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, va); /* src address lo */
         radeon_emit(cs, va >> 32); /* src address hi */

         radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                   t[i]->buf_filled_size,
                                   RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);
      } else {
         /* Start from the beginning. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                         STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
         radeon_emit(cs, 0); /* unused */
      }
   }

   sctx->streamout.begin_emitted = true;
}
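
/* Stop streamout. On NGG this defers to gfx10_emit_streamout_end; on the
 * legacy path it stores BUFFER_FILLED_SIZE to memory and zeroes
 * VGT_STRMOUT_BUFFER_SIZE_0 so the primitives-emitted query stops
 * incrementing. */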
void si_emit_streamout_end(struct si_context *sctx)
{
   if (sctx->screen->use_ngg_streamout) {
      gfx10_emit_streamout_end(sctx);
      return;
   }

   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   unsigned i;
   uint64_t va;

   si_flush_vgt_streamout(sctx);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
      radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                      STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                      STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
      radeon_emit(cs, va);       /* dst address lo */
      radeon_emit(cs, va >> 32); /* dst address hi */
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, 0); /* unused */

      radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                t[i]->buf_filled_size,
                                RADEON_USAGE_WRITE,
                                RADEON_PRIO_SO_FILLED_SIZE);

      /* Zero the buffer size. The counters (primitives generated,
       * primitives emitted) may be enabled even if there is no
       * buffer bound. This ensures that the primitives-emitted query
       * won't increment. */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
      sctx->context_roll = true;

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */

static void si_emit_streamout_enable(struct si_context *sctx)
{
   assert(!sctx->screen->use_ngg_streamout);

   radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
   radeon_emit(sctx->gfx_cs,
               S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
               S_028B94_RAST_STREAM(0) |
               S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
   radeon_emit(sctx->gfx_cs,
               sctx->streamout.hw_enabled_mask &
               sctx->streamout.enabled_stream_buffers_mask);
}
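
/* Enable/disable streamout. hw_enabled_mask replicates the 4-bit buffer
 * enable mask across the 4 vertex streams; si_emit_streamout_enable ANDs it
 * with the mask of buffers actually written by the shaders. */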
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
   bool old_strmout_en = si_get_strmout_en(sctx);
   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

   sctx->streamout.streamout_enabled = enable;

   sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
                                     (sctx->streamout.enabled_mask << 4) |
                                     (sctx->streamout.enabled_mask << 8) |
                                     (sctx->streamout.enabled_mask << 12);

   if (!sctx->screen->use_ngg_streamout &&
       ((old_strmout_en != si_get_strmout_en(sctx)) ||
        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}
void si_update_prims_generated_query_state(struct si_context *sctx,
                                           unsigned type, int diff)
{
   if (!sctx->screen->use_ngg_streamout &&
       type == PIPE_QUERY_PRIMITIVES_GENERATED) {
      bool old_strmout_en = si_get_strmout_en(sctx);

      sctx->streamout.num_prims_gen_queries += diff;
      assert(sctx->streamout.num_prims_gen_queries >= 0);

      sctx->streamout.prims_gen_query_enabled =
         sctx->streamout.num_prims_gen_queries != 0;

      if (old_strmout_en != si_get_strmout_en(sctx))
         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

      if (si_update_ngg(sctx)) {
         si_shader_change_notify(sctx);
         sctx->do_update_shaders = true;
      }
   }
}
void si_init_streamout_functions(struct si_context *sctx)
{
   sctx->b.create_stream_output_target = si_create_so_target;
   sctx->b.stream_output_target_destroy = si_so_target_destroy;
   sctx->b.set_stream_output_targets = si_set_streamout_targets;

   if (sctx->screen->use_ngg_streamout) {
      sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
   } else {
      sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
   }
}