src/gallium/drivers/radeonsi/si_state_streamout.c

   1 /*
   2  * Copyright 2013 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "si_build_pm4.h"
  26
  27 #include "util/u_memory.h"
  28 #include "util/u_suballoc.h"
  29
  30 static void si_set_streamout_enable(struct si_context *sctx, bool enable);
  31
  32 static inline void si_so_target_reference(struct si_streamout_target **dst,
  33                                           struct pipe_stream_output_target *src)
  34 {
  35         pipe_so_target_reference((struct pipe_stream_output_target**)dst, src);
  36 }
  37
  38 static struct pipe_stream_output_target *
  39 si_create_so_target(struct pipe_context *ctx,
  40                     struct pipe_resource *buffer,
  41                     unsigned buffer_offset,
  42                     unsigned buffer_size)
  43 {
  44         struct si_context *sctx = (struct si_context *)ctx;
  45         struct si_streamout_target *t;
  46         struct si_resource *buf = si_resource(buffer);
  47
  48         t = CALLOC_STRUCT(si_streamout_target);
  49         if (!t) {
  50                 return NULL;
  51         }
  52
  53         unsigned buf_filled_size_size = sctx->chip_class >= GFX10 ? 8 : 4;
  54         u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
  55                              &t->buf_filled_size_offset,
  56                              (struct pipe_resource**)&t->buf_filled_size);
  57         if (!t->buf_filled_size) {
  58                 FREE(t);
  59                 return NULL;
  60         }
  61
  62         t->b.reference.count = 1;
  63         t->b.context = ctx;
  64         pipe_resource_reference(&t->b.buffer, buffer);
  65         t->b.buffer_offset = buffer_offset;
  66         t->b.buffer_size = buffer_size;
  67
  68         util_range_add(&buf->valid_buffer_range, buffer_offset,
  69                        buffer_offset + buffer_size);
  70         return &t->b;
  71 }
  72
  73 static void si_so_target_destroy(struct pipe_context *ctx,
  74                                  struct pipe_stream_output_target *target)
  75 {
  76         struct si_streamout_target *t = (struct si_streamout_target*)target;
  77         pipe_resource_reference(&t->b.buffer, NULL);
  78         si_resource_reference(&t->buf_filled_size, NULL);
  79         FREE(t);
  80 }
  81
  82 void si_streamout_buffers_dirty(struct si_context *sctx)
  83 {
  84         if (!sctx->streamout.enabled_mask)
  85                 return;
  86
  87         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
  88         si_set_streamout_enable(sctx, true);
  89 }
  90
  91 static void si_set_streamout_targets(struct pipe_context *ctx,
  92                                      unsigned num_targets,
  93                                      struct pipe_stream_output_target **targets,
  94                                      const unsigned *offsets)
  95 {
  96         struct si_context *sctx = (struct si_context *)ctx;
  97         unsigned old_num_targets = sctx->streamout.num_targets;
  98         unsigned i;
  99
 100         /* We are going to unbind the buffers. Mark which caches need to be flushed. */
 101         if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
 102                 /* Since streamout uses vector writes which go through TC L2
 103                  * and most other clients can use TC L2 as well, we don't need
 104                  * to flush it.
 105                  *
 106                  * The only cases which requires flushing it is VGT DMA index
 107                  * fetching (on <= GFX7) and indirect draw data, which are rare
 108                  * cases. Thus, flag the TC L2 dirtiness in the resource and
 109                  * handle it at draw call time.
 110                  */
 111                 for (i = 0; i < sctx->streamout.num_targets; i++)
 112                         if (sctx->streamout.targets[i])
 113                                 si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
 114
 115                 /* Invalidate the scalar cache in case a streamout buffer is
 116                  * going to be used as a constant buffer.
 117                  *
 118                  * Invalidate vL1, because streamout bypasses it (done by
 119                  * setting GLC=1 in the store instruction), but vL1 in other
 120                  * CUs can contain outdated data of streamout buffers.
 121                  *
 122                  * VS_PARTIAL_FLUSH is required if the buffers are going to be
 123                  * used as an input immediately.
 124                  */
 125                 sctx->flags |= SI_CONTEXT_INV_SCACHE |
 126                                SI_CONTEXT_INV_VCACHE;
 127
 128                 /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
 129                 if (sctx->chip_class >= GFX10)
 130                         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
 131                 else
 132                         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
 133         }
 134
 135         /* All readers of the streamout targets need to be finished before we can
 136          * start writing to the targets.
 137          */
 138         if (num_targets)
 139                 sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 140                                  SI_CONTEXT_CS_PARTIAL_FLUSH;
 141
 142         /* Streamout buffers must be bound in 2 places:
 143          * 1) in VGT by setting the VGT_STRMOUT registers
 144          * 2) as shader resources
 145          */
 146
 147         /* Stop streamout. */
 148         if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
 149                 si_emit_streamout_end(sctx);
 150
 151         /* Set the new targets. */
 152         unsigned enabled_mask = 0, append_bitmask = 0;
 153         for (i = 0; i < num_targets; i++) {
 154                 si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
 155                 if (!targets[i])
 156                         continue;
 157
 158                 si_context_add_resource_size(sctx, targets[i]->buffer);
 159                 enabled_mask |= 1 << i;
 160
 161                 if (offsets[i] == ((unsigned)-1))
 162                         append_bitmask |= 1 << i;
 163         }
 164
 165         for (; i < sctx->streamout.num_targets; i++)
 166                 si_so_target_reference(&sctx->streamout.targets[i], NULL);
 167
 168         sctx->streamout.enabled_mask = enabled_mask;
 169         sctx->streamout.num_targets = num_targets;
 170         sctx->streamout.append_bitmask = append_bitmask;
 171
 172         /* Update dirty state bits. */
 173         if (num_targets) {
 174                 si_streamout_buffers_dirty(sctx);
 175         } else {
 176                 si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
 177                 si_set_streamout_enable(sctx, false);
 178         }
 179
 180         /* Set the shader resources.*/
 181         for (i = 0; i < num_targets; i++) {
 182                 if (targets[i]) {
 183                         struct pipe_shader_buffer sbuf;
 184                         sbuf.buffer = targets[i]->buffer;
 185
 186                         if (sctx->chip_class >= GFX10) {
 187                                 sbuf.buffer_offset = targets[i]->buffer_offset;
 188                                 sbuf.buffer_size = targets[i]->buffer_size;
 189                         } else {
 190                                 sbuf.buffer_offset = 0;
 191                                 sbuf.buffer_size = targets[i]->buffer_offset +
 192                                                    targets[i]->buffer_size;
 193                         }
 194
 195                         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
 196                         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
 197                 } else {
 198                         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
 199                 }
 200         }
 201         for (; i < old_num_targets; i++)
 202                 si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
 203 }
 204
 205 static void gfx10_emit_streamout_begin(struct si_context *sctx)
 206 {
 207         struct si_streamout_target **t = sctx->streamout.targets;
 208         struct radeon_cmdbuf *cs = sctx->gfx_cs;
 209         unsigned last_target = 0;
 210
 211         for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
 212                 if (t[i])
 213                         last_target = i;
 214         }
 215
 216         for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
 217                 if (!t[i])
 218                         continue;
 219
 220                 t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
 221
 222                 bool append = sctx->streamout.append_bitmask & (1 << i);
 223                 uint64_t va = 0;
 224
 225                 if (append) {
 226                         radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
 227                                                   t[i]->buf_filled_size,
 228                                                   RADEON_USAGE_READ,
 229                                                   RADEON_PRIO_SO_FILLED_SIZE);
 230
 231                         va = t[i]->buf_filled_size->gpu_address +
 232                              t[i]->buf_filled_size_offset;
 233                 }
 234
 235                 radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
 236                 radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
 237                                 S_411_DST_SEL(V_411_GDS) |
 238                                 S_411_CP_SYNC(i == last_target));
 239                 radeon_emit(cs, va);
 240                 radeon_emit(cs, va >> 32);
 241                 radeon_emit(cs, 4 * i); /* destination in GDS */
 242                 radeon_emit(cs, 0);
 243                 radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
 244                                 S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
 245         }
 246
 247         sctx->streamout.begin_emitted = true;
 248 }
 249
 250 static void gfx10_emit_streamout_end(struct si_context *sctx)
 251 {
 252         struct si_streamout_target **t = sctx->streamout.targets;
 253
 254         for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
 255                 if (!t[i])
 256                         continue;
 257
 258                 uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
 259
 260                 si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0,
 261                                   EOP_DST_SEL_TC_L2,
 262                                   EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
 263                                   EOP_DATA_SEL_GDS,
 264                                   t[i]->buf_filled_size, va,
 265                                   EOP_DATA_GDS(i, 1), 0);
 266
 267                 t[i]->buf_filled_size_valid = true;
 268         }
 269
 270         sctx->streamout.begin_emitted = false;
 271 }
 272
 273 static void si_flush_vgt_streamout(struct si_context *sctx)
 274 {
 275         struct radeon_cmdbuf *cs = sctx->gfx_cs;
 276         unsigned reg_strmout_cntl;
 277
 278         /* The register is at different places on different ASICs. */
 279         if (sctx->chip_class >= GFX7) {
 280                 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
 281                 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
 282         } else {
 283                 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
 284                 radeon_set_config_reg(cs, reg_strmout_cntl, 0);
 285         }
 286
 287         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 288         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
 289
 290         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 291         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
 292         radeon_emit(cs, reg_strmout_cntl >> 2);  /* register */
 293         radeon_emit(cs, 0);
 294         radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
 295         radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
 296         radeon_emit(cs, 4); /* poll interval */
 297 }
 298
 299 static void si_emit_streamout_begin(struct si_context *sctx)
 300 {
 301         struct radeon_cmdbuf *cs = sctx->gfx_cs;
 302         struct si_streamout_target **t = sctx->streamout.targets;
 303         uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
 304         unsigned i;
 305
 306         si_flush_vgt_streamout(sctx);
 307
 308         for (i = 0; i < sctx->streamout.num_targets; i++) {
 309                 if (!t[i])
 310                         continue;
 311
 312                 t[i]->stride_in_dw = stride_in_dw[i];
 313
 314                 /* AMD GCN binds streamout buffers as shader resources.
 315                  * VGT only counts primitives and tells the shader
 316                  * through SGPRs what to do. */
 317                 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
 318                 radeon_emit(cs, (t[i]->b.buffer_offset +
 319                                  t[i]->b.buffer_size) >> 2);    /* BUFFER_SIZE (in DW) */
 320                 radeon_emit(cs, stride_in_dw[i]);               /* VTX_STRIDE (in DW) */
 321
 322                 if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
 323                         uint64_t va = t[i]->buf_filled_size->gpu_address +
 324                                       t[i]->buf_filled_size_offset;
 325
 326                         /* Append. */
 327                         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
 328                         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
 329                                     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
 330                         radeon_emit(cs, 0); /* unused */
 331                         radeon_emit(cs, 0); /* unused */
 332                         radeon_emit(cs, va); /* src address lo */
 333                         radeon_emit(cs, va >> 32); /* src address hi */
 334
 335                         radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
 336                                                   t[i]->buf_filled_size,
 337                                                   RADEON_USAGE_READ,
 338                                                   RADEON_PRIO_SO_FILLED_SIZE);
 339                 } else {
 340                         /* Start from the beginning. */
 341                         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
 342                         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
 343                                     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
 344                         radeon_emit(cs, 0); /* unused */
 345                         radeon_emit(cs, 0); /* unused */
 346                         radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
 347                         radeon_emit(cs, 0); /* unused */
 348                 }
 349         }
 350
 351         sctx->streamout.begin_emitted = true;
 352 }
 353
 354 void si_emit_streamout_end(struct si_context *sctx)
 355 {
 356         if (sctx->chip_class >= GFX10) {
 357                 gfx10_emit_streamout_end(sctx);
 358                 return;
 359         }
 360
 361         struct radeon_cmdbuf *cs = sctx->gfx_cs;
 362         struct si_streamout_target **t = sctx->streamout.targets;
 363         unsigned i;
 364         uint64_t va;
 365
 366         si_flush_vgt_streamout(sctx);
 367
 368         for (i = 0; i < sctx->streamout.num_targets; i++) {
 369                 if (!t[i])
 370                         continue;
 371
 372                 va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
 373                 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
 374                 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
 375                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
 376                             STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
 377                 radeon_emit(cs, va);     /* dst address lo */
 378                 radeon_emit(cs, va >> 32); /* dst address hi */
 379                 radeon_emit(cs, 0); /* unused */
 380                 radeon_emit(cs, 0); /* unused */
 381
 382                 radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
 383                                           t[i]->buf_filled_size,
 384                                           RADEON_USAGE_WRITE,
 385                                           RADEON_PRIO_SO_FILLED_SIZE);
 386
 387                 /* Zero the buffer size. The counters (primitives generated,
 388                  * primitives emitted) may be enabled even if there is not
 389                  * buffer bound. This ensures that the primitives-emitted query
 390                  * won't increment. */
 391                 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
 392                 sctx->context_roll = true;
 393
 394                 t[i]->buf_filled_size_valid = true;
 395         }
 396
 397         sctx->streamout.begin_emitted = false;
 398 }
 399
 400 /* STREAMOUT CONFIG DERIVED STATE
 401  *
 402  * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 403  * The buffer mask is an independent state, so no writes occur if there
 404  * are no buffers bound.
 405  */
 406
 407 static void si_emit_streamout_enable(struct si_context *sctx)
 408 {
 409         assert(sctx->chip_class < GFX10);
 410
 411         radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
 412         radeon_emit(sctx->gfx_cs,
 413                     S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
 414                     S_028B94_RAST_STREAM(0) |
 415                     S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
 416                     S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
 417                     S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
 418         radeon_emit(sctx->gfx_cs,
 419                     sctx->streamout.hw_enabled_mask &
 420                     sctx->streamout.enabled_stream_buffers_mask);
 421 }
 422
 423 static void si_set_streamout_enable(struct si_context *sctx, bool enable)
 424 {
 425         bool old_strmout_en = si_get_strmout_en(sctx);
 426         unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
 427
 428         sctx->streamout.streamout_enabled = enable;
 429
 430         sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
 431                                           (sctx->streamout.enabled_mask << 4) |
 432                                           (sctx->streamout.enabled_mask << 8) |
 433                                           (sctx->streamout.enabled_mask << 12);
 434
 435         if (sctx->chip_class < GFX10 &&
 436             ((old_strmout_en != si_get_strmout_en(sctx)) ||
 437              (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
 438                 si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 439 }
 440
 441 void si_update_prims_generated_query_state(struct si_context *sctx,
 442                                            unsigned type, int diff)
 443 {
 444         if (sctx->chip_class < GFX10 &&
 445             type == PIPE_QUERY_PRIMITIVES_GENERATED) {
 446                 bool old_strmout_en = si_get_strmout_en(sctx);
 447
 448                 sctx->streamout.num_prims_gen_queries += diff;
 449                 assert(sctx->streamout.num_prims_gen_queries >= 0);
 450
 451                 sctx->streamout.prims_gen_query_enabled =
 452                         sctx->streamout.num_prims_gen_queries != 0;
 453
 454                 if (old_strmout_en != si_get_strmout_en(sctx))
 455                         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 456         }
 457 }
 458
 459 void si_init_streamout_functions(struct si_context *sctx)
 460 {
 461         sctx->b.create_stream_output_target = si_create_so_target;
 462         sctx->b.stream_output_target_destroy = si_so_target_destroy;
 463         sctx->b.set_stream_output_targets = si_set_streamout_targets;
 464
 465         if (sctx->chip_class >= GFX10) {
 466                 sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
 467         } else {
 468                 sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
 469                 sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
 470         }
 471 }