/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "si_build_pm4.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
static void si_set_streamout_enable(struct si_context *sctx, bool enable);
static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}
static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
                                                             struct pipe_resource *buffer,
                                                             unsigned buffer_offset,
                                                             unsigned buffer_size)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
   u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
                        &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size);
   if (!t->buf_filled_size) {
      FREE(t);
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
   return &t->b;
}
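/* Note (added commentary, hedged): the filled-size scratch allocation above
 * differs by path. Legacy streamout stores a single 32-bit BUFFER_FILLED_SIZE
 * that the CP writes with STRMOUT_BUFFER_UPDATE, hence 4 bytes; the NGG path
 * copies the counter out of GDS with an end-of-pipe release_mem event and,
 * judging from this file, reserves 8 bytes for that write. Both come from the
 * zeroed-memory suballocator, so a freshly created target reads back as empty.
 */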
static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}
void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}
static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
       *
       * The only cases which require flushing it are VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare
       * cases. Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < sctx->streamout.num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;

      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
      if (sctx->screen->use_ngg_streamout) {
         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

         /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
          *
          * Also, the next streamout operation will overwrite GDS,
          * so we need to make sure that it's idle.
          */
         wait_now = true;
      } else {
         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
      }
   }

   /* All readers of the streamout targets need to be finished before we can
    * start writing to the targets.
    */
   if (num_targets) {
      if (sctx->screen->use_ngg_streamout)
         si_allocate_gds(sctx);

      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */

   /* Stop streamout. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
      si_emit_streamout_end(sctx);

   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
      if (!targets[i])
         continue;

      si_context_add_resource_size(sctx, targets[i]->buffer);
      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;
   }

   for (; i < sctx->streamout.num_targets; i++)
      si_so_target_reference(&sctx->streamout.targets[i], NULL);

   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.num_targets = num_targets;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }

   /* Set the shader resources. */
   for (i = 0; i < num_targets; i++) {
      if (targets[i]) {
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = targets[i]->buffer;

         if (sctx->screen->use_ngg_streamout) {
            sbuf.buffer_offset = targets[i]->buffer_offset;
            sbuf.buffer_size = targets[i]->buffer_size;
         } else {
            sbuf.buffer_offset = 0;
            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
         }

         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
      } else {
         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
      }
   }
   for (; i < old_num_targets; i++)
      si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

   if (wait_now)
      sctx->emit_cache_flush(sctx);
}
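/* Usage sketch (added commentary; a hypothetical gallium caller, not part of
 * this driver): a state tracker drives the hooks above through pipe_context,
 * with "pipe" a pipe_context and "buf" a previously created pipe_resource:
 *
 *    struct pipe_stream_output_target *tgt =
 *       pipe->create_stream_output_target(pipe, buf, 0, buf->width0);
 *    unsigned offset = (unsigned)-1;  // -1 requests appending to the
 *                                     // previously saved filled size
 *    pipe->set_stream_output_targets(pipe, 1, &tgt, &offset);
 *    // ... draw with streamout active ...
 *    pipe->set_stream_output_targets(pipe, 0, NULL, NULL);  // unbind
 *    pipe->stream_output_target_destroy(pipe, tgt);
 */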
static void gfx10_emit_streamout_begin(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned last_target = 0;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (t[i])
         last_target = i;
   }

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

      bool append = sctx->streamout.append_bitmask & (1 << i);
      uint64_t va = 0;

      if (append) {
         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);

         va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      }

      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   sctx->streamout.begin_emitted = true;
}
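/* Note (added commentary, hedged): in this NGG path each target's running
 * filled size lives in GDS at dword offset i (byte offset 4 * i above). The
 * DMA_DATA packet seeds that dword either from the saved filled size in
 * memory (append) or with the literal 0 (start from the beginning). Write
 * confirmation and CP_SYNC are only requested on the last copy, so the CP
 * stalls once for the whole group rather than once per buffer.
 */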
static void gfx10_emit_streamout_end(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
                        t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
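/* Note (added commentary, hedged): the PS_DONE release_mem above asks the CP,
 * once pixel work has drained, to copy one dword out of GDS into the
 * buf_filled_size buffer through TC L2; EOP_DATA_GDS(i, 1) appears to select
 * GDS dword i with a count of 1, matching the per-target layout used by the
 * begin path. Setting buf_filled_size_valid afterwards is what allows a later
 * begin to append from the saved value.
 */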
static void si_flush_vgt_streamout(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned reg_strmout_cntl;

   /* The register is at different places on different ASICs. */
   if (sctx->chip_class >= GFX7) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
   } else {
      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
      radeon_set_config_reg(cs, reg_strmout_cntl, 0);
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
   radeon_emit(cs, reg_strmout_cntl >> 2);          /* register */
   radeon_emit(cs, 0);                              /* unused */
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
   radeon_emit(cs, 4);                              /* poll interval */
}
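/* Note (added commentary, hedged): this reads as a classic CP handshake.
 * Clearing CP_STRMOUT_CNTL and raising the SO_VGTSTREAMOUT_FLUSH event makes
 * VGT set OFFSET_UPDATE_DONE once its streamout bookkeeping is written back;
 * WAIT_REG_MEM then polls that bit so packets emitted afterwards observe
 * up-to-date buffer offsets.
 */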
static void si_emit_streamout_begin(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
   unsigned i;

   si_flush_vgt_streamout(sctx);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = stride_in_dw[i];

      /* AMD GCN binds streamout buffers as shader resources.
       * VGT only counts primitives and tells the shader
       * through SGPRs what to do. */
      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
      radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
      radeon_emit(cs, stride_in_dw[i]);                                    /* VTX_STRIDE (in DW) */

      if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
         uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

         /* Append. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, va);       /* src address lo */
         radeon_emit(cs, va >> 32); /* src address hi */

         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);
      } else {
         /* Start from the beginning. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
         radeon_emit(cs, 0);                          /* unused */
         radeon_emit(cs, 0);                          /* unused */
         radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
         radeon_emit(cs, 0);                          /* unused */
      }
   }

   sctx->streamout.begin_emitted = true;
}
void si_emit_streamout_end(struct si_context *sctx)
{
   if (sctx->screen->use_ngg_streamout) {
      gfx10_emit_streamout_end(sctx);
      return;
   }

   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   unsigned i;
   uint64_t va;

   si_flush_vgt_streamout(sctx);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
      radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                         STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
      radeon_emit(cs, va);       /* dst address lo */
      radeon_emit(cs, va >> 32); /* dst address hi */
      radeon_emit(cs, 0);        /* unused */
      radeon_emit(cs, 0);        /* unused */

      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE,
                                RADEON_PRIO_SO_FILLED_SIZE);

      /* Zero the buffer size. The counters (primitives generated,
       * primitives emitted) may be enabled even if there is no
       * buffer bound. This ensures that the primitives-emitted query
       * won't increment. */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
      sctx->context_roll = true;

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
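/* Note (added commentary, hedged): on this legacy path the filled size saved
 * by STRMOUT_BUFFER_UPDATE above is exactly what the FROM_MEM branch of
 * si_emit_streamout_begin reloads when append_bitmask requests appending,
 * which is why buf_filled_size_valid is only set once this store has been
 * emitted.
 */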
/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */
static void si_emit_streamout_enable(struct si_context *sctx)
{
   assert(!sctx->screen->use_ngg_streamout);

   radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
   radeon_emit(sctx->gfx_cs,
               S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
                  S_028B94_RAST_STREAM(0) |
                  S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
                  S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
                  S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
   radeon_emit(sctx->gfx_cs,
               sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
}
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
   bool old_strmout_en = si_get_strmout_en(sctx);
   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

   sctx->streamout.streamout_enabled = enable;

   sctx->streamout.hw_enabled_mask =
      sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
      (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
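   /* Worked example (added commentary): the buffer-config register written in
    * si_emit_streamout_enable takes one 4-bit buffer mask per stream, so the
    * bound-buffer mask is replicated into all four nibbles; e.g.
    * enabled_mask = 0x3 (buffers 0-1 bound) yields
    * hw_enabled_mask = 0x3333. */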
   if (!sctx->screen->use_ngg_streamout &&
       ((old_strmout_en != si_get_strmout_en(sctx)) ||
        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}
void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
{
   if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
      bool old_strmout_en = si_get_strmout_en(sctx);

      sctx->streamout.num_prims_gen_queries += diff;
      assert(sctx->streamout.num_prims_gen_queries >= 0);

      sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;

      if (old_strmout_en != si_get_strmout_en(sctx))
         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

      if (si_update_ngg(sctx)) {
         si_shader_change_notify(sctx);
         sctx->do_update_shaders = true;
      }
   }
}
void si_init_streamout_functions(struct si_context *sctx)
{
   sctx->b.create_stream_output_target = si_create_so_target;
   sctx->b.stream_output_target_destroy = si_so_target_destroy;
   sctx->b.set_stream_output_targets = si_set_streamout_targets;

   if (sctx->screen->use_ngg_streamout) {
      sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
   } else {
      sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
   }
}