src/gallium/drivers/radeonsi/si_state_streamout.c

   1 /*
   2  * Copyright 2013 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "si_build_pm4.h"
  26
  27 #include "util/u_memory.h"
  28 #include "util/u_suballoc.h"
  29
  30 static void si_set_streamout_enable(struct si_context *sctx, bool enable);
  31
  32 static inline void si_so_target_reference(struct si_streamout_target **dst,
  33                                           struct pipe_stream_output_target *src)
  34 {
  35         pipe_so_target_reference((struct pipe_stream_output_target**)dst, src);
  36 }
  37
  38 static struct pipe_stream_output_target *
  39 si_create_so_target(struct pipe_context *ctx,
  40                     struct pipe_resource *buffer,
  41                     unsigned buffer_offset,
  42                     unsigned buffer_size)
  43 {
  44         struct si_context *sctx = (struct si_context *)ctx;
  45         struct si_streamout_target *t;
  46         struct si_resource *buf = si_resource(buffer);
  47
  48         t = CALLOC_STRUCT(si_streamout_target);
  49         if (!t) {
  50                 return NULL;
  51         }
  52
  53         u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
  54                              &t->buf_filled_size_offset,
  55                              (struct pipe_resource**)&t->buf_filled_size);
  56         if (!t->buf_filled_size) {
  57                 FREE(t);
  58                 return NULL;
  59         }
  60
  61         t->b.reference.count = 1;
  62         t->b.context = ctx;
  63         pipe_resource_reference(&t->b.buffer, buffer);
  64         t->b.buffer_offset = buffer_offset;
  65         t->b.buffer_size = buffer_size;
  66
  67         util_range_add(&buf->valid_buffer_range, buffer_offset,
  68                        buffer_offset + buffer_size);
  69         return &t->b;
  70 }
  71
  72 static void si_so_target_destroy(struct pipe_context *ctx,
  73                                  struct pipe_stream_output_target *target)
  74 {
  75         struct si_streamout_target *t = (struct si_streamout_target*)target;
  76         pipe_resource_reference(&t->b.buffer, NULL);
  77         si_resource_reference(&t->buf_filled_size, NULL);
  78         FREE(t);
  79 }
  80
  81 void si_streamout_buffers_dirty(struct si_context *sctx)
  82 {
  83         if (!sctx->streamout.enabled_mask)
  84                 return;
  85
  86         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
  87         si_set_streamout_enable(sctx, true);
  88 }
  89
  90 static void si_set_streamout_targets(struct pipe_context *ctx,
  91                                      unsigned num_targets,
  92                                      struct pipe_stream_output_target **targets,
  93                                      const unsigned *offsets)
  94 {
  95         struct si_context *sctx = (struct si_context *)ctx;
  96         unsigned old_num_targets = sctx->streamout.num_targets;
  97         unsigned i;
  98
  99         /* We are going to unbind the buffers. Mark which caches need to be flushed. */
 100         if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
 101                 /* Since streamout uses vector writes which go through TC L2
 102                  * and most other clients can use TC L2 as well, we don't need
 103                  * to flush it.
 104                  *
 105                  * The only cases which requires flushing it is VGT DMA index
 106                  * fetching (on <= CIK) and indirect draw data, which are rare
 107                  * cases. Thus, flag the TC L2 dirtiness in the resource and
 108                  * handle it at draw call time.
 109                  */
 110                 for (i = 0; i < sctx->streamout.num_targets; i++)
 111                         if (sctx->streamout.targets[i])
 112                                 si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
 113
 114                 /* Invalidate the scalar cache in case a streamout buffer is
 115                  * going to be used as a constant buffer.
 116                  *
 117                  * Invalidate vL1, because streamout bypasses it (done by
 118                  * setting GLC=1 in the store instruction), but vL1 in other
 119                  * CUs can contain outdated data of streamout buffers.
 120                  *
 121                  * VS_PARTIAL_FLUSH is required if the buffers are going to be
 122                  * used as an input immediately.
 123                  */
 124                 sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
 125                                  SI_CONTEXT_INV_VMEM_L1 |
 126                                  SI_CONTEXT_VS_PARTIAL_FLUSH;
 127         }
 128
 129         /* All readers of the streamout targets need to be finished before we can
 130          * start writing to the targets.
 131          */
 132         if (num_targets)
 133                 sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 134                                  SI_CONTEXT_CS_PARTIAL_FLUSH;
 135
 136         /* Streamout buffers must be bound in 2 places:
 137          * 1) in VGT by setting the VGT_STRMOUT registers
 138          * 2) as shader resources
 139          */
 140
 141         /* Stop streamout. */
 142         if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
 143                 si_emit_streamout_end(sctx);
 144
 145         /* Set the new targets. */
 146         unsigned enabled_mask = 0, append_bitmask = 0;
 147         for (i = 0; i < num_targets; i++) {
 148                 si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
 149                 if (!targets[i])
 150                         continue;
 151
 152                 si_context_add_resource_size(sctx, targets[i]->buffer);
 153                 enabled_mask |= 1 << i;
 154
 155                 if (offsets[i] == ((unsigned)-1))
 156                         append_bitmask |= 1 << i;
 157         }
 158
 159         for (; i < sctx->streamout.num_targets; i++)
 160                 si_so_target_reference(&sctx->streamout.targets[i], NULL);
 161
 162         sctx->streamout.enabled_mask = enabled_mask;
 163         sctx->streamout.num_targets = num_targets;
 164         sctx->streamout.append_bitmask = append_bitmask;
 165
 166         /* Update dirty state bits. */
 167         if (num_targets) {
 168                 si_streamout_buffers_dirty(sctx);
 169         } else {
 170                 si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
 171                 si_set_streamout_enable(sctx, false);
 172         }
 173
 174         /* Set the shader resources.*/
 175         for (i = 0; i < num_targets; i++) {
 176                 if (targets[i]) {
 177                         struct pipe_shader_buffer sbuf;
 178                         sbuf.buffer = targets[i]->buffer;
 179                         sbuf.buffer_offset = 0;
 180                         sbuf.buffer_size = targets[i]->buffer_offset +
 181                                            targets[i]->buffer_size;
 182                         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
 183                         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
 184                 } else {
 185                         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
 186                 }
 187         }
 188         for (; i < old_num_targets; i++)
 189                 si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
 190 }
 191
 192 static void si_flush_vgt_streamout(struct si_context *sctx)
 193 {
 194         struct radeon_cmdbuf *cs = sctx->gfx_cs;
 195         unsigned reg_strmout_cntl;
 196
 197         /* The register is at different places on different ASICs. */
 198         if (sctx->chip_class >= CIK) {
 199                 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
 200                 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
 201         } else {
 202                 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
 203                 radeon_set_config_reg(cs, reg_strmout_cntl, 0);
 204         }
 205
 206         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 207         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
 208
 209         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 210         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
 211         radeon_emit(cs, reg_strmout_cntl >> 2);  /* register */
 212         radeon_emit(cs, 0);
 213         radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
 214         radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
 215         radeon_emit(cs, 4); /* poll interval */
 216 }
 217
 218 static void si_emit_streamout_begin(struct si_context *sctx)
 219 {
 220         struct radeon_cmdbuf *cs = sctx->gfx_cs;
 221         struct si_streamout_target **t = sctx->streamout.targets;
 222         uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
 223         unsigned i;
 224
 225         si_flush_vgt_streamout(sctx);
 226
 227         for (i = 0; i < sctx->streamout.num_targets; i++) {
 228                 if (!t[i])
 229                         continue;
 230
 231                 t[i]->stride_in_dw = stride_in_dw[i];
 232
 233                 /* SI binds streamout buffers as shader resources.
 234                  * VGT only counts primitives and tells the shader
 235                  * through SGPRs what to do. */
 236                 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
 237                 radeon_emit(cs, (t[i]->b.buffer_offset +
 238                                  t[i]->b.buffer_size) >> 2);    /* BUFFER_SIZE (in DW) */
 239                 radeon_emit(cs, stride_in_dw[i]);               /* VTX_STRIDE (in DW) */
 240
 241                 if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
 242                         uint64_t va = t[i]->buf_filled_size->gpu_address +
 243                                       t[i]->buf_filled_size_offset;
 244
 245                         /* Append. */
 246                         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
 247                         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
 248                                     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
 249                         radeon_emit(cs, 0); /* unused */
 250                         radeon_emit(cs, 0); /* unused */
 251                         radeon_emit(cs, va); /* src address lo */
 252                         radeon_emit(cs, va >> 32); /* src address hi */
 253
 254                         radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
 255                                                   t[i]->buf_filled_size,
 256                                                   RADEON_USAGE_READ,
 257                                                   RADEON_PRIO_SO_FILLED_SIZE);
 258                 } else {
 259                         /* Start from the beginning. */
 260                         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
 261                         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
 262                                     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
 263                         radeon_emit(cs, 0); /* unused */
 264                         radeon_emit(cs, 0); /* unused */
 265                         radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
 266                         radeon_emit(cs, 0); /* unused */
 267                 }
 268         }
 269
 270         sctx->streamout.begin_emitted = true;
 271 }
 272
 273 void si_emit_streamout_end(struct si_context *sctx)
 274 {
 275         struct radeon_cmdbuf *cs = sctx->gfx_cs;
 276         struct si_streamout_target **t = sctx->streamout.targets;
 277         unsigned i;
 278         uint64_t va;
 279
 280         si_flush_vgt_streamout(sctx);
 281
 282         for (i = 0; i < sctx->streamout.num_targets; i++) {
 283                 if (!t[i])
 284                         continue;
 285
 286                 va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
 287                 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
 288                 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
 289                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
 290                             STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
 291                 radeon_emit(cs, va);     /* dst address lo */
 292                 radeon_emit(cs, va >> 32); /* dst address hi */
 293                 radeon_emit(cs, 0); /* unused */
 294                 radeon_emit(cs, 0); /* unused */
 295
 296                 radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
 297                                           t[i]->buf_filled_size,
 298                                           RADEON_USAGE_WRITE,
 299                                           RADEON_PRIO_SO_FILLED_SIZE);
 300
 301                 /* Zero the buffer size. The counters (primitives generated,
 302                  * primitives emitted) may be enabled even if there is not
 303                  * buffer bound. This ensures that the primitives-emitted query
 304                  * won't increment. */
 305                 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
 306                 sctx->context_roll = true;
 307
 308                 t[i]->buf_filled_size_valid = true;
 309         }
 310
 311         sctx->streamout.begin_emitted = false;
 312 }
 313
 314 /* STREAMOUT CONFIG DERIVED STATE
 315  *
 316  * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 317  * The buffer mask is an independent state, so no writes occur if there
 318  * are no buffers bound.
 319  */
 320
 321 static void si_emit_streamout_enable(struct si_context *sctx)
 322 {
 323         radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
 324         radeon_emit(sctx->gfx_cs,
 325                     S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
 326                     S_028B94_RAST_STREAM(0) |
 327                     S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
 328                     S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
 329                     S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
 330         radeon_emit(sctx->gfx_cs,
 331                     sctx->streamout.hw_enabled_mask &
 332                     sctx->streamout.enabled_stream_buffers_mask);
 333 }
 334
 335 static void si_set_streamout_enable(struct si_context *sctx, bool enable)
 336 {
 337         bool old_strmout_en = si_get_strmout_en(sctx);
 338         unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
 339
 340         sctx->streamout.streamout_enabled = enable;
 341
 342         sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
 343                                           (sctx->streamout.enabled_mask << 4) |
 344                                           (sctx->streamout.enabled_mask << 8) |
 345                                           (sctx->streamout.enabled_mask << 12);
 346
 347         if ((old_strmout_en != si_get_strmout_en(sctx)) ||
 348             (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
 349                 si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 350 }
 351
 352 void si_update_prims_generated_query_state(struct si_context *sctx,
 353                                            unsigned type, int diff)
 354 {
 355         if (type == PIPE_QUERY_PRIMITIVES_GENERATED) {
 356                 bool old_strmout_en = si_get_strmout_en(sctx);
 357
 358                 sctx->streamout.num_prims_gen_queries += diff;
 359                 assert(sctx->streamout.num_prims_gen_queries >= 0);
 360
 361                 sctx->streamout.prims_gen_query_enabled =
 362                         sctx->streamout.num_prims_gen_queries != 0;
 363
 364                 if (old_strmout_en != si_get_strmout_en(sctx))
 365                         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 366         }
 367 }
 368
 369 void si_init_streamout_functions(struct si_context *sctx)
 370 {
 371         sctx->b.create_stream_output_target = si_create_so_target;
 372         sctx->b.stream_output_target_destroy = si_so_target_destroy;
 373         sctx->b.set_stream_output_targets = si_set_streamout_targets;
 374         sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
 375         sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
 376 }