/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "r600_pipe_common.h"
#include "r600_cs.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "radeon/radeon_video.h"

/*
 * pipe_context
 */

/**
 * Write an EOP event.
 *
 * \param event       EVENT_TYPE_*
 * \param event_flags Optional cache flush flags (TC)
 * \param data_sel    1 = fence, 3 = timestamp
 * \param buf         Buffer
 * \param va          GPU address
 * \param new_fence   Fence value to write for this event.
 * \param query_type  PIPE_QUERY_*; occlusion queries skip the GFX9 ZPASS_DONE
 *                    workaround because they emit ZPASS_DONE themselves.
 */
void si_gfx_write_event_eop(struct r600_common_context *ctx,
                            unsigned event, unsigned event_flags,
                            unsigned data_sel,
                            struct r600_resource *buf, uint64_t va,
                            uint32_t new_fence, unsigned query_type)
{
        struct radeon_winsys_cs *cs = ctx->gfx.cs;
        unsigned op = EVENT_TYPE(event) |
                      EVENT_INDEX(5) |
                      event_flags;
        unsigned sel = EOP_DATA_SEL(data_sel);

        /* Wait for write confirmation before writing data, but don't send
         * an interrupt. */
        if (data_sel != EOP_DATA_SEL_DISCARD)
                sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);

        if (ctx->chip_class >= GFX9) {
                /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
                 * counters) must immediately precede every timestamp event to
                 * prevent a GPU hang on GFX9.
                 *
                 * Occlusion queries don't need to do it here, because they
                 * always do ZPASS_DONE before the timestamp.
                 */
                if (ctx->chip_class == GFX9 &&
                    query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
                    query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
                    query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
                        struct r600_resource *scratch = ctx->eop_bug_scratch;

                        assert(16 * ctx->screen->info.num_render_backends <=
                               scratch->b.b.width0);
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
                        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
                        radeon_emit(cs, scratch->gpu_address);
                        radeon_emit(cs, scratch->gpu_address >> 32);

                        radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
                                                  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
                }

                radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
                radeon_emit(cs, op);
                radeon_emit(cs, sel);
                radeon_emit(cs, va);            /* address lo */
                radeon_emit(cs, va >> 32);      /* address hi */
                radeon_emit(cs, new_fence);     /* immediate data lo */
                radeon_emit(cs, 0);             /* immediate data hi */
                radeon_emit(cs, 0);             /* unused */
        } else {
                if (ctx->chip_class == CIK ||
                    ctx->chip_class == VI) {
                        struct r600_resource *scratch = ctx->eop_bug_scratch;
                        uint64_t va = scratch->gpu_address;

                        /* Two EOP events are required to make all engines go idle
                         * (and optional cache flushes executed) before the timestamp
                         * is written.
                         */
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
                        radeon_emit(cs, op);
                        radeon_emit(cs, va);
                        radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
                        radeon_emit(cs, 0); /* immediate data */
                        radeon_emit(cs, 0); /* unused */

                        radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
                                                  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
                }

                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
                radeon_emit(cs, op);
                radeon_emit(cs, va);
                radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
                radeon_emit(cs, new_fence); /* immediate data */
                radeon_emit(cs, 0); /* unused */
        }

        if (buf) {
                radeon_add_to_buffer_list(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
                                          RADEON_PRIO_QUERY);
        }
}

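/* Return the number of dwords to reserve for an EOP fence write.
 * CIK and VI need space for two EOP packets (see the EOP bug workaround in
 * si_gfx_write_event_eop). */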
unsigned si_gfx_write_fence_dwords(struct si_screen *screen)
{
        unsigned dwords = 6;

        if (screen->info.chip_class == CIK ||
            screen->info.chip_class == VI)
                dwords *= 2;

        return dwords;
}

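/* Emit a WAIT_REG_MEM packet that stalls the CP until the 32-bit value at
 * "va", masked with "mask", equals "ref". */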
void si_gfx_wait_fence(struct r600_common_context *ctx,
                       uint64_t va, uint32_t ref, uint32_t mask)
{
        struct radeon_winsys_cs *cs = ctx->gfx.cs;

        radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
        radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
        radeon_emit(cs, va);
        radeon_emit(cs, va >> 32);
        radeon_emit(cs, ref); /* reference value */
        radeon_emit(cs, mask); /* mask */
        radeon_emit(cs, 4); /* poll interval */
}

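/* Emit a NOP on the SDMA ring; the NOP also acts as a wait-for-idle.
 * The packet encoding differs before CIK. */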
static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
{
        struct radeon_winsys_cs *cs = rctx->dma.cs;

        /* NOP waits for idle on Evergreen and later. */
        if (rctx->chip_class >= CIK)
                radeon_emit(cs, 0x00000000); /* NOP */
        else
                radeon_emit(cs, 0xf0000000); /* NOP */
}

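/* Prepare the SDMA IB for an upcoming DMA call: flush the GFX IB if it
 * references dst or src, flush the SDMA IB itself if it is running out of
 * space or holds too much memory, and emit a wait-for-idle if a previous
 * SDMA packet already touched either buffer. Called before every DMA call. */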
void si_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                       struct r600_resource *dst, struct r600_resource *src)
{
        uint64_t vram = ctx->dma.cs->used_vram;
        uint64_t gtt = ctx->dma.cs->used_gart;

        if (dst) {
                vram += dst->vram_usage;
                gtt += dst->gart_usage;
        }
        if (src) {
                vram += src->vram_usage;
                gtt += src->gart_usage;
        }

        /* Flush the GFX IB if DMA depends on it. */
        if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
            ((dst &&
              ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
                                               RADEON_USAGE_READWRITE)) ||
             (src &&
              ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
                                               RADEON_USAGE_WRITE))))
                ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);

        /* Flush if there's not enough space, or if the memory usage per IB
         * is too large.
         *
         * IBs using too little memory are limited by the IB submission overhead.
         * IBs using too much memory are limited by the kernel/TTM overhead.
         * Too long IBs create CPU-GPU pipeline bubbles and add latency.
         *
         * This heuristic makes sure that DMA requests are executed
         * very soon after the call is made and lowers memory usage.
         * It improves texture upload performance by keeping the DMA
         * engine busy while uploads are being submitted.
         */
        num_dw++; /* for emit_wait_idle below */
        if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
            ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
            !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
                ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
                assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
        }

        /* Wait for idle if either buffer has been used in the IB before to
         * prevent read-after-write hazards.
         */
        if ((dst &&
             ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
                                              RADEON_USAGE_READWRITE)) ||
            (src &&
             ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
                                              RADEON_USAGE_WRITE)))
                r600_dma_emit_wait_idle(ctx);

        if (dst) {
                radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
                                          RADEON_USAGE_WRITE,
                                          RADEON_PRIO_SDMA_BUFFER);
        }
        if (src) {
                radeon_add_to_buffer_list(ctx, &ctx->dma, src,
                                          RADEON_USAGE_READ,
                                          RADEON_PRIO_SDMA_BUFFER);
        }

        /* this function is called before all DMA calls, so increment this. */
        ctx->num_dma_calls++;
}

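/* Flush callback for the SDMA IB. With the CHECK_VM debug flag set, the IB
 * is saved before submission and checked for VM faults afterwards. */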
static void r600_flush_dma_ring(void *ctx, unsigned flags,
                                struct pipe_fence_handle **fence)
{
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;
        struct radeon_winsys_cs *cs = rctx->dma.cs;
        struct radeon_saved_cs saved;
        bool check_vm =
                (rctx->screen->debug_flags & DBG(CHECK_VM)) &&
                rctx->check_vm_faults;

        if (!radeon_emitted(cs, 0)) {
                if (fence)
                        rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
                return;
        }

        if (check_vm)
                si_save_cs(rctx->ws, cs, &saved, true);

        rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
        if (fence)
                rctx->ws->fence_reference(fence, rctx->last_sdma_fence);

        if (check_vm) {
                /* Use conservative timeout 800ms, after which we won't wait any
                 * longer and assume the GPU is hung.
                 */
                rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);

                rctx->check_vm_faults(rctx, &saved, RING_DMA);
                si_clear_saved_cs(&saved);
        }
}

/**
 * Store a linearized copy of all chunks of \p cs together with the buffer
 * list in \p saved.
 */
void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
                struct radeon_saved_cs *saved, bool get_buffer_list)
{
        uint32_t *buf;
        unsigned i;

        /* Save the IB chunks. */
        saved->num_dw = cs->prev_dw + cs->current.cdw;
        saved->ib = MALLOC(4 * saved->num_dw);
        if (!saved->ib)
                goto oom;

        buf = saved->ib;
        for (i = 0; i < cs->num_prev; ++i) {
                memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
                buf += cs->prev[i].cdw;
        }
        memcpy(buf, cs->current.buf, cs->current.cdw * 4);

        if (!get_buffer_list)
                return;

        /* Save the buffer list. */
        saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
        saved->bo_list = CALLOC(saved->bo_count,
                                sizeof(saved->bo_list[0]));
        if (!saved->bo_list) {
                FREE(saved->ib);
                goto oom;
        }
        ws->cs_get_buffer_list(cs, saved->bo_list);

        return;

oom:
        fprintf(stderr, "%s: out of memory\n", __func__);
        memset(saved, 0, sizeof(*saved));
}

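/* Free the IB copy and buffer list allocated by si_save_cs. */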
void si_clear_saved_cs(struct radeon_saved_cs *saved)
{
        FREE(saved->ib);
        FREE(saved->bo_list);

        memset(saved, 0, sizeof(*saved));
}

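/* pipe_context::get_device_reset_status: report a reset if the kernel's GPU
 * reset counter has changed since the last query. */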
static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
{
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;
        unsigned latest = rctx->ws->query_value(rctx->ws,
                                                RADEON_GPU_RESET_COUNTER);

        if (rctx->gpu_reset_counter == latest)
                return PIPE_NO_RESET;

        rctx->gpu_reset_counter = latest;
        return PIPE_UNKNOWN_CONTEXT_RESET;
}

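/* Install or clear the device reset callback. */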
static void r600_set_device_reset_callback(struct pipe_context *ctx,
                                           const struct pipe_device_reset_callback *cb)
{
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;

        if (cb)
                rctx->device_reset_callback = *cb;
        else
                memset(&rctx->device_reset_callback, 0,
                       sizeof(rctx->device_reset_callback));
}

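/* Call the device-reset callback if a GPU reset is detected.
 * Returns true if a reset was reported to the callback. */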
bool si_check_device_reset(struct r600_common_context *rctx)
{
        enum pipe_reset_status status;

        if (!rctx->device_reset_callback.reset)
                return false;

        if (!rctx->b.get_device_reset_status)
                return false;

        status = rctx->b.get_device_reset_status(&rctx->b);
        if (status == PIPE_NO_RESET)
                return false;

        rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
        return true;
}

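/* pipe_context::resource_commit: (de)commit the backing memory of a buffer
 * range. Any IB that references the buffer is flushed and synced first,
 * because commitment changes cannot be pipelined (see the comment below). */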
static bool r600_resource_commit(struct pipe_context *pctx,
                                 struct pipe_resource *resource,
                                 unsigned level, struct pipe_box *box,
                                 bool commit)
{
        struct r600_common_context *ctx = (struct r600_common_context *)pctx;
        struct r600_resource *res = r600_resource(resource);

        /*
         * Since buffer commitment changes cannot be pipelined, we need to
         * (a) flush any pending commands that refer to the buffer we're about
         *     to change, and
         * (b) wait for threaded submit to finish, including those that were
         *     triggered by some other, earlier operation.
         */
        if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
            ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
                                             res->buf, RADEON_USAGE_READWRITE)) {
                ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
        }
        if (radeon_emitted(ctx->dma.cs, 0) &&
            ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
                                             res->buf, RADEON_USAGE_READWRITE)) {
                ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
        }

        ctx->ws->cs_sync_flush(ctx->dma.cs);
        ctx->ws->cs_sync_flush(ctx->gfx.cs);

        assert(resource->target == PIPE_BUFFER);

        return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
}

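/* Initialize the r600_common_context fields shared by the radeonsi code:
 * transfer pools, uploaders, suballocators, the winsys context and the
 * optional SDMA IB. Returns false on allocation failure. */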
bool si_common_context_init(struct r600_common_context *rctx,
                            struct si_screen *sscreen,
                            unsigned context_flags)
{
        slab_create_child(&rctx->pool_transfers, &sscreen->pool_transfers);
        slab_create_child(&rctx->pool_transfers_unsync, &sscreen->pool_transfers);

        rctx->screen = sscreen;
        rctx->ws = sscreen->ws;
        rctx->family = sscreen->info.family;
        rctx->chip_class = sscreen->info.chip_class;

        rctx->b.resource_commit = r600_resource_commit;

        if (sscreen->info.drm_major == 2 && sscreen->info.drm_minor >= 43) {
                rctx->b.get_device_reset_status = r600_get_reset_status;
                rctx->gpu_reset_counter =
                        rctx->ws->query_value(rctx->ws,
                                              RADEON_GPU_RESET_COUNTER);
        }

        rctx->b.set_device_reset_callback = r600_set_device_reset_callback;

        si_init_context_texture_functions(rctx);
        si_init_query_functions(rctx);

        if (rctx->chip_class == CIK ||
            rctx->chip_class == VI ||
            rctx->chip_class == GFX9) {
                rctx->eop_bug_scratch = (struct r600_resource*)
                        pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
                                           16 * sscreen->info.num_render_backends);
                if (!rctx->eop_bug_scratch)
                        return false;
        }

        rctx->allocator_zeroed_memory =
                u_suballocator_create(&rctx->b, sscreen->info.gart_page_size,
                                      0, PIPE_USAGE_DEFAULT, 0, true);
        if (!rctx->allocator_zeroed_memory)
                return false;

        rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
                                                  0, PIPE_USAGE_STREAM,
                                                  R600_RESOURCE_FLAG_READ_ONLY);
        if (!rctx->b.stream_uploader)
                return false;

        rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
                                                 0, PIPE_USAGE_DEFAULT,
                                                 R600_RESOURCE_FLAG_32BIT |
                                                 (sscreen->cpdma_prefetch_writes_memory ?
                                                          0 : R600_RESOURCE_FLAG_READ_ONLY));
        if (!rctx->b.const_uploader)
                return false;

        rctx->cached_gtt_allocator = u_upload_create(&rctx->b, 16 * 1024,
                                                     0, PIPE_USAGE_STAGING, 0);
        if (!rctx->cached_gtt_allocator)
                return false;

        rctx->ctx = rctx->ws->ctx_create(rctx->ws);
        if (!rctx->ctx)
                return false;

        if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
                rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
                                                   r600_flush_dma_ring,
                                                   rctx);
                rctx->dma.flush = r600_flush_dma_ring;
        }

        return true;
}

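/* Destroy everything created by si_common_context_init, plus the DCC stat
 * queries and the query result shader. */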
void si_common_context_cleanup(struct r600_common_context *rctx)
{
        unsigned i, j;

        /* Release DCC stats. */
        for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
                assert(!rctx->dcc_stats[i].query_active);

                for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
                        if (rctx->dcc_stats[i].ps_stats[j])
                                rctx->b.destroy_query(&rctx->b,
                                                      rctx->dcc_stats[i].ps_stats[j]);

                r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
        }

        if (rctx->query_result_shader)
                rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);

        if (rctx->gfx.cs)
                rctx->ws->cs_destroy(rctx->gfx.cs);
        if (rctx->dma.cs)
                rctx->ws->cs_destroy(rctx->dma.cs);
        if (rctx->ctx)
                rctx->ws->ctx_destroy(rctx->ctx);

        if (rctx->b.stream_uploader)
                u_upload_destroy(rctx->b.stream_uploader);
        if (rctx->b.const_uploader)
                u_upload_destroy(rctx->b.const_uploader);
        if (rctx->cached_gtt_allocator)
                u_upload_destroy(rctx->cached_gtt_allocator);

        slab_destroy_child(&rctx->pool_transfers);
        slab_destroy_child(&rctx->pool_transfers_unsync);

        if (rctx->allocator_zeroed_memory) {
                u_suballocator_destroy(rctx->allocator_zeroed_memory);
        }
        rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
        rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
        r600_resource_reference(&rctx->eop_bug_scratch, NULL);
}

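/* Clear a buffer range to "value" using the aux context's DMA clear, holding
 * the aux context lock around the clear and flush. */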
void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
                            uint64_t offset, uint64_t size, unsigned value)
{
        struct r600_common_context *rctx = (struct r600_common_context*)sscreen->aux_context;

        mtx_lock(&sscreen->aux_context_lock);
        rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value);
        sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
        mtx_unlock(&sscreen->aux_context_lock);
}