/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "r600_pipe_common.h"
#include "r600_cs.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "radeon/radeon_video.h"
/**
 * Write an EOP event.
 *
 * \param event		EVENT_TYPE_*
 * \param event_flags	Optional cache flush flags (TC)
 * \param data_sel	1 = fence, 3 = timestamp
 * \param buf		Buffer
 * \param va		GPU address
 * \param new_fence	Fence value to write for this event
 * \param query_type	PIPE_QUERY_* (used by the GFX9 bug workaround)
 */
void si_gfx_write_event_eop(struct r600_common_context *ctx,
			    unsigned event, unsigned event_flags,
			    unsigned data_sel,
			    struct r600_resource *buf, uint64_t va,
			    uint32_t new_fence, unsigned query_type)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	unsigned op = EVENT_TYPE(event) |
		      EVENT_INDEX(5) |
		      event_flags;
	unsigned sel = EOP_DATA_SEL(data_sel);

	/* Wait for write confirmation before writing data, but don't send
	 * an interrupt. */
	if (data_sel != EOP_DATA_SEL_DISCARD)
		sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);

	if (ctx->chip_class >= GFX9) {
		/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
		 * counters) must immediately precede every timestamp event to
		 * prevent a GPU hang on GFX9.
		 *
		 * Occlusion queries don't need to do it here, because they
		 * always do ZPASS_DONE before the timestamp.
		 */
		if (ctx->chip_class == GFX9 &&
		    query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
			struct r600_resource *scratch = ctx->eop_bug_scratch;

			assert(16 * ctx->screen->info.num_render_backends <=
			       scratch->b.b.width0);

			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
			radeon_emit(cs, scratch->gpu_address);
			radeon_emit(cs, scratch->gpu_address >> 32);

			radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
						  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
		}

		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
		radeon_emit(cs, op);
		radeon_emit(cs, sel);
		radeon_emit(cs, va);		/* address lo */
		radeon_emit(cs, va >> 32);	/* address hi */
		radeon_emit(cs, new_fence);	/* immediate data lo */
		radeon_emit(cs, 0); /* immediate data hi */
		radeon_emit(cs, 0); /* unused */
	} else {
		if (ctx->chip_class == CIK ||
		    ctx->chip_class == VI) {
			struct r600_resource *scratch = ctx->eop_bug_scratch;
			uint64_t va = scratch->gpu_address;

			/* Two EOP events are required to make all engines go idle
			 * (and optional cache flushes executed) before the timestamp
			 * is written.
			 */
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
			radeon_emit(cs, op);
			radeon_emit(cs, va);
			radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
			radeon_emit(cs, 0); /* immediate data */
			radeon_emit(cs, 0); /* unused */

			radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
						  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
		}

		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
		radeon_emit(cs, op);
		radeon_emit(cs, va);
		radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
		radeon_emit(cs, new_fence); /* immediate data */
		radeon_emit(cs, 0); /* unused */
	}

	if (buf) {
		radeon_add_to_buffer_list(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
					  RADEON_PRIO_QUERY);
	}
}

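/**
 * Number of dwords to reserve for a fence written with si_gfx_write_event_eop:
 * a base of 6 dwords, doubled on CIK/VI because those parts emit a second EOP
 * event as a workaround (see above), plus 2 dwords when GPUVM is unavailable,
 * presumably to cover the extra buffer-list/relocation overhead of the CS
 * checker.
 */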
unsigned si_gfx_write_fence_dwords(struct si_screen *screen)
{
	unsigned dwords = 6;

	if (screen->info.chip_class == CIK ||
	    screen->info.chip_class == VI)
		dwords *= 2;

	if (!screen->info.has_virtual_memory)
		dwords += 2;

	return dwords;
}

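/**
 * Emit a WAIT_REG_MEM packet that stalls the GFX ring until the 32-bit value
 * at \p va satisfies (value & mask) == ref, polling memory (MEM_SPACE(1))
 * with a poll interval of 4, e.g. to wait for a fence written by
 * si_gfx_write_event_eop.
 */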
void si_gfx_wait_fence(struct r600_common_context *ctx,
		       uint64_t va, uint32_t ref, uint32_t mask)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;

	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, ref); /* reference value */
	radeon_emit(cs, mask); /* mask */
	radeon_emit(cs, 4); /* poll interval */
}

static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
{
	struct radeon_winsys_cs *cs = rctx->dma.cs;

	/* NOP waits for idle on Evergreen and later. */
	if (rctx->chip_class >= CIK)
		radeon_emit(cs, 0x00000000); /* NOP */
	else
		radeon_emit(cs, 0xf0000000); /* NOP */
}

void si_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
		       struct r600_resource *dst, struct r600_resource *src)
{
	uint64_t vram = ctx->dma.cs->used_vram;
	uint64_t gtt = ctx->dma.cs->used_gart;

	if (dst) {
		vram += dst->vram_usage;
		gtt += dst->gart_usage;
	}
	if (src) {
		vram += src->vram_usage;
		gtt += src->gart_usage;
	}

	/* Flush the GFX IB if DMA depends on it. */
	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
	    ((dst &&
	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
					       RADEON_USAGE_READWRITE)) ||
	     (src &&
	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
					       RADEON_USAGE_WRITE))))
		ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);

	/* Flush if there's not enough space, or if the memory usage per IB
	 * is too large.
	 *
	 * IBs using too little memory are limited by the IB submission overhead.
	 * IBs using too much memory are limited by the kernel/TTM overhead.
	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
	 *
	 * This heuristic makes sure that DMA requests are executed
	 * very soon after the call is made and lowers memory usage.
	 * It improves texture upload performance by keeping the DMA
	 * engine busy while uploads are being submitted.
	 */
	num_dw++; /* for emit_wait_idle below */
	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
	    ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
		ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
		assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
	}

	/* Wait for idle if either buffer has been used in the IB before to
	 * prevent read-after-write hazards.
	 */
	if ((dst &&
	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
					      RADEON_USAGE_READWRITE)) ||
	    (src &&
	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
					      RADEON_USAGE_WRITE)))
		r600_dma_emit_wait_idle(ctx);

	/* If GPUVM is not supported, the CS checker needs 2 entries
	 * in the buffer list per packet, which has to be done manually.
	 */
	if (ctx->screen->info.has_virtual_memory) {
		if (dst)
			radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
						  RADEON_USAGE_WRITE,
						  RADEON_PRIO_SDMA_BUFFER);
		if (src)
			radeon_add_to_buffer_list(ctx, &ctx->dma, src,
						  RADEON_USAGE_READ,
						  RADEON_PRIO_SDMA_BUFFER);
	}

	/* this function is called before all DMA calls, so increment this. */
	ctx->num_dma_calls++;
}

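/**
 * Flush callback for the SDMA IB: submit the IB and update last_sdma_fence.
 * With DBG(CHECK_VM), the IB is saved first and, after a bounded wait on the
 * fence, passed to check_vm_faults so VM faults can be reported.
 */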
static void r600_flush_dma_ring(void *ctx, unsigned flags,
				struct pipe_fence_handle **fence)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct radeon_winsys_cs *cs = rctx->dma.cs;
	struct radeon_saved_cs saved;
	bool check_vm =
		(rctx->screen->debug_flags & DBG(CHECK_VM)) &&
		rctx->check_vm_faults;

	if (!radeon_emitted(cs, 0)) {
		if (fence)
			rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
		return;
	}

	if (check_vm)
		si_save_cs(rctx->ws, cs, &saved, true);

	rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
	if (fence)
		rctx->ws->fence_reference(fence, rctx->last_sdma_fence);

	if (check_vm) {
		/* Use conservative timeout 800ms, after which we won't wait any
		 * longer and assume the GPU is hung.
		 */
		rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);

		rctx->check_vm_faults(rctx, &saved, RING_DMA);
		si_clear_saved_cs(&saved);
	}
}

/**
 * Store a linearized copy of all chunks of \p cs together with the buffer
 * list as fallback data for debugging, e.g. VM fault reporting.
 */
void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
		struct radeon_saved_cs *saved, bool get_buffer_list)
{
	uint32_t *buf;
	unsigned i;

	/* Save the IB chunks. */
	saved->num_dw = cs->prev_dw + cs->current.cdw;
	saved->ib = MALLOC(4 * saved->num_dw);
	if (!saved->ib)
		goto oom;

	buf = saved->ib;

	for (i = 0; i < cs->num_prev; ++i) {
		memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
		buf += cs->prev[i].cdw;
	}
	memcpy(buf, cs->current.buf, cs->current.cdw * 4);

	if (!get_buffer_list)
		return;

	/* Save the buffer list. */
	saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
	saved->bo_list = CALLOC(saved->bo_count,
				sizeof(saved->bo_list[0]));
	if (!saved->bo_list) {
		FREE(saved->ib);
		goto oom;
	}
	ws->cs_get_buffer_list(cs, saved->bo_list);

	return;

oom:
	fprintf(stderr, "%s: out of memory\n", __func__);
	memset(saved, 0, sizeof(*saved));
}

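/* Free the IB copy and buffer list captured by si_save_cs. */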
void si_clear_saved_cs(struct radeon_saved_cs *saved)
{
	FREE(saved->ib);
	FREE(saved->bo_list);

	memset(saved, 0, sizeof(*saved));
}

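/* Compare the kernel's GPU reset counter with the value cached by this
 * context; if it changed, a reset happened. The counter alone cannot tell
 * whether this context was responsible, so report an unknown reset. */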
static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	unsigned latest = rctx->ws->query_value(rctx->ws,
						RADEON_GPU_RESET_COUNTER);

	if (rctx->gpu_reset_counter == latest)
		return PIPE_NO_RESET;

	rctx->gpu_reset_counter = latest;
	return PIPE_UNKNOWN_CONTEXT_RESET;
}

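/* Install (or, when cb is NULL, clear) the caller's device reset callback. */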
static void r600_set_device_reset_callback(struct pipe_context *ctx,
					    const struct pipe_device_reset_callback *cb)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;

	if (cb)
		rctx->device_reset_callback = *cb;
	else
		memset(&rctx->device_reset_callback, 0,
		       sizeof(rctx->device_reset_callback));
}

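/* If a device reset callback is installed and a reset has occurred, invoke
 * the callback and return true; otherwise return false. */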
bool si_check_device_reset(struct r600_common_context *rctx)
{
	enum pipe_reset_status status;

	if (!rctx->device_reset_callback.reset)
		return false;

	if (!rctx->b.get_device_reset_status)
		return false;

	status = rctx->b.get_device_reset_status(&rctx->b);
	if (status == PIPE_NO_RESET)
		return false;

	rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
	return true;
}

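/* pipe_context::resource_commit for sparse buffers: commitment changes cannot
 * be pipelined, so both IBs are flushed and synced before the backing storage
 * of the buffer is changed. */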
static bool r600_resource_commit(struct pipe_context *pctx,
				 struct pipe_resource *resource,
				 unsigned level, struct pipe_box *box,
				 bool commit)
{
	struct r600_common_context *ctx = (struct r600_common_context *)pctx;
	struct r600_resource *res = r600_resource(resource);

	/*
	 * Since buffer commitment changes cannot be pipelined, we need to
	 * (a) flush any pending commands that refer to the buffer we're about
	 *     to change, and
	 * (b) wait for threaded submit to finish, including those that were
	 *     triggered by some other, earlier operation.
	 */
	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
					     res->buf, RADEON_USAGE_READWRITE)) {
		ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
	}
	if (radeon_emitted(ctx->dma.cs, 0) &&
	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
					     res->buf, RADEON_USAGE_READWRITE)) {
		ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
	}

	ctx->ws->cs_sync_flush(ctx->dma.cs);
	ctx->ws->cs_sync_flush(ctx->gfx.cs);

	assert(resource->target == PIPE_BUFFER);

	return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
}

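/**
 * Initialize the state shared by all radeonsi contexts: transfer pools,
 * uploaders, the zeroed-memory suballocator, the winsys context, the EOP-bug
 * scratch buffer (CIK/VI/GFX9) and, when available, the async SDMA IB.
 * Returns false on allocation failure.
 */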
bool si_common_context_init(struct r600_common_context *rctx,
			    struct si_screen *sscreen,
			    unsigned context_flags)
{
	slab_create_child(&rctx->pool_transfers, &sscreen->pool_transfers);
	slab_create_child(&rctx->pool_transfers_unsync, &sscreen->pool_transfers);

	rctx->screen = sscreen;
	rctx->ws = sscreen->ws;
	rctx->family = sscreen->info.family;
	rctx->chip_class = sscreen->info.chip_class;

	rctx->b.resource_commit = r600_resource_commit;

	if (sscreen->info.drm_major == 2 && sscreen->info.drm_minor >= 43) {
		rctx->b.get_device_reset_status = r600_get_reset_status;
		rctx->gpu_reset_counter =
			rctx->ws->query_value(rctx->ws,
					      RADEON_GPU_RESET_COUNTER);
	}

	rctx->b.set_device_reset_callback = r600_set_device_reset_callback;

	si_init_context_texture_functions(rctx);
	si_init_query_functions(rctx);

	if (rctx->chip_class == CIK ||
	    rctx->chip_class == VI ||
	    rctx->chip_class == GFX9) {
		rctx->eop_bug_scratch = (struct r600_resource*)
			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
					   16 * sscreen->info.num_render_backends);
		if (!rctx->eop_bug_scratch)
			return false;
	}

	rctx->allocator_zeroed_memory =
		u_suballocator_create(&rctx->b, sscreen->info.gart_page_size,
				      0, PIPE_USAGE_DEFAULT, 0, true);
	if (!rctx->allocator_zeroed_memory)
		return false;

	rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
						  0, PIPE_USAGE_STREAM,
						  R600_RESOURCE_FLAG_READ_ONLY);
	if (!rctx->b.stream_uploader)
		return false;

	rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
						 0, PIPE_USAGE_DEFAULT,
						 R600_RESOURCE_FLAG_32BIT |
						 (sscreen->cpdma_prefetch_writes_memory ?
						  0 : R600_RESOURCE_FLAG_READ_ONLY));
	if (!rctx->b.const_uploader)
		return false;

	rctx->cached_gtt_allocator = u_upload_create(&rctx->b, 16 * 1024,
						     0, PIPE_USAGE_STAGING, 0);
	if (!rctx->cached_gtt_allocator)
		return false;

	rctx->ctx = rctx->ws->ctx_create(rctx->ws);
	if (!rctx->ctx)
		return false;

	if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
						   r600_flush_dma_ring,
						   rctx);
		rctx->dma.flush = r600_flush_dma_ring;
	}

	return true;
}

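/* Destroy everything created by si_common_context_init, plus per-context
 * query and DCC-statistics state accumulated at runtime. */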
void si_common_context_cleanup(struct r600_common_context *rctx)
{
	unsigned i, j;

	/* Release DCC stats. */
	for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
		assert(!rctx->dcc_stats[i].query_active);

		for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
			if (rctx->dcc_stats[i].ps_stats[j])
				rctx->b.destroy_query(&rctx->b,
						      rctx->dcc_stats[i].ps_stats[j]);

		r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
	}

	if (rctx->query_result_shader)
		rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);

	if (rctx->gfx.cs)
		rctx->ws->cs_destroy(rctx->gfx.cs);
	if (rctx->dma.cs)
		rctx->ws->cs_destroy(rctx->dma.cs);
	if (rctx->ctx)
		rctx->ws->ctx_destroy(rctx->ctx);

	if (rctx->b.stream_uploader)
		u_upload_destroy(rctx->b.stream_uploader);
	if (rctx->b.const_uploader)
		u_upload_destroy(rctx->b.const_uploader);
	if (rctx->cached_gtt_allocator)
		u_upload_destroy(rctx->cached_gtt_allocator);

	slab_destroy_child(&rctx->pool_transfers);
	slab_destroy_child(&rctx->pool_transfers_unsync);

	if (rctx->allocator_zeroed_memory) {
		u_suballocator_destroy(rctx->allocator_zeroed_memory);
	}
	rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
	rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
	r600_resource_reference(&rctx->eop_bug_scratch, NULL);
}

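/* Clear part of a buffer using the screen's aux context; the aux context lock
 * serializes screen-level users, and the IB is flushed before the lock is
 * released so the work is submitted in order. */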
void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
			    uint64_t offset, uint64_t size, unsigned value)
{
	struct r600_common_context *rctx = (struct r600_common_context *)sscreen->aux_context;

	mtx_lock(&sscreen->aux_context_lock);
	rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value);
	sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
	mtx_unlock(&sscreen->aux_context_lock);
}