src/gallium/drivers/radeon/r600_pipe_common.c

   1 /*
   2  * Copyright 2013 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "r600_pipe_common.h"
  25 #include "r600_cs.h"
  26 #include "util/u_memory.h"
  27 #include "util/u_upload_mgr.h"
  28 #include "radeon/radeon_video.h"
  29
  30 /*
  31  * pipe_context
  32  */
  33
  34 /**
  35  * Write an EOP event.
  36  *
  37  * \param event         EVENT_TYPE_*
  38  * \param event_flags   Optional cache flush flags (TC)
  39  * \param data_sel      1 = fence, 3 = timestamp
  40  * \param buf           Buffer
  41  * \param va            GPU address
  42  * \param old_value     Previous fence value (for a bug workaround)
  43  * \param new_value     Fence value to write for this event.
  44  */
  45 void si_gfx_write_event_eop(struct r600_common_context *ctx,
  46                             unsigned event, unsigned event_flags,
  47                             unsigned data_sel,
  48                             struct r600_resource *buf, uint64_t va,
  49                             uint32_t new_fence, unsigned query_type)
  50 {
  51         struct radeon_winsys_cs *cs = ctx->gfx.cs;
  52         unsigned op = EVENT_TYPE(event) |
  53                       EVENT_INDEX(5) |
  54                       event_flags;
  55         unsigned sel = EOP_DATA_SEL(data_sel);
  56
  57         /* Wait for write confirmation before writing data, but don't send
  58          * an interrupt. */
  59         if (data_sel != EOP_DATA_SEL_DISCARD)
  60                 sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);
  61
  62         if (ctx->chip_class >= GFX9) {
  63                 /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
  64                  * counters) must immediately precede every timestamp event to
  65                  * prevent a GPU hang on GFX9.
  66                  *
  67                  * Occlusion queries don't need to do it here, because they
  68                  * always do ZPASS_DONE before the timestamp.
  69                  */
  70                 if (ctx->chip_class == GFX9 &&
  71                     query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
  72                     query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
  73                     query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
  74                         struct r600_resource *scratch = ctx->eop_bug_scratch;
  75
  76                         assert(16 * ctx->screen->info.num_render_backends <=
  77                                scratch->b.b.width0);
  78                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
  79                         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
  80                         radeon_emit(cs, scratch->gpu_address);
  81                         radeon_emit(cs, scratch->gpu_address >> 32);
  82
  83                         radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
  84                                                   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
  85                 }
  86
  87                 radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
  88                 radeon_emit(cs, op);
  89                 radeon_emit(cs, sel);
  90                 radeon_emit(cs, va);            /* address lo */
  91                 radeon_emit(cs, va >> 32);      /* address hi */
  92                 radeon_emit(cs, new_fence);     /* immediate data lo */
  93                 radeon_emit(cs, 0); /* immediate data hi */
  94                 radeon_emit(cs, 0); /* unused */
  95         } else {
  96                 if (ctx->chip_class == CIK ||
  97                     ctx->chip_class == VI) {
  98                         struct r600_resource *scratch = ctx->eop_bug_scratch;
  99                         uint64_t va = scratch->gpu_address;
 100
 101                         /* Two EOP events are required to make all engines go idle
 102                          * (and optional cache flushes executed) before the timestamp
 103                          * is written.
 104                          */
 105                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 106                         radeon_emit(cs, op);
 107                         radeon_emit(cs, va);
 108                         radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 109                         radeon_emit(cs, 0); /* immediate data */
 110                         radeon_emit(cs, 0); /* unused */
 111
 112                         radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
 113                                                   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 114                 }
 115
 116                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 117                 radeon_emit(cs, op);
 118                 radeon_emit(cs, va);
 119                 radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 120                 radeon_emit(cs, new_fence); /* immediate data */
 121                 radeon_emit(cs, 0); /* unused */
 122         }
 123
 124         if (buf) {
 125                 radeon_add_to_buffer_list(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
 126                                           RADEON_PRIO_QUERY);
 127         }
 128 }
 129
 130 unsigned si_gfx_write_fence_dwords(struct si_screen *screen)
 131 {
 132         unsigned dwords = 6;
 133
 134         if (screen->info.chip_class == CIK ||
 135             screen->info.chip_class == VI)
 136                 dwords *= 2;
 137
 138         return dwords;
 139 }
 140
 141 void si_gfx_wait_fence(struct r600_common_context *ctx,
 142                        uint64_t va, uint32_t ref, uint32_t mask)
 143 {
 144         struct radeon_winsys_cs *cs = ctx->gfx.cs;
 145
 146         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 147         radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
 148         radeon_emit(cs, va);
 149         radeon_emit(cs, va >> 32);
 150         radeon_emit(cs, ref); /* reference value */
 151         radeon_emit(cs, mask); /* mask */
 152         radeon_emit(cs, 4); /* poll interval */
 153 }
 154
 155 static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
 156 {
 157         struct radeon_winsys_cs *cs = rctx->dma.cs;
 158
 159         /* NOP waits for idle on Evergreen and later. */
 160         if (rctx->chip_class >= CIK)
 161                 radeon_emit(cs, 0x00000000); /* NOP */
 162         else
 163                 radeon_emit(cs, 0xf0000000); /* NOP */
 164 }
 165
 166 void si_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 167                        struct r600_resource *dst, struct r600_resource *src)
 168 {
 169         uint64_t vram = ctx->dma.cs->used_vram;
 170         uint64_t gtt = ctx->dma.cs->used_gart;
 171
 172         if (dst) {
 173                 vram += dst->vram_usage;
 174                 gtt += dst->gart_usage;
 175         }
 176         if (src) {
 177                 vram += src->vram_usage;
 178                 gtt += src->gart_usage;
 179         }
 180
 181         /* Flush the GFX IB if DMA depends on it. */
 182         if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 183             ((dst &&
 184               ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
 185                                                RADEON_USAGE_READWRITE)) ||
 186              (src &&
 187               ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
 188                                                RADEON_USAGE_WRITE))))
 189                 si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 190
 191         /* Flush if there's not enough space, or if the memory usage per IB
 192          * is too large.
 193          *
 194          * IBs using too little memory are limited by the IB submission overhead.
 195          * IBs using too much memory are limited by the kernel/TTM overhead.
 196          * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 197          *
 198          * This heuristic makes sure that DMA requests are executed
 199          * very soon after the call is made and lowers memory usage.
 200          * It improves texture upload performance by keeping the DMA
 201          * engine busy while uploads are being submitted.
 202          */
 203         num_dw++; /* for emit_wait_idle below */
 204         if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
 205             ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
 206             !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
 207                 si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 208                 assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
 209         }
 210
 211         /* Wait for idle if either buffer has been used in the IB before to
 212          * prevent read-after-write hazards.
 213          */
 214         if ((dst &&
 215              ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
 216                                               RADEON_USAGE_READWRITE)) ||
 217             (src &&
 218              ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
 219                                               RADEON_USAGE_WRITE)))
 220                 r600_dma_emit_wait_idle(ctx);
 221
 222         if (dst) {
 223                 radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
 224                                           RADEON_USAGE_WRITE,
 225                                           RADEON_PRIO_SDMA_BUFFER);
 226         }
 227         if (src) {
 228                 radeon_add_to_buffer_list(ctx, &ctx->dma, src,
 229                                           RADEON_USAGE_READ,
 230                                           RADEON_PRIO_SDMA_BUFFER);
 231         }
 232
 233         /* this function is called before all DMA calls, so increment this. */
 234         ctx->num_dma_calls++;
 235 }
 236
 237 void si_flush_dma_cs(void *ctx, unsigned flags, struct pipe_fence_handle **fence)
 238 {
 239         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 240         struct radeon_winsys_cs *cs = rctx->dma.cs;
 241         struct radeon_saved_cs saved;
 242         bool check_vm = (rctx->screen->debug_flags & DBG(CHECK_VM));
 243
 244         if (!radeon_emitted(cs, 0)) {
 245                 if (fence)
 246                         rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 247                 return;
 248         }
 249
 250         if (check_vm)
 251                 si_save_cs(rctx->ws, cs, &saved, true);
 252
 253         rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
 254         if (fence)
 255                 rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 256
 257         if (check_vm) {
 258                 /* Use conservative timeout 800ms, after which we won't wait any
 259                  * longer and assume the GPU is hung.
 260                  */
 261                 rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);
 262
 263                 si_check_vm_faults(rctx, &saved, RING_DMA);
 264                 si_clear_saved_cs(&saved);
 265         }
 266 }
 267
 268 /**
 269  * Store a linearized copy of all chunks of \p cs together with the buffer
 270  * list in \p saved.
 271  */
 272 void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
 273                 struct radeon_saved_cs *saved, bool get_buffer_list)
 274 {
 275         uint32_t *buf;
 276         unsigned i;
 277
 278         /* Save the IB chunks. */
 279         saved->num_dw = cs->prev_dw + cs->current.cdw;
 280         saved->ib = MALLOC(4 * saved->num_dw);
 281         if (!saved->ib)
 282                 goto oom;
 283
 284         buf = saved->ib;
 285         for (i = 0; i < cs->num_prev; ++i) {
 286                 memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
 287                 buf += cs->prev[i].cdw;
 288         }
 289         memcpy(buf, cs->current.buf, cs->current.cdw * 4);
 290
 291         if (!get_buffer_list)
 292                 return;
 293
 294         /* Save the buffer list. */
 295         saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
 296         saved->bo_list = CALLOC(saved->bo_count,
 297                                 sizeof(saved->bo_list[0]));
 298         if (!saved->bo_list) {
 299                 FREE(saved->ib);
 300                 goto oom;
 301         }
 302         ws->cs_get_buffer_list(cs, saved->bo_list);
 303
 304         return;
 305
 306 oom:
 307         fprintf(stderr, "%s: out of memory\n", __func__);
 308         memset(saved, 0, sizeof(*saved));
 309 }
 310
 311 void si_clear_saved_cs(struct radeon_saved_cs *saved)
 312 {
 313         FREE(saved->ib);
 314         FREE(saved->bo_list);
 315
 316         memset(saved, 0, sizeof(*saved));
 317 }
 318
 319 static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
 320 {
 321         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 322         unsigned latest = rctx->ws->query_value(rctx->ws,
 323                                                 RADEON_GPU_RESET_COUNTER);
 324
 325         if (rctx->gpu_reset_counter == latest)
 326                 return PIPE_NO_RESET;
 327
 328         rctx->gpu_reset_counter = latest;
 329         return PIPE_UNKNOWN_CONTEXT_RESET;
 330 }
 331
 332 static void r600_set_device_reset_callback(struct pipe_context *ctx,
 333                                            const struct pipe_device_reset_callback *cb)
 334 {
 335         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 336
 337         if (cb)
 338                 rctx->device_reset_callback = *cb;
 339         else
 340                 memset(&rctx->device_reset_callback, 0,
 341                        sizeof(rctx->device_reset_callback));
 342 }
 343
 344 bool si_check_device_reset(struct r600_common_context *rctx)
 345 {
 346         enum pipe_reset_status status;
 347
 348         if (!rctx->device_reset_callback.reset)
 349                 return false;
 350
 351         if (!rctx->b.get_device_reset_status)
 352                 return false;
 353
 354         status = rctx->b.get_device_reset_status(&rctx->b);
 355         if (status == PIPE_NO_RESET)
 356                 return false;
 357
 358         rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
 359         return true;
 360 }
 361
 362 static bool r600_resource_commit(struct pipe_context *pctx,
 363                                  struct pipe_resource *resource,
 364                                  unsigned level, struct pipe_box *box,
 365                                  bool commit)
 366 {
 367         struct r600_common_context *ctx = (struct r600_common_context *)pctx;
 368         struct r600_resource *res = r600_resource(resource);
 369
 370         /*
 371          * Since buffer commitment changes cannot be pipelined, we need to
 372          * (a) flush any pending commands that refer to the buffer we're about
 373          *     to change, and
 374          * (b) wait for threaded submit to finish, including those that were
 375          *     triggered by some other, earlier operation.
 376          */
 377         if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 378             ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
 379                                              res->buf, RADEON_USAGE_READWRITE)) {
 380                 si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 381         }
 382         if (radeon_emitted(ctx->dma.cs, 0) &&
 383             ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
 384                                              res->buf, RADEON_USAGE_READWRITE)) {
 385                 si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 386         }
 387
 388         ctx->ws->cs_sync_flush(ctx->dma.cs);
 389         ctx->ws->cs_sync_flush(ctx->gfx.cs);
 390
 391         assert(resource->target == PIPE_BUFFER);
 392
 393         return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
 394 }
 395
 396 bool si_common_context_init(struct r600_common_context *rctx,
 397                             struct si_screen *sscreen,
 398                             unsigned context_flags)
 399 {
 400         slab_create_child(&rctx->pool_transfers, &sscreen->pool_transfers);
 401         slab_create_child(&rctx->pool_transfers_unsync, &sscreen->pool_transfers);
 402
 403         rctx->screen = sscreen;
 404         rctx->ws = sscreen->ws;
 405         rctx->family = sscreen->info.family;
 406         rctx->chip_class = sscreen->info.chip_class;
 407
 408         rctx->b.resource_commit = r600_resource_commit;
 409
 410         if (sscreen->info.drm_major == 2 && sscreen->info.drm_minor >= 43) {
 411                 rctx->b.get_device_reset_status = r600_get_reset_status;
 412                 rctx->gpu_reset_counter =
 413                         rctx->ws->query_value(rctx->ws,
 414                                               RADEON_GPU_RESET_COUNTER);
 415         }
 416
 417         rctx->b.set_device_reset_callback = r600_set_device_reset_callback;
 418
 419         si_init_context_texture_functions(rctx);
 420         si_init_query_functions(rctx);
 421
 422         if (rctx->chip_class == CIK ||
 423             rctx->chip_class == VI ||
 424             rctx->chip_class == GFX9) {
 425                 rctx->eop_bug_scratch = (struct r600_resource*)
 426                         pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
 427                                            16 * sscreen->info.num_render_backends);
 428                 if (!rctx->eop_bug_scratch)
 429                         return false;
 430         }
 431
 432         rctx->allocator_zeroed_memory =
 433                 u_suballocator_create(&rctx->b, sscreen->info.gart_page_size,
 434                                       0, PIPE_USAGE_DEFAULT, 0, true);
 435         if (!rctx->allocator_zeroed_memory)
 436                 return false;
 437
 438         rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
 439                                                   0, PIPE_USAGE_STREAM,
 440                                                   R600_RESOURCE_FLAG_READ_ONLY);
 441         if (!rctx->b.stream_uploader)
 442                 return false;
 443
 444         rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
 445                                                  0, PIPE_USAGE_DEFAULT,
 446                                                  R600_RESOURCE_FLAG_32BIT |
 447                                                  (sscreen->cpdma_prefetch_writes_memory ?
 448                                                         0 : R600_RESOURCE_FLAG_READ_ONLY));
 449         if (!rctx->b.const_uploader)
 450                 return false;
 451
 452         rctx->cached_gtt_allocator = u_upload_create(&rctx->b, 16 * 1024,
 453                                                      0, PIPE_USAGE_STAGING, 0);
 454         if (!rctx->cached_gtt_allocator)
 455                 return false;
 456
 457         rctx->ctx = rctx->ws->ctx_create(rctx->ws);
 458         if (!rctx->ctx)
 459                 return false;
 460
 461         if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
 462                 rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 463                                                    si_flush_dma_cs,
 464                                                    rctx);
 465                 rctx->dma.flush = si_flush_dma_cs;
 466         }
 467
 468         return true;
 469 }
 470
 471 void si_common_context_cleanup(struct r600_common_context *rctx)
 472 {
 473         unsigned i,j;
 474
 475         /* Release DCC stats. */
 476         for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
 477                 assert(!rctx->dcc_stats[i].query_active);
 478
 479                 for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
 480                         if (rctx->dcc_stats[i].ps_stats[j])
 481                                 rctx->b.destroy_query(&rctx->b,
 482                                                       rctx->dcc_stats[i].ps_stats[j]);
 483
 484                 r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
 485         }
 486
 487         if (rctx->query_result_shader)
 488                 rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);
 489
 490         if (rctx->gfx.cs)
 491                 rctx->ws->cs_destroy(rctx->gfx.cs);
 492         if (rctx->dma.cs)
 493                 rctx->ws->cs_destroy(rctx->dma.cs);
 494         if (rctx->ctx)
 495                 rctx->ws->ctx_destroy(rctx->ctx);
 496
 497         if (rctx->b.stream_uploader)
 498                 u_upload_destroy(rctx->b.stream_uploader);
 499         if (rctx->b.const_uploader)
 500                 u_upload_destroy(rctx->b.const_uploader);
 501         if (rctx->cached_gtt_allocator)
 502                 u_upload_destroy(rctx->cached_gtt_allocator);
 503
 504         slab_destroy_child(&rctx->pool_transfers);
 505         slab_destroy_child(&rctx->pool_transfers_unsync);
 506
 507         if (rctx->allocator_zeroed_memory) {
 508                 u_suballocator_destroy(rctx->allocator_zeroed_memory);
 509         }
 510         rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
 511         rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
 512         r600_resource_reference(&rctx->eop_bug_scratch, NULL);
 513 }
 514
 515
 516 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
 517                             uint64_t offset, uint64_t size, unsigned value)
 518 {
 519         struct r600_common_context *rctx = (struct r600_common_context*)sscreen->aux_context;
 520
 521         mtx_lock(&sscreen->aux_context_lock);
 522         rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value);
 523         sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
 524         mtx_unlock(&sscreen->aux_context_lock);
 525 }