/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 number allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
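/*
    For illustration only: a rough sketch of how a pipe driver is expected to
    drive this interface, using the entry points installed by
    radeon_drm_cs_init_functions() at the end of this file. Note that the
    current cs_add_buffer takes (usage, domains, priority) rather than the
    (read_domain, write_domain) pair mentioned above.

        unsigned index = ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                                           RADEON_DOMAIN_VRAM, priority);
        if (!ws->cs_validate(cs)) {
            // flush and retry just this one operation; if validation fails
            // again, the driver drops the operation (see above)
        }
        ... emit command packets referencing the buffer ...
        ws->cs_flush(cs, flags, &fence);
*/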

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

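/* Create a winsys context: a thin wrapper that just remembers the winsys and
 * the GPU reset counter at creation time (used by query_reset_status below). */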
static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
    if (!ctx)
        return NULL;

    ctx->ws = (struct radeon_drm_winsys*)ws;
    ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
    return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    FREE(ctx);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx)
{
    struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

    unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

    if (ctx->gpu_reset_counter == latest)
        return PIPE_NO_RESET;

    ctx->gpu_reset_counter = latest;
    return PIPE_UNKNOWN_CONTEXT_RESET;
}

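/* Set up the constant parts of the CS ioctl arguments: the IB, relocation and
 * flags chunks plus the chunk array the kernel expects, and clear the
 * relocation hash list. */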
static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

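/* Drop all buffer references held by the CS context and reset it to an empty
 * state so it can record the next command stream. */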
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


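/* Create a command stream. Two CS contexts are allocated so that one (csc)
 * can be filled by the driver while the other (cst) is being submitted,
 * possibly from the winsys CS queue thread; radeon_drm_cs_flush swaps them. */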
static struct radeon_cmdbuf *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     bool stop_exec_on_failure)
{
    struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

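/* Return the index of the buffer in the relocation list (real buffers) or in
 * the slab buffer list (suballocated buffers), or -1 if it hasn't been added
 * to this CS context. The hash list makes the common case O(1). */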
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* Fast path: the hash slot is either empty (not found) or points at
     * the right buffer (found). */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

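/* Add a real (kernel-visible) buffer to the relocation list, growing the
 * backing arrays as needed, and return its index. An already-added buffer is
 * returned as-is, except on the async DMA ring without virtual memory, where
 * every add must append a new entry (see the comment inside the function). */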
static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}

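/* Add a suballocated (slab) buffer. The buffer gets its own entry in the slab
 * list, while its backing real BO is added to the relocation list; real_idx
 * records the mapping between the two. Returns -1 on allocation failure. */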
static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

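/* Winsys entry point: add a buffer to the CS, merge its read/write domains
 * and priority into the relocation entry, and account the buffer size in
 * used_vram/used_gart for the cs_validate heuristic described at the top of
 * this file. Returns the relocation index. */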
static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    /* If VRAM is just stolen system memory, allow both VRAM and
     * GTT, whichever has free space. If a buffer is evicted from
     * VRAM to GTT, it will stay there.
     */
    if (!cs->ws->info.has_dedicated_vram)
        domains |= RADEON_DOMAIN_GTT;

    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

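/* Check the 80% memory-usage heuristic described at the top of this file.
 * On failure, drop the buffers added since the last successful validation and
 * either flush the CS (if it still references buffers) or reset it. */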
static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove the recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data,
                         RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

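/* IB chaining is not implemented in this winsys (force_chaining is ignored);
 * just check whether 'dw' more dwords fit in the current IB. */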
static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
                                      bool force_chaining)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

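/* Submit cs->cst to the kernel via the DRM_RADEON_CS ioctl. This runs either
 * inline or as a job on the winsys CS queue; when it finishes, it drops the
 * per-buffer "active ioctl" counts and cleans up the submitted context. */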
void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                            "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS are completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Clean up older fences */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

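/* Flush the command stream: pad the IB to the ring's required alignment,
 * create or hand out a fence and attach it to all slab buffers, swap csc/cst,
 * and submit the old context, either asynchronously on the CS queue or
 * inline if the queue isn't available. */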
static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= GFX6) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and hasn't overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.r600_has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.r600_has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & PIPE_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
            if (!(flags & PIPE_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

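/* Destroy the command stream: wait for any in-flight submission, release both
 * CS contexts and the pending fence, and drop the winsys CS count. */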
static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT,
                                       RADEON_FLAG_NO_SUBALLOC
                                       | RADEON_FLAG_NO_INTERPROCESS_SHARING);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
    /* TODO: Handle the following unlikely multi-threaded scenario:
     *
     *  Thread 1 / Context 1          Thread 2 / Context 2
     *  --------------------          --------------------
     *  f = cs_get_next_fence()
     *                                cs_add_fence_dependency(f)
     *                                cs_flush()
     *  cs_flush()
     *
     * We currently assume that this does not happen because we don't support
     * asynchronous flushes on Radeon.
     */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}