winsys/radeon: implement ctx_query_reset_status by copying radeonsi
mesa.git: src/gallium/winsys/radeon/drm/radeon_drm_cs.c
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints an error message to stderr.
    (This is done in the pipe driver.)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
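
/*
    A minimal sketch of the pattern described above, as seen from the pipe
    driver side (illustrative only; 'ws', 'cs' and 'vb' are hypothetical
    variables, the priority is just an example, and a real driver flushes
    through its own flush callback rather than calling cs_flush directly):

        // Reference a vertex buffer for reading from VRAM.
        ws->cs_add_buffer(cs, vb, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM,
                          RADEON_PRIO_VERTEX_BUFFER);

        if (!ws->cs_validate(cs)) {
            // Too much memory is referenced: flush what we have and
            // validate just this one operation again.
            ws->cs_flush(cs, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
            ws->cs_add_buffer(cs, vb, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM,
                              RADEON_PRIO_VERTEX_BUFFER);
            if (!ws->cs_validate(cs))
                fprintf(stderr, "radeon: buffer list too large\n");
        }
*/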

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
    if (!ctx)
        return NULL;

    ctx->ws = (struct radeon_drm_winsys*)ws;
    ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
    return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    FREE(ctx);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx)
{
    struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

    unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

    if (ctx->gpu_reset_counter == latest)
        return PIPE_NO_RESET;

    ctx->gpu_reset_counter = latest;
    return PIPE_UNKNOWN_CONTEXT_RESET;
}
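
/*
 * Note (illustrative only, not part of this file): a pipe driver would
 * typically consume this through its pipe_context::get_device_reset_status
 * hook, roughly like the following hypothetical driver code, where
 * rctx->ws and rctx->ctx are assumed driver-side fields:
 *
 *     enum pipe_reset_status status =
 *         rctx->ws->ctx_query_reset_status(rctx->ctx);
 *     if (status != PIPE_NO_RESET) {
 *         // Recreate the winsys context and report the loss to the state
 *         // tracker (e.g. GL_UNKNOWN_CONTEXT_RESET_ARB for GL robustness).
 *     }
 *
 * Since the kernel only exposes a per-device reset counter, this
 * implementation cannot tell guilty contexts from innocent ones and always
 * reports PIPE_UNKNOWN_CONTEXT_RESET after a reset.
 */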

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_cmdbuf *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     bool stop_exec_on_failure)
{
    struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* not found or found */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * that the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    /* If VRAM is just stolen system memory, allow both VRAM and
     * GTT, whichever has free space. If a buffer is evicted from
     * VRAM to GTT, it will stay there.
     */
    if (!cs->ws->info.has_dedicated_vram)
        domains |= RADEON_DOMAIN_GTT;

    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove recently added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data,
                         RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
                                      bool force_chaining)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure any previous submission of this CS has completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when a BO participates in submissions
 * on two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Cleanup older fences */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= GFX6) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 dw alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and has not overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.r600_has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.r600_has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & PIPE_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & PIPE_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
    /* TODO: Handle the following unlikely multi-threaded scenario:
     *
     *  Thread 1 / Context 1          Thread 2 / Context 2
     *  --------------------          --------------------
     *  f = cs_get_next_fence()
     *                                cs_add_fence_dependency(f)
     *                                cs_flush()
     *  cs_flush()
     *
     * We currently assume that this does not happen because we don't support
     * asynchronous flushes on Radeon.
     */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}