winsys/amdgpu: add a parallel compute IB coupled with a gfx IB
[mesa.git] / src / gallium / winsys / radeon / drm / radeon_drm_cs.c
1 /*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27
28 /*
29 This file replaces libdrm's radeon_cs_gem with our own implementation.
30 It's optimized specifically for Radeon DRM.
31 Adding buffers and space checking are faster and simpler than their
32 counterparts in libdrm (the time complexity of all the functions
33 is O(1) in nearly all scenarios, thanks to hashing).
34
35 It works like this:
36
37 cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
38 also adds the size of 'buf' to the used_gart and used_vram winsys variables
39 based on the domains, which are simply OR'd for accounting purposes.
40 The addition is skipped if the reloc is already present in the list, but it
41 still accounts for any newly-referenced domains.
42
43 cs_validate is then called, which just checks:
44 used_vram/gart < vram/gart_size * 0.8
45 The 0.8 number allows for some memory fragmentation. If the validation
46 fails, the pipe driver flushes the CS and tries to do the validation again,
47 i.e. it validates only that one operation. If it fails again, it drops
48 the operation on the floor and prints some nasty message to stderr.
49 (done in the pipe driver)
50
51 cs_write_reloc(cs, buf) just writes a reloc that has been added using
52 cs_add_buffer. The read_domain and write_domain parameters have been removed,
53 because we already specify them in cs_add_buffer.
54 */
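
/*
   A rough usage sketch, from the pipe driver's point of view, of the winsys
   entry points installed by radeon_drm_cs_init_functions() at the bottom of
   this file. It is illustrative only; 'flush_cb', 'flush_ctx', 'buf',
   'priority', 'flags' and 'fence' stand for whatever the driver passes in:

   cs = ws->cs_create(ctx, RING_GFX, flush_cb, flush_ctx, false);

   ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM, priority);
   if (!ws->cs_validate(cs))
      ...flush and retry, as described above (done in the pipe driver)...

   radeon_emit(cs, ...);             (write packets referencing the buffer)
   ws->cs_flush(cs, flags, &fence);  (submit, optionally getting a fence)
*/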
55
56 #include "radeon_drm_cs.h"
57
58 #include "util/u_memory.h"
59 #include "util/os_time.h"
60
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <stdint.h>
64 #include <xf86drm.h>
65
66
67 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
68
69 static struct pipe_fence_handle *
70 radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
71 static void radeon_fence_reference(struct pipe_fence_handle **dst,
72 struct pipe_fence_handle *src);
73
74 static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
75 {
76 /* No context support here. Just return the winsys pointer
77 * as the "context". */
78 return (struct radeon_winsys_ctx*)ws;
79 }
80
81 static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
82 {
83 /* No context support here. */
84 }
85
86 static bool radeon_init_cs_context(struct radeon_cs_context *csc,
87 struct radeon_drm_winsys *ws)
88 {
89 int i;
90
91 csc->fd = ws->fd;
92
93 csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
94 csc->chunks[0].length_dw = 0;
95 csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
96 csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
97 csc->chunks[1].length_dw = 0;
98 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
99 csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
100 csc->chunks[2].length_dw = 2;
101 csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
102
103 csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
104 csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
105 csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
106
107 csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
108
109 for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
110 csc->reloc_indices_hashlist[i] = -1;
111 }
112 return true;
113 }
114
115 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
116 {
117 unsigned i;
118
119 for (i = 0; i < csc->num_relocs; i++) {
120 p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
121 radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
122 }
123 for (i = 0; i < csc->num_slab_buffers; ++i) {
124 p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
125 radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
126 }
127
128 csc->num_relocs = 0;
129 csc->num_validated_relocs = 0;
130 csc->num_slab_buffers = 0;
131 csc->chunks[0].length_dw = 0;
132 csc->chunks[1].length_dw = 0;
133
134 for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
135 csc->reloc_indices_hashlist[i] = -1;
136 }
137 }
138
139 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
140 {
141 radeon_cs_context_cleanup(csc);
142 FREE(csc->slab_buffers);
143 FREE(csc->relocs_bo);
144 FREE(csc->relocs);
145 }
146
147
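/* The cs_create winsys hook: allocate a command stream with two CS contexts,
 * so that one (csc) can record the next IB while the other (cst) is being
 * submitted on the CS thread. */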
148 static struct radeon_cmdbuf *
149 radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
150 enum ring_type ring_type,
151 void (*flush)(void *ctx, unsigned flags,
152 struct pipe_fence_handle **fence),
153 void *flush_ctx,
154 bool stop_exec_on_failure)
155 {
156 struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
157 struct radeon_drm_cs *cs;
158
159 cs = CALLOC_STRUCT(radeon_drm_cs);
160 if (!cs) {
161 return NULL;
162 }
163 util_queue_fence_init(&cs->flush_completed);
164
165 cs->ws = ws;
166 cs->flush_cs = flush;
167 cs->flush_data = flush_ctx;
168
169 if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
170 FREE(cs);
171 return NULL;
172 }
173 if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
174 radeon_destroy_cs_context(&cs->csc1);
175 FREE(cs);
176 return NULL;
177 }
178
179 /* Set the first command buffer as current. */
180 cs->csc = &cs->csc1;
181 cs->cst = &cs->csc2;
182 cs->base.current.buf = cs->csc->buf;
183 cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
184 cs->ring_type = ring_type;
185
186 p_atomic_inc(&ws->num_cs);
187 return &cs->base;
188 }
189
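/* Return the index of 'bo' in the relocation list (for real buffers) or in
 * the slab buffer list (for suballocated buffers), or -1 if the buffer is
 * not referenced by this CS context. */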
190 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
191 {
192 unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
193 struct radeon_bo_item *buffers;
194 unsigned num_buffers;
195 int i = csc->reloc_indices_hashlist[hash];
196
197 if (bo->handle) {
198 buffers = csc->relocs_bo;
199 num_buffers = csc->num_relocs;
200 } else {
201 buffers = csc->slab_buffers;
202 num_buffers = csc->num_slab_buffers;
203 }
204
205 /* Either not found (i == -1), or found at the cached hash index. */
206 if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
207 return i;
208
209 /* Hash collision, look for the BO in the list of relocs linearly. */
210 for (i = num_buffers - 1; i >= 0; i--) {
211 if (buffers[i].bo == bo) {
212 /* Put this reloc in the hash list.
213 * This will prevent additional hash collisions if there are
214 * several consecutive lookup_buffer calls for the same buffer.
215 *
216 * Example: Assuming buffers A,B,C collide in the hash list,
217 * the following sequence of relocs:
218 * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
219 * will collide here: ^ and here: ^,
220 * meaning that we should get very few collisions in the end. */
221 csc->reloc_indices_hashlist[hash] = i;
222 return i;
223 }
224 }
225 return -1;
226 }
227
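/* Look up a real buffer in the relocation list and add it if it's missing.
 * Returns the relocation index. On the DMA ring without virtual memory,
 * a new relocation is added even for duplicates (see the comment below). */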
228 static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
229 struct radeon_bo *bo)
230 {
231 struct radeon_cs_context *csc = cs->csc;
232 struct drm_radeon_cs_reloc *reloc;
233 unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
234 int i = -1;
235
236 i = radeon_lookup_buffer(csc, bo);
237
238 if (i >= 0) {
239 /* For async DMA, every add_buffer call must add a buffer to the list
240 * no matter how many duplicates there are. This is because
241 * the DMA CS checker doesn't use NOP packets for offset patching,
242 * but always uses the i-th buffer from the list to patch the i-th
243 * offset. If there are N offsets in a DMA CS, there must also be N
244 * buffers in the relocation list.
245 *
246 * This doesn't have to be done if virtual memory is enabled,
247 * because there is no offset patching with virtual memory.
248 */
249 if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
250 return i;
251 }
252 }
253
254 /* New relocation, check if the backing array is large enough. */
255 if (csc->num_relocs >= csc->max_relocs) {
256 uint32_t size;
257 csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));
258
259 size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
260 csc->relocs_bo = realloc(csc->relocs_bo, size);
261
262 size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
263 csc->relocs = realloc(csc->relocs, size);
264
265 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
266 }
267
268 /* Initialize the new relocation. */
269 csc->relocs_bo[csc->num_relocs].bo = NULL;
270 csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
271 radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
272 p_atomic_inc(&bo->num_cs_references);
273 reloc = &csc->relocs[csc->num_relocs];
274 reloc->handle = bo->handle;
275 reloc->read_domains = 0;
276 reloc->write_domain = 0;
277 reloc->flags = 0;
278
279 csc->reloc_indices_hashlist[hash] = csc->num_relocs;
280
281 csc->chunks[1].length_dw += RELOC_DWORDS;
282
283 return csc->num_relocs++;
284 }
285
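/* Same as above, but for suballocated (slab) buffers; the backing real
 * buffer is added to the relocation list as well. Returns the slab buffer
 * index, or -1 on allocation failure. */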
286 static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
287 struct radeon_bo *bo)
288 {
289 struct radeon_cs_context *csc = cs->csc;
290 unsigned hash;
291 struct radeon_bo_item *item;
292 int idx;
293 int real_idx;
294
295 idx = radeon_lookup_buffer(csc, bo);
296 if (idx >= 0)
297 return idx;
298
299 real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
300
301 /* Check if the backing array is large enough. */
302 if (csc->num_slab_buffers >= csc->max_slab_buffers) {
303 unsigned new_max = MAX2(csc->max_slab_buffers + 16,
304 (unsigned)(csc->max_slab_buffers * 1.3));
305 struct radeon_bo_item *new_buffers =
306 REALLOC(csc->slab_buffers,
307 csc->max_slab_buffers * sizeof(*new_buffers),
308 new_max * sizeof(*new_buffers));
309 if (!new_buffers) {
310 fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
311 return -1;
312 }
313
314 csc->max_slab_buffers = new_max;
315 csc->slab_buffers = new_buffers;
316 }
317
318 /* Initialize the new slab buffer entry. */
319 idx = csc->num_slab_buffers++;
320 item = &csc->slab_buffers[idx];
321
322 item->bo = NULL;
323 item->u.slab.real_idx = real_idx;
324 radeon_bo_reference(&item->bo, bo);
325 p_atomic_inc(&bo->num_cs_references);
326
327 hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
328 csc->reloc_indices_hashlist[hash] = idx;
329
330 return idx;
331 }
332
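/* The cs_add_buffer winsys hook: add 'buf' with the given usage, domains and
 * priority, and add its size to used_vram/used_gart for any newly referenced
 * domains. Returns the relocation index. */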
333 static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
334 struct pb_buffer *buf,
335 enum radeon_bo_usage usage,
336 enum radeon_bo_domain domains,
337 enum radeon_bo_priority priority)
338 {
339 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
340 struct radeon_bo *bo = (struct radeon_bo*)buf;
341 enum radeon_bo_domain added_domains;
342
343 /* If VRAM is just stolen system memory, allow both VRAM and
344 * GTT, whichever has free space. If a buffer is evicted from
345 * VRAM to GTT, it will stay there.
346 */
347 if (!cs->ws->info.has_dedicated_vram)
348 domains |= RADEON_DOMAIN_GTT;
349
350 enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
351 enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
352 struct drm_radeon_cs_reloc *reloc;
353 int index;
354
355 if (!bo->handle) {
356 index = radeon_lookup_or_add_slab_buffer(cs, bo);
357 if (index < 0)
358 return 0;
359
360 index = cs->csc->slab_buffers[index].u.slab.real_idx;
361 } else {
362 index = radeon_lookup_or_add_real_buffer(cs, bo);
363 }
364
365 reloc = &cs->csc->relocs[index];
366 added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
367 reloc->read_domains |= rd;
368 reloc->write_domain |= wd;
369 reloc->flags = MAX2(reloc->flags, priority);
370 cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;
371
372 if (added_domains & RADEON_DOMAIN_VRAM)
373 cs->base.used_vram += bo->base.size;
374 else if (added_domains & RADEON_DOMAIN_GTT)
375 cs->base.used_gart += bo->base.size;
376
377 return index;
378 }
379
380 static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
381 struct pb_buffer *buf)
382 {
383 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
384
385 return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
386 }
387
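/* The cs_validate winsys hook: check that the referenced buffers fit into
 * 80% of VRAM/GTT. On failure, drop the buffers added since the last
 * successful validation and either flush the CS (if it already holds
 * validated buffers) or reset it. */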
388 static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
389 {
390 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
391 bool status =
392 cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
393 cs->base.used_vram < cs->ws->info.vram_size * 0.8;
394
395 if (status) {
396 cs->csc->num_validated_relocs = cs->csc->num_relocs;
397 } else {
398 /* Remove the most recently added buffers. The validation failed with them
399 * and the CS is about to be flushed because of that. Keep only
400 * the already-validated buffers. */
401 unsigned i;
402
403 for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
404 p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
405 radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
406 }
407 cs->csc->num_relocs = cs->csc->num_validated_relocs;
408
409 /* Flush if there are any relocs. Clean up otherwise. */
410 if (cs->csc->num_relocs) {
411 cs->flush_cs(cs->flush_data,
412 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
413 } else {
414 radeon_cs_context_cleanup(cs->csc);
415 cs->base.used_vram = 0;
416 cs->base.used_gart = 0;
417
418 assert(cs->base.current.cdw == 0);
419 if (cs->base.current.cdw != 0) {
420 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
421 }
422 }
423 }
424 return status;
425 }
426
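/* The cs_check_space winsys hook: return whether 'dw' more dwords fit into
 * the current IB. */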
427 static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
428 {
429 assert(rcs->current.cdw <= rcs->current.max_dw);
430 return rcs->current.max_dw - rcs->current.cdw >= dw;
431 }
432
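/* The cs_get_buffer_list winsys hook: if 'list' is non-NULL, fill it with
 * the real buffers of the current CS; always return their count. */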
433 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
434 struct radeon_bo_list_item *list)
435 {
436 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
437 int i;
438
439 if (list) {
440 for (i = 0; i < cs->csc->num_relocs; i++) {
441 list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
442 list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
443 list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
444 }
445 }
446 return cs->csc->num_relocs;
447 }
448
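/* Submit the previously flushed CS (cs->cst) to the kernel through the
 * DRM_RADEON_CS ioctl. This runs either on the winsys CS thread or
 * directly from radeon_drm_cs_flush when the queue isn't available. */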
449 void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
450 {
451 struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
452 unsigned i;
453 int r;
454
455 r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
456 &csc->cs, sizeof(struct drm_radeon_cs));
457 if (r) {
458 if (r == -ENOMEM)
459 fprintf(stderr, "radeon: Not enough memory for command submission.\n");
460 else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
461 unsigned i;
462
463 fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
464 for (i = 0; i < csc->chunks[0].length_dw; i++) {
465 fprintf(stderr, "0x%08X\n", csc->buf[i]);
466 }
467 } else {
468 fprintf(stderr, "radeon: The kernel rejected CS, "
469 "see dmesg for more information (%i).\n", r);
470 }
471 }
472
473 for (i = 0; i < csc->num_relocs; i++)
474 p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
475 for (i = 0; i < csc->num_slab_buffers; i++)
476 p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
477
478 radeon_cs_context_cleanup(csc);
479 }
480
481 /*
482  * Make sure previous submissions of this CS have completed
483 */
484 void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
485 {
486 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
487
488 /* Wait for any pending ioctl of this CS to complete. */
489 if (util_queue_is_initialized(&cs->ws->cs_queue))
490 util_queue_fence_wait(&cs->flush_completed);
491 }
492
493 /* Add the given fence to a slab buffer fence list.
494 *
495  * There is a potential race condition when a bo participates in submissions on
496 * two or more threads simultaneously. Since we do not know which of the
497 * submissions will be sent to the GPU first, we have to keep the fences
498 * of all submissions.
499 *
500 * However, fences that belong to submissions that have already returned from
501 * their respective ioctl do not have to be kept, because we know that they
502 * will signal earlier.
503 */
504 static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
505 {
506 unsigned dst;
507
508 assert(fence->num_cs_references);
509
510 /* Drop fences whose submissions have already returned from their ioctl. */
511 dst = 0;
512 for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
513 if (bo->u.slab.fences[src]->num_cs_references) {
514 bo->u.slab.fences[dst] = bo->u.slab.fences[src];
515 dst++;
516 } else {
517 radeon_bo_reference(&bo->u.slab.fences[src], NULL);
518 }
519 }
520 bo->u.slab.num_fences = dst;
521
522 /* Check available space for the new fence */
523 if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
524 unsigned new_max_fences = bo->u.slab.max_fences + 1;
525 struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
526 bo->u.slab.max_fences * sizeof(*new_fences),
527 new_max_fences * sizeof(*new_fences));
528 if (!new_fences) {
529 fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
530 return;
531 }
532
533 bo->u.slab.fences = new_fences;
534 bo->u.slab.max_fences = new_max_fences;
535 }
536
537 /* Add the new fence */
538 bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
539 radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
540 bo->u.slab.num_fences++;
541 }
542
543 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
544
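/* The cs_flush winsys hook: pad the IB to the ring-specific alignment,
 * attach a fence to any slab buffers, swap the current and submitted CS
 * contexts and submit the latter (asynchronously if the CS thread is
 * available), then reset the current context for the next IB. */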
545 static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
546 unsigned flags,
547 struct pipe_fence_handle **pfence)
548 {
549 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
550 struct radeon_cs_context *tmp;
551
552 switch (cs->ring_type) {
553 case RING_DMA:
554 /* pad DMA ring to 8 DWs */
555 if (cs->ws->info.chip_class <= GFX6) {
556 while (rcs->current.cdw & 7)
557 radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
558 } else {
559 while (rcs->current.cdw & 7)
560 radeon_emit(&cs->base, 0x00000000); /* NOP packet */
561 }
562 break;
563 case RING_GFX:
564 /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
565 * r6xx requires at least 4 dw alignment to avoid a hw bug.
566 */
567 if (cs->ws->info.gfx_ib_pad_with_type2) {
568 while (rcs->current.cdw & 7)
569 radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
570 } else {
571 while (rcs->current.cdw & 7)
572 radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
573 }
574 break;
575 case RING_UVD:
576 while (rcs->current.cdw & 15)
577 radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
578 break;
579 default:
580 break;
581 }
582
583 if (rcs->current.cdw > rcs->current.max_dw) {
584 fprintf(stderr, "radeon: command stream overflowed\n");
585 }
586
587 if (pfence || cs->csc->num_slab_buffers) {
588 struct pipe_fence_handle *fence;
589
590 if (cs->next_fence) {
591 fence = cs->next_fence;
592 cs->next_fence = NULL;
593 } else {
594 fence = radeon_cs_create_fence(rcs);
595 }
596
597 if (fence) {
598 if (pfence)
599 radeon_fence_reference(pfence, fence);
600
601 mtx_lock(&cs->ws->bo_fence_lock);
602 for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
603 struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
604 p_atomic_inc(&bo->num_active_ioctls);
605 radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
606 }
607 mtx_unlock(&cs->ws->bo_fence_lock);
608
609 radeon_fence_reference(&fence, NULL);
610 }
611 } else {
612 radeon_fence_reference(&cs->next_fence, NULL);
613 }
614
615 radeon_drm_cs_sync_flush(rcs);
616
617 /* Swap command streams. */
618 tmp = cs->csc;
619 cs->csc = cs->cst;
620 cs->cst = tmp;
621
622 /* If the CS is not empty and not overflowed, emit it in a separate thread. */
623 if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
624 unsigned i, num_relocs;
625
626 num_relocs = cs->cst->num_relocs;
627
628 cs->cst->chunks[0].length_dw = cs->base.current.cdw;
629
630 for (i = 0; i < num_relocs; i++) {
631 /* Update the number of active asynchronous CS ioctls for the buffer. */
632 p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
633 }
634
635 switch (cs->ring_type) {
636 case RING_DMA:
637 cs->cst->flags[0] = 0;
638 cs->cst->flags[1] = RADEON_CS_RING_DMA;
639 cs->cst->cs.num_chunks = 3;
640 if (cs->ws->info.r600_has_virtual_memory) {
641 cs->cst->flags[0] |= RADEON_CS_USE_VM;
642 }
643 break;
644
645 case RING_UVD:
646 cs->cst->flags[0] = 0;
647 cs->cst->flags[1] = RADEON_CS_RING_UVD;
648 cs->cst->cs.num_chunks = 3;
649 break;
650
651 case RING_VCE:
652 cs->cst->flags[0] = 0;
653 cs->cst->flags[1] = RADEON_CS_RING_VCE;
654 cs->cst->cs.num_chunks = 3;
655 break;
656
657 default:
658 case RING_GFX:
659 case RING_COMPUTE:
660 cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
661 cs->cst->flags[1] = RADEON_CS_RING_GFX;
662 cs->cst->cs.num_chunks = 3;
663
664 if (cs->ws->info.r600_has_virtual_memory) {
665 cs->cst->flags[0] |= RADEON_CS_USE_VM;
666 cs->cst->cs.num_chunks = 3;
667 }
668 if (flags & PIPE_FLUSH_END_OF_FRAME) {
669 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
670 cs->cst->cs.num_chunks = 3;
671 }
672 if (cs->ring_type == RING_COMPUTE) {
673 cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
674 cs->cst->cs.num_chunks = 3;
675 }
676 break;
677 }
678
679 if (util_queue_is_initialized(&cs->ws->cs_queue)) {
680 util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
681 radeon_drm_cs_emit_ioctl_oneshot, NULL);
682 if (!(flags & PIPE_FLUSH_ASYNC))
683 radeon_drm_cs_sync_flush(rcs);
684 } else {
685 radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
686 }
687 } else {
688 radeon_cs_context_cleanup(cs->cst);
689 }
690
691 /* Prepare a new CS. */
692 cs->base.current.buf = cs->csc->buf;
693 cs->base.current.cdw = 0;
694 cs->base.used_vram = 0;
695 cs->base.used_gart = 0;
696
697 if (cs->ring_type == RING_GFX)
698 cs->ws->num_gfx_IBs++;
699 else if (cs->ring_type == RING_DMA)
700 cs->ws->num_sdma_IBs++;
701 return 0;
702 }
703
704 static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
705 {
706 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
707
708 radeon_drm_cs_sync_flush(rcs);
709 util_queue_fence_destroy(&cs->flush_completed);
710 radeon_cs_context_cleanup(&cs->csc1);
711 radeon_cs_context_cleanup(&cs->csc2);
712 p_atomic_dec(&cs->ws->num_cs);
713 radeon_destroy_cs_context(&cs->csc1);
714 radeon_destroy_cs_context(&cs->csc2);
715 radeon_fence_reference(&cs->next_fence, NULL);
716 FREE(cs);
717 }
718
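/* The cs_is_buffer_referenced winsys hook: return whether the current CS
 * references '_buf' with the given usage. */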
719 static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
720 struct pb_buffer *_buf,
721 enum radeon_bo_usage usage)
722 {
723 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
724 struct radeon_bo *bo = (struct radeon_bo*)_buf;
725 int index;
726
727 if (!bo->num_cs_references)
728 return false;
729
730 index = radeon_lookup_buffer(cs->csc, bo);
731 if (index == -1)
732 return false;
733
734 if (!bo->handle)
735 index = cs->csc->slab_buffers[index].u.slab.real_idx;
736
737 if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
738 return true;
739 if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
740 return true;
741
742 return false;
743 }
744
745 /* FENCES */
746
747 static struct pipe_fence_handle *
748 radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
749 {
750 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
751 struct pb_buffer *fence;
752
753 /* Create a fence, which is a dummy BO. */
754 fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
755 RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
756 if (!fence)
757 return NULL;
758
759 /* Add the fence as a dummy relocation. */
760 cs->ws->base.cs_add_buffer(rcs, fence,
761 RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
762 RADEON_PRIO_FENCE);
763 return (struct pipe_fence_handle*)fence;
764 }
765
766 static bool radeon_fence_wait(struct radeon_winsys *ws,
767 struct pipe_fence_handle *fence,
768 uint64_t timeout)
769 {
770 return ws->buffer_wait((struct pb_buffer*)fence, timeout,
771 RADEON_USAGE_READWRITE);
772 }
773
774 static void radeon_fence_reference(struct pipe_fence_handle **dst,
775 struct pipe_fence_handle *src)
776 {
777 pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
778 }
779
780 static struct pipe_fence_handle *
781 radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
782 {
783 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
784 struct pipe_fence_handle *fence = NULL;
785
786 if (cs->next_fence) {
787 radeon_fence_reference(&fence, cs->next_fence);
788 return fence;
789 }
790
791 fence = radeon_cs_create_fence(rcs);
792 if (!fence)
793 return NULL;
794
795 radeon_fence_reference(&cs->next_fence, fence);
796 return fence;
797 }
798
799 static void
800 radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
801 struct pipe_fence_handle *fence,
802 unsigned dependency_flags)
803 {
804 /* TODO: Handle the following unlikely multi-threaded scenario:
805 *
806 * Thread 1 / Context 1 Thread 2 / Context 2
807 * -------------------- --------------------
808 * f = cs_get_next_fence()
809 * cs_add_fence_dependency(f)
810 * cs_flush()
811 * cs_flush()
812 *
813 * We currently assume that this does not happen because we don't support
814 * asynchronous flushes on Radeon.
815 */
816 }
817
818 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
819 {
820 ws->base.ctx_create = radeon_drm_ctx_create;
821 ws->base.ctx_destroy = radeon_drm_ctx_destroy;
822 ws->base.cs_create = radeon_drm_cs_create;
823 ws->base.cs_destroy = radeon_drm_cs_destroy;
824 ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
825 ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
826 ws->base.cs_validate = radeon_drm_cs_validate;
827 ws->base.cs_check_space = radeon_drm_cs_check_space;
828 ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
829 ws->base.cs_flush = radeon_drm_cs_flush;
830 ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
831 ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
832 ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
833 ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
834 ws->base.fence_wait = radeon_fence_wait;
835 ws->base.fence_reference = radeon_fence_reference;
836 }