/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply or'd for accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
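
/*
    A minimal sketch of that flow from the pipe driver's side, assuming a
    radeon_winsys pointer 'ws'. This block is illustrative only and not part
    of the winsys; 'flush_callback', 'flush_ctx', 'resource', 'priority' and
    'num_dw' are hypothetical placeholders, while the ctx_*/cs_* entry points
    and radeon_emit() correspond to the functions defined in this file:

        struct radeon_winsys_ctx *ctx = ws->ctx_create(ws);
        struct radeon_winsys_cs *cs =
            ws->cs_create(ctx, RING_GFX, flush_callback, flush_ctx);

        unsigned reloc_idx =
            ws->cs_add_buffer(cs, resource->buf, RADEON_USAGE_READWRITE,
                              RADEON_DOMAIN_VRAM, priority);

        if (!ws->cs_validate(cs) || !ws->cs_check_space(cs, num_dw))
            ...the pipe driver flushes and retries the operation...

        ...radeon_emit(cs, ...) packets that reference reloc_idx...

        struct pipe_fence_handle *fence = NULL;
        ws->cs_flush(cs, 0, &fence);
*/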

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "os/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

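
/* One relocation is four dwords: struct drm_radeon_cs_reloc consists of the
 * uint32_t fields handle, read_domains, write_domain and flags. */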
#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

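/* Return the index of 'bo' in the buffer list of 'csc' (relocs_bo for real
 * buffers, slab_buffers for suballocated ones), or -1 if it is not there. */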
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* Fast path: the hash entry is either empty (not found) or already
     * points at this BO (found). */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * that the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1ull << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove the recently added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}
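
/* As a worked example of the 0.8 heuristic above: with 2048 MB of VRAM,
 * validation keeps succeeding until the CS references more than roughly
 * 1638 MB (2048 * 0.8) of VRAM, at which point the pipe driver is expected
 * to flush and retry. */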

static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure any previous submission of this CS has completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when a bo participates in submissions
 * on two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Cleanup older fences */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}
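
/* When the RADEON_NOOP environment variable is set to true,
 * radeon_drm_cs_flush below skips the kernel CS submission entirely and
 * only cleans up the command stream context. */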
DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is non-empty and has not overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & RADEON_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */
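/* Fences are implemented as dummy one-byte GTT buffers added to the CS as
 * relocations, so waiting on a fence (radeon_fence_wait) is simply waiting
 * for that buffer to become idle. */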

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}