gallium/radeon: move setting VRAM|GTT into winsyses
[mesa.git] src/gallium/winsys/radeon/drm/radeon_drm_cs.c
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply or'd for the accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
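
/*
    Rough usage sketch (for orientation only): the pipe driver calls the winsys
    entry points installed by radeon_drm_cs_init_functions at the bottom of
    this file. Names such as 'buf', 'priority', 'my_flush' and 'my_flush_data'
    below are hypothetical placeholders, not real identifiers:

        struct radeon_winsys_ctx *rctx = ws->ctx_create(ws);
        struct radeon_winsys_cs *cs =
            ws->cs_create(rctx, RING_GFX, my_flush, my_flush_data);

        ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                          RADEON_DOMAIN_VRAM, priority);
        if (!ws->cs_validate(cs)) {
            ... flush the CS and try the validation again (see above) ...
        }
        radeon_emit(cs, ...);        emit packets that reference 'buf'
        ws->cs_flush(cs, 0, NULL);   or PIPE_FLUSH_ASYNC, optionally with a fence
*/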

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

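/* Initialize the kernel command submission structures of a CS context:
 * the IB, relocation and flags chunks, the chunk array pointing at them,
 * and the empty relocation hash table. */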
static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

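/* Drop all buffer references held by the CS context and reset it, so that
 * it can be reused for the next command stream. */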
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


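/* Create a command stream. Two CS contexts are allocated so that one can be
 * filled by the driver while the other is being submitted, possibly on a
 * separate thread (csc = currently recorded, cst = currently submitted). */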
static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

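/* Return the index of the given BO in the relocation list (real BOs) or in
 * the slab buffer list (suballocated BOs), or -1 if it isn't there. */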
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* Fast path: either not found (empty hash slot), or found at the
     * cached index. */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

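/* Add a real (kernel-visible) BO to the relocation list if it isn't there
 * yet, growing the relocation arrays as needed, and return its index. */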
static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is because the DMA CS
         * checker doesn't use NOP packets for offset patching, but always
         * uses the i-th buffer from the list to patch the i-th offset. If
         * there are N offsets in a DMA CS, there must also be N buffers in
         * the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}

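/* Add a suballocated (slab) BO to the CS. The backing real BO is added to
 * the relocation list as well; the returned index refers to the slab
 * buffer list. */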
static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

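/* Add a buffer to the current CS, OR the requested domains and priority into
 * its relocation entry, and account its size against the GTT/VRAM budgets
 * that cs_validate checks. Returns the relocation index. */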
static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    /* If VRAM is just stolen system memory, allow both VRAM and
     * GTT, whichever has free space. If a buffer is evicted from
     * VRAM to GTT, it will stay there.
     */
    if (!cs->ws->info.has_dedicated_vram)
        domains |= RADEON_DOMAIN_GTT;

    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1ull << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

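/* Check whether the buffers added so far still fit into ~80% of GTT and VRAM
 * (see the comment at the top of this file). On failure, the buffers added
 * since the last successful validation are dropped again and the CS is
 * flushed asynchronously, so the pipe driver can retry. */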
static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove the recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

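/* Submit the "currently submitted" CS context (cst) to the kernel with the
 * DRM_RADEON_CS ioctl, then release the per-buffer ioctl counters and reset
 * the context. Runs either directly or as a job on the winsys CS queue. */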
void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                            "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Cleanup older fences */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

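/* Flush the current CS: pad the IB to the required alignment, create the
 * fence if one was requested or if slab buffers need it, swap the csc/cst
 * contexts, and submit the finished context, preferably on the winsys CS
 * queue thread. */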
static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and hasn't overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & PIPE_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & PIPE_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

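/* Return whether the given buffer is referenced by the current CS with at
 * least the requested usage (read and/or write). */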
static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */

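/* A fence is represented by a dummy 1-byte GTT BO added to the CS as a
 * relocation; waiting on the fence means waiting for that BO to become idle. */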
static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
                                   struct pipe_fence_handle *fence)
{
    /* TODO: Handle the following unlikely multi-threaded scenario:
     *
     *  Thread 1 / Context 1            Thread 2 / Context 2
     *  --------------------            --------------------
     *  f = cs_get_next_fence()
     *                                  cs_add_fence_dependency(f)
     *                                  cs_flush()
     *  cs_flush()
     *
     * We currently assume that this does not happen because we don't support
     * asynchronous flushes on Radeon.
     */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}