/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints a nasty message to stderr.
    (This is done in the pipe driver.)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
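
/*
    Roughly, a pipe driver is expected to drive this interface as follows.
    This is an illustrative sketch only (not taken from a real caller); the
    function pointers are the ones installed by radeon_drm_cs_init_functions:

        ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE, domains, priority);
        if (!ws->cs_validate(cs)) {
            ws->cs_flush(cs, RADEON_FLUSH_ASYNC, NULL);
            ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE, domains, priority);
            (void) ws->cs_validate(cs);   // if this fails too, drop the operation
        }
        ... emit packets referencing the buffer ...
        ws->cs_flush(cs, 0, &fence);
*/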

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "os/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

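/* Initialize one radeon_cs_context: allocate its reloc arrays, set up the
 * three CS chunks (IB, relocs, flags) that are passed to the CS ioctl, and
 * clear the reloc hash table. */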
static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo_item*)
                     CALLOC(1, csc->nrelocs * sizeof(csc->relocs_bo[0]));
    if (!csc->relocs_bo) {
        return false;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return false;
    }

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

#define OUT_CS(cs, value) (cs)->current.buf[(cs)->current.cdw++] = (value)

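/* Merge the newly-requested read/write domains and priority into an existing
 * reloc and report which domains were not referenced before, so the caller
 * can account the buffer size only once per domain. */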
static inline void update_reloc(struct drm_radeon_cs_reloc *reloc,
                                enum radeon_bo_domain rd,
                                enum radeon_bo_domain wd,
                                unsigned priority,
                                enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
}

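/* Return the index of 'bo' in the reloc list, or -1 if it isn't there.
 * The lookup is O(1) through the hash table in the common case and falls
 * back to a linear search only on hash collisions. */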
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = csc->reloc_indices_hashlist[hash];

    /* The hash entry is either empty (not found) or already points at this BO (found). */
    if (i == -1 || csc->relocs_bo[i].bo == bo)
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = csc->crelocs - 1; i >= 0; i--) {
        if (csc->relocs_bo[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

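/* Add 'bo' to the reloc list of the current CS context, or just merge the new
 * domains/priority into its existing reloc. The backing arrays grow on demand.
 * Returns the reloc index and reports the newly-added domains so the caller
 * can update the used_gart/used_vram accounting. */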
static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
                                  struct radeon_bo *bo,
                                  enum radeon_bo_usage usage,
                                  enum radeon_bo_domain domains,
                                  unsigned priority,
                                  enum radeon_bo_domain *added_domains)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    int i = -1;

    assert(priority < 64);
    *added_domains = 0;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        reloc = &csc->relocs[i];
        update_reloc(reloc, rd, wd, priority / 4, added_domains);
        csc->relocs_bo[i].priority_usage |= 1llu << priority;

        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are, because the DMA CS checker
         * doesn't use NOP packets for offset patching, but always uses the
         * i-th buffer from the list to patch the i-th offset. If there are
         * N offsets in a DMA CS, there must also be N buffers in the
         * relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs].bo = NULL;
    csc->relocs_bo[csc->crelocs].priority_usage = 1llu << priority;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = priority / 4;

    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}

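/* Winsys entry point: add a buffer to the CS and charge its size against the
 * used_vram/used_gart totals the first time each domain is referenced. */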
static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    unsigned index = radeon_add_buffer(cs, bo, usage, domains, priority,
                                       &added_domains);

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

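/* Check that the buffers referenced by the CS still fit into 80% of VRAM and
 * GTT. On failure, drop the relocs added since the last successful check and
 * either flush the CS asynchronously (if it already has validated relocs) or
 * just reset the accounting. */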
static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove the recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

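/* Return whether the CS could still be submitted if 'vram' and 'gtt' bytes
 * were added on top of its current memory usage. VRAM overcommit is counted
 * against GTT, and GTT usage is capped at 70% of the GTT size. */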
static bool radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    vram += cs->base.used_vram;
    gtt += cs->base.used_gart;

    /* Anything that goes above the VRAM size should go to GTT. */
    if (vram > cs->ws->info.vram_size)
        gtt += vram - cs->ws->info.vram_size;

    /* Now we just need to check if we have enough GTT. */
    return gtt < cs->ws->info.gart_size * 0.7;
}

static uint64_t radeon_drm_cs_query_memory_usage(struct radeon_winsys_cs *rcs)
{
    return rcs->used_vram + rcs->used_gart;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->crelocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
        }
    }
    return cs->csc->crelocs;
}

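/* Submit the command stream to the kernel with the DRM_RADEON_CS ioctl.
 * This runs either on the winsys queue thread or synchronously from
 * radeon_drm_cs_flush. If the kernel rejects the CS, the IB is dumped to
 * stderr when the RADEON_DUMP_CS environment variable is set. */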
void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure any previous submission of this CS has completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_job_wait(&cs->flush_completed);
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

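/* Flush the current IB: pad it to the ring's required alignment with NOPs,
 * swap the two CS contexts, and submit the filled one, asynchronously via
 * the winsys queue when it is available. The new (empty) IB is ready for
 * recording as soon as this returns. */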
static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **fence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                OUT_CS(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (fence) {
        radeon_fence_reference(fence, NULL);
        *fence = radeon_cs_create_fence(rcs);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and hasn't overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, crelocs;

        crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & RADEON_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    cs->ws->num_cs_flushes++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

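/* Return whether the current CS references '_buf' for the given usage
 * (read and/or write). This is a purely local check over the reloc list;
 * nothing is flushed or submitted. */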
static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */

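/* A fence on this path is just a dummy 1-byte GTT buffer added to the CS as a
 * relocation; waiting on the fence (radeon_fence_wait below) amounts to
 * waiting until that buffer becomes idle. */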
static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, 0);
    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

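/* Hook the CS and fence entry points into the winsys vtable. */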
void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_memory_below_limit = radeon_drm_cs_memory_below_limit;
    ws->base.cs_query_memory_usage = radeon_drm_cs_query_memory_usage;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}