[mesa.git] src/gallium/winsys/radeon/drm/radeon_drm_cs.c
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply or'd for the accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 number allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been
    removed, because we already specify them in cs_add_buffer.
*/

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "os/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

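/* The following helper is not part of the original file; it is a minimal,
 * illustrative sketch (never called) of how a pipe driver is expected to use
 * the winsys entry points implemented below, following the add_buffer ->
 * validate sequence described in the comment at the top of this file.
 * The function name and parameters are hypothetical placeholders.
 */
static inline void example_add_buffer_and_validate(struct radeon_winsys *ws,
                                                   struct radeon_winsys_cs *cs,
                                                   struct pb_buffer *buf,
                                                   enum radeon_bo_domain domain,
                                                   enum radeon_bo_priority priority)
{
    /* Add the buffer; its size is accounted in used_gart/used_vram. */
    ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE, domain, priority);

    /* cs_validate checks the 80% VRAM/GTT limit. On failure it drops the
     * newly-added buffers and flushes the CS itself, so the driver only
     * re-adds the buffer and validates once more. */
    if (!ws->cs_validate(cs)) {
        ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE, domain, priority);
        if (!ws->cs_validate(cs)) {
            /* Still doesn't fit; the pipe driver drops the operation. */
            fprintf(stderr, "radeon: dropping operation, buffer does not fit\n");
        }
    }
}
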
static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

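/* Allocate the relocation arrays and set up the three CS chunks
 * (IB, relocs, flags) that the CS ioctl expects. */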
static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
                                      struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo_item*)
                     CALLOC(1, csc->nrelocs * sizeof(csc->relocs_bo[0]));
    if (!csc->relocs_bo) {
        return FALSE;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return FALSE;
    }

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < Elements(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return TRUE;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;

    for (i = 0; i < Elements(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     struct pb_buffer *trace_buf)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    pipe_semaphore_init(&cs->flush_completed, 1);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;
    cs->trace_buf = (struct radeon_bo*)trace_buf;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.buf = cs->csc->buf;
    cs->base.ring_type = ring_type;
    cs->base.max_dw = ARRAY_SIZE(cs->csc->buf);

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)

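/* Merge new read/write domains and priority into an existing relocation
 * and return the domains that were not referenced before. */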
static inline void update_reloc(struct drm_radeon_cs_reloc *reloc,
                                enum radeon_bo_domain rd,
                                enum radeon_bo_domain wd,
                                unsigned priority,
                                enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
}

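/* Return the index of the relocation for 'bo' in 'csc', or -1 if the buffer
 * hasn't been added yet. The hash list gives O(1) lookups in the common case;
 * collisions fall back to a linear search. */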
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->handle & (Elements(csc->reloc_indices_hashlist)-1);
    int i = csc->reloc_indices_hashlist[hash];

    /* not found or found */
    if (i == -1 || csc->relocs_bo[i].bo == bo)
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = csc->crelocs - 1; i >= 0; i--) {
        if (csc->relocs_bo[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *            AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:       ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

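/* Add 'bo' to the relocation list, or update the existing entry, and return
 * its index in the list. */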
static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
                                  struct radeon_bo *bo,
                                  enum radeon_bo_usage usage,
                                  enum radeon_bo_domain domains,
                                  unsigned priority,
                                  enum radeon_bo_domain *added_domains)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->handle & (Elements(csc->reloc_indices_hashlist)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    int i = -1;

    assert(priority < 64);
    *added_domains = 0;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        reloc = &csc->relocs[i];
        update_reloc(reloc, rd, wd, priority / 4, added_domains);
        csc->relocs_bo[i].priority_usage |= 1llu << priority;

        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * that the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->base.ring_type != RING_DMA || cs->ws->info.r600_virtual_address) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs].bo = NULL;
    csc->relocs_bo[csc->crelocs].priority_usage = 1llu << priority;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = priority / 4;

    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    unsigned index = radeon_add_buffer(cs, bo, usage, domains, priority,
                                       &added_domains);

    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->base.size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

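/* Check that the buffers added so far fit into 80% of VRAM/GTT (see the
 * comment at the top of this file). On failure, drop the buffers added
 * since the last successful validation and flush the CS. */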
static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove the recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    vram += cs->csc->used_vram;
    gtt += cs->csc->used_gart;

    /* Anything that goes above the VRAM size should go to GTT. */
    if (vram > cs->ws->info.vram_size)
        gtt += vram - cs->ws->info.vram_size;

    /* Now we just need to check if we have enough GTT. */
    return gtt < cs->ws->info.gart_size * 0.7;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->crelocs; i++) {
            pb_reference(&list[i].buf, &cs->csc->relocs_bo[i].bo->base);
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
        }
    }
    return cs->csc->crelocs;
}

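/* Submit the CS to the kernel with the DRM_RADEON_CS ioctl, then drop the
 * per-submission references held on the buffers. */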
void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs_context *csc)
{
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                            "see dmesg for more information.\n");
        }
    }

    if (cs->trace_buf) {
        radeon_dump_cs_on_lockup(cs, csc);
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure any previous submission of this CS has completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl to complete. */
    if (cs->ws->thread) {
        pipe_semaphore_wait(&cs->flush_completed);
        pipe_semaphore_signal(&cs->flush_completed);
    }
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)

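/* Pad the IB to the required alignment, swap the two CS contexts and submit
 * the filled one, either directly or through the winsys flush thread. */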
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                                unsigned flags,
                                struct pipe_fence_handle **fence,
                                uint32_t cs_trace_id)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->base.ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->cdw & 7)
                OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->cdw & 7)
                OUT_CS(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 dw alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->cdw & 7)
                OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->cdw & 7)
                OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->cdw & 15)
            OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->cdw > rcs->max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (fence) {
        radeon_fence_reference(fence, NULL);
        *fence = radeon_cs_create_fence(rcs);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    cs->cst->cs_trace_id = cs_trace_id;

    /* If the CS is not empty and has not overflowed, emit it in a separate thread. */
    if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
        unsigned i, crelocs;

        crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->base.ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.r600_virtual_address) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 2;
            if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
                cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ws->info.r600_virtual_address) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->base.ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (cs->ws->thread) {
            pipe_semaphore_wait(&cs->flush_completed);
            radeon_drm_ws_queue_cs(cs->ws, cs);
            if (!(flags & RADEON_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;

    cs->ws->num_cs_flushes++;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    pipe_semaphore_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

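/* Return whether 'buf' is referenced by the current CS with the given usage. */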
static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *_buf,
                                       enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return FALSE;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return FALSE;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return TRUE;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return TRUE;

    return FALSE;
}

/* FENCES */

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1, TRUE,
                                       RADEON_DOMAIN_GTT, 0);
    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_memory_below_limit = radeon_drm_cs_memory_below_limit;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}