radeon/winsys: introduce radeon_winsys_cs_chunk
src/gallium/winsys/radeon/drm/radeon_drm_cs.c
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply or'd for the accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 number allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/

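/* A minimal usage sketch of the driver-side pattern described above, kept
 * under "#if 0" so it is never compiled: reference a buffer, validate the
 * memory usage, and flush + retry once if validation fails. The function
 * name and the domain/priority values are illustrative assumptions;
 * cs_add_buffer, cs_validate and cs_flush are the winsys hooks installed
 * at the bottom of this file.
 */
#if 0
static void example_emit_with_buffer(struct radeon_winsys *ws,
                                     struct radeon_winsys_cs *cs,
                                     struct pb_buffer *buf)
{
    /* Adds a reloc and accounts the buffer size in used_vram/used_gart. */
    ws->cs_add_buffer(cs, buf, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM,
                      RADEON_PRIO_SAMPLER_TEXTURE);

    /* Checks the 80% watermarks described above. */
    if (!ws->cs_validate(cs)) {
        /* Flush and validate just this one operation. */
        ws->cs_flush(cs, RADEON_FLUSH_ASYNC, NULL);
        ws->cs_add_buffer(cs, buf, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM,
                          RADEON_PRIO_SAMPLER_TEXTURE);
        if (!ws->cs_validate(cs))
            fprintf(stderr, "radeon: dropping operation, buffers don't fit\n");
    }

    /* ... emit packets referencing the buffer here ... */
}
#endif
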
#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "os/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
                                      struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo_item*)
                     CALLOC(1, csc->nrelocs * sizeof(csc->relocs_bo[0]));
    if (!csc->relocs_bo) {
        return FALSE;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return FALSE;
    }

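    /* Describe the three chunks passed to the CS ioctl: the command buffer
     * (IB), the relocation list and the flags dwords. */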
    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return TRUE;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    pipe_semaphore_init(&cs->flush_completed, 1);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

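    /* Two command stream contexts are kept: "csc" is the one being filled
     * by the driver, "cst" is the one being submitted. They are swapped at
     * flush time so a new IB can be built while the previous one is
     * submitted by the winsys thread. */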
    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

#define OUT_CS(cs, value) (cs)->current.buf[(cs)->current.cdw++] = (value)

static inline void update_reloc(struct drm_radeon_cs_reloc *reloc,
                                enum radeon_bo_domain rd,
                                enum radeon_bo_domain wd,
                                unsigned priority,
                                enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
}

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = csc->reloc_indices_hashlist[hash];

    /* not found or found */
    if (i == -1 || csc->relocs_bo[i].bo == bo)
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = csc->crelocs - 1; i >= 0; i--) {
        if (csc->relocs_bo[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
                                  struct radeon_bo *bo,
                                  enum radeon_bo_usage usage,
                                  enum radeon_bo_domain domains,
                                  unsigned priority,
                                  enum radeon_bo_domain *added_domains)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    int i = -1;

    assert(priority < 64);
    *added_domains = 0;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        reloc = &csc->relocs[i];
        update_reloc(reloc, rd, wd, priority / 4, added_domains);
        csc->relocs_bo[i].priority_usage |= 1llu << priority;

        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is because the DMA
         * CS checker doesn't use NOP packets for offset patching, but always
         * uses the i-th buffer from the list to patch the i-th offset. If
         * there are N offsets in a DMA CS, there must also be N buffers in
         * the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs].bo = NULL;
    csc->relocs_bo[csc->crelocs].priority_usage = 1llu << priority;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
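    /* Scale the 0-63 winsys priority down to the smaller range kept in the
     * kernel reloc flags (same scaling as in update_reloc() above). */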
    reloc->flags = priority / 4;

    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    unsigned index = radeon_add_buffer(cs, bo, usage, domains, priority,
                                       &added_domains);

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    vram += cs->csc->used_vram;
    gtt += cs->csc->used_gart;

    /* Anything that goes above the VRAM size should go to GTT. */
    if (vram > cs->ws->info.vram_size)
        gtt += vram - cs->ws->info.vram_size;
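    /* Example (illustrative numbers): with a 1024 MB vram_size, a request
     * totalling 1100 MB of VRAM adds the 76 MB overflow to the GTT total
     * checked against the 70% limit below. */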

    /* Now we just need to check if we have enough GTT. */
    return gtt < cs->ws->info.gart_size * 0.7;
}

static uint64_t radeon_drm_cs_query_memory_usage(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return cs->csc->used_vram + cs->csc->used_gart;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->crelocs; i++) {
            pb_reference(&list[i].buf, &cs->csc->relocs_bo[i].bo->base);
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
        }
    }
    return cs->csc->crelocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs_context *csc)
{
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information.\n");
        }
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this cs have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl to complete. */
    if (cs->ws->thread) {
        pipe_semaphore_wait(&cs->flush_completed);
        pipe_semaphore_signal(&cs->flush_completed);
    }
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)

static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                                unsigned flags,
                                struct pipe_fence_handle **fence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                OUT_CS(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * on r6xx, at least 4 dw alignment is required to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }
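    /* After padding, cdw is a multiple of 8 (16 for UVD); e.g. an IB of
     * 13 dwords receives 3 NOP dwords to reach 16. */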

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (fence) {
        radeon_fence_reference(fence, NULL);
        *fence = radeon_cs_create_fence(rcs);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and not overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, crelocs;

        crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 2;
            if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
                cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (cs->ws->thread) {
            pipe_semaphore_wait(&cs->flush_completed);
            radeon_drm_ws_queue_cs(cs->ws, cs);
            if (!(flags & RADEON_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;

    cs->ws->num_cs_flushes++;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    pipe_semaphore_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *_buf,
                                       enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return FALSE;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return FALSE;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return TRUE;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return TRUE;

    return FALSE;
}

/* FENCES */

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, 0);
    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_memory_below_limit = radeon_drm_cs_memory_below_limit;
    ws->base.cs_query_memory_usage = radeon_drm_cs_query_memory_usage;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}