src/gallium/winsys/radeon/drm/radeon_drm_cs.c

   1 /*
   2  * Copyright © 2008 Jérôme Glisse
   3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining
   7  * a copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  16  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  17  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
  18  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * The above copyright notice and this permission notice (including the
  24  * next paragraph) shall be included in all copies or substantial portions
  25  * of the Software.
  26  */
  27 /*
  28  * Authors:
  29  *      Marek Olšák <maraeo@gmail.com>
  30  *
  31  * Based on work from libdrm_radeon by:
  32  *      Aapo Tahkola <aet@rasterburn.org>
  33  *      Nicolai Haehnle <prefect_@gmx.net>
  34  *      Jérôme Glisse <glisse@freedesktop.org>
  35  */
  36
  37 /*
  38     This file replaces libdrm's radeon_cs_gem with our own implementation.
  39     It's optimized specifically for Radeon DRM.
  40     Adding buffers and space checking are faster and simpler than their
  41     counterparts in libdrm (the time complexity of all the functions
  42     is O(1) in nearly all scenarios, thanks to hashing).
  43
  44     It works like this:
  45
  46     cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
  47     also adds the size of 'buf' to the used_gart and used_vram winsys variables
  48     based on the domains, which are simply or'd for the accounting purposes.
  49     The adding is skipped if the reloc is already present in the list, but it
  50     accounts any newly-referenced domains.
  51
  52     cs_validate is then called, which just checks:
  53         used_vram/gart < vram/gart_size * 0.8
  54     The 0.8 number allows for some memory fragmentation. If the validation
  55     fails, the pipe driver flushes CS and tries to do the validation again,
  56     i.e. it validates only that one operation. If it fails again, it drops
  57     the operation on the floor and prints some nasty message to stderr.
  58     (done in the pipe driver)
  59
  60     cs_write_reloc(cs, buf) just writes a reloc that has been added using
  61     cs_add_buffer. The read_domain and write_domain parameters have been removed,
  62     because we already specify them in cs_add_buffer.
  63 */
  64
  65 #include "radeon_drm_cs.h"
  66
  67 #include "util/u_memory.h"
  68 #include "os/os_time.h"
  69
  70 #include <stdio.h>
  71 #include <stdlib.h>
  72 #include <stdint.h>
  73 #include <xf86drm.h>
  74
  75
  /* Size of one kernel relocation record, measured in 32-bit dwords. */
  #define RELOC_DWORDS \
      (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

  /*
   * Forward declarations of the fence helpers: they are defined near the end
   * of this file but are already needed by the flush path.
   */
  static struct pipe_fence_handle *radeon_cs_create_fence(
      struct radeon_winsys_cs *rcs);

  static void
  radeon_fence_reference(struct pipe_fence_handle **dst,
                         struct pipe_fence_handle *src);
  82
  /**
   * Create a winsys "context".
   *
   * This winsys has no real context objects, so the winsys pointer itself is
   * handed back, merely re-typed as a context handle.
   *
   * \param ws  the winsys
   * \return    \p ws cast to a context pointer
   */
  static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
  {
      struct radeon_winsys_ctx *handle = (struct radeon_winsys_ctx *)ws;

      return handle;
  }
  89
  /**
   * Destroy a winsys "context".
   *
   * Contexts are not supported here (the handle is only the winsys pointer
   * in disguise), so there is nothing to release.
   */
  static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
  {
      /* Intentionally empty; the argument is never touched. */
      (void)ctx;
  }
  94
  95 static bool radeon_init_cs_context(struct radeon_cs_context *csc,
  96                                    struct radeon_drm_winsys *ws)
  97 {
  98     int i;
  99
 100     csc->fd = ws->fd;
 101     csc->nrelocs = 512;
 102     csc->relocs_bo = (struct radeon_bo_item*)
 103                      CALLOC(1, csc->nrelocs * sizeof(csc->relocs_bo[0]));
 104     if (!csc->relocs_bo) {
 105         return false;
 106     }
 107
 108     csc->relocs = (struct drm_radeon_cs_reloc*)
 109                   CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
 110     if (!csc->relocs) {
 111         FREE(csc->relocs_bo);
 112         return false;
 113     }
 114
 115     csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
 116     csc->chunks[0].length_dw = 0;
 117     csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
 118     csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
 119     csc->chunks[1].length_dw = 0;
 120     csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
 121     csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
 122     csc->chunks[2].length_dw = 2;
 123     csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
 124
 125     csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
 126     csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
 127     csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
 128
 129     csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
 130
 131     for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
 132         csc->reloc_indices_hashlist[i] = -1;
 133     }
 134     return true;
 135 }
 136
 137 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
 138 {
 139     unsigned i;
 140
 141     for (i = 0; i < csc->crelocs; i++) {
 142         p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
 143         radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
 144     }
 145
 146     csc->crelocs = 0;
 147     csc->validated_crelocs = 0;
 148     csc->chunks[0].length_dw = 0;
 149     csc->chunks[1].length_dw = 0;
 150     csc->used_gart = 0;
 151     csc->used_vram = 0;
 152
 153     for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
 154         csc->reloc_indices_hashlist[i] = -1;
 155     }
 156 }
 157
 158 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
 159 {
 160     radeon_cs_context_cleanup(csc);
 161     FREE(csc->relocs_bo);
 162     FREE(csc->relocs);
 163 }
 164
 165
 166 static struct radeon_winsys_cs *
 167 radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
 168                      enum ring_type ring_type,
 169                      void (*flush)(void *ctx, unsigned flags,
 170                                    struct pipe_fence_handle **fence),
 171                      void *flush_ctx)
 172 {
 173     struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
 174     struct radeon_drm_cs *cs;
 175
 176     cs = CALLOC_STRUCT(radeon_drm_cs);
 177     if (!cs) {
 178         return NULL;
 179     }
 180     util_queue_fence_init(&cs->flush_completed);
 181
 182     cs->ws = ws;
 183     cs->flush_cs = flush;
 184     cs->flush_data = flush_ctx;
 185
 186     if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
 187         FREE(cs);
 188         return NULL;
 189     }
 190     if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
 191         radeon_destroy_cs_context(&cs->csc1);
 192         FREE(cs);
 193         return NULL;
 194     }
 195
 196     /* Set the first command buffer as current. */
 197     cs->csc = &cs->csc1;
 198     cs->cst = &cs->csc2;
 199     cs->base.current.buf = cs->csc->buf;
 200     cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
 201     cs->ring_type = ring_type;
 202
 203     p_atomic_inc(&ws->num_cs);
 204     return &cs->base;
 205 }
 206
 /*
  * Append one dword to the command stream and advance the write cursor.
  *
  * The expansion is fully parenthesized so the macro behaves as a single
  * expression wherever it is used. Note that `cs` is evaluated twice, so it
  * must not have side effects. No bounds check is performed.
  */
 #define OUT_CS(cs, value) \
     ((cs)->current.buf[(cs)->current.cdw++] = (value))
 208
 209 static inline void update_reloc(struct drm_radeon_cs_reloc *reloc,
 210                                 enum radeon_bo_domain rd,
 211                                 enum radeon_bo_domain wd,
 212                                 unsigned priority,
 213                                 enum radeon_bo_domain *added_domains)
 214 {
 215     *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
 216
 217     reloc->read_domains |= rd;
 218     reloc->write_domain |= wd;
 219     reloc->flags = MAX2(reloc->flags, priority);
 220 }
 221
 222 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
 223 {
 224     unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
 225     int i = csc->reloc_indices_hashlist[hash];
 226
 227     /* not found or found */
 228     if (i == -1 || csc->relocs_bo[i].bo == bo)
 229         return i;
 230
 231     /* Hash collision, look for the BO in the list of relocs linearly. */
 232     for (i = csc->crelocs - 1; i >= 0; i--) {
 233         if (csc->relocs_bo[i].bo == bo) {
 234             /* Put this reloc in the hash list.
 235              * This will prevent additional hash collisions if there are
 236              * several consecutive lookup_buffer calls for the same buffer.
 237              *
 238              * Example: Assuming buffers A,B,C collide in the hash list,
 239              * the following sequence of relocs:
 240              *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
 241              * will collide here: ^ and here:   ^,
 242              * meaning that we should get very few collisions in the end. */
 243             csc->reloc_indices_hashlist[hash] = i;
 244             return i;
 245         }
 246     }
 247     return -1;
 248 }
 249
 250 static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
 251                                  struct radeon_bo *bo,
 252                                  enum radeon_bo_usage usage,
 253                                  enum radeon_bo_domain domains,
 254                                  unsigned priority,
 255                                  enum radeon_bo_domain *added_domains)
 256 {
 257     struct radeon_cs_context *csc = cs->csc;
 258     struct drm_radeon_cs_reloc *reloc;
 259     unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
 260     enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
 261     enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
 262     int i = -1;
 263
 264     assert(priority < 64);
 265     *added_domains = 0;
 266
 267     i = radeon_lookup_buffer(csc, bo);
 268
 269     if (i >= 0) {
 270         reloc = &csc->relocs[i];
 271         update_reloc(reloc, rd, wd, priority / 4, added_domains);
 272         csc->relocs_bo[i].priority_usage |= 1llu << priority;
 273
 274         /* For async DMA, every add_buffer call must add a buffer to the list
 275          * no matter how many duplicates there are. This is due to the fact
 276          * the DMA CS checker doesn't use NOP packets for offset patching,
 277          * but always uses the i-th buffer from the list to patch the i-th
 278          * offset. If there are N offsets in a DMA CS, there must also be N
 279          * buffers in the relocation list.
 280          *
 281          * This doesn't have to be done if virtual memory is enabled,
 282          * because there is no offset patching with virtual memory.
 283          */
 284         if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
 285             return i;
 286         }
 287     }
 288
 289     /* New relocation, check if the backing array is large enough. */
 290     if (csc->crelocs >= csc->nrelocs) {
 291         uint32_t size;
 292         csc->nrelocs += 10;
 293
 294         size = csc->nrelocs * sizeof(csc->relocs_bo[0]);
 295         csc->relocs_bo = realloc(csc->relocs_bo, size);
 296
 297         size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
 298         csc->relocs = realloc(csc->relocs, size);
 299
 300         csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
 301     }
 302
 303     /* Initialize the new relocation. */
 304     csc->relocs_bo[csc->crelocs].bo = NULL;
 305     csc->relocs_bo[csc->crelocs].priority_usage = 1llu << priority;
 306     radeon_bo_reference(&csc->relocs_bo[csc->crelocs].bo, bo);
 307     p_atomic_inc(&bo->num_cs_references);
 308     reloc = &csc->relocs[csc->crelocs];
 309     reloc->handle = bo->handle;
 310     reloc->read_domains = rd;
 311     reloc->write_domain = wd;
 312     reloc->flags = priority / 4;
 313
 314     csc->reloc_indices_hashlist[hash] = csc->crelocs;
 315
 316     csc->chunks[1].length_dw += RELOC_DWORDS;
 317
 318     *added_domains = rd | wd;
 319     return csc->crelocs++;
 320 }
 321
 322 static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
 323                                         struct pb_buffer *buf,
 324                                         enum radeon_bo_usage usage,
 325                                         enum radeon_bo_domain domains,
 326                                         enum radeon_bo_priority priority)
 327 {
 328     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 329     struct radeon_bo *bo = (struct radeon_bo*)buf;
 330     enum radeon_bo_domain added_domains;
 331     unsigned index = radeon_add_buffer(cs, bo, usage, domains, priority,
 332                                        &added_domains);
 333
 334     if (added_domains & RADEON_DOMAIN_VRAM)
 335         cs->csc->used_vram += bo->base.size;
 336     else if (added_domains & RADEON_DOMAIN_GTT)
 337         cs->csc->used_gart += bo->base.size;
 338
 339     return index;
 340 }
 341
 342 static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
 343                                    struct pb_buffer *buf)
 344 {
 345     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 346
 347     return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
 348 }
 349
 350 static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
 351 {
 352     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 353     bool status =
 354         cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
 355         cs->csc->used_vram < cs->ws->info.vram_size * 0.8;
 356
 357     if (status) {
 358         cs->csc->validated_crelocs = cs->csc->crelocs;
 359     } else {
 360         /* Remove lately-added buffers. The validation failed with them
 361          * and the CS is about to be flushed because of that. Keep only
 362          * the already-validated buffers. */
 363         unsigned i;
 364
 365         for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
 366             p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
 367             radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
 368         }
 369         cs->csc->crelocs = cs->csc->validated_crelocs;
 370
 371         /* Flush if there are any relocs. Clean up otherwise. */
 372         if (cs->csc->crelocs) {
 373             cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
 374         } else {
 375             radeon_cs_context_cleanup(cs->csc);
 376
 377             assert(cs->base.current.cdw == 0);
 378             if (cs->base.current.cdw != 0) {
 379                 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
 380             }
 381         }
 382     }
 383     return status;
 384 }
 385
 386 static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
 387 {
 388    assert(rcs->current.cdw <= rcs->current.max_dw);
 389    return rcs->current.max_dw - rcs->current.cdw >= dw;
 390 }
 391
 392 static bool radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
 393 {
 394     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 395
 396     vram += cs->csc->used_vram;
 397     gtt += cs->csc->used_gart;
 398
 399     /* Anything that goes above the VRAM size should go to GTT. */
 400     if (vram > cs->ws->info.vram_size)
 401         gtt += vram - cs->ws->info.vram_size;
 402
 403     /* Now we just need to check if we have enough GTT. */
 404     return gtt < cs->ws->info.gart_size * 0.7;
 405 }
 406
 407 static uint64_t radeon_drm_cs_query_memory_usage(struct radeon_winsys_cs *rcs)
 408 {
 409    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 410
 411    return cs->csc->used_vram + cs->csc->used_gart;
 412 }
 413
 414 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
 415                                               struct radeon_bo_list_item *list)
 416 {
 417     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 418     int i;
 419
 420     if (list) {
 421         for (i = 0; i < cs->csc->crelocs; i++) {
 422             list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
 423             list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
 424             list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
 425         }
 426     }
 427     return cs->csc->crelocs;
 428 }
 429
 430 void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
 431 {
 432     struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
 433     unsigned i;
 434     int r;
 435
 436     r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
 437                             &csc->cs, sizeof(struct drm_radeon_cs));
 438     if (r) {
 439         if (r == -ENOMEM)
 440             fprintf(stderr, "radeon: Not enough memory for command submission.\n");
 441         else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
 442             unsigned i;
 443
 444             fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
 445             for (i = 0; i < csc->chunks[0].length_dw; i++) {
 446                 fprintf(stderr, "0x%08X\n", csc->buf[i]);
 447             }
 448         } else {
 449             fprintf(stderr, "radeon: The kernel rejected CS, "
 450                     "see dmesg for more information.\n");
 451         }
 452     }
 453
 454     for (i = 0; i < csc->crelocs; i++)
 455         p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
 456
 457     radeon_cs_context_cleanup(csc);
 458 }
 459
 460 /*
 461  * Make sure previous submission of this cs are completed
 462  */
 463 void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
 464 {
 465     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 466
 467     /* Wait for any pending ioctl of this CS to complete. */
 468     if (util_queue_is_initialized(&cs->ws->cs_queue))
 469         util_queue_job_wait(&cs->flush_completed);
 470 }
 471
 472 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
 473
 474 static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
 475                                unsigned flags,
 476                                struct pipe_fence_handle **fence)
 477 {
 478     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 479     struct radeon_cs_context *tmp;
 480
 481     switch (cs->ring_type) {
 482     case RING_DMA:
 483         /* pad DMA ring to 8 DWs */
 484         if (cs->ws->info.chip_class <= SI) {
 485             while (rcs->current.cdw & 7)
 486                 OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
 487         } else {
 488             while (rcs->current.cdw & 7)
 489                 OUT_CS(&cs->base, 0x00000000); /* NOP packet */
 490         }
 491         break;
 492     case RING_GFX:
 493         /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements
 494          * r6xx, requires at least 4 dw alignment to avoid a hw bug.
 495          */
 496         if (cs->ws->info.gfx_ib_pad_with_type2) {
 497             while (rcs->current.cdw & 7)
 498                 OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
 499         } else {
 500             while (rcs->current.cdw & 7)
 501                 OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
 502         }
 503         break;
 504     case RING_UVD:
 505         while (rcs->current.cdw & 15)
 506             OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
 507         break;
 508     default:
 509         break;
 510     }
 511
 512     if (rcs->current.cdw > rcs->current.max_dw) {
 513        fprintf(stderr, "radeon: command stream overflowed\n");
 514     }
 515
 516     if (fence) {
 517         radeon_fence_reference(fence, NULL);
 518         *fence = radeon_cs_create_fence(rcs);
 519     }
 520
 521     radeon_drm_cs_sync_flush(rcs);
 522
 523     /* Swap command streams. */
 524     tmp = cs->csc;
 525     cs->csc = cs->cst;
 526     cs->cst = tmp;
 527
 528     /* If the CS is not empty or overflowed, emit it in a separate thread. */
 529     if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
 530         unsigned i, crelocs;
 531
 532         crelocs = cs->cst->crelocs;
 533
 534         cs->cst->chunks[0].length_dw = cs->base.current.cdw;
 535
 536         for (i = 0; i < crelocs; i++) {
 537             /* Update the number of active asynchronous CS ioctls for the buffer. */
 538             p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
 539         }
 540
 541         switch (cs->ring_type) {
 542         case RING_DMA:
 543             cs->cst->flags[0] = 0;
 544             cs->cst->flags[1] = RADEON_CS_RING_DMA;
 545             cs->cst->cs.num_chunks = 3;
 546             if (cs->ws->info.has_virtual_memory) {
 547                 cs->cst->flags[0] |= RADEON_CS_USE_VM;
 548             }
 549             break;
 550
 551         case RING_UVD:
 552             cs->cst->flags[0] = 0;
 553             cs->cst->flags[1] = RADEON_CS_RING_UVD;
 554             cs->cst->cs.num_chunks = 3;
 555             break;
 556
 557         case RING_VCE:
 558             cs->cst->flags[0] = 0;
 559             cs->cst->flags[1] = RADEON_CS_RING_VCE;
 560             cs->cst->cs.num_chunks = 3;
 561             break;
 562
 563         default:
 564         case RING_GFX:
 565         case RING_COMPUTE:
 566             cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
 567             cs->cst->flags[1] = RADEON_CS_RING_GFX;
 568             cs->cst->cs.num_chunks = 3;
 569
 570             if (cs->ws->info.has_virtual_memory) {
 571                 cs->cst->flags[0] |= RADEON_CS_USE_VM;
 572                 cs->cst->cs.num_chunks = 3;
 573             }
 574             if (flags & RADEON_FLUSH_END_OF_FRAME) {
 575                 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
 576                 cs->cst->cs.num_chunks = 3;
 577             }
 578             if (cs->ring_type == RING_COMPUTE) {
 579                 cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
 580                 cs->cst->cs.num_chunks = 3;
 581             }
 582             break;
 583         }
 584
 585         if (util_queue_is_initialized(&cs->ws->cs_queue)) {
 586             util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
 587                                radeon_drm_cs_emit_ioctl_oneshot, NULL);
 588             if (!(flags & RADEON_FLUSH_ASYNC))
 589                 radeon_drm_cs_sync_flush(rcs);
 590         } else {
 591             radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
 592         }
 593     } else {
 594         radeon_cs_context_cleanup(cs->cst);
 595     }
 596
 597     /* Prepare a new CS. */
 598     cs->base.current.buf = cs->csc->buf;
 599     cs->base.current.cdw = 0;
 600
 601     cs->ws->num_cs_flushes++;
 602     return 0;
 603 }
 604
 605 static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
 606 {
 607     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 608
 609     radeon_drm_cs_sync_flush(rcs);
 610     util_queue_fence_destroy(&cs->flush_completed);
 611     radeon_cs_context_cleanup(&cs->csc1);
 612     radeon_cs_context_cleanup(&cs->csc2);
 613     p_atomic_dec(&cs->ws->num_cs);
 614     radeon_destroy_cs_context(&cs->csc1);
 615     radeon_destroy_cs_context(&cs->csc2);
 616     FREE(cs);
 617 }
 618
 619 static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
 620                                     struct pb_buffer *_buf,
 621                                     enum radeon_bo_usage usage)
 622 {
 623     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 624     struct radeon_bo *bo = (struct radeon_bo*)_buf;
 625     int index;
 626
 627     if (!bo->num_cs_references)
 628         return false;
 629
 630     index = radeon_lookup_buffer(cs->csc, bo);
 631     if (index == -1)
 632         return false;
 633
 634     if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
 635         return true;
 636     if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
 637         return true;
 638
 639     return false;
 640 }
 641
 642 /* FENCES */
 643
 644 static struct pipe_fence_handle *
 645 radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
 646 {
 647     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 648     struct pb_buffer *fence;
 649
 650     /* Create a fence, which is a dummy BO. */
 651     fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
 652                                        RADEON_DOMAIN_GTT, 0);
 653     /* Add the fence as a dummy relocation. */
 654     cs->ws->base.cs_add_buffer(rcs, fence,
 655                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
 656                               RADEON_PRIO_FENCE);
 657     return (struct pipe_fence_handle*)fence;
 658 }
 659
 660 static bool radeon_fence_wait(struct radeon_winsys *ws,
 661                               struct pipe_fence_handle *fence,
 662                               uint64_t timeout)
 663 {
 664     return ws->buffer_wait((struct pb_buffer*)fence, timeout,
 665                            RADEON_USAGE_READWRITE);
 666 }
 667
 /**
  * winsys fence_reference hook.
  *
  * Fences are buffers in disguise, so both arguments are re-typed as buffer
  * pointers and handed to pb_reference.
  */
 static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                    struct pipe_fence_handle *src)
 {
     struct pb_buffer **dst_buf = (struct pb_buffer**)dst;
     struct pb_buffer *src_buf = (struct pb_buffer*)src;

     pb_reference(dst_buf, src_buf);
 }
 673
 674 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
 675 {
 676     ws->base.ctx_create = radeon_drm_ctx_create;
 677     ws->base.ctx_destroy = radeon_drm_ctx_destroy;
 678     ws->base.cs_create = radeon_drm_cs_create;
 679     ws->base.cs_destroy = radeon_drm_cs_destroy;
 680     ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
 681     ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
 682     ws->base.cs_validate = radeon_drm_cs_validate;
 683     ws->base.cs_check_space = radeon_drm_cs_check_space;
 684     ws->base.cs_memory_below_limit = radeon_drm_cs_memory_below_limit;
 685     ws->base.cs_query_memory_usage = radeon_drm_cs_query_memory_usage;
 686     ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
 687     ws->base.cs_flush = radeon_drm_cs_flush;
 688     ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
 689     ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
 690     ws->base.fence_wait = radeon_fence_wait;
 691     ws->base.fence_reference = radeon_fence_reference;
 692 }