gallium/u_queue: allow the execute function to differ per job
[mesa.git] / src / gallium / winsys / radeon / drm / radeon_drm_cs.c
index 951791a17270ee3fd890d05fdab6c295394812c7..efefd7517e78ee60b0fe7a79ae7cd8f7188256c5 100644 (file)
 
 /*
     This file replaces libdrm's radeon_cs_gem with our own implemention.
-    It's optimized specifically for r300g, but r600g could use it as well.
-    Reloc writes and space checking are faster and simpler than their
+    It's optimized specifically for Radeon DRM.
+    Adding buffers and space checking are faster and simpler than their
     counterparts in libdrm (the time complexity of all the functions
     is O(1) in nearly all scenarios, thanks to hashing).
 
     It works like this:
 
-    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
+    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
     also adds the size of 'buf' to the used_gart and used_vram winsys variables
     based on the domains, which are simply or'd for the accounting purposes.
     The adding is skipped if the reloc is already present in the list, but it
     (done in the pipe driver)
 
     cs_write_reloc(cs, buf) just writes a reloc that has been added using
-    cs_add_reloc. The read_domain and write_domain parameters have been removed,
-    because we already specify them in cs_add_reloc.
+    cs_add_buffer. The read_domain and write_domain parameters have been removed,
+    because we already specify them in cs_add_buffer.
 */
 
 #include "radeon_drm_cs.h"
 
 #include "util/u_memory.h"
+#include "os/os_time.h"
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <xf86drm.h>
 
+
 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
 
-static boolean radeon_init_cs_context(struct radeon_cs_context *csc, int fd)
+static struct pipe_fence_handle *
+radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
+static void radeon_fence_reference(struct pipe_fence_handle **dst,
+                                   struct pipe_fence_handle *src);
+
+static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
+{
+    /* No context support here. Just return the winsys pointer
+     * as the "context". */
+    return (struct radeon_winsys_ctx*)ws;
+}
+
+static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
+{
+    /* No context support here. */
+}
+
+static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
+                                      struct radeon_drm_winsys *ws)
 {
-    csc->fd = fd;
+    int i;
+
+    csc->fd = ws->fd;
     csc->nrelocs = 512;
-    csc->relocs_bo = (struct radeon_bo**)
-                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
+    csc->relocs_bo = (struct radeon_bo_item*)
+                     CALLOC(1, csc->nrelocs * sizeof(csc->relocs_bo[0]));
     if (!csc->relocs_bo) {
         return FALSE;
     }
@@ -96,12 +118,19 @@ static boolean radeon_init_cs_context(struct radeon_cs_context *csc, int fd)
     csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
     csc->chunks[1].length_dw = 0;
     csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
+    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
+    csc->chunks[2].length_dw = 2;
+    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
 
     csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
     csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
+    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
 
-    csc->cs.num_chunks = 2;
     csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
+
+    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
+        csc->reloc_indices_hashlist[i] = -1;
+    }
     return TRUE;
 }
 
@@ -110,17 +139,20 @@ static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
     unsigned i;
 
     for (i = 0; i < csc->crelocs; i++) {
-        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
-        radeon_bo_unref(csc->relocs_bo[i]);
-        csc->relocs_bo[i] = NULL;
+        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
+        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
     }
 
     csc->crelocs = 0;
+    csc->validated_crelocs = 0;
     csc->chunks[0].length_dw = 0;
     csc->chunks[1].length_dw = 0;
     csc->used_gart = 0;
     csc->used_vram = 0;
-    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
+
+    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
+        csc->reloc_indices_hashlist[i] = -1;
+    }
 }
 
 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
@@ -130,23 +162,32 @@ static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
     FREE(csc->relocs);
 }
 
-static struct r300_winsys_cs *radeon_drm_cs_create(struct r300_winsys_screen *rws)
+
+static struct radeon_winsys_cs *
+radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
+                     enum ring_type ring_type,
+                     void (*flush)(void *ctx, unsigned flags,
+                                   struct pipe_fence_handle **fence),
+                     void *flush_ctx)
 {
-    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
+    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
     struct radeon_drm_cs *cs;
 
     cs = CALLOC_STRUCT(radeon_drm_cs);
     if (!cs) {
         return NULL;
     }
+    util_queue_fence_init(&cs->flush_completed);
 
     cs->ws = ws;
+    cs->flush_cs = flush;
+    cs->flush_data = flush_ctx;
 
-    if (!radeon_init_cs_context(&cs->csc1, cs->ws->fd)) {
+    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
         FREE(cs);
         return NULL;
     }
-    if (!radeon_init_cs_context(&cs->csc2, cs->ws->fd)) {
+    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
         radeon_destroy_cs_context(&cs->csc1);
         FREE(cs);
         return NULL;
@@ -155,99 +196,93 @@ static struct r300_winsys_cs *radeon_drm_cs_create(struct r300_winsys_screen *rw
     /* Set the first command buffer as current. */
     cs->csc = &cs->csc1;
     cs->cst = &cs->csc2;
-    cs->base.buf = cs->csc->buf;
+    cs->base.current.buf = cs->csc->buf;
+    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
+    cs->ring_type = ring_type;
 
     p_atomic_inc(&ws->num_cs);
     return &cs->base;
 }
 
-#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
+#define OUT_CS(cs, value) (cs)->current.buf[(cs)->current.cdw++] = (value)
 
-static INLINE void update_domains(struct drm_radeon_cs_reloc *reloc,
-                                  enum r300_buffer_domain rd,
-                                  enum r300_buffer_domain wd,
-                                  enum r300_buffer_domain *added_domains)
+static inline void update_reloc(struct drm_radeon_cs_reloc *reloc,
+                                enum radeon_bo_domain rd,
+                                enum radeon_bo_domain wd,
+                                unsigned priority,
+                                enum radeon_bo_domain *added_domains)
 {
     *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
 
-    if (reloc->read_domains & wd) {
-        reloc->read_domains = rd;
-        reloc->write_domain = wd;
-    } else if (rd & reloc->write_domain) {
-        reloc->read_domains = rd;
-        reloc->write_domain |= wd;
-    } else {
-        reloc->read_domains |= rd;
-        reloc->write_domain |= wd;
-    }
+    reloc->read_domains |= rd;
+    reloc->write_domain |= wd;
+    reloc->flags = MAX2(reloc->flags, priority);
 }
 
-int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
+int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
 {
-    struct drm_radeon_cs_reloc *reloc;
-    unsigned i;
-    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
-
-    if (csc->is_handle_added[hash]) {
-        reloc = csc->relocs_hashlist[hash];
-        if (reloc->handle == bo->handle) {
-            return csc->reloc_indices_hashlist[hash];
-        }
-
-        /* Hash collision, look for the BO in the list of relocs linearly. */
-        for (i = csc->crelocs; i != 0;) {
-            --i;
-            reloc = &csc->relocs[i];
-            if (reloc->handle == bo->handle) {
-                /* Put this reloc in the hash list.
-                 * This will prevent additional hash collisions if there are
-                 * several subsequent get_reloc calls of the same buffer.
-                 *
-                 * Example: Assuming buffers A,B,C collide in the hash list,
-                 * the following sequence of relocs:
-                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
-                 * will collide here: ^ and here:   ^,
-                 * meaning that we should get very few collisions in the end. */
-                csc->relocs_hashlist[hash] = reloc;
-                csc->reloc_indices_hashlist[hash] = i;
-                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
-                return i;
-            }
+    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+    int i = csc->reloc_indices_hashlist[hash];
+
+    /* not found or found */
+    if (i == -1 || csc->relocs_bo[i].bo == bo)
+        return i;
+
+    /* Hash collision, look for the BO in the list of relocs linearly. */
+    for (i = csc->crelocs - 1; i >= 0; i--) {
+        if (csc->relocs_bo[i].bo == bo) {
+            /* Put this reloc in the hash list.
+             * This will prevent additional hash collisions if there are
+             * several consecutive lookup_buffer calls for the same buffer.
+             *
+             * Example: Assuming buffers A,B,C collide in the hash list,
+             * the following sequence of relocs:
+             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
+             * will collide here: ^ and here:   ^,
+             * meaning that we should get very few collisions in the end. */
+            csc->reloc_indices_hashlist[hash] = i;
+            return i;
         }
     }
-
     return -1;
 }
 
-static void radeon_add_reloc(struct radeon_cs_context *csc,
-                             struct radeon_bo *bo,
-                             enum r300_buffer_domain rd,
-                             enum r300_buffer_domain wd,
-                             enum r300_buffer_domain *added_domains)
+static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
+                                 struct radeon_bo *bo,
+                                 enum radeon_bo_usage usage,
+                                 enum radeon_bo_domain domains,
+                                 unsigned priority,
+                                 enum radeon_bo_domain *added_domains)
 {
+    struct radeon_cs_context *csc = cs->csc;
     struct drm_radeon_cs_reloc *reloc;
-    unsigned i;
-    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
-
-    if (csc->is_handle_added[hash]) {
-        reloc = csc->relocs_hashlist[hash];
-        if (reloc->handle == bo->handle) {
-            update_domains(reloc, rd, wd, added_domains);
-            return;
-        }
-
-        /* Hash collision, look for the BO in the list of relocs linearly. */
-        for (i = csc->crelocs; i != 0;) {
-            --i;
-            reloc = &csc->relocs[i];
-            if (reloc->handle == bo->handle) {
-                update_domains(reloc, rd, wd, added_domains);
-
-                csc->relocs_hashlist[hash] = reloc;
-                csc->reloc_indices_hashlist[hash] = i;
-                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
-                return;
-            }
+    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
+    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
+    int i = -1;
+
+    assert(priority < 64);
+    *added_domains = 0;
+
+    i = radeon_lookup_buffer(csc, bo);
+
+    if (i >= 0) {
+        reloc = &csc->relocs[i];
+        update_reloc(reloc, rd, wd, priority / 4, added_domains);
+        csc->relocs_bo[i].priority_usage |= 1llu << priority;
+
+        /* For async DMA, every add_buffer call must add a buffer to the list
+         * no matter how many duplicates there are. This is due to the fact
+         * the DMA CS checker doesn't use NOP packets for offset patching,
+         * but always uses the i-th buffer from the list to patch the i-th
+         * offset. If there are N offsets in a DMA CS, there must also be N
+         * buffers in the relocation list.
+         *
+         * This doesn't have to be done if virtual memory is enabled,
+         * because there is no offset patching with virtual memory.
+         */
+        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
+            return i;
         }
     }
 
@@ -256,88 +291,154 @@ static void radeon_add_reloc(struct radeon_cs_context *csc,
         uint32_t size;
         csc->nrelocs += 10;
 
-        size = csc->nrelocs * sizeof(struct radeon_bo*);
-        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);
+        size = csc->nrelocs * sizeof(csc->relocs_bo[0]);
+        csc->relocs_bo = realloc(csc->relocs_bo, size);
 
         size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
-        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);
+        csc->relocs = realloc(csc->relocs, size);
 
         csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
     }
 
     /* Initialize the new relocation. */
-    radeon_bo_ref(bo);
+    csc->relocs_bo[csc->crelocs].bo = NULL;
+    csc->relocs_bo[csc->crelocs].priority_usage = 1llu << priority;
+    radeon_bo_reference(&csc->relocs_bo[csc->crelocs].bo, bo);
     p_atomic_inc(&bo->num_cs_references);
-    csc->relocs_bo[csc->crelocs] = bo;
     reloc = &csc->relocs[csc->crelocs];
     reloc->handle = bo->handle;
     reloc->read_domains = rd;
     reloc->write_domain = wd;
-    reloc->flags = 0;
+    reloc->flags = priority / 4;
 
-    csc->is_handle_added[hash] = TRUE;
-    csc->relocs_hashlist[hash] = reloc;
     csc->reloc_indices_hashlist[hash] = csc->crelocs;
 
     csc->chunks[1].length_dw += RELOC_DWORDS;
-    csc->crelocs++;
 
     *added_domains = rd | wd;
+    return csc->crelocs++;
 }
 
-static void radeon_drm_cs_add_reloc(struct r300_winsys_cs *rcs,
-                                    struct r300_winsys_cs_handle *buf,
-                                    enum r300_buffer_domain rd,
-                                    enum r300_buffer_domain wd)
+static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
+                                        struct pb_buffer *buf,
+                                        enum radeon_bo_usage usage,
+                                        enum radeon_bo_domain domains,
+                                        enum radeon_bo_priority priority)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_bo *bo = (struct radeon_bo*)buf;
-    enum r300_buffer_domain added_domains;
-
-    radeon_add_reloc(cs->csc, bo, rd, wd, &added_domains);
+    enum radeon_bo_domain added_domains;
+    unsigned index = radeon_add_buffer(cs, bo, usage, domains, priority,
+                                       &added_domains);
 
-    if (!added_domains)
-        return;
+    if (added_domains & RADEON_DOMAIN_VRAM)
+        cs->csc->used_vram += bo->base.size;
+    else if (added_domains & RADEON_DOMAIN_GTT)
+        cs->csc->used_gart += bo->base.size;
 
-    if (added_domains & R300_DOMAIN_GTT)
-        cs->csc->used_gart += bo->size;
-    if (added_domains & R300_DOMAIN_VRAM)
-        cs->csc->used_vram += bo->size;
+    return index;
 }
 
-static boolean radeon_drm_cs_validate(struct r300_winsys_cs *rcs)
+static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
+                                   struct pb_buffer *buf)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 
-    return cs->csc->used_gart < cs->ws->gart_size * 0.8 &&
-           cs->csc->used_vram < cs->ws->vram_size * 0.8;
+    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
 }
 
-static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs,
-                                      struct r300_winsys_cs_handle *buf)
+static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
-    struct radeon_bo *bo = (struct radeon_bo*)buf;
+    boolean status =
+        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
+        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;
 
-    unsigned index = radeon_get_reloc(cs->csc, bo);
+    if (status) {
+        cs->csc->validated_crelocs = cs->csc->crelocs;
+    } else {
+        /* Remove lately-added buffers. The validation failed with them
+         * and the CS is about to be flushed because of that. Keep only
+         * the already-validated buffers. */
+        unsigned i;
+
+        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
+            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
+            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
+        }
+        cs->csc->crelocs = cs->csc->validated_crelocs;
+
+        /* Flush if there are any relocs. Clean up otherwise. */
+        if (cs->csc->crelocs) {
+            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
+        } else {
+            radeon_cs_context_cleanup(cs->csc);
 
-    if (index == -1) {
-        fprintf(stderr, "r300: Cannot get a relocation in %s.\n", __func__);
-        return;
+            assert(cs->base.current.cdw == 0);
+            if (cs->base.current.cdw != 0) {
+                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
+            }
+        }
     }
+    return status;
+}
 
-    OUT_CS(&cs->base, 0xc0001000);
-    OUT_CS(&cs->base, index * RELOC_DWORDS);
+static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
+{
+   assert(rcs->current.cdw <= rcs->current.max_dw);
+   return rcs->current.max_dw - rcs->current.cdw >= dw;
 }
 
-static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
+static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
 {
-    struct radeon_cs_context *csc = (struct radeon_cs_context*)param;
-    unsigned i;
+    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+
+    vram += cs->csc->used_vram;
+    gtt += cs->csc->used_gart;
 
-    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
-                            &csc->cs, sizeof(struct drm_radeon_cs))) {
-        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
+    /* Anything that goes above the VRAM size should go to GTT. */
+    if (vram > cs->ws->info.vram_size)
+        gtt += vram - cs->ws->info.vram_size;
+
+    /* Now we just need to check if we have enough GTT. */
+    return gtt < cs->ws->info.gart_size * 0.7;
+}
+
+static uint64_t radeon_drm_cs_query_memory_usage(struct radeon_winsys_cs *rcs)
+{
+   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+
+   return cs->csc->used_vram + cs->csc->used_gart;
+}
+
+static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
+                                              struct radeon_bo_list_item *list)
+{
+    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+    int i;
+
+    if (list) {
+        for (i = 0; i < cs->csc->crelocs; i++) {
+            pb_reference(&list[i].buf, &cs->csc->relocs_bo[i].bo->base);
+            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
+            list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
+        }
+    }
+    return cs->csc->crelocs;
+}
+
+void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
+{
+    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
+    unsigned i;
+    int r;
+
+    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
+                            &csc->cs, sizeof(struct drm_radeon_cs));
+    if (r) {
+       if (r == -ENOMEM)
+           fprintf(stderr, "radeon: Not enough memory for command submission.\n");
+       else if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
             unsigned i;
 
             fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
@@ -351,62 +452,164 @@ static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
     }
 
     for (i = 0; i < csc->crelocs; i++)
-        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);
-    return NULL;
+        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
+
+    radeon_cs_context_cleanup(csc);
 }
 
-void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
+/*
+ * Make sure previous submission of this cs are completed
+ */
+void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
 {
-    /* Wait for any pending ioctl to complete. */
-    if (cs->thread) {
-        pipe_thread_wait(cs->thread);
-        cs->thread = 0;
-    }
+    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+
+    /* Wait for any pending ioctl of this CS to complete. */
+    if (util_queue_is_initialized(&cs->ws->cs_queue))
+        util_queue_job_wait(&cs->flush_completed);
 }
 
-DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
+DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
 
-static void radeon_drm_cs_flush(struct r300_winsys_cs *rcs, unsigned flags)
+static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
+                                unsigned flags,
+                                struct pipe_fence_handle **fence)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_cs_context *tmp;
 
-    radeon_drm_cs_sync_flush(cs);
-
-    /* If the CS is not empty, emit it in a newly-spawned thread. */
-    if (cs->base.cdw) {
-        unsigned i, crelocs = cs->csc->crelocs;
-
-        cs->csc->chunks[0].length_dw = cs->base.cdw;
-
-        for (i = 0; i < crelocs; i++)
-            p_atomic_inc(&cs->csc->relocs_bo[i]->num_active_ioctls);
-
-        if (cs->ws->num_cpus > 1 && debug_get_option_thread() &&
-            (flags & R300_FLUSH_ASYNC)) {
-            cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs->csc);
-            assert(cs->thread);
+    switch (cs->ring_type) {
+    case RING_DMA:
+        /* pad DMA ring to 8 DWs */
+        if (cs->ws->info.chip_class <= SI) {
+            while (rcs->current.cdw & 7)
+                OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
         } else {
-            radeon_drm_cs_emit_ioctl(cs->csc);
+            while (rcs->current.cdw & 7)
+                OUT_CS(&cs->base, 0x00000000); /* NOP packet */
         }
+        break;
+    case RING_GFX:
+        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements
+         * r6xx, requires at least 4 dw alignment to avoid a hw bug.
+         */
+        if (cs->ws->info.gfx_ib_pad_with_type2) {
+            while (rcs->current.cdw & 7)
+                OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
+        } else {
+            while (rcs->current.cdw & 7)
+                OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
+        }
+        break;
+    case RING_UVD:
+        while (rcs->current.cdw & 15)
+            OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
+        break;
+    default:
+        break;
+    }
+
+    if (rcs->current.cdw > rcs->current.max_dw) {
+       fprintf(stderr, "radeon: command stream overflowed\n");
     }
 
-    /* Flip command streams. */
+    if (fence) {
+        radeon_fence_reference(fence, NULL);
+        *fence = radeon_cs_create_fence(rcs);
+    }
+
+    radeon_drm_cs_sync_flush(rcs);
+
+    /* Swap command streams. */
     tmp = cs->csc;
     cs->csc = cs->cst;
     cs->cst = tmp;
 
+    /* If the CS is not empty or overflowed, emit it in a separate thread. */
+    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
+        unsigned i, crelocs;
+
+        crelocs = cs->cst->crelocs;
+
+        cs->cst->chunks[0].length_dw = cs->base.current.cdw;
+
+        for (i = 0; i < crelocs; i++) {
+            /* Update the number of active asynchronous CS ioctls for the buffer. */
+            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
+        }
+
+        switch (cs->ring_type) {
+        case RING_DMA:
+            cs->cst->flags[0] = 0;
+            cs->cst->flags[1] = RADEON_CS_RING_DMA;
+            cs->cst->cs.num_chunks = 3;
+            if (cs->ws->info.has_virtual_memory) {
+                cs->cst->flags[0] |= RADEON_CS_USE_VM;
+            }
+            break;
+
+        case RING_UVD:
+            cs->cst->flags[0] = 0;
+            cs->cst->flags[1] = RADEON_CS_RING_UVD;
+            cs->cst->cs.num_chunks = 3;
+            break;
+
+        case RING_VCE:
+            cs->cst->flags[0] = 0;
+            cs->cst->flags[1] = RADEON_CS_RING_VCE;
+            cs->cst->cs.num_chunks = 3;
+            break;
+
+        default:
+        case RING_GFX:
+        case RING_COMPUTE:
+            cs->cst->flags[0] = 0;
+            cs->cst->flags[1] = RADEON_CS_RING_GFX;
+            cs->cst->cs.num_chunks = 2;
+            if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
+                cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
+                cs->cst->cs.num_chunks = 3;
+            }
+            if (cs->ws->info.has_virtual_memory) {
+                cs->cst->flags[0] |= RADEON_CS_USE_VM;
+                cs->cst->cs.num_chunks = 3;
+            }
+            if (flags & RADEON_FLUSH_END_OF_FRAME) {
+                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
+                cs->cst->cs.num_chunks = 3;
+            }
+            if (cs->ring_type == RING_COMPUTE) {
+                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
+                cs->cst->cs.num_chunks = 3;
+            }
+            break;
+        }
+
+        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
+            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
+                               radeon_drm_cs_emit_ioctl_oneshot);
+            if (!(flags & RADEON_FLUSH_ASYNC))
+                radeon_drm_cs_sync_flush(rcs);
+        } else {
+            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
+        }
+    } else {
+        radeon_cs_context_cleanup(cs->cst);
+    }
+
     /* Prepare a new CS. */
-    radeon_cs_context_cleanup(cs->csc);
+    cs->base.current.buf = cs->csc->buf;
+    cs->base.current.cdw = 0;
 
-    cs->base.buf = cs->csc->buf;
-    cs->base.cdw = 0;
+    cs->ws->num_cs_flushes++;
 }
 
-static void radeon_drm_cs_destroy(struct r300_winsys_cs *rcs)
+static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
-    radeon_drm_cs_sync_flush(cs);
+
+    radeon_drm_cs_sync_flush(rcs);
+    util_queue_fence_destroy(&cs->flush_completed);
     radeon_cs_context_cleanup(&cs->csc1);
     radeon_cs_context_cleanup(&cs->csc2);
     p_atomic_dec(&cs->ws->num_cs);
@@ -415,32 +618,77 @@ static void radeon_drm_cs_destroy(struct r300_winsys_cs *rcs)
     FREE(cs);
 }
 
-static void radeon_drm_cs_set_flush(struct r300_winsys_cs *rcs,
-                                    void (*flush)(void *ctx, unsigned flags),
-                                    void *user)
+static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
+                                       struct pb_buffer *_buf,
+                                       enum radeon_bo_usage usage)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
-    cs->flush_cs = flush;
-    cs->flush_data = user;
+    struct radeon_bo *bo = (struct radeon_bo*)_buf;
+    int index;
+
+    if (!bo->num_cs_references)
+        return FALSE;
+
+    index = radeon_lookup_buffer(cs->csc, bo);
+    if (index == -1)
+        return FALSE;
+
+    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
+        return TRUE;
+    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
+        return TRUE;
+
+    return FALSE;
 }
 
-static boolean radeon_bo_is_referenced(struct r300_winsys_cs *rcs,
-                                       struct r300_winsys_cs_handle *_buf)
+/* FENCES */
+
+static struct pipe_fence_handle *
+radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
-    struct radeon_bo *bo = (struct radeon_bo*)_buf;
+    struct pb_buffer *fence;
+
+    /* Create a fence, which is a dummy BO. */
+    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
+                                       RADEON_DOMAIN_GTT, 0);
+    /* Add the fence as a dummy relocation. */
+    cs->ws->base.cs_add_buffer(rcs, fence,
+                              RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
+                              RADEON_PRIO_FENCE);
+    return (struct pipe_fence_handle*)fence;
+}
+
+static bool radeon_fence_wait(struct radeon_winsys *ws,
+                              struct pipe_fence_handle *fence,
+                              uint64_t timeout)
+{
+    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
+                           RADEON_USAGE_READWRITE);
+}
 
-    return radeon_bo_is_referenced_by_cs(cs, bo);
+static void radeon_fence_reference(struct pipe_fence_handle **dst,
+                                   struct pipe_fence_handle *src)
+{
+    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
 }
 
 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
 {
+    ws->base.ctx_create = radeon_drm_ctx_create;
+    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
     ws->base.cs_create = radeon_drm_cs_create;
     ws->base.cs_destroy = radeon_drm_cs_destroy;
-    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
+    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
+    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
     ws->base.cs_validate = radeon_drm_cs_validate;
-    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
+    ws->base.cs_check_space = radeon_drm_cs_check_space;
+    ws->base.cs_memory_below_limit = radeon_drm_cs_memory_below_limit;
+    ws->base.cs_query_memory_usage = radeon_drm_cs_query_memory_usage;
+    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
     ws->base.cs_flush = radeon_drm_cs_flush;
-    ws->base.cs_set_flush = radeon_drm_cs_set_flush;
     ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
+    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
+    ws->base.fence_wait = radeon_fence_wait;
+    ws->base.fence_reference = radeon_fence_reference;
 }