r300g,radeong: finish and enable the immediate mode
author Marek Olšák <maraeo@gmail.com>
Sun, 17 Jan 2010 23:15:52 +0000 (00:15 +0100)
committer Corbin Simpson <MostAwesomeDude@gmail.com>
Mon, 25 Jan 2010 07:03:29 +0000 (23:03 -0800)
Nearly 100% performance increase in glxgears.

src/gallium/drivers/r300/r300_emit.c
src/gallium/drivers/r300/r300_render.c
src/gallium/drivers/r300/r300_state.c
src/gallium/winsys/drm/radeon/core/radeon_buffer.c
src/gallium/winsys/drm/radeon/core/radeon_buffer.h
src/gallium/winsys/drm/radeon/core/radeon_drm.h
src/gallium/winsys/drm/radeon/core/radeon_r300.c
src/gallium/winsys/drm/radeon/core/radeon_winsys.h

index 36d2c64b587e0637e65b7f197db52482b4da8f63..badbf3715c77536642c37af8eab21b04635ce173 100644 (file)
@@ -772,22 +772,6 @@ void r300_emit_texture(struct r300_context* r300,
     END_CS;
 }
 
-static boolean r300_validate_aos(struct r300_context *r300)
-{
-    struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
-    struct pipe_vertex_element *velem = r300->vertex_element;
-    int i;
-
-    /* Check if formats and strides are aligned to the size of DWORD. */
-    for (i = 0; i < r300->vertex_element_count; i++) {
-        if (vbuf[velem[i].vertex_buffer_index].stride % 4 != 0 ||
-            util_format_get_blocksize(velem[i].src_format) % 4 != 0) {
-            return FALSE;
-        }
-    }
-    return TRUE;
-}
-
 void r300_emit_aos(struct r300_context* r300, unsigned offset)
 {
     struct pipe_vertex_buffer *vb1, *vb2, *vbuf = r300->vertex_buffer;
@@ -797,12 +781,6 @@ void r300_emit_aos(struct r300_context* r300, unsigned offset)
     unsigned packet_size = (aos_count * 3 + 1) / 2;
     CS_LOCALS(r300);
 
-    /* XXX Move this checking to a more approriate place. */
-    if (!r300_validate_aos(r300)) {
-        /* XXX We should fallback using Draw. */
-        assert(0);
-    }
-
     BEGIN_CS(2 + packet_size + aos_count * 2);
     OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, packet_size);
     OUT_CS(aos_count);
index 677031ef04ebc46b43072bce346d55037a1def59..7f095bffe7c6a22e6e2942d1b669d6ad176308cc 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "pipe/p_inlines.h"
 
+#include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
 
@@ -114,20 +115,53 @@ static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
     return color_control;
 }
 
-static void r300_emit_draw_immediate(struct r300_context *r300,
-                                     unsigned mode,
-                                     unsigned start,
-                                     unsigned count)
+
+static void r300_emit_draw_arrays_immediate(struct r300_context *r300,
+                                            unsigned mode,
+                                            unsigned start,
+                                            unsigned count)
 {
-    struct pipe_buffer* vbo = r300->vertex_buffer[0].buffer;
-    unsigned vertex_size = r300->vertex_buffer[0].stride / sizeof(float);
-    unsigned i;
-    uint32_t* map;
+    struct pipe_vertex_element* velem;
+    struct pipe_vertex_buffer* vbuf;
+    unsigned vertex_element_count = r300->vertex_element_count;
+    unsigned i, v, vbi, dw, elem_offset;
+
+    /* Size of the vertex, in dwords. */
+    unsigned vertex_size = 0;
+
+    /* Offsets of the attribute, in dwords, from the start of the vertex. */
+    unsigned offset[PIPE_MAX_ATTRIBS];
+
+    /* Size of the vertex element, in dwords. */
+    unsigned size[PIPE_MAX_ATTRIBS];
+
+    /* Stride to the same attrib in the next vertex in the vertex buffer,
+     * in dwords. */
+    unsigned stride[PIPE_MAX_ATTRIBS];
+
+    /* Mapped vertex buffers. */
+    uint32_t* map[PIPE_MAX_ATTRIBS] = {0};
+
     CS_LOCALS(r300);
 
-    map = (uint32_t*)pipe_buffer_map_range(r300->context.screen, vbo,
-            start * vertex_size, count * vertex_size,
-            PIPE_BUFFER_USAGE_CPU_READ);
+    /* Calculate the vertex size, offsets, strides etc. and map the buffers. */
+    for (i = 0; i < vertex_element_count; i++) {
+        velem = &r300->vertex_element[i];
+        offset[i] = velem->src_offset >> 2;
+        size[i] = util_format_get_blocksize(velem->src_format) >> 2;
+        vertex_size += size[i];
+        vbi = velem->vertex_buffer_index;
+
+        /* Map the buffer. */
+        if (!map[vbi]) {
+            vbuf = &r300->vertex_buffer[vbi];
+            map[vbi] = (uint32_t*)pipe_buffer_map(r300->context.screen,
+                                                  vbuf->buffer,
+                                                  PIPE_BUFFER_USAGE_CPU_READ);
+            map[vbi] += vbuf->buffer_offset >> 2;
+            stride[vbi] = vbuf->stride >> 2;
+        }
+    }
 
     BEGIN_CS(10 + count * vertex_size);
     OUT_CS_REG(R300_GA_COLOR_CONTROL,
@@ -138,18 +172,31 @@ static void r300_emit_draw_immediate(struct r300_context *r300,
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_IMMD_2, count * vertex_size);
     OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED | (count << 16) |
             r300_translate_primitive(mode));
-    //debug_printf("r300: Immd %d verts, %d attrs\n", count, vertex_size);
-    for (i = 0; i < count * vertex_size; i++) {
-        if (i % vertex_size == 0) {
-            //debug_printf("r300: -- vert --\n");
+
+    /* Emit vertices. */
+    for (v = 0; v < count; v++) {
+        for (i = 0; i < vertex_element_count; i++) {
+            velem = &r300->vertex_element[i];
+            vbi = velem->vertex_buffer_index;
+            elem_offset = offset[i] + stride[vbi] * (v + start);
+
+            for (dw = 0; dw < size[i]; dw++) {
+                OUT_CS(map[vbi][elem_offset + dw]);
+            }
         }
-        //debug_printf("r300: 0x%08x\n", *map);
-        OUT_CS(*map);
-        map++;
     }
     END_CS;
 
-    pipe_buffer_unmap(r300->context.screen, vbo);
+    /* Unmap buffers. */
+    for (i = 0; i < vertex_element_count; i++) {
+        vbi = r300->vertex_element[i].vertex_buffer_index;
+
+        if (map[vbi]) {
+            vbuf = &r300->vertex_buffer[vbi];
+            pipe_buffer_unmap(r300->context.screen, vbuf->buffer);
+            map[vbi] = 0;
+        }
+    }
 }
 
 static void r300_emit_draw_arrays(struct r300_context *r300,
@@ -222,16 +269,49 @@ static void r300_emit_draw_elements(struct r300_context *r300,
 }
 
 
+static boolean r300_setup_local_vertex_buffers(struct r300_context *r300)
+{
+    struct pipe_vertex_buffer *vb;
+    boolean found_local_bo = FALSE, found_managed_bo = FALSE;
+    unsigned i;
+
+    /* See what buffers we got. */
+    for (i = 0; i < r300->vertex_element_count; i++) {
+        vb = &r300->vertex_buffer[r300->vertex_element[i].vertex_buffer_index];
+        if (r300->winsys->buffer_is_local(r300->winsys, vb->buffer)) {
+            found_local_bo = TRUE;
+        } else {
+            found_managed_bo = TRUE;
+        }
+    }
+
+    /* If we found both local and managed buffers, make local buffers managed
+     * because we shouldn't use the immediate mode in case a managed buffer is
+     * present, due to performance reasons. */
+    if (found_local_bo && found_managed_bo) {
+        for (i = 0; i < r300->vertex_element_count; i++) {
+            vb = &r300->vertex_buffer[r300->vertex_element[i].vertex_buffer_index];
+            if (r300->winsys->buffer_is_local(r300->winsys, vb->buffer)) {
+                r300->winsys->buffer_make_managed(r300->winsys, vb->buffer);
+            }
+        }
+    }
+
+    return !found_managed_bo;
+}
+
 static boolean r300_setup_vertex_buffers(struct r300_context *r300)
 {
     struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
     struct pipe_vertex_element *velem = r300->vertex_element;
+    struct pipe_buffer *pbuf;
 
 validate:
     for (int i = 0; i < r300->vertex_element_count; i++) {
-        if (!r300->winsys->add_buffer(r300->winsys,
-                vbuf[velem[i].vertex_buffer_index].buffer,
-            RADEON_GEM_DOMAIN_GTT, 0)) {
+        pbuf = vbuf[velem[i].vertex_buffer_index].buffer;
+
+        if (!r300->winsys->add_buffer(r300->winsys, pbuf,
+                                      RADEON_GEM_DOMAIN_GTT, 0)) {
             r300->context.flush(&r300->context, 0, NULL);
             goto validate;
         }
@@ -245,6 +325,7 @@ validate:
     return TRUE;
 }
 
+
 static void r300_shorten_ubyte_elts(struct r300_context* r300,
                                     struct pipe_buffer** elts,
                                     unsigned count)
@@ -365,15 +446,15 @@ void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
 
     r300_emit_buffer_validate(r300);
 
-    if (!r300_setup_vertex_buffers(r300)) {
-        return;
-    }
-
-    r300_emit_dirty_state(r300);
-
-    if (FALSE && count <= 4 && r300->vertex_buffer_count == 1) {
-        r300_emit_draw_immediate(r300, mode, start, count);
+    if (r300_setup_local_vertex_buffers(r300)) {
+        r300_emit_dirty_state(r300);
+        r300_emit_draw_arrays_immediate(r300, mode, start, count);
     } else {
+        if (!r300_setup_vertex_buffers(r300)) {
+            return;
+        }
+
+        r300_emit_dirty_state(r300);
         r300_emit_aos(r300, start);
         r300_emit_draw_arrays(r300, mode, count);
     }
index e2ec0bc5bd26acb10b68a8e008f6367de75572d5..641e95e7fcae90cd49e085b19949a0cb3f4a1be4 100644 (file)
@@ -924,6 +924,22 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
     r300->dirty_state |= R300_NEW_VERTEX_FORMAT;
 }
 
+static boolean r300_validate_aos(struct r300_context *r300)
+{
+    struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
+    struct pipe_vertex_element *velem = r300->vertex_element;
+    int i;
+
+    /* Check if formats and strides are aligned to the size of DWORD. */
+    for (i = 0; i < r300->vertex_element_count; i++) {
+        if (vbuf[velem[i].vertex_buffer_index].stride % 4 != 0 ||
+            util_format_get_blocksize(velem[i].src_format) % 4 != 0) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
 static void r300_set_vertex_elements(struct pipe_context* pipe,
                                     unsigned count,
                                     const struct pipe_vertex_element* elements)
@@ -939,6 +955,12 @@ static void r300_set_vertex_elements(struct pipe_context* pipe,
         draw_flush(r300->draw);
         draw_set_vertex_elements(r300->draw, count, elements);
     }
+
+    if (!r300_validate_aos(r300)) {
+        /* XXX We should fallback using draw. */
+        assert(0);
+        abort();
+    }
 }
 
 static void* r300_create_vs_state(struct pipe_context* pipe,
index 25e1cdcdb6daf2271905e51c0b7f7e8012f358b7..5214b6d8bcb25075d86e95cf0ae60acd94f89763 100644 (file)
@@ -51,6 +51,23 @@ static const char *radeon_get_name(struct pipe_winsys *ws)
     return "Radeon/GEM+KMS";
 }
 
+uint32_t radeon_domain_from_usage(unsigned usage)
+{
+    uint32_t domain = 0;
+
+    if (usage & PIPE_BUFFER_USAGE_PIXEL) {
+        domain |= RADEON_GEM_DOMAIN_VRAM;
+    }
+    if (usage & PIPE_BUFFER_USAGE_VERTEX) {
+        domain |= RADEON_GEM_DOMAIN_GTT;
+    }
+    if (usage & PIPE_BUFFER_USAGE_INDEX) {
+        domain |= RADEON_GEM_DOMAIN_GTT;
+    }
+
+    return domain;
+}
+
 static struct pipe_buffer *radeon_buffer_create(struct pipe_winsys *ws,
                                                 unsigned alignment,
                                                 unsigned usage,
@@ -71,25 +88,17 @@ static struct pipe_buffer *radeon_buffer_create(struct pipe_winsys *ws,
     radeon_buffer->base.usage = usage;
     radeon_buffer->base.size = size;
 
-    if (usage == PIPE_BUFFER_USAGE_CONSTANT && is_r3xx(radeon_ws->pci_id)) {
+    if ((usage == PIPE_BUFFER_USAGE_CONSTANT && is_r3xx(radeon_ws->pci_id)) ||
+        (usage == PIPE_BUFFER_USAGE_VERTEX && size < 512)) {
         /* Don't bother allocating a BO, as it'll never get to the card. */
+        /* Also, create small vertex buffers in RAM. */
         desc.alignment = alignment;
         desc.usage = usage;
         radeon_buffer->pb = pb_malloc_buffer_create(size, &desc);
         return &radeon_buffer->base;
     }
 
-    domain = 0;
-
-    if (usage & PIPE_BUFFER_USAGE_PIXEL) {
-        domain |= RADEON_GEM_DOMAIN_VRAM;
-    }
-    if (usage & PIPE_BUFFER_USAGE_VERTEX) {
-        domain |= RADEON_GEM_DOMAIN_GTT;
-    }
-    if (usage & PIPE_BUFFER_USAGE_INDEX) {
-        domain |= RADEON_GEM_DOMAIN_GTT;
-    }
+    domain = radeon_domain_from_usage(usage);
 
     radeon_buffer->bo = radeon_bo_open(radeon_ws->priv->bom, 0, size,
             alignment, domain, 0);
@@ -222,6 +231,54 @@ static void radeon_buffer_set_tiling(struct radeon_winsys *ws,
     radeon_bo_set_tiling(radeon_buffer->bo, flags, pitch);
 }
 
+static boolean radeon_buffer_is_local(struct radeon_winsys *ws,
+                                      struct pipe_buffer *buffer)
+{
+    struct radeon_pipe_buffer *radeon_buffer =
+        (struct radeon_pipe_buffer*)buffer;
+
+    return radeon_buffer->pb != NULL;
+}
+
+static void radeon_buffer_make_managed(struct radeon_winsys *ws,
+                                       struct pipe_buffer *buffer)
+{
+    struct radeon_pipe_buffer* radeon_buffer =
+        (struct radeon_pipe_buffer*)buffer;
+    uint32_t domain;
+    void *map;
+
+    if (radeon_buffer->pb) {
+        domain = radeon_domain_from_usage(buffer->usage);
+
+        /* Create a managed buffer. */
+        radeon_buffer->bo = radeon_bo_open(ws->priv->bom, 0,
+                                           buffer->size, buffer->alignment,
+                                           domain, 0);
+        if (radeon_buffer->bo == NULL) {
+            /* XXX What now? */
+            fprintf(stderr, "radeon: cannot create a buffer in function %s\n",
+                    __FUNCTION__);
+            assert(0);
+            abort();
+        }
+
+        /* Move data. */
+        radeon_bo_map(radeon_buffer->bo, 1);
+        map = pb_map(radeon_buffer->pb, PIPE_BUFFER_USAGE_CPU_READ);
+
+        memcpy(radeon_buffer->bo->ptr, map, buffer->size);
+
+        pb_unmap(radeon_buffer->pb);
+        radeon_bo_unmap(radeon_buffer->bo);
+
+        /* Release the locally-created buffer. */
+        pipe_reference_init(&radeon_buffer->pb->base.reference, 0);
+        pb_destroy(radeon_buffer->pb);
+        radeon_buffer->pb = 0;
+    }
+}
+
 static void radeon_fence_reference(struct pipe_winsys *ws,
                                    struct pipe_fence_handle **ptr,
                                    struct pipe_fence_handle *pfence)
@@ -325,6 +382,8 @@ struct radeon_winsys* radeon_pipe_winsys(int fd)
     radeon_ws->base.get_name = radeon_get_name;
 
     radeon_ws->buffer_set_tiling = radeon_buffer_set_tiling;
+    radeon_ws->buffer_is_local = radeon_buffer_is_local;
+    radeon_ws->buffer_make_managed = radeon_buffer_make_managed;
 
     return radeon_ws;
 }
index de71cb2f42d84a9635348b7303552e9bd7dc212e..c46abff793ec1087deaa206081e799e9ab056227 100644 (file)
@@ -77,6 +77,8 @@ struct radeon_winsys_priv {
     void *flush_data;
 };
 
+uint32_t radeon_domain_from_usage(unsigned usage);
+
 struct radeon_winsys* radeon_pipe_winsys(int fb);
 #if 0
 struct pipe_surface *radeon_surface_from_handle(struct radeon_context *radeon_context,
index ddd7983824a5e04f64607d57671271e7e535f7c6..077388ee028a2d497e25bb96f27c91596963b4a0 100644 (file)
@@ -81,7 +81,7 @@ void radeon_destroy_drm_api(struct drm_api* api);
 /* Guess at whether this chipset should use r300g.
  *
  * I believe that this check is valid, but I haven't been exhaustive. */
-static boolean is_r3xx(int pciid)
+static INLINE boolean is_r3xx(int pciid)
 {
     return (pciid > 0x3150) && (pciid < 0x796f);
 }
index 0253bc2527e5974e83f0310c8a750535585c000b..d759beaba137b9eb484fe39c5250bc66b97046bc 100644 (file)
@@ -81,9 +81,13 @@ static void radeon_write_cs_reloc(struct radeon_winsys* winsys,
                                   uint32_t flags)
 {
     int retval = 0;
+    struct radeon_pipe_buffer* radeon_buffer =
+        (struct radeon_pipe_buffer*)pbuffer;
 
-    retval = radeon_cs_write_reloc(winsys->priv->cs,
-            ((struct radeon_pipe_buffer*)pbuffer)->bo, rd, wd, flags);
+    assert(!radeon_buffer->pb);
+
+    retval = radeon_cs_write_reloc(winsys->priv->cs, radeon_buffer->bo,
+                                   rd, wd, flags);
 
     if (retval) {
         debug_printf("radeon: Relocation of %p (%d, %d, %d) failed!\n",
index 864082b99b3894560d9364158092fe027d78ad71..462fba844ef22f089f4b06543a260822f10e146f 100644 (file)
@@ -106,6 +106,12 @@ struct radeon_winsys {
                               uint32_t pitch,
                               boolean microtiled,
                               boolean macrotiled);
+
+    boolean (*buffer_is_local)(struct radeon_winsys* winsys,
+                               struct pipe_buffer* buffer);
+
+    void (*buffer_make_managed)(struct radeon_winsys* winsys,
+                                struct pipe_buffer* buffer);
 };
 
 #endif