From: Bruce Cherniak <bruce.cherniak@intel.com>
Date: Wed, 12 Jul 2017 20:04:47 +0000 (-0500)
Subject: swr: Add path to draw directly from client memory without copy.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=02735e6cf8821f83e04fb78cd24960a4903ce118;p=mesa.git

swr: Add path to draw directly from client memory without copy.

If size of client memory copy is too large, don't copy. The draw will
access user-buffer directly and then block.  This is faster and more
efficient than queuing many large client draws.

Applications that still use large client arrays benefit from this.  VMD
is an example.

The threshold for this path defaults to 32KB.  This value can be
overridden by setting environment variable SWR_CLIENT_COPY_LIMIT.

v2: Use #define for default value, rather than hard-coded constant.

Reviewed-by: Tim Rowley <timothy.o.rowley@intel.com>
---

diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h
index 753cbf3115a..8bed78f869a 100644
--- a/src/gallium/drivers/swr/swr_context.h
+++ b/src/gallium/drivers/swr/swr_context.h
@@ -51,6 +51,7 @@
 #define SWR_NEW_FRAMEBUFFER (1 << 15)
 #define SWR_NEW_CLIP (1 << 16)
 #define SWR_NEW_SO (1 << 17)
+#define SWR_LARGE_CLIENT_DRAW (1<<18) // Indicates client draw will block
 
 namespace std
 {
diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp
index eae4b572df3..62ad3f716cf 100644
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ b/src/gallium/drivers/swr/swr_draw.cpp
@@ -188,6 +188,15 @@ swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
                                    info->instance_count,
                                    info->start,
                                    info->start_instance);
+
+   /* On large client-buffer draw, we used client buffer directly, without
+    * copy.  Block until draw is finished.
+    * VMD is an example application that benefits from this. */
+   if (ctx->dirty & SWR_LARGE_CLIENT_DRAW) {
+      struct swr_screen *screen = swr_screen(pipe->screen);
+      swr_fence_submit(ctx, screen->flush_fence);
+      swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0);
+   }
 }
 
 
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
index 9352181fae7..c8ff810e165 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -61,6 +61,9 @@
 #define SWR_MAX_TEXTURE_CUBE_LEVELS 14  /* 8K x 8K for now */
 #define SWR_MAX_TEXTURE_ARRAY_LAYERS 512 /* 8K x 512 / 8K x 8K x 512 */
 
+/* Default max client_copy_limit */
+#define SWR_CLIENT_COPY_LIMIT 32768
+
 /* Flag indicates creation of alternate surface, to prevent recursive loop
  * in resource creation when msaa_force_enable is set. */
 #define SWR_RESOURCE_FLAG_ALT_SURFACE (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
@@ -1067,6 +1070,16 @@ swr_destroy_screen(struct pipe_screen *p_screen)
 static void
 swr_validate_env_options(struct swr_screen *screen)
 {
+   /* The client_copy_limit sets a maximum on the amount of user-buffer memory
+    * copied to scratch space on a draw.  Past this, the draw will access
+    * user-buffer directly and then block.  This is faster than queuing many
+    * large client draws. */
+   screen->client_copy_limit = SWR_CLIENT_COPY_LIMIT;
+   int client_copy_limit =
+      debug_get_num_option("SWR_CLIENT_COPY_LIMIT", SWR_CLIENT_COPY_LIMIT);
+   if (client_copy_limit > 0)
+      screen->client_copy_limit = client_copy_limit;
+
    /* XXX msaa under development, disable by default for now */
    screen->msaa_max_count = 0; /* was SWR_MAX_NUM_MULTISAMPLES; */
 
diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h
index a10f4265354..a11ea9f41de 100644
--- a/src/gallium/drivers/swr/swr_screen.h
+++ b/src/gallium/drivers/swr/swr_screen.h
@@ -43,8 +43,10 @@ struct swr_screen {
 
    struct sw_winsys *winsys;
 
+   /* Configurable environment settings */
    boolean msaa_force_enable;
    uint8_t msaa_max_count;
+   uint32_t client_copy_limit;
 
    HANDLE hJitMgr;
 
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp
index 4eef60681a7..534f3c59b1c 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -1267,12 +1267,20 @@ swr_update_derived(struct pipe_context *pipe,
             partial_inbounds = 0;
             min_vertex_index = info.min_index;
 
-            /* Copy only needed vertices to scratch space */
             size = AlignUp(size, 4);
-            const void *ptr = (const uint8_t *) vb->buffer.user + base;
-            ptr = (uint8_t *)swr_copy_to_scratch_space(
-               ctx, &ctx->scratch->vertex_buffer, ptr, size);
-            p_data = (const uint8_t *)ptr - base;
+            /* If size of client memory copy is too large, don't copy. The
+             * draw will access user-buffer directly and then block.  This is
+             * faster than queuing many large client draws. */
+            if (size >= screen->client_copy_limit) {
+               post_update_dirty_flags |= SWR_LARGE_CLIENT_DRAW;
+               p_data = (const uint8_t *) vb->buffer.user;
+            } else {
+               /* Copy only needed vertices to scratch space */
+               const void *ptr = (const uint8_t *) vb->buffer.user + base;
+               ptr = (uint8_t *)swr_copy_to_scratch_space(
+                     ctx, &ctx->scratch->vertex_buffer, ptr, size);
+               p_data = (const uint8_t *)ptr - base;
+            }
          }
 
          swrVertexBuffers[i] = {0};
@@ -1311,12 +1319,19 @@ swr_update_derived(struct pipe_context *pipe,
 
             size = info.count * pitch;
             size = AlignUp(size, 4);
-
-            /* Copy indices to scratch space */
-            const void *ptr = info.index.user;
-            ptr = swr_copy_to_scratch_space(
-               ctx, &ctx->scratch->index_buffer, ptr, size);
-            p_data = (const uint8_t *)ptr;
+            /* If size of client memory copy is too large, don't copy. The
+             * draw will access user-buffer directly and then block.  This is
+             * faster than queuing many large client draws. */
+            if (size >= screen->client_copy_limit) {
+               post_update_dirty_flags |= SWR_LARGE_CLIENT_DRAW;
+               p_data = (const uint8_t *) info.index.user;
+            } else {
+               /* Copy indices to scratch space */
+               const void *ptr = info.index.user;
+               ptr = swr_copy_to_scratch_space(
+                     ctx, &ctx->scratch->index_buffer, ptr, size);
+               p_data = (const uint8_t *)ptr;
+            }
          }
 
          SWR_INDEX_BUFFER_STATE swrIndexBuffer;