gallium: add pipe cap for scissored clears and pass scissor state to clear() hook
[mesa.git] / src / gallium / auxiliary / util / u_helpers.c
index 09020b061a3e4b2d1cbb78b06f42ec01532ef802..31b7d533952d158ebef68ce6c28c478d4d384d0c 100644 (file)
  *
  **************************************************************************/
 
+#include "util/u_cpu_detect.h"
 #include "util/u_helpers.h"
 #include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_thread.h"
+#include "util/os_time.h"
+#include <inttypes.h>
 
 /**
  * This function is used to copy an array of pipe_vertex_buffer structures,
@@ -47,28 +52,28 @@ void util_set_vertex_buffers_mask(struct pipe_vertex_buffer *dst,
 
    dst += start_slot;
 
+   *enabled_buffers &= ~u_bit_consecutive(start_slot, count);
+
    if (src) {
       for (i = 0; i < count; i++) {
-         if (src[i].buffer || src[i].user_buffer) {
+         if (src[i].buffer.resource)
             bitmask |= 1 << i;
-         }
-         pipe_resource_reference(&dst[i].buffer, src[i].buffer);
+
+         pipe_vertex_buffer_unreference(&dst[i]);
+
+         if (!src[i].is_user_buffer)
+            pipe_resource_reference(&dst[i].buffer.resource, src[i].buffer.resource);
       }
 
       /* Copy over the other members of pipe_vertex_buffer. */
       memcpy(dst, src, count * sizeof(struct pipe_vertex_buffer));
 
-      *enabled_buffers &= ~(((1ull << count) - 1) << start_slot);
       *enabled_buffers |= bitmask << start_slot;
    }
    else {
       /* Unreference the buffers. */
-      for (i = 0; i < count; i++) {
-         pipe_resource_reference(&dst[i].buffer, NULL);
-         dst[i].user_buffer = NULL;
-      }
-
-      *enabled_buffers &= ~(((1ull << count) - 1) << start_slot);
+      for (i = 0; i < count; i++)
+         pipe_vertex_buffer_unreference(&dst[i]);
    }
 }
 
@@ -85,7 +90,7 @@ void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst,
    uint32_t enabled_buffers = 0;
 
    for (i = 0; i < *dst_count; i++) {
-      if (dst[i].buffer || dst[i].user_buffer)
+      if (dst[i].buffer.resource)
          enabled_buffers |= (1ull << i);
    }
 
@@ -95,17 +100,288 @@ void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst,
    *dst_count = util_last_bit(enabled_buffers);
 }
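+
+/* A minimal usage sketch (hypothetical driver hook, not part of this patch):
+ * a driver's pipe_context::set_vertex_buffers implementation can simply
+ * forward its arguments to util_set_vertex_buffers_count to maintain its own
+ * copy of the bindings and the count of the last enabled buffer.
+ *
+ *    static void
+ *    my_set_vertex_buffers(struct pipe_context *pipe, unsigned start_slot,
+ *                          unsigned count,
+ *                          const struct pipe_vertex_buffer *buffers)
+ *    {
+ *       struct my_context *ctx = my_context(pipe);
+ *
+ *       util_set_vertex_buffers_count(ctx->vertex_buffers,
+ *                                     &ctx->num_vertex_buffers,
+ *                                     buffers, start_slot, count);
+ *    }
+ */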
 
-
-void
-util_set_index_buffer(struct pipe_index_buffer *dst,
-                      const struct pipe_index_buffer *src)
+/**
+ * This function is used to copy an array of pipe_shader_buffer structures,
+ * while properly referencing the pipe_shader_buffer::buffer member.
+ *
+ * \sa util_set_vertex_buffers_mask
+ */
+void util_set_shader_buffers_mask(struct pipe_shader_buffer *dst,
+                                  uint32_t *enabled_buffers,
+                                  const struct pipe_shader_buffer *src,
+                                  unsigned start_slot, unsigned count)
 {
+   unsigned i;
+
+   dst += start_slot;
+
    if (src) {
-      pipe_resource_reference(&dst->buffer, src->buffer);
-      memcpy(dst, src, sizeof(*dst));
+      for (i = 0; i < count; i++) {
+         pipe_resource_reference(&dst[i].buffer, src[i].buffer);
+
+         if (src[i].buffer)
+            *enabled_buffers |= (1ull << (start_slot + i));
+         else
+            *enabled_buffers &= ~(1ull << (start_slot + i));
+      }
+
+      /* Copy over the other members of pipe_shader_buffer. */
+      memcpy(dst, src, count * sizeof(struct pipe_shader_buffer));
    }
    else {
-      pipe_resource_reference(&dst->buffer, NULL);
-      memset(dst, 0, sizeof(*dst));
+      /* Unreference the buffers. */
+      for (i = 0; i < count; i++)
+         pipe_resource_reference(&dst[i].buffer, NULL);
+
+      *enabled_buffers &= ~(((1ull << count) - 1) << start_slot);
+   }
+}
+
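+/* A minimal usage sketch (hypothetical driver code, not part of this patch):
+ * a driver can keep a per-stage array of shader buffer bindings plus a
+ * bitmask of the enabled slots and let this helper maintain both.
+ *
+ *    struct my_context {
+ *       struct pipe_shader_buffer ssbo[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
+ *       uint32_t ssbo_mask[PIPE_SHADER_TYPES];
+ *    };
+ *
+ *    static void
+ *    my_set_shader_buffers(struct pipe_context *pipe,
+ *                          enum pipe_shader_type shader,
+ *                          unsigned start_slot, unsigned count,
+ *                          const struct pipe_shader_buffer *buffers)
+ *    {
+ *       struct my_context *ctx = my_context(pipe);
+ *
+ *       util_set_shader_buffers_mask(ctx->ssbo[shader], &ctx->ssbo_mask[shader],
+ *                                    buffers, start_slot, count);
+ *    }
+ */
+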
+/**
+ * Upload the user index buffer referenced by "info" and return the resulting
+ * hardware buffer and an offset adjusted so that info->start keeps working.
+ */
+bool
+util_upload_index_buffer(struct pipe_context *pipe,
+                         const struct pipe_draw_info *info,
+                         struct pipe_resource **out_buffer,
+                         unsigned *out_offset, unsigned alignment)
+{
+   unsigned start_offset = info->start * info->index_size;
+
+   u_upload_data(pipe->stream_uploader, start_offset,
+                 info->count * info->index_size, alignment,
+                 (char*)info->index.user + start_offset,
+                 out_offset, out_buffer);
+   u_upload_unmap(pipe->stream_uploader);
+   *out_offset -= start_offset;
+   return *out_buffer != NULL;
+}
+
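+/* A minimal usage sketch (hypothetical driver code, not part of this patch):
+ * when draw_vbo() is handed a user-space index buffer, the driver can upload
+ * the referenced range and fetch indices from the returned buffer instead.
+ *
+ *    struct pipe_resource *indexbuf = NULL;
+ *    unsigned offset = 0;
+ *
+ *    if (info->index_size && info->has_user_indices) {
+ *       if (!util_upload_index_buffer(pipe, info, &indexbuf, &offset, 4))
+ *          return;
+ *    }
+ *    ... emit the draw reading indices from indexbuf at byte offset
+ *    offset + info->start * info->index_size, then release the reference
+ *    with pipe_resource_reference(&indexbuf, NULL) ...
+ */
+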
+/**
+ * Called by MakeCurrent. Used to notify the driver that the application
+ * thread may have changed.
+ *
+ * The function pins the current thread and driver threads to a group of
+ * CPU cores that share the same L3 cache. This is needed for good multi-
+ * threading performance on AMD Zen CPUs.
+ *
+ * \param upper_thread  thread in the state tracker that also needs to be
+ *                      pinned.
+ */
+void
+util_pin_driver_threads_to_random_L3(struct pipe_context *ctx,
+                                     thrd_t *upper_thread)
+{
+   /* If pinning has no effect, don't do anything. */
+   if (util_cpu_caps.nr_cpus == util_cpu_caps.cores_per_L3)
+      return;
+
+   unsigned num_L3_caches = util_cpu_caps.nr_cpus /
+                            util_cpu_caps.cores_per_L3;
+
+   /* Get a semi-random number. */
+   int64_t t = os_time_get_nano();
+   unsigned cache = (t ^ (t >> 8) ^ (t >> 16)) % num_L3_caches;
+
+   /* Tell the driver to pin its threads to the selected L3 cache. */
+   if (ctx->set_context_param) {
+      ctx->set_context_param(ctx, PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
+                             cache);
+   }
+
+   /* Do the same for the upper-level thread if there is one (e.g. glthread). */
+   if (upper_thread)
+      util_pin_thread_to_L3(*upper_thread, cache, util_cpu_caps.cores_per_L3);
+
+   /* Optionally pin the application thread to the same L3 to get maximum
+    * performance with glthread on AMD Zen. (This function is only called
+    * when glthread is in use.) This is mainly useful for estimating and
+    * removing the overhead of the Infinity Fabric link between L3 caches.
+    */
+#if defined(HAVE_PTHREAD)
+   if (debug_get_bool_option("pin_app_thread", false))
+      util_pin_thread_to_L3(pthread_self(), cache, util_cpu_caps.cores_per_L3);
+#endif
+}
+
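+/* A minimal usage sketch (hypothetical state-tracker code, not part of this
+ * patch): a GL state tracker would call this from its MakeCurrent path,
+ * passing the glthread worker thread when glthread is enabled:
+ *
+ *    util_pin_driver_threads_to_random_L3(st->pipe,
+ *                                         use_glthread ? &glthread_thrd : NULL);
+ */
+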
+/* This is a helper for hardware bring-up. Don't remove. */
+struct pipe_query *
+util_begin_pipestat_query(struct pipe_context *ctx)
+{
+   struct pipe_query *q =
+      ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+   if (!q)
+      return NULL;
+
+   ctx->begin_query(ctx, q);
+   return q;
+}
+
+/* This is a helper for hardware bring-up. Don't remove. */
+void
+util_end_pipestat_query(struct pipe_context *ctx, struct pipe_query *q,
+                        FILE *f)
+{
+   static unsigned counter;
+   struct pipe_query_data_pipeline_statistics stats;
+
+   ctx->end_query(ctx, q);
+   ctx->get_query_result(ctx, q, true, (void*)&stats);
+   ctx->destroy_query(ctx, q);
+
+   fprintf(f,
+           "Draw call %u:\n"
+           "    ia_vertices    = %"PRIu64"\n"
+           "    ia_primitives  = %"PRIu64"\n"
+           "    vs_invocations = %"PRIu64"\n"
+           "    gs_invocations = %"PRIu64"\n"
+           "    gs_primitives  = %"PRIu64"\n"
+           "    c_invocations  = %"PRIu64"\n"
+           "    c_primitives   = %"PRIu64"\n"
+           "    ps_invocations = %"PRIu64"\n"
+           "    hs_invocations = %"PRIu64"\n"
+           "    ds_invocations = %"PRIu64"\n"
+           "    cs_invocations = %"PRIu64"\n",
+           (unsigned)p_atomic_inc_return(&counter),
+           stats.ia_vertices,
+           stats.ia_primitives,
+           stats.vs_invocations,
+           stats.gs_invocations,
+           stats.gs_primitives,
+           stats.c_invocations,
+           stats.c_primitives,
+           stats.ps_invocations,
+           stats.hs_invocations,
+           stats.ds_invocations,
+           stats.cs_invocations);
+}
+
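+/* A minimal usage sketch (hypothetical debugging code, not part of this
+ * patch): during bring-up, a suspect draw can be wrapped with these helpers
+ * to dump its pipeline statistics.
+ *
+ *    struct pipe_query *q = util_begin_pipestat_query(ctx);
+ *
+ *    ctx->draw_vbo(ctx, info);
+ *
+ *    if (q)
+ *       util_end_pipestat_query(ctx, q, stderr);
+ */
+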
+/* This is a helper for hardware bring-up. Don't remove. */
+void
+util_wait_for_idle(struct pipe_context *ctx)
+{
+   struct pipe_fence_handle *fence = NULL;
+
+   ctx->flush(ctx, &fence, 0);
+   ctx->screen->fence_finish(ctx->screen, NULL, fence, PIPE_TIMEOUT_INFINITE);
+}
+
+void
+util_throttle_init(struct util_throttle *t, uint64_t max_mem_usage)
+{
+   t->max_mem_usage = max_mem_usage;
+}
+
+void
+util_throttle_deinit(struct pipe_screen *screen, struct util_throttle *t)
+{
+   for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++)
+      screen->fence_reference(screen, &t->ring[i].fence, NULL);
+}
+
+static uint64_t
+util_get_throttle_total_memory_usage(struct util_throttle *t)
+{
+   uint64_t total_usage = 0;
+
+   for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++)
+      total_usage += t->ring[i].mem_usage;
+   return total_usage;
+}
+
+static void util_dump_throttle_ring(struct util_throttle *t)
+{
+   printf("Throttle:\n");
+   for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++) {
+      printf("  ring[%u]: fence = %s, mem_usage = %"PRIu64"%s%s\n",
+             i, t->ring[i].fence ? "yes" : " no",
+             t->ring[i].mem_usage,
+             t->flush_index == i ? " [flush]" : "",
+             t->wait_index == i ? " [wait]" : "");
    }
 }
+
+/**
+ * Notify util_throttle that the next operation allocates memory.
+ * util_throttle tracks memory usage and waits for fences until its tracked
+ * memory usage decreases.
+ *
+ * Example:
+ *   util_throttle_memory_usage(..., w*h*d*Bpp);
+ *   TexSubImage(..., w, h, d, ...);
+ *
+ * This means that TexSubImage can't allocate more memory than the maximum
+ * limit set during initialization.
+ */
+void
+util_throttle_memory_usage(struct pipe_context *pipe,
+                           struct util_throttle *t, uint64_t memory_size)
+{
+   (void)util_dump_throttle_ring; /* silence warning */
+
+   if (!t->max_mem_usage)
+      return;
+
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_fence_handle **fence = NULL;
+   unsigned ring_size = ARRAY_SIZE(t->ring);
+   uint64_t total = util_get_throttle_total_memory_usage(t);
+
+   /* If there is not enough memory, walk the list of fences and find
+    * the latest one that we need to wait for.
+    */
+   while (t->wait_index != t->flush_index &&
+          total && total + memory_size > t->max_mem_usage) {
+      assert(t->ring[t->wait_index].fence);
+
+      /* Release an older fence if we need to wait for a newer one. */
+      if (fence)
+         screen->fence_reference(screen, fence, NULL);
+
+      fence = &t->ring[t->wait_index].fence;
+      t->ring[t->wait_index].mem_usage = 0;
+      t->wait_index = (t->wait_index + 1) % ring_size;
+
+      total = util_get_throttle_total_memory_usage(t);
+   }
+
+   /* Wait for the fence to decrease memory usage. */
+   if (fence) {
+      screen->fence_finish(screen, pipe, *fence, PIPE_TIMEOUT_INFINITE);
+      screen->fence_reference(screen, fence, NULL);
+   }
+
+   /* Flush and get a fence if we've exhausted memory usage for the current
+    * slot.
+    */
+   if (t->ring[t->flush_index].mem_usage &&
+       t->ring[t->flush_index].mem_usage + memory_size >
+       t->max_mem_usage / (ring_size / 2)) {
+      struct pipe_fence_handle **fence =
+         &t->ring[t->flush_index].fence;
+
+      /* Expect that the current flush slot doesn't have a fence yet. */
+      assert(!*fence);
+
+      pipe->flush(pipe, fence, PIPE_FLUSH_ASYNC);
+      t->flush_index = (t->flush_index + 1) % ring_size;
+
+      /* Vacate the next slot if it's occupied. This should be rare. */
+      if (t->flush_index == t->wait_index) {
+         struct pipe_fence_handle **fence =
+            &t->ring[t->wait_index].fence;
+
+         t->ring[t->wait_index].mem_usage = 0;
+         t->wait_index = (t->wait_index + 1) % ring_size;
+
+         assert(*fence);
+         screen->fence_finish(screen, pipe, *fence, PIPE_TIMEOUT_INFINITE);
+         screen->fence_reference(screen, fence, NULL);
+      }
+
+      assert(!t->ring[t->flush_index].mem_usage);
+      assert(!t->ring[t->flush_index].fence);
+   }
+
+   t->ring[t->flush_index].mem_usage += memory_size;
+}
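+
+/* A minimal usage sketch (hypothetical driver code, not part of this patch):
+ * a driver that streams texture uploads can bound the amount of staging
+ * memory that may be in flight at once.
+ *
+ *    struct util_throttle throttle;
+ *
+ *    util_throttle_init(&throttle, 128 * 1024 * 1024);
+ *
+ *    ... for each texture upload of "size" bytes: ...
+ *    util_throttle_memory_usage(pipe, &throttle, size);
+ *    ... allocate the staging buffer and emit the copy ...
+ *
+ *    ... at context destruction: ...
+ *    util_throttle_deinit(pipe->screen, &throttle);
+ */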