*
**************************************************************************/
+#include "util/u_cpu_detect.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_thread.h"
+#include "util/os_time.h"
+#include <inttypes.h>
/**
* This function is used to copy an array of pipe_vertex_buffer structures,
if (src) {
for (i = 0; i < count; i++) {
- if (src[i].buffer || src[i].user_buffer) {
+ if (src[i].buffer.resource)
bitmask |= 1 << i;
- }
- pipe_resource_reference(&dst[i].buffer, src[i].buffer);
+
+ pipe_vertex_buffer_unreference(&dst[i]);
+
+ if (!src[i].is_user_buffer)
+ pipe_resource_reference(&dst[i].buffer.resource, src[i].buffer.resource);
}
/* Copy over the other members of pipe_vertex_buffer. */
}
else {
/* Unreference the buffers. */
- for (i = 0; i < count; i++) {
- pipe_resource_reference(&dst[i].buffer, NULL);
- dst[i].user_buffer = NULL;
- }
+ for (i = 0; i < count; i++)
+ pipe_vertex_buffer_unreference(&dst[i]);
*enabled_buffers &= ~(((1ull << count) - 1) << start_slot);
}
const struct pipe_vertex_buffer *src,
unsigned start_slot, unsigned count)
{
- uint32_t enabled_buffers = (1ull << *dst_count) - 1;
+ unsigned i;
+ uint32_t enabled_buffers = 0;
+
+ for (i = 0; i < *dst_count; i++) {
+ if (dst[i].buffer.resource)
+ enabled_buffers |= (1ull << i);
+ }
util_set_vertex_buffers_mask(dst, &enabled_buffers, src, start_slot,
count);
*dst_count = util_last_bit(enabled_buffers);
}
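For reference, a driver's set_vertex_buffers callback is expected to be little more than a call into this helper. A minimal sketch, assuming a hypothetical foo_context with a vertex-buffer array, a buffer count and a dirty mask (none of which are part of this patch):

static void
foo_set_vertex_buffers(struct pipe_context *pctx, unsigned start_slot,
                       unsigned count, const struct pipe_vertex_buffer *buffers)
{
   struct foo_context *foo = (struct foo_context *)pctx;

   /* Copies or clears the affected slots, manages the resource references
    * and updates the bound-buffer count. */
   util_set_vertex_buffers_count(foo->vertex_buffers, &foo->num_vertex_buffers,
                                 buffers, start_slot, count);
   foo->dirty |= FOO_DIRTY_VERTEX_BUFFERS;
}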
+/**
+ * Upload the indices referenced by a user index buffer and return the
+ * hardware buffer in *out_buffer.  *out_offset is adjusted so that the draw
+ * can keep using info->start unchanged with the new buffer.
+ */
+bool
+util_upload_index_buffer(struct pipe_context *pipe,
+ const struct pipe_draw_info *info,
+ struct pipe_resource **out_buffer,
+ unsigned *out_offset)
+{
+ unsigned start_offset = info->start * info->index_size;
+
+ u_upload_data(pipe->stream_uploader, start_offset,
+ info->count * info->index_size, 4,
+ (char*)info->index.user + start_offset,
+ out_offset, out_buffer);
+ u_upload_unmap(pipe->stream_uploader);
+ *out_offset -= start_offset;
+ return *out_buffer != NULL;
+}
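A hedged sketch of the expected call site: a driver's draw_vbo handling user-space indices. foo_draw_vbo and the surrounding details are illustrative only, not part of this patch:

static void
foo_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *dinfo)
{
   struct pipe_draw_info info = *dinfo;
   struct pipe_resource *indexbuf = NULL;
   unsigned index_offset = 0;

   if (info.index_size && info.has_user_indices) {
      if (!util_upload_index_buffer(pipe, &info, &indexbuf, &index_offset))
         return; /* upload failed, skip the draw */

      /* Switch the draw over to the uploaded buffer; index_offset is the
       * extra byte offset the driver adds when programming the index fetch. */
      info.has_user_indices = false;
      info.index.resource = indexbuf;
   }

   /* ... emit the draw using info and index_offset ... */

   pipe_resource_reference(&indexbuf, NULL);
}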
+
+#ifdef HAVE_PTHREAD_SETAFFINITY
+
+static unsigned L3_cache_number;
+static once_flag thread_pinning_once_flag = ONCE_FLAG_INIT;
+
+static void
+util_set_full_cpu_affinity(void)
+{
+ cpu_set_t cpuset;
+
+ CPU_ZERO(&cpuset);
+ for (unsigned i = 0; i < CPU_SETSIZE; i++)
+ CPU_SET(i, &cpuset);
+
+ pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+}
+
+static void
+util_init_thread_pinning(void)
+{
+ /* Get a semi-random number. */
+ int64_t t = os_time_get_nano();
+ L3_cache_number = (t ^ (t >> 8) ^ (t >> 16));
+
+ /* Reset thread affinity for all child processes to prevent them from
+ * inheriting the current thread's affinity.
+ *
+ * XXX: If the driver is unloaded after this, and the app later calls
+ * fork(), the child process will likely crash before fork() returns,
+ * because the address where util_set_full_cpu_affinity was located
+ * will either be unmapped or point to random other contents.
+ */
+ pthread_atfork(NULL, NULL, util_set_full_cpu_affinity);
+}
+
+#endif
+
+/**
+ * Called by MakeCurrent. Used to notify the driver that the application
+ * thread may have changed.
+ *
+ * The function pins the current thread and driver threads to a group of
+ * CPU cores that share the same L3 cache. This is needed for good multi-
+ * threading performance on AMD Zen CPUs.
+ *
+ * \param upper_thread thread in the state tracker that also needs to be
+ * pinned.
+ */
void
-util_set_index_buffer(struct pipe_index_buffer *dst,
- const struct pipe_index_buffer *src)
+util_context_thread_changed(struct pipe_context *ctx, thrd_t *upper_thread)
{
- if (src) {
- pipe_resource_reference(&dst->buffer, src->buffer);
- memcpy(dst, src, sizeof(*dst));
+#ifdef HAVE_PTHREAD_SETAFFINITY
+ /* If pinning has no effect, don't do anything. */
+ if (util_cpu_caps.nr_cpus == util_cpu_caps.cores_per_L3)
+ return;
+
+ thrd_t current = thrd_current();
+ int cache = util_get_L3_for_pinned_thread(current,
+ util_cpu_caps.cores_per_L3);
+
+ call_once(&thread_pinning_once_flag, util_init_thread_pinning);
+
+ /* If the main thread is not pinned, choose the L3 cache. */
+ if (cache == -1) {
+ unsigned num_L3_caches = util_cpu_caps.nr_cpus /
+ util_cpu_caps.cores_per_L3;
+
+ /* Choose a different L3 cache for each subsequent MakeCurrent. */
+ cache = p_atomic_inc_return(&L3_cache_number) % num_L3_caches;
+ util_pin_thread_to_L3(current, cache, util_cpu_caps.cores_per_L3);
}
- else {
- pipe_resource_reference(&dst->buffer, NULL);
- memset(dst, 0, sizeof(*dst));
+
+ /* Tell the driver to pin its threads to the same L3 cache. */
+ if (ctx->set_context_param) {
+ ctx->set_context_param(ctx, PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
+ cache);
+ }
+
+ /* Do the same for the upper level thread if there is any (e.g. glthread) */
+ if (upper_thread)
+ util_pin_thread_to_L3(*upper_thread, cache, util_cpu_caps.cores_per_L3);
+#endif
+}
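On the driver side, the sketch below shows what a set_context_param hook reacting to this request might look like; struct foo_context and its worker_thread field are assumptions made for illustration:

static void
foo_set_context_param(struct pipe_context *pctx,
                      enum pipe_context_param param, unsigned value)
{
   struct foo_context *foo = (struct foo_context *)pctx;

   switch (param) {
   case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE:
      /* "value" is the L3 cache index chosen above; move the driver's
       * worker thread next to the application thread. */
      util_pin_thread_to_L3(foo->worker_thread, value,
                            util_cpu_caps.cores_per_L3);
      break;
   default:
      break;
   }
}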
+
+/* This is a helper for hardware bring-up. Don't remove. */
+struct pipe_query *
+util_begin_pipestat_query(struct pipe_context *ctx)
+{
+ struct pipe_query *q =
+ ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+ if (!q)
+ return NULL;
+
+ ctx->begin_query(ctx, q);
+ return q;
+}
+
+/* This is a helper for hardware bring-up. Don't remove. */
+void
+util_end_pipestat_query(struct pipe_context *ctx, struct pipe_query *q,
+ FILE *f)
+{
+ static unsigned counter;
+ struct pipe_query_data_pipeline_statistics stats;
+
+ ctx->end_query(ctx, q);
+ ctx->get_query_result(ctx, q, true, (void*)&stats);
+ ctx->destroy_query(ctx, q);
+
+ fprintf(f,
+ "Draw call %u:\n"
+ " ia_vertices = %"PRIu64"\n"
+ " ia_primitives = %"PRIu64"\n"
+ " vs_invocations = %"PRIu64"\n"
+ " gs_invocations = %"PRIu64"\n"
+ " gs_primitives = %"PRIu64"\n"
+ " c_invocations = %"PRIu64"\n"
+ " c_primitives = %"PRIu64"\n"
+ " ps_invocations = %"PRIu64"\n"
+ " hs_invocations = %"PRIu64"\n"
+ " ds_invocations = %"PRIu64"\n"
+ " cs_invocations = %"PRIu64"\n",
+ p_atomic_inc_return(&counter),
+ stats.ia_vertices,
+ stats.ia_primitives,
+ stats.vs_invocations,
+ stats.gs_invocations,
+ stats.gs_primitives,
+ stats.c_invocations,
+ stats.c_primitives,
+ stats.ps_invocations,
+ stats.hs_invocations,
+ stats.ds_invocations,
+ stats.cs_invocations);
+}
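During bring-up, the intended use is to bracket a single suspicious draw and dump its counters; a minimal, temporary instrumentation sketch:

static void
foo_debug_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
   struct pipe_query *q = util_begin_pipestat_query(ctx);

   ctx->draw_vbo(ctx, info);

   /* Prints one "Draw call N:" block with all pipeline statistics. */
   if (q)
      util_end_pipestat_query(ctx, q, stderr);
}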
+
+/* This is a helper for hardware bring-up. Don't remove. */
+void
+util_wait_for_idle(struct pipe_context *ctx)
+{
+ struct pipe_fence_handle *fence = NULL;
+
+ ctx->flush(ctx, &fence, 0);
+   ctx->screen->fence_finish(ctx->screen, NULL, fence, PIPE_TIMEOUT_INFINITE);
+   ctx->screen->fence_reference(ctx->screen, &fence, NULL);
+}
+
+void
+util_throttle_init(struct util_throttle *t, uint64_t max_mem_usage)
+{
+ t->max_mem_usage = max_mem_usage;
+}
+
+void
+util_throttle_deinit(struct pipe_screen *screen, struct util_throttle *t)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++)
+ screen->fence_reference(screen, &t->ring[i].fence, NULL);
+}
+
+static uint64_t
+util_get_throttle_total_memory_usage(struct util_throttle *t)
+{
+ uint64_t total_usage = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++)
+ total_usage += t->ring[i].mem_usage;
+ return total_usage;
+}
+
+static void
+util_dump_throttle_ring(struct util_throttle *t)
+{
+ printf("Throttle:\n");
+ for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++) {
+ printf(" ring[%u]: fence = %s, mem_usage = %"PRIu64"%s%s\n",
+ i, t->ring[i].fence ? "yes" : " no",
+ t->ring[i].mem_usage,
+ t->flush_index == i ? " [flush]" : "",
+ t->wait_index == i ? " [wait]" : "");
}
}
+
+/**
+ * Notify util_throttle that the next operation allocates memory.
+ * util_throttle tracks memory usage and waits for fences until its tracked
+ * memory usage decreases.
+ *
+ * Example:
+ * util_throttle_memory_usage(..., w*h*d*Bpp);
+ * TexSubImage(..., w, h, d, ...);
+ *
+ * This means that TexSubImage can't allocate more memory than the maximum
+ * limit set during initialization.
+ */
+void
+util_throttle_memory_usage(struct pipe_context *pipe,
+ struct util_throttle *t, uint64_t memory_size)
+{
+ (void)util_dump_throttle_ring; /* silence warning */
+
+ if (!t->max_mem_usage)
+ return;
+
+ struct pipe_screen *screen = pipe->screen;
+ struct pipe_fence_handle **fence = NULL;
+ unsigned ring_size = ARRAY_SIZE(t->ring);
+ uint64_t total = util_get_throttle_total_memory_usage(t);
+
+ /* If there is not enough memory, walk the list of fences and find
+ * the latest one that we need to wait for.
+ */
+ while (t->wait_index != t->flush_index &&
+ total && total + memory_size > t->max_mem_usage) {
+ assert(t->ring[t->wait_index].fence);
+
+ /* Release an older fence if we need to wait for a newer one. */
+ if (fence)
+ screen->fence_reference(screen, fence, NULL);
+
+ fence = &t->ring[t->wait_index].fence;
+ t->ring[t->wait_index].mem_usage = 0;
+ t->wait_index = (t->wait_index + 1) % ring_size;
+
+ total = util_get_throttle_total_memory_usage(t);
+ }
+
+ /* Wait for the fence to decrease memory usage. */
+ if (fence) {
+ screen->fence_finish(screen, pipe, *fence, PIPE_TIMEOUT_INFINITE);
+ screen->fence_reference(screen, fence, NULL);
+ }
+
+ /* Flush and get a fence if we've exhausted memory usage for the current
+ * slot.
+ */
+ if (t->ring[t->flush_index].mem_usage &&
+ t->ring[t->flush_index].mem_usage + memory_size >
+ t->max_mem_usage / (ring_size / 2)) {
+ struct pipe_fence_handle **fence =
+ &t->ring[t->flush_index].fence;
+
+ /* Expect that the current flush slot doesn't have a fence yet. */
+ assert(!*fence);
+
+ pipe->flush(pipe, fence, PIPE_FLUSH_ASYNC);
+ t->flush_index = (t->flush_index + 1) % ring_size;
+
+ /* Vacate the next slot if it's occupied. This should be rare. */
+ if (t->flush_index == t->wait_index) {
+ struct pipe_fence_handle **fence =
+ &t->ring[t->wait_index].fence;
+
+ t->ring[t->wait_index].mem_usage = 0;
+ t->wait_index = (t->wait_index + 1) % ring_size;
+
+ assert(*fence);
+ screen->fence_finish(screen, pipe, *fence, PIPE_TIMEOUT_INFINITE);
+ screen->fence_reference(screen, fence, NULL);
+ }
+
+ assert(!t->ring[t->flush_index].mem_usage);
+ assert(!t->ring[t->flush_index].fence);
+ }
+
+ t->ring[t->flush_index].mem_usage += memory_size;
+}
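Tying the throttle API together, a hedged sketch of a driver texture-upload path; struct foo_context, its throttle member and the 512 MB budget are illustrative assumptions, not part of this patch:

static void
foo_context_init_throttle(struct foo_context *foo)
{
   /* Allow roughly 512 MB of in-flight upload memory before stalling. */
   util_throttle_init(&foo->throttle, 512ull * 1024 * 1024);
}

static void
foo_texture_subdata(struct pipe_context *pipe, unsigned width, unsigned height,
                    unsigned depth, unsigned bytes_per_pixel, const void *data)
{
   struct foo_context *foo = (struct foo_context *)pipe;

   /* Account for the staging allocation up front so the throttle can wait
    * on older upload fences when the budget would be exceeded. */
   util_throttle_memory_usage(pipe, &foo->throttle,
                              (uint64_t)width * height * depth * bytes_per_pixel);

   /* ... allocate the staging resource and copy "data" into it ... */
}

static void
foo_context_destroy(struct pipe_context *pipe)
{
   struct foo_context *foo = (struct foo_context *)pipe;

   /* Drop any fences still referenced by the throttle ring. */
   util_throttle_deinit(pipe->screen, &foo->throttle);
}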