lima/ppir: enable vectorize optimization

[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw.c
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c

index 91254bedf1e19eb53a6113e1edc7e8c96145af02..672b3e10eedec86c7c1fa997034475521ebb5077 100644 (file)
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -25,13 +25,9 @@
  
  #include "nvc0/nvc0_context.h"
  #include "nvc0/nvc0_query_hw.h"
+#include "nvc0/nvc0_query_hw_metric.h"
  #include "nvc0/nvc0_query_hw_sm.h"
  
-#define NVC0_HW_QUERY_STATE_READY   0
-#define NVC0_HW_QUERY_STATE_ACTIVE  1
-#define NVC0_HW_QUERY_STATE_ENDED   2
-#define NVC0_HW_QUERY_STATE_FLUSHED 3
-
  #define NVC0_HW_QUERY_ALLOC_SPACE 256
  
  bool
@@ -115,12 +111,34 @@ static void
  nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
  {
     struct nvc0_hw_query *hq = nvc0_hw_query(q);
+   /* Destroys a HW query: prefer the query's own destroy hook, else do the generic teardown below. */
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nvc0, hq);
+      return; /* generic teardown is skipped; presumably the hook releases hq itself -- confirm */
+   }
+   /* Generic path: call the allocator with size 0, drop the fence reference, then free hq. */
     nvc0_hw_query_allocate(nvc0, q, 0);
     nouveau_fence_ref(NULL, &hq->fence);
     FREE(hq);
  }
  
-static boolean
+static void
+nvc0_hw_query_write_compute_invocations(struct nvc0_context *nvc0,
+                                        struct nvc0_hw_query *hq,
+                                        uint32_t offset)
+{ /* Emits a COMPUTE_COUNTER_TO_QUERY macro call targeting hq's buffer at hq->offset + offset. */
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   nouveau_pushbuf_space(push, 16, 0, 8); /* reserve pushbuf space before emitting the macro call */
+   PUSH_REFN(push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR); /* query buffer is referenced for writing */
+   BEGIN_1IC0(push, NVC0_3D(MACRO_COMPUTE_COUNTER_TO_QUERY), 4); /* followed by 4 data words */
+   PUSH_DATA (push, nvc0->compute_invocations); /* counter tracked in the context (presumably low word) */
+   PUSH_DATAh(push, nvc0->compute_invocations); /* same counter (presumably high word) */
+   PUSH_DATAh(push, hq->bo->offset + hq->offset + offset); /* destination address (presumably high word) */
+   PUSH_DATA (push, hq->bo->offset + hq->offset + offset); /* destination address (presumably low word) */
+}
+
+static bool
  nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
  {
     struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -150,14 +168,19 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
     switch (q->type) {
     case PIPE_QUERY_OCCLUSION_COUNTER:
     case PIPE_QUERY_OCCLUSION_PREDICATE:
-      hq->nesting = nvc0->screen->num_occlusion_queries_active++;
-      if (hq->nesting) {
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      if (nvc0->screen->num_occlusion_queries_active++) {
           nvc0_hw_query_get(push, q, 0x10, 0x0100f002);
        } else {
           PUSH_SPACE(push, 3);
           BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
           PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
           IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
+         /* Given that the counter is reset, the contents at 0x10 are
+          * equivalent to doing the query -- we would get hq->sequence as the
+          * payload and 0 as the reported value. This is already set up above
+          * as in the hq->rotate case.
+          */
        }
        break;
     case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -173,6 +196,10 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
     case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
        nvc0_hw_query_get(push, q, 0x10, 0x03005002 | (q->index << 5));
        break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      /* XXX: This get actually writes the number of overflowed streams */
+      nvc0_hw_query_get(push, q, 0x10, 0x0f005002);
+      break;
     case PIPE_QUERY_TIME_ELAPSED:
        nvc0_hw_query_get(push, q, 0x10, 0x00005002);
        break;
@@ -187,6 +214,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
        nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
        nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
        nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
+      nvc0_hw_query_write_compute_invocations(nvc0, hq, 0xc0 + 0xa0);
        break;
     default:
        break;
@@ -217,6 +245,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
     switch (q->type) {
     case PIPE_QUERY_OCCLUSION_COUNTER:
     case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
        nvc0_hw_query_get(push, q, 0, 0x0100f002);
        if (--nvc0->screen->num_occlusion_queries_active == 0) {
           PUSH_SPACE(push, 1);
@@ -234,10 +263,11 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
        nvc0_hw_query_get(push, q, 0x10, 0x06805002 | (q->index << 5));
        break;
     case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      /* TODO: How do we sum over all streams for render condition ? */
-      /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
        nvc0_hw_query_get(push, q, 0x00, 0x03005002 | (q->index << 5));
-      nvc0_hw_query_get(push, q, 0x20, 0x00005002);
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      /* XXX: This get actually writes the number of overflowed streams */
+      nvc0_hw_query_get(push, q, 0x00, 0x0f005002);
        break;
     case PIPE_QUERY_TIMESTAMP:
     case PIPE_QUERY_TIME_ELAPSED:
@@ -257,6 +287,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
        nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
        nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
        nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
+      nvc0_hw_query_write_compute_invocations(nvc0, hq, 0xa0);
        break;
     case PIPE_QUERY_TIMESTAMP_DISJOINT:
        /* This query is not issued on GPU because disjoint is forced to false */
@@ -273,9 +304,9 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
        nouveau_fence_ref(nvc0->screen->base.fence.current, &hq->fence);
  }
  
-static boolean
+static bool
  nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
-                         boolean wait, union pipe_query_result *result)
+                         bool wait, union pipe_query_result *result)
  {
     struct nvc0_hw_query *hq = nvc0_hw_query(q);
     uint64_t *res64 = (uint64_t*)result;
@@ -313,6 +344,7 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
        res64[0] = hq->data[1] - hq->data[5];
        break;
     case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
        res8[0] = hq->data[1] != hq->data[5];
        break;
     case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
@@ -324,6 +356,7 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
        res64[1] = data64[2] - data64[6];
        break;
     case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
        res8[0] = data64[0] != data64[2];
        break;
     case PIPE_QUERY_TIMESTAMP:
@@ -337,7 +370,7 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
        res64[0] = data64[1] - data64[3];
        break;
     case PIPE_QUERY_PIPELINE_STATISTICS:
-      for (i = 0; i < 10; ++i)
+      for (i = 0; i < 11; ++i)
           res64[i] = data64[i * 2] - data64[24 + i * 2];
        break;
     case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
@@ -351,11 +384,142 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
     return true;
  }
  
+/* Write a query result, or its availability, into a buffer resource.
+ *
+ * nvc0:        context whose pushbuf receives the commands
+ * q:           query to read; must not provide its own get_query_result hook
+ * wait:        if true and the query is not ready yet, a FIFO wait is emitted
+ *              before the result is written
+ * result_type: i32/u32 results are clamped by the macro, 64-bit results are
+ *              written unclamped (predicate queries are clamped to 1)
+ * index:       -1 writes whether the query is ready; otherwise selects the
+ *              value to write (must be 0 except for SO/pipeline statistics)
+ * resource:    destination buffer
+ * offset:      byte offset of the destination inside resource
+ */
+static void
+nvc0_hw_get_query_result_resource(struct nvc0_context *nvc0,
+                                  struct nvc0_query *q,
+                                  bool wait,
+                                  enum pipe_query_value_type result_type,
+                                  int index,
+                                  struct pipe_resource *resource,
+                                  unsigned offset)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+   struct nv04_resource *buf = nv04_resource(resource);
+   unsigned qoffset = 0, stride;
+
+   assert(!hq->funcs || !hq->funcs->get_query_result);
+
+   if (index == -1) {
+      /* TODO: Use a macro to write the availability of the query */
+      if (hq->state != NVC0_HW_QUERY_STATE_READY)
+         nvc0_hw_query_update(nvc0->screen->base.client, q);
+      /* Second word stays 0 so the same array serves the 64-bit case. */
+      uint32_t ready[2] = {hq->state == NVC0_HW_QUERY_STATE_READY};
+      nvc0->base.push_cb(&nvc0->base, buf, offset,
+                         result_type >= PIPE_QUERY_TYPE_I64 ? 2 : 1,
+                         ready);
+
+      util_range_add(&buf->valid_buffer_range, offset,
+                     offset + (result_type >= PIPE_QUERY_TYPE_I64 ? 8 : 4));
+
+      nvc0_resource_validate(buf, NOUVEAU_BO_WR);
+
+      return;
+   }
+
+   /* If the fence guarding this query has not been emitted, that makes a lot
+    * of the following logic more complicated.
+    */
+   if (hq->is64bit && hq->fence->state < NOUVEAU_FENCE_STATE_EMITTED)
+      nouveau_fence_emit(hq->fence);
+
+   /* We either need to compute a 32- or 64-bit difference between 2 values,
+    * and then store the result as either a 32- or 64-bit value. As such let's
+    * treat all inputs as 64-bit (and just push an extra 0 for the 32-bit
+    * ones), and have one macro that clamps result to i32, u32, or just
+    * outputs the difference (no need to worry about 64-bit clamping).
+    */
+   if (hq->state != NVC0_HW_QUERY_STATE_READY)
+      nvc0_hw_query_update(nvc0->screen->base.client, q);
+
+   if (wait && hq->state != NVC0_HW_QUERY_STATE_READY)
+      nvc0_hw_query_fifo_wait(nvc0, q);
+
+   /* Reserve everything the macro call needs up front: the data words, the
+    * two buffer references, and the up to three IB entries queued by the
+    * nouveau_pushbuf_data() calls below (two value reads plus one sequence
+    * read). Those entries have to be accounted for here because nothing
+    * else in this function reserves them.
+    */
+   nouveau_pushbuf_space(push, 32, 2, 3);
+   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   PUSH_REFN (push, buf->bo, buf->domain | NOUVEAU_BO_WR);
+   BEGIN_1IC0(push, NVC0_3D(MACRO_QUERY_BUFFER_WRITE), 9);
+
+   /* Macro argument 1: clamp value (0 means "do not clamp"). */
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: /* XXX what if 64-bit? */
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      PUSH_DATA(push, 0x00000001);
+      break;
+   default:
+      if (result_type == PIPE_QUERY_TYPE_I32)
+         PUSH_DATA(push, 0x7fffffff);
+      else if (result_type == PIPE_QUERY_TYPE_U32)
+         PUSH_DATA(push, 0xffffffff);
+      else
+         PUSH_DATA(push, 0x00000000);
+      break;
+   }
+
+   /* stride is the distance, in 16-byte slots, between the "end" value and
+    * the matching "begin" value inside the query buffer.
+    */
+   switch (q->type) {
+   case PIPE_QUERY_SO_STATISTICS:
+      stride = 2;
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      stride = 12;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+   case PIPE_QUERY_TIMESTAMP:
+      qoffset = 8;
+      /* fallthrough */
+   default:
+      assert(index == 0);
+      stride = 1;
+      break;
+   }
+
+   /* Macro arguments 2-5: the two 64-bit operands whose difference is
+    * written. They are fetched straight from the query buffer via IB entries.
+    */
+   if (hq->is64bit || qoffset) {
+      nouveau_pushbuf_data(push, hq->bo, hq->offset + qoffset + 16 * index,
+                           8 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+      if (q->type == PIPE_QUERY_TIMESTAMP) {
+         /* A timestamp has no "begin" value: subtract 0. */
+         PUSH_DATA(push, 0);
+         PUSH_DATA(push, 0);
+      } else {
+         nouveau_pushbuf_data(push, hq->bo, hq->offset + qoffset +
+                              16 * (index + stride),
+                              8 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+      }
+   } else {
+      /* 32-bit values: fetch one word each and push 0 as the upper word. */
+      nouveau_pushbuf_data(push, hq->bo, hq->offset + 4,
+                           4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+      PUSH_DATA(push, 0);
+      nouveau_pushbuf_data(push, hq->bo, hq->offset + 16 + 4,
+                           4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+      PUSH_DATA(push, 0);
+   }
+
+   /* Macro arguments 6-7: expected sequence and the sequence currently in
+    * memory. Both are 0 when the result is already known to be (or will be
+    * waited to be) available.
+    */
+   if (wait || hq->state == NVC0_HW_QUERY_STATE_READY) {
+      PUSH_DATA(push, 0);
+      PUSH_DATA(push, 0);
+   } else if (hq->is64bit) {
+      PUSH_DATA(push, hq->fence->sequence);
+      nouveau_pushbuf_data(push, nvc0->screen->fence.bo, 0,
+                           4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+   } else {
+      PUSH_DATA(push, hq->sequence);
+      nouveau_pushbuf_data(push, hq->bo, hq->offset,
+                           4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+   }
+   /* Macro arguments 8-9: destination address. */
+   PUSH_DATAh(push, buf->address + offset);
+   PUSH_DATA (push, buf->address + offset);
+
+   util_range_add(&buf->valid_buffer_range, offset,
+                  offset + (result_type >= PIPE_QUERY_TYPE_I64 ? 8 : 4));
+
+   nvc0_resource_validate(buf, NOUVEAU_BO_WR);
+}
+
  static const struct nvc0_query_funcs hw_query_funcs = {
     .destroy_query = nvc0_hw_destroy_query,
     .begin_query = nvc0_hw_begin_query,
     .end_query = nvc0_hw_end_query,
     .get_query_result = nvc0_hw_get_query_result,
+   .get_query_result_resource = nvc0_hw_get_query_result_resource, /* writes the result into a buffer resource */
  };
  
  struct nvc0_query *
@@ -371,6 +535,12 @@ nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index)
        return (struct nvc0_query *)hq;
     }
  
+   hq = nvc0_hw_metric_create_query(nvc0, type);
+   if (hq) {
+      hq->base.funcs = &hw_query_funcs;
+      return (struct nvc0_query *)hq;
+   }
+
     hq = CALLOC_STRUCT(nvc0_hw_query);
     if (!hq)
        return NULL;
@@ -378,10 +548,12 @@ nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index)
     q = &hq->base;
     q->funcs = &hw_query_funcs;
     q->type = type;
+   q->index = index;
  
     switch (q->type) {
     case PIPE_QUERY_OCCLUSION_COUNTER:
     case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
        hq->rotate = 32;
        space = NVC0_HW_QUERY_ALLOC_SPACE;
        break;
@@ -390,14 +562,14 @@ nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index)
        space = 512;
        break;
     case PIPE_QUERY_SO_STATISTICS:
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
        hq->is64bit = true;
        space = 64;
        break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
     case PIPE_QUERY_PRIMITIVES_GENERATED:
     case PIPE_QUERY_PRIMITIVES_EMITTED:
        hq->is64bit = true;
-      q->index = index;
        space = 32;
        break;
     case PIPE_QUERY_TIME_ELAPSED:
@@ -435,14 +607,20 @@ int
  nvc0_hw_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
                                struct pipe_driver_query_info *info)
  {
-   int num_hw_sm_queries = 0;
+   int num_hw_sm_queries = 0, num_hw_metric_queries = 0; /* per-backend query counts */
  
     num_hw_sm_queries = nvc0_hw_sm_get_driver_query_info(screen, 0, NULL);
+   num_hw_metric_queries =
+      nvc0_hw_metric_get_driver_query_info(screen, 0, NULL); /* NULL info: the return value is used as the metric query count */
  
     if (!info)
-      return num_hw_sm_queries;
+      return num_hw_sm_queries + num_hw_metric_queries; /* no info requested: report the combined count */
  
-   return nvc0_hw_sm_get_driver_query_info(screen, id, info);
+   if (id < num_hw_sm_queries) /* ids below the SM count go to the SM backend unchanged */
+      return nvc0_hw_sm_get_driver_query_info(screen, id, info);
+   /* Remaining ids go to the metric backend, rebased by the SM query count. */
+   return nvc0_hw_metric_get_driver_query_info(screen,
+                                               id - num_hw_sm_queries, info);
  }
  
  void
@@ -451,28 +629,34 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
  {
     struct nvc0_hw_query *hq = nvc0_hw_query(q);
  
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-
     PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
-   nouveau_pushbuf_space(push, 0, 0, 1);
     nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
                          NVC0_IB_ENTRY_1_NO_PREFETCH);
  }
  
  void
-nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q)
+nvc0_hw_query_fifo_wait(struct nvc0_context *nvc0, struct nvc0_query *q)
  {
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf; /* the semaphore-acquire commands below are emitted here */
     struct nvc0_hw_query *hq = nvc0_hw_query(q);
     unsigned offset = hq->offset;
  
-   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20;
+   /* 64-bit queries are waited on through their fence (see below), so make sure that fence has been emitted first */
+   if (hq->is64bit && hq->fence->state < NOUVEAU_FENCE_STATE_EMITTED)
+      nouveau_fence_emit(hq->fence);
  
     PUSH_SPACE(push, 5);
     PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
     BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
-   PUSH_DATAh(push, hq->bo->offset + offset);
-   PUSH_DATA (push, hq->bo->offset + offset);
-   PUSH_DATA (push, hq->sequence);
+   if (hq->is64bit) { /* semaphore = start of the screen's fence buffer, compared against the query's fence sequence */
+      PUSH_DATAh(push, nvc0->screen->fence.bo->offset);
+      PUSH_DATA (push, nvc0->screen->fence.bo->offset);
+      PUSH_DATA (push, hq->fence->sequence);
+   } else { /* semaphore = the query's own buffer at hq->offset, compared against hq->sequence */
+      PUSH_DATAh(push, hq->bo->offset + offset);
+      PUSH_DATA (push, hq->bo->offset + offset);
+      PUSH_DATA (push, hq->sequence);
+   }
     PUSH_DATA (push, (1 << 12) |
-              NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+              NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_GEQUAL); /* per the constant name: acquire on >= rather than == */
  }