iris: add conditional render support
[mesa.git] / src / gallium / drivers / iris / iris_query.c
index a0df8c6504d1fea80a58ad7b3ca4c5085183916e..27126d7f7bc7e4e2f9ece377d42ea2a4c06288ed 100644 (file)
@@ -98,6 +98,8 @@ struct iris_query {
 
    struct iris_bo *bo;
    struct iris_query_snapshots *map;
+
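+   /* Which batch this query's commands are emitted into (render or compute) */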
+   int batch_idx;
 };
 
 struct iris_query_snapshots {
@@ -106,6 +108,14 @@ struct iris_query_snapshots {
    uint64_t end;
 };
 
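+/**
+ * Snapshot layout for transform feedback overflow queries: begin ([0]) and
+ * end ([1]) values of SO_PRIM_STORAGE_NEEDED and SO_NUM_PRIMS_WRITTEN for
+ * each vertex stream.  snapshots_landed must stay at the same offset as in
+ * iris_query_snapshots, as mark_available() uses that offset for all
+ * query types.
+ */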
+struct iris_query_so_overflow {
+   uint64_t snapshots_landed;
+   struct {
+      uint64_t prim_storage_needed[2];
+      uint64_t num_prims[2];
+   } stream[4];
+};
+
 /**
  * Is this type of query written by PIPE_CONTROL?
  */
@@ -129,7 +139,7 @@ iris_is_query_pipelined(struct iris_query *q)
 static void
 mark_available(struct iris_context *ice, struct iris_query *q)
 {
-   struct iris_batch *batch = &ice->render_batch;
+   struct iris_batch *batch = &ice->batches[q->batch_idx];
    unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
    unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
 
@@ -162,7 +172,7 @@ iris_pipelined_write(struct iris_batch *batch,
 static void
 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
 {
-   struct iris_batch *batch = &ice->render_batch;
+   struct iris_batch *batch = &ice->batches[q->batch_idx];
    const struct gen_device_info *devinfo = &batch->screen->devinfo;
 
    if (!iris_is_query_pipelined(q)) {
@@ -182,7 +192,7 @@ write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
           */
          iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
       }
-      iris_pipelined_write(&ice->render_batch, q,
+      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                            PIPE_CONTROL_WRITE_DEPTH_COUNT |
                            PIPE_CONTROL_DEPTH_STALL,
                            offset);
@@ -190,7 +200,7 @@ write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
    case PIPE_QUERY_TIME_ELAPSED:
    case PIPE_QUERY_TIMESTAMP:
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      iris_pipelined_write(&ice->render_batch, q,
+      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                            PIPE_CONTROL_WRITE_TIMESTAMP,
                            offset);
       break;
@@ -229,6 +239,28 @@ write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
    }
 }
 
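+/**
+ * Write the begin/end snapshots for a transform feedback overflow query:
+ * one stream for SO_OVERFLOW_PREDICATE, all four for
+ * SO_OVERFLOW_ANY_PREDICATE.  Stall first so earlier primitives have
+ * updated the SO counters before we capture them.
+ */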
+static void
+write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
+{
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
+
+   iris_emit_pipe_control_flush(batch,
+                                PIPE_CONTROL_CS_STALL |
+                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
+   for (uint32_t i = 0; i < count; i++) {
+      int s = q->index + i;
+      int g_idx = offsetof(struct iris_query_so_overflow,
+                           stream[s].num_prims[end]);
+      int w_idx = offsetof(struct iris_query_so_overflow,
+                           stream[s].prim_storage_needed[end]);
+      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
+                                     q->bo, g_idx, false);
+      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
+                                     q->bo, w_idx, false);
+   }
+}
+
 uint64_t
 iris_timebase_scale(const struct gen_device_info *devinfo,
                     uint64_t gpu_timestamp)
@@ -246,6 +278,14 @@ iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
    }
 }
 
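+/**
+ * Returns true if the given stream overflowed during the query: more
+ * primitives needed storage than were actually written out.
+ */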
+static bool
+stream_overflowed(struct iris_query_so_overflow *so, int s)
+{
+   return (so->stream[s].prim_storage_needed[1] -
+           so->stream[s].prim_storage_needed[0]) !=
+          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
+}
+
 static void
 calculate_result_on_cpu(const struct gen_device_info *devinfo,
                         struct iris_query *q)
@@ -266,6 +306,14 @@ calculate_result_on_cpu(const struct gen_device_info *devinfo,
       q->result = iris_timebase_scale(devinfo, q->result);
       q->result &= (1ull << TIMESTAMP_BITS) - 1;
       break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      q->result = stream_overflowed((void *) q->map, q->index);
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      q->result = false;
+      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
+         q->result |= stream_overflowed((void *) q->map, i);
+      break;
    case PIPE_QUERY_OCCLUSION_COUNTER:
    case PIPE_QUERY_PRIMITIVES_GENERATED:
    case PIPE_QUERY_PRIMITIVES_EMITTED:
@@ -284,7 +332,7 @@ calculate_result_on_cpu(const struct gen_device_info *devinfo,
 static void
 gpr0_to_bool(struct iris_context *ice)
 {
-   struct iris_batch *batch = &ice->render_batch;
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 
    ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
 
@@ -302,13 +350,94 @@ gpr0_to_bool(struct iris_context *ice)
    iris_batch_emit(batch, math, sizeof(math));
 }
 
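+/**
+ * Load one stream's begin/end snapshots into CS_GPR(1..4), as inputs for
+ * the MI_MATH sequence emitted by calc_overflow_for_stream().
+ */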
+static void
+load_overflow_data_to_cs_gprs(struct iris_context *ice,
+                              struct iris_query *q,
+                              int idx)
+{
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+
+   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
+                                 offsetof(struct iris_query_so_overflow,
+                                          stream[idx].prim_storage_needed[0]));
+   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
+                                 offsetof(struct iris_query_so_overflow,
+                                          stream[idx].prim_storage_needed[1]));
+
+   ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
+                                 offsetof(struct iris_query_so_overflow,
+                                          stream[idx].num_prims[0]));
+   ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
+                                 offsetof(struct iris_query_so_overflow,
+                                          stream[idx].num_prims[1]));
+}
+
+/*
+ * Accumulate one stream's overflow status into R0 (nonzero = overflowed):
+ *
+ * R3 = R4 - R3;   primitives written during the query
+ * R1 = R2 - R1;   primitive storage needed during the query
+ * R1 = R3 - R1;   nonzero iff the two deltas differ
+ * R0 = R0 | R1;
+ */
+static void
+calc_overflow_for_stream(struct iris_context *ice)
+{
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+   static const uint32_t maths[] = {
+      MI_MATH | (17 - 2),
+      MI_ALU2(LOAD, SRCA, R4),
+      MI_ALU2(LOAD, SRCB, R3),
+      MI_ALU0(SUB),
+      MI_ALU2(STORE, R3, ACCU),
+      MI_ALU2(LOAD, SRCA, R2),
+      MI_ALU2(LOAD, SRCB, R1),
+      MI_ALU0(SUB),
+      MI_ALU2(STORE, R1, ACCU),
+      MI_ALU2(LOAD, SRCA, R3),
+      MI_ALU2(LOAD, SRCB, R1),
+      MI_ALU0(SUB),
+      MI_ALU2(STORE, R1, ACCU),
+      MI_ALU2(LOAD, SRCA, R1),
+      MI_ALU2(LOAD, SRCB, R0),
+      MI_ALU0(OR),
+      MI_ALU2(STORE, R0, ACCU),
+   };
+
+   iris_batch_emit(batch, maths, sizeof(maths));
+}
+
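+/**
+ * Compute the result of a transform feedback overflow query on the GPU:
+ * OR each relevant stream's overflow status into CS_GPR(0), then reduce
+ * it to a 0/1 boolean.
+ */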
+static void
+overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
+{
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+
+   ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
+
+   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
+      load_overflow_data_to_cs_gprs(ice, q, q->index);
+      calc_overflow_for_stream(ice);
+   } else {
+      for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
+         load_overflow_data_to_cs_gprs(ice, q, i);
+         calc_overflow_for_stream(ice);
+      }
+   }
+
+   gpr0_to_bool(ice);
+}
+
 /**
  * Calculate the result and store it to CS_GPR0.
  */
 static void
 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
 {
-   struct iris_batch *batch = &ice->render_batch;
+   struct iris_batch *batch = &ice->batches[q->batch_idx];
+
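+   /* Overflow queries are computed from four counter snapshots per stream,
+    * not a simple end - start subtraction, so handle them separately.
+    */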
+   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+      overflow_result_to_gpr0(ice, q);
+      return;
+   }
 
    ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                  offsetof(struct iris_query_snapshots, start));
@@ -339,6 +468,10 @@ iris_create_query(struct pipe_context *ctx,
    q->type = query_type;
    q->index = index;
 
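+   /* Pipeline statistics index 10 is compute shader invocations, the one
+    * counter written by the compute batch rather than the render batch.
+    */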
+   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == 10)
+      q->batch_idx = IRIS_BATCH_COMPUTE;
+   else
+      q->batch_idx = IRIS_BATCH_RENDER;
+
    return (struct pipe_query *) q;
 }
 
@@ -377,7 +510,11 @@ iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
       ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
    }
 
-   write_value(ice, q, offsetof(struct iris_query_snapshots, start));
+   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+      write_overflow_values(ice, q, false);
+   else
+      write_value(ice, q, offsetof(struct iris_query_snapshots, start));
 
    return true;
 }
@@ -399,7 +536,11 @@ iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
       ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
    }
 
-   write_value(ice, q, offsetof(struct iris_query_snapshots, end));
+   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+      write_overflow_values(ice, q, true);
+   else
+      write_value(ice, q, offsetof(struct iris_query_snapshots, end));
    mark_available(ice, q);
 
    return true;
@@ -417,8 +558,8 @@ iris_get_query_result(struct pipe_context *ctx,
    const struct gen_device_info *devinfo = &screen->devinfo;
 
    if (!q->ready) {
-      if (iris_batch_references(&ice->render_batch, q->bo))
-         iris_batch_flush(&ice->render_batch);
+      if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
+         iris_batch_flush(&ice->batches[q->batch_idx]);
 
       if (!q->map->snapshots_landed) {
          if (wait)
@@ -487,11 +628,14 @@ iris_get_query_result_resource(struct pipe_context *ctx,
 {
    struct iris_context *ice = (void *) ctx;
    struct iris_query *q = (void *) query;
-   struct iris_batch *batch = &ice->render_batch;
+   struct iris_batch *batch = &ice->batches[q->batch_idx];
    const struct gen_device_info *devinfo = &batch->screen->devinfo;
+   struct iris_resource *res = (void *) p_res;
    unsigned snapshots_landed_offset =
       offsetof(struct iris_query_snapshots, snapshots_landed);
 
+   res->bind_history |= PIPE_BIND_QUERY_BUFFER;
+
    if (index == -1) {
       /* They're asking for the availability of the result.  If we still
        * have commands queued up which produce the result, submit them
@@ -580,6 +724,103 @@ iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
                        IRIS_DIRTY_WM;
 }
 
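+/* The query result is known on the CPU, so the predicate reduces to a
+ * static render/don't-render state with no GPU work.
+ */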
+static void
+set_predicate_enable(struct iris_context *ice,
+                     bool value)
+{
+   if (value)
+      ice->predicate = IRIS_PREDICATE_STATE_RENDER;
+   else
+      ice->predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
+}
+
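+/* Recompute the overflow result into CS_GPR(0) on the GPU, then move it
+ * into MI_PREDICATE_SRC0 for comparison against zero.
+ */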
+static void
+set_predicate_for_overflow(struct iris_context *ice,
+                           struct iris_query *q)
+{
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+   ice->predicate = IRIS_PREDICATE_STATE_USE_BIT;
+
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+    * command when loading the values into the predicate source registers for
+    * conditional rendering.
+    */
+   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
+
+   overflow_result_to_gpr0(ice, q);
+   ice->vtbl.load_register_reg64(batch, CS_GPR(0), MI_PREDICATE_SRC0);
+   ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
+}
+
+static void
+set_predicate_for_occlusion(struct iris_context *ice,
+                            struct iris_query *q)
+{
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+   ice->predicate = IRIS_PREDICATE_STATE_USE_BIT;
+
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+    * command when loading the values into the predicate source registers for
+    * conditional rendering.
+    */
+   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
+
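+   /* Load the start/end depth-count snapshots; MI_PREDICATE compares them,
+    * and equal snapshots mean no samples passed.
+    */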
+   ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
+                                 offsetof(struct iris_query_snapshots, start));
+   ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
+                                 offsetof(struct iris_query_snapshots, end));
+}
+
+static void
+set_predicate_for_result(struct iris_context *ice,
+                         struct iris_query *q,
+                         bool condition)
+{
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+   int load_op;
+
+   switch (q->type) {
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      set_predicate_for_overflow(ice, q);
+      break;
+   default:
+      set_predicate_for_occlusion(ice, q);
+      break;
+   }
+
+   if (ice->predicate == IRIS_PREDICATE_STATE_USE_BIT) {
+      if (condition)
+         load_op = MI_PREDICATE_LOADOP_LOAD;
+      else
+         load_op = MI_PREDICATE_LOADOP_LOADINV;
+
+      /* COMPAREOP_SRCS_EQUAL sets the predicate from (SRC0 == SRC1), and
+       * LOADOP_LOAD vs. LOADINV selects the sense of the comparison.
+       */
+      uint32_t predicate = MI_PREDICATE | load_op |
+                           MI_PREDICATE_COMBINEOP_SET |
+                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
+   }
+}
+
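+/* Note that the 'mode' hint (wait / no-wait, by-region) is currently
+ * unused here.
+ */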
+static void
+iris_render_condition(struct pipe_context *ctx,
+                      struct pipe_query *query,
+                      boolean condition,
+                      enum pipe_render_cond_flag mode)
+{
+   struct iris_context *ice = (void *) ctx;
+   struct iris_query *q = (void *) query;
+
+   if (!q) {
+      ice->predicate = IRIS_PREDICATE_STATE_RENDER;
+      return;
+   }
+
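+   /* If the result already landed, decide on the CPU; otherwise compute
+    * the predicate on the GPU.
+    */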
+   if (q->result || q->ready)
+      set_predicate_enable(ice, (q->result != 0) ^ condition);
+   else
+      set_predicate_for_result(ice, q, condition);
+}
+
 void
 iris_init_query_functions(struct pipe_context *ctx)
 {
@@ -590,4 +831,5 @@ iris_init_query_functions(struct pipe_context *ctx)
    ctx->get_query_result = iris_get_query_result;
    ctx->get_query_result_resource = iris_get_query_result_resource;
    ctx->set_active_query_state = iris_set_active_query_state;
+   ctx->render_condition = iris_render_condition;
 }