From 1b130c42b8dbed3a7cabaf47e2695e7db8429b56 Mon Sep 17 00:00:00 2001
From: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Date: Mon, 1 Jun 2020 14:49:44 -0400
Subject: [PATCH] zink: implement streamout and xfb handling in ntv

this translates streamout info into xfb decorations and adds some workaround
handling for spurious gl_PointSize values

partly based on patches originally written by Dave Airlie <airlied@redhat.com>

Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5163>
---
 .../drivers/zink/nir_to_spirv/nir_to_spirv.c  | 191 +++++++++++++++++-
 .../drivers/zink/nir_to_spirv/nir_to_spirv.h  |   3 +-
 src/gallium/drivers/zink/zink_compiler.c      |  48 ++++-
 src/gallium/drivers/zink/zink_compiler.h      |   3 +-
 src/gallium/drivers/zink/zink_context.c       |   4 +-
 5 files changed, 241 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
index cabc83b7022..10ff16eaff2 100644
--- a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
+++ b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
@@ -111,6 +111,10 @@ struct ntv_context {
    size_t num_regs;
 
    struct hash_table *vars; /* nir_variable -> SpvId */
+   struct hash_table *so_outputs; /* pipe_stream_output -> SpvId */
+   unsigned outputs[VARYING_SLOT_MAX];
+   const struct glsl_type *so_output_gl_types[VARYING_SLOT_MAX];
+   SpvId so_output_types[VARYING_SLOT_MAX];
 
    const SpvId *block_ids;
    size_t num_blocks;
@@ -353,7 +357,6 @@ emit_output(struct ntv_context *ctx, struct nir_variable *var)
 
 
    if (ctx->stage == MESA_SHADER_VERTEX) {
-
       unsigned slot = var->data.location;
       switch (slot) {
       HANDLE_EMIT_BUILTIN(POS, Position);
@@ -368,6 +371,10 @@ emit_output(struct ntv_context *ctx, struct nir_variable *var)
       case VARYING_SLOT_CLIP_DIST0:
          assert(glsl_type_is_array(var->type));
          spirv_builder_emit_builtin(&ctx->builder, var_id, SpvBuiltInClipDistance);
+         /* this can be as large as 2x vec4, which requires 2 slots */
+         ctx->outputs[VARYING_SLOT_CLIP_DIST1] = var_id;
+         ctx->so_output_gl_types[VARYING_SLOT_CLIP_DIST1] = var->type;
+         ctx->so_output_types[VARYING_SLOT_CLIP_DIST1] = var_type;
          break;
 
       default:
@@ -383,6 +390,9 @@ emit_output(struct ntv_context *ctx, struct nir_variable *var)
           * use driver_location for non-builtins with defined slots to avoid overlap
           */
       }
+      ctx->outputs[var->data.location] = var_id;
+      ctx->so_output_gl_types[var->data.location] = var->type;
+      ctx->so_output_types[var->data.location] = var_type;
    } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
       if (var->data.location >= FRAG_RESULT_DATA0)
          spirv_builder_emit_location(&ctx->builder, var_id,
@@ -823,6 +833,167 @@ emit_unop(struct ntv_context *ctx, SpvOp op, SpvId type, SpvId src)
    return spirv_builder_emit_unop(&ctx->builder, op, type, src);
 }
 
+/* return the intended xfb output vec type based on base type and vector size */
+static SpvId
+get_output_type(struct ntv_context *ctx, unsigned register_index, unsigned num_components)
+{
+   const struct glsl_type *out_type = ctx->so_output_gl_types[register_index];
+   enum glsl_base_type base_type = glsl_get_base_type(out_type);
+   if (base_type == GLSL_TYPE_ARRAY)
+      base_type = glsl_get_base_type(glsl_without_array(out_type));
+
+   switch (base_type) {
+   case GLSL_TYPE_BOOL:
+      return get_bvec_type(ctx, num_components);
+
+   case GLSL_TYPE_FLOAT:
+      return get_fvec_type(ctx, 32, num_components);
+
+   case GLSL_TYPE_INT:
+      return get_ivec_type(ctx, 32, num_components);
+
+   case GLSL_TYPE_UINT:
+      return get_uvec_type(ctx, 32, num_components);
+
+   default:
+      break;
+   }
+   unreachable("unknown type");
+   return 0;
+}
+
+/* for streamout create new outputs, as streamout can be done on individual components,
+   from complete outputs, so we just can't use the created packed outputs */
+static void
+emit_so_info(struct ntv_context *ctx, unsigned max_output_location,
+             const struct pipe_stream_output_info *so_info, struct pipe_stream_output_info *local_so_info)
+{
+   for (unsigned i = 0; i < local_so_info->num_outputs; i++) {
+      struct pipe_stream_output so_output = local_so_info->output[i];
+      SpvId out_type = get_output_type(ctx, so_output.register_index, so_output.num_components);
+      SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder,
+                                                      SpvStorageClassOutput,
+                                                      out_type);
+      SpvId var_id = spirv_builder_emit_var(&ctx->builder, pointer_type,
+                                            SpvStorageClassOutput);
+      char name[10];
+
+      snprintf(name, 10, "xfb%d", i);
+      spirv_builder_emit_name(&ctx->builder, var_id, name);
+      spirv_builder_emit_offset(&ctx->builder, var_id, (so_output.dst_offset * 4));
+      spirv_builder_emit_xfb_buffer(&ctx->builder, var_id, so_output.output_buffer);
+      spirv_builder_emit_xfb_stride(&ctx->builder, var_id, so_info->stride[so_output.output_buffer] * 4);
+
+      /* output location is incremented by VARYING_SLOT_VAR0 for non-builtins in vtn,
+       * so we need to ensure that the new xfb location slot doesn't conflict with any previously-emitted
+       * outputs.
+       *
+       * if there's no previous outputs that take up user slots (VAR0+) then we can start right after the
+       * glsl builtin reserved slots, otherwise we start just after the adjusted user output slot
+       */
+      uint32_t location = NTV_MIN_RESERVED_SLOTS + i;
+      if (max_output_location >= VARYING_SLOT_VAR0)
+         location = max_output_location - VARYING_SLOT_VAR0 + 1 + i;
+      assert(location < VARYING_SLOT_VAR0);
+      spirv_builder_emit_location(&ctx->builder, var_id, location);
+
+      /* note: gl_ClipDistance[4] can the 0-indexed member of VARYING_SLOT_CLIP_DIST1 here,
+       * so this is still the 0 component
+       */
+      if (so_output.start_component)
+         spirv_builder_emit_component(&ctx->builder, var_id, so_output.start_component);
+
+      uint32_t *key = ralloc_size(NULL, sizeof(uint32_t));
+      *key = (uint32_t)so_output.register_index << 2 | so_output.start_component;
+      _mesa_hash_table_insert(ctx->so_outputs, key, (void *)(intptr_t)var_id);
+
+      assert(ctx->num_entry_ifaces < ARRAY_SIZE(ctx->entry_ifaces));
+      ctx->entry_ifaces[ctx->num_entry_ifaces++] = var_id;
+   }
+}
+
+static void
+emit_so_outputs(struct ntv_context *ctx,
+                const struct pipe_stream_output_info *so_info, struct pipe_stream_output_info *local_so_info)
+{
+   SpvId loaded_outputs[VARYING_SLOT_MAX] = {};
+   for (unsigned i = 0; i < local_so_info->num_outputs; i++) {
+      uint32_t components[NIR_MAX_VEC_COMPONENTS];
+      struct pipe_stream_output so_output = local_so_info->output[i];
+      uint32_t so_key = (uint32_t) so_output.register_index << 2 | so_output.start_component;
+      struct hash_entry *he = _mesa_hash_table_search(ctx->so_outputs, &so_key);
+      assert(he);
+      SpvId so_output_var_id = (SpvId)(intptr_t)he->data;
+
+      SpvId type = get_output_type(ctx, so_output.register_index, so_output.num_components);
+      SpvId output = ctx->outputs[so_output.register_index];
+      SpvId output_type = ctx->so_output_types[so_output.register_index];
+      const struct glsl_type *out_type = ctx->so_output_gl_types[so_output.register_index];
+
+      if (!loaded_outputs[so_output.register_index])
+         loaded_outputs[so_output.register_index] = spirv_builder_emit_load(&ctx->builder, output_type, output);
+      SpvId src = loaded_outputs[so_output.register_index];
+
+      SpvId result;
+
+      for (unsigned c = 0; c < so_output.num_components; c++) {
+         components[c] = so_output.start_component + c;
+         /* this is the second half of a 2 * vec4 array */
+         if (ctx->stage == MESA_SHADER_VERTEX && so_output.register_index == VARYING_SLOT_CLIP_DIST1)
+            components[c] += 4;
+      }
+
+      /* if we're emitting a scalar or the type we're emitting matches the output's original type and we're
+       * emitting the same number of components, then we can skip any sort of conversion here
+       */
+      if (glsl_type_is_scalar(out_type) || (type == output_type && glsl_get_length(out_type) == so_output.num_components))
+         result = src;
+      else {
+         if (ctx->stage == MESA_SHADER_VERTEX && so_output.register_index == VARYING_SLOT_POS) {
+            /* gl_Position was modified by nir_lower_clip_halfz, so we need to reverse that for streamout here:
+             * 
+             * opengl gl_Position.z = (vulkan gl_Position.z * 2.0) - vulkan gl_Position.w
+             *
+             * to do this, we extract the z and w components, perform the multiply and subtract ops, then reinsert
+             */
+            uint32_t z_component[] = {2};
+            uint32_t w_component[] = {3};
+            SpvId ftype = spirv_builder_type_float(&ctx->builder, 32);
+            SpvId z = spirv_builder_emit_composite_extract(&ctx->builder, ftype, src, z_component, 1);
+            SpvId w = spirv_builder_emit_composite_extract(&ctx->builder, ftype, src, w_component, 1);
+            SpvId new_z = emit_binop(ctx, SpvOpFMul, ftype, z, spirv_builder_const_float(&ctx->builder, 32, 2.0));
+            new_z = emit_binop(ctx, SpvOpFSub, ftype, new_z, w);
+            src = spirv_builder_emit_vector_insert(&ctx->builder, type, src, new_z, 2);
+         }
+         /* OpCompositeExtract can only extract scalars for our use here */
+         if (so_output.num_components == 1) {
+            result = spirv_builder_emit_composite_extract(&ctx->builder, type, src, components, so_output.num_components);
+         } else if (glsl_type_is_vector(out_type)) {
+            /* OpVectorShuffle can select vector members into a differently-sized vector */
+            result = spirv_builder_emit_vector_shuffle(&ctx->builder, type,
+                                                             src, src,
+                                                             components, so_output.num_components);
+            result = emit_unop(ctx, SpvOpBitcast, type, result);
+         } else {
+             /* for arrays, we need to manually extract each desired member
+              * and re-pack them into the desired output type
+              */
+             for (unsigned c = 0; c < so_output.num_components; c++) {
+                uint32_t member[] = { so_output.start_component + c };
+                SpvId base_type = get_glsl_type(ctx, glsl_without_array(out_type));
+
+                if (ctx->stage == MESA_SHADER_VERTEX && so_output.register_index == VARYING_SLOT_CLIP_DIST1)
+                   member[0] += 4;
+                components[c] = spirv_builder_emit_composite_extract(&ctx->builder, base_type, src, member, 1);
+             }
+             result = spirv_builder_emit_composite_construct(&ctx->builder, type, components, so_output.num_components);
+         }
+      }
+
+      spirv_builder_emit_store(&ctx->builder, so_output_var_id, result);
+   }
+}
+
 static SpvId
 emit_binop(struct ntv_context *ctx, SpvOp op, SpvId type,
            SpvId src0, SpvId src1)
@@ -1988,7 +2159,7 @@ emit_cf_list(struct ntv_context *ctx, struct exec_list *list)
 }
 
 struct spirv_shader *
-nir_to_spirv(struct nir_shader *s)
+nir_to_spirv(struct nir_shader *s, const struct pipe_stream_output_info *so_info, struct pipe_stream_output_info *local_so_info)
 {
    struct spirv_shader *ret = NULL;
 
@@ -2061,12 +2232,17 @@ nir_to_spirv(struct nir_shader *s)
    ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                       _mesa_key_pointer_equal);
 
+   ctx.so_outputs = _mesa_hash_table_create(NULL, _mesa_hash_u32,
+                                            _mesa_key_u32_equal);
+
    nir_foreach_variable(var, &s->inputs)
       emit_input(&ctx, var);
 
    nir_foreach_variable(var, &s->outputs)
       emit_output(&ctx, var);
 
+   if (so_info)
+      emit_so_info(&ctx, util_last_bit64(s->info.outputs_written), so_info, local_so_info);
    nir_foreach_variable(var, &s->uniforms)
       emit_uniform(&ctx, var);
 
@@ -2078,6 +2254,11 @@ nir_to_spirv(struct nir_shader *s)
                                       SpvExecutionModeDepthReplacing);
    }
 
+   if (so_info && so_info->num_outputs) {
+      spirv_builder_emit_cap(&ctx.builder, SpvCapabilityTransformFeedback);
+      spirv_builder_emit_exec_mode(&ctx.builder, entry_point,
+                                   SpvExecutionModeXfb);
+   }
 
    spirv_builder_function(&ctx.builder, entry_point, type_void,
                                             SpvFunctionControlMaskNone,
@@ -2124,6 +2305,9 @@ nir_to_spirv(struct nir_shader *s)
 
    free(ctx.defs);
 
+   if (so_info)
+      emit_so_outputs(&ctx, so_info, local_so_info);
+
    spirv_builder_return(&ctx.builder); // doesn't belong here, but whatevz
    spirv_builder_function_end(&ctx.builder);
 
@@ -2154,6 +2338,9 @@ fail:
    if (ctx.vars)
       _mesa_hash_table_destroy(ctx.vars, NULL);
 
+   if (ctx.so_outputs)
+      _mesa_hash_table_destroy(ctx.so_outputs, NULL);
+
    return NULL;
 }
 
diff --git a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.h b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.h
index de767018c03..5c4e5583107 100644
--- a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.h
+++ b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.h
@@ -36,9 +36,10 @@ struct spirv_shader {
 };
 
 struct nir_shader;
+struct pipe_stream_output_info;
 
 struct spirv_shader *
-nir_to_spirv(struct nir_shader *s);
+nir_to_spirv(struct nir_shader *s, const struct pipe_stream_output_info *so_info, struct pipe_stream_output_info *local_so_info);
 
 void
 spirv_shader_delete(struct spirv_shader *s);
diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c
index 6fbbe8800a2..465056f85cb 100644
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@@ -132,15 +132,55 @@ optimize_nir(struct nir_shader *s)
    } while (progress);
 }
 
+/* check for a genuine gl_PointSize output vs one from nir_lower_point_size_mov */
+static bool
+check_psiz(struct nir_shader *s)
+{
+   nir_foreach_variable(var, &s->outputs) {
+      if (var->data.location == VARYING_SLOT_PSIZ) {
+         /* genuine PSIZ outputs will have this set */
+         return !!var->data.explicit_location;
+      }
+   }
+   return false;
+}
+
+/* semi-copied from iris */
+static void
+update_so_info(struct pipe_stream_output_info *so_info,
+               uint64_t outputs_written, bool have_psiz)
+{
+   uint8_t reverse_map[64] = {};
+   unsigned slot = 0;
+   while (outputs_written) {
+      int bit = u_bit_scan64(&outputs_written);
+      /* PSIZ from nir_lower_point_size_mov breaks stream output, so always skip it */
+      if (bit == VARYING_SLOT_PSIZ && !have_psiz)
+         continue;
+      reverse_map[slot++] = bit;
+   }
+
+   for (unsigned i = 0; i < so_info->num_outputs; i++) {
+      struct pipe_stream_output *output = &so_info->output[i];
+
+      /* Map Gallium's condensed "slots" back to real VARYING_SLOT_* enums */
+      output->register_index = reverse_map[output->register_index];
+   }
+}
+
 struct zink_shader *
-zink_compile_nir(struct zink_screen *screen, struct nir_shader *nir)
+zink_compile_nir(struct zink_screen *screen, struct nir_shader *nir,
+                 const struct pipe_stream_output_info *so_info)
 {
    struct zink_shader *ret = CALLOC_STRUCT(zink_shader);
+   bool have_psiz = false;
 
    ret->programs = _mesa_pointer_set_create(NULL);
 
    NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 1);
    NIR_PASS_V(nir, nir_lower_clip_halfz);
+   if (nir->info.stage == MESA_SHADER_VERTEX)
+      have_psiz = check_psiz(nir);
    NIR_PASS_V(nir, nir_lower_regs_to_ssa);
    optimize_nir(nir);
    NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
@@ -189,8 +229,12 @@ zink_compile_nir(struct zink_screen *screen, struct nir_shader *nir)
    }
 
    ret->info = nir->info;
+   if (so_info) {
+      memcpy(&ret->stream_output, so_info, sizeof(ret->stream_output));
+      update_so_info(&ret->stream_output, nir->info.outputs_written, have_psiz);
+   }
 
-   struct spirv_shader *spirv = nir_to_spirv(nir);
+   struct spirv_shader *spirv = nir_to_spirv(nir, so_info, so_info ? &ret->stream_output : NULL);
    assert(spirv);
 
    if (zink_debug & ZINK_DEBUG_SPIRV) {
diff --git a/src/gallium/drivers/zink/zink_compiler.h b/src/gallium/drivers/zink/zink_compiler.h
index f392c636cd3..73bdebec9dd 100644
--- a/src/gallium/drivers/zink/zink_compiler.h
+++ b/src/gallium/drivers/zink/zink_compiler.h
@@ -65,7 +65,8 @@ struct zink_shader {
 };
 
 struct zink_shader *
-zink_compile_nir(struct zink_screen *screen, struct nir_shader *nir);
+zink_compile_nir(struct zink_screen *screen, struct nir_shader *nir,
+                 const struct pipe_stream_output_info *so_info);
 
 void
 zink_shader_free(struct zink_screen *screen, struct zink_shader *shader);
diff --git a/src/gallium/drivers/zink/zink_context.c b/src/gallium/drivers/zink/zink_context.c
index a30f1fa6946..05702ada104 100644
--- a/src/gallium/drivers/zink/zink_context.c
+++ b/src/gallium/drivers/zink/zink_context.c
@@ -296,7 +296,7 @@ zink_create_vs_state(struct pipe_context *pctx,
    else
       nir = (struct nir_shader *)shader->ir.nir;
 
-   return zink_compile_nir(zink_screen(pctx->screen), nir);
+   return zink_compile_nir(zink_screen(pctx->screen), nir, &shader->stream_output);
 }
 
 static void
@@ -332,7 +332,7 @@ zink_create_fs_state(struct pipe_context *pctx,
    else
       nir = (struct nir_shader *)shader->ir.nir;
 
-   return zink_compile_nir(zink_screen(pctx->screen), nir);
+   return zink_compile_nir(zink_screen(pctx->screen), nir, NULL);
 }
 
 static void
-- 
2.30.2