gallivm/nir: allow 64-bit arit ops
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_nir_soa.c
index 7ac9697011694834bfbe09745d42c061b13cc43c..b83acd0b58fda00a4ca07fef64ed5c917194307d 100644 (file)
@@ -69,8 +69,13 @@ emit_fetch_64bit(
    assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32)));
 
    for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
+#if UTIL_ARCH_LITTLE_ENDIAN
       shuffles[i] = lp_build_const_int32(gallivm, i / 2);
       shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length);
+#else
+      shuffles[i] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length);
+      shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2);
+#endif
    }
    res = LLVMBuildShuffleVector(builder, input, input2, LLVMConstVector(shuffles, len), "");
 
@@ -78,40 +83,54 @@ emit_fetch_64bit(
 }
 
 static void
-emit_store_64bit_chan(struct lp_build_nir_context *bld_base,
-                      LLVMValueRef chan_ptr,
-                      LLVMValueRef chan_ptr2,
-                      LLVMValueRef value)
+emit_store_64bit_split(struct lp_build_nir_context *bld_base, /* de-interleave a vector into two vectors of 32-bit float elements */
+                       LLVMValueRef value, /* in: bitcast below to a float vector of 2 * type.length elements */
+                       LLVMValueRef split_values[2]) /* out: [0] picks the lanes in shuffles[], [1] the lanes in shuffles2[] */
 {
-   struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
-   struct lp_build_context *float_bld = &bld_base->base;
    unsigned i;
-   LLVMValueRef temp, temp2;
    LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32];
    LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32];
    int len = bld_base->base.type.length * 2;
 
    value = LLVMBuildBitCast(gallivm->builder, value, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), len), "");
    for (i = 0; i < bld_base->base.type.length; i++) {
+#if UTIL_ARCH_LITTLE_ENDIAN /* little-endian: even 32-bit lanes go to split_values[0], odd lanes to split_values[1] */
       shuffles[i] = lp_build_const_int32(gallivm, i * 2);
       shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1);
+#else /* otherwise the selection is swapped: odd lanes go to [0], even lanes to [1] */
+      shuffles[i] = lp_build_const_int32(gallivm, i * 2 + 1);
+      shuffles2[i] = lp_build_const_int32(gallivm, i * 2);
+#endif
    }
 
-   temp = LLVMBuildShuffleVector(builder, value,
+   split_values[0] = LLVMBuildShuffleVector(builder, value, /* second shuffle operand is undef; only lanes of value are selected */
                                  LLVMGetUndef(LLVMTypeOf(value)),
                                  LLVMConstVector(shuffles,
                                                  bld_base->base.type.length),
                                  "");
-   temp2 = LLVMBuildShuffleVector(builder, value,
+   split_values[1] = LLVMBuildShuffleVector(builder, value, /* same shuffle, using the complementary lane list */
                                   LLVMGetUndef(LLVMTypeOf(value)),
                                   LLVMConstVector(shuffles2,
                                                   bld_base->base.type.length),
                                   "");
+}
 
-   lp_exec_mask_store(&bld->exec_mask, float_bld, temp, chan_ptr);
-   lp_exec_mask_store(&bld->exec_mask, float_bld, temp2, chan_ptr2);
+static void /* split a 64-bit value with emit_store_64bit_split() and store each half via lp_exec_mask_store() */
+emit_store_64bit_chan(struct lp_build_nir_context *bld_base,
+                      LLVMValueRef chan_ptr, /* destination for split_vals[0] */
+                      LLVMValueRef chan_ptr2, /* destination for split_vals[1] */
+                      LLVMValueRef value) /* value handed unchanged to emit_store_64bit_split() */
+{
+   struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
+   struct lp_build_context *float_bld = &bld_base->base;
+   LLVMValueRef split_vals[2];
+
+   emit_store_64bit_split(bld_base, value, split_vals);
+
+   lp_exec_mask_store(&bld->exec_mask, float_bld, split_vals[0], chan_ptr); /* both stores go through bld->exec_mask */
+   lp_exec_mask_store(&bld->exec_mask, float_bld, split_vals[1], chan_ptr2);
 }
 
 static LLVMValueRef
@@ -280,22 +299,35 @@ static void emit_load_var(struct lp_build_nir_context *bld_base,
                            unsigned bit_size,
                            nir_variable *var,
                            unsigned vertex_index,
+                           LLVMValueRef indir_vertex_index,
                            unsigned const_index,
                            LLVMValueRef indir_index,
-                           LLVMValueRef result[4])
+                           LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    int dmul = bit_size == 64 ? 2 : 1;
+   unsigned location = var->data.driver_location;
+   unsigned location_frac = var->data.location_frac;
+
+   if (!var->data.compact && !indir_index)
+      location += const_index;
+   else if (var->data.compact) {
+      location += const_index / 4;
+      location_frac += const_index % 4;
+      const_index = 0;
+   }
    switch (deref_mode) {
-   case nir_var_shader_in: {
+   case nir_var_shader_in:
       for (unsigned i = 0; i < num_components; i++) {
-         int idx = (i * dmul) + var->data.location_frac;
+         int idx = (i * dmul) + location_frac;
+
          if (bld->gs_iface) {
             LLVMValueRef vertex_index_val = lp_build_const_int32(gallivm, vertex_index);
-            LLVMValueRef attrib_index_val = lp_build_const_int32(gallivm, const_index + var->data.driver_location);
+            LLVMValueRef attrib_index_val = lp_build_const_int32(gallivm, location);
             LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx);
             LLVMValueRef result2;
+
             result[i] = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base,
                                                    false, vertex_index_val, 0, attrib_index_val, swizzle_index_val);
             if (bit_size == 64) {
@@ -304,6 +336,71 @@ static void emit_load_var(struct lp_build_nir_context *bld_base,
                                                     false, vertex_index_val, 0, attrib_index_val, swizzle_index_val);
                result[i] = emit_fetch_64bit(bld_base, result[i], result2);
             }
+         } else if (bld->tes_iface) {
+            LLVMValueRef vertex_index_val = lp_build_const_int32(gallivm, vertex_index);
+            LLVMValueRef attrib_index_val;
+            LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx);
+            LLVMValueRef result2;
+
+            if (indir_index) {
+               if (var->data.compact) {
+                  swizzle_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, idx));
+                  attrib_index_val = lp_build_const_int32(gallivm, location);
+               } else
+                  attrib_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, var->data.driver_location));
+            } else
+               attrib_index_val = lp_build_const_int32(gallivm, location);
+
+            if (var->data.patch) {
+               result[i] = bld->tes_iface->fetch_patch_input(bld->tes_iface, &bld_base->base,
+                                                             indir_index ? true : false, attrib_index_val, swizzle_index_val);
+               if (bit_size == 64) {
+                  LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx + 1);
+                  result2 = bld->tes_iface->fetch_patch_input(bld->tes_iface, &bld_base->base,
+                                                              indir_index ? true : false, attrib_index_val, swizzle_index_val);
+                  result[i] = emit_fetch_64bit(bld_base, result[i], result2);
+               }
+            }
+            else {
+               result[i] = bld->tes_iface->fetch_vertex_input(bld->tes_iface, &bld_base->base,
+                                                              indir_vertex_index ? true : false,
+                                                              indir_vertex_index ? indir_vertex_index : vertex_index_val,
+                                                              (indir_index && !var->data.compact) ? true : false, attrib_index_val,
+                                                              (indir_index && var->data.compact) ? true : false, swizzle_index_val);
+               if (bit_size == 64) {
+                  LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx + 1);
+                  result2 = bld->tes_iface->fetch_vertex_input(bld->tes_iface, &bld_base->base,
+                                                               indir_vertex_index ? true : false,
+                                                               indir_vertex_index ? indir_vertex_index : vertex_index_val,
+                                                               indir_index ? true : false, attrib_index_val, false, swizzle_index_val);
+                  result[i] = emit_fetch_64bit(bld_base, result[i], result2);
+               }
+            }
+         } else if (bld->tcs_iface) {
+            LLVMValueRef vertex_index_val = lp_build_const_int32(gallivm, vertex_index);
+            LLVMValueRef attrib_index_val;
+            LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx);
+
+            if (indir_index) {
+               if (var->data.compact) {
+                  swizzle_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, idx));
+                  attrib_index_val = lp_build_const_int32(gallivm, location);
+               } else
+                  attrib_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, var->data.driver_location));
+            } else
+               attrib_index_val = lp_build_const_int32(gallivm, location);
+            result[i] = bld->tcs_iface->emit_fetch_input(bld->tcs_iface, &bld_base->base,
+                                                         indir_vertex_index ? true : false, indir_vertex_index ? indir_vertex_index : vertex_index_val,
+                                                         (indir_index && !var->data.compact) ? true : false, attrib_index_val,
+                                                         (indir_index && var->data.compact) ? true : false, swizzle_index_val);
+            if (bit_size == 64) {
+               LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx + 1);
+               LLVMValueRef result2 = bld->tcs_iface->emit_fetch_input(bld->tcs_iface, &bld_base->base,
+                                                                       indir_vertex_index ? true : false, indir_vertex_index ? indir_vertex_index : vertex_index_val,
+                                                                       indir_index ? true : false, attrib_index_val,
+                                                                       false, swizzle_index_val);
+               result[i] = emit_fetch_64bit(bld_base, result[i], result2);
+            }
          } else {
             if (indir_index) {
                LLVMValueRef attrib_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, var->data.driver_location));
@@ -325,12 +422,12 @@ static void emit_load_var(struct lp_build_nir_context *bld_base,
             } else {
                if (bld->indirects & nir_var_shader_in) {
                   LLVMValueRef lindex = lp_build_const_int32(gallivm,
-                                                             var->data.driver_location * 4 + idx);
+                                                             location * 4 + idx);
                   LLVMValueRef input_ptr = lp_build_pointer_get(gallivm->builder,
                                                              bld->inputs_array, lindex);
                   if (bit_size == 64) {
                      LLVMValueRef lindex2 = lp_build_const_int32(gallivm,
-                                                                 var->data.driver_location * 4 + (idx + 1));
+                                                                 location * 4 + (idx + 1));
                      LLVMValueRef input_ptr2 = lp_build_pointer_get(gallivm->builder,
                                                                     bld->inputs_array, lindex2);
                      result[i] = emit_fetch_64bit(bld_base, input_ptr, input_ptr2);
@@ -340,17 +437,49 @@ static void emit_load_var(struct lp_build_nir_context *bld_base,
                } else {
                   if (bit_size == 64) {
                      LLVMValueRef tmp[2];
-                     tmp[0] = bld->inputs[var->data.driver_location + const_index][idx];
-                     tmp[1] = bld->inputs[var->data.driver_location + const_index][idx + 1];
+                     tmp[0] = bld->inputs[location][idx];
+                     tmp[1] = bld->inputs[location][idx + 1];
                      result[i] = emit_fetch_64bit(bld_base, tmp[0], tmp[1]);
                   } else {
-                     result[i] = bld->inputs[var->data.driver_location + const_index][idx];
+                     result[i] = bld->inputs[location][idx];
                   }
                }
             }
          }
       }
-   }
+      break;
+   case nir_var_shader_out:
+      if (bld->fs_iface && bld->fs_iface->fb_fetch) {
+         bld->fs_iface->fb_fetch(bld->fs_iface, &bld_base->base, var->data.driver_location, result);
+         return;
+      }
+      for (unsigned i = 0; i < num_components; i++) {
+         int idx = (i * dmul) + location_frac;
+         if (bld->tcs_iface) {
+            LLVMValueRef vertex_index_val = lp_build_const_int32(gallivm, vertex_index);
+            LLVMValueRef attrib_index_val;
+            LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx);
+
+            if (indir_index)
+               attrib_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, var->data.driver_location));
+            else
+               attrib_index_val = lp_build_const_int32(gallivm, location);
+
+            result[i] = bld->tcs_iface->emit_fetch_output(bld->tcs_iface, &bld_base->base,
+                                                          indir_vertex_index ? true : false, indir_vertex_index ? indir_vertex_index : vertex_index_val,
+                                                          (indir_index && !var->data.compact) ? true : false, attrib_index_val,
+                                                          (indir_index && var->data.compact) ? true : false, swizzle_index_val, 0);
+            if (bit_size == 64) {
+               LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx + 1);
+               LLVMValueRef result2 = bld->tcs_iface->emit_fetch_output(bld->tcs_iface, &bld_base->base,
+                                                                        indir_vertex_index ? true : false, indir_vertex_index ? indir_vertex_index : vertex_index_val,
+                                                                        indir_index ? true : false, attrib_index_val,
+                                                                        false, swizzle_index_val, 0);
+               result[i] = emit_fetch_64bit(bld_base, result[i], result2);
+            }
+         }
+      }
+      break;
    default:
       break;
    }
@@ -383,19 +512,88 @@ static void emit_store_chan(struct lp_build_nir_context *bld_base,
    }
 }
 
+static void emit_store_tcs_chan(struct lp_build_nir_context *bld_base, /* store one channel of an output through bld->tcs_iface->emit_store_output */
+                                bool is_compact, /* with indir_index set: true indexes the swizzle indirectly, false the attribute */
+                                unsigned bit_size, /* 64 takes the two-slot split path below */
+                                unsigned location, /* base attribute slot; bumped by one if a 64-bit swizzle wraps past 3 */
+                                unsigned const_index, /* added to location only on the constant-attribute paths */
+                                LLVMValueRef indir_vertex_index, /* NULL means the vertex-indirect flag is passed as false */
+                                LLVMValueRef indir_index, /* NULL means no indirect attribute/swizzle addressing */
+                                unsigned comp, /* component offset added to the swizzle */
+                                unsigned chan, /* channel number within the stored value */
+                                LLVMValueRef chan_val) /* value to store for this channel */
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   unsigned swizzle = chan;
+   if (bit_size == 64) { /* each 64-bit channel occupies two 32-bit swizzle slots */
+      swizzle *= 2;
+      swizzle += comp;
+      if (swizzle >= 4) { /* wrapped past the 4 slots of this location: continue in the next one */
+         swizzle -= 4;
+         location++;
+      }
+   } else
+      swizzle += comp;
+   LLVMValueRef attrib_index_val;
+   LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, swizzle);
+
+   if (indir_index) {
+      if (is_compact) { /* indirection applies to the swizzle; the attribute index stays constant */
+         swizzle_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, swizzle));
+         attrib_index_val = lp_build_const_int32(gallivm, const_index + location);
+      } else /* indirection applies to the attribute; NOTE(review): const_index is not added on this path - confirm callers pass 0 */
+         attrib_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, location));
+   } else
+      attrib_index_val = lp_build_const_int32(gallivm, const_index + location);
+   if (bit_size == 64) { /* store the two halves at swizzle and swizzle + 1 */
+      LLVMValueRef split_vals[2];
+      LLVMValueRef swizzle_index_val2 = lp_build_const_int32(gallivm, swizzle + 1);
+      emit_store_64bit_split(bld_base, chan_val, split_vals);
+      bld->tcs_iface->emit_store_output(bld->tcs_iface, &bld_base->base, 0,
+                                        indir_vertex_index ? true : false,
+                                        indir_vertex_index,
+                                        indir_index ? true : false,
+                                        attrib_index_val,
+                                        false, swizzle_index_val, /* NOTE(review): swizzle-indirect flag is always false on the 64-bit path - confirm compact + indirect never reaches here */
+                                        split_vals[0], mask_vec(bld_base));
+      bld->tcs_iface->emit_store_output(bld->tcs_iface, &bld_base->base, 0,
+                                        indir_vertex_index ? true : false,
+                                        indir_vertex_index,
+                                        indir_index ? true : false,
+                                        attrib_index_val,
+                                        false, swizzle_index_val2,
+                                        split_vals[1], mask_vec(bld_base));
+   } else {
+      chan_val = LLVMBuildBitCast(builder, chan_val, bld_base->base.vec_type, ""); /* non-64-bit values are stored as the base vector type */
+      bld->tcs_iface->emit_store_output(bld->tcs_iface, &bld_base->base, 0,
+                                        indir_vertex_index ? true : false,
+                                        indir_vertex_index,
+                                        indir_index && !is_compact ? true : false,
+                                        attrib_index_val,
+                                        indir_index && is_compact ? true : false,
+                                        swizzle_index_val,
+                                        chan_val, mask_vec(bld_base));
+   }
+}
+
 static void emit_store_var(struct lp_build_nir_context *bld_base,
                            nir_variable_mode deref_mode,
-                           unsigned bit_size,
                            unsigned num_components,
+                           unsigned bit_size,
+                           nir_variable *var,
                            unsigned writemask,
+                           LLVMValueRef indir_vertex_index,
                            unsigned const_index,
-                           nir_variable *var, LLVMValueRef dst)
+                           LLVMValueRef indir_index,
+                           LLVMValueRef dst)
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
    switch (deref_mode) {
    case nir_var_shader_out: {
-      unsigned location = var->data.driver_location + const_index;
+      unsigned location = var->data.driver_location;
       unsigned comp = var->data.location_frac;
       if (bld_base->shader->info.stage == MESA_SHADER_FRAGMENT) {
          if (var->data.location == FRAG_RESULT_STENCIL)
@@ -403,10 +601,20 @@ static void emit_store_var(struct lp_build_nir_context *bld_base,
          else if (var->data.location == FRAG_RESULT_DEPTH)
             comp = 2;
       }
+
+      if (var->data.compact) {
+         location += const_index / 4;
+         comp += const_index % 4;
+         const_index = 0;
+      }
+
       for (unsigned chan = 0; chan < num_components; chan++) {
          if (writemask & (1u << chan)) {
             LLVMValueRef chan_val = (num_components == 1) ? dst : LLVMBuildExtractValue(builder, dst, chan, "");
-            emit_store_chan(bld_base, deref_mode, bit_size, location, comp, chan, chan_val);
+            if (bld->tcs_iface) {
+               emit_store_tcs_chan(bld_base, var->data.compact, bit_size, location, const_index, indir_vertex_index, indir_index, comp, chan, chan_val);
+            } else
+               emit_store_chan(bld_base, deref_mode, bit_size, location + const_index, comp, chan, chan_val);
          }
       }
       break;
@@ -425,7 +633,7 @@ static LLVMValueRef emit_load_reg(struct lp_build_nir_context *bld_base,
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    int nc = reg->reg->num_components;
-   LLVMValueRef vals[4];
+   LLVMValueRef vals[NIR_MAX_VEC_COMPONENTS] = { NULL };
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    if (reg->reg->num_array_elems) {
       LLVMValueRef indirect_val = lp_build_const_int_vec(gallivm, uint_bld->type, reg->base_offset);
@@ -455,7 +663,7 @@ static void emit_store_reg(struct lp_build_nir_context *bld_base,
                            unsigned writemask,
                            LLVMValueRef indir_src,
                            LLVMValueRef reg_storage,
-                           LLVMValueRef dst[4])
+                           LLVMValueRef dst[NIR_MAX_VEC_COMPONENTS])
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -488,13 +696,248 @@ static void emit_store_reg(struct lp_build_nir_context *bld_base,
    }
 }
 
+static void emit_load_kernel_arg(struct lp_build_nir_context *bld_base, /* load nc kernel arguments via bld->kernel_args_ptr and broadcast each scalar */
+                                 unsigned nc, /* number of components written to result[] */
+                                 unsigned bit_size, /* element width; selects the pointer element type and size_shift */
+                                 unsigned offset_bit_size, /* width of the offset value (64 picks an int64 constant below) */
+                                 bool offset_is_uniform, /* only the true case emits loads; otherwise result[] is left untouched */
+                                 LLVMValueRef offset, /* shifted right by size_shift below; presumably a byte offset on entry - TODO confirm */
+                                 LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) /* out: one broadcast vector per component */
+{
+   struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *bld_broad = get_int_bld(bld_base, true, bit_size);
+   LLVMValueRef kernel_args_ptr = bld->kernel_args_ptr;
+   unsigned size_shift = 0;
+   struct lp_build_context *bld_offset = get_int_bld(bld_base, true, offset_bit_size);
+   if (bit_size == 16) /* size_shift = log2 of the element size in bytes; stays 0 for any other bit_size */
+      size_shift = 1;
+   else if (bit_size == 32)
+      size_shift = 2;
+   else if (bit_size == 64)
+      size_shift = 3;
+   if (size_shift)
+      offset = lp_build_shr(bld_offset, offset, lp_build_const_int_vec(gallivm, bld_offset->type, size_shift));
+
+   LLVMTypeRef ptr_type = LLVMPointerType(bld_broad->elem_type, 0);
+   kernel_args_ptr = LLVMBuildBitCast(builder, kernel_args_ptr, ptr_type, ""); /* index the argument buffer in units of bld_broad->elem_type */
+
+   if (offset_is_uniform) {
+      offset = LLVMBuildExtractElement(builder, offset, lp_build_const_int32(gallivm, 0), ""); /* uniform offset: element 0 is used as the scalar index */
+
+      for (unsigned c = 0; c < nc; c++) {
+         LLVMValueRef this_offset = LLVMBuildAdd(builder, offset, offset_bit_size == 64 ? lp_build_const_int64(gallivm, c) : lp_build_const_int32(gallivm, c), ""); /* index of component c, constant width matched to the offset */
+
+         LLVMValueRef scalar = lp_build_pointer_get(builder, kernel_args_ptr, this_offset);
+         result[c] = lp_build_broadcast_scalar(bld_broad, scalar);
+      }
+   }
+}
+
+static LLVMValueRef global_addr_to_ptr(struct gallivm_state *gallivm, LLVMValueRef addr_ptr, unsigned bit_size) /* convert an integer address to a pointer to an integer of bit_size bits */
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   switch (bit_size) {
+   case 8:
+      addr_ptr = LLVMBuildIntToPtr(builder, addr_ptr, LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
+      break;
+   case 16:
+      addr_ptr = LLVMBuildIntToPtr(builder, addr_ptr, LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0), "");
+      break;
+   case 32:
+   default: /* any bit_size other than 8, 16 or 64 is treated as 32 */
+      addr_ptr = LLVMBuildIntToPtr(builder, addr_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
+      break;
+   case 64:
+      addr_ptr = LLVMBuildIntToPtr(builder, addr_ptr, LLVMPointerType(LLVMInt64TypeInContext(gallivm->context), 0), "");
+      break;
+   }
+   return addr_ptr; /* the converted pointer replaces the incoming integer value */
+}
+
+static void emit_load_global(struct lp_build_nir_context *bld_base, /* gather nc components from per-lane addresses, one lane per loop iteration */
+                             unsigned nc, /* number of components written to outval[] */
+                             unsigned bit_size, /* element width; selects the result build context and pointer type */
+                             unsigned addr_bit_size, /* not referenced in this function */
+                             LLVMValueRef addr, /* vector of addresses; one element is extracted per lane */
+                             LLVMValueRef outval[NIR_MAX_VEC_COMPONENTS]) /* out: one result vector per component */
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   struct lp_build_context *res_bld;
+
+   res_bld = get_int_bld(bld_base, true, bit_size);
+
+   for (unsigned c = 0; c < nc; c++) {
+      LLVMValueRef result = lp_build_alloca(gallivm, res_bld->vec_type, ""); /* slot that accumulates the result vector for component c */
+
+      struct lp_build_loop_state loop_state;
+      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); /* counter starts at 0 and is used as the lane index */
+
+      LLVMValueRef addr_ptr = LLVMBuildExtractElement(gallivm->builder, addr,
+                                                      loop_state.counter, "");
+      addr_ptr = global_addr_to_ptr(gallivm, addr_ptr, bit_size);
+
+      LLVMValueRef value_ptr = lp_build_pointer_get(builder, addr_ptr, lp_build_const_int32(gallivm, c)); /* index c from this lane's pointer; NOTE(review): no exec-mask check here, unlike emit_store_global - confirm loads for inactive lanes are safe */
+
+      LLVMValueRef temp_res;
+      temp_res = LLVMBuildLoad(builder, result, "");
+      temp_res = LLVMBuildInsertElement(builder, temp_res, value_ptr, loop_state.counter, ""); /* place the fetched value in this lane of the accumulator */
+      LLVMBuildStore(builder, temp_res, result);
+      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), /* end condition: counter UGE the uint vector length */
+                             NULL, LLVMIntUGE);
+      outval[c] = LLVMBuildLoad(builder, result, "");
+   }
+}
+
+static void emit_store_global(struct lp_build_nir_context *bld_base, /* scatter the writemask-enabled components of dst to per-lane addresses, skipping lanes whose exec mask is zero */
+                              unsigned writemask, /* bit c set means component c is stored */
+                              unsigned nc, unsigned bit_size, /* component count and element width */
+                              unsigned addr_bit_size, /* not referenced in this function */
+                              LLVMValueRef addr, /* vector of addresses; one element is extracted per lane */
+                              LLVMValueRef dst) /* value to store; used directly when nc == 1 */
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+
+   for (unsigned c = 0; c < nc; c++) {
+      if (!(writemask & (1u << c))) /* component disabled by writemask */
+         continue;
+      LLVMValueRef val = (nc == 1) ? dst : LLVMBuildExtractValue(builder, dst, c, "");
+
+      LLVMValueRef exec_mask = mask_vec(bld_base);
+      struct lp_build_loop_state loop_state;
+      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); /* counter starts at 0 and is used as the lane index */
+      LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val,
+                                                       loop_state.counter, "");
+
+      LLVMValueRef addr_ptr = LLVMBuildExtractElement(gallivm->builder, addr,
+                                                      loop_state.counter, "");
+      addr_ptr = global_addr_to_ptr(gallivm, addr_ptr, bit_size);
+      switch (bit_size) { /* only 32- and 64-bit scalars are bitcast to an integer type; other sizes are stored as extracted */
+      case 32:
+         value_ptr = LLVMBuildBitCast(builder, value_ptr, LLVMInt32TypeInContext(gallivm->context), "");
+         break;
+      case 64:
+         value_ptr = LLVMBuildBitCast(builder, value_ptr, LLVMInt64TypeInContext(gallivm->context), "");
+         break;
+      default:
+         break;
+      }
+      struct lp_build_if_state ifthen;
+
+      LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); /* a lane is active when its exec-mask element is non-zero */
+      cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+      lp_build_if(&ifthen, gallivm, cond);
+      lp_build_pointer_set(builder, addr_ptr, lp_build_const_int32(gallivm, c), value_ptr); /* store at index c from this lane's pointer */
+      lp_build_endif(&ifthen);
+      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), /* end condition: counter UGE the uint vector length */
+                             NULL, LLVMIntUGE);
+   }
+}
+
+static void emit_atomic_global(struct lp_build_nir_context *bld_base, /* run a global atomic once per active lane and collect the returned scalars into *result */
+                               nir_intrinsic_op nir_op, /* selects compare-and-swap or one of the RMW operations below */
+                               unsigned addr_bit_size, /* not referenced in this function */
+                               LLVMValueRef addr, /* vector of addresses; one element is extracted per lane */
+                               LLVMValueRef val, LLVMValueRef val2, /* val: operand; val2: only read for nir_intrinsic_global_atomic_comp_swap */
+                               LLVMValueRef *result) /* out: vector loaded from atom_res after the loop */
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+
+   LLVMValueRef atom_res = lp_build_alloca(gallivm,
+                                           uint_bld->vec_type, "");
+   LLVMValueRef exec_mask = mask_vec(bld_base);
+   struct lp_build_loop_state loop_state;
+   lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); /* counter starts at 0 and is used as the lane index */
+
+   LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val,
+                                                    loop_state.counter, "");
+
+   LLVMValueRef addr_ptr = LLVMBuildExtractElement(gallivm->builder, addr,
+                                                   loop_state.counter, "");
+   addr_ptr = global_addr_to_ptr(gallivm, addr_ptr, 32); /* the address is always treated as a pointer to a 32-bit integer */
+   struct lp_build_if_state ifthen;
+   LLVMValueRef cond, temp_res;
+   LLVMValueRef scalar;
+   cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); /* a lane is active when its exec-mask element is non-zero */
+   cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+   lp_build_if(&ifthen, gallivm, cond);
+
+   if (nir_op == nir_intrinsic_global_atomic_comp_swap) {
+      LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, val2,
+                                                         loop_state.counter, "");
+      cas_src_ptr = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, uint_bld->elem_type, "");
+      scalar = LLVMBuildAtomicCmpXchg(builder, addr_ptr, value_ptr, /* value_ptr is passed as the compare operand, cas_src_ptr as the new value */
+                                      cas_src_ptr,
+                                      LLVMAtomicOrderingSequentiallyConsistent,
+                                      LLVMAtomicOrderingSequentiallyConsistent,
+                                      false);
+      scalar = LLVMBuildExtractValue(gallivm->builder, scalar, 0, ""); /* keep element 0 of the cmpxchg result pair */
+   } else {
+      LLVMAtomicRMWBinOp op;
+      switch (nir_op) { /* map the NIR intrinsic to the LLVM RMW operation */
+      case nir_intrinsic_global_atomic_add:
+         op = LLVMAtomicRMWBinOpAdd;
+         break;
+      case nir_intrinsic_global_atomic_exchange:
+         op = LLVMAtomicRMWBinOpXchg;
+         break;
+      case nir_intrinsic_global_atomic_and:
+         op = LLVMAtomicRMWBinOpAnd;
+         break;
+      case nir_intrinsic_global_atomic_or:
+         op = LLVMAtomicRMWBinOpOr;
+         break;
+      case nir_intrinsic_global_atomic_xor:
+         op = LLVMAtomicRMWBinOpXor;
+         break;
+      case nir_intrinsic_global_atomic_umin:
+         op = LLVMAtomicRMWBinOpUMin;
+         break;
+      case nir_intrinsic_global_atomic_umax:
+         op = LLVMAtomicRMWBinOpUMax;
+         break;
+      case nir_intrinsic_global_atomic_imin:
+         op = LLVMAtomicRMWBinOpMin;
+         break;
+      case nir_intrinsic_global_atomic_imax:
+         op = LLVMAtomicRMWBinOpMax;
+         break;
+      default:
+         unreachable("unknown atomic op");
+      }
+
+      scalar = LLVMBuildAtomicRMW(builder, op,
+                                  addr_ptr, value_ptr,
+                                  LLVMAtomicOrderingSequentiallyConsistent,
+                                  false);
+   }
+   temp_res = LLVMBuildLoad(builder, atom_res, "");
+   temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, ""); /* active lane: record the scalar returned by the atomic */
+   LLVMBuildStore(builder, temp_res, atom_res);
+   lp_build_else(&ifthen); /* inactive lane: its result element is set to 0 */
+   temp_res = LLVMBuildLoad(builder, atom_res, "");
+   temp_res = LLVMBuildInsertElement(builder, temp_res, lp_build_const_int32(gallivm, 0), loop_state.counter, "");
+   LLVMBuildStore(builder, temp_res, atom_res);
+   lp_build_endif(&ifthen);
+   lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), /* end condition: counter UGE the uint vector length */
+                          NULL, LLVMIntUGE);
+   *result = LLVMBuildLoad(builder, atom_res, "");
+}
+
 static void emit_load_ubo(struct lp_build_nir_context *bld_base,
                           unsigned nc,
                           unsigned bit_size,
                           bool offset_is_uniform,
                           LLVMValueRef index,
                           LLVMValueRef offset,
-                          LLVMValueRef result[4])
+                          LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -527,13 +970,11 @@ static void emit_load_ubo(struct lp_build_nir_context *bld_base,
       LLVMValueRef overflow_mask;
       LLVMValueRef num_consts = lp_build_array_get(gallivm, bld->const_sizes_ptr, index);
 
-      num_consts = LLVMBuildShl(gallivm->builder, num_consts, lp_build_const_int32(gallivm, 4), "");
       num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
       for (unsigned c = 0; c < nc; c++) {
          LLVMValueRef this_offset = lp_build_add(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, c));
          overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL,
                                           this_offset, num_consts);
-
          result[c] = build_gather(bld_base, bld_broad, consts_ptr, this_offset, overflow_mask, NULL);
       }
    }
@@ -545,7 +986,7 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base,
                           unsigned bit_size,
                           LLVMValueRef index,
                           LLVMValueRef offset,
-                          LLVMValueRef outval[4])
+                          LLVMValueRef outval[NIR_MAX_VEC_COMPONENTS])
 {
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
@@ -688,7 +1129,6 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base,
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
    LLVMValueRef ssbo_ptr;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   LLVMAtomicRMWBinOp op;
    LLVMValueRef ssbo_limit = NULL;
 
    if (index) {
@@ -699,47 +1139,6 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base,
    } else
       ssbo_ptr = bld->shared_ptr;
 
-   switch (nir_op) {
-   case nir_intrinsic_shared_atomic_add:
-   case nir_intrinsic_ssbo_atomic_add:
-      op = LLVMAtomicRMWBinOpAdd;
-      break;
-   case nir_intrinsic_shared_atomic_exchange:
-   case nir_intrinsic_ssbo_atomic_exchange:
-      op = LLVMAtomicRMWBinOpXchg;
-      break;
-   case nir_intrinsic_shared_atomic_and:
-   case nir_intrinsic_ssbo_atomic_and:
-      op = LLVMAtomicRMWBinOpAnd;
-      break;
-   case nir_intrinsic_shared_atomic_or:
-   case nir_intrinsic_ssbo_atomic_or:
-      op = LLVMAtomicRMWBinOpOr;
-      break;
-   case nir_intrinsic_shared_atomic_xor:
-   case nir_intrinsic_ssbo_atomic_xor:
-      op = LLVMAtomicRMWBinOpXor;
-      break;
-   case nir_intrinsic_shared_atomic_umin:
-   case nir_intrinsic_ssbo_atomic_umin:
-      op = LLVMAtomicRMWBinOpUMin;
-      break;
-   case nir_intrinsic_shared_atomic_umax:
-   case nir_intrinsic_ssbo_atomic_umax:
-      op = LLVMAtomicRMWBinOpUMax;
-      break;
-   case nir_intrinsic_ssbo_atomic_imin:
-   case nir_intrinsic_shared_atomic_imin:
-      op = LLVMAtomicRMWBinOpMin;
-      break;
-   case nir_intrinsic_ssbo_atomic_imax:
-   case nir_intrinsic_shared_atomic_imax:
-      op = LLVMAtomicRMWBinOpMax;
-      break;
-   default:
-      break;
-   }
-
    offset = lp_build_shr_imm(uint_bld, offset, 2);
    LLVMValueRef atom_res = lp_build_alloca(gallivm,
                                            uint_bld->vec_type, "");
@@ -781,6 +1180,48 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base,
                                       false);
       scalar = LLVMBuildExtractValue(gallivm->builder, scalar, 0, "");
    } else {
+      LLVMAtomicRMWBinOp op;
+
+      switch (nir_op) {
+      case nir_intrinsic_shared_atomic_add:
+      case nir_intrinsic_ssbo_atomic_add:
+         op = LLVMAtomicRMWBinOpAdd;
+         break;
+      case nir_intrinsic_shared_atomic_exchange:
+      case nir_intrinsic_ssbo_atomic_exchange:
+         op = LLVMAtomicRMWBinOpXchg;
+         break;
+      case nir_intrinsic_shared_atomic_and:
+      case nir_intrinsic_ssbo_atomic_and:
+         op = LLVMAtomicRMWBinOpAnd;
+         break;
+      case nir_intrinsic_shared_atomic_or:
+      case nir_intrinsic_ssbo_atomic_or:
+         op = LLVMAtomicRMWBinOpOr;
+         break;
+      case nir_intrinsic_shared_atomic_xor:
+      case nir_intrinsic_ssbo_atomic_xor:
+         op = LLVMAtomicRMWBinOpXor;
+         break;
+      case nir_intrinsic_shared_atomic_umin:
+      case nir_intrinsic_ssbo_atomic_umin:
+         op = LLVMAtomicRMWBinOpUMin;
+         break;
+      case nir_intrinsic_shared_atomic_umax:
+      case nir_intrinsic_ssbo_atomic_umax:
+         op = LLVMAtomicRMWBinOpUMax;
+         break;
+      case nir_intrinsic_ssbo_atomic_imin:
+      case nir_intrinsic_shared_atomic_imin:
+         op = LLVMAtomicRMWBinOpMin;
+         break;
+      case nir_intrinsic_ssbo_atomic_imax:
+      case nir_intrinsic_shared_atomic_imax:
+         op = LLVMAtomicRMWBinOpMax;
+         break;
+      default:
+         unreachable("unknown atomic op");
+      }
       scalar = LLVMBuildAtomicRMW(builder, op,
                                   scalar_ptr, value_ptr,
                                   LLVMAtomicOrderingSequentiallyConsistent,
@@ -814,10 +1255,12 @@ static void emit_barrier(struct lp_build_nir_context *bld_base)
 static LLVMValueRef emit_get_buffer_size(struct lp_build_nir_context *bld_base,
                                          LLVMValueRef index)
 {
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
    struct lp_build_context *bld_broad = &bld_base->uint_bld;
-   LLVMValueRef size_ptr = lp_build_array_get(bld_base->base.gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, bld_broad->zero, ""));
+   LLVMValueRef size_ptr = lp_build_array_get(bld_base->base.gallivm, bld->ssbo_sizes_ptr,
+                                              LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); /* only element 0 of index picks the ssbo_sizes_ptr entry; an i32 constant replaces the vector-typed zero used before */
    return lp_build_broadcast_scalar(bld_broad, size_ptr);
 }
 
@@ -825,10 +1268,17 @@ static void emit_image_op(struct lp_build_nir_context *bld_base,
                           struct lp_img_params *params)
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+
    params->type = bld_base->base.type;
    params->context_ptr = bld->context_ptr;
    params->thread_data_ptr = bld->thread_data_ptr;
    params->exec_mask = mask_vec(bld_base);
+
+   if (params->image_index_offset)
+      params->image_index_offset = LLVMBuildExtractElement(gallivm->builder, params->image_index_offset,
+                                                           lp_build_const_int32(gallivm, 0), "");
+
    bld->image->emit_op(bld->image,
                        bld->bld_base.base.gallivm,
                        params);
@@ -839,10 +1289,14 @@ static void emit_image_size(struct lp_build_nir_context *bld_base,
                             struct lp_sampler_size_query_params *params)
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
 
    params->int_type = bld_base->int_bld.type;
    params->context_ptr = bld->context_ptr;
 
+   if (params->texture_unit_offset)
+      params->texture_unit_offset = LLVMBuildExtractElement(gallivm->builder, params->texture_unit_offset,
+                                                            lp_build_const_int32(gallivm, 0), "");
    bld->image->emit_size_query(bld->image,
                                bld->bld_base.base.gallivm,
                                params);
@@ -855,6 +1309,8 @@ static void init_var_slots(struct lp_build_nir_context *bld_base,
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    unsigned slots = glsl_count_attribute_slots(var->type, false) * 4;
 
+   if (!bld->outputs)
+     return;
    for (unsigned comp = sc; comp < slots + sc; comp++) {
       unsigned this_loc = var->data.driver_location + (comp / 4);
       unsigned this_chan = comp % 4;
@@ -889,11 +1345,64 @@ static void emit_tex(struct lp_build_nir_context *bld_base,
                      struct lp_sampler_params *params)
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
 
    params->type = bld_base->base.type;
    params->context_ptr = bld->context_ptr;
    params->thread_data_ptr = bld->thread_data_ptr;
 
+   if (params->texture_index_offset && bld_base->shader->info.stage != MESA_SHADER_FRAGMENT) {
+      /* this is horrible but this can be dynamic */
+      LLVMValueRef coords[5];
+      LLVMValueRef *orig_texel_ptr;
+      struct lp_build_context *uint_bld = &bld_base->uint_bld;
+      LLVMValueRef result[4] = { LLVMGetUndef(bld_base->base.vec_type),
+                                 LLVMGetUndef(bld_base->base.vec_type),
+                                 LLVMGetUndef(bld_base->base.vec_type),
+                                 LLVMGetUndef(bld_base->base.vec_type) };
+      LLVMValueRef texel[4], orig_offset;
+      unsigned i;
+      orig_texel_ptr = params->texel;
+
+      for (i = 0; i < 5; i++) {
+         coords[i] = params->coords[i];
+      }
+      orig_offset = params->texture_index_offset;
+
+      for (unsigned v = 0; v < uint_bld->type.length; v++) {
+         LLVMValueRef idx = lp_build_const_int32(gallivm, v);
+         LLVMValueRef new_coords[5];
+         for (i = 0; i < 5; i++) {
+            new_coords[i] = LLVMBuildExtractElement(gallivm->builder,
+                                                    coords[i], idx, "");
+         }
+         params->coords = new_coords;
+         params->texture_index_offset = LLVMBuildExtractElement(gallivm->builder,
+                                                                orig_offset,
+                                                                idx, "");
+         params->type = lp_elem_type(bld_base->base.type);
+
+         params->texel = texel;
+         bld->sampler->emit_tex_sample(bld->sampler,
+                                       gallivm,
+                                       params);
+
+         for (i = 0; i < 4; i++) {
+            result[i] = LLVMBuildInsertElement(gallivm->builder, result[i], texel[i], idx, "");
+         }
+      }
+      for (i = 0; i < 4; i++) {
+         orig_texel_ptr[i] = result[i];
+      }
+      return;
+   }
+
+   if (params->texture_index_offset)
+      params->texture_index_offset = LLVMBuildExtractElement(bld_base->base.gallivm->builder,
+                                                             params->texture_index_offset,
+                                                             lp_build_const_int32(bld_base->base.gallivm, 0), "");
+
+   params->type = bld_base->base.type;
    bld->sampler->emit_tex_sample(bld->sampler,
                                  bld->bld_base.base.gallivm,
                                  params);
@@ -907,6 +1416,10 @@ static void emit_tex_size(struct lp_build_nir_context *bld_base,
    params->int_type = bld_base->int_bld.type;
    params->context_ptr = bld->context_ptr;
 
+   if (params->texture_unit_offset)
+      params->texture_unit_offset = LLVMBuildExtractElement(bld_base->base.gallivm->builder,
+                                                             params->texture_unit_offset,
+                                                             lp_build_const_int32(bld_base->base.gallivm, 0), "");
    bld->sampler->emit_size_query(bld->sampler,
                                  bld->bld_base.base.gallivm,
                                  params);
@@ -914,7 +1427,7 @@ static void emit_tex_size(struct lp_build_nir_context *bld_base,
 
 static void emit_sysval_intrin(struct lp_build_nir_context *bld_base,
                                nir_intrinsic_instr *instr,
-                               LLVMValueRef result[4])
+                               LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -922,6 +1435,12 @@ static void emit_sysval_intrin(struct lp_build_nir_context *bld_base,
    case nir_intrinsic_load_instance_id:
       result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.instance_id);
       break;
+   case nir_intrinsic_load_base_instance:
+      result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.base_instance);
+      break;
+   case nir_intrinsic_load_base_vertex:
+      result[0] = bld->system_values.basevertex;
+      break;
    case nir_intrinsic_load_vertex_id:
       result[0] = bld->system_values.vertex_id;
       break;
@@ -941,13 +1460,67 @@ static void emit_sysval_intrin(struct lp_build_nir_context *bld_base,
          result[i] = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.grid_size, lp_build_const_int32(gallivm, i), ""));
       break;
    case nir_intrinsic_load_invocation_id:
-      result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.invocation_id);
+      if (bld_base->shader->info.stage == MESA_SHADER_TESS_CTRL)
+         result[0] = bld->system_values.invocation_id;
+      else
+         result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.invocation_id);
+      break;
+   case nir_intrinsic_load_front_face:
+      result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.front_facing);
+      break;
+   case nir_intrinsic_load_draw_id:
+      result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.draw_id);
       break;
    default:
       break;
+   case nir_intrinsic_load_local_group_size:
+     for (unsigned i = 0; i < 3; i++)
+       result[i] = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.block_size, lp_build_const_int32(gallivm, i), ""));
+     break;
+   case nir_intrinsic_load_work_dim:
+      result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.work_dim);
+      break;
+   case nir_intrinsic_load_tess_coord:
+      for (unsigned i = 0; i < 3; i++) {
+        result[i] = LLVMBuildExtractValue(gallivm->builder, bld->system_values.tess_coord, i, "");
+      }
+      break;
+   case nir_intrinsic_load_tess_level_outer:
+      for (unsigned i = 0; i < 4; i++)
+         result[i] = lp_build_broadcast_scalar(&bld_base->base, LLVMBuildExtractValue(gallivm->builder, bld->system_values.tess_outer, i, ""));
+      break;
+   case nir_intrinsic_load_tess_level_inner:
+      for (unsigned i = 0; i < 2; i++)
+         result[i] = lp_build_broadcast_scalar(&bld_base->base, LLVMBuildExtractValue(gallivm->builder, bld->system_values.tess_inner, i, ""));
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      result[0] = bld->system_values.vertices_in;
+      break;
+   case nir_intrinsic_load_sample_id:
+      result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.sample_id);
+      break;
+   case nir_intrinsic_load_sample_pos:
+      for (unsigned i = 0; i < 2; i++) {
+         LLVMValueRef idx = LLVMBuildMul(gallivm->builder, bld->system_values.sample_id, lp_build_const_int32(gallivm, 2), "");
+         idx = LLVMBuildAdd(gallivm->builder, idx, lp_build_const_int32(gallivm, i), "");
+         LLVMValueRef val = lp_build_array_get(gallivm, bld->system_values.sample_pos, idx);
+         result[i] = lp_build_broadcast_scalar(&bld_base->base, val);
+      }
+      break;
+   case nir_intrinsic_load_sample_mask_in:
+      result[0] = bld->system_values.sample_mask_in;
+      break;
    }
 }
 
+/* Stores in *dst the per-lane result of comparing mask_vec() against all-ones (-1) for inequality. */
+static void emit_helper_invocation(struct lp_build_nir_context *bld_base, LLVMValueRef *dst)
+{
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   LLVMValueRef all_ones = lp_build_const_int_vec(bld_base->base.gallivm, uint_bld->type, -1);
+   *dst = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL, mask_vec(bld_base), all_ones);
+}
+
 static void bgnloop(struct lp_build_nir_context *bld_base)
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
@@ -1063,37 +1636,42 @@ static void emit_vertex(struct lp_build_nir_context *bld_base, uint32_t stream_i
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
 
+   if (stream_id >= bld->gs_vertex_streams)
+      return;
    assert(bld->gs_iface->emit_vertex);
    LLVMValueRef total_emitted_vertices_vec =
-      LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
+      LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr[stream_id], "");
    LLVMValueRef mask = mask_vec(bld_base);
    mask = clamp_mask_to_max_output_vertices(bld, mask,
                                             total_emitted_vertices_vec);
    bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base.base,
                               bld->outputs,
                               total_emitted_vertices_vec,
+                              mask,
                               lp_build_const_int_vec(bld->bld_base.base.gallivm, bld->bld_base.base.type, stream_id));
 
-   increment_vec_ptr_by_mask(bld_base, bld->emitted_vertices_vec_ptr,
+   increment_vec_ptr_by_mask(bld_base, bld->emitted_vertices_vec_ptr[stream_id],
                              mask);
-   increment_vec_ptr_by_mask(bld_base, bld->total_emitted_vertices_vec_ptr,
+   increment_vec_ptr_by_mask(bld_base, bld->total_emitted_vertices_vec_ptr[stream_id],
                              mask);
 }
 
 static void
 end_primitive_masked(struct lp_build_nir_context * bld_base,
-                     LLVMValueRef mask)
+                     LLVMValueRef mask, uint32_t stream_id)
 {
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
 
+   if (stream_id >= bld->gs_vertex_streams)
+      return;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    LLVMValueRef emitted_vertices_vec =
-      LLVMBuildLoad(builder, bld->emitted_vertices_vec_ptr, "");
+      LLVMBuildLoad(builder, bld->emitted_vertices_vec_ptr[stream_id], "");
    LLVMValueRef emitted_prims_vec =
-      LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
+      LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr[stream_id], "");
    LLVMValueRef total_emitted_vertices_vec =
-      LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
+      LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr[stream_id], "");
 
    LLVMValueRef emitted_mask = lp_build_cmp(uint_bld,
                                             PIPE_FUNC_NOTEQUAL,
@@ -1101,29 +1679,29 @@ end_primitive_masked(struct lp_build_nir_context * bld_base,
                                             uint_bld->zero);
    mask = LLVMBuildAnd(builder, mask, emitted_mask, "");
    bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base.base,
-                                total_emitted_vertices_vec,
-                                emitted_vertices_vec, emitted_prims_vec, mask_vec(bld_base));
-   increment_vec_ptr_by_mask(bld_base, bld->emitted_prims_vec_ptr,
+                               total_emitted_vertices_vec,
+                               emitted_vertices_vec, emitted_prims_vec, mask, stream_id);
+   increment_vec_ptr_by_mask(bld_base, bld->emitted_prims_vec_ptr[stream_id],
                              mask);
-   clear_uint_vec_ptr_from_mask(bld_base, bld->emitted_vertices_vec_ptr,
+   clear_uint_vec_ptr_from_mask(bld_base, bld->emitted_vertices_vec_ptr[stream_id],
                                 mask);
 }
 
 static void end_primitive(struct lp_build_nir_context *bld_base, uint32_t stream_id)
 {
-   struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
+   ASSERTED struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; /* bld is only read by the assert below */
 
    assert(bld->gs_iface->end_primitive);
 
    LLVMValueRef mask = mask_vec(bld_base);
-   end_primitive_masked(bld_base, mask);
+   end_primitive_masked(bld_base, mask, stream_id); /* stream_id is forwarded unchanged; the callee returns early when it is >= gs_vertex_streams */
 }
 
 static void
 emit_prologue(struct lp_build_nir_soa_context *bld)
 {
    struct gallivm_state * gallivm = bld->bld_base.base.gallivm;
-   if (bld->indirects & nir_var_shader_in && !bld->gs_iface) {
+   if (bld->indirects & nir_var_shader_in && !bld->gs_iface && !bld->tcs_iface && !bld->tes_iface) {
       uint32_t num_inputs = util_bitcount64(bld->bld_base.shader->info.inputs_read);
       unsigned index, chan;
       LLVMTypeRef vec_type = bld->bld_base.base.vec_type;
@@ -1147,6 +1725,82 @@ emit_prologue(struct lp_build_nir_soa_context *bld)
    }
 }
 
+/*
+ * Reduce src over the lanes whose mask_vec() entry is non-zero and broadcast
+ * the scalar outcome into result[0]:
+ *   vote_any  - OR of those lanes, accumulated from 0
+ *   vote_ieq  - -1 when each of those lanes equals a reference lane, else 0
+ *   otherwise - AND of those lanes, accumulated from -1
+ */
+static void emit_vote(struct lp_build_nir_context *bld_base, LLVMValueRef src, nir_intrinsic_instr *instr, LLVMValueRef result[4])
+{
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   const bool is_ieq = instr->intrinsic == nir_intrinsic_vote_ieq;
+   const bool is_any = instr->intrinsic == nir_intrinsic_vote_any;
+   LLVMValueRef num_lanes = lp_build_const_int32(gallivm, bld_base->uint_bld.type.length);
+   LLVMValueRef exec_mask = mask_vec(bld_base);
+   struct lp_build_loop_state loop_state;
+   struct lp_build_if_state ifthen;
+   LLVMValueRef outer_cond = LLVMBuildICmp(builder, LLVMIntNE, exec_mask, bld_base->uint_bld.zero, "");
+   LLVMValueRef res_store = lp_build_alloca(gallivm, bld_base->int_bld.elem_type, "");
+   LLVMValueRef init_val = NULL;
+
+   /* OR accumulates from 0; both AND-style votes (all, ieq) accumulate from all-ones. */
+   LLVMBuildStore(builder, lp_build_const_int32(gallivm, is_any ? 0 : -1), res_store);
+   if (is_ieq) {
+      /* Pick a reference value from an active lane (the loop leaves the last active one).
+       * It lives in its own slot, typed like src, so it never mixes with the accumulator. */
+      LLVMValueRef ref_store = lp_build_alloca(gallivm, LLVMGetElementType(LLVMTypeOf(src)), "");
+      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+      LLVMValueRef if_cond = LLVMBuildExtractElement(builder, outer_cond, loop_state.counter, "");
+      lp_build_if(&ifthen, gallivm, if_cond);
+      LLVMBuildStore(builder, LLVMBuildExtractElement(builder, src, loop_state.counter, ""), ref_store);
+      lp_build_endif(&ifthen);
+      lp_build_loop_end_cond(&loop_state, num_lanes, NULL, LLVMIntUGE);
+      init_val = LLVMBuildLoad(builder, ref_store, "");
+   }
+   lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+   LLVMValueRef value_ptr = LLVMBuildExtractElement(builder, src, loop_state.counter, "");
+   LLVMValueRef if_cond = LLVMBuildExtractElement(builder, outer_cond, loop_state.counter, "");
+   lp_build_if(&ifthen, gallivm, if_cond);
+   LLVMValueRef res = LLVMBuildLoad(builder, res_store, "");
+   if (is_ieq) {
+      LLVMValueRef tmp = LLVMBuildICmp(builder, LLVMIntEQ, init_val, value_ptr, "");
+      tmp = LLVMBuildSExt(builder, tmp, bld_base->uint_bld.elem_type, "");
+      /* a single mismatching lane must clear the result, hence AND rather than OR */
+      res = LLVMBuildAnd(builder, res, tmp, "");
+   } else if (is_any) {
+      res = LLVMBuildOr(builder, res, value_ptr, "");
+   } else {
+      res = LLVMBuildAnd(builder, res, value_ptr, "");
+   }
+   LLVMBuildStore(builder, res, res_store);
+   lp_build_endif(&ifthen);
+   lp_build_loop_end_cond(&loop_state, num_lanes, NULL, LLVMIntUGE);
+   result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildLoad(builder, res_store, ""));
+}
+
+/* Fill dst[0..num_components-1] by calling fs_iface->interp_fn once per channel; the
+ * attribute is const_index + driver_location, the channel is offset by location_frac. */
+static void
+emit_interp_at(struct lp_build_nir_context *bld_base, unsigned num_components,
+               nir_variable *var, bool centroid, bool sample,
+               unsigned const_index, LLVMValueRef indir_index,
+               LLVMValueRef offsets[2], LLVMValueRef dst[4])
+{
+   struct lp_build_nir_soa_context *soa = (struct lp_build_nir_soa_context *)bld_base;
+   unsigned chan = 0;
+
+   while (chan < num_components) {
+      dst[chan] = soa->fs_iface->interp_fn(soa->fs_iface, &bld_base->base,
+                                           const_index + var->data.driver_location,
+                                           chan + var->data.location_frac,
+                                           centroid, sample, indir_index, offsets);
+      chan++;
+   }
+}
+
 void lp_build_nir_soa(struct gallivm_state *gallivm,
                       struct nir_shader *shader,
                       const struct lp_build_tgsi_params *params,
@@ -1187,12 +1841,40 @@ void lp_build_nir_soa(struct gallivm_state *gallivm,
       int64_type.width *= 2;
       lp_build_context_init(&bld.bld_base.int64_bld, gallivm, int64_type);
    }
+   {
+      struct lp_type uint16_type;
+      uint16_type = lp_uint_type(type);
+      uint16_type.width /= 2;
+      lp_build_context_init(&bld.bld_base.uint16_bld, gallivm, uint16_type);
+   }
+   {
+      struct lp_type int16_type;
+      int16_type = lp_int_type(type);
+      int16_type.width /= 2;
+      lp_build_context_init(&bld.bld_base.int16_bld, gallivm, int16_type);
+   }
+   {
+      struct lp_type uint8_type;
+      uint8_type = lp_uint_type(type);
+      uint8_type.width /= 4;
+      lp_build_context_init(&bld.bld_base.uint8_bld, gallivm, uint8_type);
+   }
+   {
+      struct lp_type int8_type;
+      int8_type = lp_int_type(type);
+      int8_type.width /= 4;
+      lp_build_context_init(&bld.bld_base.int8_bld, gallivm, int8_type);
+   }
    bld.bld_base.load_var = emit_load_var;
    bld.bld_base.store_var = emit_store_var;
    bld.bld_base.load_reg = emit_load_reg;
    bld.bld_base.store_reg = emit_store_reg;
    bld.bld_base.emit_var_decl = emit_var_decl;
    bld.bld_base.load_ubo = emit_load_ubo;
+   bld.bld_base.load_kernel_arg = emit_load_kernel_arg;
+   bld.bld_base.load_global = emit_load_global;
+   bld.bld_base.store_global = emit_store_global;
+   bld.bld_base.atomic_global = emit_atomic_global;
    bld.bld_base.tex = emit_tex;
    bld.bld_base.tex_size = emit_tex_size;
    bld.bld_base.bgnloop = bgnloop;
@@ -1213,6 +1895,9 @@ void lp_build_nir_soa(struct gallivm_state *gallivm,
    bld.bld_base.barrier = emit_barrier;
    bld.bld_base.image_op = emit_image_op;
    bld.bld_base.image_size = emit_image_size;
+   bld.bld_base.vote = emit_vote;
+   bld.bld_base.helper_invocation = emit_helper_invocation;
+   bld.bld_base.interp_at = emit_interp_at;
 
    bld.mask = params->mask;
    bld.inputs = params->inputs;
@@ -1229,23 +1914,29 @@ void lp_build_nir_soa(struct gallivm_state *gallivm,
    bld.image = params->image;
    bld.shared_ptr = params->shared_ptr;
    bld.coro = params->coro;
-
+   bld.kernel_args_ptr = params->kernel_args;
    bld.indirects = 0;
    if (params->info->indirect_files & (1 << TGSI_FILE_INPUT))
       bld.indirects |= nir_var_shader_in;
 
    bld.gs_iface = params->gs_iface;
+   bld.tcs_iface = params->tcs_iface;
+   bld.tes_iface = params->tes_iface;
+   bld.fs_iface = params->fs_iface;
    if (bld.gs_iface) {
       struct lp_build_context *uint_bld = &bld.bld_base.uint_bld;
 
+      bld.gs_vertex_streams = params->gs_vertex_streams;
       bld.max_output_vertices_vec = lp_build_const_int_vec(gallivm, bld.bld_base.int_bld.type,
                                                            shader->info.gs.vertices_out);
-      bld.emitted_prims_vec_ptr =
-         lp_build_alloca(gallivm, uint_bld->vec_type, "emitted_prims_ptr");
-      bld.emitted_vertices_vec_ptr =
-         lp_build_alloca(gallivm, uint_bld->vec_type, "emitted_vertices_ptr");
-      bld.total_emitted_vertices_vec_ptr =
-         lp_build_alloca(gallivm, uint_bld->vec_type, "total_emitted_vertices_ptr");
+      for (int i = 0; i < params->gs_vertex_streams; i++) {
+         bld.emitted_prims_vec_ptr[i] =
+            lp_build_alloca(gallivm, uint_bld->vec_type, "emitted_prims_ptr");
+         bld.emitted_vertices_vec_ptr[i] =
+            lp_build_alloca(gallivm, uint_bld->vec_type, "emitted_vertices_ptr");
+         bld.total_emitted_vertices_vec_ptr[i] =
+            lp_build_alloca(gallivm, uint_bld->vec_type, "total_emitted_vertices_ptr");
+      }
    }
    lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.int_bld);
 
@@ -1260,15 +1951,19 @@ void lp_build_nir_soa(struct gallivm_state *gallivm,
       LLVMBuilderRef builder = bld.bld_base.base.gallivm->builder;
       LLVMValueRef total_emitted_vertices_vec;
       LLVMValueRef emitted_prims_vec;
-      end_primitive_masked(&bld.bld_base, lp_build_mask_value(bld.mask));
-      total_emitted_vertices_vec =
-         LLVMBuildLoad(builder, bld.total_emitted_vertices_vec_ptr, "");
-      emitted_prims_vec =
-         LLVMBuildLoad(builder, bld.emitted_prims_vec_ptr, "");
-
-      bld.gs_iface->gs_epilogue(bld.gs_iface,
-                                 total_emitted_vertices_vec,
-                                 emitted_prims_vec);
+
+      for (int i = 0; i < params->gs_vertex_streams; i++) {
+         end_primitive_masked(&bld.bld_base, lp_build_mask_value(bld.mask), i);
+
+         total_emitted_vertices_vec =
+            LLVMBuildLoad(builder, bld.total_emitted_vertices_vec_ptr[i], "");
+
+         emitted_prims_vec =
+            LLVMBuildLoad(builder, bld.emitted_prims_vec_ptr[i], "");
+         bld.gs_iface->gs_epilogue(bld.gs_iface,
+                                   total_emitted_vertices_vec,
+                                   emitted_prims_vec, i);
+      }
    }
    lp_exec_mask_fini(&bld.exec_mask);
 }