const struct gen_l3_config *cfg)
{
uint32_t reg_val;
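+ /* On Gen12+, cfg may be NULL, which requests "full way" mode: the
+ * entire L3 cache is handed out in whole ways instead of being
+ * partitioned between clients (see L3FullWayAllocationEnable below).
+ */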
+ assert(cfg || GEN_GEN >= 12);
#if GEN_GEN >= 12
#define L3_ALLOCATION_REG GENX(L3ALLOC)
reg.ErrorDetectionBehaviorControl = true;
reg.UseFullWays = true;
#endif
- reg.URBAllocation = cfg->n[GEN_L3P_URB];
- reg.ROAllocation = cfg->n[GEN_L3P_RO];
- reg.DCAllocation = cfg->n[GEN_L3P_DC];
- reg.AllAllocation = cfg->n[GEN_L3P_ALL];
+ if (GEN_GEN < 12 || cfg) {
+ reg.URBAllocation = cfg->n[GEN_L3P_URB];
+ reg.ROAllocation = cfg->n[GEN_L3P_RO];
+ reg.DCAllocation = cfg->n[GEN_L3P_DC];
+ reg.AllAllocation = cfg->n[GEN_L3P_ALL];
+ } else {
+#if GEN_GEN >= 12
+ reg.L3FullWayAllocationEnable = true;
+#endif
+ }
}
_iris_emit_lri(batch, L3_ALLOCATION_REG_num, reg_val);
}
init_aux_map_state(struct iris_batch *batch);
#endif
+/**
+ * Upload initial GPU state for any kind of context.
+ *
+ * These need to happen for both render and compute.
+ */
+static void
+iris_init_common_context(struct iris_batch *batch)
+{
+#if GEN_GEN == 11
+ uint32_t reg_val;
+
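+ /* Use headerless sampler messages in preemptable contexts. SAMPLER_MODE
+ * is a masked register, so the Mask field must also be set for the LRI
+ * write to take effect.
+ */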
+ iris_pack_state(GENX(SAMPLER_MODE), &reg_val, reg) {
+ reg.HeaderlessMessageforPreemptableContexts = 1;
+ reg.HeaderlessMessageforPreemptableContextsMask = 1;
+ }
+ iris_emit_lri(batch, SAMPLER_MODE, reg_val);
+
+ /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */
+ iris_pack_state(GENX(HALF_SLICE_CHICKEN7), &reg_val, reg) {
+ reg.EnabledTexelOffsetPrecisionFix = 1;
+ reg.EnabledTexelOffsetPrecisionFixMask = 1;
+ }
+ iris_emit_lri(batch, HALF_SLICE_CHICKEN7, reg_val);
+#endif
+}
+
/**
* Upload the initial GPU state for a render context.
*
init_state_base_address(batch);
+ iris_init_common_context(batch);
+
#if GEN_GEN >= 9
iris_pack_state(GENX(CS_DEBUG_MODE2), &reg_val, reg) {
reg.CONSTANT_BUFFERAddressOffsetDisable = true;
}
iris_emit_lri(batch, TCCNTLREG, reg_val);
- iris_pack_state(GENX(SAMPLER_MODE), &reg_val, reg) {
- reg.HeaderlessMessageforPreemptableContexts = 1;
- reg.HeaderlessMessageforPreemptableContextsMask = 1;
- }
- iris_emit_lri(batch, SAMPLER_MODE, reg_val);
-
- /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */
- iris_pack_state(GENX(HALF_SLICE_CHICKEN7), &reg_val, reg) {
- reg.EnabledTexelOffsetPrecisionFix = 1;
- reg.EnabledTexelOffsetPrecisionFixMask = 1;
- }
- iris_emit_lri(batch, HALF_SLICE_CHICKEN7, reg_val);
-
/* The hardware specification recommends disabling repacking for
* compatibility with the decompression mechanism in the display controller.
*/
init_state_base_address(batch);
+ iris_init_common_context(batch);
+
#if GEN_GEN == 12
emit_pipeline_select(batch, GPGPU);
#endif
struct iris_blend_state *cso = state;
ice->state.cso_blend = cso;
- ice->state.blend_enables = cso ? cso->blend_enables : 0;
ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
- ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND];
if (GEN_GEN == 8)
: IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
}
+static void
+iris_set_compute_resources(struct pipe_context *ctx,
+ unsigned start, unsigned count,
+ struct pipe_surface **resources)
+{
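+ /* iris has no use for bound compute resources, so only an empty set is
+ * accepted here.
+ */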
+ assert(count == 0);
+}
+
+static void
+iris_set_global_binding(struct pipe_context *ctx,
+ unsigned start_slot, unsigned count,
+ struct pipe_resource **resources,
+ uint32_t **handles)
+{
+ struct iris_context *ice = (struct iris_context *) ctx;
+
+ assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS);
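+ /* For each bound buffer, write its GPU virtual address into the
+ * caller-provided handle storage, so compute kernels can access the
+ * buffer through a raw pointer.
+ */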
+ for (unsigned i = 0; i < count; i++) {
+ if (resources && resources[i]) {
+ pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
+ resources[i]);
+ struct iris_resource *res = (void *) resources[i];
+ uint64_t addr = res->bo->gtt_offset;
+ memcpy(handles[i], &addr, sizeof(addr));
+ } else {
+ pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
+ NULL);
+ }
+ }
+
+ ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS;
+}
+
/**
* The pipe->set_tess_state() driver hook.
*/
static void
upload_sysvals(struct iris_context *ice,
- gl_shader_stage stage)
+ gl_shader_stage stage,
+ const struct pipe_grid_info *grid)
{
UNUSED struct iris_genx_state *genx = ice->state.genx;
struct iris_shader_state *shs = &ice->state.shaders[stage];
struct iris_compiled_shader *shader = ice->shaders.prog[stage];
- if (!shader || shader->num_system_values == 0)
+ if (!shader || (shader->num_system_values == 0 &&
+ shader->kernel_input_size == 0))
return;
assert(shader->num_cbufs > 0);
unsigned sysval_cbuf_index = shader->num_cbufs - 1;
struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index];
- unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
- uint32_t *map = NULL;
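+ /* The trailing constant buffer is laid out as the kernel inputs (if
+ * any), padded to a dword boundary, followed by the system values.
+ */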
+ unsigned system_values_start =
+ ALIGN(shader->kernel_input_size, sizeof(uint32_t));
+ unsigned upload_size = system_values_start +
+ shader->num_system_values * sizeof(uint32_t);
+ void *map = NULL;
assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
- &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
+ &cbuf->buffer_offset, &cbuf->buffer, &map);
+
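+ /* grid (and thus grid->input) is only provided for compute dispatches;
+ * shaders with kernel inputs are never uploaded without one.
+ */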
+ if (shader->kernel_input_size > 0)
+ memcpy(map, grid->input, shader->kernel_input_size);
+ uint32_t *sysval_map = map + system_values_start;
for (int i = 0; i < shader->num_system_values; i++) {
uint32_t sysval = shader->system_values[i];
uint32_t value = 0;
assert(!"unhandled system value");
}
- *map++ = value;
+ *sysval_map++ = value;
}
cbuf->buffer_size = upload_size;
unsigned aux_modes,
enum isl_aux_usage aux_usage)
{
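+ /* The requested aux usage must be one of the modes this surface state
+ * set was allocated for, or the offset below would be meaningless.
+ */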
+ assert(aux_modes & (1 << aux_usage));
return SURFACE_STATE_ALIGNMENT *
util_bitcount(aux_modes & ((1 << aux_usage) - 1));
}
if (res->aux.bo) {
iris_use_pinned_bo(batch, res->aux.bo, writeable, access);
if (res->aux.clear_color_bo)
- iris_use_pinned_bo(batch, res->aux.clear_color_bo,
- false, IRIS_DOMAIN_OTHER_READ);
+ iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access);
if (memcmp(&res->aux.clear_color, &surf->clear_color,
sizeof(surf->clear_color)) != 0) {
continue;
if (shs->sysvals_need_upload)
- upload_sysvals(ice, stage);
+ upload_sysvals(ice, stage, NULL);
struct push_bos push_bos = {};
setup_constant_buffers(ice, batch, stage, &push_bos);
iris_use_pinned_bo(batch, ice->state.binder.bo, false,
IRIS_DOMAIN_NONE);
- if (!batch->contains_draw) {
+ if (!batch->contains_draw_with_next_seqno) {
iris_restore_render_saved_bos(ice, batch, draw);
- batch->contains_draw = true;
+ batch->contains_draw_with_next_seqno = batch->contains_draw = true;
}
iris_upload_dirty_render_state(ice, batch, draw);
}
static void
-iris_upload_compute_state(struct iris_context *ice,
- struct iris_batch *batch,
- const struct pipe_grid_info *grid)
+iris_load_indirect_location(struct iris_context *ice,
+ struct iris_batch *batch,
+ const struct pipe_grid_info *grid)
+{
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
+ assert(grid->indirect);
+
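+ /* Load the dispatch dimensions directly from the indirect buffer into
+ * the GPGPU_DISPATCHDIM{X,Y,Z} registers, where GPGPU_WALKER reads them
+ * when indirect dispatch is enabled.
+ */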
+ struct iris_state_ref *grid_size = &ice->state.grid_size;
+ struct iris_bo *bo = iris_resource_bo(grid_size->res);
+ iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
+ lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
+ }
+ iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
+ lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
+ }
+ iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
+ lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
+ }
+}
+
+static void
+iris_upload_gpgpu_walker(struct iris_context *ice,
+ struct iris_batch *batch,
+ const struct pipe_grid_info *grid)
{
const uint64_t stage_dirty = ice->state.stage_dirty;
struct iris_screen *screen = batch->screen;
ice->shaders.prog[MESA_SHADER_COMPUTE];
struct brw_stage_prog_data *prog_data = shader->prog_data;
struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
-
const uint32_t group_size = grid->block[0] * grid->block[1] * grid->block[2];
const unsigned simd_size =
brw_cs_simd_size_for_group_size(devinfo, cs_prog_data, group_size);
const unsigned threads = DIV_ROUND_UP(group_size, simd_size);
- iris_batch_sync_region_start(batch);
-
- /* Always pin the binder. If we're emitting new binding table pointers,
- * we need it. If not, we're probably inheriting old tables via the
- * context, and need it anyway. Since true zero-bindings cases are
- * practically non-existent, just pin it and avoid last_res tracking.
- */
- iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);
-
- if ((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
- shs->sysvals_need_upload)
- upload_sysvals(ice, MESA_SHADER_COMPUTE);
-
- if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)
- iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
-
- if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)
- iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);
-
- iris_use_optional_res(batch, shs->sampler_table.res, false,
- IRIS_DOMAIN_NONE);
- iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,
- IRIS_DOMAIN_NONE);
-
- if (ice->state.need_border_colors)
- iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false,
- IRIS_DOMAIN_NONE);
-
-#if GEN_GEN >= 12
- genX(invalidate_aux_map_state)(batch);
-#endif
if (stage_dirty & IRIS_STAGE_DIRTY_CS) {
/* The MEDIA_VFE_STATE documentation for Gen8+ says:
}
}
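+ /* Pin all global bindings: the kernel may dereference them through the
+ * raw GPU addresses handed out by iris_set_global_binding.
+ */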
+ for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) {
+ struct pipe_resource *res = ice->state.global_bindings[i];
+ if (!res)
+ continue;
+
+ iris_use_pinned_bo(batch, iris_resource_bo(res),
+ true, IRIS_DOMAIN_NONE);
+ }
+
if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS |
IRIS_STAGE_DIRTY_BINDINGS_CS |
IRIS_STAGE_DIRTY_CONSTANTS_CS |
}
}
-#define GPGPU_DISPATCHDIMX 0x2500
-#define GPGPU_DISPATCHDIMY 0x2504
-#define GPGPU_DISPATCHDIMZ 0x2508
-
- if (grid->indirect) {
- struct iris_state_ref *grid_size = &ice->state.grid_size;
- struct iris_bo *bo = iris_resource_bo(grid_size->res);
- iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
- lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
- lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
- }
- iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
- lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
- lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
- }
- iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
- lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
- lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
- }
- }
+ if (grid->indirect)
+ iris_load_indirect_location(ice, batch, grid);
const uint32_t right_mask = brw_cs_right_mask(group_size, simd_size);
}
iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
+}
- if (!batch->contains_draw) {
+static void
+iris_upload_compute_state(struct iris_context *ice,
+ struct iris_batch *batch,
+ const struct pipe_grid_info *grid)
+{
+ const uint64_t stage_dirty = ice->state.stage_dirty;
+ struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
+ struct iris_compiled_shader *shader =
+ ice->shaders.prog[MESA_SHADER_COMPUTE];
+
+ iris_batch_sync_region_start(batch);
+
+ /* Always pin the binder. If we're emitting new binding table pointers,
+ * we need it. If not, we're probably inheriting old tables via the
+ * context, and need it anyway. Since true zero-bindings cases are
+ * practically non-existent, just pin it and avoid last_res tracking.
+ */
+ iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);
+
+ if ((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
+ shs->sysvals_need_upload)
+ upload_sysvals(ice, MESA_SHADER_COMPUTE, grid);
+
+ if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)
+ iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
+
+ if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)
+ iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);
+
+ iris_use_optional_res(batch, shs->sampler_table.res, false,
+ IRIS_DOMAIN_NONE);
+ iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,
+ IRIS_DOMAIN_NONE);
+
+ if (ice->state.need_border_colors)
+ iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false,
+ IRIS_DOMAIN_NONE);
+
+#if GEN_GEN >= 12
+ genX(invalidate_aux_map_state)(batch);
+#endif
+
+ iris_upload_gpgpu_walker(ice, batch, grid);
+
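+ /* Restore and re-pin saved BOs the first time we dispatch with the
+ * next sequence number; the flag is reset at each sync boundary.
+ */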
+ if (!batch->contains_draw_with_next_seqno) {
iris_restore_compute_saved_bos(ice, batch, grid);
- batch->contains_draw = true;
+ batch->contains_draw_with_next_seqno = batch->contains_draw = true;
}
iris_batch_sync_region_end(batch);
/* ------------------------------------------------------------------- */
+/**
+ * Introduce a batch synchronization boundary, and update its cache coherency
+ * status to reflect the execution of a PIPE_CONTROL command with the
+ * specified flags.
+ */
+static void
+batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
+{
+ iris_batch_sync_boundary(batch);
+
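+ /* Only a CS stall guarantees that earlier writes through these caches
+ * have completed, so treat the cache flush bits as flushes of the
+ * corresponding write domains only in that case.
+ */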
+ if ((flags & PIPE_CONTROL_CS_STALL)) {
+ if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
+ iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
+
+ if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
+ iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
+
+ if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
+ iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
+
+ if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD)))
+ iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ);
+ }
+
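+ /* These flush bits also invalidate the respective caches for later
+ * reads, even without a CS stall.
+ */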
+ if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
+ iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
+
+ if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
+ iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
+
+ if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
+ iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
+
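+ /* IRIS_DOMAIN_OTHER_READ covers several read-only caches, so it only
+ * counts as invalidated once both the texture and constant caches are.
+ */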
+ if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) &&
+ (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE))
+ iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_READ);
+}
+
static unsigned
flags_to_post_sync_op(uint32_t flags)
{
imm, reason);
}
+ batch_mark_sync_for_pipe_control(batch, flags);
iris_batch_sync_region_start(batch);
iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
ctx->set_shader_buffers = iris_set_shader_buffers;
ctx->set_shader_images = iris_set_shader_images;
ctx->set_sampler_views = iris_set_sampler_views;
+ ctx->set_compute_resources = iris_set_compute_resources;
+ ctx->set_global_binding = iris_set_global_binding;
ctx->set_tess_state = iris_set_tess_state;
ctx->set_framebuffer_state = iris_set_framebuffer_state;
ctx->set_polygon_stipple = iris_set_polygon_stipple;