iris: little bits of compute basics
author Kenneth Graunke <kenneth@whitecape.org>
Fri, 27 Jul 2018 04:59:20 +0000 (21:59 -0700)
committer Kenneth Graunke <kenneth@whitecape.org>
Thu, 21 Feb 2019 18:26:09 +0000 (10:26 -0800)
src/gallium/drivers/iris/iris_binder.c
src/gallium/drivers/iris/iris_binder.h
src/gallium/drivers/iris/iris_context.c
src/gallium/drivers/iris/iris_context.h
src/gallium/drivers/iris/iris_draw.c
src/gallium/drivers/iris/iris_program.c
src/gallium/drivers/iris/iris_screen.c
src/gallium/drivers/iris/iris_screen.h
src/gallium/drivers/iris/iris_state.c

index 57c2865a7bc5bee190080ade383318428af640a2..ca60287df3f336cdcf5ea564470b2f182e169d2b 100644 (file)
@@ -178,6 +178,24 @@ iris_binder_reserve_3d(struct iris_context *ice)
    }
 }
 
+/**
+ * Reserve binder space for the compute shader's binding table.
+ *
+ * Nothing happens unless IRIS_DIRTY_BINDINGS_CS is flagged.  When the bound
+ * compute program has a non-empty binding table, space for it is obtained
+ * from iris_binder_reserve() and the returned offset is stored in
+ * bt_offset[MESA_SHADER_COMPUTE].
+ */
+void
+iris_binder_reserve_compute(struct iris_context *ice)
+{
+   if (ice->state.dirty & IRIS_DIRTY_BINDINGS_CS) {
+      const struct iris_compiled_shader *cs =
+         ice->shaders.prog[MESA_SHADER_COMPUTE];
+      const unsigned bt_bytes = cs->prog_data->binding_table.size_bytes;
+
+      /* A zero-sized binding table needs no reservation. */
+      if (bt_bytes > 0) {
+         ice->state.binder.bt_offset[MESA_SHADER_COMPUTE] =
+            iris_binder_reserve(ice, bt_bytes);
+      }
+   }
+}
+
 void
 iris_init_binder(struct iris_context *ice)
 {
index e63170e298f49f61b5327b384d1d99fea9a12b69..78449286c6d90596ac90bd0826fa128eef304eb2 100644 (file)
@@ -53,5 +53,6 @@ void iris_init_binder(struct iris_context *ice);
 void iris_destroy_binder(struct iris_binder *binder);
 uint32_t iris_binder_reserve(struct iris_context *ice, unsigned size);
 void iris_binder_reserve_3d(struct iris_context *ice);
+void iris_binder_reserve_compute(struct iris_context *ice);
 
 #endif
index 73d6b0d806df47a9abcb40a7a4a2fc8dfab08885..6a38b3b4a856c3a50d477d06378cd8fc674f5851 100644 (file)
@@ -208,6 +208,8 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
    genX_call(devinfo, init_blorp, ice);
    ice->vtbl.init_render_context(screen, &ice->render_batch, &ice->vtbl,
                                  &ice->dbg);
+   ice->vtbl.init_compute_context(screen, &ice->compute_batch, &ice->vtbl,
+                                  &ice->dbg);
 
    return ctx;
 }
index 1f553d71fa2d0de9694d945a337db147bff4047f..87c7f144ec74a1c4990e9e810baf96da5c15710e 100644 (file)
@@ -277,11 +277,18 @@ struct iris_vtable {
                                struct iris_batch *batch,
                                struct iris_vtable *vtbl,
                                struct pipe_debug_callback *dbg);
+   void (*init_compute_context)(struct iris_screen *screen,
+                                struct iris_batch *batch,
+                                struct iris_vtable *vtbl,
+                                struct pipe_debug_callback *dbg);
    void (*upload_render_state)(struct iris_context *ice,
                                struct iris_batch *batch,
                                const struct pipe_draw_info *draw);
    void (*update_surface_base_address)(struct iris_batch *batch,
                                        struct iris_binder *binder);
+   void (*upload_compute_state)(struct iris_context *ice,
+                                struct iris_batch *batch,
+                                const struct pipe_grid_info *grid);
    void (*load_register_imm32)(struct iris_batch *batch, uint32_t reg,
                                uint32_t val);
    void (*load_register_imm64)(struct iris_batch *batch, uint32_t reg,
@@ -326,6 +333,8 @@ struct iris_vtable {
                            struct brw_gs_prog_key *key);
    void (*populate_fs_key)(const struct iris_context *ice,
                            struct brw_wm_prog_key *key);
+   void (*populate_cs_key)(const struct iris_context *ice,
+                           struct brw_cs_prog_key *key);
 };
 
 /**
@@ -363,6 +372,9 @@ struct iris_context {
    /** The main batch for rendering. */
    struct iris_batch render_batch;
 
+   /** The batch for compute shader dispatch */
+   struct iris_batch compute_batch;
+
    struct {
       struct iris_uncompiled_shader *uncompiled[MESA_SHADER_STAGES];
       struct iris_compiled_shader *prog[MESA_SHADER_STAGES];
@@ -471,6 +483,8 @@ void iris_init_program_functions(struct pipe_context *ctx);
 void iris_init_resource_functions(struct pipe_context *ctx);
 void iris_init_query_functions(struct pipe_context *ctx);
 void iris_update_compiled_shaders(struct iris_context *ice);
+void iris_update_compiled_compute_shader(struct iris_context *ice);
+
 
 /* iris_blit.c */
 void iris_blorp_surf_for_resource(struct blorp_surf *surf,
@@ -481,6 +495,7 @@ void iris_blorp_surf_for_resource(struct blorp_surf *surf,
 /* iris_draw.c */
 
 void iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info);
+void iris_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
 
 /* iris_pipe_control.c */
 
index 0567bbac72ef4bf138a64f413213ef2b2bb161fe..196f8d2caf4cebe4c6d6ebaff21a35baeccf5fe6 100644 (file)
@@ -74,7 +74,10 @@ iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
    iris_batch_maybe_flush(batch, 1500);
 
+   // XXX: check if BOs are in use by the other batches (compute), if so flush
+
    iris_update_draw_info(ice, info);
+
    iris_update_compiled_shaders(ice);
 
    iris_predraw_resolve_inputs(ice, batch);
@@ -89,3 +92,30 @@ iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
    iris_postdraw_update_resolve_tracking(ice, batch);
 }
+
+/**
+ * Dispatch a compute grid on the context's compute batch.
+ *
+ * Makes sure a compiled compute shader is selected, reserves binding table
+ * space, refreshes the surface base address and then hands off to the
+ * per-generation upload_compute_state hook.
+ */
+void
+iris_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
+{
+   struct iris_context *ice = (struct iris_context *) ctx;
+   struct iris_batch *cs_batch = &ice->compute_batch;
+
+   /* With DEBUG_REEMIT set in INTEL_DEBUG, flag every state bit dirty. */
+   if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
+      ice->state.dirty = ~0ull;
+   }
+
+   iris_batch_maybe_flush(cs_batch, 1500);
+
+   // XXX: check if BOs are in use by the other batches (render), if so flush
+
+   // XXX: only needed when IRIS_DIRTY_UNCOMPILED_CS is set; done
+   // XXX: unconditionally for now
+   iris_update_compiled_compute_shader(ice);
+
+   // XXX: predraw resolves / cache flushing
+
+   iris_binder_reserve_compute(ice);
+   ice->vtbl.update_surface_base_address(cs_batch, &ice->state.binder);
+   ice->vtbl.upload_compute_state(ice, cs_batch, info);
+
+   /* NOTE(review): this clears every dirty bit, not only compute-related
+    * ones.  If the render path relies on the same field, its pending bits
+    * are dropped here too -- confirm and narrow the mask if so.
+    */
+   ice->state.dirty = 0ull;
+
+   // XXX: postdraw resolve tracking
+}
index ba09c087d69eba58fe395cfe335a370b3074a289..a62b2d0fb4242c9be9ed9f4e57d230aa3e99e7eb 100644 (file)
@@ -214,6 +214,7 @@ iris_create_uncompiled_shader(struct pipe_context *ctx,
    }
 
    // XXX: precompile!
+   // XXX: disallow more than 64KB of shared variables
 
    return ish;
 }
@@ -1004,6 +1005,58 @@ iris_update_compiled_shaders(struct iris_context *ice)
    }
 }
 
+/**
+ * Compile the compute shader \p ish for the variant described by \p key.
+ *
+ * Sets up binding table offsets and uniforms, runs brw_compile_cs(), and on
+ * success passes the result to iris_upload_and_bind_shader().  The temporary
+ * ralloc context is freed on every exit path.
+ *
+ * \return true if a program was produced, false if brw_compile_cs() failed
+ *         (the failure is reported through dbg_printf()).
+ */
+static bool
+iris_compile_cs(struct iris_context *ice,
+                struct iris_uncompiled_shader *ish,
+                const struct brw_cs_prog_key *key)
+{
+   struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   const struct gen_device_info *devinfo = &screen->devinfo;
+
+   void *tmp_ctx = ralloc_context(NULL);
+   struct brw_cs_prog_data *cs_data =
+      rzalloc(tmp_ctx, struct brw_cs_prog_data);
+   struct brw_stage_prog_data *base_data = &cs_data->base;
+   nir_shader *nir = ish->nir;
+
+   cs_data->binding_table.work_groups_start = 0;
+   assign_common_binding_table_offsets(devinfo, nir, base_data, 1);
+
+   iris_setup_uniforms(compiler, tmp_ctx, nir, base_data);
+
+   char *error_str = NULL;
+   const unsigned *program =
+      brw_compile_cs(compiler, &ice->dbg, tmp_ctx, key, cs_data,
+                     nir, -1, &error_str);
+   const bool compiled = program != NULL;
+
+   if (compiled) {
+      iris_upload_and_bind_shader(ice, IRIS_CACHE_CS, key, program, base_data,
+                                  NULL);
+   } else {
+      dbg_printf("Failed to compile compute shader: %s\n", error_str);
+   }
+
+   ralloc_free(tmp_ctx);
+   return compiled;
+}
+
+/**
+ * Select the compiled variant of the currently bound compute shader.
+ *
+ * Builds the program key, then asks iris_bind_cached_shader() to bind a
+ * matching variant.  If that does not succeed, the shader is compiled with
+ * iris_compile_cs().
+ */
+void
+iris_update_compiled_compute_shader(struct iris_context *ice)
+{
+   struct iris_uncompiled_shader *cs_ish =
+      ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
+   struct brw_cs_prog_key cs_key = { .program_string_id = cs_ish->program_id };
+
+   ice->vtbl.populate_cs_key(ice, &cs_key);
+
+   if (!iris_bind_cached_shader(ice, IRIS_CACHE_CS, &cs_key)) {
+      /* The compile result is currently not acted upon. */
+      UNUSED bool success = iris_compile_cs(ice, cs_ish, &cs_key);
+   }
+}
+
 void
 iris_init_program_functions(struct pipe_context *ctx)
 {
index 1f336adc20b7dc7d8fce826c270be354407c1c0b..5e27acc54b1069bd192ae2573f2f89f6e75310f6 100644 (file)
@@ -587,6 +587,10 @@ iris_screen_create(int fd)
    slab_create_parent(&screen->transfer_pool,
                       sizeof(struct iris_transfer), 64);
 
+   screen->subslice_total =
+      iris_getparam_integer(screen, I915_PARAM_SUBSLICE_TOTAL);
+   assert(screen->subslice_total >= 1);
+
    struct pipe_screen *pscreen = &screen->base;
 
    iris_init_screen_resource_functions(pscreen);
index 463b191d1315fcd7ff46d240eb03175a9690f774..aa510efed180407ad50db0c07a32e251abfff421 100644 (file)
@@ -51,6 +51,8 @@ struct iris_screen {
    /** Global program_string_id counter (see get_program_string_id()) */
    unsigned program_id;
 
+   unsigned subslice_total;
+
    struct gen_device_info devinfo;
    struct isl_device isl_dev;
    struct iris_bufmgr *bufmgr;
index b20a608f3871714909dc4201402fde165ec795ac..189bd5e2380c6e5920fe2bb0f176f6a1a5b60d5f 100644 (file)
@@ -610,6 +610,54 @@ iris_init_render_context(struct iris_screen *screen,
    }
 }
 
+/**
+ * Set up the compute batch and emit its initial commands.
+ *
+ * Initializes \p batch via iris_init_batch() (passing I915_EXEC_RENDER),
+ * then emits PIPELINE_SELECT choosing GPGPU, followed by a
+ * STATE_BASE_ADDRESS that programs the instruction, surface state and
+ * dynamic state bases to the start of their respective memory zones.
+ */
+static void
+iris_init_compute_context(struct iris_screen *screen,
+                          struct iris_batch *batch,
+                          struct iris_vtable *vtbl,
+                          struct pipe_debug_callback *dbg)
+{
+   iris_init_batch(batch, screen, vtbl, dbg, I915_EXEC_RENDER);
+
+   /* XXX: PIPE_CONTROLs */
+
+   iris_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
+      sel.PipelineSelection = GPGPU;
+   }
+
+   iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
+   /* MOCS programming is compiled out for now; see the XXX below. */
+   #if 0
+   // XXX: MOCS is stupid for this.
+      sba.GeneralStateMemoryObjectControlState            = MOCS_WB;
+      sba.StatelessDataPortAccessMemoryObjectControlState = MOCS_WB;
+      sba.SurfaceStateMemoryObjectControlState            = MOCS_WB;
+      sba.DynamicStateMemoryObjectControlState            = MOCS_WB;
+      sba.IndirectObjectMemoryObjectControlState          = MOCS_WB;
+      sba.InstructionMemoryObjectControlState             = MOCS_WB;
+      sba.BindlessSurfaceStateMemoryObjectControlState    = MOCS_WB;
+   #endif
+
+      /* Every base address and buffer size "modify enable" bit is set. */
+      sba.GeneralStateBaseAddressModifyEnable   = true;
+      sba.SurfaceStateBaseAddressModifyEnable   = true;
+      sba.DynamicStateBaseAddressModifyEnable   = true;
+      sba.IndirectObjectBaseAddressModifyEnable = true;
+      sba.InstructionBaseAddressModifyEnable    = true;
+      sba.GeneralStateBufferSizeModifyEnable    = true;
+      sba.DynamicStateBufferSizeModifyEnable    = true;
+      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+      sba.IndirectObjectBufferSizeModifyEnable  = true;
+      sba.InstructionBuffersizeModifyEnable     = true;
+
+      /* Bases are fixed addresses (no BO) at the start of each memzone. */
+      sba.InstructionBaseAddress  = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);
+      sba.SurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SURFACE_START);
+      sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);
+
+      sba.GeneralStateBufferSize   = 0xfffff;
+      sba.IndirectObjectBufferSize = 0xfffff;
+      sba.InstructionBufferSize    = 0xfffff;
+      sba.DynamicStateBufferSize   = 0xfffff;
+   }
+}
+
 struct iris_vertex_buffer_state {
    /** The 3DSTATE_VERTEX_BUFFERS hardware packet. */
    uint32_t vertex_buffers[1 + 33 * GENX(VERTEX_BUFFER_STATE_length)];
@@ -646,12 +694,6 @@ struct iris_genx_state {
    uint32_t streamout[4 * GENX(3DSTATE_STREAMOUT_length)];
 };
 
-// XXX: move this to iris_draw.c
-static void
-iris_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
-{
-}
-
 /**
  * The pipe->set_blend_color() driver hook.
  *
@@ -2826,6 +2868,13 @@ iris_populate_fs_key(const struct iris_context *ice,
    // XXX: respect hint for high_quality_derivatives:1;
 }
 
+/**
+ * Fill in the compute shader program key.
+ *
+ * Only the sampler portion (key->tex) is populated, by delegating to
+ * iris_populate_sampler_key().
+ */
+static void
+iris_populate_cs_key(const struct iris_context *ice,
+                     struct brw_cs_prog_key *key)
+{
+   iris_populate_sampler_key(ice, &key->tex);
+}
+
 #if 0
    // XXX: these need to go in INIT_THREAD_DISPATCH_FIELDS
    pkt.SamplerCount =                                                     \
@@ -3074,6 +3123,26 @@ iris_store_fs_state(const struct gen_device_info *devinfo,
  *
  * This must match the data written by the iris_store_xs_state() functions.
  */
+/**
+ * Pack the shader-dependent INTERFACE_DESCRIPTOR_DATA fields for a compute
+ * shader into its derived_data storage.
+ *
+ * \p devinfo is accepted for symmetry with the other store functions but is
+ * not used here.
+ */
+static void
+iris_store_cs_state(const struct gen_device_info *devinfo,
+                    struct iris_compiled_shader *shader)
+{
+   struct brw_stage_prog_data *base = shader->prog_data;
+   struct brw_cs_prog_data *cs = (struct brw_cs_prog_data *) base;
+   void *out = shader->derived_data;
+
+   iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), out, idd) {
+      idd.KernelStartPointer = KSP(shader);
+
+      /* Push constant layout */
+      idd.ConstantURBEntryReadLength = cs->push.per_thread.regs;
+      idd.CrossThreadConstantDataReadLength = cs->push.cross_thread.regs;
+
+      /* Thread group properties */
+      idd.NumberofThreadsinGPGPUThreadGroup = cs->threads;
+      idd.SharedLocalMemorySize = encode_slm_size(GEN_GEN, base->total_shared);
+      idd.BarrierEnable = cs->uses_barrier;
+   }
+}
+
 static unsigned
 iris_derived_program_state_size(enum iris_program_cache_id cache_id)
 {
@@ -3086,7 +3155,7 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id)
       [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
       [IRIS_CACHE_FS] =
          GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
-      [IRIS_CACHE_CS] = 0,
+      [IRIS_CACHE_CS] = GENX(INTERFACE_DESCRIPTOR_DATA_length),
       [IRIS_CACHE_BLORP] = 0,
    };
 
@@ -3121,6 +3190,7 @@ iris_store_derived_program_state(const struct gen_device_info *devinfo,
       iris_store_fs_state(devinfo, shader);
       break;
    case IRIS_CACHE_CS:
+      iris_store_cs_state(devinfo, shader);
    case IRIS_CACHE_BLORP:
       break;
    default:
@@ -4126,6 +4196,126 @@ iris_upload_render_state(struct iris_context *ice,
    }
 }
 
+/**
+ * Emit the commands needed to dispatch one compute grid into \p batch.
+ *
+ * In order: repopulates the compute binding table when
+ * IRIS_DIRTY_BINDINGS_CS is set, marks the sampler table, shader assembly
+ * and (if needed) border color pool as used by the batch, then emits a
+ * CS-stall PIPE_CONTROL, MEDIA_VFE_STATE, MEDIA_CURBE_LOAD (only when the
+ * shader has push constants), MEDIA_INTERFACE_DESCRIPTOR_LOAD and finally
+ * GPGPU_WALKER.
+ *
+ * \param ice    context providing the compute shader and its state
+ * \param batch  batch the commands are written to
+ * \param grid   dispatch parameters; grid->block[] is the size of one
+ *               workgroup and grid->grid[] is the number of workgroups
+ */
+static void
+iris_upload_compute_state(struct iris_context *ice,
+                          struct iris_batch *batch,
+                          const struct pipe_grid_info *grid)
+{
+   const uint64_t dirty = ice->state.dirty;
+   struct iris_screen *screen = batch->screen;
+   const struct gen_device_info *devinfo = &screen->devinfo;
+   struct iris_binder *binder = &ice->state.binder;
+   struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
+   struct iris_compiled_shader *shader =
+      ice->shaders.prog[MESA_SHADER_COMPUTE];
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
+
+   if (dirty & IRIS_DIRTY_BINDINGS_CS)
+      iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
+
+   iris_use_optional_res(batch, shs->sampler_table.res, false);
+   iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false);
+
+   if (ice->state.need_border_colors)
+      iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false);
+
+   /* The MEDIA_VFE_STATE documentation for Gen8+ says:
+    *
+    *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+    *    the only bits that are changed are scoreboard related: Scoreboard
+    *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
+    *    these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
+    */
+   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
+
+   iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
+      if (prog_data->total_scratch) {
+         /* Per Thread Scratch Space is in the range [0, 11] where
+          * 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
+          */
+         // XXX: vfe.ScratchSpaceBasePointer
+         //vfe.PerThreadScratchSpace =
+            //ffs(stage_state->per_thread_scratch) - 11;
+      }
+
+      vfe.MaximumNumberofThreads =
+         devinfo->max_cs_threads * screen->subslice_total - 1;
+#if GEN_GEN < 11
+      vfe.ResetGatewayTimer =
+         Resettingrelativetimerandlatchingtheglobaltimestamp;
+#endif
+
+      vfe.NumberofURBEntries = 2;
+      vfe.URBEntryAllocationSize = 2;
+
+      // XXX: Use Indirect Payload Storage?
+      vfe.CURBEAllocationSize =
+         ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
+               cs_prog_data->push.cross_thread.regs, 2);
+   }
+
+   // XXX: hack iris_set_constant_buffers to upload compute shader constants
+   // XXX: differently...?
+
+   if (cs_prog_data->push.total.size > 0) {
+      iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+         curbe.CURBETotalDataLength =
+            ALIGN(cs_prog_data->push.total.size, 64);
+         // XXX: curbe.CURBEDataStartAddress = stage_state->push_const_offset;
+      }
+   }
+
+   struct pipe_resource *desc_res = NULL;
+   uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
+
+   /* Pack only the per-dispatch fields here... */
+   iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
+      idd.SamplerStatePointer = shs->sampler_table.offset;
+      idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
+   }
+
+   /* ...and OR in the descriptor dwords stored in shader->derived_data. */
+   for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)
+      desc[i] |= ((uint32_t *) shader->derived_data)[i];
+
+   iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
+      load.InterfaceDescriptorTotalLength =
+         GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+      load.InterfaceDescriptorDataStartAddress =
+         emit_state(batch, ice->state.dynamic_uploader,
+                    &desc_res, desc, sizeof(desc), 32);
+   }
+
+   pipe_resource_reference(&desc_res, NULL);
+
+   // XXX: grid->indirect
+
+   /* Invocations per workgroup, and how many of them land in the last,
+    * possibly partial, SIMD thread of the group.
+    */
+   uint32_t group_size = grid->block[0] * grid->block[1] * grid->block[2];
+   uint32_t remainder = group_size & (cs_prog_data->simd_size - 1);
+   uint32_t right_mask;
+
+   /* Enable the low `remainder` channels, or all simd_size channels when
+    * the group size is an exact multiple of the SIMD width.
+    */
+   if (remainder > 0)
+      right_mask = ~0u >> (32 - remainder);
+   else
+      right_mask = ~0u >> (32 - cs_prog_data->simd_size);
+
+   iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.SIMDSize                   = cs_prog_data->simd_size / 16;
+      ggw.ThreadDepthCounterMaximum  = 0;
+      ggw.ThreadHeightCounterMaximum = 0;
+      ggw.ThreadWidthCounterMaximum  = cs_prog_data->threads - 1;
+      /* The walker takes the number of thread groups to launch in each
+       * dimension, i.e. grid->grid[].  grid->block[] is the size of a
+       * single group and is already accounted for through group_size.
+       */
+      ggw.ThreadGroupIDXDimension    = grid->grid[0];
+      ggw.ThreadGroupIDYDimension    = grid->grid[1];
+      ggw.ThreadGroupIDZDimension    = grid->grid[2];
+      ggw.RightExecutionMask         = right_mask;
+      ggw.BottomExecutionMask        = 0xffffffff;
+   }
+
+   if (!batch->contains_draw) {
+      //iris_restore_context_saved_bos(ice, batch, draw);
+      batch->contains_draw = true;
+   }
+}
+
 /**
  * State module teardown.
  */
@@ -4729,8 +4919,10 @@ genX(init_state)(struct iris_context *ice)
 
    ice->vtbl.destroy_state = iris_destroy_state;
    ice->vtbl.init_render_context = iris_init_render_context;
+   ice->vtbl.init_compute_context = iris_init_compute_context;
    ice->vtbl.upload_render_state = iris_upload_render_state;
    ice->vtbl.update_surface_base_address = iris_update_surface_base_address;
+   ice->vtbl.upload_compute_state = iris_upload_compute_state;
    ice->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
    ice->vtbl.load_register_imm32 = iris_load_register_imm32;
    ice->vtbl.load_register_imm64 = iris_load_register_imm64;
@@ -4749,6 +4941,7 @@ genX(init_state)(struct iris_context *ice)
    ice->vtbl.populate_tes_key = iris_populate_tes_key;
    ice->vtbl.populate_gs_key = iris_populate_gs_key;
    ice->vtbl.populate_fs_key = iris_populate_fs_key;
+   ice->vtbl.populate_cs_key = iris_populate_cs_key;
 
    ice->state.dirty = ~0ull;