From 4d219b0eb3d626abf094a53655843664974c7516 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 7 Nov 2018 22:05:14 -0800 Subject: [PATCH] iris: implement scratch space! We borrow the approach from anv rather than i965, as it works better with pre-baked state that needs to contain scratch BO addresses. Fixes a bunch of varying packing tests. --- src/gallium/drivers/iris/iris_context.h | 14 +++- src/gallium/drivers/iris/iris_program.c | 50 +++++++++++++ src/gallium/drivers/iris/iris_program_cache.c | 4 +- src/gallium/drivers/iris/iris_state.c | 75 ++++++++++++------- 4 files changed, 111 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index 3f68cb3552c..54aa1a509e8 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -328,7 +328,7 @@ struct iris_vtable { uint64_t imm); unsigned (*derived_program_state_size)(enum iris_program_cache_id id); - void (*store_derived_program_state)(const struct gen_device_info *devinfo, + void (*store_derived_program_state)(struct iris_context *ice, enum iris_program_cache_id cache_id, struct iris_compiled_shader *shader); uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol, @@ -394,6 +394,14 @@ struct iris_context { struct hash_table *cache; unsigned urb_size; + + /** + * Scratch buffers for various sizes and stages. + * + * Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding, + * and shader stage. 
+ */ + struct iris_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES]; } shaders; struct { @@ -552,7 +560,9 @@ const struct shader_info *iris_get_shader_info(const struct iris_context *ice, gl_shader_stage stage); unsigned iris_get_shader_num_ubos(const struct iris_context *ice, gl_shader_stage stage); - +uint32_t iris_get_scratch_space(struct iris_context *ice, + unsigned per_thread_scratch, + gl_shader_stage stage); /* iris_program_cache.c */ diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c index fc874c4c08f..626098c6a7a 100644 --- a/src/gallium/drivers/iris/iris_program.c +++ b/src/gallium/drivers/iris/iris_program.c @@ -1072,6 +1072,56 @@ iris_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data, dst[8 * t] = t; } +/** + * Allocate scratch BOs as needed for the given per-thread size and stage. + * + * Returns the 32-bit "Scratch Space Base Pointer" value. + */ +uint32_t +iris_get_scratch_space(struct iris_context *ice, + unsigned per_thread_scratch, + gl_shader_stage stage) +{ + struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; + struct iris_bufmgr *bufmgr = screen->bufmgr; + const struct gen_device_info *devinfo = &screen->devinfo; + + unsigned encoded_size = ffs(per_thread_scratch) - 11; + assert(encoded_size < (1 << 4)); /* must stay within scratch_bos[1 << 4][] */ + + struct iris_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage]; + + /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says: + * + * "Scratch Space per slice is computed based on 4 sub-slices. SW must + * allocate scratch space enough so that each slice has 4 slices + * allowed." + * + * According to the other driver team, this applies to compute shaders + * as well. This is not currently documented at all. 
+ */ + unsigned subslice_total = 4 * devinfo->num_slices; + assert(subslice_total >= screen->subslice_total); + + if (!*bop) { + unsigned scratch_ids_per_subslice = devinfo->max_cs_threads; + uint32_t max_threads[] = { + [MESA_SHADER_VERTEX] = devinfo->max_vs_threads, + [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads, + [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads, + [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, + [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, + [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslice_total, + }; + + uint32_t size = per_thread_scratch * max_threads[stage]; + + *bop = iris_bo_alloc(bufmgr, "scratch", size, IRIS_MEMZONE_SHADER); + } + + return (*bop)->gtt_offset; +} + void iris_init_program_functions(struct pipe_context *ctx) { diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c index f5c6fc26794..9f500097d52 100644 --- a/src/gallium/drivers/iris/iris_program_cache.c +++ b/src/gallium/drivers/iris/iris_program_cache.c @@ -241,8 +241,6 @@ iris_upload_shader(struct iris_context *ice, struct brw_stage_prog_data *prog_data, uint32_t *streamout) { - struct iris_screen *screen = (void *) ice->ctx.screen; - struct gen_device_info *devinfo = &screen->devinfo; struct hash_table *cache = ice->shaders.cache; struct iris_compiled_shader *shader = rzalloc_size(cache, sizeof(struct iris_compiled_shader) + @@ -277,7 +275,7 @@ iris_upload_shader(struct iris_context *ice, ralloc_steal(shader, shader->streamout); /* Store the 3DSTATE shader packets and other derived state. 
*/ - ice->vtbl.store_derived_program_state(devinfo, cache_id, shader); + ice->vtbl.store_derived_program_state(ice, cache_id, shader); struct keybox *keybox = make_keybox(cache, cache_id, key, key_size); _mesa_hash_table_insert(ice->shaders.cache, keybox, shader); diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 3c668cac980..3d2af86c544 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -2981,8 +2981,6 @@ iris_populate_cs_key(const struct iris_context *ice, // XXX: these need to go in INIT_THREAD_DISPATCH_FIELDS pkt.SamplerCount = \ DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \ - pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 : \ - ffs(stage_state->per_thread_scratch) - 11; \ #endif @@ -2997,7 +2995,7 @@ KSP(const struct iris_compiled_shader *shader) // prefetching of binding tables in A0 and B0 steppings. XXX: Revisit // this WA on C0 stepping. -#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \ +#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \ pkt.KernelStartPointer = KSP(shader); \ pkt.BindingTableEntryCount = GEN_GEN == 11 ? 0 : \ prog_data->binding_table.size_bytes / 4; \ @@ -3009,20 +3007,28 @@ KSP(const struct iris_compiled_shader *shader) pkt.prefix##URBEntryReadOffset = 0; \ \ pkt.StatisticsEnable = true; \ - pkt.Enable = true; + pkt.Enable = true; \ + \ + if (prog_data->total_scratch) { \ + uint32_t scratch_addr = \ + iris_get_scratch_space(ice, prog_data->total_scratch, stage); \ + pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \ + pkt.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr); \ + } /** * Encode most of 3DSTATE_VS based on the compiled shader. 
*/ static void -iris_store_vs_state(const struct gen_device_info *devinfo, +iris_store_vs_state(struct iris_context *ice, + const struct gen_device_info *devinfo, struct iris_compiled_shader *shader) { struct brw_stage_prog_data *prog_data = shader->prog_data; struct brw_vue_prog_data *vue_prog_data = (void *) prog_data; iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) { - INIT_THREAD_DISPATCH_FIELDS(vs, Vertex); + INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX); vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1; vs.SIMD8DispatchEnable = true; vs.UserClipDistanceCullTestEnableBitmask = @@ -3034,7 +3040,8 @@ iris_store_vs_state(const struct gen_device_info *devinfo, * Encode most of 3DSTATE_HS based on the compiled shader. */ static void -iris_store_tcs_state(const struct gen_device_info *devinfo, +iris_store_tcs_state(struct iris_context *ice, + const struct gen_device_info *devinfo, struct iris_compiled_shader *shader) { struct brw_stage_prog_data *prog_data = shader->prog_data; @@ -3042,7 +3049,7 @@ iris_store_tcs_state(const struct gen_device_info *devinfo, struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data; iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) { - INIT_THREAD_DISPATCH_FIELDS(hs, Vertex); + INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL); hs.InstanceCount = tcs_prog_data->instances - 1; hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; @@ -3054,7 +3061,8 @@ iris_store_tcs_state(const struct gen_device_info *devinfo, * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader. 
*/ static void -iris_store_tes_state(const struct gen_device_info *devinfo, +iris_store_tes_state(struct iris_context *ice, + const struct gen_device_info *devinfo, struct iris_compiled_shader *shader) { struct brw_stage_prog_data *prog_data = shader->prog_data; @@ -3074,7 +3082,7 @@ iris_store_tes_state(const struct gen_device_info *devinfo, } iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) { - INIT_THREAD_DISPATCH_FIELDS(ds, Patch); + INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL); ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; @@ -3091,7 +3099,8 @@ iris_store_tes_state(const struct gen_device_info *devinfo, * Encode most of 3DSTATE_GS based on the compiled shader. */ static void -iris_store_gs_state(const struct gen_device_info *devinfo, +iris_store_gs_state(struct iris_context *ice, + const struct gen_device_info *devinfo, struct iris_compiled_shader *shader) { struct brw_stage_prog_data *prog_data = shader->prog_data; @@ -3099,7 +3108,7 @@ iris_store_gs_state(const struct gen_device_info *devinfo, struct brw_gs_prog_data *gs_prog_data = (void *) prog_data; iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) { - INIT_THREAD_DISPATCH_FIELDS(gs, Vertex); + INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY); gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; gs.OutputTopology = gs_prog_data->output_topology; @@ -3138,7 +3147,8 @@ iris_store_gs_state(const struct gen_device_info *devinfo, * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader. 
*/ static void -iris_store_fs_state(const struct gen_device_info *devinfo, +iris_store_fs_state(struct iris_context *ice, + const struct gen_device_info *devinfo, struct iris_compiled_shader *shader) { struct brw_stage_prog_data *prog_data = shader->prog_data; @@ -3193,6 +3203,14 @@ iris_store_fs_state(const struct gen_device_info *devinfo, KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); ps.KernelStartPointer2 = KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); + + if (prog_data->total_scratch) { + uint32_t scratch_addr = + iris_get_scratch_space(ice, prog_data->total_scratch, + MESA_SHADER_FRAGMENT); + ps.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; + ps.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr); + } } iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) { @@ -3226,7 +3244,8 @@ iris_store_fs_state(const struct gen_device_info *devinfo, * This must match the data written by the iris_store_xs_state() functions. */ static void -iris_store_cs_state(const struct gen_device_info *devinfo, +iris_store_cs_state(struct iris_context *ice, + const struct gen_device_info *devinfo, struct iris_compiled_shader *shader) { struct brw_stage_prog_data *prog_data = shader->prog_data; @@ -3271,28 +3290,31 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id) * get most of the state packet without having to reconstruct it. 
*/ static void -iris_store_derived_program_state(const struct gen_device_info *devinfo, +iris_store_derived_program_state(struct iris_context *ice, enum iris_program_cache_id cache_id, struct iris_compiled_shader *shader) { + struct iris_screen *screen = (void *) ice->ctx.screen; + const struct gen_device_info *devinfo = &screen->devinfo; + switch (cache_id) { case IRIS_CACHE_VS: - iris_store_vs_state(devinfo, shader); + iris_store_vs_state(ice, devinfo, shader); break; case IRIS_CACHE_TCS: - iris_store_tcs_state(devinfo, shader); + iris_store_tcs_state(ice, devinfo, shader); break; case IRIS_CACHE_TES: - iris_store_tes_state(devinfo, shader); + iris_store_tes_state(ice, devinfo, shader); break; case IRIS_CACHE_GS: - iris_store_gs_state(devinfo, shader); + iris_store_gs_state(ice, devinfo, shader); break; case IRIS_CACHE_FS: - iris_store_fs_state(devinfo, shader); + iris_store_fs_state(ice, devinfo, shader); break; case IRIS_CACHE_CS: - iris_store_cs_state(devinfo, shader); + iris_store_cs_state(ice, devinfo, shader); case IRIS_CACHE_BLORP: break; default: @@ -4401,12 +4423,11 @@ iris_upload_compute_state(struct iris_context *ice, iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) { if (prog_data->total_scratch) { - /* Per Thread Scratch Space is in the range [0, 11] where - * 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. - */ - // XXX: vfe.ScratchSpaceBasePointer - //vfe.PerThreadScratchSpace = - //ffs(stage_state->per_thread_scratch) - 11; + uint32_t scratch_addr = + iris_get_scratch_space(ice, prog_data->total_scratch, + MESA_SHADER_COMPUTE); + vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; + vfe.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr); } vfe.MaximumNumberofThreads = -- 2.30.2