#include "intel_batchbuffer.h"
#include "brw_nir.h"
#include "brw_program.h"
-
-void
-brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
- void *buffer, uint32_t threads, uint32_t stride)
-{
- if (prog_data->local_invocation_id_regs == 0)
- return;
-
- /* 'stride' should be an integer number of registers, that is, a multiple
- * of 32 bytes.
- */
- assert(stride % 32 == 0);
-
- unsigned x = 0, y = 0, z = 0;
- for (unsigned t = 0; t < threads; t++) {
- uint32_t *param = (uint32_t *) buffer + stride * t / 4;
-
- for (unsigned i = 0; i < prog_data->simd_size; i++) {
- param[0 * prog_data->simd_size + i] = x;
- param[1 * prog_data->simd_size + i] = y;
- param[2 * prog_data->simd_size + i] = z;
-
- x++;
- if (x == prog_data->local_size[0]) {
- x = 0;
- y++;
- if (y == prog_data->local_size[1]) {
- y = 0;
- z++;
- if (z == prog_data->local_size[2])
- z = 0;
- }
- }
- }
- }
-}
+#include "compiler/glsl/ir_uniform.h"
static void
assign_cs_binding_table_offsets(const struct brw_device_info *devinfo,
memset(&prog_data, 0, sizeof(prog_data));
+ if (prog->Comp.SharedSize > 64 * 1024) {
+ prog->LinkStatus = false;
+ const char *error_str =
+ "Compute shader used more than 64KB of shared variables";
+ ralloc_strcat(&prog->InfoLog, error_str);
+ _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str);
+
+ ralloc_free(mem_ctx);
+ return false;
+ } else {
+ prog_data.base.total_shared = prog->Comp.SharedSize;
+ }
+
assign_cs_binding_table_offsets(brw->intelScreen->devinfo, prog,
&cp->program.Base, &prog_data);
* prog_data associated with the compiled program, and which will be freed
* by the state cache.
*/
- int param_count = cp->program.Base.nir->num_uniforms;
+ int param_count = cp->program.Base.nir->num_uniforms / 4;
+
+ /* The backend also sometimes adds a param for the thread local id. */
+ prog_data.thread_local_id_index = param_count++;
/* The backend also sometimes adds params for texture size. */
param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
}
}
- if (prog_data.base.total_scratch) {
- brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo,
- prog_data.base.total_scratch * brw->max_cs_threads);
- }
+ const unsigned subslices = MAX2(brw->intelScreen->subslice_total, 1);
+
+ /* WaCSScratchSize:hsw
+ *
+ * Haswell's scratch space address calculation appears to be sparse
+ * rather than tightly packed. The Thread ID has bits indicating
+ * which subslice, EU within a subslice, and thread within an EU
+ * it is. There's a maximum of two slices and two subslices, so these
+ * can be stored with a single bit. Even though there are only 10 EUs
+ * per subslice, this is stored in 4 bits, so there's an effective
+ * maximum value of 16 EUs. Similarly, although there are only 7
+ * threads per EU, this is stored in a 3 bit number, giving an effective
+ * maximum value of 8 threads per EU.
+ *
+ * This means that we need to use 16 * 8 instead of 10 * 7 for the
+ * number of threads per subslice.
+ */
+ const unsigned scratch_ids_per_subslice =
+ brw->is_haswell ? 16 * 8 : brw->max_cs_threads;
+
+ brw_alloc_stage_scratch(brw, &brw->cs.base,
+ prog_data.base.total_scratch,
+ scratch_ids_per_subslice * subslices);
if (unlikely(INTEL_DEBUG & DEBUG_CS))
fprintf(stderr, "\n");
memset(key, 0, sizeof(*key));
/* _NEW_TEXTURE */
- brw_populate_sampler_prog_key_data(ctx, prog, brw->cs.base.sampler_count,
- &key->tex);
+ brw_populate_sampler_prog_key_data(ctx, prog, &key->tex);
/* The unique compute program ID */
key->program_string_id = cp->id;
return;
brw->cs.base.sampler_count =
- _mesa_fls(ctx->ComputeProgram._Current->Base.SamplersUsed);
+ util_last_bit(ctx->ComputeProgram._Current->Base.SamplersUsed);
brw_cs_populate_key(brw, &key);