#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
+#include "brw_cs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
#include "program/sampler.h"
using namespace brw;
inst->regs_written = regs_written;
if (devinfo->gen < 7) {
- inst->base_mrf = 13;
+ inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen);
inst->header_size = 1;
if (devinfo->gen == 4)
inst->mlen = 3;
negate == r.negate &&
abs == r.abs &&
!reladdr && !r.reladdr &&
- memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
+ ((file != HW_REG && file != IMM) ||
+ memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
+ sizeof(fixed_hw_reg)) == 0) &&
stride == r.stride);
}
break;
case CS_OPCODE_CS_TERMINATE:
+ case SHADER_OPCODE_BARRIER:
return 1;
default:
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_LOD:
+ case SHADER_OPCODE_SAMPLEINFO:
return 1;
case FS_OPCODE_FB_WRITE:
return 2;
+ case FS_OPCODE_GET_BUFFER_SIZE:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case SHADER_OPCODE_GEN4_SCRATCH_READ:
return 1;
this->param_size = v->param_size;
}
-void
-fs_visitor::setup_vec4_uniform_value(unsigned param_offset,
- const gl_constant_value *values,
- unsigned n)
-{
- static const gl_constant_value zero = { 0 };
-
- for (unsigned i = 0; i < n; ++i)
- stage_prog_data->param[param_offset + i] = &values[i];
-
- for (unsigned i = n; i < 4; ++i)
- stage_prog_data->param[param_offset + i] = &zero;
-}
-
fs_reg *
fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
bool origin_upper_left)
unsigned int array_elements;
if (type->is_array()) {
- array_elements = type->length;
+ array_elements = type->arrays_of_arrays_size();
if (array_elements == 0) {
fail("dereferenced array '%s' has length 0\n", name);
}
- type = type->fields.array;
+ type = type->without_array();
} else {
array_elements = 1;
}
}
}
}
+
+ /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
+ this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
}
void
int urb_next = 0;
/* Figure out where each of the incoming setup attributes lands. */
if (devinfo->gen >= 6) {
- if (_mesa_bitcount_64(prog->InputsRead &
+ if (_mesa_bitcount_64(nir->info.inputs_read &
BRW_FS_VARYING_INPUT_MASK) <= 16) {
/* The SF/SBE pipeline stage can do arbitrary rearrangement of the
* first 16 varying inputs, so we can put them wherever we want.
* a different vertex (or geometry) shader.
*/
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
- if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
+ if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
*/
struct brw_vue_map prev_stage_vue_map;
brw_compute_vue_map(devinfo, &prev_stage_vue_map,
- key->input_slots_valid);
+ key->input_slots_valid,
+ nir->info.separate_shader);
int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
assert(prev_stage_vue_map.num_slots <= first_slot + 32);
for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
* unused.
*/
if (varying != BRW_VARYING_SLOT_COUNT &&
- (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
+ (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
BITFIELD64_BIT(varying))) {
prog_data->urb_setup[varying] = slot - first_slot;
}
*
* See compile_sf_prog() for more info.
*/
- if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
+ if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
}
}
/* Each attribute is 4 setup channels, each of which is half a reg. */
- this->first_non_payload_grf =
- urb_start + prog_data->num_varying_inputs * 2;
+ this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
}
void
fs_visitor::assign_vs_urb_setup()
{
brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
- int grf, count, slot, channel, attr;
assert(stage == MESA_SHADER_VERTEX);
- count = _mesa_bitcount_64(vs_prog_data->inputs_read);
+ int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
count++;
/* Each attribute is 4 regs. */
- this->first_non_payload_grf =
- payload.num_regs + prog_data->curb_read_length + count * 4;
-
- unsigned vue_entries =
- MAX2(count, vs_prog_data->base.vue_map.num_slots);
-
- vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
- vs_prog_data->base.urb_read_length = (count + 1) / 2;
+ this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
assert(vs_prog_data->base.urb_read_length <= 15);
foreach_block_and_inst(block, fs_inst, inst, cfg) {
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == ATTR) {
-
- if (inst->src[i].reg == VERT_ATTRIB_MAX) {
- slot = count - 1;
- } else {
- /* Attributes come in in a contiguous block, ordered by their
- * gl_vert_attrib value. That means we can compute the slot
- * number for an attribute by masking out the enabled
- * attributes before it and counting the bits.
- */
- attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
- slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
- BITFIELD64_MASK(attr));
- }
-
- channel = inst->src[i].reg_offset & 3;
-
- grf = payload.num_regs +
- prog_data->curb_read_length +
- slot * 4 + channel;
+ int grf = payload.num_regs +
+ prog_data->curb_read_length +
+ inst->src[i].reg +
+ inst->src[i].reg_offset;
inst->src[i].file = HW_REG;
inst->src[i].fixed_hw_reg =
- retype(brw_vec8_grf(grf, 0), inst->src[i].type);
+ stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
+ inst->src[i].subreg_offset),
+ inst->exec_size * inst->src[i].stride,
+ inst->exec_size, inst->src[i].stride);
}
}
}
bool
fs_visitor::remove_duplicate_mrf_writes()
{
- fs_inst *last_mrf_move[16];
+ fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
bool progress = false;
/* Need to update the MRF tracking for compressed instructions. */
{
int write_len = inst->regs_written;
int first_write_grf = inst->dst.reg;
- bool needs_dep[BRW_MAX_MRF];
+ bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
assert(write_len < (int)sizeof(needs_dep) - 1);
memset(needs_dep, false, sizeof(needs_dep));
{
int write_len = inst->regs_written;
int first_write_grf = inst->dst.reg;
- bool needs_dep[BRW_MAX_MRF];
+ bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
assert(write_len < (int)sizeof(needs_dep) - 1);
memset(needs_dep, false, sizeof(needs_dep));
* else does except for register spill/unspill, which generates and
* uses its MRF within a single IR instruction.
*/
- inst->base_mrf = 14;
+ inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
inst->mlen = 1;
}
}
* schedule multi-component multiplications much better.
*/
- if (inst->conditional_mod && inst->dst.is_null()) {
+ fs_reg orig_dst = inst->dst;
+ if (orig_dst.is_null() || orig_dst.file == MRF) {
inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
inst->dst.type);
}
ibld.ADD(dst, low, high);
- if (inst->conditional_mod) {
- fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
+ if (inst->conditional_mod || orig_dst.file == MRF) {
set_condmod(inst->conditional_mod,
- ibld.MOV(null, inst->dst));
+ ibld.MOV(orig_dst, inst->dst));
}
}
fprintf(file, "***m%d***", inst->src[i].reg);
break;
case ATTR:
- fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
+ fprintf(file, "attr%d+%d", inst->src[i].reg, inst->src[i].reg_offset);
break;
case UNIFORM:
fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
fs_visitor::setup_payload_gen6()
{
bool uses_depth =
- (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
+ (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
unsigned barycentric_interp_modes =
(stage == MESA_SHADER_FRAGMENT) ?
((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
}
/* R32: MSAA input coverage mask */
- if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
+ if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
assert(devinfo->gen >= 7);
payload.sample_mask_in_reg = payload.num_regs;
payload.num_regs++;
/* R34-: bary for 32-pixel. */
/* R58-59: interp W for 32-pixel. */
- if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+ if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
source_depth_to_render_target = true;
}
}
payload.num_regs = 2;
}
+/**
+ * We are building the local ID push constant data using the simplest possible
+ * method. We simply push the local IDs directly as they should appear in the
+ * registers for the uvec3 gl_LocalInvocationID variable.
+ *
+ * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
+ * registers worth of push constant space.
+ *
+ * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
+ * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
+ * to be coordinated.
+ *
+ * FINISHME: There are a few easy optimizations to consider.
+ *
+ * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
+ * no need for using push constant space for that dimension.
+ *
+ * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
+ * easily use 16-bit words rather than 32-bit dwords in the push constant
+ * data.
+ *
+ * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
+ * conveying the data, and thereby reduce push constant usage.
+ *
+ */
+/* Lay out the compute-shader thread payload: r0 always occupies the first
+ * register; when the shader reads gl_LocalInvocationID, the pushed local
+ * invocation ID data follows it (see brw_cs_fill_local_id_payload).
+ */
void
fs_visitor::setup_cs_payload()
{
assert(devinfo->gen >= 7);
+ brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
+ /* R0: the thread payload header. */
payload.num_regs = 1;
-}
-void
-fs_visitor::assign_binding_table_offsets()
-{
- assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- uint32_t next_binding_table_offset = 0;
-
- /* If there are no color regions, we still perform an FB write to a null
- * renderbuffer, which we place at surface index 0.
- */
- prog_data->binding_table.render_target_start = next_binding_table_offset;
- next_binding_table_offset += MAX2(key->nr_color_regions, 1);
-
- assign_common_binding_table_offsets(next_binding_table_offset);
+ if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
+ /* One GRF per uvec3 component per 8 channels: 3 regs for SIMD8,
+ * 6 regs for SIMD16.
+ */
+ prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
+ payload.local_invocation_id_reg = payload.num_regs;
+ payload.num_regs += prog_data->local_invocation_id_regs;
+ }
}
void
void
fs_visitor::optimize()
{
+ /* Start by validating the shader we currently have. */
+ validate();
+
/* bld is the common builder object pointing at the end of the program we
* used to translate it into i965 IR. For the optimization and lowering
* passes coming next, any code added after the end of the program without
*/
bld = fs_builder(this, 64);
- split_virtual_grfs();
-
assign_constant_locations();
demote_pull_constants();
+ validate();
+
+ split_virtual_grfs();
+ validate();
+
#define OPT(pass, args...) ({ \
pass_num++; \
bool this_progress = pass(args); \
\
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
char filename[64]; \
- snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
- stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
+ snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
+ stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
\
backend_shader::dump_instructions(filename); \
} \
\
+ validate(); \
+ \
progress = progress || this_progress; \
this_progress; \
})
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
char filename[64];
- snprintf(filename, 64, "%s%d-%04d-00-start",
- stage_abbrev, dispatch_width,
- shader_prog ? shader_prog->Name : 0);
+ snprintf(filename, 64, "%s%d-%s-00-start",
+ stage_abbrev, dispatch_width, nir->info.name);
backend_shader::dump_instructions(filename);
}
OPT(opt_algebraic);
OPT(opt_cse);
OPT(opt_copy_propagate);
- OPT(opt_peephole_predicated_break);
+ OPT(opt_predicated_break, this);
OPT(opt_cmod_propagation);
OPT(dead_code_eliminate);
OPT(opt_peephole_sel);
OPT(lower_integer_multiplication);
lower_uniform_pull_constant_loads();
+
+ validate();
}
/**
{
assert(stage == MESA_SHADER_VERTEX);
- assign_common_binding_table_offsets(0);
setup_vs_payload();
if (shader_time_index >= 0)
assert(stage == MESA_SHADER_FRAGMENT);
- sanity_param_count = prog->Parameters->NumParameters;
-
- assign_binding_table_offsets();
-
if (devinfo->gen >= 6)
setup_payload_gen6();
else
emit_shader_time_begin();
calculate_urb_setup();
- if (prog->InputsRead > 0) {
+ if (nir->info.inputs_read > 0) {
if (devinfo->gen < 6)
emit_interpolation_setup_gen4();
else
else
wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
- /* If any state parameters were appended, then ParameterValues could have
- * been realloced, in which case the driver uniform storage set up by
- * _mesa_associate_uniform_storage() would point to freed memory. Make
- * sure that didn't happen.
- */
- assert(sanity_param_count == prog->Parameters->NumParameters);
-
return !failed;
}
fs_visitor::run_cs()
{
assert(stage == MESA_SHADER_COMPUTE);
- assert(shader);
-
- sanity_param_count = prog->Parameters->NumParameters;
-
- assign_common_binding_table_offsets(0);
setup_cs_payload();
if (failed)
return false;
- /* If any state parameters were appended, then ParameterValues could have
- * been realloced, in which case the driver uniform storage set up by
- * _mesa_associate_uniform_storage() would point to freed memory. Make
- * sure that didn't happen.
- */
- assert(sanity_param_count == prog->Parameters->NumParameters);
-
return !failed;
}
const unsigned *
-brw_wm_fs_emit(struct brw_context *brw,
+brw_wm_fs_emit(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
const struct brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
- struct gl_fragment_program *fp,
- struct gl_shader_program *prog,
- unsigned *final_assembly_size)
+ const nir_shader *shader,
+ struct gl_program *prog,
+ int shader_time_index8, int shader_time_index16,
+ bool use_rep_send,
+ unsigned *final_assembly_size,
+ char **error_str)
{
- bool start_busy = false;
- double start_time = 0;
-
- if (unlikely(brw->perf_debug)) {
- start_busy = (brw->batch.last_bo &&
- drm_intel_bo_busy(brw->batch.last_bo));
- start_time = get_time();
- }
-
- struct brw_shader *shader = NULL;
- if (prog)
- shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
-
- if (unlikely(INTEL_DEBUG & DEBUG_WM))
- brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
-
- int st_index8 = -1, st_index16 = -1;
- if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
- st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
- st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
- }
-
- /* Now the main event: Visit the shader IR and generate our FS IR for it.
- */
- fs_visitor v(brw->intelScreen->compiler, brw,
- mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
- prog, &fp->Base, 8, st_index8);
+ fs_visitor v(compiler, log_data, mem_ctx, key,
+ &prog_data->base, prog, shader, 8,
+ shader_time_index8);
if (!v.run_fs(false /* do_rep_send */)) {
- if (prog) {
- prog->LinkStatus = false;
- ralloc_strcat(&prog->InfoLog, v.fail_msg);
- }
-
- _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
- v.fail_msg);
+ if (error_str)
+ *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
return NULL;
}
cfg_t *simd16_cfg = NULL;
- fs_visitor v2(brw->intelScreen->compiler, brw,
- mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
- prog, &fp->Base, 16, st_index16);
- if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
+ fs_visitor v2(compiler, log_data, mem_ctx, key,
+ &prog_data->base, prog, shader, 16,
+ shader_time_index16);
+ if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
if (!v.simd16_unsupported) {
/* Try a SIMD16 compile */
v2.import_uniforms(&v);
- if (!v2.run_fs(brw->use_rep_send)) {
- perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
+ if (!v2.run_fs(use_rep_send)) {
+ compiler->shader_perf_log(log_data,
+ "SIMD16 shader failed to compile: %s",
+ v2.fail_msg);
} else {
simd16_cfg = v2.cfg;
}
}
cfg_t *simd8_cfg;
- int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
- if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
+ int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
+ if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
simd8_cfg = NULL;
prog_data->no_8 = true;
} else {
prog_data->no_8 = false;
}
- fs_generator g(brw->intelScreen->compiler, brw,
- mem_ctx, (void *) key, &prog_data->base,
- &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
+ fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
+ v.promoted_constants, v.runtime_check_aads_emit, "FS");
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
- char *name;
- if (prog)
- name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
- prog->Label ? prog->Label : "unnamed",
- prog->Name);
- else
- name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
-
- g.enable_debug(name);
+ g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
+ shader->info.label ? shader->info.label :
+ "unnamed",
+ shader->info.name));
}
if (simd8_cfg)
if (simd16_cfg)
prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
- if (unlikely(brw->perf_debug) && shader) {
- if (shader->compiled_once)
- brw_wm_debug_recompile(brw, prog, key);
- shader->compiled_once = true;
+ return g.get_assembly(final_assembly_size);
+}
- if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
- perf_debug("FS compile took %.03f ms and stalled the GPU\n",
- (get_time() - start_time) * 1000);
+/* Fill 'buffer' with the per-thread local invocation ID push-constant data.
+ * For each of 'threads' HW threads we write simd_size x values, then
+ * simd_size y values, then simd_size z values, walking the IDs in x-major
+ * order and wrapping at each gl_WorkGroupSize dimension.
+ */
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
+ void *buffer, uint32_t threads, uint32_t stride)
+{
+ if (prog_data->local_invocation_id_regs == 0)
+ return;
+
+ /* 'stride' should be an integer number of registers, that is, a multiple
+ * of 32 bytes.
+ */
+ assert(stride % 32 == 0);
+
+ unsigned x = 0, y = 0, z = 0;
+ for (unsigned t = 0; t < threads; t++) {
+ /* 'stride' is in bytes but 'param' indexes dwords, hence the /4. */
+ uint32_t *param = (uint32_t *) buffer + stride * t / 4;
+
+ for (unsigned i = 0; i < prog_data->simd_size; i++) {
+ param[0 * prog_data->simd_size + i] = x;
+ param[1 * prog_data->simd_size + i] = y;
+ param[2 * prog_data->simd_size + i] = z;
+
+ /* Advance to the next invocation ID, carrying into y and z when a
+ * dimension wraps.
+ */
+ x++;
+ if (x == prog_data->local_size[0]) {
+ x = 0;
+ y++;
+ if (y == prog_data->local_size[1]) {
+ y = 0;
+ z++;
+ if (z == prog_data->local_size[2])
+ z = 0;
+ }
+ }
}
}
-
- return g.get_assembly(final_assembly_size);
}
-extern "C" bool
-brw_fs_precompile(struct gl_context *ctx,
- struct gl_shader_program *shader_prog,
- struct gl_program *prog)
+/* Copy the pushed local invocation ID payload registers into a fresh uvec3
+ * VGRF, one MOV per component; each component occupies dispatch_width/8
+ * payload GRFs (see setup_cs_payload / brw_cs_fill_local_id_payload).
+ */
+fs_reg *
+fs_visitor::emit_cs_local_invocation_id_setup()
{
- struct brw_context *brw = brw_context(ctx);
- struct brw_wm_prog_key key;
+ assert(stage == MESA_SHADER_COMPUTE);
- struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
- struct brw_fragment_program *bfp = brw_fragment_program(fp);
- bool program_uses_dfdy = fp->UsesDFdy;
+ fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
- memset(&key, 0, sizeof(key));
+ struct brw_reg src =
+ brw_vec8_grf(payload.local_invocation_id_reg, 0);
+ src = retype(src, BRW_REGISTER_TYPE_UD);
+ bld.MOV(*reg, src);
+ src.nr += dispatch_width / 8;
+ bld.MOV(offset(*reg, bld, 1), src);
+ src.nr += dispatch_width / 8;
+ bld.MOV(offset(*reg, bld, 2), src);
- if (brw->gen < 6) {
- if (fp->UsesKill)
- key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+ return reg;
+}
- if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
- key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
+/* Load the work group ID into a fresh uvec3 VGRF from the fixed r0 payload
+ * dwords r0.1, r0.6 and r0.7 (one scalar MOV per component).
+ */
+fs_reg *
+fs_visitor::emit_cs_work_group_id_setup()
+{
+ assert(stage == MESA_SHADER_COMPUTE);
- /* Just assume depth testing. */
- key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
- key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
- }
+ fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
- if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
- BRW_FS_VARYING_INPUT_MASK) > 16)
- key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
+ struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
+ struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
+ struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
- brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
+ bld.MOV(*reg, r0_1);
+ bld.MOV(offset(*reg, bld, 1), r0_6);
+ bld.MOV(offset(*reg, bld, 2), r0_7);
- if (fp->Base.InputsRead & VARYING_BIT_POS) {
- key.drawable_height = ctx->DrawBuffer->Height;
- }
+ return reg;
+}
- key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
- ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
- BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
+/* Compile a compute shader to native code.
+ *
+ * We always compile at SIMD8 first, then try SIMD16. A given SIMD width is
+ * only usable if the whole local workgroup fits within
+ * simd_size * max_cs_threads invocations; the widest usable compile wins.
+ * Returns the assembly, or NULL (with *error_str set when non-NULL) on
+ * failure.
+ */
+const unsigned *
+brw_cs_emit(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const struct brw_cs_prog_key *key,
+ struct brw_cs_prog_data *prog_data,
+ const nir_shader *shader,
+ int shader_time_index,
+ unsigned *final_assembly_size,
+ char **error_str)
+{
+ prog_data->local_size[0] = shader->info.cs.local_size[0];
+ prog_data->local_size[1] = shader->info.cs.local_size[1];
+ prog_data->local_size[2] = shader->info.cs.local_size[2];
+ unsigned local_workgroup_size =
+ shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
+ shader->info.cs.local_size[2];
- if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
- key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
- key.nr_color_regions > 1;
- }
+ unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
- key.program_string_id = bfp->id;
+ cfg_t *cfg = NULL;
+ const char *fail_msg = NULL;
- uint32_t old_prog_offset = brw->wm.base.prog_offset;
- struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
+ /* Now the main event: Visit the shader IR and generate our CS IR for it.
+ */
+ fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
+ NULL, /* Never used in core profile */
+ shader, 8, shader_time_index);
+ /* Accept the SIMD8 result only if the workgroup fits in max_cs_threads
+ * threads at 8 invocations each.
+ */
+ if (!v8.run_cs()) {
+ fail_msg = v8.fail_msg;
+ } else if (local_workgroup_size <= 8 * max_cs_threads) {
+ cfg = v8.cfg;
+ prog_data->simd_size = 8;
+ }
- bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
+ fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
+ NULL, /* Never used in core profile */
+ shader, 16, shader_time_index);
+ if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
+ !fail_msg && !v8.simd16_unsupported &&
+ local_workgroup_size <= 16 * max_cs_threads) {
+ /* Try a SIMD16 compile */
+ v16.import_uniforms(&v8);
+ if (!v16.run_cs()) {
+ compiler->shader_perf_log(log_data,
+ "SIMD16 shader failed to compile: %s",
+ v16.fail_msg);
+ if (!cfg) {
+ /* SIMD8 was rejected above for thread count, so SIMD16 was our
+ * only option; report the combined failure.
+ */
+ fail_msg =
+ "Couldn't generate SIMD16 program and not "
+ "enough threads for SIMD8";
+ }
+ } else {
+ cfg = v16.cfg;
+ prog_data->simd_size = 16;
+ }
+ }
- brw->wm.base.prog_offset = old_prog_offset;
- brw->wm.prog_data = old_prog_data;
+ if (unlikely(cfg == NULL)) {
+ assert(fail_msg);
+ if (error_str)
+ *error_str = ralloc_strdup(mem_ctx, fail_msg);
- return success;
-}
+ return NULL;
+ }
-void
-brw_setup_tex_for_precompile(struct brw_context *brw,
- struct brw_sampler_prog_key_data *tex,
- struct gl_program *prog)
-{
- const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
- unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
- for (unsigned i = 0; i < sampler_count; i++) {
- if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
- /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
- tex->swizzles[i] =
- MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
- } else {
- /* Color sampler: assume no swizzling. */
- tex->swizzles[i] = SWIZZLE_XYZW;
- }
+ fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
+ v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
+ if (INTEL_DEBUG & DEBUG_CS) {
+ char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
+ shader->info.label ? shader->info.label :
+ "unnamed",
+ shader->info.name);
+ g.enable_debug(name);
}
+
+ g.generate_code(cfg, prog_data->simd_size);
+
+ return g.get_assembly(final_assembly_size);
}