#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
+#include "brw_cs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
#include "program/sampler.h"
using namespace brw;
case HW_REG:
case MRF:
case ATTR:
- this->regs_written =
- DIV_ROUND_UP(MAX2(exec_size * dst.stride, 1) * type_sz(dst.type), 32);
+ this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
+ REG_SIZE);
break;
case BAD_FILE:
this->regs_written = 0;
else
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
- assert(dst.width % 8 == 0);
int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
- fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
- dst.type, bld.dispatch_width());
+ fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
inst->regs_written = regs_written;
if (devinfo->gen < 7) {
- inst->base_mrf = 13;
+ inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen);
inst->header_size = 1;
if (devinfo->gen == 4)
inst->mlen = 3;
for (int i = 0; i < this->sources; i++) {
reg.type = this->src[i].type;
- reg.width = this->src[i].width;
if (!this->src[i].equals(reg))
return false;
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_F;
+ this->stride = 0;
this->fixed_hw_reg.dw1.f = f;
- this->width = 1;
}
/** Immediate value constructor. */
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_D;
+ this->stride = 0;
this->fixed_hw_reg.dw1.d = i;
- this->width = 1;
}
/** Immediate value constructor. */
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_UD;
+ this->stride = 0;
this->fixed_hw_reg.dw1.ud = u;
- this->width = 1;
}
/** Vector float immediate value constructor. */
this->file = HW_REG;
this->fixed_hw_reg = fixed_hw_reg;
this->type = fixed_hw_reg.type;
- this->width = 1 << fixed_hw_reg.width;
}
bool
negate == r.negate &&
abs == r.abs &&
!reladdr && !r.reladdr &&
- memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
- width == r.width &&
+ ((file != HW_REG && file != IMM) ||
+ memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
+ sizeof(fixed_hw_reg)) == 0) &&
stride == r.stride);
}
return stride == 1;
}
-int
-fs_visitor::type_size(const struct glsl_type *type)
+unsigned
+fs_reg::component_size(unsigned width) const
+{
+ const unsigned stride = (file != HW_REG ? this->stride :
+ fixed_hw_reg.hstride == 0 ? 0 :
+ 1 << (fixed_hw_reg.hstride - 1));
+ return MAX2(width * stride, 1) * type_sz(type);
+}
+
+extern "C" int
+type_size_scalar(const struct glsl_type *type)
{
unsigned int size, i;
case GLSL_TYPE_BOOL:
return type->components();
case GLSL_TYPE_ARRAY:
- return type_size(type->fields.array) * type->length;
+ return type_size_scalar(type->fields.array) * type->length;
case GLSL_TYPE_STRUCT:
size = 0;
for (i = 0; i < type->length; i++) {
- size += type_size(type->fields.structure[i].type);
+ size += type_size_scalar(type->fields.structure[i].type);
}
return size;
case GLSL_TYPE_SAMPLER:
return 0;
case GLSL_TYPE_ATOMIC_UINT:
return 0;
+ case GLSL_TYPE_SUBROUTINE:
+ return 1;
case GLSL_TYPE_IMAGE:
+ return BRW_IMAGE_PARAM_SIZE;
case GLSL_TYPE_VOID:
case GLSL_TYPE_ERROR:
case GLSL_TYPE_INTERFACE:
0),
BRW_REGISTER_TYPE_UD));
- fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
+ fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
/* We want to read the 3 fields we care about even if it's not enabled in
* the dispatch.
fs_reg start = shader_start_time;
start.negate = true;
- fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
+ fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
diff.set_smear(0);
const fs_builder cbld = ibld.group(1, 0);
!this->dst.is_contiguous());
}
+unsigned
+fs_inst::components_read(unsigned i) const
+{
+ switch (opcode) {
+ case FS_OPCODE_LINTERP:
+ if (i == 0)
+ return 2;
+ else
+ return 1;
+
+ case FS_OPCODE_PIXEL_X:
+ case FS_OPCODE_PIXEL_Y:
+ assert(i == 0);
+ return 2;
+
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ assert(src[6].file == IMM);
+ /* First/second FB write color. */
+ if (i < 2)
+ return src[6].fixed_hw_reg.dw1.ud;
+ else
+ return 1;
+
+ case SHADER_OPCODE_TEX_LOGICAL:
+ case SHADER_OPCODE_TXD_LOGICAL:
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ case SHADER_OPCODE_LOD_LOGICAL:
+ case SHADER_OPCODE_TG4_LOGICAL:
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ assert(src[8].file == IMM && src[9].file == IMM);
+ /* Texture coordinates. */
+ if (i == 0)
+ return src[8].fixed_hw_reg.dw1.ud;
+ /* Texture derivatives. */
+ else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
+ return src[9].fixed_hw_reg.dw1.ud;
+ /* Texture offset. */
+ else if (i == 7)
+ return 2;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ assert(src[3].file == IMM);
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].fixed_hw_reg.dw1.ud;
+ /* Surface operation source (ignored for reads). */
+ else if (i == 1)
+ return 0;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ assert(src[3].file == IMM &&
+ src[4].file == IMM);
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].fixed_hw_reg.dw1.ud;
+ /* Surface operation source. */
+ else if (i == 1)
+ return src[4].fixed_hw_reg.dw1.ud;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
+ assert(src[3].file == IMM &&
+ src[4].file == IMM);
+ const unsigned op = src[4].fixed_hw_reg.dw1.ud;
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].fixed_hw_reg.dw1.ud;
+ /* Surface operation source. */
+ else if (i == 1 && op == BRW_AOP_CMPWR)
+ return 2;
+ else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
+ op == BRW_AOP_PREDEC))
+ return 0;
+ else
+ return 1;
+ }
+
+ default:
+ return 1;
+ }
+}
+
int
fs_inst::regs_read(int arg) const
{
- unsigned components = 1;
switch (opcode) {
case FS_OPCODE_FB_WRITE:
case SHADER_OPCODE_URB_WRITE_SIMD8:
break;
case FS_OPCODE_LINTERP:
- if (arg == 0)
- return exec_size / 4;
- break;
-
- case FS_OPCODE_PIXEL_X:
- case FS_OPCODE_PIXEL_Y:
- if (arg == 0)
- components = 1;
+ if (arg == 1)
+ return 1;
break;
case SHADER_OPCODE_LOAD_PAYLOAD:
return 1;
break;
+ case CS_OPCODE_CS_TERMINATE:
+ case SHADER_OPCODE_BARRIER:
+ return 1;
+
default:
if (is_tex() && arg == 0 && src[0].file == GRF)
return mlen;
switch (src[arg].file) {
case BAD_FILE:
+ return 0;
case UNIFORM:
case IMM:
return 1;
case GRF:
+ case ATTR:
case HW_REG:
- if (src[arg].stride == 0) {
- return 1;
- } else {
- int size = components * this->exec_size * type_sz(src[arg].type);
- return DIV_ROUND_UP(size * src[arg].stride, 32);
- }
+ return DIV_ROUND_UP(components_read(arg) *
+ src[arg].component_size(exec_size),
+ REG_SIZE);
case MRF:
unreachable("MRF registers are not allowed as sources");
default:
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_LOD:
+ case SHADER_OPCODE_SAMPLEINFO:
return 1;
case FS_OPCODE_FB_WRITE:
return 2;
+ case FS_OPCODE_GET_BUFFER_SIZE:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case SHADER_OPCODE_GEN4_SCRATCH_READ:
return 1;
fs_visitor::vgrf(const glsl_type *const type)
{
int reg_width = dispatch_width / 8;
- return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
- brw_type_for_base_type(type), dispatch_width);
+ return fs_reg(GRF, alloc.allocate(type_size_scalar(type) * reg_width),
+ brw_type_for_base_type(type));
}
/** Fixed HW reg constructor. */
this->file = file;
this->reg = reg;
this->type = BRW_REGISTER_TYPE_F;
-
- switch (file) {
- case UNIFORM:
- this->width = 1;
- break;
- default:
- this->width = 8;
- }
+ this->stride = (file == UNIFORM ? 0 : 1);
}
/** Fixed HW reg constructor. */
this->file = file;
this->reg = reg;
this->type = type;
-
- switch (file) {
- case UNIFORM:
- this->width = 1;
- break;
- default:
- this->width = 8;
- }
-}
-
-/** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
- uint8_t width)
-{
- init();
- this->file = file;
- this->reg = reg;
- this->type = type;
- this->width = width;
+ this->stride = (file == UNIFORM ? 0 : 1);
}
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
bld.MOV(wpos, this->pixel_y);
} else {
fs_reg pixel_y = this->pixel_y;
- float offset = (pixel_center_integer ? 0.0 : 0.5);
+ float offset = (pixel_center_integer ? 0.0f : 0.5f);
if (flip) {
pixel_y.negate = true;
- offset += key->drawable_height - 1.0;
+ offset += key->drawable_height - 1.0f;
}
bld.ADD(wpos, pixel_y, fs_reg(offset));
unsigned int array_elements;
if (type->is_array()) {
- array_elements = type->length;
+ array_elements = type->arrays_of_arrays_size();
if (array_elements == 0) {
fail("dereferenced array '%s' has length 0\n", name);
}
- type = type->fields.array;
+ type = type->without_array();
} else {
array_elements = 1;
}
return reg;
}
-void
-fs_visitor::resolve_source_modifiers(fs_reg *src)
+fs_reg
+fs_visitor::resolve_source_modifiers(const fs_reg &src)
{
- if (!src->abs && !src->negate)
- return;
+ if (!src.abs && !src.negate)
+ return src;
- fs_reg temp = bld.vgrf(src->type);
- bld.MOV(temp, *src);
- *src = temp;
+ fs_reg temp = bld.vgrf(src.type);
+ bld.MOV(temp, src);
+
+ return temp;
}
void
constant_nr / 8,
constant_nr % 8);
+ assert(inst->src[i].stride == 0);
inst->src[i].file = HW_REG;
inst->src[i].fixed_hw_reg = byte_offset(
retype(brw_reg, inst->src[i].type),
}
}
}
+
+ /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
+ this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
}
void
int urb_next = 0;
/* Figure out where each of the incoming setup attributes lands. */
if (devinfo->gen >= 6) {
- if (_mesa_bitcount_64(prog->InputsRead &
+ if (_mesa_bitcount_64(nir->info.inputs_read &
BRW_FS_VARYING_INPUT_MASK) <= 16) {
/* The SF/SBE pipeline stage can do arbitrary rearrangement of the
* first 16 varying inputs, so we can put them wherever we want.
* a different vertex (or geometry) shader.
*/
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
- if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
+ if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
*/
struct brw_vue_map prev_stage_vue_map;
brw_compute_vue_map(devinfo, &prev_stage_vue_map,
- key->input_slots_valid);
+ key->input_slots_valid,
+ nir->info.separate_shader);
int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
assert(prev_stage_vue_map.num_slots <= first_slot + 32);
for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
* unused.
*/
if (varying != BRW_VARYING_SLOT_COUNT &&
- (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
+ (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
BITFIELD64_BIT(varying))) {
prog_data->urb_setup[varying] = slot - first_slot;
}
*
* See compile_sf_prog() for more info.
*/
- if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
+ if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
}
}
/* Each attribute is 4 setup channels, each of which is half a reg. */
- this->first_non_payload_grf =
- urb_start + prog_data->num_varying_inputs * 2;
+ this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
}
void
fs_visitor::assign_vs_urb_setup()
{
brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
- int grf, count, slot, channel, attr;
assert(stage == MESA_SHADER_VERTEX);
- count = _mesa_bitcount_64(vs_prog_data->inputs_read);
+ int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
count++;
/* Each attribute is 4 regs. */
- this->first_non_payload_grf =
- payload.num_regs + prog_data->curb_read_length + count * 4;
-
- unsigned vue_entries =
- MAX2(count, vs_prog_data->base.vue_map.num_slots);
-
- vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
- vs_prog_data->base.urb_read_length = (count + 1) / 2;
+ this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
assert(vs_prog_data->base.urb_read_length <= 15);
foreach_block_and_inst(block, fs_inst, inst, cfg) {
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == ATTR) {
-
- if (inst->src[i].reg == VERT_ATTRIB_MAX) {
- slot = count - 1;
- } else {
- /* Attributes come in in a contiguous block, ordered by their
- * gl_vert_attrib value. That means we can compute the slot
- * number for an attribute by masking out the enabled
- * attributes before it and counting the bits.
- */
- attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
- slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
- BITFIELD64_MASK(attr));
- }
-
- channel = inst->src[i].reg_offset & 3;
-
- grf = payload.num_regs +
- prog_data->curb_read_length +
- slot * 4 + channel;
+ int grf = payload.num_regs +
+ prog_data->curb_read_length +
+ inst->src[i].reg +
+ inst->src[i].reg_offset;
inst->src[i].file = HW_REG;
inst->src[i].fixed_hw_reg =
- retype(brw_vec8_grf(grf, 0), inst->src[i].type);
+ stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
+ inst->src[i].subreg_offset),
+ inst->exec_size * inst->src[i].stride,
+ inst->exec_size, inst->src[i].stride);
}
}
}
return progress;
}
-/*
- * Implements array access of uniforms by inserting a
- * PULL_CONSTANT_LOAD instruction.
+/**
+ * Assign UNIFORM file registers to either push constants or pull constants.
*
- * Unlike temporary GRF array access (where we don't support it due to
- * the difficulty of doing relative addressing on instruction
- * destinations), we could potentially do array access of uniforms
- * that were loaded in GRF space as push constants. In real-world
- * usage we've seen, though, the arrays being used are always larger
- * than we could load as push constants, so just always move all
- * uniform array access out to a pull constant buffer.
+ * We allow a fragment shader to have more than the specified minimum
+ * maximum number of fragment shader uniform components (64). If
+ * there are too many of these, they'd fill up all of register space.
+ * So, this will push some of them out to the pull constant buffer and
+ * update the program to load them. We also use pull constants for all
+ * indirect constant loads because we don't support indirect accesses in
+ * registers yet.
*/
void
-fs_visitor::move_uniform_array_access_to_pull_constants()
+fs_visitor::assign_constant_locations()
{
+ /* Only the first compile (SIMD8 mode) gets to decide on locations. */
if (dispatch_width != 8)
return;
+ unsigned int num_pull_constants = 0;
+
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
- /* Walk through and find array access of uniforms. Put a copy of that
- * uniform in the pull constant buffer.
+ bool is_live[uniforms];
+ memset(is_live, 0, sizeof(is_live));
+
+ /* First, we walk through the instructions and do two things:
+ *
+ * 1) Figure out which uniforms are live.
+ *
+ * 2) Find all indirect access of uniform arrays and flag them as needing
+ * to go into the pull constant buffer.
*
* Note that we don't move constant-indexed accesses to arrays. No
* testing has been done of the performance impact of this choice.
*/
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
for (int i = 0 ; i < inst->sources; i++) {
- if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
+ if (inst->src[i].file != UNIFORM)
continue;
- int uniform = inst->src[i].reg;
-
- /* If this array isn't already present in the pull constant buffer,
- * add it.
- */
- if (pull_constant_loc[uniform] == -1) {
- const gl_constant_value **values = &stage_prog_data->param[uniform];
-
- assert(param_size[uniform]);
-
- for (int j = 0; j < param_size[uniform]; j++) {
- pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
+ if (inst->src[i].reladdr) {
+ int uniform = inst->src[i].reg;
- stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
- values[j];
+ /* If this array isn't already present in the pull constant buffer,
+ * add it.
+ */
+ if (pull_constant_loc[uniform] == -1) {
+ assert(param_size[uniform]);
+ for (int j = 0; j < param_size[uniform]; j++)
+ pull_constant_loc[uniform + j] = num_pull_constants++;
}
+ } else {
+ /* Mark the the one accessed uniform as live */
+ int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
+ if (constant_nr >= 0 && constant_nr < (int) uniforms)
+ is_live[constant_nr] = true;
}
}
}
-}
-
-/**
- * Assign UNIFORM file registers to either push constants or pull constants.
- *
- * We allow a fragment shader to have more than the specified minimum
- * maximum number of fragment shader uniform components (64). If
- * there are too many of these, they'd fill up all of register space.
- * So, this will push some of them out to the pull constant buffer and
- * update the program to load them.
- */
-void
-fs_visitor::assign_constant_locations()
-{
- /* Only the first compile (SIMD8 mode) gets to decide on locations. */
- if (dispatch_width != 8)
- return;
-
- /* Find which UNIFORM registers are still in use. */
- bool is_live[uniforms];
- for (unsigned int i = 0; i < uniforms; i++) {
- is_live[i] = false;
- }
-
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
- for (int i = 0; i < inst->sources; i++) {
- if (inst->src[i].file != UNIFORM)
- continue;
-
- int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
- if (constant_nr >= 0 && constant_nr < (int) uniforms)
- is_live[constant_nr] = true;
- }
- }
/* Only allow 16 registers (128 uniform components) as push constants.
*
} else {
/* Demote to a pull constant. */
push_constant_loc[i] = -1;
-
- int pull_index = stage_prog_data->nr_pull_params++;
- stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
- pull_constant_loc[i] = pull_index;
+ pull_constant_loc[i] = num_pull_constants++;
}
}
stage_prog_data->nr_params = num_push_constants;
+ stage_prog_data->nr_pull_params = num_pull_constants;
/* Up until now, the param[] array has been indexed by reg + reg_offset
- * of UNIFORM registers. Condense it to only contain the uniforms we
- * chose to upload as push constants.
+ * of UNIFORM registers. Move pull constants into pull_param[] and
+ * condense param[] to only contain the uniforms we chose to push.
+ *
+ * NOTE: Because we are condensing the params[] array, we know that
+ * push_constant_loc[i] <= i and we can do it in one smooth loop without
+ * having to make a copy.
*/
for (unsigned int i = 0; i < uniforms; i++) {
- int remapped = push_constant_loc[i];
-
- if (remapped == -1)
- continue;
+ const gl_constant_value *value = stage_prog_data->param[i];
- assert(remapped <= (int)i);
- stage_prog_data->param[remapped] = stage_prog_data->param[i];
+ if (pull_constant_loc[i] != -1) {
+ stage_prog_data->pull_param[pull_constant_loc[i]] = value;
+ } else if (push_constant_loc[i] != -1) {
+ stage_prog_data->param[push_constant_loc[i]] = value;
+ }
}
}
continue;
/* Set up the annotation tracking for new generated instructions. */
- const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
- .at(block, inst);
+ const fs_builder ibld(this, block, inst);
fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
fs_reg dst = vgrf(glsl_type::float_type);
+ assert(inst->src[i].stride == 0);
+
/* Generate a pull load into dst. */
if (inst->src[i].reladdr) {
VARYING_PULL_CONSTANT_LOAD(ibld, dst,
*inst->src[i].reladdr,
pull_index);
inst->src[i].reladdr = NULL;
+ inst->src[i].stride = 1;
} else {
+ const fs_builder ubld = ibld.exec_all().group(8, 0);
fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
- ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
dst, surf_index, offset);
inst->src[i].set_smear(pull_index & 3);
}
inst->src[i].file = GRF;
inst->src[i].reg = dst.reg;
inst->src[i].reg_offset = 0;
- inst->src[i].width = dispatch_width;
}
}
invalidate_live_intervals();
* "Parameter 0 is required except for the sampleinfo message, which
* has no parameter 0"
*/
- while (inst->mlen > inst->header_size + dispatch_width / 8 &&
+ while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
load_payload->src[(inst->mlen - inst->header_size) /
- (dispatch_width / 8) +
+ (inst->exec_size / 8) +
inst->header_size - 1].is_zero()) {
- inst->mlen -= dispatch_width / 8;
+ inst->mlen -= inst->exec_size / 8;
progress = true;
}
}
return false;
/* Look for a texturing instruction immediately before the final FB_WRITE. */
- fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+ bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+ fs_inst *fb_write = (fs_inst *)block->end();
assert(fb_write->eot);
assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
assert(!tex_inst->eot); /* We can't get here twice */
assert((tex_inst->offset & (0xff << 24)) == 0);
+ const fs_builder ibld(this, block, tex_inst);
+
tex_inst->offset |= fb_write->target << 24;
tex_inst->eot = true;
- tex_inst->dst = bld.null_reg_ud();
+ tex_inst->dst = ibld.null_reg_ud();
fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
/* If a header is present, marking the eot is sufficient. Otherwise, we need
if (tex_inst->header_size != 0)
return true;
- fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
- load_payload->sources + 1);
+ fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
+ load_payload->sources + 1);
fs_reg *new_sources =
ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
bool
fs_visitor::remove_duplicate_mrf_writes()
{
- fs_inst *last_mrf_move[16];
+ fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
bool progress = false;
/* Need to update the MRF tracking for compressed instructions. */
{
int write_len = inst->regs_written;
int first_write_grf = inst->dst.reg;
- bool needs_dep[BRW_MAX_MRF];
+ bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
assert(write_len < (int)sizeof(needs_dep) - 1);
memset(needs_dep, false, sizeof(needs_dep));
if (block->start() == scan_inst) {
for (int i = 0; i < write_len; i++) {
if (needs_dep[i])
- DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
+ DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+ first_write_grf + i);
}
return;
}
if (reg >= first_write_grf &&
reg < first_write_grf + write_len &&
needs_dep[reg - first_write_grf]) {
- DEP_RESOLVE_MOV(bld.at(block, inst), reg);
+ DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
needs_dep[reg - first_write_grf] = false;
if (scan_inst->exec_size == 16)
needs_dep[reg - first_write_grf + 1] = false;
{
int write_len = inst->regs_written;
int first_write_grf = inst->dst.reg;
- bool needs_dep[BRW_MAX_MRF];
+ bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
assert(write_len < (int)sizeof(needs_dep) - 1);
memset(needs_dep, false, sizeof(needs_dep));
if (block->end() == scan_inst) {
for (int i = 0; i < write_len; i++) {
if (needs_dep[i])
- DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
+ DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+ first_write_grf + i);
}
return;
}
scan_inst->dst.reg >= first_write_grf &&
scan_inst->dst.reg < first_write_grf + write_len &&
needs_dep[scan_inst->dst.reg - first_write_grf]) {
- DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
+ DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+ scan_inst->dst.reg);
needs_dep[scan_inst->dst.reg - first_write_grf] = false;
}
* else does except for register spill/unspill, which generates and
* uses its MRF within a single IR instruction.
*/
- inst->base_mrf = 14;
+ inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
inst->mlen = 1;
}
}
if (dst.file == MRF)
dst.reg = dst.reg & ~BRW_MRF_COMPR4;
- dst.width = 8;
- const fs_builder hbld = bld.group(8, 0).exec_all().at(block, inst);
+ const fs_builder ibld(this, block, inst);
+ const fs_builder hbld = ibld.exec_all().group(8, 0);
for (uint8_t i = 0; i < inst->header_size; i++) {
if (inst->src[i].file != BAD_FILE) {
fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
- mov_src.width = 8;
hbld.MOV(mov_dst, mov_src);
}
dst = offset(dst, hbld, 1);
}
- dst.width = inst->exec_size;
- const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
- .exec_all(inst->force_writemask_all)
- .at(block, inst);
-
if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
inst->exec_size > 8) {
/* In this case, the payload portion of the LOAD_PAYLOAD isn't
} else {
/* Platform doesn't have COMPR4. We have to fake it */
fs_reg mov_dst = retype(dst, inst->src[i].type);
- mov_dst.width = 8;
ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
mov_dst.reg += 4;
ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
{
bool progress = false;
- /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
- * directly, but Cherryview cannot.
- */
- if (devinfo->gen >= 8 && !devinfo->is_cherryview)
- return false;
-
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
- if (inst->opcode != BRW_OPCODE_MUL ||
- inst->dst.is_accumulator() ||
- (inst->dst.type != BRW_REGISTER_TYPE_D &&
- inst->dst.type != BRW_REGISTER_TYPE_UD))
- continue;
+ const fs_builder ibld(this, block, inst);
- const fs_builder ibld = bld.at(block, inst);
+ if (inst->opcode == BRW_OPCODE_MUL) {
+ if (inst->dst.is_accumulator() ||
+ (inst->dst.type != BRW_REGISTER_TYPE_D &&
+ inst->dst.type != BRW_REGISTER_TYPE_UD))
+ continue;
- /* The MUL instruction isn't commutative. On Gen <= 6, only the low
- * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
- * src1 are used.
- *
- * If multiplying by an immediate value that fits in 16-bits, do a
- * single MUL instruction with that value in the proper location.
- */
- if (inst->src[1].file == IMM &&
- inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
- if (devinfo->gen < 7) {
- fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
- inst->dst.type, dispatch_width);
- ibld.MOV(imm, inst->src[1]);
- ibld.MUL(inst->dst, imm, inst->src[0]);
- } else {
- ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
- }
- } else {
- /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
- * do 32-bit integer multiplication in one instruction, but instead
- * must do a sequence (which actually calculates a 64-bit result):
- *
- * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
- * mach(8) null g3<8,8,1>D g4<8,8,1>D
- * mov(8) g2<1>D acc0<8,8,1>D
- *
- * But on Gen > 6, the ability to use second accumulator register
- * (acc1) for non-float data types was removed, preventing a simple
- * implementation in SIMD16. A 16-channel result can be calculated by
- * executing the three instructions twice in SIMD8, once with quarter
- * control of 1Q for the first eight channels and again with 2Q for
- * the second eight channels.
- *
- * Which accumulator register is implicitly accessed (by AccWrEnable
- * for instance) is determined by the quarter control. Unfortunately
- * Ivybridge (and presumably Baytrail) has a hardware bug in which an
- * implicit accumulator access by an instruction with 2Q will access
- * acc1 regardless of whether the data type is usable in acc1.
- *
- * Specifically, the 2Q mach(8) writes acc1 which does not exist for
- * integer data types.
- *
- * Since we only want the low 32-bits of the result, we can do two
- * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
- * adjust the high result and add them (like the mach is doing):
- *
- * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
- * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
- * shl(8) g9<1>D g8<8,8,1>D 16D
- * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
- *
- * We avoid the shl instruction by realizing that we only want to add
- * the low 16-bits of the "high" result to the high 16-bits of the
- * "low" result and using proper regioning on the add:
- *
- * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
- * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
- * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
- *
- * Since it does not use the (single) accumulator register, we can
- * schedule multi-component multiplications much better.
+ /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
+ * operation directly, but CHV/BXT cannot.
*/
+ if (devinfo->gen >= 8 &&
+ !devinfo->is_cherryview && !devinfo->is_broxton)
+ continue;
- if (inst->conditional_mod && inst->dst.is_null()) {
- inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
- inst->dst.type, dispatch_width);
- }
- fs_reg low = inst->dst;
- fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
- inst->dst.type, dispatch_width);
+ if (inst->src[1].file == IMM &&
+ inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+ /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+ * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+ * src1 are used.
+ *
+ * If multiplying by an immediate value that fits in 16-bits, do a
+ * single MUL instruction with that value in the proper location.
+ */
+ if (devinfo->gen < 7) {
+ fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+ inst->dst.type);
+ ibld.MOV(imm, inst->src[1]);
+ ibld.MUL(inst->dst, imm, inst->src[0]);
+ } else {
+ ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+ }
+ } else {
+ /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+ * do 32-bit integer multiplication in one instruction, but instead
+ * must do a sequence (which actually calculates a 64-bit result):
+ *
+ * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
+ * mach(8) null g3<8,8,1>D g4<8,8,1>D
+ * mov(8) g2<1>D acc0<8,8,1>D
+ *
+ * But on Gen > 6, the ability to use second accumulator register
+ * (acc1) for non-float data types was removed, preventing a simple
+ * implementation in SIMD16. A 16-channel result can be calculated by
+ * executing the three instructions twice in SIMD8, once with quarter
+ * control of 1Q for the first eight channels and again with 2Q for
+ * the second eight channels.
+ *
+ * Which accumulator register is implicitly accessed (by AccWrEnable
+ * for instance) is determined by the quarter control. Unfortunately
+ * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+ * implicit accumulator access by an instruction with 2Q will access
+ * acc1 regardless of whether the data type is usable in acc1.
+ *
+ * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+ * integer data types.
+ *
+ * Since we only want the low 32-bits of the result, we can do two
+ * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+ * adjust the high result and add them (like the mach is doing):
+ *
+ * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
+ * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
+ * shl(8) g9<1>D g8<8,8,1>D 16D
+ * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
+ *
+ * We avoid the shl instruction by realizing that we only want to add
+ * the low 16-bits of the "high" result to the high 16-bits of the
+ * "low" result and using proper regioning on the add:
+ *
+ * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
+ * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
+ * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
+ *
+ * Since it does not use the (single) accumulator register, we can
+ * schedule multi-component multiplications much better.
+ */
- if (devinfo->gen >= 7) {
- fs_reg src1_0_w = inst->src[1];
- fs_reg src1_1_w = inst->src[1];
+ fs_reg orig_dst = inst->dst;
+ if (orig_dst.is_null() || orig_dst.file == MRF) {
+ inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+ inst->dst.type);
+ }
+ fs_reg low = inst->dst;
+ fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+ inst->dst.type);
- if (inst->src[1].file == IMM) {
- src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
- src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
- } else {
- src1_0_w.type = BRW_REGISTER_TYPE_UW;
- if (src1_0_w.stride != 0) {
- assert(src1_0_w.stride == 1);
- src1_0_w.stride = 2;
+ if (devinfo->gen >= 7) {
+ fs_reg src1_0_w = inst->src[1];
+ fs_reg src1_1_w = inst->src[1];
+
+ if (inst->src[1].file == IMM) {
+ src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
+ src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
+ } else {
+ src1_0_w.type = BRW_REGISTER_TYPE_UW;
+ if (src1_0_w.stride != 0) {
+ assert(src1_0_w.stride == 1);
+ src1_0_w.stride = 2;
+ }
+
+ src1_1_w.type = BRW_REGISTER_TYPE_UW;
+ if (src1_1_w.stride != 0) {
+ assert(src1_1_w.stride == 1);
+ src1_1_w.stride = 2;
+ }
+ src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
}
+ ibld.MUL(low, inst->src[0], src1_0_w);
+ ibld.MUL(high, inst->src[0], src1_1_w);
+ } else {
+ fs_reg src0_0_w = inst->src[0];
+ fs_reg src0_1_w = inst->src[0];
- src1_1_w.type = BRW_REGISTER_TYPE_UW;
- if (src1_1_w.stride != 0) {
- assert(src1_1_w.stride == 1);
- src1_1_w.stride = 2;
+ src0_0_w.type = BRW_REGISTER_TYPE_UW;
+ if (src0_0_w.stride != 0) {
+ assert(src0_0_w.stride == 1);
+ src0_0_w.stride = 2;
}
- src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
- }
- ibld.MUL(low, inst->src[0], src1_0_w);
- ibld.MUL(high, inst->src[0], src1_1_w);
- } else {
- fs_reg src0_0_w = inst->src[0];
- fs_reg src0_1_w = inst->src[0];
- src0_0_w.type = BRW_REGISTER_TYPE_UW;
- if (src0_0_w.stride != 0) {
- assert(src0_0_w.stride == 1);
- src0_0_w.stride = 2;
- }
+ src0_1_w.type = BRW_REGISTER_TYPE_UW;
+ if (src0_1_w.stride != 0) {
+ assert(src0_1_w.stride == 1);
+ src0_1_w.stride = 2;
+ }
+ src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
- src0_1_w.type = BRW_REGISTER_TYPE_UW;
- if (src0_1_w.stride != 0) {
- assert(src0_1_w.stride == 1);
- src0_1_w.stride = 2;
+ ibld.MUL(low, src0_0_w, inst->src[1]);
+ ibld.MUL(high, src0_1_w, inst->src[1]);
}
- src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
- ibld.MUL(low, src0_0_w, inst->src[1]);
- ibld.MUL(high, src0_1_w, inst->src[1]);
- }
+ fs_reg dst = inst->dst;
+ dst.type = BRW_REGISTER_TYPE_UW;
+ dst.subreg_offset = 2;
+ dst.stride = 2;
- fs_reg dst = inst->dst;
- dst.type = BRW_REGISTER_TYPE_UW;
- dst.subreg_offset = 2;
- dst.stride = 2;
+ high.type = BRW_REGISTER_TYPE_UW;
+ high.stride = 2;
- high.type = BRW_REGISTER_TYPE_UW;
- high.stride = 2;
+ low.type = BRW_REGISTER_TYPE_UW;
+ low.subreg_offset = 2;
+ low.stride = 2;
- low.type = BRW_REGISTER_TYPE_UW;
- low.subreg_offset = 2;
- low.stride = 2;
+ ibld.ADD(dst, low, high);
- ibld.ADD(dst, low, high);
+ if (inst->conditional_mod || orig_dst.file == MRF) {
+ set_condmod(inst->conditional_mod,
+ ibld.MOV(orig_dst, inst->dst));
+ }
+ }
- if (inst->conditional_mod) {
- fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
- set_condmod(inst->conditional_mod,
- ibld.MOV(null, inst->dst));
+ } else if (inst->opcode == SHADER_OPCODE_MULH) {
+ /* Should have been lowered to 8-wide. */
+ assert(inst->exec_size <= 8);
+ const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
+ inst->dst.type);
+ fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
+ fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
+
+ if (devinfo->gen >= 8) {
+ /* Until Gen8, integer multiplies read 32-bits from one source,
+ * and 16-bits from the other, and relying on the MACH instruction
+ * to generate the high bits of the result.
+ *
+ * On Gen8, the multiply instruction does a full 32x32-bit
+ * multiply, but in order to do a 64-bit multiply we can simulate
+ * the previous behavior and then use a MACH instruction.
+ *
+ * FINISHME: Don't use source modifiers on src1.
+ */
+ assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
+ mul->src[1].type == BRW_REGISTER_TYPE_UD);
+ mul->src[1].type = (type_is_signed(mul->src[1].type) ?
+ BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+ mul->src[1].stride *= 2;
+
+ } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
+ inst->force_sechalf) {
+ /* Among other things the quarter control bits influence which
+ * accumulator register is used by the hardware for instructions
+ * that access the accumulator implicitly (e.g. MACH). A
+ * second-half instruction would normally map to acc1, which
+ * doesn't exist on Gen7 and up (the hardware does emulate it for
+ * floating-point instructions *only* by taking advantage of the
+ * extra precision of acc0 not normally used for floating point
+ * arithmetic).
+ *
+ * HSW and up are careful enough not to try to access an
+ * accumulator register that doesn't exist, but on earlier Gen7
+ * hardware we need to make sure that the quarter control bits are
+ * zero to avoid non-deterministic behaviour and emit an extra MOV
+ * to get the result masked correctly according to the current
+ * channel enables.
+ */
+ mach->force_sechalf = false;
+ mach->force_writemask_all = true;
+ mach->dst = ibld.vgrf(inst->dst.type);
+ ibld.MOV(inst->dst, mach->dst);
}
+ } else {
+ continue;
}
inst->remove(block);
return progress;
}
-void
-fs_visitor::dump_instructions()
+static void
+setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
+ fs_reg *dst, fs_reg color, unsigned components)
{
- dump_instructions(NULL);
+ if (key->clamp_fragment_color) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+ assert(color.type == BRW_REGISTER_TYPE_F);
+
+ for (unsigned i = 0; i < components; i++)
+ set_saturate(true,
+ bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
+
+ color = tmp;
+ }
+
+ for (unsigned i = 0; i < components; i++)
+ dst[i] = offset(color, bld, i);
}
-void
-fs_visitor::dump_instructions(const char *name)
+static void
+lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
+ const brw_wm_prog_data *prog_data,
+ const brw_wm_prog_key *key,
+ const fs_visitor::thread_payload &payload)
{
- FILE *file = stderr;
- if (name && geteuid() != 0) {
- file = fopen(name, "w");
- if (!file)
- file = stderr;
+ assert(inst->src[6].file == IMM);
+ const brw_device_info *devinfo = bld.shader->devinfo;
+ const fs_reg &color0 = inst->src[0];
+ const fs_reg &color1 = inst->src[1];
+ const fs_reg &src0_alpha = inst->src[2];
+ const fs_reg &src_depth = inst->src[3];
+ const fs_reg &dst_depth = inst->src[4];
+ fs_reg sample_mask = inst->src[5];
+ const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
+
+ /* We can potentially have a message length of up to 15, so we have to set
+ * base_mrf to either 0 or 1 in order to fit in m0..m15.
+ */
+ fs_reg sources[15];
+ int header_size = 2, payload_header_size;
+ unsigned length = 0;
+
+ /* From the Sandy Bridge PRM, volume 4, page 198:
+ *
+ * "Dispatched Pixel Enables. One bit per pixel indicating
+ * which pixels were originally enabled when the thread was
+ * dispatched. This field is only required for the end-of-
+ * thread message and on all dual-source messages."
+ */
+ if (devinfo->gen >= 6 &&
+ (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
+ color1.file == BAD_FILE &&
+ key->nr_color_regions == 1) {
+ header_size = 0;
}
- if (cfg) {
- calculate_register_pressure();
- int ip = 0, max_pressure = 0;
- foreach_block_and_inst(block, backend_instruction, inst, cfg) {
- max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
- fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
- dump_instruction(inst, file);
- ip++;
- }
- fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
- } else {
- int ip = 0;
- foreach_in_list(backend_instruction, inst, &instructions) {
- fprintf(file, "%4d: ", ip++);
- dump_instruction(inst, file);
- }
+ if (header_size != 0) {
+ assert(header_size == 2);
+ /* Allocate 2 registers for a header */
+ length += 2;
}
- if (file != stderr) {
- fclose(file);
+ if (payload.aa_dest_stencil_reg) {
+ sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
+ bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
+ .MOV(sources[length],
+ fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+ length++;
}
-}
-void
-fs_visitor::dump_instruction(backend_instruction *be_inst)
-{
- dump_instruction(be_inst, stderr);
-}
+ if (prog_data->uses_omask) {
+ sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
+ BRW_REGISTER_TYPE_UD);
-void
-fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
-{
- fs_inst *inst = (fs_inst *)be_inst;
+ /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
+ * relevant. Since it's unsigned single words one vgrf is always
+ * 16-wide, but only the lower or higher 8 channels will be used by the
+ * hardware when doing a SIMD8 write depending on whether we have
+ * selected the subspans for the first or second half respectively.
+ */
+ assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+ sample_mask.type = BRW_REGISTER_TYPE_UW;
+ sample_mask.stride *= 2;
+
+ bld.exec_all().annotate("FB write oMask")
+ .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
+ inst->force_sechalf),
+ sample_mask);
+ length++;
+ }
- if (inst->predicate) {
- fprintf(file, "(%cf0.%d) ",
- inst->predicate_inverse ? '-' : '+',
- inst->flag_subreg);
+ payload_header_size = length;
+
+ if (src0_alpha.file != BAD_FILE) {
+ /* FIXME: This is being passed at the wrong location in the payload and
+ * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+ * It's supposed to be immediately before oMask but there seems to be no
+ * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+ * requires header sources to form a contiguous segment at the beginning
+ * of the message and src0_alpha has per-channel semantics.
+ */
+ setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
+ length++;
}
- fprintf(file, "%s", brw_instruction_name(inst->opcode));
- if (inst->saturate)
- fprintf(file, ".sat");
- if (inst->conditional_mod) {
- fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
- if (!inst->predicate &&
- (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
- inst->opcode != BRW_OPCODE_IF &&
- inst->opcode != BRW_OPCODE_WHILE))) {
- fprintf(file, ".f0.%d", inst->flag_subreg);
- }
+ setup_color_payload(bld, key, &sources[length], color0, components);
+ length += 4;
+
+ if (color1.file != BAD_FILE) {
+ setup_color_payload(bld, key, &sources[length], color1, components);
+ length += 4;
}
- fprintf(file, "(%d) ", inst->exec_size);
- if (inst->mlen) {
- fprintf(file, "(mlen: %d) ", inst->mlen);
+ if (src_depth.file != BAD_FILE) {
+ sources[length] = src_depth;
+ length++;
+ }
+
+ if (dst_depth.file != BAD_FILE) {
+ sources[length] = dst_depth;
+ length++;
+ }
+
+ fs_inst *load;
+ if (devinfo->gen >= 7) {
+ /* Send from the GRF */
+ fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
+ load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
+ payload.reg = bld.shader->alloc.allocate(load->regs_written);
+ load->dst = payload;
+
+ inst->src[0] = payload;
+ inst->resize_sources(1);
+ inst->base_mrf = -1;
+ } else {
+ /* Send from the MRF */
+ load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
+ sources, length, payload_header_size);
+
+ /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
+ * will do this for us if we just give it a COMPR4 destination.
+ */
+ if (devinfo->gen < 6 && bld.dispatch_width() == 16)
+ load->dst.reg |= BRW_MRF_COMPR4;
+
+ inst->resize_sources(0);
+ inst->base_mrf = 1;
+ }
+
+ inst->opcode = FS_OPCODE_FB_WRITE;
+ inst->mlen = load->regs_written;
+ inst->header_size = header_size;
+}
+
+static void
+lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
+ const fs_reg &coordinate,
+ const fs_reg &shadow_c,
+ const fs_reg &lod, const fs_reg &lod2,
+ const fs_reg &sampler,
+ unsigned coord_components,
+ unsigned grad_components)
+{
+ const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
+ op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
+ fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
+ fs_reg msg_end = msg_begin;
+
+ /* g0 header. */
+ msg_end = offset(msg_end, bld.group(8, 0), 1);
+
+ for (unsigned i = 0; i < coord_components; i++)
+ bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
+ offset(coordinate, bld, i));
+
+ msg_end = offset(msg_end, bld, coord_components);
+
+ /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
+ * require all three components to be present and zero if they are unused.
+ */
+ if (coord_components > 0 &&
+ (has_lod || shadow_c.file != BAD_FILE ||
+ (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
+ for (unsigned i = coord_components; i < 3; i++)
+ bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
+
+ msg_end = offset(msg_end, bld, 3 - coord_components);
+ }
+
+ if (op == SHADER_OPCODE_TXD) {
+ /* TXD unsupported in SIMD16 mode. */
+ assert(bld.dispatch_width() == 8);
+
+ /* the slots for u and v are always present, but r is optional */
+ if (coord_components < 2)
+ msg_end = offset(msg_end, bld, 2 - coord_components);
+
+ /* P = u, v, r
+ * dPdx = dudx, dvdx, drdx
+ * dPdy = dudy, dvdy, drdy
+ *
+ * 1-arg: Does not exist.
+ *
+ * 2-arg: dudx dvdx dudy dvdy
+ * dPdx.x dPdx.y dPdy.x dPdy.y
+ * m4 m5 m6 m7
+ *
+ * 3-arg: dudx dvdx drdx dudy dvdy drdy
+ * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
+ * m5 m6 m7 m8 m9 m10
+ */
+ for (unsigned i = 0; i < grad_components; i++)
+ bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
+
+ msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+
+ for (unsigned i = 0; i < grad_components; i++)
+ bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
+
+ msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+ }
+
+ if (has_lod) {
+ /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
+ * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
+ */
+ assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
+ bld.dispatch_width() == 16);
+
+ const brw_reg_type type =
+ (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
+ BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
+ bld.MOV(retype(msg_end, type), lod);
+ msg_end = offset(msg_end, bld, 1);
+ }
+
+ if (shadow_c.file != BAD_FILE) {
+ if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
+ /* There's no plain shadow compare message, so we use shadow
+ * compare with a bias of 0.0.
+ */
+ bld.MOV(msg_end, fs_reg(0.0f));
+ msg_end = offset(msg_end, bld, 1);
+ }
+
+ bld.MOV(msg_end, shadow_c);
+ msg_end = offset(msg_end, bld, 1);
+ }
+
+ inst->opcode = op;
+ inst->src[0] = reg_undef;
+ inst->src[1] = sampler;
+ inst->resize_sources(2);
+ inst->base_mrf = msg_begin.reg;
+ inst->mlen = msg_end.reg - msg_begin.reg;
+ inst->header_size = 1;
+}
+
+static void
+lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
+ fs_reg coordinate,
+ const fs_reg &shadow_c,
+ fs_reg lod, fs_reg lod2,
+ const fs_reg &sample_index,
+ const fs_reg &sampler,
+ const fs_reg &offset_value,
+ unsigned coord_components,
+ unsigned grad_components)
+{
+ fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
+ fs_reg msg_coords = message;
+ unsigned header_size = 0;
+
+ if (offset_value.file != BAD_FILE) {
+ /* The offsets set up by the visitor are in the m1 header, so we can't
+ * go headerless.
+ */
+ header_size = 1;
+ message.reg--;
+ }
+
+ for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ }
+ fs_reg msg_end = offset(msg_coords, bld, coord_components);
+ fs_reg msg_lod = offset(msg_coords, bld, 4);
+
+ if (shadow_c.file != BAD_FILE) {
+ fs_reg msg_shadow = msg_lod;
+ bld.MOV(msg_shadow, shadow_c);
+ msg_lod = offset(msg_shadow, bld, 1);
+ msg_end = msg_lod;
+ }
+
+ switch (op) {
+ case SHADER_OPCODE_TXL:
+ case FS_OPCODE_TXB:
+ bld.MOV(msg_lod, lod);
+ msg_end = offset(msg_lod, bld, 1);
+ break;
+ case SHADER_OPCODE_TXD:
+ /**
+ * P = u, v, r
+ * dPdx = dudx, dvdx, drdx
+ * dPdy = dudy, dvdy, drdy
+ *
+ * Load up these values:
+ * - dudx dudy dvdx dvdy drdx drdy
+ * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
+ */
+ msg_end = msg_lod;
+ for (unsigned i = 0; i < grad_components; i++) {
+ bld.MOV(msg_end, lod);
+ lod = offset(lod, bld, 1);
+ msg_end = offset(msg_end, bld, 1);
+
+ bld.MOV(msg_end, lod2);
+ lod2 = offset(lod2, bld, 1);
+ msg_end = offset(msg_end, bld, 1);
+ }
+ break;
+ case SHADER_OPCODE_TXS:
+ msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
+ bld.MOV(msg_lod, lod);
+ msg_end = offset(msg_lod, bld, 1);
+ break;
+ case SHADER_OPCODE_TXF:
+ msg_lod = offset(msg_coords, bld, 3);
+ bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
+ msg_end = offset(msg_lod, bld, 1);
+ break;
+ case SHADER_OPCODE_TXF_CMS:
+ msg_lod = offset(msg_coords, bld, 3);
+ /* lod */
+ bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+ /* sample index */
+ bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
+ msg_end = offset(msg_lod, bld, 2);
+ break;
+ default:
+ break;
+ }
+
+ inst->opcode = op;
+ inst->src[0] = reg_undef;
+ inst->src[1] = sampler;
+ inst->resize_sources(2);
+ inst->base_mrf = message.reg;
+ inst->mlen = msg_end.reg - message.reg;
+ inst->header_size = header_size;
+
+ /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+ assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static bool
+is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
+{
+ if (devinfo->gen < 8 && !devinfo->is_haswell)
+ return false;
+
+ return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+}
+
+static void
+lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
+ fs_reg coordinate,
+ const fs_reg &shadow_c,
+ fs_reg lod, fs_reg lod2,
+ const fs_reg &sample_index,
+ const fs_reg &mcs, const fs_reg &sampler,
+ fs_reg offset_value,
+ unsigned coord_components,
+ unsigned grad_components)
+{
+ const brw_device_info *devinfo = bld.shader->devinfo;
+ int reg_width = bld.dispatch_width() / 8;
+ unsigned header_size = 0, length = 0;
+ fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
+ for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
+ sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+ if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
+ offset_value.file != BAD_FILE ||
+ is_high_sampler(devinfo, sampler)) {
+ /* For general texture offsets (no txf workaround), we need a header to
+ * put them in. Note that we're only reserving space for it in the
+ * message payload as it will be initialized implicitly by the
+ * generator.
+ *
+ * TG4 needs to place its channel select in the header, for interaction
+ * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
+ * larger sampler numbers we need to offset the Sampler State Pointer in
+ * the header.
+ */
+ header_size = 1;
+ sources[0] = fs_reg();
+ length++;
+ }
+
+ if (shadow_c.file != BAD_FILE) {
+ bld.MOV(sources[length], shadow_c);
+ length++;
+ }
+
+ bool coordinate_done = false;
+
+ /* The sampler can only meaningfully compute LOD for fragment shader
+ * messages. For all other stages, we change the opcode to TXL and
+ * hardcode the LOD to 0.
+ */
+ if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
+ op == SHADER_OPCODE_TEX) {
+ op = SHADER_OPCODE_TXL;
+ lod = fs_reg(0.0f);
+ }
+
+ /* Set up the LOD info */
+ switch (op) {
+ case FS_OPCODE_TXB:
+ case SHADER_OPCODE_TXL:
+ bld.MOV(sources[length], lod);
+ length++;
+ break;
+ case SHADER_OPCODE_TXD:
+ /* TXD should have been lowered in SIMD16 mode. */
+ assert(bld.dispatch_width() == 8);
+
+ /* Load dPdx and the coordinate together:
+ * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+ */
+ for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+
+ /* For cube map array, the coordinate is (u,v,r,ai) but there are
+ * only derivatives for (u, v, r).
+ */
+ if (i < grad_components) {
+ bld.MOV(sources[length], lod);
+ lod = offset(lod, bld, 1);
+ length++;
+
+ bld.MOV(sources[length], lod2);
+ lod2 = offset(lod2, bld, 1);
+ length++;
+ }
+ }
+
+ coordinate_done = true;
+ break;
+ case SHADER_OPCODE_TXS:
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+ length++;
+ break;
+ case SHADER_OPCODE_TXF:
+ /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
+ * On Gen9 they are u, v, lod, r
+ */
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+
+ if (devinfo->gen >= 9) {
+ if (coord_components >= 2) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ }
+ length++;
+ }
+
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+ length++;
+
+ for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+
+ coordinate_done = true;
+ break;
+ case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_UMS:
+ case SHADER_OPCODE_TXF_MCS:
+ if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+ length++;
+ }
+
+ if (op == SHADER_OPCODE_TXF_CMS) {
+ /* Data from the multisample control surface. */
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+ length++;
+ }
+
+ /* There is no offsetting for this message; just copy in the integer
+ * texture coordinates.
+ */
+ for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+
+ coordinate_done = true;
+ break;
+ case SHADER_OPCODE_TG4_OFFSET:
+ /* gather4_po_c should have been lowered in SIMD16 mode. */
+ assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
+
+ /* More crazy intermixing */
+ for (unsigned i = 0; i < 2; i++) { /* u, v */
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+
+ for (unsigned i = 0; i < 2; i++) { /* offu, offv */
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
+ offset_value = offset(offset_value, bld, 1);
+ length++;
+ }
+
+ if (coord_components == 3) { /* r if present */
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+
+ coordinate_done = true;
+ break;
+ default:
+ break;
+ }
+
+ /* Set up the coordinate (except for cases where it was done above) */
+ if (!coordinate_done) {
+ for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+ }
+
+ int mlen;
+ if (reg_width == 2)
+ mlen = length * reg_width - header_size;
+ else
+ mlen = length * reg_width;
+
+ const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
+ BRW_REGISTER_TYPE_F);
+ bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+
+ /* Generate the SEND. */
+ inst->opcode = op;
+ inst->src[0] = src_payload;
+ inst->src[1] = sampler;
+ inst->resize_sources(2);
+ inst->base_mrf = -1;
+ inst->mlen = mlen;
+ inst->header_size = header_size;
+
+ /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+ assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static void
+lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+{
+ const brw_device_info *devinfo = bld.shader->devinfo;
+ const fs_reg &coordinate = inst->src[0];
+ const fs_reg &shadow_c = inst->src[1];
+ const fs_reg &lod = inst->src[2];
+ const fs_reg &lod2 = inst->src[3];
+ const fs_reg &sample_index = inst->src[4];
+ const fs_reg &mcs = inst->src[5];
+ const fs_reg &sampler = inst->src[6];
+ const fs_reg &offset_value = inst->src[7];
+ assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
+ const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
+ const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
+
+ if (devinfo->gen >= 7) {
+ lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
+ shadow_c, lod, lod2, sample_index,
+ mcs, sampler, offset_value,
+ coord_components, grad_components);
+ } else if (devinfo->gen >= 5) {
+ lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
+ shadow_c, lod, lod2, sample_index,
+ sampler, offset_value,
+ coord_components, grad_components);
+ } else {
+ lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
+ shadow_c, lod, lod2, sampler,
+ coord_components, grad_components);
+ }
+}
+
+/**
+ * Initialize the header present in some typed and untyped surface
+ * messages.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+ fs_builder ubld = bld.exec_all().group(8, 0);
+ const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ ubld.MOV(dst, fs_reg(0));
+ ubld.MOV(component(dst, 7), sample_mask);
+ return dst;
+}
+
+static void
+lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
+ const fs_reg &sample_mask)
+{
+ /* Get the logical send arguments. */
+ const fs_reg &addr = inst->src[0];
+ const fs_reg &src = inst->src[1];
+ const fs_reg &surface = inst->src[2];
+ const UNUSED fs_reg &dims = inst->src[3];
+ const fs_reg &arg = inst->src[4];
+
+ /* Calculate the total number of components of the payload. */
+ const unsigned addr_sz = inst->components_read(0);
+ const unsigned src_sz = inst->components_read(1);
+ const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
+ const unsigned sz = header_sz + addr_sz + src_sz;
+
+ /* Allocate space for the payload. */
+ fs_reg *const components = new fs_reg[sz];
+ const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
+ unsigned n = 0;
+
+ /* Construct the payload. */
+ if (header_sz)
+ components[n++] = emit_surface_header(bld, sample_mask);
+
+ for (unsigned i = 0; i < addr_sz; i++)
+ components[n++] = offset(addr, bld, i);
+
+ for (unsigned i = 0; i < src_sz; i++)
+ components[n++] = offset(src, bld, i);
+
+ bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
+
+ /* Update the original instruction. */
+ inst->opcode = op;
+ inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+ inst->header_size = header_sz;
+
+ inst->src[0] = payload;
+ inst->src[1] = surface;
+ inst->src[2] = arg;
+ inst->resize_sources(3);
+
+ delete[] components;
+}
+
+bool
+fs_visitor::lower_logical_sends()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ const fs_builder ibld(this, block, inst);
+
+ switch (inst->opcode) {
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ assert(stage == MESA_SHADER_FRAGMENT);
+ lower_fb_write_logical_send(ibld, inst,
+ (const brw_wm_prog_data *)prog_data,
+ (const brw_wm_prog_key *)key,
+ payload);
+ break;
+
+ case SHADER_OPCODE_TEX_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
+ break;
+
+ case SHADER_OPCODE_TXD_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
+ break;
+
+ case SHADER_OPCODE_TXF_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
+ break;
+
+ case SHADER_OPCODE_TXL_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
+ break;
+
+ case SHADER_OPCODE_TXS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
+ break;
+
+ case FS_OPCODE_TXB_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
+ break;
+
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
+ break;
+
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
+ break;
+
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
+ break;
+
+ case SHADER_OPCODE_LOD_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
+ break;
+
+ case SHADER_OPCODE_TG4_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
+ break;
+
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_UNTYPED_SURFACE_READ,
+ fs_reg(0xffff));
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+ ibld.sample_mask_reg());
+ break;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_UNTYPED_ATOMIC,
+ ibld.sample_mask_reg());
+ break;
+
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_TYPED_SURFACE_READ,
+ fs_reg(0xffff));
+ break;
+
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_TYPED_SURFACE_WRITE,
+ ibld.sample_mask_reg());
+ break;
+
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_TYPED_ATOMIC,
+ ibld.sample_mask_reg());
+ break;
+
+ default:
+ continue;
+ }
+
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst. The instruction will be left untouched by
+ * fs_visitor::lower_simd_width() if the returned value is equal to the
+ * original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct brw_device_info *devinfo,
+ const fs_inst *inst)
+{
+ switch (inst->opcode) {
+ case BRW_OPCODE_MOV:
+ case BRW_OPCODE_SEL:
+ case BRW_OPCODE_NOT:
+ case BRW_OPCODE_AND:
+ case BRW_OPCODE_OR:
+ case BRW_OPCODE_XOR:
+ case BRW_OPCODE_SHR:
+ case BRW_OPCODE_SHL:
+ case BRW_OPCODE_ASR:
+ case BRW_OPCODE_CMP:
+ case BRW_OPCODE_CMPN:
+ case BRW_OPCODE_CSEL:
+ case BRW_OPCODE_F32TO16:
+ case BRW_OPCODE_F16TO32:
+ case BRW_OPCODE_BFREV:
+ case BRW_OPCODE_BFE:
+ case BRW_OPCODE_BFI1:
+ case BRW_OPCODE_BFI2:
+ case BRW_OPCODE_ADD:
+ case BRW_OPCODE_MUL:
+ case BRW_OPCODE_AVG:
+ case BRW_OPCODE_FRC:
+ case BRW_OPCODE_RNDU:
+ case BRW_OPCODE_RNDD:
+ case BRW_OPCODE_RNDE:
+ case BRW_OPCODE_RNDZ:
+ case BRW_OPCODE_LZD:
+ case BRW_OPCODE_FBH:
+ case BRW_OPCODE_FBL:
+ case BRW_OPCODE_CBIT:
+ case BRW_OPCODE_SAD2:
+ case BRW_OPCODE_MAD:
+ case BRW_OPCODE_LRP:
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_POW:
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS: {
+ /* According to the PRMs:
+ * "A. In Direct Addressing mode, a source cannot span more than 2
+ * adjacent GRF registers.
+ * B. A destination cannot span more than 2 adjacent GRF registers."
+ *
+ * Look for the source or destination with the largest register region
+ * which is the one that is going to limit the overal execution size of
+ * the instruction due to this rule.
+ */
+ unsigned reg_count = inst->regs_written;
+
+ for (unsigned i = 0; i < inst->sources; i++)
+ reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
+
+ /* Calculate the maximum execution size of the instruction based on the
+ * factor by which it goes over the hardware limit of 2 GRFs.
+ */
+ return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
+ }
+ case SHADER_OPCODE_MULH:
+ /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
+ * is 8-wide on Gen7+.
+ */
+ return (devinfo->gen >= 7 ? 8 : inst->exec_size);
+
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
+ * here.
+ */
+ assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE ||
+ inst->exec_size == 8);
+ /* Dual-source FB writes are unsupported in SIMD16 mode. */
+ return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
+
+ case SHADER_OPCODE_TXD_LOGICAL:
+ /* TXD is unsupported in SIMD16 mode. */
+ return 8;
+
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
+ /* gather4_po_c is unsupported in SIMD16 mode. */
+ const fs_reg &shadow_c = inst->src[1];
+ return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
+ }
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL: {
+ /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
+ * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
+ * mode because the message exceeds the maximum length of 11.
+ */
+ const fs_reg &shadow_c = inst->src[1];
+ if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
+ return 16;
+ else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
+ return 8;
+ else
+ return inst->exec_size;
+ }
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
+ * messages. Use SIMD16 instead.
+ */
+ if (devinfo->gen == 4)
+ return 16;
+ else
+ return inst->exec_size;
+
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ return 8;
+
+ default:
+ return inst->exec_size;
+ }
+}
+
+/**
+ * The \p rows array of registers represents a \p num_rows by \p num_columns
+ * matrix in row-major order, write it in column-major order into the register
+ * passed as destination. \p stride gives the separation between matrix
+ * elements in the input in fs_builder::dispatch_width() units.
+ */
+static void
+emit_transpose(const fs_builder &bld,
+ const fs_reg &dst, const fs_reg *rows,
+ unsigned num_rows, unsigned num_columns, unsigned stride)
+{
+ fs_reg *const components = new fs_reg[num_rows * num_columns];
+
+ for (unsigned i = 0; i < num_columns; ++i) {
+ for (unsigned j = 0; j < num_rows; ++j)
+ components[num_rows * i + j] = offset(rows[j], bld, stride * i);
+ }
+
+ bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
+
+ delete[] components;
+}
+
+bool
+fs_visitor::lower_simd_width()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
+
+ if (lower_width != inst->exec_size) {
+ /* Builder matching the original instruction. We may also need to
+ * emit an instruction of width larger than the original, set the
+ * execution size of the builder to the highest of both for now so
+ * we're sure that both cases can be handled.
+ */
+ const fs_builder ibld = bld.at(block, inst)
+ .exec_all(inst->force_writemask_all)
+ .group(MAX2(inst->exec_size, lower_width),
+ inst->force_sechalf);
+
+ /* Split the copies in chunks of the execution width of either the
+ * original or the lowered instruction, whichever is lower.
+ */
+ const unsigned copy_width = MIN2(lower_width, inst->exec_size);
+ const unsigned n = inst->exec_size / copy_width;
+ const unsigned dst_size = inst->regs_written * REG_SIZE /
+ inst->dst.component_size(inst->exec_size);
+ fs_reg dsts[4];
+
+ assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
+ !inst->writes_accumulator && !inst->mlen);
+
+ for (unsigned i = 0; i < n; i++) {
+ /* Emit a copy of the original instruction with the lowered width.
+ * If the EOT flag was set throw it away except for the last
+ * instruction to avoid killing the thread prematurely.
+ */
+ fs_inst split_inst = *inst;
+ split_inst.exec_size = lower_width;
+ split_inst.eot = inst->eot && i == n - 1;
+
+ /* Select the correct channel enables for the i-th group, then
+ * transform the sources and destination and emit the lowered
+ * instruction.
+ */
+ const fs_builder lbld = ibld.group(lower_width, i);
+
+ for (unsigned j = 0; j < inst->sources; j++) {
+ if (inst->src[j].file != BAD_FILE &&
+ !is_uniform(inst->src[j])) {
+ /* Get the i-th copy_width-wide chunk of the source. */
+ const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
+ const unsigned src_size = inst->components_read(j);
+
+ /* Use a trivial transposition to copy one every n
+ * copy_width-wide components of the register into a
+ * temporary passed as source to the lowered instruction.
+ */
+ split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
+ emit_transpose(lbld.group(copy_width, 0),
+ split_inst.src[j], &src, 1, src_size, n);
+ }
+ }
+
+ if (inst->regs_written) {
+ /* Allocate enough space to hold the result of the lowered
+ * instruction and fix up the number of registers written.
+ */
+ split_inst.dst = dsts[i] =
+ lbld.vgrf(inst->dst.type, dst_size);
+ split_inst.regs_written =
+ DIV_ROUND_UP(inst->regs_written * lower_width,
+ inst->exec_size);
+ }
+
+ lbld.emit(split_inst);
+ }
+
+ if (inst->regs_written) {
+ /* Distance between useful channels in the temporaries, skipping
+ * garbage if the lowered instruction is wider than the original.
+ */
+ const unsigned m = lower_width / copy_width;
+
+ /* Interleave the components of the result from the lowered
+ * instructions. We need to set exec_all() when copying more than
+ * one half per component, because LOAD_PAYLOAD (in terms of which
+ * emit_transpose is implemented) can only use the same channel
+ * enable signals for all of its non-header sources.
+ */
+ emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
+ .group(copy_width, 0),
+ inst->dst, dsts, n, dst_size, m);
+ }
+
+ inst->remove(block);
+ progress = true;
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+void
+fs_visitor::dump_instructions()
+{
+ dump_instructions(NULL);
+}
+
+void
+fs_visitor::dump_instructions(const char *name)
+{
+ FILE *file = stderr;
+ if (name && geteuid() != 0) {
+ file = fopen(name, "w");
+ if (!file)
+ file = stderr;
+ }
+
+ if (cfg) {
+ calculate_register_pressure();
+ int ip = 0, max_pressure = 0;
+ foreach_block_and_inst(block, backend_instruction, inst, cfg) {
+ max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
+ fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
+ dump_instruction(inst, file);
+ ip++;
+ }
+ fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
+ } else {
+ int ip = 0;
+ foreach_in_list(backend_instruction, inst, &instructions) {
+ fprintf(file, "%4d: ", ip++);
+ dump_instruction(inst, file);
+ }
+ }
+
+ if (file != stderr) {
+ fclose(file);
+ }
+}
+
+void
+fs_visitor::dump_instruction(backend_instruction *be_inst)
+{
+ dump_instruction(be_inst, stderr);
+}
+
+void
+fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
+{
+ fs_inst *inst = (fs_inst *)be_inst;
+
+ if (inst->predicate) {
+ fprintf(file, "(%cf0.%d) ",
+ inst->predicate_inverse ? '-' : '+',
+ inst->flag_subreg);
+ }
+
+ fprintf(file, "%s", brw_instruction_name(inst->opcode));
+ if (inst->saturate)
+ fprintf(file, ".sat");
+ if (inst->conditional_mod) {
+ fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
+ if (!inst->predicate &&
+ (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
+ inst->opcode != BRW_OPCODE_IF &&
+ inst->opcode != BRW_OPCODE_WHILE))) {
+ fprintf(file, ".f0.%d", inst->flag_subreg);
+ }
+ }
+ fprintf(file, "(%d) ", inst->exec_size);
+
+ if (inst->mlen) {
+ fprintf(file, "(mlen: %d) ", inst->mlen);
}
switch (inst->dst.file) {
case GRF:
fprintf(file, "vgrf%d", inst->dst.reg);
- if (inst->dst.width != dispatch_width)
- fprintf(file, "@%d", inst->dst.width);
- if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
+ if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
inst->dst.subreg_offset)
fprintf(file, "+%d.%d",
inst->dst.reg_offset, inst->dst.subreg_offset);
switch (inst->src[i].file) {
case GRF:
fprintf(file, "vgrf%d", inst->src[i].reg);
- if (inst->src[i].width != dispatch_width)
- fprintf(file, "@%d", inst->src[i].width);
- if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
+ if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
inst->src[i].subreg_offset)
fprintf(file, "+%d.%d", inst->src[i].reg_offset,
inst->src[i].subreg_offset);
fprintf(file, "***m%d***", inst->src[i].reg);
break;
case ATTR:
- fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
+ fprintf(file, "attr%d+%d", inst->src[i].reg, inst->src[i].reg_offset);
break;
case UNIFORM:
fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
fs_visitor::setup_payload_gen6()
{
bool uses_depth =
- (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
+ (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
unsigned barycentric_interp_modes =
(stage == MESA_SHADER_FRAGMENT) ?
((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
}
/* R32: MSAA input coverage mask */
- if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
+ if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
assert(devinfo->gen >= 7);
payload.sample_mask_in_reg = payload.num_regs;
payload.num_regs++;
/* R34-: bary for 32-pixel. */
/* R58-59: interp W for 32-pixel. */
- if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+ if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
source_depth_to_render_target = true;
}
}
payload.num_regs = 2;
}
+/**
+ * We are building the local ID push constant data using the simplest possible
+ * method. We simply push the local IDs directly as they should appear in the
+ * registers for the uvec3 gl_LocalInvocationID variable.
+ *
+ * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
+ * registers worth of push constant space.
+ *
+ * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
+ * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
+ * to coordinated.
+ *
+ * FINISHME: There are a few easy optimizations to consider.
+ *
+ * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
+ * no need for using push constant space for that dimension.
+ *
+ * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
+ * easily use 16-bit words rather than 32-bit dwords in the push constant
+ * data.
+ *
+ * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
+ * conveying the data, and thereby reduce push constant usage.
+ *
+ */
void
fs_visitor::setup_cs_payload()
{
assert(devinfo->gen >= 7);
+ brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
payload.num_regs = 1;
-}
-
-void
-fs_visitor::assign_binding_table_offsets()
-{
- assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- uint32_t next_binding_table_offset = 0;
-
- /* If there are no color regions, we still perform an FB write to a null
- * renderbuffer, which we place at surface index 0.
- */
- prog_data->binding_table.render_target_start = next_binding_table_offset;
- next_binding_table_offset += MAX2(key->nr_color_regions, 1);
- assign_common_binding_table_offsets(next_binding_table_offset);
+ if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
+ prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
+ payload.local_invocation_id_reg = payload.num_regs;
+ payload.num_regs += prog_data->local_invocation_id_regs;
+ }
}
void
void
fs_visitor::optimize()
{
+ /* Start by validating the shader we currently have. */
+ validate();
+
/* bld is the common builder object pointing at the end of the program we
* used to translate it into i965 IR. For the optimization and lowering
* passes coming next, any code added after the end of the program without
* Ideally optimization passes wouldn't be part of the visitor so they
* wouldn't have access to bld at all, but they do, so just in case some
* pass forgets to ask for a location explicitly set it to NULL here to
- * make it trip.
+ * make it trip. The dispatch width is initialized to a bogus value to
+ * make sure that optimizations set the execution controls explicitly to
+ * match the code they are manipulating instead of relying on the defaults.
*/
- bld = bld.at(NULL, NULL);
+ bld = fs_builder(this, 64);
- split_virtual_grfs();
-
- move_uniform_array_access_to_pull_constants();
assign_constant_locations();
demote_pull_constants();
+ validate();
+
+ split_virtual_grfs();
+ validate();
+
#define OPT(pass, args...) ({ \
pass_num++; \
bool this_progress = pass(args); \
\
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
char filename[64]; \
- snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
- stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
+ snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
+ stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
\
backend_shader::dump_instructions(filename); \
} \
\
+ validate(); \
+ \
progress = progress || this_progress; \
this_progress; \
})
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
char filename[64];
- snprintf(filename, 64, "%s%d-%04d-00-start",
- stage_abbrev, dispatch_width,
- shader_prog ? shader_prog->Name : 0);
+ snprintf(filename, 64, "%s%d-%s-00-start",
+ stage_abbrev, dispatch_width, nir->info.name);
backend_shader::dump_instructions(filename);
}
- bool progress;
+ bool progress = false;
int iteration = 0;
int pass_num = 0;
+
+ OPT(lower_simd_width);
+ OPT(lower_logical_sends);
+
do {
progress = false;
pass_num = 0;
OPT(opt_algebraic);
OPT(opt_cse);
OPT(opt_copy_propagate);
- OPT(opt_peephole_predicated_break);
+ OPT(opt_predicated_break, this);
OPT(opt_cmod_propagation);
OPT(dead_code_eliminate);
OPT(opt_peephole_sel);
OPT(lower_integer_multiplication);
lower_uniform_pull_constant_loads();
+
+ validate();
}
/**
{
assert(stage == MESA_SHADER_VERTEX);
- assign_common_binding_table_offsets(0);
setup_vs_payload();
if (shader_time_index >= 0)
assert(stage == MESA_SHADER_FRAGMENT);
- sanity_param_count = prog->Parameters->NumParameters;
-
- assign_binding_table_offsets();
-
if (devinfo->gen >= 6)
setup_payload_gen6();
else
emit_shader_time_begin();
calculate_urb_setup();
- if (prog->InputsRead > 0) {
+ if (nir->info.inputs_read > 0) {
if (devinfo->gen < 6)
emit_interpolation_setup_gen4();
else
else
wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
- /* If any state parameters were appended, then ParameterValues could have
- * been realloced, in which case the driver uniform storage set up by
- * _mesa_associate_uniform_storage() would point to freed memory. Make
- * sure that didn't happen.
- */
- assert(sanity_param_count == prog->Parameters->NumParameters);
-
return !failed;
}
fs_visitor::run_cs()
{
assert(stage == MESA_SHADER_COMPUTE);
- assert(shader);
-
- sanity_param_count = prog->Parameters->NumParameters;
-
- assign_common_binding_table_offsets(0);
setup_cs_payload();
if (failed)
return false;
- /* If any state parameters were appended, then ParameterValues could have
- * been realloced, in which case the driver uniform storage set up by
- * _mesa_associate_uniform_storage() would point to freed memory. Make
- * sure that didn't happen.
- */
- assert(sanity_param_count == prog->Parameters->NumParameters);
-
return !failed;
}
struct brw_wm_prog_data *prog_data,
struct gl_fragment_program *fp,
struct gl_shader_program *prog,
+ int shader_time_index8, int shader_time_index16,
unsigned *final_assembly_size)
{
- bool start_busy = false;
- double start_time = 0;
-
- if (unlikely(brw->perf_debug)) {
- start_busy = (brw->batch.last_bo &&
- drm_intel_bo_busy(brw->batch.last_bo));
- start_time = get_time();
- }
-
- struct brw_shader *shader = NULL;
- if (prog)
- shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
-
- if (unlikely(INTEL_DEBUG & DEBUG_WM))
- brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
-
- int st_index8 = -1, st_index16 = -1;
- if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
- st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
- st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
- }
-
/* Now the main event: Visit the shader IR and generate our FS IR for it.
*/
- fs_visitor v(brw->intelScreen->compiler, brw,
- mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
- prog, &fp->Base, 8, st_index8);
+ fs_visitor v(brw->intelScreen->compiler, brw, mem_ctx, key,
+ &prog_data->base, &fp->Base, fp->Base.nir, 8, shader_time_index8);
if (!v.run_fs(false /* do_rep_send */)) {
if (prog) {
prog->LinkStatus = false;
}
cfg_t *simd16_cfg = NULL;
- fs_visitor v2(brw->intelScreen->compiler, brw,
- mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
- prog, &fp->Base, 16, st_index16);
+ fs_visitor v2(brw->intelScreen->compiler, brw, mem_ctx, key,
+ &prog_data->base, &fp->Base, fp->Base.nir, 16, shader_time_index16);
if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
if (!v.simd16_unsupported) {
/* Try a SIMD16 compile */
if (simd16_cfg)
prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
- if (unlikely(brw->perf_debug) && shader) {
- if (shader->compiled_once)
- brw_wm_debug_recompile(brw, prog, key);
- shader->compiled_once = true;
+ return g.get_assembly(final_assembly_size);
+}
- if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
- perf_debug("FS compile took %.03f ms and stalled the GPU\n",
- (get_time() - start_time) * 1000);
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
+ void *buffer, uint32_t threads, uint32_t stride)
+{
+ if (prog_data->local_invocation_id_regs == 0)
+ return;
+
+ /* 'stride' should be an integer number of registers, that is, a multiple
+ * of 32 bytes.
+ */
+ assert(stride % 32 == 0);
+
+ unsigned x = 0, y = 0, z = 0;
+ for (unsigned t = 0; t < threads; t++) {
+ uint32_t *param = (uint32_t *) buffer + stride * t / 4;
+
+ for (unsigned i = 0; i < prog_data->simd_size; i++) {
+ param[0 * prog_data->simd_size + i] = x;
+ param[1 * prog_data->simd_size + i] = y;
+ param[2 * prog_data->simd_size + i] = z;
+
+ x++;
+ if (x == prog_data->local_size[0]) {
+ x = 0;
+ y++;
+ if (y == prog_data->local_size[1]) {
+ y = 0;
+ z++;
+ if (z == prog_data->local_size[2])
+ z = 0;
+ }
+ }
}
}
-
- return g.get_assembly(final_assembly_size);
}
-extern "C" bool
-brw_fs_precompile(struct gl_context *ctx,
- struct gl_shader_program *shader_prog,
- struct gl_program *prog)
+fs_reg *
+fs_visitor::emit_cs_local_invocation_id_setup()
{
- struct brw_context *brw = brw_context(ctx);
- struct brw_wm_prog_key key;
+ assert(stage == MESA_SHADER_COMPUTE);
- struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
- struct brw_fragment_program *bfp = brw_fragment_program(fp);
- bool program_uses_dfdy = fp->UsesDFdy;
+ fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
- memset(&key, 0, sizeof(key));
+ struct brw_reg src =
+ brw_vec8_grf(payload.local_invocation_id_reg, 0);
+ src = retype(src, BRW_REGISTER_TYPE_UD);
+ bld.MOV(*reg, src);
+ src.nr += dispatch_width / 8;
+ bld.MOV(offset(*reg, bld, 1), src);
+ src.nr += dispatch_width / 8;
+ bld.MOV(offset(*reg, bld, 2), src);
- if (brw->gen < 6) {
- if (fp->UsesKill)
- key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+ return reg;
+}
- if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
- key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
+fs_reg *
+fs_visitor::emit_cs_work_group_id_setup()
+{
+ assert(stage == MESA_SHADER_COMPUTE);
- /* Just assume depth testing. */
- key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
- key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
- }
+ fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
- if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
- BRW_FS_VARYING_INPUT_MASK) > 16)
- key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
+ struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
+ struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
+ struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
- brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
+ bld.MOV(*reg, r0_1);
+ bld.MOV(offset(*reg, bld, 1), r0_6);
+ bld.MOV(offset(*reg, bld, 2), r0_7);
- if (fp->Base.InputsRead & VARYING_BIT_POS) {
- key.drawable_height = ctx->DrawBuffer->Height;
- }
+ return reg;
+}
- key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
- ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
- BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
+const unsigned *
+brw_cs_emit(struct brw_context *brw,
+ void *mem_ctx,
+ const struct brw_cs_prog_key *key,
+ struct brw_cs_prog_data *prog_data,
+ struct gl_compute_program *cp,
+ struct gl_shader_program *prog,
+ int shader_time_index,
+ unsigned *final_assembly_size)
+{
+ prog_data->local_size[0] = cp->LocalSize[0];
+ prog_data->local_size[1] = cp->LocalSize[1];
+ prog_data->local_size[2] = cp->LocalSize[2];
+ unsigned local_workgroup_size =
+ cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];
+ unsigned max_cs_threads = brw->intelScreen->compiler->devinfo->max_cs_threads;
- if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
- key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
- key.nr_color_regions > 1;
- }
+ cfg_t *cfg = NULL;
+ const char *fail_msg = NULL;
- key.program_string_id = bfp->id;
+ /* Now the main event: Visit the shader IR and generate our CS IR for it.
+ */
+ fs_visitor v8(brw->intelScreen->compiler, brw, mem_ctx, key,
+ &prog_data->base, &cp->Base, cp->Base.nir, 8, shader_time_index);
+ if (!v8.run_cs()) {
+ fail_msg = v8.fail_msg;
+ } else if (local_workgroup_size <= 8 * max_cs_threads) {
+ cfg = v8.cfg;
+ prog_data->simd_size = 8;
+ }
- uint32_t old_prog_offset = brw->wm.base.prog_offset;
- struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
+ fs_visitor v16(brw->intelScreen->compiler, brw, mem_ctx, key,
+ &prog_data->base, &cp->Base, cp->Base.nir, 16, shader_time_index);
+ if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
+ !fail_msg && !v8.simd16_unsupported &&
+ local_workgroup_size <= 16 * max_cs_threads) {
+ /* Try a SIMD16 compile */
+ v16.import_uniforms(&v8);
+ if (!v16.run_cs()) {
+ perf_debug("SIMD16 shader failed to compile: %s", v16.fail_msg);
+ if (!cfg) {
+ fail_msg =
+ "Couldn't generate SIMD16 program and not "
+ "enough threads for SIMD8";
+ }
+ } else {
+ cfg = v16.cfg;
+ prog_data->simd_size = 16;
+ }
+ }
- bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
+ if (unlikely(cfg == NULL)) {
+ assert(fail_msg);
+ prog->LinkStatus = false;
+ ralloc_strcat(&prog->InfoLog, fail_msg);
+ _mesa_problem(NULL, "Failed to compile compute shader: %s\n",
+ fail_msg);
+ return NULL;
+ }
- brw->wm.base.prog_offset = old_prog_offset;
- brw->wm.prog_data = old_prog_data;
+ fs_generator g(brw->intelScreen->compiler, brw,
+ mem_ctx, (void*) key, &prog_data->base, &cp->Base,
+ v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
+ if (INTEL_DEBUG & DEBUG_CS) {
+ char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
+ prog->Label ? prog->Label : "unnamed",
+ prog->Name);
+ g.enable_debug(name);
+ }
- return success;
-}
+ g.generate_code(cfg, prog_data->simd_size);
-void
-brw_setup_tex_for_precompile(struct brw_context *brw,
- struct brw_sampler_prog_key_data *tex,
- struct gl_program *prog)
-{
- const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
- unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
- for (unsigned i = 0; i < sampler_count; i++) {
- if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
- /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
- tex->swizzles[i] =
- MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
- } else {
- /* Color sampler: assume no swizzling. */
- tex->swizzles[i] = SWIZZLE_XYZW;
- }
- }
+ return g.get_assembly(final_assembly_size);
}