#include "glsl/glsl_types.h"
#include "program/sampler.h"
+using namespace brw;
+
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg *src, unsigned sources)
{
assert(this->exec_size != 0);
- for (unsigned i = 0; i < sources; ++i) {
- switch (this->src[i].file) {
- case BAD_FILE:
- this->src[i].effective_width = 8;
- break;
- case GRF:
- case HW_REG:
- case ATTR:
- assert(this->src[i].width > 0);
- if (this->src[i].width == 1) {
- this->src[i].effective_width = this->exec_size;
- } else {
- this->src[i].effective_width = this->src[i].width;
- }
- break;
- case IMM:
- case UNIFORM:
- this->src[i].effective_width = this->exec_size;
- break;
- default:
- unreachable("Invalid source register file");
- }
- }
- this->dst.effective_width = this->exec_size;
-
this->conditional_mod = BRW_CONDITIONAL_NONE;
/* This will be the case for almost all instructions. */
}
}
-#define ALU1(op) \
- fs_inst * \
- fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
- { \
- return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
- }
-
-#define ALU2(op) \
- fs_inst * \
- fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
- const fs_reg &src1) \
- { \
- return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
- }
-
-#define ALU2_ACC(op) \
- fs_inst * \
- fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
- const fs_reg &src1) \
- { \
- fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
- inst->writes_accumulator = true; \
- return inst; \
- }
-
-#define ALU3(op) \
- fs_inst * \
- fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
- const fs_reg &src1, const fs_reg &src2) \
- { \
- return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
- }
-
-ALU1(NOT)
-ALU1(MOV)
-ALU1(FRC)
-ALU1(RNDD)
-ALU1(RNDE)
-ALU1(RNDZ)
-ALU2(ADD)
-ALU2(MUL)
-ALU2_ACC(MACH)
-ALU2(AND)
-ALU2(OR)
-ALU2(XOR)
-ALU2(SHL)
-ALU2(SHR)
-ALU2(ASR)
-ALU3(LRP)
-ALU1(BFREV)
-ALU3(BFE)
-ALU2(BFI1)
-ALU3(BFI2)
-ALU1(FBH)
-ALU1(FBL)
-ALU1(CBIT)
-ALU3(MAD)
-ALU2_ACC(ADDC)
-ALU2_ACC(SUBB)
-ALU2(SEL)
-ALU2(MAC)
-
-/** Gen4 predicated IF. */
-fs_inst *
-fs_visitor::IF(enum brw_predicate predicate)
-{
- fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
- inst->predicate = predicate;
- return inst;
-}
-
-/** Gen6 IF with embedded comparison. */
-fs_inst *
-fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
- enum brw_conditional_mod condition)
-{
- assert(brw->gen == 6);
- fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
- reg_null_d, src0, src1);
- inst->conditional_mod = condition;
- return inst;
-}
-
-/**
- * CMP: Sets the low bit of the destination channels with the result
- * of the comparison, while the upper bits are undefined, and updates
- * the flag register with the packed 16 bits of the result.
- */
-fs_inst *
-fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
- enum brw_conditional_mod condition)
-{
- fs_inst *inst;
-
- /* Take the instruction:
- *
- * CMP null<d> src0<f> src1<f>
- *
- * Original gen4 does type conversion to the destination type before
- * comparison, producing garbage results for floating point comparisons.
- *
- * The destination type doesn't matter on newer generations, so we set the
- * type to match src0 so we can compact the instruction.
- */
- dst.type = src0.type;
- if (dst.file == HW_REG)
- dst.fixed_hw_reg.type = dst.type;
-
- resolve_ud_negate(&src0);
- resolve_ud_negate(&src1);
-
- inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
- inst->conditional_mod = condition;
-
- return inst;
-}
-
-fs_inst *
-fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
-{
- uint8_t exec_size = dst.width;
- for (int i = 0; i < sources; ++i) {
- assert(src[i].width % dst.width == 0);
- if (src[i].width > exec_size)
- exec_size = src[i].width;
- }
-
- fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
- dst, src, sources);
- inst->regs_written = 0;
- for (int i = 0; i < sources; ++i) {
- /* The LOAD_PAYLOAD instruction only really makes sense if we are
- * dealing with whole registers. If this ever changes, we can deal
- * with it later.
- */
- int size = inst->src[i].effective_width * type_sz(src[i].type);
- assert(size % 32 == 0);
- inst->regs_written += (size + 31) / 32;
- }
-
- return inst;
-}
-
-exec_list
-fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
+void
+fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
+ const fs_reg &dst,
const fs_reg &surf_index,
const fs_reg &varying_offset,
uint32_t const_offset)
{
- exec_list instructions;
- fs_inst *inst;
-
/* We have our constant surface use a pitch of 4 bytes, so our index can
* be any component of a vector, and then we load 4 contiguous
* components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
*/
fs_reg vec4_offset = vgrf(glsl_type::int_type);
- instructions.push_tail(ADD(vec4_offset,
- varying_offset, fs_reg(const_offset & ~3)));
+ bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
int scale = 1;
- if (brw->gen == 4 && dst.width == 8) {
+ if (devinfo->gen == 4 && dst.width == 8) {
/* Pre-gen5, we can either use a SIMD8 message that requires (header,
* u, v, r) as parameters, or we can just use the SIMD16 message
* consisting of (header, u). We choose the second, at the cost of a
}
enum opcode op;
- if (brw->gen >= 7)
+ if (devinfo->gen >= 7)
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
else
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
int regs_written = 4 * (dst.width / 8) * scale;
fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
dst.type, dst.width);
- inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
+ fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
inst->regs_written = regs_written;
- instructions.push_tail(inst);
- if (brw->gen < 7) {
+ if (devinfo->gen < 7) {
inst->base_mrf = 13;
- inst->header_present = true;
- if (brw->gen == 4)
+ inst->header_size = 1;
+ if (devinfo->gen == 4)
inst->mlen = 3;
else
inst->mlen = 1 + dispatch_width / 8;
}
- fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
- instructions.push_tail(MOV(dst, result));
-
- return instructions;
+ bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
}
/**
* A helper for MOV generation for fixing up broken hardware SEND dependency
* handling.
*/
-fs_inst *
-fs_visitor::DEP_RESOLVE_MOV(int grf)
+void
+fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
{
- fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
-
- inst->ir = NULL;
- inst->annotation = "send dependency resolve";
-
/* The caller always wants uncompressed to emit the minimal extra
* dependencies, and to avoid having to deal with aligning its regs to 2.
*/
- inst->exec_size = 8;
+ const fs_builder ubld = bld.annotate("send dependency resolve")
+ .half(0);
- return inst;
+ ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
}
bool
base_mrf == inst->base_mrf &&
target == inst->target &&
eot == inst->eot &&
- header_present == inst->header_present &&
+ header_size == inst->header_size &&
shadow_compare == inst->shadow_compare &&
exec_size == inst->exec_size &&
offset == inst->offset);
bool
fs_inst::overwrites_reg(const fs_reg &reg) const
{
- return (reg.file == dst.file &&
- reg.reg == dst.reg &&
- reg.reg_offset >= dst.reg_offset &&
- reg.reg_offset < dst.reg_offset + regs_written);
+ return reg.in_range(dst, regs_written);
}
bool
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
case SHADER_OPCODE_UNTYPED_ATOMIC:
case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
case SHADER_OPCODE_URB_WRITE_SIMD8:
return true;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
}
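+/**
+ * Returns true if this instruction is a raw, whole-register copy of its
+ * sources: a LOAD_PAYLOAD whose sources walk sequentially through a single
+ * contiguous VGRF covering exactly regs_written registers.
+ */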
bool
-fs_inst::can_do_source_mods(struct brw_context *brw)
+fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
{
- if (brw->gen == 6 && is_math())
+ if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+ return false;
+
+ fs_reg reg = this->src[0];
+ if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
+ return false;
+
+ if (grf_alloc.sizes[reg.reg] != this->regs_written)
+ return false;
+
+ for (int i = 0; i < this->sources; i++) {
+ reg.type = this->src[i].type;
+ reg.width = this->src[i].width;
+ if (!this->src[i].equals(reg))
+ return false;
+ reg = ::offset(reg, 1);
+ }
+
+ return true;
+}
+
+bool
+fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
+{
+ if (devinfo->gen == 6 && is_math())
return false;
if (is_send_from_grf())
return true;
}
+bool
+fs_inst::has_side_effects() const
+{
+ return this->eot || backend_instruction::has_side_effects();
+}
+
void
fs_reg::init()
{
case GLSL_TYPE_ERROR:
case GLSL_TYPE_INTERFACE:
case GLSL_TYPE_DOUBLE:
+ case GLSL_TYPE_FUNCTION:
unreachable("not reached");
}
* the destination of the MOV, with extra parameters set.
*/
fs_reg
-fs_visitor::get_timestamp(fs_inst **out_mov)
+fs_visitor::get_timestamp(const fs_builder &bld)
{
- assert(brw->gen >= 7);
+ assert(devinfo->gen >= 7);
fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_TIMESTAMP,
fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
- fs_inst *mov = MOV(dst, ts);
/* We want to read the 3 fields we care about even if it's not enabled in
* the dispatch.
*/
- mov->force_writemask_all = true;
+ bld.exec_all().MOV(dst, ts);
/* The caller wants the low 32 bits of the timestamp. Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.
*/
dst.set_smear(0);
- *out_mov = mov;
return dst;
}
void
fs_visitor::emit_shader_time_begin()
{
- current_annotation = "shader time start";
- fs_inst *mov;
- shader_start_time = get_timestamp(&mov);
- emit(mov);
+ shader_start_time = get_timestamp(bld.annotate("shader time start"));
}
void
fs_visitor::emit_shader_time_end()
{
- current_annotation = "shader time end";
-
- enum shader_time_shader_type type, written_type, reset_type;
- switch (stage) {
- case MESA_SHADER_VERTEX:
- type = ST_VS;
- written_type = ST_VS_WRITTEN;
- reset_type = ST_VS_RESET;
- break;
- case MESA_SHADER_GEOMETRY:
- type = ST_GS;
- written_type = ST_GS_WRITTEN;
- reset_type = ST_GS_RESET;
- break;
- case MESA_SHADER_FRAGMENT:
- if (dispatch_width == 8) {
- type = ST_FS8;
- written_type = ST_FS8_WRITTEN;
- reset_type = ST_FS8_RESET;
- } else {
- assert(dispatch_width == 16);
- type = ST_FS16;
- written_type = ST_FS16_WRITTEN;
- reset_type = ST_FS16_RESET;
- }
- break;
- default:
- unreachable("fs_visitor::emit_shader_time_end missing code");
- }
-
/* Insert our code just before the final SEND with EOT. */
exec_node *end = this->instructions.get_tail();
assert(end && ((fs_inst *) end)->eot);
+ const fs_builder ibld = bld.annotate("shader time end")
+ .exec_all().at(NULL, end);
- fs_inst *tm_read;
- fs_reg shader_end_time = get_timestamp(&tm_read);
- end->insert_before(tm_read);
+ fs_reg shader_end_time = get_timestamp(ibld);
/* Check that there weren't any timestamp reset events (assuming these
* were the only two timestamp reads that happened).
*/
fs_reg reset = shader_end_time;
reset.set_smear(2);
- fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
- test->conditional_mod = BRW_CONDITIONAL_Z;
- test->force_writemask_all = true;
- end->insert_before(test);
- end->insert_before(IF(BRW_PREDICATE_NORMAL));
+ set_condmod(BRW_CONDITIONAL_Z,
+ ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
+ ibld.IF(BRW_PREDICATE_NORMAL);
fs_reg start = shader_start_time;
start.negate = true;
fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
diff.set_smear(0);
- fs_inst *add = ADD(diff, start, shader_end_time);
- add->force_writemask_all = true;
- end->insert_before(add);
+ ibld.ADD(diff, start, shader_end_time);
/* If there were no instructions between the two timestamp gets, the diff
* is 2 cycles. Remove that overhead, so I can forget about that when
* trying to determine the time taken for single instructions.
*/
- add = ADD(diff, diff, fs_reg(-2u));
- add->force_writemask_all = true;
- end->insert_before(add);
-
- end->insert_before(SHADER_TIME_ADD(type, diff));
- end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
- end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
- end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
- end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
+ ibld.ADD(diff, diff, fs_reg(-2u));
+ SHADER_TIME_ADD(ibld, 0, diff);
+ SHADER_TIME_ADD(ibld, 1, fs_reg(1u));
+ ibld.emit(BRW_OPCODE_ELSE);
+ SHADER_TIME_ADD(ibld, 2, fs_reg(1u));
+ ibld.emit(BRW_OPCODE_ENDIF);
}
-fs_inst *
-fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
+void
+fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
+ int shader_time_subindex,
+ fs_reg value)
{
- int shader_time_index =
- brw_get_shader_time_index(brw, shader_prog, prog, type);
- fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
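+   /* Each shader time record occupies three consecutive slots (time,
+    * written and reset counts), so the subindex selects one of the three
+    * slots of this shader's record.
+    */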
+ int index = shader_time_index * 3 + shader_time_subindex;
+ fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
fs_reg payload;
if (dispatch_width == 8)
else
payload = vgrf(glsl_type::uint_type);
- return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
- fs_reg(), payload, offset, value);
+ bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
}
void
* During a SIMD16 compile (if one happens anyway), this just calls fail().
*/
void
-fs_visitor::no16(const char *format, ...)
+fs_visitor::no16(const char *msg)
{
- va_list va;
-
- va_start(va, format);
-
if (dispatch_width == 16) {
- vfail(format, va);
+ fail("%s", msg);
} else {
simd16_unsupported = true;
- if (brw->perf_debug) {
- if (no16_msg)
- ralloc_vasprintf_append(&no16_msg, format, va);
- else
- no16_msg = ralloc_vasprintf(mem_ctx, format, va);
- }
+ compiler->shader_perf_log(log_data,
+ "SIMD16 shader failed to compile: %s", msg);
}
-
- va_end(va);
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
- const fs_reg &src1)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
- const fs_reg &src1, const fs_reg &src2)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
- fs_reg src[], int sources)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
}
/**
return mlen;
} else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
return mlen;
+ } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
+ return mlen;
+ } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
+ return mlen;
+ } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
+ return mlen;
+ } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
+ return mlen;
} else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
return mlen;
+ } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
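+      /* LINTERP reads the combined barycentric deltas (delta_xy), which
+       * take up a pair of registers for every eight channels.
+       */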
+ return exec_size / 4;
}
switch (src[arg].file) {
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
return inst->mlen;
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
- return 2;
+ return inst->mlen;
case SHADER_OPCODE_UNTYPED_ATOMIC:
case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
case SHADER_OPCODE_URB_WRITE_SIMD8:
case FS_OPCODE_INTERPOLATE_AT_CENTROID:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
brw_type_for_base_type(type), dispatch_width);
}
-fs_reg
-fs_visitor::vgrf(int num_components)
-{
- int reg_width = dispatch_width / 8;
- return fs_reg(GRF, alloc.allocate(num_components * reg_width),
- BRW_REGISTER_TYPE_F, dispatch_width);
-}
-
/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
this->width = width;
}
-fs_reg *
-fs_visitor::variable_storage(ir_variable *var)
-{
- return (fs_reg *)hash_table_find(this->variable_ht, var);
-}
-
-void
-import_uniforms_callback(const void *key,
- void *data,
- void *closure)
-{
- struct hash_table *dst_ht = (struct hash_table *)closure;
- const fs_reg *reg = (const fs_reg *)data;
-
- if (reg->file != UNIFORM)
- return;
-
- hash_table_insert(dst_ht, data, key);
-}
-
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
* This brings in those uniform definitions
*/
void
fs_visitor::import_uniforms(fs_visitor *v)
{
- hash_table_call_foreach(v->variable_ht,
- import_uniforms_callback,
- variable_ht);
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
this->param_size = v->param_size;
}
-/* Our support for uniforms is piggy-backed on the struct
- * gl_fragment_program, because that's where the values actually
- * get stored, rather than in some global gl_shader_program uniform
- * store.
- */
-void
-fs_visitor::setup_uniform_values(ir_variable *ir)
-{
- int namelen = strlen(ir->name);
-
- /* The data for our (non-builtin) uniforms is stored in a series of
- * gl_uniform_driver_storage structs for each subcomponent that
- * glGetUniformLocation() could name. We know it's been set up in the same
- * order we'd walk the type, so walk the list of storage and find anything
- * with our name, or the prefix of a component that starts with our name.
- */
- unsigned params_before = uniforms;
- for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
- struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
-
- if (strncmp(ir->name, storage->name, namelen) != 0 ||
- (storage->name[namelen] != 0 &&
- storage->name[namelen] != '.' &&
- storage->name[namelen] != '[')) {
- continue;
- }
-
- unsigned slots = storage->type->component_slots();
- if (storage->array_elements)
- slots *= storage->array_elements;
-
- for (unsigned i = 0; i < slots; i++) {
- stage_prog_data->param[uniforms++] = &storage->storage[i];
- }
- }
-
- /* Make sure we actually initialized the right amount of stuff here. */
- assert(params_before + ir->type->component_slots() == uniforms);
- (void)params_before;
-}
-
-
-/* Our support for builtin uniforms is even scarier than non-builtin.
- * It sits on top of the PROG_STATE_VAR parameters that are
- * automatically updated from GL context state.
- */
-void
-fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
-{
- const ir_state_slot *const slots = ir->get_state_slots();
- assert(slots != NULL);
-
- for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
- /* This state reference has already been setup by ir_to_mesa, but we'll
- * get the same index back here.
- */
- int index = _mesa_add_state_reference(this->prog->Parameters,
- (gl_state_index *)slots[i].tokens);
-
- /* Add each of the unique swizzles of the element as a parameter.
- * This'll end up matching the expected layout of the
- * array/matrix/structure we're trying to fill in.
- */
- int last_swiz = -1;
- for (unsigned int j = 0; j < 4; j++) {
- int swiz = GET_SWZ(slots[i].swizzle, j);
- if (swiz == last_swiz)
- break;
- last_swiz = swiz;
-
- stage_prog_data->param[uniforms++] =
- &prog->Parameters->ParameterValues[index][swiz];
- }
- }
-}
-
fs_reg *
fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
bool origin_upper_left)
/* gl_FragCoord.x */
if (pixel_center_integer) {
- emit(MOV(wpos, this->pixel_x));
+ bld.MOV(wpos, this->pixel_x);
} else {
- emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
+ bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
}
wpos = offset(wpos, 1);
/* gl_FragCoord.y */
if (!flip && pixel_center_integer) {
- emit(MOV(wpos, this->pixel_y));
+ bld.MOV(wpos, this->pixel_y);
} else {
fs_reg pixel_y = this->pixel_y;
float offset = (pixel_center_integer ? 0.0 : 0.5);
offset += key->drawable_height - 1.0;
}
- emit(ADD(wpos, pixel_y, fs_reg(offset)));
+ bld.ADD(wpos, pixel_y, fs_reg(offset));
}
wpos = offset(wpos, 1);
/* gl_FragCoord.z */
- if (brw->gen >= 6) {
- emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
+ if (devinfo->gen >= 6) {
+ bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
} else {
- emit(FS_OPCODE_LINTERP, wpos,
- this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
- this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
+ bld.emit(FS_OPCODE_LINTERP, wpos,
+ this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
interp_reg(VARYING_SLOT_POS, 2));
}
wpos = offset(wpos, 1);
/* gl_FragCoord.w: Already set up in emit_interpolation */
- emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
+ bld.MOV(wpos, this->wpos_w);
return reg;
}
bool is_centroid, bool is_sample)
{
brw_wm_barycentric_interp_mode barycoord_mode;
- if (brw->gen >= 6) {
+ if (devinfo->gen >= 6) {
if (is_centroid) {
if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
*/
barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
}
- return emit(FS_OPCODE_LINTERP, attr,
- this->delta_x[barycoord_mode],
- this->delta_y[barycoord_mode], interp);
+ return bld.emit(FS_OPCODE_LINTERP, attr,
+ this->delta_xy[barycoord_mode], interp);
}
void
struct brw_reg interp = interp_reg(location, k);
interp = suboffset(interp, 3);
interp.type = attr.type;
- emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
+ bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
attr = offset(attr, 1);
}
} else {
/* Smooth/noperspective interpolation case. */
for (unsigned int k = 0; k < type->vector_elements; k++) {
struct brw_reg interp = interp_reg(location, k);
- if (brw->needs_unlit_centroid_workaround && mod_centroid) {
+ if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
/* Get the pixel/sample mask into f0 so that we know
* which pixels are lit. Then, for each channel that is
* unlit, replace the centroid data with non-centroid
* data.
*/
- emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+ bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
fs_inst *inst;
inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
false, false);
inst->predicate = BRW_PREDICATE_NORMAL;
inst->predicate_inverse = true;
- if (brw->has_pln)
+ if (devinfo->has_pln)
inst->no_dd_clear = true;
inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
mod_sample || key->persample_shading);
inst->predicate = BRW_PREDICATE_NORMAL;
inst->predicate_inverse = false;
- if (brw->has_pln)
+ if (devinfo->has_pln)
inst->no_dd_check = true;
} else {
mod_centroid && !key->persample_shading,
mod_sample || key->persample_shading);
}
- if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
- emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
+ if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
+ bld.MUL(attr, attr, this->pixel_w);
}
attr = offset(attr, 1);
}
{
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
- if (brw->gen >= 6) {
+ if (devinfo->gen >= 6) {
/* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
* a boolean result from this (~0/true or 0/false).
*
fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
g0.negate = true;
- emit(ASR(*reg, g0, fs_reg(15)));
+ bld.ASR(*reg, g0, fs_reg(15));
} else {
/* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
* a boolean result from this (1/true or 0/false).
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
g1_6.negate = true;
- emit(ASR(*reg, g1_6, fs_reg(31)));
+ bld.ASR(*reg, g1_6, fs_reg(31));
}
return reg;
if (key->compute_pos_offset) {
/* Convert int_sample_pos to floating point */
- emit(MOV(dst, int_sample_pos));
+ bld.MOV(dst, int_sample_pos);
/* Scale to the range [0, 1] */
- emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
+ bld.MUL(dst, dst, fs_reg(1 / 16.0f));
}
else {
/* From ARB_sample_shading specification:
* rasterization is disabled, gl_SamplePosition will always be
* (0.5, 0.5).
*/
- emit(MOV(dst, fs_reg(0.5f)));
+ bld.MOV(dst, fs_reg(0.5f));
}
}
fs_reg *
fs_visitor::emit_samplepos_setup()
{
- assert(brw->gen >= 6);
+ assert(devinfo->gen >= 6);
- this->current_annotation = "compute sample position";
+ const fs_builder abld = bld.annotate("compute sample position");
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
fs_reg pos = *reg;
fs_reg int_sample_x = vgrf(glsl_type::int_type);
BRW_REGISTER_TYPE_B), 16, 8, 2);
if (dispatch_width == 8) {
- emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
+ abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
} else {
- emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
- emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
- ->force_sechalf = true;
+ abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
+ abld.half(1).MOV(half(int_sample_x, 1),
+ fs_reg(suboffset(sample_pos_reg, 16)));
}
/* Compute gl_SamplePosition.x */
compute_sample_position(pos, int_sample_x);
pos = offset(pos, 1);
if (dispatch_width == 8) {
- emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
+ abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
} else {
- emit(MOV(half(int_sample_y, 0),
- fs_reg(suboffset(sample_pos_reg, 1))));
- emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
- ->force_sechalf = true;
+ abld.half(0).MOV(half(int_sample_y, 0),
+ fs_reg(suboffset(sample_pos_reg, 1)));
+ abld.half(1).MOV(half(int_sample_y, 1),
+ fs_reg(suboffset(sample_pos_reg, 17)));
}
/* Compute gl_SamplePosition.y */
compute_sample_position(pos, int_sample_y);
{
assert(stage == MESA_SHADER_FRAGMENT);
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- assert(brw->gen >= 6);
+ assert(devinfo->gen >= 6);
- this->current_annotation = "compute sample id";
+ const fs_builder abld = bld.annotate("compute sample id");
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
if (key->compute_sample_id) {
* are sample 1 of subspan 0; the third group is sample 0 of
* subspan 1, and finally sample 1 of subspan 1.
*/
- fs_inst *inst;
- inst = emit(BRW_OPCODE_AND, t1,
- fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
- fs_reg(0xc0));
- inst->force_writemask_all = true;
- inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
- inst->force_writemask_all = true;
+ abld.exec_all()
+ .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+ fs_reg(0xc0));
+ abld.exec_all().SHR(t1, t1, fs_reg(5));
+
/* This works for both SIMD8 and SIMD16 */
- inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
- inst->force_writemask_all = true;
+ abld.exec_all()
+ .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
+
/* This special instruction takes care of setting vstride=1,
* width=4, hstride=0 of t2 during an ADD instruction.
*/
- emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
+ abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
} else {
/* As per GL_ARB_sample_shading specification:
* "When rendering to a non-multisample buffer, or if multisample
* rasterization is disabled, gl_SampleID will always be zero."
*/
- emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
+ abld.MOV(*reg, fs_reg(0));
}
return reg;
if (!src->abs && !src->negate)
return;
- fs_reg temp = retype(vgrf(1), src->type);
- emit(MOV(temp, *src));
+ fs_reg temp = bld.vgrf(src->type);
+ bld.MOV(temp, *src);
*src = temp;
}
-fs_reg
-fs_visitor::fix_math_operand(fs_reg src)
-{
- /* Can't do hstride == 0 args on gen6 math, so expand it out. We
- * might be able to do better by doing execsize = 1 math and then
- * expanding that result out, but we would need to be careful with
- * masking.
- *
- * The hardware ignores source modifiers (negate and abs) on math
- * instructions, so we also move to a temp to set those up.
- */
- if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
- !src.abs && !src.negate)
- return src;
-
- /* Gen7 relaxes most of the above restrictions, but still can't use IMM
- * operands to math
- */
- if (brw->gen >= 7 && src.file != IMM)
- return src;
-
- fs_reg expanded = vgrf(glsl_type::float_type);
- expanded.type = src.type;
- emit(BRW_OPCODE_MOV, expanded, src);
- return expanded;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
+void
+fs_visitor::emit_discard_jump()
{
- switch (opcode) {
- case SHADER_OPCODE_RCP:
- case SHADER_OPCODE_RSQ:
- case SHADER_OPCODE_SQRT:
- case SHADER_OPCODE_EXP2:
- case SHADER_OPCODE_LOG2:
- case SHADER_OPCODE_SIN:
- case SHADER_OPCODE_COS:
- break;
- default:
- unreachable("not reached: bad math opcode");
- }
+ assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
- /* Can't do hstride == 0 args to gen6 math, so expand it out. We
- * might be able to do better by doing execsize = 1 math and then
- * expanding that result out, but we would need to be careful with
- * masking.
- *
- * Gen 6 hardware ignores source modifiers (negate and abs) on math
- * instructions, so we also move to a temp to set those up.
+ /* For performance, after a discard, jump to the end of the
+ * shader if all relevant channels have been discarded.
*/
- if (brw->gen == 6 || brw->gen == 7)
- src = fix_math_operand(src);
+ fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
+ discard_jump->flag_subreg = 1;
- fs_inst *inst = emit(opcode, dst, src);
-
- if (brw->gen < 6) {
- inst->base_mrf = 2;
- inst->mlen = dispatch_width / 8;
- }
-
- return inst;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
-{
- int base_mrf = 2;
- fs_inst *inst;
-
- if (brw->gen >= 8) {
- inst = emit(opcode, dst, src0, src1);
- } else if (brw->gen >= 6) {
- src0 = fix_math_operand(src0);
- src1 = fix_math_operand(src1);
-
- inst = emit(opcode, dst, src0, src1);
- } else {
- /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
- * "Message Payload":
- *
- * "Operand0[7]. For the INT DIV functions, this operand is the
- * denominator."
- * ...
- * "Operand1[7]. For the INT DIV functions, this operand is the
- * numerator."
- */
- bool is_int_div = opcode != SHADER_OPCODE_POW;
- fs_reg &op0 = is_int_div ? src1 : src0;
- fs_reg &op1 = is_int_div ? src0 : src1;
-
- emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
- inst = emit(opcode, dst, op0, reg_null_f);
-
- inst->base_mrf = base_mrf;
- inst->mlen = 2 * dispatch_width / 8;
- }
- return inst;
+ discard_jump->predicate = (dispatch_width == 8)
+ ? BRW_PREDICATE_ALIGN1_ANY8H
+ : BRW_PREDICATE_ALIGN1_ANY16H;
+ discard_jump->predicate_inverse = true;
}
void
if (dispatch_width == 8) {
prog_data->dispatch_grf_start_reg = payload.num_regs;
} else {
- assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
+ if (stage == MESA_SHADER_FRAGMENT) {
+ brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
+ prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
+ } else if (stage == MESA_SHADER_COMPUTE) {
+ brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
+ prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
+ } else {
+ unreachable("Unsupported shader type!");
+ }
}
prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
int urb_next = 0;
/* Figure out where each of the incoming setup attributes lands. */
- if (brw->gen >= 6) {
+ if (devinfo->gen >= 6) {
if (_mesa_bitcount_64(prog->InputsRead &
BRW_FS_VARYING_INPUT_MASK) <= 16) {
/* The SF/SBE pipeline stage can do arbitrary rearrangement of the
* (geometry or vertex shader).
*/
struct brw_vue_map prev_stage_vue_map;
- brw_compute_vue_map(brw, &prev_stage_vue_map,
+ brw_compute_vue_map(devinfo, &prev_stage_vue_map,
key->input_slots_valid);
int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
assert(prev_stage_vue_map.num_slots <= first_slot + 32);
*/
foreach_block_and_inst(block, fs_inst, inst, cfg) {
if (inst->opcode == FS_OPCODE_LINTERP) {
- assert(inst->src[2].file == HW_REG);
- inst->src[2].fixed_hw_reg.nr += urb_start;
+ assert(inst->src[1].file == HW_REG);
+ inst->src[1].fixed_hw_reg.nr += urb_start;
}
if (inst->opcode == FS_OPCODE_CINTERP) {
unsigned vue_entries =
MAX2(count, vs_prog_data->base.vue_map.num_slots);
+ /* URB entry size is counted in units of 64 bytes (for the 3DSTATE_URB_VS
+ * command). Each attribute is 16 bytes (4 floats/dwords), so each unit
+ * fits four attributes.
+ */
vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
vs_prog_data->base.urb_read_length = (count + 1) / 2;
}
}
- if (brw->has_pln &&
- this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
- /* PLN opcodes rely on the delta_xy being contiguous. We only have to
- * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
- * Gen6, that was the only supported interpolation mode, and since Gen6,
- * delta_x and delta_y are in fixed hardware registers.
- */
- int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
- split_points[vgrf_to_reg[vgrf] + 1] = false;
- }
-
foreach_block_and_inst(block, fs_inst, inst, cfg) {
if (inst->dst.file == GRF) {
int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
}
}
- /* Patch all the references to delta_x/delta_y, since they're used in
- * register allocation. If they're unused, switch them to BAD_FILE so
- * we don't think some random VGRF is delta_x/delta_y.
+ /* Patch all the references to delta_xy, since they're used in register
+ * allocation. If they're unused, switch them to BAD_FILE so we don't
+ * think some random VGRF is delta_xy.
*/
- for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
- if (delta_x[i].file == GRF) {
- if (remap_table[delta_x[i].reg] != -1) {
- delta_x[i].reg = remap_table[delta_x[i].reg];
+ for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+ if (delta_xy[i].file == GRF) {
+ if (remap_table[delta_xy[i].reg] != -1) {
+ delta_xy[i].reg = remap_table[delta_xy[i].reg];
} else {
- delta_x[i].file = BAD_FILE;
- }
- }
- }
- for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
- if (delta_y[i].file == GRF) {
- if (remap_table[delta_y[i].reg] != -1) {
- delta_y[i].reg = remap_table[delta_y[i].reg];
- } else {
- delta_y[i].file = BAD_FILE;
+ delta_xy[i].file = BAD_FILE;
}
}
}
continue;
/* Set up the annotation tracking for new generated instructions. */
- base_ir = inst->ir;
- current_annotation = inst->annotation;
-
+ const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
+ .at(block, inst);
fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
fs_reg dst = vgrf(glsl_type::float_type);
/* Generate a pull load into dst. */
if (inst->src[i].reladdr) {
- exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
- surf_index,
- *inst->src[i].reladdr,
- pull_index);
- inst->insert_before(block, &list);
+ VARYING_PULL_CONSTANT_LOAD(ibld, dst,
+ surf_index,
+ *inst->src[i].reladdr,
+ pull_index);
inst->src[i].reladdr = NULL;
} else {
fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
- fs_inst *pull =
- new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
- dst, surf_index, offset);
- inst->insert_before(block, pull);
+ ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ dst, surf_index, offset);
inst->src[i].set_smear(pull_index & 3);
}
}
break;
}
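+   /* A BROADCAST of a value that is already uniform is just a MOV of its
+    * source, and a BROADCAST with an immediate channel index can select
+    * the component statically.
+    */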
+ case SHADER_OPCODE_BROADCAST:
+ if (is_uniform(inst->src[0])) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
+ inst->force_writemask_all = true;
+ progress = true;
+ } else if (inst->src[1].file == IMM) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->src[0] = component(inst->src[0],
+ inst->src[1].fixed_hw_reg.dw1.ud);
+ inst->sources = 1;
+ inst->force_writemask_all = true;
+ progress = true;
+ }
+ break;
+
default:
break;
}
return progress;
}
+/**
+ * Optimize sample messages that have constant zero values for the trailing
+ * texture coordinates. We can just reduce the message length for these
+ * instructions instead of reserving a register for it. Trailing parameters
+ * that aren't sent default to zero anyway. This will cause the dead code
+ * eliminator to remove the MOV instruction that would otherwise be emitted to
+ * set up the zero value.
+ */
+bool
+fs_visitor::opt_zero_samples()
+{
+ /* Gen4 infers the texturing opcode based on the message length so we can't
+ * change it.
+ */
+ if (devinfo->gen < 5)
+ return false;
+
+ bool progress = false;
+
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (!inst->is_tex())
+ continue;
+
+ fs_inst *load_payload = (fs_inst *) inst->prev;
+
+ if (load_payload->is_head_sentinel() ||
+ load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+ continue;
+
+ /* We don't want to remove the message header or the first parameter.
+ * Removing the first parameter is not allowed, see the Haswell PRM
+ * volume 7, page 149:
+ *
+ * "Parameter 0 is required except for the sampleinfo message, which
+ * has no parameter 0"
+ */
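+      /* Illustrative example: a SIMD8 message with a header (header_size
+       * == 1) and mlen == 4 carries three one-register parameters.  The
+       * loop below checks src[3] and then src[2], trimming mlen down to a
+       * minimum of 2 (header plus parameter 0) as long as the trailing
+       * parameters are known to be zero.
+       */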
+ while (inst->mlen > inst->header_size + dispatch_width / 8 &&
+ load_payload->src[(inst->mlen - inst->header_size) /
+ (dispatch_width / 8) +
+ inst->header_size - 1].is_zero()) {
+ inst->mlen -= dispatch_width / 8;
+ progress = true;
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/**
+ * Optimize sample messages which are followed by the final RT write.
+ *
+ * CHV and Gen9+ can mark a texturing SEND instruction with EOT to have its
+ * results sent directly to the framebuffer, bypassing the EU. Recognize the
+ * final texturing results copied to the framebuffer write payload and modify
+ * them to write to the framebuffer directly.
+ */
+bool
+fs_visitor::opt_sampler_eot()
+{
+ brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
+ if (stage != MESA_SHADER_FRAGMENT)
+ return false;
+
+ if (devinfo->gen < 9 && !devinfo->is_cherryview)
+ return false;
+
+ /* FINISHME: It should be possible to implement this optimization when there
+ * are multiple drawbuffers.
+ */
+ if (key->nr_color_regions != 1)
+ return false;
+
+ /* Look for a texturing instruction immediately before the final FB_WRITE. */
+ fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+ assert(fb_write->eot);
+ assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
+
+ fs_inst *tex_inst = (fs_inst *) fb_write->prev;
+
+ /* There wasn't one; nothing to do. */
+ if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
+ return false;
+
+ /* This optimisation doesn't seem to work for textureGather for some
+ * reason. I can't find any documentation or known workarounds to indicate
+ * that this is expected, but considering that it is probably pretty
+ * unlikely that a shader would directly write out the results from
+ * textureGather we might as well just disable it.
+ */
+ if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
+ tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+ return false;
+
+ /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
+ * It's very likely to be the previous instruction.
+ */
+ fs_inst *load_payload = (fs_inst *) tex_inst->prev;
+ if (load_payload->is_head_sentinel() ||
+ load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+ return false;
+
+ assert(!tex_inst->eot); /* We can't get here twice */
+ assert((tex_inst->offset & (0xff << 24)) == 0);
+
+ tex_inst->offset |= fb_write->target << 24;
+ tex_inst->eot = true;
+ tex_inst->dst = bld.null_reg_ud();
+ fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
+
+ /* If a header is present, marking the eot is sufficient. Otherwise, we need
+ * to create a new LOAD_PAYLOAD command with the same sources and a space
+ * saved for the header. Using a new destination register not only makes sure
+ * we have enough space, but it will make sure the dead code eliminator kills
+ * the instruction that this will replace.
+ */
+ if (tex_inst->header_size != 0)
+ return true;
+
+ fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
+ load_payload->sources + 1);
+ fs_reg *new_sources =
+ ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
+
+ new_sources[0] = fs_reg();
+ for (int i = 0; i < load_payload->sources; i++)
+ new_sources[i+1] = load_payload->src[i];
+
+ /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
+ * requires a lot of information about the sources to appropriately figure
+ * out the number of registers needed to be used. Given this stage in our
+ * optimization, we may not have the appropriate GRFs required by
+ * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
+ * manually emit the instruction.
+ */
+ fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
+ load_payload->exec_size,
+ send_header,
+ new_sources,
+ load_payload->sources + 1);
+
+ new_load_payload->regs_written = load_payload->regs_written + 1;
+ new_load_payload->header_size = 1;
+ tex_inst->mlen++;
+ tex_inst->header_size = 1;
+ tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
+ tex_inst->src[0] = send_header;
+
+ return true;
+}
+
bool
fs_visitor::opt_register_renaming()
{
if (progress) {
invalidate_live_intervals();
- for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
- if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
- delta_x[i].reg = remap[delta_x[i].reg];
- }
- }
- for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
- if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
- delta_y[i].reg = remap[delta_y[i].reg];
+ for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+ if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
+ delta_xy[i].reg = remap[delta_xy[i].reg];
}
}
}
int next_ip = 0;
/* No MRFs on Gen >= 7. */
- if (brw->gen >= 7)
+ if (devinfo->gen >= 7)
return false;
calculate_live_intervals();
if (scan_inst->mlen)
break;
- if (brw->gen == 6) {
+ if (devinfo->gen == 6) {
/* gen6 math instructions must have the destination be
* GRF, so no compute-to-MRF for them.
*/
return progress;
}
+/**
+ * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
+ * flow. We could probably do better here with some form of divergence
+ * analysis.
+ */
+bool
+fs_visitor::eliminate_find_live_channel()
+{
+ bool progress = false;
+ unsigned depth = 0;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ switch (inst->opcode) {
+ case BRW_OPCODE_IF:
+ case BRW_OPCODE_DO:
+ depth++;
+ break;
+
+ case BRW_OPCODE_ENDIF:
+ case BRW_OPCODE_WHILE:
+ depth--;
+ break;
+
+ case FS_OPCODE_DISCARD_JUMP:
+ /* This can potentially make control flow non-uniform until the end
+ * of the program.
+ */
+ return progress;
+
+ case SHADER_OPCODE_FIND_LIVE_CHANNEL:
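+         /* With no divergent control flow above, the first channel is
+          * assumed live and the result folds to a constant 0.
+          */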
+ if (depth == 0) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->src[0] = fs_reg(0);
+ inst->sources = 1;
+ inst->force_writemask_all = true;
+ progress = true;
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ return progress;
+}
+
/**
* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
* instructions to FS_OPCODE_REP_FB_WRITE.
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
int base_mrf = 1;
int color_mrf = base_mrf + 2;
+ fs_inst *mov;
- fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
- fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
- mov->force_writemask_all = true;
+ if (uniforms == 1) {
+ mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
+ fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+ } else {
+ struct brw_reg reg =
+ brw_reg(BRW_GENERAL_REGISTER_FILE,
+ 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_8,
+ BRW_WIDTH_2,
+ BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+ mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
+ fs_reg(reg));
+ }
fs_inst *write;
if (key->nr_color_regions == 1) {
- write = emit(FS_OPCODE_REP_FB_WRITE);
+ write = bld.emit(FS_OPCODE_REP_FB_WRITE);
write->saturate = key->clamp_fragment_color;
write->base_mrf = color_mrf;
write->target = 0;
- write->header_present = false;
+ write->header_size = 0;
write->mlen = 1;
} else {
assume(key->nr_color_regions > 0);
for (int i = 0; i < key->nr_color_regions; ++i) {
- write = emit(FS_OPCODE_REP_FB_WRITE);
+ write = bld.emit(FS_OPCODE_REP_FB_WRITE);
write->saturate = key->clamp_fragment_color;
write->base_mrf = base_mrf;
write->target = i;
- write->header_present = true;
+ write->header_size = 2;
write->mlen = 3;
}
}
assign_curb_setup();
/* Now that we have the uniform assigned, go ahead and force it to a vec4. */
- assert(mov->src[0].file == HW_REG);
- mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
+ if (uniforms == 1) {
+ assert(mov->src[0].file == HW_REG);
+ mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
+ }
}
/**
*/
if (block->start() == scan_inst) {
for (int i = 0; i < write_len; i++) {
- if (needs_dep[i]) {
- inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
- }
+ if (needs_dep[i])
+ DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
}
return;
}
if (reg >= first_write_grf &&
reg < first_write_grf + write_len &&
needs_dep[reg - first_write_grf]) {
- inst->insert_before(block, DEP_RESOLVE_MOV(reg));
+ DEP_RESOLVE_MOV(bld.at(block, inst), reg);
needs_dep[reg - first_write_grf] = false;
if (scan_inst->exec_size == 16)
needs_dep[reg - first_write_grf + 1] = false;
if (block->end() == scan_inst) {
for (int i = 0; i < write_len; i++) {
if (needs_dep[i])
- scan_inst->insert_before(block,
- DEP_RESOLVE_MOV(first_write_grf + i));
+ DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
}
return;
}
scan_inst->dst.reg >= first_write_grf &&
scan_inst->dst.reg < first_write_grf + write_len &&
needs_dep[scan_inst->dst.reg - first_write_grf]) {
- scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
+ DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
needs_dep[scan_inst->dst.reg - first_write_grf] = false;
}
void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
- if (brw->gen != 4 || brw->is_g4x)
+ if (devinfo->gen != 4 || devinfo->is_g4x)
return;
bool progress = false;
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
continue;
- if (brw->gen >= 7) {
+ if (devinfo->gen >= 7) {
/* The offset arg before was a vec4-aligned byte offset. We need to
* turn it into a dword offset.
*/
/* We have to use a message header on Skylake to get SIMD4x2 mode.
* Reserve space for the register.
*/
- if (brw->gen >= 9) {
+ if (devinfo->gen >= 9) {
payload.reg_offset++;
alloc.sizes[payload.reg] = 2;
}
{
bool progress = false;
- int vgrf_to_reg[alloc.count];
- int reg_count = 0;
- for (unsigned i = 0; i < alloc.count; ++i) {
- vgrf_to_reg[i] = reg_count;
- reg_count += alloc.sizes[i];
- }
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+ continue;
+
+ assert(inst->dst.file == MRF || inst->dst.file == GRF);
+ assert(inst->saturate == false);
+
+ const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
+ .exec_all(inst->force_writemask_all)
+ .at(block, inst);
+ fs_reg dst = inst->dst;
+
+ /* Get rid of COMPR4. We'll add it back in if we need it */
+ if (dst.file == MRF)
+ dst.reg = dst.reg & ~BRW_MRF_COMPR4;
+
+ dst.width = 8;
+ for (uint8_t i = 0; i < inst->header_size; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
+ fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
+ mov_src.width = 8;
+ ibld.exec_all().MOV(mov_dst, mov_src);
+ }
+ dst = offset(dst, 1);
+ }
- struct {
- bool written:1; /* Whether this register has ever been written */
- bool force_writemask_all:1;
- bool force_sechalf:1;
- } metadata[reg_count];
- memset(metadata, 0, sizeof(metadata));
+ dst.width = inst->exec_size;
+ if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
+ inst->exec_size > 8) {
+ /* In this case, the payload portion of the LOAD_PAYLOAD isn't
+ * a straightforward copy. Instead, the result of the
+ * LOAD_PAYLOAD is treated as interleaved and the first four
+ * non-header sources are unpacked as:
+ *
+ * m + 0: r0
+ * m + 1: g0
+ * m + 2: b0
+ * m + 3: a0
+ * m + 4: r1
+ * m + 5: g1
+ * m + 6: b1
+ * m + 7: a1
+ *
+ * This is used for gen <= 5 fb writes.
+ */
+ assert(inst->exec_size == 16);
+ assert(inst->header_size + 4 <= inst->sources);
+ for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ if (devinfo->has_compr4) {
+ fs_reg compr4_dst = retype(dst, inst->src[i].type);
+ compr4_dst.reg |= BRW_MRF_COMPR4;
+ ibld.MOV(compr4_dst, inst->src[i]);
+ } else {
+ /* Platform doesn't have COMPR4. We have to fake it */
+ fs_reg mov_dst = retype(dst, inst->src[i].type);
+ mov_dst.width = 8;
+ ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
+ ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
+ }
+ }
- foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
- if (inst->dst.file == GRF) {
- const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
- bool force_sechalf = inst->force_sechalf &&
- !inst->force_writemask_all;
- bool toggle_sechalf = inst->dst.width == 16 &&
- type_sz(inst->dst.type) == 4 &&
- !inst->force_writemask_all;
- for (int i = 0; i < inst->regs_written; ++i) {
- metadata[dst_reg + i].written = true;
- metadata[dst_reg + i].force_sechalf = force_sechalf;
- metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
- force_sechalf = (toggle_sechalf != force_sechalf);
+ dst.reg++;
}
+
+ /* The loop above only ever incremented us through the first set
+ * of 4 registers. However, thanks to the magic of COMPR4, we
+ * actually wrote to the first 8 registers, so we need to take
+ * that into account now.
+ */
+ dst.reg += 4;
+
+ /* The COMPR4 code took care of the first 4 sources. We'll let
+ * the regular path handle any remaining sources. Yes, we are
+ * modifying the instruction but we're about to delete it so
+ * this really doesn't hurt anything.
+ */
+ inst->header_size += 4;
}
- if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
- assert(inst->dst.file == MRF || inst->dst.file == GRF);
- fs_reg dst = inst->dst;
+ for (uint8_t i = inst->header_size; i < inst->sources; i++) {
+ if (inst->src[i].file != BAD_FILE)
+ ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
+ dst = offset(dst, 1);
+ }
- for (int i = 0; i < inst->sources; i++) {
- dst.width = inst->src[i].effective_width;
- dst.type = inst->src[i].type;
-
- if (inst->src[i].file == BAD_FILE) {
- /* Do nothing but otherwise increment as normal */
- } else if (dst.file == MRF &&
- dst.width == 8 &&
- brw->has_compr4 &&
- i + 4 < inst->sources &&
- inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
- fs_reg compr4_dst = dst;
- compr4_dst.reg += BRW_MRF_COMPR4;
- compr4_dst.width = 16;
- fs_reg compr4_src = inst->src[i];
- compr4_src.width = 16;
- fs_inst *mov = MOV(compr4_dst, compr4_src);
- mov->force_writemask_all = true;
- inst->insert_before(block, mov);
- /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
- inst->src[i + 4].file = BAD_FILE;
+ inst->remove(block);
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+bool
+fs_visitor::lower_integer_multiplication()
+{
+ bool progress = false;
+
+ /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
+ * directly, but Cherryview cannot.
+ */
+ if (devinfo->gen >= 8 && !devinfo->is_cherryview)
+ return false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ if (inst->opcode != BRW_OPCODE_MUL ||
+ inst->dst.is_accumulator() ||
+ (inst->dst.type != BRW_REGISTER_TYPE_D &&
+ inst->dst.type != BRW_REGISTER_TYPE_UD))
+ continue;
+
+ const fs_builder ibld = bld.at(block, inst);
+
+ /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+ * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+ * src1 are used.
+ *
+ * If multiplying by an immediate value that fits in 16-bits, do a
+ * single MUL instruction with that value in the proper location.
+ */
+ if (inst->src[1].file == IMM &&
+ inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+ if (devinfo->gen < 7) {
+ fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+ inst->dst.type, dispatch_width);
+ ibld.MOV(imm, inst->src[1]);
+ ibld.MUL(inst->dst, imm, inst->src[0]);
+ } else {
+ ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+ }
+ } else {
+ /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+ * do 32-bit integer multiplication in one instruction, but instead
+ * must do a sequence (which actually calculates a 64-bit result):
+ *
+ * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
+ * mach(8) null g3<8,8,1>D g4<8,8,1>D
+ * mov(8) g2<1>D acc0<8,8,1>D
+ *
+ * But on Gen > 6, the ability to use second accumulator register
+ * (acc1) for non-float data types was removed, preventing a simple
+ * implementation in SIMD16. A 16-channel result can be calculated by
+ * executing the three instructions twice in SIMD8, once with quarter
+ * control of 1Q for the first eight channels and again with 2Q for
+ * the second eight channels.
+ *
+ * Which accumulator register is implicitly accessed (by AccWrEnable
+ * for instance) is determined by the quarter control. Unfortunately
+ * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+ * implicit accumulator access by an instruction with 2Q will access
+ * acc1 regardless of whether the data type is usable in acc1.
+ *
+ * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+ * integer data types.
+ *
+ * Since we only want the low 32-bits of the result, we can do two
+ * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+ * adjust the high result and add them (like the mach is doing):
+ *
+ * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
+ * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
+ * shl(8) g9<1>D g8<8,8,1>D 16D
+ * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
+ *
+ * We avoid the shl instruction by realizing that we only want to add
+ * the low 16-bits of the "high" result to the high 16-bits of the
+ * "low" result and using proper regioning on the add:
+ *
+ * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
+ * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
+ * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
+ *
+ * Since it does not use the (single) accumulator register, we can
+ * schedule multi-component multiplications much better.
+ */
+
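+      /* The conditional mod of the original MUL won't survive the lowered
+       * sequence, so give a null destination a real register to hold the
+       * result; the conditional mod is regenerated from it below.
+       */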
+ if (inst->conditional_mod && inst->dst.is_null()) {
+ inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+ inst->dst.type, dispatch_width);
+ }
+ fs_reg low = inst->dst;
+ fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+ inst->dst.type, dispatch_width);
+
+ if (devinfo->gen >= 7) {
+ fs_reg src1_0_w = inst->src[1];
+ fs_reg src1_1_w = inst->src[1];
+
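+         /* Split src1 into two UW halves: a stride-2 UW region starting at
+          * subreg 0 reads the low 16 bits of each channel, and the same
+          * region starting at subreg 2 reads the high 16 bits.
+          */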
+ if (inst->src[1].file == IMM) {
+ src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
+ src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
} else {
- fs_inst *mov = MOV(dst, inst->src[i]);
- if (inst->src[i].file == GRF) {
- int src_reg = vgrf_to_reg[inst->src[i].reg] +
- inst->src[i].reg_offset;
- mov->force_sechalf = metadata[src_reg].force_sechalf;
- mov->force_writemask_all = metadata[src_reg].force_writemask_all;
- } else {
- /* We don't have any useful metadata for immediates or
- * uniforms. Assume that any of the channels of the
- * destination may be used.
- */
- assert(inst->src[i].file == IMM ||
- inst->src[i].file == UNIFORM);
- mov->force_writemask_all = true;
+ src1_0_w.type = BRW_REGISTER_TYPE_UW;
+ if (src1_0_w.stride != 0) {
+ assert(src1_0_w.stride == 1);
+ src1_0_w.stride = 2;
}
- if (dst.file == GRF) {
- const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
- const bool force_writemask = mov->force_writemask_all;
- metadata[dst_reg].force_writemask_all = force_writemask;
- metadata[dst_reg].force_sechalf = mov->force_sechalf;
- if (dst.width * type_sz(dst.type) > 32) {
- assert(!mov->force_sechalf);
- metadata[dst_reg + 1].force_writemask_all = force_writemask;
- metadata[dst_reg + 1].force_sechalf = !force_writemask;
- }
+ src1_1_w.type = BRW_REGISTER_TYPE_UW;
+ if (src1_1_w.stride != 0) {
+ assert(src1_1_w.stride == 1);
+ src1_1_w.stride = 2;
}
+ src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+ }
+ ibld.MUL(low, inst->src[0], src1_0_w);
+ ibld.MUL(high, inst->src[0], src1_1_w);
+ } else {
+ fs_reg src0_0_w = inst->src[0];
+ fs_reg src0_1_w = inst->src[0];
+
+ src0_0_w.type = BRW_REGISTER_TYPE_UW;
+ if (src0_0_w.stride != 0) {
+ assert(src0_0_w.stride == 1);
+ src0_0_w.stride = 2;
+ }
- inst->insert_before(block, mov);
+ src0_1_w.type = BRW_REGISTER_TYPE_UW;
+ if (src0_1_w.stride != 0) {
+ assert(src0_1_w.stride == 1);
+ src0_1_w.stride = 2;
}
+ src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
- dst = offset(dst, 1);
+ ibld.MUL(low, src0_0_w, inst->src[1]);
+ ibld.MUL(high, src0_1_w, inst->src[1]);
}
- inst->remove(block);
- progress = true;
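+ /* Add the low 16 bits of the high partial product into the high 16
+ * bits of the low partial product (which already lives in inst->dst),
+ * using word regioning in place of the shl described above.
+ */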
+ fs_reg dst = inst->dst;
+ dst.type = BRW_REGISTER_TYPE_UW;
+ dst.subreg_offset = 2;
+ dst.stride = 2;
+
+ high.type = BRW_REGISTER_TYPE_UW;
+ high.stride = 2;
+
+ low.type = BRW_REGISTER_TYPE_UW;
+ low.subreg_offset = 2;
+ low.stride = 2;
+
+ ibld.ADD(dst, low, high);
+
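+ /* Now that the full 32-bit result is in inst->dst, re-apply the
+ * original conditional mod with a MOV to the null register.
+ */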
+ if (inst->conditional_mod) {
+ fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
+ set_condmod(inst->conditional_mod,
+ ibld.MOV(null, inst->dst));
+ }
}
+
+ inst->remove(block);
+ progress = true;
}
if (progress)
if (inst->conditional_mod) {
fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
if (!inst->predicate &&
- (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
+ (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
inst->opcode != BRW_OPCODE_IF &&
inst->opcode != BRW_OPCODE_WHILE))) {
fprintf(file, ".f0.%d", inst->flag_subreg);
}
fprintf(file, "(%d) ", inst->exec_size);
+ if (inst->mlen) {
+ fprintf(file, "(mlen: %d) ", inst->mlen);
+ }
switch (inst->dst.file) {
case GRF:
(stage == MESA_SHADER_FRAGMENT) ?
((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
- assert(brw->gen >= 6);
+ assert(devinfo->gen >= 6);
/* R0-1: masks, pixel X/Y coordinates. */
payload.num_regs = 2;
/* R32: MSAA input coverage mask */
if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
- assert(brw->gen >= 7);
+ assert(devinfo->gen >= 7);
payload.sample_mask_in_reg = payload.num_regs;
payload.num_regs++;
if (dispatch_width == 16) {
payload.num_regs = 2;
}
+void
+fs_visitor::setup_cs_payload()
+{
+ assert(devinfo->gen >= 7);
+
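+ /* R0: the thread payload header (all the CS payload we use for now). */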
+ payload.num_regs = 1;
+}
+
void
fs_visitor::assign_binding_table_offsets()
{
void
fs_visitor::optimize()
{
- const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
+ /* bld is the common builder object pointing at the end of the program we
+ * used to translate it into i965 IR. For the optimization and lowering
+ * passes that come next, any code added after the end of the program
+ * without explicitly calling fs_builder::at() clearly points at a mistake.
+ * Ideally optimization passes wouldn't be part of the visitor so they
+ * wouldn't have access to bld at all, but they do, so just in case some
+ * pass forgets to ask for a location explicitly, set it to NULL here to
+ * make it trip.
+ */
+ bld = bld.at(NULL, NULL);
split_virtual_grfs();
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
char filename[64]; \
snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
- stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
+ stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
\
- backend_visitor::dump_instructions(filename); \
+ backend_shader::dump_instructions(filename); \
} \
\
progress = progress || this_progress; \
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
char filename[64];
snprintf(filename, 64, "%s%d-%04d-00-start",
- stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
+ stage_abbrev, dispatch_width,
+ shader_prog ? shader_prog->Name : 0);
- backend_visitor::dump_instructions(filename);
+ backend_shader::dump_instructions(filename);
}
bool progress;
OPT(opt_register_renaming);
OPT(opt_redundant_discard_jumps);
OPT(opt_saturate_propagation);
+ OPT(opt_zero_samples);
OPT(register_coalesce);
OPT(compute_to_mrf);
+ OPT(eliminate_find_live_channel);
OPT(compact_virtual_grfs);
} while (progress);
pass_num = 0;
+ OPT(opt_sampler_eot);
+
if (OPT(lower_load_payload)) {
split_virtual_grfs();
OPT(register_coalesce);
}
OPT(opt_combine_constants);
+ OPT(lower_integer_multiplication);
lower_uniform_pull_constant_loads();
}
}
if (!allocated_without_spills) {
- const char *stage_name = stage == MESA_SHADER_VERTEX ?
- "Vertex" : "Fragment";
-
/* We assume that any spilling is worse than just dropping back to
* SIMD8. There's probably actually some intermediate point where
* SIMD16 with a couple of spills is still better.
fail("Failure to register allocate. Reduce number of "
"live scalar values to avoid this.");
} else {
- perf_debug("%s shader triggered register spilling. "
- "Try reducing the number of live scalar values to "
- "improve performance.\n", stage_name);
+ compiler->shader_perf_log(log_data,
+ "%s shader triggered register spilling. "
+ "Try reducing the number of live scalar "
+ "values to improve performance.\n",
+ stage_name);
}
/* Since we're out of heuristics, just go spill registers until we
}
bool
-fs_visitor::run_vs()
+fs_visitor::run_vs(gl_clip_plane *clip_planes)
{
assert(stage == MESA_SHADER_VERTEX);
- assign_common_binding_table_offsets(0);
+ if (prog_data->map_entries == NULL)
+ assign_common_binding_table_offsets(0);
setup_vs_payload();
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_begin();
- if (getenv("INTEL_USE_NIR") != NULL) {
- emit_nir_code();
- } else {
- foreach_in_list(ir_instruction, ir, shader->base.ir) {
- base_ir = ir;
- this->result = reg_undef;
- ir->accept(this);
- }
- base_ir = NULL;
- }
+ emit_nir_code();
if (failed)
return false;
- emit_urb_writes();
+ emit_urb_writes(clip_planes);
+
+ if (shader_time_index >= 0)
+ emit_shader_time_end();
calculate_cfg();
}
bool
-fs_visitor::run_fs()
+fs_visitor::run_fs(bool do_rep_send)
{
brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
sanity_param_count = prog->Parameters->NumParameters;
- assign_binding_table_offsets();
+ if (prog_data->map_entries == NULL)
+ assign_binding_table_offsets();
- if (brw->gen >= 6)
+ if (devinfo->gen >= 6)
setup_payload_gen6();
else
setup_payload_gen4();
if (0) {
emit_dummy_fs();
- } else if (brw->use_rep_send && dispatch_width == 16) {
+ } else if (do_rep_send) {
+ assert(dispatch_width == 16);
emit_repclear_shader();
} else {
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_begin();
calculate_urb_setup();
if (prog->InputsRead > 0) {
- if (brw->gen < 6)
+ if (devinfo->gen < 6)
emit_interpolation_setup_gen4();
else
emit_interpolation_setup_gen6();
* Initialize it with the dispatched pixels.
*/
if (wm_prog_data->uses_kill) {
- fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+ fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
discard_init->flag_subreg = 1;
}
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
- if (shader) {
- if (getenv("INTEL_USE_NIR") != NULL) {
- emit_nir_code();
- } else {
- foreach_in_list(ir_instruction, ir, shader->base.ir) {
- base_ir = ir;
- this->result = reg_undef;
- ir->accept(this);
- }
- }
- } else {
- emit_fragment_program_code();
- }
- base_ir = NULL;
+ emit_nir_code();
+
if (failed)
return false;
- emit(FS_OPCODE_PLACEHOLDER_HALT);
+ if (wm_prog_data->uses_kill)
+ bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
if (wm_key->alpha_test_func)
emit_alpha_test();
emit_fb_writes();
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_end();
calculate_cfg();
return !failed;
}
+bool
+fs_visitor::run_cs()
+{
+ assert(stage == MESA_SHADER_COMPUTE);
+ assert(shader);
+
+ sanity_param_count = prog->Parameters->NumParameters;
+
+ assign_common_binding_table_offsets(0);
+
+ setup_cs_payload();
+
+ if (shader_time_index >= 0)
+ emit_shader_time_begin();
+
+ emit_nir_code();
+
+ if (failed)
+ return false;
+
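+ /* Compute shaders have no URB or framebuffer writes; the thread simply
+ * signals end-of-thread with a terminate message.
+ */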
+ emit_cs_terminate();
+
+ if (shader_time_index >= 0)
+ emit_shader_time_end();
+
+ calculate_cfg();
+
+ optimize();
+
+ assign_curb_setup();
+
+ fixup_3src_null_dest();
+ allocate_registers();
+
+ if (failed)
+ return false;
+
+ /* If any state parameters were appended, then ParameterValues could have
+ * been realloced, in which case the driver uniform storage set up by
+ * _mesa_associate_uniform_storage() would point to freed memory. Make
+ * sure that didn't happen.
+ */
+ assert(sanity_param_count == prog->Parameters->NumParameters);
+
+ return !failed;
+}
+
const unsigned *
brw_wm_fs_emit(struct brw_context *brw,
void *mem_ctx,
if (unlikely(INTEL_DEBUG & DEBUG_WM))
brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
+ int st_index8 = -1, st_index16 = -1;
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+ st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
+ st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
+ }
+
/* Now the main event: Visit the shader IR and generate our FS IR for it.
*/
- fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
- if (!v.run_fs()) {
+ fs_visitor v(brw->intelScreen->compiler, brw,
+ mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
+ prog, &fp->Base, 8, st_index8);
+ if (!v.run_fs(false /* do_rep_send */)) {
if (prog) {
prog->LinkStatus = false;
ralloc_strcat(&prog->InfoLog, v.fail_msg);
}
cfg_t *simd16_cfg = NULL;
- fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
- if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
- brw->use_rep_send)) {
+ fs_visitor v2(brw->intelScreen->compiler, brw,
+ mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
+ prog, &fp->Base, 16, st_index16);
+ if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
if (!v.simd16_unsupported) {
/* Try a SIMD16 compile */
v2.import_uniforms(&v);
- if (!v2.run_fs()) {
- perf_debug("SIMD16 shader failed to compile, falling back to "
- "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
+ if (!v2.run_fs(brw->use_rep_send)) {
+ perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
} else {
simd16_cfg = v2.cfg;
}
- } else {
- perf_debug("SIMD16 shader unsupported, falling back to "
- "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
}
}
cfg_t *simd8_cfg;
int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
- if (no_simd8 && simd16_cfg) {
+ if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
simd8_cfg = NULL;
prog_data->no_8 = true;
} else {
prog_data->no_8 = false;
}
- fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
- &fp->Base, v.runtime_check_aads_emit, "FS");
+ fs_generator g(brw->intelScreen->compiler, brw,
+ mem_ctx, (void *) key, &prog_data->base,
+ &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
char *name;
BRW_FS_VARYING_INPUT_MASK) > 16)
key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
- const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
- unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
- for (unsigned i = 0; i < sampler_count; i++) {
- if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
- /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
- key.tex.swizzles[i] =
- MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
- } else {
- /* Color sampler: assume no swizzling. */
- key.tex.swizzles[i] = SWIZZLE_XYZW;
- }
- }
+ brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
if (fp->Base.InputsRead & VARYING_BIT_POS) {
key.drawable_height = ctx->DrawBuffer->Height;
uint32_t old_prog_offset = brw->wm.base.prog_offset;
struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
- bool success = do_wm_prog(brw, shader_prog, bfp, &key);
+ bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
brw->wm.base.prog_offset = old_prog_offset;
brw->wm.prog_data = old_prog_data;
return success;
}
+
+void
+brw_setup_tex_for_precompile(struct brw_context *brw,
+ struct brw_sampler_prog_key_data *tex,
+ struct gl_program *prog)
+{
+ const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
+ unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
+ for (unsigned i = 0; i < sampler_count; i++) {
+ if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
+ /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
+ tex->swizzles[i] =
+ MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
+ } else {
+ /* Color sampler: assume no swizzling. */
+ tex->swizzles[i] = SWIZZLE_XYZW;
+ }
+ }
+}