#include "glsl/glsl_types.h"
#include "program/sampler.h"
+using namespace brw;
+
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg *src, unsigned sources)
assert(dst.file != IMM && dst.file != UNIFORM);
- /* If exec_size == 0, try to guess it from the registers. Since all
- * manner of things may use hardware registers, we first try to guess
- * based on GRF registers. If this fails, we will go ahead and take the
- * width from the destination register.
- */
- if (this->exec_size == 0) {
- if (dst.file == GRF) {
- this->exec_size = dst.width;
- } else {
- for (unsigned i = 0; i < sources; ++i) {
- if (src[i].file != GRF && src[i].file != ATTR)
- continue;
-
- if (this->exec_size <= 1)
- this->exec_size = src[i].width;
- assert(src[i].width == 1 || src[i].width == this->exec_size);
- }
- }
-
- if (this->exec_size == 0 && dst.file != BAD_FILE)
- this->exec_size = dst.width;
- }
assert(this->exec_size != 0);
this->conditional_mod = BRW_CONDITIONAL_NONE;
case HW_REG:
case MRF:
case ATTR:
- this->regs_written =
- DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
+ this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
+ REG_SIZE);
break;
case BAD_FILE:
this->regs_written = 0;
init(opcode, exec_size, reg_undef, NULL, 0);
}
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
{
- init(opcode, 0, dst, NULL, 0);
+ init(opcode, exec_size, dst, NULL, 0);
}
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
init(opcode, exec_size, dst, src, 1);
}
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
-{
- const fs_reg src[1] = { src0 };
- init(opcode, 0, dst, src, 1);
-}
-
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg &src0, const fs_reg &src1)
{
init(opcode, exec_size, dst, src, 2);
}
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
- const fs_reg &src1)
-{
- const fs_reg src[2] = { src0, src1 };
- init(opcode, 0, dst, src, 2);
-}
-
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
init(opcode, exec_size, dst, src, 3);
}
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
- const fs_reg &src1, const fs_reg &src2)
-{
- const fs_reg src[3] = { src0, src1, src2 };
- init(opcode, 0, dst, src, 3);
-}
-
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
- const fs_reg src[], unsigned sources)
-{
- init(opcode, 0, dst, src, sources);
-}
-
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
const fs_reg src[], unsigned sources)
{
}
}
-#define ALU1(op) \
- fs_inst * \
- fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
- { \
- return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
- }
-
-#define ALU2(op) \
- fs_inst * \
- fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
- const fs_reg &src1) \
- { \
- return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
- }
-
-#define ALU2_ACC(op) \
- fs_inst * \
- fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
- const fs_reg &src1) \
- { \
- fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
- inst->writes_accumulator = true; \
- return inst; \
- }
-
-#define ALU3(op) \
- fs_inst * \
- fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
- const fs_reg &src1, const fs_reg &src2) \
- { \
- return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
- }
-
-ALU1(NOT)
-ALU1(MOV)
-ALU1(FRC)
-ALU1(RNDD)
-ALU1(RNDE)
-ALU1(RNDZ)
-ALU2(ADD)
-ALU2(MUL)
-ALU2_ACC(MACH)
-ALU2(AND)
-ALU2(OR)
-ALU2(XOR)
-ALU2(SHL)
-ALU2(SHR)
-ALU2(ASR)
-ALU3(LRP)
-ALU1(BFREV)
-ALU3(BFE)
-ALU2(BFI1)
-ALU3(BFI2)
-ALU1(FBH)
-ALU1(FBL)
-ALU1(CBIT)
-ALU3(MAD)
-ALU2_ACC(ADDC)
-ALU2_ACC(SUBB)
-ALU2(SEL)
-ALU2(MAC)
-
-/** Gen4 predicated IF. */
-fs_inst *
-fs_visitor::IF(enum brw_predicate predicate)
-{
- fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
- inst->predicate = predicate;
- return inst;
-}
-
-/** Gen6 IF with embedded comparison. */
-fs_inst *
-fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
- enum brw_conditional_mod condition)
-{
- assert(devinfo->gen == 6);
- fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
- reg_null_d, src0, src1);
- inst->conditional_mod = condition;
- return inst;
-}
-
-/**
- * CMP: Sets the low bit of the destination channels with the result
- * of the comparison, while the upper bits are undefined, and updates
- * the flag register with the packed 16 bits of the result.
- */
-fs_inst *
-fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
- enum brw_conditional_mod condition)
-{
- fs_inst *inst;
-
- /* Take the instruction:
- *
- * CMP null<d> src0<f> src1<f>
- *
- * Original gen4 does type conversion to the destination type before
- * comparison, producing garbage results for floating point comparisons.
- *
- * The destination type doesn't matter on newer generations, so we set the
- * type to match src0 so we can compact the instruction.
- */
- dst.type = src0.type;
- if (dst.file == HW_REG)
- dst.fixed_hw_reg.type = dst.type;
-
- resolve_ud_negate(&src0);
- resolve_ud_negate(&src1);
-
- inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
- inst->conditional_mod = condition;
-
- return inst;
-}
-
-fs_inst *
-fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
- int header_size)
-{
- assert(dst.width % 8 == 0);
- fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
- dst, src, sources);
- inst->header_size = header_size;
-
- for (int i = 0; i < header_size; i++)
- assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
- inst->regs_written = header_size;
-
- for (int i = header_size; i < sources; ++i)
- assert(src[i].file != GRF || src[i].width == dst.width);
- inst->regs_written += (sources - header_size) * (dst.width / 8);
-
- return inst;
-}
-
-exec_list
-fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
+void
+fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
+ const fs_reg &dst,
const fs_reg &surf_index,
const fs_reg &varying_offset,
uint32_t const_offset)
{
- exec_list instructions;
- fs_inst *inst;
-
/* We have our constant surface use a pitch of 4 bytes, so our index can
* be any component of a vector, and then we load 4 contiguous
* components starting from that.
* the redundant ones.
*/
fs_reg vec4_offset = vgrf(glsl_type::int_type);
- instructions.push_tail(ADD(vec4_offset,
- varying_offset, fs_reg(const_offset & ~3)));
+ bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
int scale = 1;
- if (devinfo->gen == 4 && dst.width == 8) {
+ if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
/* Pre-gen5, we can either use a SIMD8 message that requires (header,
* u, v, r) as parameters, or we can just use the SIMD16 message
* consisting of (header, u). We choose the second, at the cost of a
else
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
- assert(dst.width % 8 == 0);
- int regs_written = 4 * (dst.width / 8) * scale;
- fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
- dst.type, dst.width);
- inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
+ int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
+ fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
+ fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
inst->regs_written = regs_written;
- instructions.push_tail(inst);
if (devinfo->gen < 7) {
inst->base_mrf = 13;
if (devinfo->gen == 4)
inst->mlen = 3;
else
- inst->mlen = 1 + dispatch_width / 8;
+ inst->mlen = 1 + bld.dispatch_width() / 8;
}
- fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
- instructions.push_tail(MOV(dst, result));
-
- return instructions;
+ bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
}
/**
* A helper for MOV generation for fixing up broken hardware SEND dependency
* handling.
*/
-fs_inst *
-fs_visitor::DEP_RESOLVE_MOV(int grf)
+void
+fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
{
- fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
-
- inst->ir = NULL;
- inst->annotation = "send dependency resolve";
-
/* The caller always wants uncompressed to emit the minimal extra
* dependencies, and to avoid having to deal with aligning its regs to 2.
*/
- inst->exec_size = 8;
+ const fs_builder ubld = bld.annotate("send dependency resolve")
+ .half(0);
- return inst;
+ ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
}
bool
for (int i = 0; i < this->sources; i++) {
reg.type = this->src[i].type;
- reg.width = this->src[i].width;
if (!this->src[i].equals(reg))
return false;
- reg = ::offset(reg, 1);
+
+ if (i < this->header_size) {
+ reg.reg_offset += 1;
+ } else {
+ reg.reg_offset += this->exec_size / 8;
+ }
}
return true;
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_F;
+ this->stride = 0;
this->fixed_hw_reg.dw1.f = f;
- this->width = 1;
}
/** Immediate value constructor. */
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_D;
+ this->stride = 0;
this->fixed_hw_reg.dw1.d = i;
- this->width = 1;
}
/** Immediate value constructor. */
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_UD;
+ this->stride = 0;
this->fixed_hw_reg.dw1.ud = u;
- this->width = 1;
}
/** Vector float immediate value constructor. */
this->file = HW_REG;
this->fixed_hw_reg = fixed_hw_reg;
this->type = fixed_hw_reg.type;
- this->width = 1 << fixed_hw_reg.width;
}
bool
abs == r.abs &&
!reladdr && !r.reladdr &&
memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
- width == r.width &&
stride == r.stride);
}
return stride == 1;
}
+unsigned
+fs_reg::component_size(unsigned width) const
+{
+ const unsigned stride = (file != HW_REG ? this->stride :
+ fixed_hw_reg.hstride == 0 ? 0 :
+ 1 << (fixed_hw_reg.hstride - 1));
+ return MAX2(width * stride, 1) * type_sz(type);
+}
+
int
fs_visitor::type_size(const struct glsl_type *type)
{
return 0;
case GLSL_TYPE_ATOMIC_UINT:
return 0;
+ case GLSL_TYPE_SUBROUTINE:
+ return 1;
case GLSL_TYPE_IMAGE:
case GLSL_TYPE_VOID:
case GLSL_TYPE_ERROR:
* the destination of the MOV, with extra parameters set.
*/
fs_reg
-fs_visitor::get_timestamp(fs_inst **out_mov)
+fs_visitor::get_timestamp(const fs_builder &bld)
{
assert(devinfo->gen >= 7);
0),
BRW_REGISTER_TYPE_UD));
- fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
+ fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
- fs_inst *mov = MOV(dst, ts);
/* We want to read the 3 fields we care about even if it's not enabled in
* the dispatch.
*/
- mov->force_writemask_all = true;
+ bld.group(4, 0).exec_all().MOV(dst, ts);
/* The caller wants the low 32 bits of the timestamp. Since it's running
* at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
*/
dst.set_smear(0);
- *out_mov = mov;
return dst;
}
void
fs_visitor::emit_shader_time_begin()
{
- current_annotation = "shader time start";
- fs_inst *mov;
- shader_start_time = get_timestamp(&mov);
- emit(mov);
+ shader_start_time = get_timestamp(bld.annotate("shader time start"));
}
void
fs_visitor::emit_shader_time_end()
{
- current_annotation = "shader time end";
-
- enum shader_time_shader_type type, written_type, reset_type;
- switch (stage) {
- case MESA_SHADER_VERTEX:
- type = ST_VS;
- written_type = ST_VS_WRITTEN;
- reset_type = ST_VS_RESET;
- break;
- case MESA_SHADER_GEOMETRY:
- type = ST_GS;
- written_type = ST_GS_WRITTEN;
- reset_type = ST_GS_RESET;
- break;
- case MESA_SHADER_FRAGMENT:
- if (dispatch_width == 8) {
- type = ST_FS8;
- written_type = ST_FS8_WRITTEN;
- reset_type = ST_FS8_RESET;
- } else {
- assert(dispatch_width == 16);
- type = ST_FS16;
- written_type = ST_FS16_WRITTEN;
- reset_type = ST_FS16_RESET;
- }
- break;
- case MESA_SHADER_COMPUTE:
- type = ST_CS;
- written_type = ST_CS_WRITTEN;
- reset_type = ST_CS_RESET;
- break;
- default:
- unreachable("fs_visitor::emit_shader_time_end missing code");
- }
-
/* Insert our code just before the final SEND with EOT. */
exec_node *end = this->instructions.get_tail();
assert(end && ((fs_inst *) end)->eot);
+ const fs_builder ibld = bld.annotate("shader time end")
+ .exec_all().at(NULL, end);
- fs_inst *tm_read;
- fs_reg shader_end_time = get_timestamp(&tm_read);
- end->insert_before(tm_read);
+ fs_reg shader_end_time = get_timestamp(ibld);
/* Check that there weren't any timestamp reset events (assuming these
* were the only two timestamp reads that happened).
*/
fs_reg reset = shader_end_time;
reset.set_smear(2);
- fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
- test->conditional_mod = BRW_CONDITIONAL_Z;
- test->force_writemask_all = true;
- end->insert_before(test);
- end->insert_before(IF(BRW_PREDICATE_NORMAL));
+ set_condmod(BRW_CONDITIONAL_Z,
+ ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
+ ibld.IF(BRW_PREDICATE_NORMAL);
fs_reg start = shader_start_time;
start.negate = true;
- fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
+ fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
diff.set_smear(0);
- fs_inst *add = ADD(diff, start, shader_end_time);
- add->force_writemask_all = true;
- end->insert_before(add);
+
+ const fs_builder cbld = ibld.group(1, 0);
+ cbld.group(1, 0).ADD(diff, start, shader_end_time);
/* If there were no instructions between the two timestamp gets, the diff
* is 2 cycles. Remove that overhead, so I can forget about that when
* trying to determine the time taken for single instructions.
*/
- add = ADD(diff, diff, fs_reg(-2u));
- add->force_writemask_all = true;
- end->insert_before(add);
-
- end->insert_before(SHADER_TIME_ADD(type, diff));
- end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
- end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
- end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
- end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
+ cbld.ADD(diff, diff, fs_reg(-2u));
+ SHADER_TIME_ADD(cbld, 0, diff);
+ SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
+ ibld.emit(BRW_OPCODE_ELSE);
+ SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
+ ibld.emit(BRW_OPCODE_ENDIF);
}
-fs_inst *
-fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
+void
+fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
+ int shader_time_subindex,
+ fs_reg value)
{
- int shader_time_index =
- brw_get_shader_time_index(brw, shader_prog, prog, type);
- fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
+ int index = shader_time_index * 3 + shader_time_subindex;
+ fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
fs_reg payload;
if (dispatch_width == 8)
else
payload = vgrf(glsl_type::uint_type);
- return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
- fs_reg(), payload, offset, value);
+ bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
}
void
* During a SIMD16 compile (if one happens anyway), this just calls fail().
*/
void
-fs_visitor::no16(const char *format, ...)
+fs_visitor::no16(const char *msg)
{
- va_list va;
-
- va_start(va, format);
-
if (dispatch_width == 16) {
- vfail(format, va);
+ fail("%s", msg);
} else {
simd16_unsupported = true;
- if (brw->perf_debug) {
- if (no16_msg)
- ralloc_vasprintf_append(&no16_msg, format, va);
- else
- no16_msg = ralloc_vasprintf(mem_ctx, format, va);
- }
+ compiler->shader_perf_log(log_data,
+ "SIMD16 shader failed to compile: %s", msg);
}
-
- va_end(va);
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
- const fs_reg &src1)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
- const fs_reg &src1, const fs_reg &src2)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
- fs_reg src[], int sources)
-{
- return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
}
/**
fs_inst::is_partial_write() const
{
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
- (this->dst.width * type_sz(this->dst.type)) < 32 ||
+ (this->exec_size * type_sz(this->dst.type)) < 32 ||
!this->dst.is_contiguous());
}
+unsigned
+fs_inst::components_read(unsigned i) const
+{
+ switch (opcode) {
+ case FS_OPCODE_LINTERP:
+ if (i == 0)
+ return 2;
+ else
+ return 1;
+
+ case FS_OPCODE_PIXEL_X:
+ case FS_OPCODE_PIXEL_Y:
+ assert(i == 0);
+ return 2;
+
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ assert(src[6].file == IMM);
+ /* First/second FB write color. */
+ if (i < 2)
+ return src[6].fixed_hw_reg.dw1.ud;
+ else
+ return 1;
+
+ case SHADER_OPCODE_TEX_LOGICAL:
+ case SHADER_OPCODE_TXD_LOGICAL:
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ case SHADER_OPCODE_LOD_LOGICAL:
+ case SHADER_OPCODE_TG4_LOGICAL:
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ assert(src[8].file == IMM && src[9].file == IMM);
+ /* Texture coordinates. */
+ if (i == 0)
+ return src[8].fixed_hw_reg.dw1.ud;
+ /* Texture derivatives. */
+ else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
+ return src[9].fixed_hw_reg.dw1.ud;
+ /* Texture offset. */
+ else if (i == 7)
+ return 2;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ assert(src[3].file == IMM);
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].fixed_hw_reg.dw1.ud;
+ /* Surface operation source (ignored for reads). */
+ else if (i == 1)
+ return 0;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ assert(src[3].file == IMM &&
+ src[4].file == IMM);
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].fixed_hw_reg.dw1.ud;
+ /* Surface operation source. */
+ else if (i == 1)
+ return src[4].fixed_hw_reg.dw1.ud;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
+ assert(src[3].file == IMM &&
+ src[4].file == IMM);
+ const unsigned op = src[4].fixed_hw_reg.dw1.ud;
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].fixed_hw_reg.dw1.ud;
+ /* Surface operation source. */
+ else if (i == 1 && op == BRW_AOP_CMPWR)
+ return 2;
+ else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
+ op == BRW_AOP_PREDEC))
+ return 0;
+ else
+ return 1;
+ }
+
+ default:
+ return 1;
+ }
+}
+
int
fs_inst::regs_read(int arg) const
{
- if (is_tex() && arg == 0 && src[0].file == GRF) {
- return mlen;
- } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
- return mlen;
- } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
- return mlen;
- } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
- return mlen;
- } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
- return mlen;
- } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
- return mlen;
- } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
- return mlen;
- } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
- return mlen;
- } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
- return mlen;
- } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
- return mlen;
- } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
- return exec_size / 4;
+ switch (opcode) {
+ case FS_OPCODE_FB_WRITE:
+ case SHADER_OPCODE_URB_WRITE_SIMD8:
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+ if (arg == 0)
+ return mlen;
+ break;
+
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+ /* The payload is actually stored in src1 */
+ if (arg == 1)
+ return mlen;
+ break;
+
+ case FS_OPCODE_LINTERP:
+ if (arg == 1)
+ return 1;
+ break;
+
+ case SHADER_OPCODE_LOAD_PAYLOAD:
+ if (arg < this->header_size)
+ return 1;
+ break;
+
+ case CS_OPCODE_CS_TERMINATE:
+ return 1;
+
+ default:
+ if (is_tex() && arg == 0 && src[0].file == GRF)
+ return mlen;
+ break;
}
switch (src[arg].file) {
case BAD_FILE:
+ return 0;
case UNIFORM:
case IMM:
return 1;
case GRF:
case HW_REG:
- if (src[arg].stride == 0) {
- return 1;
- } else {
- int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
- return (size + 31) / 32;
- }
+ return DIV_ROUND_UP(components_read(arg) *
+ src[arg].component_size(exec_size),
+ REG_SIZE);
case MRF:
unreachable("MRF registers are not allowed as sources");
default:
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
return inst->mlen;
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
- return 2;
+ return inst->mlen;
case SHADER_OPCODE_UNTYPED_ATOMIC:
case SHADER_OPCODE_UNTYPED_SURFACE_READ:
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
{
int reg_width = dispatch_width / 8;
return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
- brw_type_for_base_type(type), dispatch_width);
-}
-
-fs_reg
-fs_visitor::vgrf(int num_components)
-{
- int reg_width = dispatch_width / 8;
- return fs_reg(GRF, alloc.allocate(num_components * reg_width),
- BRW_REGISTER_TYPE_F, dispatch_width);
+ brw_type_for_base_type(type));
}
/** Fixed HW reg constructor. */
this->file = file;
this->reg = reg;
this->type = BRW_REGISTER_TYPE_F;
-
- switch (file) {
- case UNIFORM:
- this->width = 1;
- break;
- default:
- this->width = 8;
- }
+ this->stride = (file == UNIFORM ? 0 : 1);
}
/** Fixed HW reg constructor. */
this->file = file;
this->reg = reg;
this->type = type;
-
- switch (file) {
- case UNIFORM:
- this->width = 1;
- break;
- default:
- this->width = 8;
- }
-}
-
-/** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
- uint8_t width)
-{
- init();
- this->file = file;
- this->reg = reg;
- this->type = type;
- this->width = width;
-}
-
-fs_reg *
-fs_visitor::variable_storage(ir_variable *var)
-{
- return (fs_reg *)hash_table_find(this->variable_ht, var);
-}
-
-void
-import_uniforms_callback(const void *key,
- void *data,
- void *closure)
-{
- struct hash_table *dst_ht = (struct hash_table *)closure;
- const fs_reg *reg = (const fs_reg *)data;
-
- if (reg->file != UNIFORM)
- return;
-
- hash_table_insert(dst_ht, data, key);
+ this->stride = (file == UNIFORM ? 0 : 1);
}
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
void
fs_visitor::import_uniforms(fs_visitor *v)
{
- hash_table_call_foreach(v->variable_ht,
- import_uniforms_callback,
- variable_ht);
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
this->param_size = v->param_size;
}
-/* Our support for uniforms is piggy-backed on the struct
- * gl_fragment_program, because that's where the values actually
- * get stored, rather than in some global gl_shader_program uniform
- * store.
- */
-void
-fs_visitor::setup_uniform_values(ir_variable *ir)
-{
- int namelen = strlen(ir->name);
-
- /* The data for our (non-builtin) uniforms is stored in a series of
- * gl_uniform_driver_storage structs for each subcomponent that
- * glGetUniformLocation() could name. We know it's been set up in the same
- * order we'd walk the type, so walk the list of storage and find anything
- * with our name, or the prefix of a component that starts with our name.
- */
- unsigned params_before = uniforms;
- for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
- struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
-
- if (strncmp(ir->name, storage->name, namelen) != 0 ||
- (storage->name[namelen] != 0 &&
- storage->name[namelen] != '.' &&
- storage->name[namelen] != '[')) {
- continue;
- }
-
- unsigned slots = storage->type->component_slots();
- if (storage->array_elements)
- slots *= storage->array_elements;
-
- for (unsigned i = 0; i < slots; i++) {
- stage_prog_data->param[uniforms++] = &storage->storage[i];
- }
- }
-
- /* Make sure we actually initialized the right amount of stuff here. */
- assert(params_before + ir->type->component_slots() == uniforms);
- (void)params_before;
-}
-
-
-/* Our support for builtin uniforms is even scarier than non-builtin.
- * It sits on top of the PROG_STATE_VAR parameters that are
- * automatically updated from GL context state.
- */
void
-fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
+fs_visitor::setup_vector_uniform_values(const gl_constant_value *values, unsigned n)
{
- const ir_state_slot *const slots = ir->get_state_slots();
- assert(slots != NULL);
+ static const gl_constant_value zero = { 0 };
- for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
- /* This state reference has already been setup by ir_to_mesa, but we'll
- * get the same index back here.
- */
- int index = _mesa_add_state_reference(this->prog->Parameters,
- (gl_state_index *)slots[i].tokens);
-
- /* Add each of the unique swizzles of the element as a parameter.
- * This'll end up matching the expected layout of the
- * array/matrix/structure we're trying to fill in.
- */
- int last_swiz = -1;
- for (unsigned int j = 0; j < 4; j++) {
- int swiz = GET_SWZ(slots[i].swizzle, j);
- if (swiz == last_swiz)
- break;
- last_swiz = swiz;
+ for (unsigned i = 0; i < n; ++i)
+ stage_prog_data->param[uniforms++] = &values[i];
- stage_prog_data->param[uniforms++] =
- &prog->Parameters->ParameterValues[index][swiz];
- }
- }
+ for (unsigned i = n; i < 4; ++i)
+ stage_prog_data->param[uniforms++] = &zero;
}
fs_reg *
/* gl_FragCoord.x */
if (pixel_center_integer) {
- emit(MOV(wpos, this->pixel_x));
+ bld.MOV(wpos, this->pixel_x);
} else {
- emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
+ bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
}
- wpos = offset(wpos, 1);
+ wpos = offset(wpos, bld, 1);
/* gl_FragCoord.y */
if (!flip && pixel_center_integer) {
- emit(MOV(wpos, this->pixel_y));
+ bld.MOV(wpos, this->pixel_y);
} else {
fs_reg pixel_y = this->pixel_y;
- float offset = (pixel_center_integer ? 0.0 : 0.5);
+ float offset = (pixel_center_integer ? 0.0f : 0.5f);
if (flip) {
pixel_y.negate = true;
- offset += key->drawable_height - 1.0;
+ offset += key->drawable_height - 1.0f;
}
- emit(ADD(wpos, pixel_y, fs_reg(offset)));
+ bld.ADD(wpos, pixel_y, fs_reg(offset));
}
- wpos = offset(wpos, 1);
+ wpos = offset(wpos, bld, 1);
/* gl_FragCoord.z */
if (devinfo->gen >= 6) {
- emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
+ bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
} else {
- emit(FS_OPCODE_LINTERP, wpos,
+ bld.emit(FS_OPCODE_LINTERP, wpos,
this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
interp_reg(VARYING_SLOT_POS, 2));
}
- wpos = offset(wpos, 1);
+ wpos = offset(wpos, bld, 1);
/* gl_FragCoord.w: Already set up in emit_interpolation */
- emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
+ bld.MOV(wpos, this->wpos_w);
return reg;
}
*/
barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
}
- return emit(FS_OPCODE_LINTERP, attr,
- this->delta_xy[barycoord_mode], interp);
+ return bld.emit(FS_OPCODE_LINTERP, attr,
+ this->delta_xy[barycoord_mode], interp);
}
void
/* If there's no incoming setup data for this slot, don't
* emit interpolation for it.
*/
- attr = offset(attr, type->vector_elements);
+ attr = offset(attr, bld, type->vector_elements);
location++;
continue;
}
struct brw_reg interp = interp_reg(location, k);
interp = suboffset(interp, 3);
interp.type = attr.type;
- emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
- attr = offset(attr, 1);
+ bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
+ attr = offset(attr, bld, 1);
}
} else {
/* Smooth/noperspective interpolation case. */
* unlit, replace the centroid data with non-centroid
* data.
*/
- emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+ bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
fs_inst *inst;
inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
mod_sample || key->persample_shading);
}
if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
- emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
+ bld.MUL(attr, attr, this->pixel_w);
}
- attr = offset(attr, 1);
+ attr = offset(attr, bld, 1);
}
}
fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
g0.negate = true;
- emit(ASR(*reg, g0, fs_reg(15)));
+ bld.ASR(*reg, g0, fs_reg(15));
} else {
/* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
* a boolean result from this (1/true or 0/false).
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
g1_6.negate = true;
- emit(ASR(*reg, g1_6, fs_reg(31)));
+ bld.ASR(*reg, g1_6, fs_reg(31));
}
return reg;
if (key->compute_pos_offset) {
/* Convert int_sample_pos to floating point */
- emit(MOV(dst, int_sample_pos));
+ bld.MOV(dst, int_sample_pos);
/* Scale to the range [0, 1] */
- emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
+ bld.MUL(dst, dst, fs_reg(1 / 16.0f));
}
else {
/* From ARB_sample_shading specification:
* rasterization is disabled, gl_SamplePosition will always be
* (0.5, 0.5).
*/
- emit(MOV(dst, fs_reg(0.5f)));
+ bld.MOV(dst, fs_reg(0.5f));
}
}
{
assert(devinfo->gen >= 6);
- this->current_annotation = "compute sample position";
+ const fs_builder abld = bld.annotate("compute sample position");
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
fs_reg pos = *reg;
fs_reg int_sample_x = vgrf(glsl_type::int_type);
BRW_REGISTER_TYPE_B), 16, 8, 2);
if (dispatch_width == 8) {
- emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
+ abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
} else {
- emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
- emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
- ->force_sechalf = true;
+ abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
+ abld.half(1).MOV(half(int_sample_x, 1),
+ fs_reg(suboffset(sample_pos_reg, 16)));
}
/* Compute gl_SamplePosition.x */
compute_sample_position(pos, int_sample_x);
- pos = offset(pos, 1);
+ pos = offset(pos, abld, 1);
if (dispatch_width == 8) {
- emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
+ abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
} else {
- emit(MOV(half(int_sample_y, 0),
- fs_reg(suboffset(sample_pos_reg, 1))));
- emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
- ->force_sechalf = true;
+ abld.half(0).MOV(half(int_sample_y, 0),
+ fs_reg(suboffset(sample_pos_reg, 1)));
+ abld.half(1).MOV(half(int_sample_y, 1),
+ fs_reg(suboffset(sample_pos_reg, 17)));
}
/* Compute gl_SamplePosition.y */
compute_sample_position(pos, int_sample_y);
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
assert(devinfo->gen >= 6);
- this->current_annotation = "compute sample id";
+ const fs_builder abld = bld.annotate("compute sample id");
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
if (key->compute_sample_id) {
* are sample 1 of subspan 0; the third group is sample 0 of
* subspan 1, and finally sample 1 of subspan 1.
*/
- fs_inst *inst;
- inst = emit(BRW_OPCODE_AND, t1,
- fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
- fs_reg(0xc0));
- inst->force_writemask_all = true;
- inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
- inst->force_writemask_all = true;
+ abld.exec_all()
+ .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+ fs_reg(0xc0));
+ abld.exec_all().SHR(t1, t1, fs_reg(5));
+
/* This works for both SIMD8 and SIMD16 */
- inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
- inst->force_writemask_all = true;
+ abld.exec_all()
+ .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
+
/* This special instruction takes care of setting vstride=1,
* width=4, hstride=0 of t2 during an ADD instruction.
*/
- emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
+ abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
} else {
/* As per GL_ARB_sample_shading specification:
* "When rendering to a non-multisample buffer, or if multisample
* rasterization is disabled, gl_SampleID will always be zero."
*/
- emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
+ abld.MOV(*reg, fs_reg(0));
}
return reg;
if (!src->abs && !src->negate)
return;
- fs_reg temp = retype(vgrf(1), src->type);
- emit(MOV(temp, *src));
+ fs_reg temp = bld.vgrf(src->type);
+ bld.MOV(temp, *src);
*src = temp;
}
-fs_reg
-fs_visitor::fix_math_operand(fs_reg src)
+void
+fs_visitor::emit_discard_jump()
{
- /* Can't do hstride == 0 args on gen6 math, so expand it out. We
- * might be able to do better by doing execsize = 1 math and then
- * expanding that result out, but we would need to be careful with
- * masking.
- *
- * The hardware ignores source modifiers (negate and abs) on math
- * instructions, so we also move to a temp to set those up.
- */
- if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
- !src.abs && !src.negate)
- return src;
+ assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
- /* Gen7 relaxes most of the above restrictions, but still can't use IMM
- * operands to math
+ /* For performance, after a discard, jump to the end of the
+ * shader if all relevant channels have been discarded.
*/
- if (devinfo->gen >= 7 && src.file != IMM)
- return src;
+ fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
+ discard_jump->flag_subreg = 1;
- fs_reg expanded = vgrf(glsl_type::float_type);
- expanded.type = src.type;
- emit(BRW_OPCODE_MOV, expanded, src);
- return expanded;
+ discard_jump->predicate = (dispatch_width == 8)
+ ? BRW_PREDICATE_ALIGN1_ANY8H
+ : BRW_PREDICATE_ALIGN1_ANY16H;
+ discard_jump->predicate_inverse = true;
}
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
-{
- switch (opcode) {
- case SHADER_OPCODE_RCP:
- case SHADER_OPCODE_RSQ:
- case SHADER_OPCODE_SQRT:
- case SHADER_OPCODE_EXP2:
- case SHADER_OPCODE_LOG2:
- case SHADER_OPCODE_SIN:
- case SHADER_OPCODE_COS:
- break;
- default:
- unreachable("not reached: bad math opcode");
- }
-
- /* Can't do hstride == 0 args to gen6 math, so expand it out. We
- * might be able to do better by doing execsize = 1 math and then
- * expanding that result out, but we would need to be careful with
- * masking.
- *
- * Gen 6 hardware ignores source modifiers (negate and abs) on math
- * instructions, so we also move to a temp to set those up.
- */
- if (devinfo->gen == 6 || devinfo->gen == 7)
- src = fix_math_operand(src);
-
- fs_inst *inst = emit(opcode, dst, src);
-
- if (devinfo->gen < 6) {
- inst->base_mrf = 2;
- inst->mlen = dispatch_width / 8;
- }
-
- return inst;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
-{
- int base_mrf = 2;
- fs_inst *inst;
-
- if (devinfo->gen >= 8) {
- inst = emit(opcode, dst, src0, src1);
- } else if (devinfo->gen >= 6) {
- src0 = fix_math_operand(src0);
- src1 = fix_math_operand(src1);
-
- inst = emit(opcode, dst, src0, src1);
- } else {
- /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
- * "Message Payload":
- *
- * "Operand0[7]. For the INT DIV functions, this operand is the
- * denominator."
- * ...
- * "Operand1[7]. For the INT DIV functions, this operand is the
- * numerator."
- */
- bool is_int_div = opcode != SHADER_OPCODE_POW;
- fs_reg &op0 = is_int_div ? src1 : src0;
- fs_reg &op1 = is_int_div ? src0 : src1;
-
- emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
- inst = emit(opcode, dst, op0, reg_null_f);
-
- inst->base_mrf = base_mrf;
- inst->mlen = 2 * dispatch_width / 8;
- }
- return inst;
-}
-
-void
-fs_visitor::emit_discard_jump()
-{
- assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
-
- /* For performance, after a discard, jump to the end of the
- * shader if all relevant channels have been discarded.
- */
- fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
- discard_jump->flag_subreg = 1;
-
- discard_jump->predicate = (dispatch_width == 8)
- ? BRW_PREDICATE_ALIGN1_ANY8H
- : BRW_PREDICATE_ALIGN1_ANY16H;
- discard_jump->predicate_inverse = true;
-}
-
-void
-fs_visitor::assign_curb_setup()
+void
+fs_visitor::assign_curb_setup()
{
if (dispatch_width == 8) {
prog_data->dispatch_grf_start_reg = payload.num_regs;
constant_nr / 8,
constant_nr % 8);
+ assert(inst->src[i].stride == 0);
inst->src[i].file = HW_REG;
inst->src[i].fixed_hw_reg = byte_offset(
retype(brw_reg, inst->src[i].type),
continue;
/* Set up the annotation tracking for new generated instructions. */
- base_ir = inst->ir;
- current_annotation = inst->annotation;
-
+ const fs_builder ibld(this, block, inst);
fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
fs_reg dst = vgrf(glsl_type::float_type);
+ assert(inst->src[i].stride == 0);
+
/* Generate a pull load into dst. */
if (inst->src[i].reladdr) {
- exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
- surf_index,
- *inst->src[i].reladdr,
- pull_index);
- inst->insert_before(block, &list);
+ VARYING_PULL_CONSTANT_LOAD(ibld, dst,
+ surf_index,
+ *inst->src[i].reladdr,
+ pull_index);
inst->src[i].reladdr = NULL;
+ inst->src[i].stride = 1;
} else {
+ const fs_builder ubld = ibld.exec_all().group(8, 0);
fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
- fs_inst *pull =
- new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
- dst, surf_index, offset);
- inst->insert_before(block, pull);
+ ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ dst, surf_index, offset);
inst->src[i].set_smear(pull_index & 3);
}
inst->src[i].file = GRF;
inst->src[i].reg = dst.reg;
inst->src[i].reg_offset = 0;
- inst->src[i].width = dispatch_width;
}
}
invalidate_live_intervals();
* "Parameter 0 is required except for the sampleinfo message, which
* has no parameter 0"
*/
- while (inst->mlen > inst->header_size + dispatch_width / 8 &&
+ while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
load_payload->src[(inst->mlen - inst->header_size) /
- (dispatch_width / 8) +
+ (inst->exec_size / 8) +
inst->header_size - 1].is_zero()) {
- inst->mlen -= dispatch_width / 8;
+ inst->mlen -= inst->exec_size / 8;
progress = true;
}
}
return false;
/* Look for a texturing instruction immediately before the final FB_WRITE. */
- fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+ bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+ fs_inst *fb_write = (fs_inst *)block->end();
assert(fb_write->eot);
assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
assert(!tex_inst->eot); /* We can't get here twice */
assert((tex_inst->offset & (0xff << 24)) == 0);
+ const fs_builder ibld(this, block, tex_inst);
+
tex_inst->offset |= fb_write->target << 24;
tex_inst->eot = true;
- tex_inst->dst = reg_null_ud;
+ tex_inst->dst = ibld.null_reg_ud();
fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
/* If a header is present, marking the eot is sufficient. Otherwise, we need
if (tex_inst->header_size != 0)
return true;
- fs_reg send_header = vgrf(load_payload->sources + 1);
+ fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
+ load_payload->sources + 1);
fs_reg *new_sources =
ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
if (depth == 0 &&
inst->dst.file == GRF &&
- alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
+ alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
!inst->is_partial_write()) {
if (remap[dst] == -1) {
remap[dst] = dst;
} else {
- remap[dst] = alloc.allocate(inst->dst.width / 8);
+ remap[dst] = alloc.allocate(inst->exec_size / 8);
inst->dst.reg = remap[dst];
progress = true;
}
/* Things returning more than one register would need us to
* understand coalescing out more than one MOV at a time.
*/
- if (scan_inst->regs_written > scan_inst->dst.width / 8)
+ if (scan_inst->regs_written > scan_inst->exec_size / 8)
break;
/* SEND instructions can't have MRF as a destination. */
int base_mrf = 1;
int color_mrf = base_mrf + 2;
- fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
- fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
- mov->force_writemask_all = true;
+ fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
+ fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
fs_inst *write;
if (key->nr_color_regions == 1) {
- write = emit(FS_OPCODE_REP_FB_WRITE);
+ write = bld.emit(FS_OPCODE_REP_FB_WRITE);
write->saturate = key->clamp_fragment_color;
write->base_mrf = color_mrf;
write->target = 0;
} else {
assume(key->nr_color_regions > 0);
for (int i = 0; i < key->nr_color_regions; ++i) {
- write = emit(FS_OPCODE_REP_FB_WRITE);
+ write = bld.emit(FS_OPCODE_REP_FB_WRITE);
write->saturate = key->clamp_fragment_color;
write->base_mrf = base_mrf;
write->target = i;
*/
if (block->start() == scan_inst) {
for (int i = 0; i < write_len; i++) {
- if (needs_dep[i]) {
- inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
- }
+ if (needs_dep[i])
+ DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+ first_write_grf + i);
}
return;
}
if (reg >= first_write_grf &&
reg < first_write_grf + write_len &&
needs_dep[reg - first_write_grf]) {
- inst->insert_before(block, DEP_RESOLVE_MOV(reg));
+ DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
needs_dep[reg - first_write_grf] = false;
if (scan_inst->exec_size == 16)
needs_dep[reg - first_write_grf + 1] = false;
if (block->end() == scan_inst) {
for (int i = 0; i < write_len; i++) {
if (needs_dep[i])
- scan_inst->insert_before(block,
- DEP_RESOLVE_MOV(first_write_grf + i));
+ DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+ first_write_grf + i);
}
return;
}
scan_inst->dst.reg >= first_write_grf &&
scan_inst->dst.reg < first_write_grf + write_len &&
needs_dep[scan_inst->dst.reg - first_write_grf]) {
- scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
+ DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+ scan_inst->dst.reg);
needs_dep[scan_inst->dst.reg - first_write_grf] = false;
}
assert(const_offset_reg.file == IMM &&
const_offset_reg.type == BRW_REGISTER_TYPE_UD);
const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
- fs_reg payload = fs_reg(GRF, alloc.allocate(1));
- /* We have to use a message header on Skylake to get SIMD4x2 mode.
- * Reserve space for the register.
- */
+ fs_reg payload, offset;
if (devinfo->gen >= 9) {
- payload.reg_offset++;
- alloc.sizes[payload.reg] = 2;
+ /* We have to use a message header on Skylake to get SIMD4x2
+ * mode. Reserve space for the register.
+ */
+ offset = payload = fs_reg(GRF, alloc.allocate(2));
+ offset.reg_offset++;
+ inst->mlen = 2;
+ } else {
+ offset = payload = fs_reg(GRF, alloc.allocate(1));
+ inst->mlen = 1;
}
/* This is actually going to be a MOV, but since only the first dword
* by live variable analysis, or register allocation will explode.
*/
fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
- 8, payload, const_offset_reg);
+ 8, offset, const_offset_reg);
setup->force_writemask_all = true;
setup->ir = inst->ir;
*/
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
inst->src[1] = payload;
+ inst->base_mrf = -1;
invalidate_live_intervals();
} else {
assert(inst->dst.file == MRF || inst->dst.file == GRF);
assert(inst->saturate == false);
-
fs_reg dst = inst->dst;
/* Get rid of COMPR4. We'll add it back in if we need it */
if (dst.file == MRF)
dst.reg = dst.reg & ~BRW_MRF_COMPR4;
- dst.width = 8;
+ const fs_builder ibld(this, block, inst);
+ const fs_builder hbld = ibld.exec_all().group(8, 0);
+
for (uint8_t i = 0; i < inst->header_size; i++) {
if (inst->src[i].file != BAD_FILE) {
fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
- mov_src.width = 8;
- fs_inst *mov = MOV(mov_dst, mov_src);
- mov->force_writemask_all = true;
- inst->insert_before(block, mov);
+ hbld.MOV(mov_dst, mov_src);
}
- dst = offset(dst, 1);
+ dst = offset(dst, hbld, 1);
}
- dst.width = inst->exec_size;
if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
inst->exec_size > 8) {
/* In this case, the payload portion of the LOAD_PAYLOAD isn't
if (devinfo->has_compr4) {
fs_reg compr4_dst = retype(dst, inst->src[i].type);
compr4_dst.reg |= BRW_MRF_COMPR4;
-
- fs_inst *mov = MOV(compr4_dst, inst->src[i]);
- mov->force_writemask_all = inst->force_writemask_all;
- inst->insert_before(block, mov);
+ ibld.MOV(compr4_dst, inst->src[i]);
} else {
/* Platform doesn't have COMPR4. We have to fake it */
fs_reg mov_dst = retype(dst, inst->src[i].type);
- mov_dst.width = 8;
-
- fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
- mov->force_writemask_all = inst->force_writemask_all;
- inst->insert_before(block, mov);
-
- mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
- mov->force_writemask_all = inst->force_writemask_all;
- mov->force_sechalf = true;
- inst->insert_before(block, mov);
+ ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
+ mov_dst.reg += 4;
+ ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
}
}
}
for (uint8_t i = inst->header_size; i < inst->sources; i++) {
- if (inst->src[i].file != BAD_FILE) {
- fs_inst *mov = MOV(retype(dst, inst->src[i].type),
- inst->src[i]);
- mov->force_writemask_all = inst->force_writemask_all;
- mov->force_sechalf = inst->force_sechalf;
- inst->insert_before(block, mov);
- }
- dst = offset(dst, 1);
+ if (inst->src[i].file != BAD_FILE)
+ ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
+ dst = offset(dst, ibld, 1);
}
inst->remove(block);
inst->dst.type != BRW_REGISTER_TYPE_UD))
continue;
-#define insert(instr) inst->insert_before(block, instr)
+ const fs_builder ibld(this, block, inst);
/* The MUL instruction isn't commutative. On Gen <= 6, only the low
* 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
if (devinfo->gen < 7) {
fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
- inst->dst.type, dispatch_width);
- insert(MOV(imm, inst->src[1]));
- insert(MUL(inst->dst, imm, inst->src[0]));
+ inst->dst.type);
+ ibld.MOV(imm, inst->src[1]);
+ ibld.MUL(inst->dst, imm, inst->src[0]);
} else {
- insert(MUL(inst->dst, inst->src[0], inst->src[1]));
+ ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
}
} else {
/* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
* schedule multi-component multiplications much better.
*/
+ if (inst->conditional_mod && inst->dst.is_null()) {
+ inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+ inst->dst.type);
+ }
fs_reg low = inst->dst;
fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
- inst->dst.type, dispatch_width);
+ inst->dst.type);
- if (brw->gen >= 7) {
+ if (devinfo->gen >= 7) {
fs_reg src1_0_w = inst->src[1];
fs_reg src1_1_w = inst->src[1];
src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
} else {
src1_0_w.type = BRW_REGISTER_TYPE_UW;
- src1_0_w.stride = 2;
+ if (src1_0_w.stride != 0) {
+ assert(src1_0_w.stride == 1);
+ src1_0_w.stride = 2;
+ }
src1_1_w.type = BRW_REGISTER_TYPE_UW;
- src1_1_w.stride = 2;
+ if (src1_1_w.stride != 0) {
+ assert(src1_1_w.stride == 1);
+ src1_1_w.stride = 2;
+ }
src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
}
- insert(MUL(low, inst->src[0], src1_0_w));
- insert(MUL(high, inst->src[0], src1_1_w));
+ ibld.MUL(low, inst->src[0], src1_0_w);
+ ibld.MUL(high, inst->src[0], src1_1_w);
} else {
fs_reg src0_0_w = inst->src[0];
fs_reg src0_1_w = inst->src[0];
src0_0_w.type = BRW_REGISTER_TYPE_UW;
- src0_0_w.stride = 2;
+ if (src0_0_w.stride != 0) {
+ assert(src0_0_w.stride == 1);
+ src0_0_w.stride = 2;
+ }
src0_1_w.type = BRW_REGISTER_TYPE_UW;
- src0_1_w.stride = 2;
+ if (src0_1_w.stride != 0) {
+ assert(src0_1_w.stride == 1);
+ src0_1_w.stride = 2;
+ }
src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
- insert(MUL(low, src0_0_w, inst->src[1]));
- insert(MUL(high, src0_1_w, inst->src[1]));
+ ibld.MUL(low, src0_0_w, inst->src[1]);
+ ibld.MUL(high, src0_1_w, inst->src[1]);
}
fs_reg dst = inst->dst;
low.subreg_offset = 2;
low.stride = 2;
- insert(ADD(dst, low, high));
+ ibld.ADD(dst, low, high);
+
+ if (inst->conditional_mod) {
+ fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
+ set_condmod(inst->conditional_mod,
+ ibld.MOV(null, inst->dst));
+ }
}
-#undef insert
inst->remove(block);
progress = true;
return progress;
}
+static void
+setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
+ fs_reg *dst, fs_reg color, unsigned components)
+{
+ if (key->clamp_fragment_color) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+ assert(color.type == BRW_REGISTER_TYPE_F);
+
+ for (unsigned i = 0; i < components; i++)
+ set_saturate(true,
+ bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
+
+ color = tmp;
+ }
+
+ for (unsigned i = 0; i < components; i++)
+ dst[i] = offset(color, bld, i);
+}
+
+static void
+lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
+ const brw_wm_prog_data *prog_data,
+ const brw_wm_prog_key *key,
+ const fs_visitor::thread_payload &payload)
+{
+ assert(inst->src[6].file == IMM);
+ const brw_device_info *devinfo = bld.shader->devinfo;
+ const fs_reg &color0 = inst->src[0];
+ const fs_reg &color1 = inst->src[1];
+ const fs_reg &src0_alpha = inst->src[2];
+ const fs_reg &src_depth = inst->src[3];
+ const fs_reg &dst_depth = inst->src[4];
+ fs_reg sample_mask = inst->src[5];
+ const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
+
+ /* We can potentially have a message length of up to 15, so we have to set
+ * base_mrf to either 0 or 1 in order to fit in m0..m15.
+ */
+ fs_reg sources[15];
+ int header_size = 2, payload_header_size;
+ unsigned length = 0;
+
+ /* From the Sandy Bridge PRM, volume 4, page 198:
+ *
+ * "Dispatched Pixel Enables. One bit per pixel indicating
+ * which pixels were originally enabled when the thread was
+ * dispatched. This field is only required for the end-of-
+ * thread message and on all dual-source messages."
+ */
+ if (devinfo->gen >= 6 &&
+ (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
+ color1.file == BAD_FILE &&
+ key->nr_color_regions == 1) {
+ header_size = 0;
+ }
+
+ if (header_size != 0) {
+ assert(header_size == 2);
+ /* Allocate 2 registers for a header */
+ length += 2;
+ }
+
+ if (payload.aa_dest_stencil_reg) {
+ sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
+ bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
+ .MOV(sources[length],
+ fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+ length++;
+ }
+
+ if (prog_data->uses_omask) {
+ sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
+ BRW_REGISTER_TYPE_UD);
+
+ /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
+ * relevant. Since it's unsigned single words one vgrf is always
+ * 16-wide, but only the lower or higher 8 channels will be used by the
+ * hardware when doing a SIMD8 write depending on whether we have
+ * selected the subspans for the first or second half respectively.
+ */
+ assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+ sample_mask.type = BRW_REGISTER_TYPE_UW;
+ sample_mask.stride *= 2;
+
+ bld.exec_all().annotate("FB write oMask")
+ .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
+ inst->force_sechalf),
+ sample_mask);
+ length++;
+ }
+
+ payload_header_size = length;
+
+ if (src0_alpha.file != BAD_FILE) {
+ /* FIXME: This is being passed at the wrong location in the payload and
+ * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+ * It's supposed to be immediately before oMask but there seems to be no
+ * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+ * requires header sources to form a contiguous segment at the beginning
+ * of the message and src0_alpha has per-channel semantics.
+ */
+ setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
+ length++;
+ }
+
+ setup_color_payload(bld, key, &sources[length], color0, components);
+ length += 4;
+
+ if (color1.file != BAD_FILE) {
+ setup_color_payload(bld, key, &sources[length], color1, components);
+ length += 4;
+ }
+
+ if (src_depth.file != BAD_FILE) {
+ sources[length] = src_depth;
+ length++;
+ }
+
+ if (dst_depth.file != BAD_FILE) {
+ sources[length] = dst_depth;
+ length++;
+ }
+
+ fs_inst *load;
+ if (devinfo->gen >= 7) {
+ /* Send from the GRF */
+ fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
+ load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
+ payload.reg = bld.shader->alloc.allocate(load->regs_written);
+ load->dst = payload;
+
+ inst->src[0] = payload;
+ inst->resize_sources(1);
+ inst->base_mrf = -1;
+ } else {
+ /* Send from the MRF */
+ load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
+ sources, length, payload_header_size);
+
+ /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
+ * will do this for us if we just give it a COMPR4 destination.
+ */
+ if (devinfo->gen < 6 && bld.dispatch_width() == 16)
+ load->dst.reg |= BRW_MRF_COMPR4;
+
+ inst->resize_sources(0);
+ inst->base_mrf = 1;
+ }
+
+ inst->opcode = FS_OPCODE_FB_WRITE;
+ inst->mlen = load->regs_written;
+ inst->header_size = header_size;
+}
+
+static void
+lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
+ const fs_reg &coordinate,
+ const fs_reg &shadow_c,
+ const fs_reg &lod, const fs_reg &lod2,
+ const fs_reg &sampler,
+ unsigned coord_components,
+ unsigned grad_components)
+{
+ const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
+ op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
+ fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
+ fs_reg msg_end = msg_begin;
+
+ /* g0 header. */
+ msg_end = offset(msg_end, bld.group(8, 0), 1);
+
+ for (unsigned i = 0; i < coord_components; i++)
+ bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
+ offset(coordinate, bld, i));
+
+ msg_end = offset(msg_end, bld, coord_components);
+
+ /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
+ * require all three components to be present and zero if they are unused.
+ */
+ if (coord_components > 0 &&
+ (has_lod || shadow_c.file != BAD_FILE ||
+ (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
+ for (unsigned i = coord_components; i < 3; i++)
+ bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
+
+ msg_end = offset(msg_end, bld, 3 - coord_components);
+ }
+
+ if (op == SHADER_OPCODE_TXD) {
+ /* TXD unsupported in SIMD16 mode. */
+ assert(bld.dispatch_width() == 8);
+
+ /* the slots for u and v are always present, but r is optional */
+ if (coord_components < 2)
+ msg_end = offset(msg_end, bld, 2 - coord_components);
+
+ /* P = u, v, r
+ * dPdx = dudx, dvdx, drdx
+ * dPdy = dudy, dvdy, drdy
+ *
+ * 1-arg: Does not exist.
+ *
+ * 2-arg: dudx dvdx dudy dvdy
+ * dPdx.x dPdx.y dPdy.x dPdy.y
+ * m4 m5 m6 m7
+ *
+ * 3-arg: dudx dvdx drdx dudy dvdy drdy
+ * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
+ * m5 m6 m7 m8 m9 m10
+ */
+ for (unsigned i = 0; i < grad_components; i++)
+ bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
+
+ msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+
+ for (unsigned i = 0; i < grad_components; i++)
+ bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
+
+ msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+ }
+
+ if (has_lod) {
+ /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
+ * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
+ */
+ assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
+ bld.dispatch_width() == 16);
+
+ const brw_reg_type type =
+ (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
+ BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
+ bld.MOV(retype(msg_end, type), lod);
+ msg_end = offset(msg_end, bld, 1);
+ }
+
+ if (shadow_c.file != BAD_FILE) {
+ if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
+ /* There's no plain shadow compare message, so we use shadow
+ * compare with a bias of 0.0.
+ */
+ bld.MOV(msg_end, fs_reg(0.0f));
+ msg_end = offset(msg_end, bld, 1);
+ }
+
+ bld.MOV(msg_end, shadow_c);
+ msg_end = offset(msg_end, bld, 1);
+ }
+
+ inst->opcode = op;
+ inst->src[0] = reg_undef;
+ inst->src[1] = sampler;
+ inst->resize_sources(2);
+ inst->base_mrf = msg_begin.reg;
+ inst->mlen = msg_end.reg - msg_begin.reg;
+ inst->header_size = 1;
+}
+
+static void
+lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
+ fs_reg coordinate,
+ const fs_reg &shadow_c,
+ fs_reg lod, fs_reg lod2,
+ const fs_reg &sample_index,
+ const fs_reg &sampler,
+ const fs_reg &offset_value,
+ unsigned coord_components,
+ unsigned grad_components)
+{
+ fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
+ fs_reg msg_coords = message;
+ unsigned header_size = 0;
+
+ if (offset_value.file != BAD_FILE) {
+ /* The offsets set up by the visitor are in the m1 header, so we can't
+ * go headerless.
+ */
+ header_size = 1;
+ message.reg--;
+ }
+
+ for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ }
+ fs_reg msg_end = offset(msg_coords, bld, coord_components);
+ fs_reg msg_lod = offset(msg_coords, bld, 4);
+
+ if (shadow_c.file != BAD_FILE) {
+ fs_reg msg_shadow = msg_lod;
+ bld.MOV(msg_shadow, shadow_c);
+ msg_lod = offset(msg_shadow, bld, 1);
+ msg_end = msg_lod;
+ }
+
+ switch (op) {
+ case SHADER_OPCODE_TXL:
+ case FS_OPCODE_TXB:
+ bld.MOV(msg_lod, lod);
+ msg_end = offset(msg_lod, bld, 1);
+ break;
+ case SHADER_OPCODE_TXD:
+ /**
+ * P = u, v, r
+ * dPdx = dudx, dvdx, drdx
+ * dPdy = dudy, dvdy, drdy
+ *
+ * Load up these values:
+ * - dudx dudy dvdx dvdy drdx drdy
+ * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
+ */
+ msg_end = msg_lod;
+ for (unsigned i = 0; i < grad_components; i++) {
+ bld.MOV(msg_end, lod);
+ lod = offset(lod, bld, 1);
+ msg_end = offset(msg_end, bld, 1);
+
+ bld.MOV(msg_end, lod2);
+ lod2 = offset(lod2, bld, 1);
+ msg_end = offset(msg_end, bld, 1);
+ }
+ break;
+ case SHADER_OPCODE_TXS:
+ msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
+ bld.MOV(msg_lod, lod);
+ msg_end = offset(msg_lod, bld, 1);
+ break;
+ case SHADER_OPCODE_TXF:
+ msg_lod = offset(msg_coords, bld, 3);
+ bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
+ msg_end = offset(msg_lod, bld, 1);
+ break;
+ case SHADER_OPCODE_TXF_CMS:
+ msg_lod = offset(msg_coords, bld, 3);
+ /* lod */
+ bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+ /* sample index */
+ bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
+ msg_end = offset(msg_lod, bld, 2);
+ break;
+ default:
+ break;
+ }
+
+ inst->opcode = op;
+ inst->src[0] = reg_undef;
+ inst->src[1] = sampler;
+ inst->resize_sources(2);
+ inst->base_mrf = message.reg;
+ inst->mlen = msg_end.reg - message.reg;
+ inst->header_size = header_size;
+
+ /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+ assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static bool
+is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
+{
+ if (devinfo->gen < 8 && !devinfo->is_haswell)
+ return false;
+
+ return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+}
+
+static void
+lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
+ fs_reg coordinate,
+ const fs_reg &shadow_c,
+ fs_reg lod, fs_reg lod2,
+ const fs_reg &sample_index,
+ const fs_reg &mcs, const fs_reg &sampler,
+ fs_reg offset_value,
+ unsigned coord_components,
+ unsigned grad_components)
+{
+ const brw_device_info *devinfo = bld.shader->devinfo;
+ int reg_width = bld.dispatch_width() / 8;
+ unsigned header_size = 0, length = 0;
+ fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
+ for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
+ sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+ if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
+ offset_value.file != BAD_FILE ||
+ is_high_sampler(devinfo, sampler)) {
+ /* For general texture offsets (no txf workaround), we need a header to
+ * put them in. Note that we're only reserving space for it in the
+ * message payload as it will be initialized implicitly by the
+ * generator.
+ *
+ * TG4 needs to place its channel select in the header, for interaction
+ * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
+ * larger sampler numbers we need to offset the Sampler State Pointer in
+ * the header.
+ */
+ header_size = 1;
+ sources[0] = fs_reg();
+ length++;
+ }
+
+ if (shadow_c.file != BAD_FILE) {
+ bld.MOV(sources[length], shadow_c);
+ length++;
+ }
+
+ bool coordinate_done = false;
+
+ /* The sampler can only meaningfully compute LOD for fragment shader
+ * messages. For all other stages, we change the opcode to TXL and
+ * hardcode the LOD to 0.
+ */
+ if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
+ op == SHADER_OPCODE_TEX) {
+ op = SHADER_OPCODE_TXL;
+ lod = fs_reg(0.0f);
+ }
+
+ /* Set up the LOD info */
+ switch (op) {
+ case FS_OPCODE_TXB:
+ case SHADER_OPCODE_TXL:
+ bld.MOV(sources[length], lod);
+ length++;
+ break;
+ case SHADER_OPCODE_TXD:
+ /* TXD should have been lowered in SIMD16 mode. */
+ assert(bld.dispatch_width() == 8);
+
+ /* Load dPdx and the coordinate together:
+ * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+ */
+ for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+
+ /* For cube map array, the coordinate is (u,v,r,ai) but there are
+ * only derivatives for (u, v, r).
+ */
+ if (i < grad_components) {
+ bld.MOV(sources[length], lod);
+ lod = offset(lod, bld, 1);
+ length++;
+
+ bld.MOV(sources[length], lod2);
+ lod2 = offset(lod2, bld, 1);
+ length++;
+ }
+ }
+
+ coordinate_done = true;
+ break;
+ case SHADER_OPCODE_TXS:
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+ length++;
+ break;
+ case SHADER_OPCODE_TXF:
+ /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
+ * On Gen9 they are u, v, lod, r
+ */
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+
+ if (devinfo->gen >= 9) {
+ if (coord_components >= 2) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ }
+ length++;
+ }
+
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+ length++;
+
+ for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+
+ coordinate_done = true;
+ break;
+ case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_UMS:
+ case SHADER_OPCODE_TXF_MCS:
+ if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+ length++;
+ }
+
+ if (op == SHADER_OPCODE_TXF_CMS) {
+ /* Data from the multisample control surface. */
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+ length++;
+ }
+
+ /* There is no offsetting for this message; just copy in the integer
+ * texture coordinates.
+ */
+ for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+
+ coordinate_done = true;
+ break;
+ case SHADER_OPCODE_TG4_OFFSET:
+ /* gather4_po_c should have been lowered in SIMD16 mode. */
+ assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
+
+ /* More crazy intermixing */
+ for (unsigned i = 0; i < 2; i++) { /* u, v */
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+
+ for (unsigned i = 0; i < 2; i++) { /* offu, offv */
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
+ offset_value = offset(offset_value, bld, 1);
+ length++;
+ }
+
+ if (coord_components == 3) { /* r if present */
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+
+ coordinate_done = true;
+ break;
+ default:
+ break;
+ }
+
+ /* Set up the coordinate (except for cases where it was done above) */
+ if (!coordinate_done) {
+ for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+ }
+ }
+
+ int mlen;
+ if (reg_width == 2)
+ mlen = length * reg_width - header_size;
+ else
+ mlen = length * reg_width;
+
+ const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
+ BRW_REGISTER_TYPE_F);
+ bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+
+ /* Generate the SEND. */
+ inst->opcode = op;
+ inst->src[0] = src_payload;
+ inst->src[1] = sampler;
+ inst->resize_sources(2);
+ inst->base_mrf = -1;
+ inst->mlen = mlen;
+ inst->header_size = header_size;
+
+ /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+ assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static void
+lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+{
+ const brw_device_info *devinfo = bld.shader->devinfo;
+ const fs_reg &coordinate = inst->src[0];
+ const fs_reg &shadow_c = inst->src[1];
+ const fs_reg &lod = inst->src[2];
+ const fs_reg &lod2 = inst->src[3];
+ const fs_reg &sample_index = inst->src[4];
+ const fs_reg &mcs = inst->src[5];
+ const fs_reg &sampler = inst->src[6];
+ const fs_reg &offset_value = inst->src[7];
+ assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
+ const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
+ const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
+
+ if (devinfo->gen >= 7) {
+ lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
+ shadow_c, lod, lod2, sample_index,
+ mcs, sampler, offset_value,
+ coord_components, grad_components);
+ } else if (devinfo->gen >= 5) {
+ lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
+ shadow_c, lod, lod2, sample_index,
+ sampler, offset_value,
+ coord_components, grad_components);
+ } else {
+ lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
+ shadow_c, lod, lod2, sampler,
+ coord_components, grad_components);
+ }
+}
+
+/**
+ * Initialize the header present in some typed and untyped surface
+ * messages.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+ fs_builder ubld = bld.exec_all().group(8, 0);
+ const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ ubld.MOV(dst, fs_reg(0));
+ ubld.MOV(component(dst, 7), sample_mask);
+ return dst;
+}
+
+static void
+lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
+ const fs_reg &sample_mask)
+{
+ /* Get the logical send arguments. */
+ const fs_reg &addr = inst->src[0];
+ const fs_reg &src = inst->src[1];
+ const fs_reg &surface = inst->src[2];
+ const UNUSED fs_reg &dims = inst->src[3];
+ const fs_reg &arg = inst->src[4];
+
+ /* Calculate the total number of components of the payload. */
+ const unsigned addr_sz = inst->components_read(0);
+ const unsigned src_sz = inst->components_read(1);
+ const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
+ const unsigned sz = header_sz + addr_sz + src_sz;
+
+ /* Allocate space for the payload. */
+ fs_reg *const components = new fs_reg[sz];
+ const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
+ unsigned n = 0;
+
+ /* Construct the payload. */
+ if (header_sz)
+ components[n++] = emit_surface_header(bld, sample_mask);
+
+ for (unsigned i = 0; i < addr_sz; i++)
+ components[n++] = offset(addr, bld, i);
+
+ for (unsigned i = 0; i < src_sz; i++)
+ components[n++] = offset(src, bld, i);
+
+ bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
+
+ /* Update the original instruction. */
+ inst->opcode = op;
+ inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+ inst->header_size = header_sz;
+
+ inst->src[0] = payload;
+ inst->src[1] = surface;
+ inst->src[2] = arg;
+ inst->resize_sources(3);
+
+ delete[] components;
+}
+
+bool
+fs_visitor::lower_logical_sends()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ const fs_builder ibld(this, block, inst);
+
+ switch (inst->opcode) {
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ assert(stage == MESA_SHADER_FRAGMENT);
+ lower_fb_write_logical_send(ibld, inst,
+ (const brw_wm_prog_data *)prog_data,
+ (const brw_wm_prog_key *)key,
+ payload);
+ break;
+
+ case SHADER_OPCODE_TEX_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
+ break;
+
+ case SHADER_OPCODE_TXD_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
+ break;
+
+ case SHADER_OPCODE_TXF_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
+ break;
+
+ case SHADER_OPCODE_TXL_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
+ break;
+
+ case SHADER_OPCODE_TXS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
+ break;
+
+ case FS_OPCODE_TXB_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
+ break;
+
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
+ break;
+
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
+ break;
+
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
+ break;
+
+ case SHADER_OPCODE_LOD_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
+ break;
+
+ case SHADER_OPCODE_TG4_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
+ break;
+
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_UNTYPED_SURFACE_READ,
+ fs_reg(0xffff));
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+ ibld.sample_mask_reg());
+ break;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_UNTYPED_ATOMIC,
+ ibld.sample_mask_reg());
+ break;
+
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_TYPED_SURFACE_READ,
+ fs_reg(0xffff));
+ break;
+
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_TYPED_SURFACE_WRITE,
+ ibld.sample_mask_reg());
+ break;
+
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+ lower_surface_logical_send(ibld, inst,
+ SHADER_OPCODE_TYPED_ATOMIC,
+ ibld.sample_mask_reg());
+ break;
+
+ default:
+ continue;
+ }
+
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst. The instruction will be left untouched by
+ * fs_visitor::lower_simd_width() if the returned value is equal to the
+ * original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct brw_device_info *devinfo,
+ const fs_inst *inst)
+{
+ switch (inst->opcode) {
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
+ * here.
+ */
+ assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE ||
+ inst->exec_size == 8);
+ /* Dual-source FB writes are unsupported in SIMD16 mode. */
+ return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
+
+ case SHADER_OPCODE_TXD_LOGICAL:
+ /* TXD is unsupported in SIMD16 mode. */
+ return 8;
+
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
+ /* gather4_po_c is unsupported in SIMD16 mode. */
+ const fs_reg &shadow_c = inst->src[1];
+ return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
+ }
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL: {
+ /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
+ * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
+ * mode because the message exceeds the maximum length of 11.
+ */
+ const fs_reg &shadow_c = inst->src[1];
+ if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
+ return 16;
+ else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
+ return 8;
+ else
+ return inst->exec_size;
+ }
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
+ * messages. Use SIMD16 instead.
+ */
+ if (devinfo->gen == 4)
+ return 16;
+ else
+ return inst->exec_size;
+
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ return 8;
+
+ default:
+ return inst->exec_size;
+ }
+}
+
+/**
+ * The \p rows array of registers represents a \p num_rows by \p num_columns
+ * matrix in row-major order, write it in column-major order into the register
+ * passed as destination. \p stride gives the separation between matrix
+ * elements in the input in fs_builder::dispatch_width() units.
+ */
+static void
+emit_transpose(const fs_builder &bld,
+ const fs_reg &dst, const fs_reg *rows,
+ unsigned num_rows, unsigned num_columns, unsigned stride)
+{
+ fs_reg *const components = new fs_reg[num_rows * num_columns];
+
+ for (unsigned i = 0; i < num_columns; ++i) {
+ for (unsigned j = 0; j < num_rows; ++j)
+ components[num_rows * i + j] = offset(rows[j], bld, stride * i);
+ }
+
+ bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
+
+ delete[] components;
+}
+
+bool
+fs_visitor::lower_simd_width()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
+
+ if (lower_width != inst->exec_size) {
+ /* Builder matching the original instruction. We may also need to
+ * emit an instruction of width larger than the original, set the
+ * execution size of the builder to the highest of both for now so
+ * we're sure that both cases can be handled.
+ */
+ const fs_builder ibld = bld.at(block, inst)
+ .exec_all(inst->force_writemask_all)
+ .group(MAX2(inst->exec_size, lower_width),
+ inst->force_sechalf);
+
+ /* Split the copies in chunks of the execution width of either the
+ * original or the lowered instruction, whichever is lower.
+ */
+ const unsigned copy_width = MIN2(lower_width, inst->exec_size);
+ const unsigned n = inst->exec_size / copy_width;
+ const unsigned dst_size = inst->regs_written * REG_SIZE /
+ inst->dst.component_size(inst->exec_size);
+ fs_reg dsts[4];
+
+ assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
+ !inst->writes_accumulator && !inst->mlen);
+
+ for (unsigned i = 0; i < n; i++) {
+ /* Emit a copy of the original instruction with the lowered width.
+ * If the EOT flag was set throw it away except for the last
+ * instruction to avoid killing the thread prematurely.
+ */
+ fs_inst split_inst = *inst;
+ split_inst.exec_size = lower_width;
+ split_inst.eot = inst->eot && i == n - 1;
+
+ /* Select the correct channel enables for the i-th group, then
+ * transform the sources and destination and emit the lowered
+ * instruction.
+ */
+ const fs_builder lbld = ibld.group(lower_width, i);
+
+ for (unsigned j = 0; j < inst->sources; j++) {
+ if (inst->src[j].file != BAD_FILE &&
+ !is_uniform(inst->src[j])) {
+ /* Get the i-th copy_width-wide chunk of the source. */
+ const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
+ const unsigned src_size = inst->components_read(j);
+
+ /* Use a trivial transposition to copy one every n
+ * copy_width-wide components of the register into a
+ * temporary passed as source to the lowered instruction.
+ */
+ split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
+ emit_transpose(lbld.group(copy_width, 0),
+ split_inst.src[j], &src, 1, src_size, n);
+ }
+ }
+
+ if (inst->regs_written) {
+ /* Allocate enough space to hold the result of the lowered
+ * instruction and fix up the number of registers written.
+ */
+ split_inst.dst = dsts[i] =
+ lbld.vgrf(inst->dst.type, dst_size);
+ split_inst.regs_written =
+ DIV_ROUND_UP(inst->regs_written * lower_width,
+ inst->exec_size);
+ }
+
+ lbld.emit(split_inst);
+ }
+
+ if (inst->regs_written) {
+ /* Distance between useful channels in the temporaries, skipping
+ * garbage if the lowered instruction is wider than the original.
+ */
+ const unsigned m = lower_width / copy_width;
+
+ /* Interleave the components of the result from the lowered
+ * instructions. We need to set exec_all() when copying more than
+ * one half per component, because LOAD_PAYLOAD (in terms of which
+ * emit_transpose is implemented) can only use the same channel
+ * enable signals for all of its non-header sources.
+ */
+ emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
+ .group(copy_width, 0),
+ inst->dst, dsts, n, dst_size, m);
+ }
+
+ inst->remove(block);
+ progress = true;
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
void
fs_visitor::dump_instructions()
{
}
fprintf(file, "(%d) ", inst->exec_size);
+ if (inst->mlen) {
+ fprintf(file, "(mlen: %d) ", inst->mlen);
+ }
switch (inst->dst.file) {
case GRF:
fprintf(file, "vgrf%d", inst->dst.reg);
- if (inst->dst.width != dispatch_width)
- fprintf(file, "@%d", inst->dst.width);
- if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
+ if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
inst->dst.subreg_offset)
fprintf(file, "+%d.%d",
inst->dst.reg_offset, inst->dst.subreg_offset);
switch (inst->src[i].file) {
case GRF:
fprintf(file, "vgrf%d", inst->src[i].reg);
- if (inst->src[i].width != dispatch_width)
- fprintf(file, "@%d", inst->src[i].width);
- if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
+ if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
inst->src[i].subreg_offset)
fprintf(file, "+%d.%d", inst->src[i].reg_offset,
inst->src[i].subreg_offset);
void
fs_visitor::setup_cs_payload()
{
- assert(brw->gen >= 7);
+ assert(devinfo->gen >= 7);
payload.num_regs = 1;
}
void
fs_visitor::optimize()
{
+ /* bld is the common builder object pointing at the end of the program we
+ * used to translate it into i965 IR. For the optimization and lowering
+ * passes coming next, any code added after the end of the program without
+ * having explicitly called fs_builder::at() clearly points at a mistake.
+ * Ideally optimization passes wouldn't be part of the visitor so they
+ * wouldn't have access to bld at all, but they do, so just in case some
+ * pass forgets to ask for a location explicitly set it to NULL here to
+ * make it trip. The dispatch width is initialized to a bogus value to
+ * make sure that optimizations set the execution controls explicitly to
+ * match the code they are manipulating instead of relying on the defaults.
+ */
+ bld = fs_builder(this, 64);
+
split_virtual_grfs();
move_uniform_array_access_to_pull_constants();
snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
\
- backend_visitor::dump_instructions(filename); \
+ backend_shader::dump_instructions(filename); \
} \
\
progress = progress || this_progress; \
stage_abbrev, dispatch_width,
shader_prog ? shader_prog->Name : 0);
- backend_visitor::dump_instructions(filename);
+ backend_shader::dump_instructions(filename);
}
- bool progress;
+ bool progress = false;
int iteration = 0;
int pass_num = 0;
+
+ OPT(lower_simd_width);
+ OPT(lower_logical_sends);
+
do {
progress = false;
pass_num = 0;
fail("Failure to register allocate. Reduce number of "
"live scalar values to avoid this.");
} else {
- perf_debug("%s shader triggered register spilling. "
- "Try reducing the number of live scalar values to "
- "improve performance.\n", stage_name);
+ compiler->shader_perf_log(log_data,
+ "%s shader triggered register spilling. "
+ "Try reducing the number of live scalar "
+ "values to improve performance.\n",
+ stage_name);
}
/* Since we're out of heuristics, just go spill registers until we
}
bool
-fs_visitor::run_vs()
+fs_visitor::run_vs(gl_clip_plane *clip_planes)
{
assert(stage == MESA_SHADER_VERTEX);
assign_common_binding_table_offsets(0);
setup_vs_payload();
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_begin();
- if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
- emit_nir_code();
- } else {
- foreach_in_list(ir_instruction, ir, shader->base.ir) {
- base_ir = ir;
- this->result = reg_undef;
- ir->accept(this);
- }
- base_ir = NULL;
- }
+ emit_nir_code();
if (failed)
return false;
+ compute_clip_distance(clip_planes);
+
emit_urb_writes();
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_end();
calculate_cfg();
}
bool
-fs_visitor::run_fs()
+fs_visitor::run_fs(bool do_rep_send)
{
brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
if (0) {
emit_dummy_fs();
- } else if (brw->use_rep_send && dispatch_width == 16) {
+ } else if (do_rep_send) {
+ assert(dispatch_width == 16);
emit_repclear_shader();
} else {
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_begin();
calculate_urb_setup();
* Initialize it with the dispatched pixels.
*/
if (wm_prog_data->uses_kill) {
- fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+ fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
discard_init->flag_subreg = 1;
}
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
- if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
- emit_nir_code();
- } else if (shader) {
- foreach_in_list(ir_instruction, ir, shader->base.ir) {
- base_ir = ir;
- this->result = reg_undef;
- ir->accept(this);
- }
- } else {
- emit_fragment_program_code();
- }
- base_ir = NULL;
+ emit_nir_code();
+
if (failed)
return false;
if (wm_prog_data->uses_kill)
- emit(FS_OPCODE_PLACEHOLDER_HALT);
+ bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
if (wm_key->alpha_test_func)
emit_alpha_test();
emit_fb_writes();
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_end();
calculate_cfg();
setup_cs_payload();
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_begin();
emit_nir_code();
emit_cs_terminate();
- if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ if (shader_time_index >= 0)
emit_shader_time_end();
calculate_cfg();
if (unlikely(INTEL_DEBUG & DEBUG_WM))
brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
+ int st_index8 = -1, st_index16 = -1;
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+ st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
+ st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
+ }
+
/* Now the main event: Visit the shader IR and generate our FS IR for it.
*/
- fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
- prog, &fp->Base, 8);
- if (!v.run_fs()) {
+ fs_visitor v(brw->intelScreen->compiler, brw,
+ mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
+ prog, &fp->Base, 8, st_index8);
+ if (!v.run_fs(false /* do_rep_send */)) {
if (prog) {
prog->LinkStatus = false;
ralloc_strcat(&prog->InfoLog, v.fail_msg);
}
cfg_t *simd16_cfg = NULL;
- fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
- prog, &fp->Base, 16);
+ fs_visitor v2(brw->intelScreen->compiler, brw,
+ mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
+ prog, &fp->Base, 16, st_index16);
if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
if (!v.simd16_unsupported) {
/* Try a SIMD16 compile */
v2.import_uniforms(&v);
- if (!v2.run_fs()) {
- perf_debug("SIMD16 shader failed to compile, falling back to "
- "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
+ if (!v2.run_fs(brw->use_rep_send)) {
+ perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
} else {
simd16_cfg = v2.cfg;
}
- } else {
- perf_debug("SIMD16 shader unsupported, falling back to "
- "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
}
}
prog_data->no_8 = false;
}
- fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
+ fs_generator g(brw->intelScreen->compiler, brw,
+ mem_ctx, (void *) key, &prog_data->base,
&fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {