X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_vec4_visitor.cpp;h=4b9e7879a2c8a214c44bb4dda0878429ecfec69e;hb=3c9dc2d31b80fc73bffa1f40a91443a53229c8e2;hp=3b0687f615a1298070ff0e3b0f7fdd9d2a51842f;hpb=b5b6460c40e1c46f6af6a490485132ea0864572c;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 3b0687f615a..4b9e7879a2c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -24,9 +24,6 @@ #include "brw_vec4.h" #include "glsl/ir_uniform.h" extern "C" { -#include "main/context.h" -#include "main/macros.h" -#include "program/prog_parameter.h" #include "program/sampler.h" } @@ -41,7 +38,22 @@ vec4_instruction::vec4_instruction(vec4_visitor *v, this->src[0] = src0; this->src[1] = src1; this->src[2] = src2; + this->saturate = false; + this->force_writemask_all = false; + this->no_dd_clear = false; + this->no_dd_check = false; + this->conditional_mod = BRW_CONDITIONAL_NONE; + this->sampler = 0; + this->texture_offset = 0; + this->target = 0; + this->shadow_compare = false; this->ir = v->base_ir; + this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; + this->header_present = false; + this->mlen = 0; + this->base_mrf = 0; + this->offset = 0; + this->ir = NULL; this->annotation = v->current_annotation; } @@ -85,6 +97,12 @@ vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0) return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0)); } +vec4_instruction * +vec4_visitor::emit(enum opcode opcode, dst_reg dst) +{ + return emit(new(mem_ctx) vec4_instruction(this, opcode, dst)); +} + vec4_instruction * vec4_visitor::emit(enum opcode opcode) { @@ -136,6 +154,16 @@ ALU2(SHL) ALU2(SHR) ALU2(ASR) ALU3(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) +ALU3(MAD) +ALU2(ADDC) +ALU2(SUBB) /** Gen4 predicated IF. */ vec4_instruction * @@ -153,7 +181,7 @@ vec4_visitor::IF(uint32_t predicate) vec4_instruction * vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition) { - assert(intel->gen >= 6); + assert(brw->gen >= 6); vec4_instruction *inst; @@ -181,7 +209,7 @@ vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition) * before before comparison, producing garbage results for floating * point comparisons. */ - if (intel->gen == 4) { + if (brw->gen == 4) { dst.type = src0.type; if (dst.file == HW_REG) dst.fixed_hw_reg.type = dst.type; @@ -269,7 +297,7 @@ vec4_visitor::fix_math_operand(src_reg src) * can't use. */ - if (intel->gen == 7 && src.file != IMM) + if (brw->gen == 7 && src.file != IMM) return src; dst_reg expanded = dst_reg(this, glsl_type::vec4_type); @@ -322,7 +350,7 @@ vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src) return; } - if (intel->gen >= 6) { + if (brw->gen >= 6) { return emit_math1_gen6(opcode, dst, src); } else { return emit_math1_gen4(opcode, dst, src); @@ -374,7 +402,7 @@ vec4_visitor::emit_math(enum opcode opcode, return; } - if (intel->gen >= 6) { + if (brw->gen >= 6) { return emit_math2_gen6(opcode, dst, src0, src1); } else { return emit_math2_gen4(opcode, dst, src0, src1); @@ -384,7 +412,7 @@ vec4_visitor::emit_math(enum opcode opcode, void vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) { - if (intel->gen < 7) + if (brw->gen < 7) assert(!"ir_unop_pack_half_2x16 should be lowered"); assert(dst.type == BRW_REGISTER_TYPE_UD); @@ -460,7 +488,7 @@ vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) void vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) { - if (intel->gen < 7) + if (brw->gen < 7) assert(!"ir_unop_unpack_half_2x16 should be lowered"); assert(dst.type == BRW_REGISTER_TYPE_F); @@ -655,40 +683,14 @@ vec4_visitor::setup_uniform_clipplane_values() { gl_clip_plane *clip_planes = brw_select_clip_planes(ctx); - if (intel->gen < 6) { - /* Pre-Gen6, we compact clip planes. For example, if the user - * enables just clip planes 0, 1, and 3, we will enable clip planes - * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip - * plane 2. This simplifies the implementation of the Gen6 clip - * thread. - */ - int compacted_clipplane_index = 0; - for (int i = 0; i < MAX_CLIP_PLANES; ++i) { - if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i))) - continue; - - this->uniform_vector_size[this->uniforms] = 4; - this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms); - this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F; - for (int j = 0; j < 4; ++j) { - prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j]; - } - ++compacted_clipplane_index; - ++this->uniforms; - } - } else { - /* In Gen6 and later, we don't compact clip planes, because this - * simplifies the implementation of gl_ClipDistance. - */ - for (int i = 0; i < key->nr_userclip_plane_consts; ++i) { - this->uniform_vector_size[this->uniforms] = 4; - this->userplane[i] = dst_reg(UNIFORM, this->uniforms); - this->userplane[i].type = BRW_REGISTER_TYPE_F; - for (int j = 0; j < 4; ++j) { - prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j]; - } - ++this->uniforms; + for (int i = 0; i < key->nr_userclip_plane_consts; ++i) { + this->uniform_vector_size[this->uniforms] = 4; + this->userplane[i] = dst_reg(UNIFORM, this->uniforms); + this->userplane[i].type = BRW_REGISTER_TYPE_F; + for (int j = 0; j < 4; ++j) { + prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j]; } + ++this->uniforms; } } @@ -776,7 +778,7 @@ vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate) break; case ir_unop_f2b: - if (intel->gen >= 6) { + if (brw->gen >= 6) { emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); } else { inst = emit(MOV(dst_null_f(), op[0])); @@ -785,7 +787,7 @@ vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate) break; case ir_unop_i2b: - if (intel->gen >= 6) { + if (brw->gen >= 6) { emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); } else { inst = emit(MOV(dst_null_d(), op[0])); @@ -829,7 +831,7 @@ vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate) resolve_ud_negate(&this->result); - if (intel->gen >= 6) { + if (brw->gen >= 6) { vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1))); inst->conditional_mod = BRW_CONDITIONAL_NZ; @@ -925,7 +927,7 @@ vec4_visitor::emit_if_gen6(ir_if *ir) emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ)); } -static dst_reg +dst_reg with_writemask(dst_reg const & r, int mask) { dst_reg result = r; @@ -933,146 +935,6 @@ with_writemask(dst_reg const & r, int mask) return result; } -void -vec4_vs_visitor::emit_prolog() -{ - dst_reg sign_recovery_shift; - dst_reg normalize_factor; - dst_reg es3_normalize_factor; - - for (int i = 0; i < VERT_ATTRIB_MAX; i++) { - if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) { - uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i]; - dst_reg reg(ATTR, i); - dst_reg reg_d = reg; - reg_d.type = BRW_REGISTER_TYPE_D; - dst_reg reg_ud = reg; - reg_ud.type = BRW_REGISTER_TYPE_UD; - - /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes - * come in as floating point conversions of the integer values. - */ - if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) { - dst_reg dst = reg; - dst.type = brw_type_for_base_type(glsl_type::vec4_type); - dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1; - emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f))); - } - - /* Do sign recovery for 2101010 formats if required. */ - if (wa_flags & BRW_ATTRIB_WA_SIGN) { - if (sign_recovery_shift.file == BAD_FILE) { - /* shift constant: <22,22,22,30> */ - sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type); - emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u))); - emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u))); - } - - emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift))); - emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift))); - } - - /* Apply BGRA swizzle if required. */ - if (wa_flags & BRW_ATTRIB_WA_BGRA) { - src_reg temp = src_reg(reg); - temp.swizzle = BRW_SWIZZLE4(2,1,0,3); - emit(MOV(reg, temp)); - } - - if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) { - /* ES 3.0 has different rules for converting signed normalized - * fixed-point numbers than desktop GL. - */ - if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) { - /* According to equation 2.2 of the ES 3.0 specification, - * signed normalization conversion is done by: - * - * f = c / (2^(b-1)-1) - */ - if (es3_normalize_factor.file == BAD_FILE) { - /* mul constant: 1 / (2^(b-1) - 1) */ - es3_normalize_factor = dst_reg(this, glsl_type::vec4_type); - emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ), - src_reg(1.0f / ((1<<9) - 1)))); - emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W), - src_reg(1.0f / ((1<<1) - 1)))); - } - - dst_reg dst = reg; - dst.type = brw_type_for_base_type(glsl_type::vec4_type); - emit(MOV(dst, src_reg(reg_d))); - emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor))); - emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f)); - } else { - /* The following equations are from the OpenGL 3.2 specification: - * - * 2.1 unsigned normalization - * f = c/(2^n-1) - * - * 2.2 signed normalization - * f = (2c+1)/(2^n-1) - * - * Both of these share a common divisor, which is represented by - * "normalize_factor" in the code below. - */ - if (normalize_factor.file == BAD_FILE) { - /* 1 / (2^b - 1) for b=<10,10,10,2> */ - normalize_factor = dst_reg(this, glsl_type::vec4_type); - emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ), - src_reg(1.0f / ((1<<10) - 1)))); - emit(MOV(with_writemask(normalize_factor, WRITEMASK_W), - src_reg(1.0f / ((1<<2) - 1)))); - } - - dst_reg dst = reg; - dst.type = brw_type_for_base_type(glsl_type::vec4_type); - emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud))); - - /* For signed normalization, we want the numerator to be 2c+1. */ - if (wa_flags & BRW_ATTRIB_WA_SIGN) { - emit(MUL(dst, src_reg(dst), src_reg(2.0f))); - emit(ADD(dst, src_reg(dst), src_reg(1.0f))); - } - - emit(MUL(dst, src_reg(dst), src_reg(normalize_factor))); - } - } - - if (wa_flags & BRW_ATTRIB_WA_SCALE) { - dst_reg dst = reg; - dst.type = brw_type_for_base_type(glsl_type::vec4_type); - emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud))); - } - } - } -} - - -dst_reg * -vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir) -{ - /* VertexID is stored by the VF as the last vertex element, but - * we don't represent it with a flag in inputs_read, so we call - * it VERT_ATTRIB_MAX, which setup_attributes() picks up on. - */ - dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX); - vs_prog_data->uses_vertexid = true; - - switch (ir->location) { - case SYSTEM_VALUE_VERTEX_ID: - reg->writemask = WRITEMASK_X; - break; - case SYSTEM_VALUE_INSTANCE_ID: - reg->writemask = WRITEMASK_Y; - break; - default: - assert(!"not reached"); - break; - } - - return reg; -} - void vec4_visitor::visit(ir_variable *ir) @@ -1217,7 +1079,7 @@ vec4_visitor::visit(ir_function *ir) const ir_function_signature *sig; exec_list empty; - sig = ir->matching_signature(&empty); + sig = ir->matching_signature(NULL, &empty); assert(sig); @@ -1243,12 +1105,44 @@ vec4_visitor::try_emit_sat(ir_expression *ir) return true; } +bool +vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg) +{ + /* 3-src instructions were introduced in gen6. */ + if (brw->gen < 6) + return false; + + /* MAD can only handle floating-point data. */ + if (ir->type->base_type != GLSL_TYPE_FLOAT) + return false; + + ir_rvalue *nonmul = ir->operands[1 - mul_arg]; + ir_expression *mul = ir->operands[mul_arg]->as_expression(); + + if (!mul || mul->operation != ir_binop_mul) + return false; + + nonmul->accept(this); + src_reg src0 = fix_3src_operand(this->result); + + mul->operands[0]->accept(this); + src_reg src1 = fix_3src_operand(this->result); + + mul->operands[1]->accept(this); + src_reg src2 = fix_3src_operand(this->result); + + this->result = src_reg(this, ir->type); + emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2); + + return true; +} + void vec4_visitor::emit_bool_comparison(unsigned int op, dst_reg dst, src_reg src0, src_reg src1) { /* original gen4 does destination conversion before comparison. */ - if (intel->gen < 5) + if (brw->gen < 5) dst.type = src0.type; emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op))); @@ -1263,7 +1157,7 @@ vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst, { vec4_instruction *inst; - if (intel->gen >= 6) { + if (brw->gen >= 6) { inst = emit(BRW_OPCODE_SEL, dst, src0, src1); inst->conditional_mod = conditionalmod; } else { @@ -1274,6 +1168,20 @@ vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst, } } +static bool +is_16bit_constant(ir_rvalue *rvalue) +{ + ir_constant *constant = rvalue->as_constant(); + if (!constant) + return false; + + if (constant->type != glsl_type::int_type && + constant->type != glsl_type::uint_type) + return false; + + return constant->value.u[0] < (1 << 16); +} + void vec4_visitor::visit(ir_expression *ir) { @@ -1286,6 +1194,11 @@ vec4_visitor::visit(ir_expression *ir) if (try_emit_sat(ir)) return; + if (ir->operation == ir_binop_add) { + if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1)) + return; + } + for (operand = 0; operand < ir->get_num_operands(); operand++) { this->result.file = BAD_FILE; ir->operands[operand]->accept(this); @@ -1333,12 +1246,12 @@ vec4_visitor::visit(ir_expression *ir) break; case ir_unop_neg: op[0].negate = !op[0].negate; - this->result = op[0]; + emit(MOV(result_dst, op[0])); break; case ir_unop_abs: op[0].abs = true; op[0].negate = false; - this->result = op[0]; + emit(MOV(result_dst, op[0])); break; case ir_unop_sign: @@ -1382,6 +1295,39 @@ vec4_visitor::visit(ir_expression *ir) assert(!"derivatives not valid in vertex shader"); break; + case ir_unop_bitfield_reverse: + emit(BFREV(result_dst, op[0])); + break; + case ir_unop_bit_count: + emit(CBIT(result_dst, op[0])); + break; + case ir_unop_find_msb: { + src_reg temp = src_reg(this, glsl_type::uint_type); + + inst = emit(FBH(dst_reg(temp), op[0])); + inst->dst.writemask = WRITEMASK_XYZW; + + /* FBH counts from the MSB side, while GLSL's findMSB() wants the count + * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then + * subtract the result from 31 to convert the MSB count into an LSB count. + */ + + /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */ + temp.swizzle = BRW_SWIZZLE_NOOP; + emit(MOV(result_dst, temp)); + + src_reg src_tmp = src_reg(result_dst); + emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ)); + + src_tmp.negate = true; + inst = emit(ADD(result_dst, src_tmp, src_reg(31))); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + case ir_unop_find_lsb: + emit(FBL(result_dst, op[0])); + break; + case ir_unop_noise: assert(!"not reached: should be handled by lower_noise"); break; @@ -1395,28 +1341,59 @@ vec4_visitor::visit(ir_expression *ir) case ir_binop_mul: if (ir->type->is_integer()) { - /* For integer multiplication, the MUL uses the low 16 bits - * of one of the operands (src0 on gen6, src1 on gen7). The - * MACH accumulates in the contribution of the upper 16 bits - * of that operand. - * - * FINISHME: Emit just the MUL if we know an operand is small - * enough. - */ - struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D); - - emit(MUL(acc, op[0], op[1])); - emit(MACH(dst_null_d(), op[0], op[1])); - emit(MOV(result_dst, src_reg(acc))); + /* For integer multiplication, the MUL uses the low 16 bits of one of + * the operands (src0 through SNB, src1 on IVB and later). The MACH + * accumulates in the contribution of the upper 16 bits of that + * operand. If we can determine that one of the args is in the low + * 16 bits, though, we can just emit a single MUL. + */ + if (is_16bit_constant(ir->operands[0])) { + if (brw->gen < 7) + emit(MUL(result_dst, op[0], op[1])); + else + emit(MUL(result_dst, op[1], op[0])); + } else if (is_16bit_constant(ir->operands[1])) { + if (brw->gen < 7) + emit(MUL(result_dst, op[1], op[0])); + else + emit(MUL(result_dst, op[0], op[1])); + } else { + struct brw_reg acc = retype(brw_acc_reg(), result_dst.type); + + emit(MUL(acc, op[0], op[1])); + emit(MACH(dst_null_d(), op[0], op[1])); + emit(MOV(result_dst, src_reg(acc))); + } } else { emit(MUL(result_dst, op[0], op[1])); } break; + case ir_binop_imul_high: { + struct brw_reg acc = retype(brw_acc_reg(), result_dst.type); + + emit(MUL(acc, op[0], op[1])); + emit(MACH(result_dst, op[0], op[1])); + break; + } case ir_binop_div: /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */ assert(ir->type->is_integer()); emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]); break; + case ir_binop_carry: { + struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD); + + emit(ADDC(dst_null_ud(), op[0], op[1])); + emit(MOV(result_dst, src_reg(acc))); + break; + } + case ir_binop_borrow: { + struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD); + + emit(SUBB(dst_null_ud(), op[0], op[1])); + emit(MOV(result_dst, src_reg(acc))); + break; + } case ir_binop_mod: /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */ assert(ir->type->is_integer()); @@ -1582,6 +1559,10 @@ vec4_visitor::visit(ir_expression *ir) inst = emit(SHR(result_dst, op[0], op[1])); break; + case ir_binop_bfm: + emit(BFI1(result_dst, op[0], op[1])); + break; + case ir_binop_ubo_load: { ir_constant *uniform_block = ir->operands[0]->as_constant(); ir_constant *const_offset_ir = ir->operands[1]->as_constant(); @@ -1594,7 +1575,7 @@ vec4_visitor::visit(ir_expression *ir) src_reg packed_consts = src_reg(this, glsl_type::vec4_type); packed_consts.type = result.type; src_reg surf_index = - src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0])); + src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]); if (const_offset_ir) { offset = src_reg(const_offset / 16); } else { @@ -1627,6 +1608,20 @@ vec4_visitor::visit(ir_expression *ir) break; } + case ir_binop_vector_extract: + assert(!"should have been lowered by vec_index_to_cond_assign"); + break; + + case ir_triop_fma: + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + /* Note that the instruction's argument order is reversed from GLSL + * and the IR. + */ + emit(MAD(result_dst, op[2], op[1], op[0])); + break; + case ir_triop_lrp: op[0] = fix_3src_operand(op[0]); op[1] = fix_3src_operand(op[1]); @@ -1637,6 +1632,38 @@ vec4_visitor::visit(ir_expression *ir) emit(LRP(result_dst, op[2], op[1], op[0])); break; + case ir_triop_csel: + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); + inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + + case ir_triop_bfi: + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + emit(BFI2(result_dst, op[0], op[1], op[2])); + break; + + case ir_triop_bitfield_extract: + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + /* Note that the instruction's argument order is reversed from GLSL + * and the IR. + */ + emit(BFE(result_dst, op[2], op[1], op[0])); + break; + + case ir_triop_vector_insert: + assert(!"should have been lowered by lower_vector_insert"); + break; + + case ir_quadop_bitfield_insert: + assert(!"not reached: should be handled by " + "bitfield_insert_to_bfm_bfi\n"); + break; + case ir_quadop_vector: assert(!"not reached: should be handled by lower_quadop_vector"); break; @@ -1662,6 +1689,9 @@ vec4_visitor::visit(ir_expression *ir) case ir_binop_pack_half_2x16_split: assert(!"not reached: should not occur in vertex shader"); break; + case ir_binop_ldexp: + assert(!"not reached: should be handled by ldexp_to_arith()"); + break; } } @@ -2139,6 +2169,20 @@ vec4_visitor::visit(ir_texture *ir) int sampler = _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog); + /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother + * emitting anything other than setting up the constant result. + */ + if (ir->op == ir_tg4) { + ir_constant *chan = ir->lod_info.component->as_constant(); + int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]); + if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) { + dst_reg result(this, ir->type); + this->result = src_reg(result); + emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f))); + return; + } + } + /* Should be lowered by do_lower_texture_projection */ assert(!ir->projector); @@ -2172,6 +2216,10 @@ vec4_visitor::visit(ir_texture *ir) lod = this->result; lod_type = ir->lod_info.lod->type; break; + case ir_query_levels: + lod = src_reg(0); + lod_type = glsl_type::int_type; + break; case ir_txf_ms: ir->lod_info.sample_index->accept(this); sample_index = this->result; @@ -2188,6 +2236,7 @@ vec4_visitor::visit(ir_texture *ir) break; case ir_txb: case ir_lod: + case ir_tg4: break; } @@ -2209,18 +2258,26 @@ vec4_visitor::visit(ir_texture *ir) case ir_txs: inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS); break; + case ir_tg4: + inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4); + break; + case ir_query_levels: + inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS); + break; case ir_txb: assert(!"TXB is not valid for vertex shaders."); break; case ir_lod: assert(!"LOD is not valid for vertex shaders."); break; + default: + assert(!"Unrecognized tex op"); } bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf; /* Texel offsets go in the message header; Gen4 also requires headers. */ - inst->header_present = use_texture_offset || intel->gen < 5; + inst->header_present = use_texture_offset || brw->gen < 5 || ir->op == ir_tg4; inst->base_mrf = 2; inst->mlen = inst->header_present + 1; /* always at least one */ inst->sampler = sampler; @@ -2231,20 +2288,21 @@ vec4_visitor::visit(ir_texture *ir) if (use_texture_offset) inst->texture_offset = brw_texture_offset(ir->offset->as_constant()); + /* Stuff the channel select bits in the top of the texture offset */ + if (ir->op == ir_tg4) + inst->texture_offset |= gather_channel(ir, sampler)<<16; + /* MRF for the first parameter */ int param_base = inst->base_mrf + inst->header_present; - if (ir->op == ir_txs) { - int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X; + if (ir->op == ir_txs || ir->op == ir_query_levels) { + int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X; emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod)); } else { - int i, coord_mask = 0, zero_mask = 0; /* Load the coordinate */ /* FINISHME: gl_clamp_mask and saturate */ - for (i = 0; i < ir->coordinate->type->vector_elements; i++) - coord_mask |= (1 << i); - for (; i < 4; i++) - zero_mask |= (1 << i); + int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1; + int zero_mask = 0xf & ~coord_mask; if (ir->offset && ir->op == ir_txf) { /* It appears that the ld instruction used for txf does its @@ -2268,8 +2326,10 @@ vec4_visitor::visit(ir_texture *ir) emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask), coordinate)); } - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask), - src_reg(0))); + if (zero_mask != 0) { + emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask), + src_reg(0))); + } /* Load the shadow comparitor */ if (ir->shadow_comparitor && ir->op != ir_txd) { emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type, @@ -2281,7 +2341,7 @@ vec4_visitor::visit(ir_texture *ir) /* Load the LOD info */ if (ir->op == ir_tex || ir->op == ir_txl) { int mrf, writemask; - if (intel->gen >= 5) { + if (brw->gen >= 5) { mrf = param_base + 1; if (ir->shadow_comparitor) { writemask = WRITEMASK_Y; @@ -2290,9 +2350,9 @@ vec4_visitor::visit(ir_texture *ir) writemask = WRITEMASK_X; inst->mlen++; } - } else /* intel->gen == 4 */ { + } else /* brw->gen == 4 */ { mrf = param_base; - writemask = WRITEMASK_Z; + writemask = WRITEMASK_W; } emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod)); } else if (ir->op == ir_txf) { @@ -2310,7 +2370,7 @@ vec4_visitor::visit(ir_texture *ir) } else if (ir->op == ir_txd) { const glsl_type *type = lod_type; - if (intel->gen >= 5) { + if (brw->gen >= 5) { dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx)); @@ -2330,7 +2390,7 @@ vec4_visitor::visit(ir_texture *ir) shadow_comparitor)); } } - } else /* intel->gen == 4 */ { + } else /* brw->gen == 4 */ { emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx)); emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy)); inst->mlen += 2; @@ -2356,6 +2416,31 @@ vec4_visitor::visit(ir_texture *ir) swizzle_result(ir, src_reg(inst->dst), sampler); } +/** + * Set up the gather channel based on the swizzle, for gather4. + */ +uint32_t +vec4_visitor::gather_channel(ir_texture *ir, int sampler) +{ + ir_constant *chan = ir->lod_info.component->as_constant(); + int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]); + switch (swiz) { + case SWIZZLE_X: return 0; + case SWIZZLE_Y: + /* gather4 sampler is broken for green channel on RG32F -- + * we must ask for blue instead. + */ + if (key->tex.gather_channel_quirk_mask & (1<result = src_reg(this, ir->type); dst_reg swizzled_result(this->result); + if (ir->op == ir_query_levels) { + /* # levels is in .w */ + orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); + emit(MOV(swizzled_result, orig_val)); + return; + } + if (ir->op == ir_txs || ir->type == glsl_type::float_type - || s == SWIZZLE_NOOP) { + || s == SWIZZLE_NOOP || ir->op == ir_tg4) { emit(MOV(swizzled_result, orig_val)); return; } + int zero_mask = 0, one_mask = 0, copy_mask = 0; - int swizzle[4]; + int swizzle[4] = {0}; for (int i = 0; i < 4; i++) { switch (GET_SWZ(s, i)) { @@ -2425,7 +2518,7 @@ vec4_visitor::visit(ir_if *ir) */ this->base_ir = ir->condition; - if (intel->gen == 6) { + if (brw->gen == 6) { emit_if_gen6(ir); } else { uint32_t predicate; @@ -2446,6 +2539,18 @@ vec4_visitor::visit(ir_if *ir) emit(BRW_OPCODE_ENDIF); } +void +vec4_visitor::visit(ir_emit_vertex *) +{ + assert(!"not reached"); +} + +void +vec4_visitor::visit(ir_end_primitive *) +{ + assert(!"not reached"); +} + void vec4_visitor::emit_ndc_computation() { @@ -2472,13 +2577,12 @@ vec4_visitor::emit_ndc_computation() void vec4_visitor::emit_psiz_and_flags(struct brw_reg reg) { - if (intel->gen < 6 && + if (brw->gen < 6 && ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) || key->userclip_active || brw->has_negative_rhw_bug)) { dst_reg header1 = dst_reg(this, glsl_type::uvec4_type); dst_reg header1_w = header1; header1_w.writemask = WRITEMASK_W; - GLuint i; emit(MOV(header1, 0u)); @@ -2490,16 +2594,19 @@ vec4_visitor::emit_psiz_and_flags(struct brw_reg reg) emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8)); } - current_annotation = "Clipping flags"; - for (i = 0; i < key->nr_userclip_plane_consts; i++) { - vec4_instruction *inst; + if (key->userclip_active) { + current_annotation = "Clipping flags"; + dst_reg flags0 = dst_reg(this, glsl_type::uint_type); + dst_reg flags1 = dst_reg(this, glsl_type::uint_type); - inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]), - src_reg(this->userplane[i]))); - inst->conditional_mod = BRW_CONDITIONAL_L; + emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L)); + emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0)); + emit(OR(header1_w, src_reg(header1_w), src_reg(flags0))); - inst = emit(OR(header1_w, src_reg(header1_w), 1u << i)); - inst->predicate = BRW_PREDICATE_NORMAL; + emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L)); + emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0)); + emit(SHL(flags1, src_reg(flags1), src_reg(4))); + emit(OR(header1_w, src_reg(header1_w), src_reg(flags1))); } /* i965 clipping workaround: @@ -2523,7 +2630,7 @@ vec4_visitor::emit_psiz_and_flags(struct brw_reg reg) } emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1))); - } else if (intel->gen < 6) { + } else if (brw->gen < 6) { emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u)); } else { emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0))); @@ -2531,20 +2638,16 @@ vec4_visitor::emit_psiz_and_flags(struct brw_reg reg) emit(MOV(brw_writemask(reg, WRITEMASK_W), src_reg(output_reg[VARYING_SLOT_PSIZ]))); } + if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) { + emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D), + src_reg(output_reg[VARYING_SLOT_LAYER]))); + } } } void -vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset) +vec4_visitor::emit_clip_distances(dst_reg reg, int offset) { - if (intel->gen < 6) { - /* Clip distance slots are set aside in gen5, but they are not used. It - * is not clear whether we actually need to set aside space for them, - * but the performance cost is negligible. - */ - return; - } - /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables): * * "If a linked set of shaders forming the vertex stage contains no @@ -2564,7 +2667,8 @@ vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset) for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4; ++i) { - emit(DP4(dst_reg(brw_writemask(reg, 1 << i)), + reg.writemask = 1 << i; + emit(DP4(reg, src_reg(output_reg[clip_vertex]), src_reg(this->userplane[i + offset]))); } @@ -2605,20 +2709,10 @@ vec4_visitor::emit_urb_slot(int mrf, int varying) current_annotation = "NDC"; emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC]))); break; - case BRW_VARYING_SLOT_POS_DUPLICATE: case VARYING_SLOT_POS: current_annotation = "gl_Position"; emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS]))); break; - case VARYING_SLOT_CLIP_DIST0: - case VARYING_SLOT_CLIP_DIST1: - if (this->key->uses_clip_distance) { - emit_generic_urb_slot(reg, varying); - } else { - current_annotation = "user clip distances"; - emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4); - } - break; case VARYING_SLOT_EDGE: /* This is present when doing unfilled polygons. We're supposed to copy * the edge flag from the user-provided vertex array @@ -2642,9 +2736,7 @@ vec4_visitor::emit_urb_slot(int mrf, int varying) static int align_interleaved_urb_mlen(struct brw_context *brw, int mlen) { - struct intel_context *intel = &brw->intel; - - if (intel->gen >= 6) { + if (brw->gen >= 6) { /* URB data written (does not include the message header reg) must * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, * section 5.4.3.2.2: URB_INTERLEAVED. @@ -2660,29 +2752,6 @@ align_interleaved_urb_mlen(struct brw_context *brw, int mlen) return mlen; } -void -vec4_vs_visitor::emit_urb_write_header(int mrf) -{ - /* No need to do anything for VS; an implied write to this MRF will be - * performed by VS_OPCODE_URB_WRITE. - */ - (void) mrf; -} - -vec4_instruction * -vec4_vs_visitor::emit_urb_write_opcode(bool complete) -{ - /* For VS, the URB writes end the thread. */ - if (complete) { - if (INTEL_DEBUG & DEBUG_SHADER_TIME) - emit_shader_time_end(); - } - - vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); - inst->eot = complete; - - return inst; -} /** * Generates the VUE payload plus the necessary URB write instructions to @@ -2715,62 +2784,54 @@ vec4_visitor::emit_vertex() */ emit_urb_write_header(mrf++); - if (intel->gen < 6) { + if (brw->gen < 6) { emit_ndc_computation(); } - /* Set up the VUE data for the first URB write */ - int slot; - for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) { - emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]); + /* Lower legacy ff and ClipVertex clipping to clip distances */ + if (key->userclip_active && !key->uses_clip_distance) { + current_annotation = "user clip distances"; - /* If this was max_usable_mrf, we can't fit anything more into this URB - * WRITE. - */ - if (mrf > max_usable_mrf) { - slot++; - break; - } + output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type); + output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type); + + emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0); + emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4); } - bool complete = slot >= prog_data->vue_map.num_slots; - current_annotation = "URB write"; - vec4_instruction *inst = emit_urb_write_opcode(complete); - inst->base_mrf = base_mrf; - inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf); + /* We may need to split this up into several URB writes, so do them in a + * loop. + */ + int slot = 0; + bool complete = false; + do { + /* URB offset is in URB row increments, and each of our MRFs is half of + * one of those, since we're doing interleaved writes. + */ + int offset = slot / 2; - /* Optional second URB write */ - if (!complete) { mrf = base_mrf + 1; - for (; slot < prog_data->vue_map.num_slots; ++slot) { - assert(mrf < max_usable_mrf); - emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]); + + /* If this was max_usable_mrf, we can't fit anything more into this + * URB WRITE. + */ + if (mrf > max_usable_mrf) { + slot++; + break; + } } + complete = slot >= prog_data->vue_map.num_slots; current_annotation = "URB write"; - inst = emit_urb_write_opcode(true /* complete */); + vec4_instruction *inst = emit_urb_write_opcode(complete); inst->base_mrf = base_mrf; inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf); - /* URB destination offset. In the previous write, we got MRFs - * 2-13 minus the one header MRF, so 12 regs. URB offset is in - * URB row increments, and each of our MRFs is half of one of - * those, since we're doing interleaved writes. - */ - inst->offset = (max_usable_mrf - base_mrf) / 2; - } + inst->offset += offset; + } while(!complete); } -void -vec4_vs_visitor::emit_thread_end() -{ - /* For VS, we always end the thread by emitting a single vertex. - * emit_urb_write_opcode() will take care of setting the eot flag on the - * SEND instruction. - */ - emit_vertex(); -} src_reg vec4_visitor::get_scratch_offset(vec4_instruction *inst, @@ -2784,7 +2845,7 @@ vec4_visitor::get_scratch_offset(vec4_instruction *inst, /* Pre-gen6, the message header uses byte offsets instead of vec4 * (16-byte) offset units. */ - if (intel->gen < 6) + if (brw->gen < 6) message_header_scale *= 16; if (reladdr) { @@ -2812,13 +2873,13 @@ vec4_visitor::get_pull_constant_offset(vec4_instruction *inst, /* Pre-gen6, the message header uses byte offsets instead of vec4 * (16-byte) offset units. */ - if (intel->gen < 6) { + if (brw->gen < 6) { emit_before(inst, MUL(dst_reg(index), index, src_reg(16))); } return index; } else { - int message_header_scale = intel->gen < 6 ? 16 : 1; + int message_header_scale = brw->gen < 6 ? 16 : 1; return src_reg(reg_offset * message_header_scale); } } @@ -2967,11 +3028,11 @@ vec4_visitor::emit_pull_constant_load(vec4_instruction *inst, int base_offset) { int reg_offset = base_offset + orig_src.reg_offset; - src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER); + src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start); src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset); vec4_instruction *load; - if (intel->gen >= 7) { + if (brw->gen >= 7) { dst_reg grf_offset = dst_reg(this, glsl_type::int_type); grf_offset.type = offset.type; emit_before(inst, MOV(grf_offset, offset)); @@ -3086,8 +3147,7 @@ vec4_visitor::vec4_visitor(struct brw_context *brw, : debug_flag(debug_flag) { this->brw = brw; - this->intel = &brw->intel; - this->ctx = &intel->ctx; + this->ctx = &brw->ctx; this->shader_prog = shader_prog; this->shader = shader; @@ -3107,8 +3167,8 @@ vec4_visitor::vec4_visitor(struct brw_context *brw, hash_table_pointer_hash, hash_table_pointer_compare); - this->virtual_grf_def = NULL; - this->virtual_grf_use = NULL; + this->virtual_grf_start = NULL; + this->virtual_grf_end = NULL; this->virtual_grf_sizes = NULL; this->virtual_grf_count = 0; this->virtual_grf_reg_map = NULL; @@ -3116,7 +3176,7 @@ vec4_visitor::vec4_visitor(struct brw_context *brw, this->virtual_grf_array_size = 0; this->live_intervals_valid = false; - this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; + this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; this->uniforms = 0; } @@ -3127,21 +3187,6 @@ vec4_visitor::~vec4_visitor() } -vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw, - struct brw_vs_compile *vs_compile, - struct brw_vs_prog_data *vs_prog_data, - struct gl_shader_program *prog, - struct brw_shader *shader, - void *mem_ctx) - : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base, - &vs_compile->key.base, &vs_prog_data->base, prog, shader, - mem_ctx, INTEL_DEBUG & DEBUG_VS), - vs_compile(vs_compile), - vs_prog_data(vs_prog_data) -{ -} - - void vec4_visitor::fail(const char *format, ...) {