+/**
+ * Splits virtual GRFs requesting more than one contiguous physical register.
+ *
+ * We initially create large virtual GRFs for temporary structures, arrays,
+ * and matrices, so that the dereference visitor functions can add reg_offsets
+ * to work their way down to the actual member being accessed. But when it
+ * comes to optimization, we'd like to treat each register as individual
+ * storage if possible.
+ *
+ * So far, the only thing that might prevent splitting is a send message that
+ * sources its payload from a GRF on gen7+ (IVB and later).
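+ *
+ * For example, a mat4 temporary starts out as a single vgrf of size 4; after
+ * this pass each column lives in its own size-1 vgrf, so the optimization
+ * passes can track the columns independently.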
+ */
+void
+vec4_visitor::split_virtual_grfs()
+{
+ int num_vars = this->virtual_grf_count;
+ int new_virtual_grf[num_vars];
+ bool split_grf[num_vars];
+
+ memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
+
+   /* Try to split anything larger than one register. */
+ for (int i = 0; i < num_vars; i++) {
+ split_grf[i] = this->virtual_grf_sizes[i] != 1;
+ }
+
+ /* Check that the instructions are compatible with the registers we're trying
+ * to split.
+ */
+ foreach_list(node, &this->instructions) {
+ vec4_instruction *inst = (vec4_instruction *)node;
+
+      /* A SEND message that sources its payload from a GRF on gen7+ needs
+       * that GRF's registers to stay contiguous, so don't split them.
+       */
+ if (inst->is_send_from_grf()) {
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == GRF) {
+ split_grf[inst->src[i].reg] = false;
+ }
+ }
+ }
+ }
+
+   /* Allocate new space for the split regs. reg_offset 0 stays in the
+    * original vgrf; offsets 1..size-1 get fresh vgrfs, which the allocator
+    * hands out contiguously so the remapping below can find them by offset.
+    */
+ for (int i = 0; i < num_vars; i++) {
+ if (!split_grf[i])
+ continue;
+
+ new_virtual_grf[i] = virtual_grf_alloc(1);
+ for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
+ int reg = virtual_grf_alloc(1);
+ assert(reg == new_virtual_grf[i] + j - 1);
+ (void) reg;
+ }
+ this->virtual_grf_sizes[i] = 1;
+ }
+
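+   /* Rewrite every use: reg_offset 0 keeps the original vgrf number, and
+    * reg_offset r > 0 becomes new_virtual_grf[reg] + r - 1.
+    */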
+ foreach_list(node, &this->instructions) {
+ vec4_instruction *inst = (vec4_instruction *)node;
+
+ if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
+ inst->dst.reg_offset != 0) {
+ inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
+ inst->dst.reg_offset - 1);
+ inst->dst.reg_offset = 0;
+ }
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
+ inst->src[i].reg_offset != 0) {
+ inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
+ inst->src[i].reg_offset - 1);
+ inst->src[i].reg_offset = 0;
+ }
+ }
+ }
+ this->live_intervals_valid = false;
+}
+
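+/**
+ * Prints one instruction in a compact text form, roughly (a sketch with
+ * made-up register numbers, not output captured from a real run):
+ *
+ *   add vgrf7.0.xy, vgrf3.xyzw, u0.xxxx, (null).xxxx
+ *
+ * i.e. the destination with its reg_offset and writemask, then the three
+ * sources with their swizzles.
+ */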
+void
+vec4_visitor::dump_instruction(backend_instruction *be_inst)
+{
+ vec4_instruction *inst = (vec4_instruction *)be_inst;
+
+ printf("%s ", brw_instruction_name(inst->opcode));
+
+ switch (inst->dst.file) {
+ case GRF:
+ printf("vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
+ break;
+ case MRF:
+ printf("m%d", inst->dst.reg);
+ break;
+ case BAD_FILE:
+ printf("(null)");
+ break;
+ default:
+ printf("???");
+ break;
+ }
+ if (inst->dst.writemask != WRITEMASK_XYZW) {
+ printf(".");
+ if (inst->dst.writemask & 1)
+ printf("x");
+ if (inst->dst.writemask & 2)
+ printf("y");
+ if (inst->dst.writemask & 4)
+ printf("z");
+ if (inst->dst.writemask & 8)
+ printf("w");
+ }
+ printf(", ");
+
+ for (int i = 0; i < 3; i++) {
+ switch (inst->src[i].file) {
+ case GRF:
+ printf("vgrf%d", inst->src[i].reg);
+ break;
+ case ATTR:
+ printf("attr%d", inst->src[i].reg);
+ break;
+ case UNIFORM:
+ printf("u%d", inst->src[i].reg);
+ break;
+ case IMM:
+ switch (inst->src[i].type) {
+ case BRW_REGISTER_TYPE_F:
+ printf("%fF", inst->src[i].imm.f);
+ break;
+ case BRW_REGISTER_TYPE_D:
+ printf("%dD", inst->src[i].imm.i);
+ break;
+ case BRW_REGISTER_TYPE_UD:
+ printf("%uU", inst->src[i].imm.u);
+ break;
+ default:
+ printf("???");
+ break;
+ }
+ break;
+ case BAD_FILE:
+ printf("(null)");
+ break;
+ default:
+ printf("???");
+ break;
+ }
+
+ if (inst->src[i].reg_offset)
+ printf(".%d", inst->src[i].reg_offset);
+
+ static const char *chans[4] = {"x", "y", "z", "w"};
+ printf(".");
+ for (int c = 0; c < 4; c++) {
+ printf("%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
+ }
+
+      if (i < 2)
+         printf(", ");
+ }
+
+ printf("\n");
+}
+
+/**
+ * Replace each register of type ATTR in this->instructions with a reference
+ * to a fixed HW register.
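+ *
+ * For example, if attribute_map[2] == 5, a source reading attr2 becomes a
+ * HW_REG reference to g5, with the swizzle, type, abs, and negate flags
+ * carried over (a worked illustration of the loop below).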
+ */
+void
+vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map)
+{
+ foreach_list(node, &this->instructions) {
+ vec4_instruction *inst = (vec4_instruction *)node;
+
+ /* We have to support ATTR as a destination for GL_FIXED fixup. */
+ if (inst->dst.file == ATTR) {
+ int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];
+
+         /* All attributes used in the shader need to have been assigned a
+          * hardware register by the caller.
+          */
+ assert(grf != 0);
+
+ struct brw_reg reg = brw_vec8_grf(grf, 0);
+ reg.type = inst->dst.type;
+ reg.dw1.bits.writemask = inst->dst.writemask;
+
+ inst->dst.file = HW_REG;
+ inst->dst.fixed_hw_reg = reg;
+ }
+
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file != ATTR)
+ continue;
+
+ int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];
+
+         /* All attributes used in the shader need to have been assigned a
+          * hardware register by the caller.
+          */
+ assert(grf != 0);
+
+ struct brw_reg reg = brw_vec8_grf(grf, 0);
+ reg.dw1.bits.swizzle = inst->src[i].swizzle;
+ reg.type = inst->src[i].type;
+ if (inst->src[i].abs)
+ reg = brw_abs(reg);
+ if (inst->src[i].negate)
+ reg = negate(reg);
+
+ inst->src[i].file = HW_REG;
+ inst->src[i].fixed_hw_reg = reg;
+ }
+ }
+}
+
+int
+vec4_vs_visitor::setup_attributes(int payload_reg)
+{
+ int nr_attributes;
+ int attribute_map[VERT_ATTRIB_MAX + 1];
+ memset(attribute_map, 0, sizeof(attribute_map));
+
+ nr_attributes = 0;
+ for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
+ if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
+ attribute_map[i] = payload_reg + nr_attributes;
+ nr_attributes++;
+ }
+ }
+
+ /* VertexID is stored by the VF as the last vertex element, but we
+ * don't represent it with a flag in inputs_read, so we call it
+ * VERT_ATTRIB_MAX.
+ */
+ if (vs_prog_data->uses_vertexid) {
+ attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
+ nr_attributes++;
+ }
+
+ lower_attributes_to_hw_regs(attribute_map);
+
+ /* The BSpec says we always have to read at least one thing from
+ * the VF, and it appears that the hardware wedges otherwise.
+ */
+ if (nr_attributes == 0)
+ nr_attributes = 1;
+
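+   /* The URB read length is measured in pairs of vec4 slots, so round up. */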
+ prog_data->urb_read_length = (nr_attributes + 1) / 2;
+
+ unsigned vue_entries =
+ MAX2(nr_attributes, prog_data->vue_map.num_slots);
+
+ if (brw->gen == 6)
+ prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8;
+ else
+ prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4;
+
+ return payload_reg + nr_attributes;
+}
+
+int
+vec4_visitor::setup_uniforms(int reg)
+{
+ prog_data->dispatch_grf_start_reg = reg;
+
+ /* The pre-gen6 VS requires that some push constants get loaded no
+ * matter what, or the GPU would hang.
+ */
+ if (brw->gen < 6 && this->uniforms == 0) {
+ this->uniform_vector_size[this->uniforms] = 1;
+
+ prog_data->param = reralloc(NULL, prog_data->param, const float *, 4);
+ for (unsigned int i = 0; i < 4; i++) {
+ unsigned int slot = this->uniforms * 4 + i;
+ static float zero = 0.0;
+ prog_data->param[slot] = &zero;
+ }
+
+ this->uniforms++;
+ reg++;
+ } else {
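+      /* Each payload register holds two vec4 uniforms, so round up. */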
+ reg += ALIGN(uniforms, 2) / 2;
+ }
+
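+   /* nr_params counts individual float components: four per vec4 uniform. */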
+ prog_data->nr_params = this->uniforms * 4;
+
+ prog_data->curb_read_length = reg - prog_data->dispatch_grf_start_reg;
+
+ return reg;
+}
+
+void
+vec4_vs_visitor::setup_payload(void)
+{
+ int reg = 0;
+
+ /* The payload always contains important data in g0, which contains
+ * the URB handles that are passed on to the URB write at the end
+ * of the thread. So, we always start push constants at g1.
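+    *
+    * The resulting layout is g0 (the header), then the push constants set up
+    * by setup_uniforms(), then the vertex attributes.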
+ */
+ reg++;
+
+ reg = setup_uniforms(reg);
+
+ reg = setup_attributes(reg);
+
+ this->first_non_payload_grf = reg;
+}
+
+src_reg
+vec4_visitor::get_timestamp()
+{
+ assert(brw->gen >= 7);
+
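+   /* The timestamp is exposed through the TM0 architecture register; copy
+    * it into a vec4 temporary so later code can pick out individual fields.
+    */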
+ src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_TIMESTAMP,
+ 0,
+ BRW_REGISTER_TYPE_UD,
+ BRW_VERTICAL_STRIDE_0,
+ BRW_WIDTH_4,
+ BRW_HORIZONTAL_STRIDE_4,
+ BRW_SWIZZLE_XYZW,
+ WRITEMASK_XYZW));
+
+ dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
+
+ vec4_instruction *mov = emit(MOV(dst, ts));
+   /* We want to read the 3 fields we care about even if they're not enabled
+    * in the dispatch mask: mostly field 0 (the timestamp itself), but also
+    * field 2, which emit_shader_time_end() checks for timestamp resets.
+    */
+ mov->force_writemask_all = true;
+
+ return src_reg(dst);
+}
+
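+/**
+ * Records the thread-start timestamp so emit_shader_time_end() can compute
+ * how long the shader ran (used when INTEL_DEBUG=shader_time is set).
+ */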
+void
+vec4_visitor::emit_shader_time_begin()
+{
+ current_annotation = "shader time start";
+ shader_start_time = get_timestamp();
+}
+
+void
+vec4_visitor::emit_shader_time_end()
+{
+ current_annotation = "shader time end";
+ src_reg shader_end_time = get_timestamp();
+
+ /* Check that there weren't any timestamp reset events (assuming these
+ * were the only two timestamp reads that happened).
+ */
+ src_reg reset_end = shader_end_time;
+ reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
+ vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
+ test->conditional_mod = BRW_CONDITIONAL_Z;
+
+ emit(IF(BRW_PREDICATE_NORMAL));
+
+ /* Take the current timestamp and get the delta. */
+ shader_start_time.negate = true;
+ dst_reg diff = dst_reg(this, glsl_type::uint_type);
+ emit(ADD(diff, shader_start_time, shader_end_time));
+
+   /* If there were no instructions between the two timestamp gets, the diff
+    * is 2 cycles. Remove that overhead so it doesn't skew the timing of
+    * individual instructions.
+    */
+ emit(ADD(diff, src_reg(diff), src_reg(-2u)));
+
+ emit_shader_time_write(ST_VS, src_reg(diff));
+ emit_shader_time_write(ST_VS_WRITTEN, src_reg(1u));
+ emit(BRW_OPCODE_ELSE);
+ emit_shader_time_write(ST_VS_RESET, src_reg(1u));
+ emit(BRW_OPCODE_ENDIF);
+}
+
+void
+vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
+ src_reg value)
+{
+ int shader_time_index =
+ brw_get_shader_time_index(brw, shader_prog, prog, type);
+
+ dst_reg dst =
+ dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
+
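+   /* The SHADER_TIME_ADD payload is two consecutive vec4s: the buffer
+    * offset in the first, the value to accumulate in the second.
+    */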
+ dst_reg offset = dst;
+ dst_reg time = dst;
+ time.reg_offset++;
+
+ offset.type = BRW_REGISTER_TYPE_UD;
+ emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE)));
+
+ time.type = BRW_REGISTER_TYPE_UD;
+ emit(MOV(time, src_reg(value)));
+
+ emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
+}
+
+bool
+vec4_visitor::run()
+{
+ sanity_param_count = prog->Parameters->NumParameters;
+
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ emit_shader_time_begin();
+
+ assign_common_binding_table_offsets(0);
+
+ emit_prolog();
+
+   /* Generate VS IR for main(). (The visitor only descends into
+    * functions called "main".)
+    */
+ if (shader) {
+ visit_instructions(shader->ir);
+ } else {
+ emit_program_code();
+ }
+ base_ir = NULL;
+
+ if (key->userclip_active && !prog->UsesClipDistanceOut)
+ setup_uniform_clipplane_values();
+
+ emit_thread_end();
+
+ /* Before any optimization, push array accesses out to scratch
+ * space where we need them to be. This pass may allocate new
+ * virtual GRFs, so we want to do it early. It also makes sure
+ * that we have reladdr computations available for CSE, since we'll
+ * often do repeated subexpressions for those.
+ */
+ if (shader) {
+ move_grf_array_access_to_scratch();
+ move_uniform_array_access_to_pull_constants();
+ } else {
+ /* The ARB_vertex_program frontend emits pull constant loads directly
+ * rather than using reladdr, so we don't need to walk through all the
+ * instructions looking for things to move. There isn't anything.
+ *
+ * We do still need to split things to vec4 size.
+ */
+ split_uniform_registers();
+ }
+ pack_uniform_registers();
+ move_push_constants_to_pull_constants();
+ split_virtual_grfs();
+
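+   /* Run the optimization passes to a fixed point; each pass can expose
+    * opportunities for the others.
+    */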
+ bool progress;
+ do {
+ progress = false;
+ progress = dead_code_eliminate() || progress;
+ progress = opt_copy_propagation() || progress;
+ progress = opt_algebraic() || progress;
+ progress = opt_register_coalesce() || progress;
+ } while (progress);
+
+ if (failed)
+ return false;
+
+ setup_payload();
+
+ if (false) {
+ /* Debug of register spilling: Go spill everything. */
+ const int grf_count = virtual_grf_count;
+ float spill_costs[virtual_grf_count];
+ bool no_spill[virtual_grf_count];
+ evaluate_spill_costs(spill_costs, no_spill);
+ for (int i = 0; i < grf_count; i++) {
+ if (no_spill[i])
+ continue;
+ spill_reg(i);
+ }
+ }
+
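+   /* reg_allocate() returns false after spilling (or flagging failure), so
+    * keep retrying until allocation succeeds or the compile has failed.
+    */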
+ while (!reg_allocate()) {
+ if (failed)
+ break;
+ }
+
+ opt_schedule_instructions();
+
+ opt_set_dependency_control();
+
+ /* If any state parameters were appended, then ParameterValues could have
+ * been realloced, in which case the driver uniform storage set up by
+ * _mesa_associate_uniform_storage() would point to freed memory. Make
+ * sure that didn't happen.
+ */
+ assert(sanity_param_count == prog->Parameters->NumParameters);
+
+ return !failed;
+}
+