From 8b6a797d743be38396fcaf4a2f7fb01d3bcd9ba3 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Kristian=20H=C3=B8gsberg?=
Date: Mon, 27 Oct 2014 22:42:50 -0700
Subject: [PATCH] i965: Add fs_visitor::run_vs() to generate scalar vertex shader code
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This patch uses the previous refactoring to add a new run_vs() method
that generates vertex shader code using the scalar visitor and
optimizer.

Signed-off-by: Kristian Høgsberg
Reviewed-by: Kenneth Graunke
---
 src/mesa/drivers/dri/i965/brw_fs.cpp         | 111 ++++++-
 src/mesa/drivers/dri/i965/brw_fs.h           |  21 +-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 317 ++++++++++++++++++-
 3 files changed, 436 insertions(+), 13 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index b1afe46b6b2..16f8b32639c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1808,6 +1808,61 @@ fs_visitor::assign_urb_setup()
       urb_start + prog_data->num_varying_inputs * 2;
 }
 
+void
+fs_visitor::assign_vs_urb_setup()
+{
+   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
+   int grf, count, slot, channel, attr;
+
+   assert(stage == MESA_SHADER_VERTEX);
+   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
+   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
+      count++;
+
+   /* Each attribute is 4 regs. */
+   this->first_non_payload_grf =
+      payload.num_regs + prog_data->curb_read_length + count * 4;
+
+   unsigned vue_entries =
+      MAX2(count, vs_prog_data->base.vue_map.num_slots);
+
+   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
+   vs_prog_data->base.urb_read_length = (count + 1) / 2;
+
+   assert(vs_prog_data->base.urb_read_length <= 15);
+
+   /* Rewrite all ATTR file references to the hw grf that they land in. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == ATTR) {
+
+            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
+               slot = count - 1;
+            } else {
+               /* Attributes come in a contiguous block, ordered by their
+                * gl_vert_attrib value.  That means we can compute the slot
+                * number for an attribute by masking out the enabled
+                * attributes before it and counting the bits.
+                */
+               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
+               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
+                                        BITFIELD64_MASK(attr));
+            }
+
+            channel = inst->src[i].reg_offset & 3;
+
+            grf = payload.num_regs +
+               prog_data->curb_read_length +
+               slot * 4 + channel;
+
+            inst->src[i].file = HW_REG;
+            inst->src[i].fixed_hw_reg =
+               retype(brw_vec8_grf(grf, 0), inst->src[i].type);
+         }
+      }
+   }
+}
+
 /**
  * Split large virtual GRFs into separate components if we can.
  *
@@ -3395,6 +3450,13 @@ fs_visitor::setup_payload_gen6()
    }
 }
 
+void
+fs_visitor::setup_vs_payload()
+{
+   /* R0: thread header, R1: urb handles */
+   payload.num_regs = 2;
+}
+
 void
 fs_visitor::assign_binding_table_offsets()
 {
@@ -3433,6 +3495,8 @@ fs_visitor::calculate_register_pressure()
 void
 fs_visitor::optimize()
 {
+   const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
+
    calculate_cfg();
 
    split_virtual_grfs();
@@ -3447,8 +3511,8 @@ fs_visitor::optimize()
                                                                        \
       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
          char filename[64];                                            \
-         snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass,          \
-                  dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
+         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,          \
+                  stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                        \
          backend_visitor::dump_instructions(filename);                 \
       }                                                                \
@@ -3458,8 +3522,8 @@ fs_visitor::optimize()
 
    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
       char filename[64];
-      snprintf(filename, 64, "fs%d-%04d-00-start",
-               dispatch_width, shader_prog ? shader_prog->Name : 0);
+      snprintf(filename, 64, "%s%d-%04d-00-start",
+               stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
 
       backend_visitor::dump_instructions(filename);
    }
@@ -3527,6 +3591,9 @@ fs_visitor::allocate_registers()
    }
 
    if (!allocated_without_spills) {
+      const char *stage_name = stage == MESA_SHADER_VERTEX ?
+         "Vertex" : "Fragment";
+
      /* We assume that any spilling is worse than just dropping back to
       * SIMD8.  There's probably actually some intermediate point where
       * SIMD16 with a couple of spills is still better.
@@ -3535,9 +3602,9 @@ fs_visitor::allocate_registers()
         fail("Failure to register allocate.  Reduce number of "
              "live scalar values to avoid this.");
      } else {
-        perf_debug("Fragment shader triggered register spilling.  "
+        perf_debug("%s shader triggered register spilling.  "
                    "Try reducing the number of live scalar values to "
-                   "improve performance.\n");
+                   "improve performance.\n", stage_name);
 
      /* Since we're out of heuristics, just go spill registers until we
       * get an allocation.
@@ -3565,6 +3632,38 @@ fs_visitor::allocate_registers()
       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
 }
 
+bool
+fs_visitor::run_vs()
+{
+   assert(stage == MESA_SHADER_VERTEX);
+
+   assign_common_binding_table_offsets(0);
+   setup_vs_payload();
+
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      emit_shader_time_begin();
+
+   foreach_in_list(ir_instruction, ir, shader->base.ir) {
+      base_ir = ir;
+      this->result = reg_undef;
+      ir->accept(this);
+   }
+   base_ir = NULL;
+   if (failed)
+      return false;
+
+   emit_urb_writes();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_vs_urb_setup();
+
+   allocate_registers();
+
+   return !failed;
+}
+
 bool
 fs_visitor::run()
 {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index a674a0256a4..84a0b101e5b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -308,12 +308,23 @@ public:
               struct gl_shader_program *shader_prog,
               struct gl_fragment_program *fp,
               unsigned dispatch_width);
+
+   fs_visitor(struct brw_context *brw,
+              void *mem_ctx,
+              const struct brw_vs_prog_key *key,
+              struct brw_vs_prog_data *prog_data,
+              struct gl_shader_program *shader_prog,
+              struct gl_vertex_program *cp,
+              unsigned dispatch_width);
+
    ~fs_visitor();
    void init();
 
    fs_reg *variable_storage(ir_variable *var);
    int virtual_grf_alloc(int size);
    void import_uniforms(fs_visitor *v);
+   void setup_uniform_clipplane_values();
+   void compute_clip_distance();
 
    void visit(ir_variable *ir);
    void visit(ir_assignment *ir);
@@ -404,14 +415,17 @@ public:
                     uint32_t const_offset);
 
    bool run();
+   bool run_vs();
    void optimize();
    void allocate_registers();
    void assign_binding_table_offsets();
    void setup_payload_gen4();
    void setup_payload_gen6();
+   void setup_vs_payload();
    void assign_curb_setup();
    void calculate_urb_setup();
    void assign_urb_setup();
+   void assign_vs_urb_setup();
    bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
    void get_used_mrfs(bool *mrf_used);
@@ -465,6 +479,7 @@ public:
    fs_reg *emit_samplepos_setup();
    fs_reg *emit_sampleid_setup();
    fs_reg *emit_general_interpolation(ir_variable *ir);
+   fs_reg *emit_vs_system_value(enum brw_reg_type type, int location);
    void emit_interpolation_setup_gen4();
    void emit_interpolation_setup_gen6();
    void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
@@ -552,6 +567,7 @@ public:
    fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
                                  fs_reg src0_alpha, unsigned components);
    void emit_fb_writes();
+   void emit_urb_writes();
 
    void emit_shader_time_begin();
    void emit_shader_time_end();
@@ -627,8 +643,8 @@ public:
    struct hash_table *variable_ht;
    fs_reg frag_depth;
    fs_reg sample_mask;
-   fs_reg outputs[BRW_MAX_DRAW_BUFFERS];
-   unsigned output_components[BRW_MAX_DRAW_BUFFERS];
+   fs_reg outputs[VARYING_SLOT_MAX];
+   unsigned output_components[VARYING_SLOT_MAX];
    fs_reg dual_src_output;
    bool do_dual_src;
    int first_non_payload_grf;
@@ -675,6 +691,7 @@ public:
    fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
    fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
    fs_reg shader_start_time;
+   fs_reg userplane[MAX_CLIP_PLANES];
 
    int grf_used;
    bool spilled_any_registers;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index a9f5474cea3..399e772e3c5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -43,11 +43,40 @@ extern "C" {
 #include "brw_eu.h"
 #include "brw_wm.h"
 }
+#include "brw_vec4.h"
 #include "brw_fs.h"
 #include "main/uniforms.h"
 #include "glsl/glsl_types.h"
 #include "glsl/ir_optimization.h"
 
+fs_reg *
+fs_visitor::emit_vs_system_value(enum brw_reg_type type, int location)
+{
+   fs_reg *reg = new(this->mem_ctx)
+      fs_reg(ATTR, VERT_ATTRIB_MAX, type);
+   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
+
+   switch (location) {
+   case SYSTEM_VALUE_BASE_VERTEX:
+      reg->reg_offset = 0;
+      vs_prog_data->uses_vertexid = true;
+      break;
+   case SYSTEM_VALUE_VERTEX_ID:
+   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+      reg->reg_offset = 2;
+      vs_prog_data->uses_vertexid = true;
+      break;
+   case SYSTEM_VALUE_INSTANCE_ID:
+      reg->reg_offset = 3;
+      vs_prog_data->uses_instanceid = true;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   return reg;
+}
+
 void
 fs_visitor::visit(ir_variable *ir)
 {
@@ -58,7 +87,11 @@ fs_visitor::visit(ir_variable *ir)
    if (ir->data.mode == ir_var_shader_in) {
       assert(ir->data.location != -1);
 
-      if (!strcmp(ir->name, "gl_FragCoord")) {
+      if (stage == MESA_SHADER_VERTEX) {
+         reg = new(this->mem_ctx)
+            fs_reg(ATTR, ir->data.location,
+                   brw_type_for_base_type(ir->type->get_scalar_type()));
+      } else if (!strcmp(ir->name, "gl_FragCoord")) {
          reg = emit_fragcoord_interpolation(ir);
       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
          reg = emit_frontfacing_interpolation();
@@ -71,7 +104,19 @@
    } else if (ir->data.mode == ir_var_shader_out) {
       reg = new(this->mem_ctx) fs_reg(this, ir->type);
 
-      if (ir->data.index > 0) {
+      if (stage == MESA_SHADER_VERTEX) {
+         int vector_elements =
+            ir->type->is_array() ? ir->type->fields.array->vector_elements
+                                 : ir->type->vector_elements;
+
+         for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
+            int output = ir->data.location + i;
+            this->outputs[output] = *reg;
+            this->outputs[output].reg_offset = i * 4;
+            this->output_components[output] = vector_elements;
+         }
+
+      } else if (ir->data.index > 0) {
          assert(ir->data.location == FRAG_RESULT_DATA0);
          assert(ir->data.index == 1);
          this->dual_src_output = *reg;
@@ -135,15 +180,26 @@
       reg->type = brw_type_for_base_type(ir->type);
 
    } else if (ir->data.mode == ir_var_system_value) {
-      if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) {
+      switch (ir->data.location) {
+      case SYSTEM_VALUE_BASE_VERTEX:
+      case SYSTEM_VALUE_VERTEX_ID:
+      case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+      case SYSTEM_VALUE_INSTANCE_ID:
+         reg = emit_vs_system_value(brw_type_for_base_type(ir->type),
+                                    ir->data.location);
+         break;
+      case SYSTEM_VALUE_SAMPLE_POS:
          reg = emit_samplepos_setup();
-      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) {
+         break;
+      case SYSTEM_VALUE_SAMPLE_ID:
          reg = emit_sampleid_setup();
-      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN) {
+         break;
+      case SYSTEM_VALUE_SAMPLE_MASK_IN:
          assert(brw->gen >= 7);
          reg = new(mem_ctx)
            fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
                          BRW_REGISTER_TYPE_D));
+         break;
       }
    }
 
@@ -1770,6 +1826,8 @@ get_tex(gl_shader_stage stage, const void *key)
    switch (stage) {
    case MESA_SHADER_FRAGMENT:
       return &((brw_wm_prog_key*) key)->tex;
+   case MESA_SHADER_VERTEX:
+      return &((brw_vue_prog_key*) key)->tex;
    default:
      unreachable("unhandled shader stage");
    }
@@ -3448,6 +3506,236 @@ fs_visitor::emit_fb_writes()
    this->current_annotation = NULL;
 }
 
+void
+fs_visitor::setup_uniform_clipplane_values()
+{
+   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
+   const struct brw_vue_prog_key *key =
+      (const struct brw_vue_prog_key *) this->key;
+
+   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
+      this->userplane[i] = fs_reg(UNIFORM, uniforms);
+      for (int j = 0; j < 4; ++j) {
+         stage_prog_data->param[uniforms + j] =
+            (gl_constant_value *) &clip_planes[i][j];
+      }
+      uniforms += 4;
+   }
+}
+
+void fs_visitor::compute_clip_distance()
+{
+   struct brw_vue_prog_data *vue_prog_data =
+      (struct brw_vue_prog_data *) prog_data;
+   const struct brw_vue_prog_key *key =
+      (const struct brw_vue_prog_key *) this->key;
+
+   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
+    *
+    *     "If a linked set of shaders forming the vertex stage contains no
+    *     static write to gl_ClipVertex or gl_ClipDistance, but the
+    *     application has requested clipping against user clip planes through
+    *     the API, then the coordinate written to gl_Position is used for
+    *     comparison against the user clip planes."
+    *
+    * This function is only called if the shader didn't write to
+    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
+    * if the user wrote to it; otherwise we use gl_Position.
+    */
+
+   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
+   if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
+      clip_vertex = VARYING_SLOT_POS;
+
+   /* If the clip vertex isn't written, skip this.  Typically this means
+    * the GS will set up clipping.
+    */
+   if (outputs[clip_vertex].file == BAD_FILE)
+      return;
+
+   setup_uniform_clipplane_values();
+
+   current_annotation = "user clip distances";
+
+   this->outputs[VARYING_SLOT_CLIP_DIST0] = fs_reg(this, glsl_type::vec4_type);
+   this->outputs[VARYING_SLOT_CLIP_DIST1] = fs_reg(this, glsl_type::vec4_type);
+
+   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
+      fs_reg u = userplane[i];
+      fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
+      output.reg_offset = i & 3;
+
+      emit(MUL(output, outputs[clip_vertex], u));
+      for (int j = 1; j < 4; j++) {
+         u.reg = userplane[i].reg + j;
+         emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
+      }
+   }
+}
+
+void
+fs_visitor::emit_urb_writes()
+{
+   int slot, urb_offset, length;
+   struct brw_vs_prog_data *vs_prog_data =
+      (struct brw_vs_prog_data *) prog_data;
+   const struct brw_vs_prog_key *key =
+      (const struct brw_vs_prog_key *) this->key;
+   const GLbitfield64 psiz_mask =
+      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
+   const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
+   bool flush;
+   fs_reg sources[8];
+
+   /* Lower legacy ff and ClipVertex clipping to clip distances */
+   if (key->base.userclip_active && !prog->UsesClipDistanceOut)
+      compute_clip_distance();
+
+   /* If we don't have any valid slots to write, just do a minimal urb write
+    * send to terminate the shader. */
+   if (vue_map->slots_valid == 0) {
+
+      fs_reg payload = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
+      fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
+                                                      BRW_REGISTER_TYPE_UD))));
+      inst->force_writemask_all = true;
+
+      inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+      inst->eot = true;
+      inst->mlen = 1;
+      inst->offset = 1;
+      return;
+   }
+
+   length = 0;
+   urb_offset = 0;
+   flush = false;
+   for (slot = 0; slot < vue_map->num_slots; slot++) {
+      fs_reg reg, src, zero;
+
+      int varying = vue_map->slot_to_varying[slot];
+      switch (varying) {
+      case VARYING_SLOT_PSIZ:
+
+         /* The point size varying slot is the vue header and is always in the
+          * vue map.  But often none of the special varyings that live there
+          * are written and in that case we can skip writing to the vue
+          * header, provided the corresponding state properly clamps the
+          * values further down the pipeline. */
+         if ((vue_map->slots_valid & psiz_mask) == 0) {
+            assert(length == 0);
+            urb_offset++;
+            break;
+         }
+
+         zero = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
+         emit(MOV(zero, fs_reg(0u)));
+
+         sources[length++] = zero;
+         if (vue_map->slots_valid & VARYING_BIT_LAYER)
+            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
+         else
+            sources[length++] = zero;
+
+         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
+            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
+         else
+            sources[length++] = zero;
+
+         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
+            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
+         else
+            sources[length++] = zero;
+         break;
+
+      case BRW_VARYING_SLOT_NDC:
+      case VARYING_SLOT_EDGE:
+         unreachable("unexpected scalar vs output");
+         break;
+
+      case BRW_VARYING_SLOT_PAD:
+         break;
+
+      default:
+         /* gl_Position is always in the vue map, but isn't always written by
+          * the shader.  Other varyings (clip distances) get added to the vue
+          * map but don't always get written.  In those cases, the
+          * corresponding this->output[] slot will be invalid and we can skip
+          * the urb write for the varying.  If we've already queued up a vue
+          * slot for writing we flush a mlen 5 urb write, otherwise we just
+          * advance the urb_offset.
+          */
+         if (this->outputs[varying].file == BAD_FILE) {
+            if (length > 0)
+               flush = true;
+            else
+               urb_offset++;
+            break;
+         }
+
+         if ((varying == VARYING_SLOT_COL0 ||
+              varying == VARYING_SLOT_COL1 ||
+              varying == VARYING_SLOT_BFC0 ||
+              varying == VARYING_SLOT_BFC1) &&
+             key->clamp_vertex_color) {
+            /* We need to clamp these guys, so do a saturating MOV into a
+             * temp register and use that for the payload.
+             */
+            for (int i = 0; i < 4; i++) {
+               reg = fs_reg(GRF, virtual_grf_alloc(1), outputs[varying].type);
+               src = offset(this->outputs[varying], i);
+               fs_inst *inst = emit(MOV(reg, src));
+               inst->saturate = true;
+               sources[length++] = reg;
+            }
+         } else {
+            for (int i = 0; i < 4; i++)
+               sources[length++] = offset(this->outputs[varying], i);
+         }
+         break;
+      }
+
+      current_annotation = "URB write";
+
+      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
+       * the last slot or if we need to flush (see BAD_FILE varying case
+       * above), emit a URB write send now to flush out the data.
+       */
+      int last = slot == vue_map->num_slots - 1;
+      if (length == 8 || last)
+         flush = true;
+      if (flush) {
+         if (last && (INTEL_DEBUG & DEBUG_SHADER_TIME))
+            emit_shader_time_end();
+
+         fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
+         fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length + 1),
+                                 BRW_REGISTER_TYPE_F);
+
+         /* We need WE_all on the MOV for the message header (the URB handles)
+          * so do a MOV to a dummy register and set force_writemask_all on the
+          * MOV.  LOAD_PAYLOAD will preserve that.
+          */
+         fs_reg dummy = fs_reg(GRF, virtual_grf_alloc(1),
+                               BRW_REGISTER_TYPE_UD);
+         fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
+                                                       BRW_REGISTER_TYPE_UD))));
+         inst->force_writemask_all = true;
+         payload_sources[0] = dummy;
+
+         memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
+         emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
+
+         inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+         inst->eot = last;
+         inst->mlen = length + 1;
+         inst->offset = urb_offset;
+         urb_offset = slot + 1;
+         length = 0;
+         flush = false;
+      }
+   }
+}
+
 void
 fs_visitor::resolve_ud_negate(fs_reg *reg)
 {
@@ -3500,6 +3788,25 @@ fs_visitor::fs_visitor(struct brw_context *brw,
    init();
 }
 
+fs_visitor::fs_visitor(struct brw_context *brw,
+                       void *mem_ctx,
+                       const struct brw_vs_prog_key *key,
+                       struct brw_vs_prog_data *prog_data,
+                       struct gl_shader_program *shader_prog,
+                       struct gl_vertex_program *cp,
+                       unsigned dispatch_width)
+   : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
+                     MESA_SHADER_VERTEX),
+     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
+     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
+     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
+     key(key), prog_data(&prog_data->base.base),
+     dispatch_width(dispatch_width)
+{
+   this->mem_ctx = mem_ctx;
+   init();
+}
+
 void
 fs_visitor::init()
 {
-- 
2.30.2
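
Reviewer note: this patch only adds the scalar-VS entry points (the new
fs_visitor constructor, run_vs(), emit_urb_writes() and friends); nothing in
the driver calls them yet. For context, a VS compile path would presumably
drive the new code roughly as in the sketch below. This is a hypothetical
illustration, not part of the patch: the variables brw, mem_ctx, key,
prog_data, shader_prog and vp stand for whatever the caller already has on
hand, and the failure handling is modeled on the existing SIMD8 FS path.

   /* Hypothetical caller sketch: try the scalar (SIMD8) vertex shader
    * backend added by this patch, reporting failures the same way the FS
    * path does and letting the caller fall back to the vec4 backend.
    */
   fs_visitor v(brw, mem_ctx, key, prog_data, shader_prog, vp, 8);
   if (!v.run_vs()) {
      if (shader_prog) {
         shader_prog->LinkStatus = GL_FALSE;
         ralloc_strcat(&shader_prog->InfoLog, v.fail_msg);
      }
      return NULL;   /* caller falls back to the vec4 backend */
   }
   /* ...SIMD8 code generation from v.cfg would follow here... */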