From: Eric Anholt Date: Tue, 12 Nov 2013 23:33:27 +0000 (-0800) Subject: i965: Replace 8-wide and 16-wide with SIMD8 and SIMD16. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=746e3e3b3ad20a29ee6de64d663d2dc11deac06e;p=mesa.git i965: Replace 8-wide and 16-wide with SIMD8 and SIMD16. Those are the terms used in the docs, and I think "n-wide" was something I just happened to say. Note that shader-db needs updating for the INTEL_DEBUG=fs parsing. Reviewed-by: Ian Romanick --- diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c index 5a07d839730..dee91123ff3 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.c +++ b/src/mesa/drivers/dri/i965/brw_eu.c @@ -112,7 +112,7 @@ brw_set_compression_control(struct brw_compile *p, p->compressed = (compression_control == BRW_COMPRESSION_COMPRESSED); if (p->brw->gen >= 6) { - /* Since we don't use the 32-wide support in gen6, we translate + /* Since we don't use the SIMD32 support in gen6, we translate * the pre-gen6 compression control here. */ switch (compression_control) { @@ -123,12 +123,12 @@ brw_set_compression_control(struct brw_compile *p, p->current->header.compression_control = GEN6_COMPRESSION_1Q; break; case BRW_COMPRESSION_2NDHALF: - /* For 8-wide, this is "use the second set of 8 bits." */ + /* For SIMD8, this is "use the second set of 8 bits." */ p->current->header.compression_control = GEN6_COMPRESSION_2Q; break; case BRW_COMPRESSION_COMPRESSED: - /* For 16-wide instruction compression, use the first set of 16 bits - * since we don't do 32-wide dispatch. + /* For SIMD16 instruction compression, use the first set of 16 bits + * since we don't do SIMD32 dispatch. 
*/ p->current->header.compression_control = GEN6_COMPRESSION_1H; break; diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 3536cbeecc5..37329b9e6b2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -856,7 +856,7 @@ import_uniforms_callback(const void *key, hash_table_insert(dst_ht, data, key); } -/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. +/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. * This brings in those uniform definitions */ void @@ -1340,7 +1340,7 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1) case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: if (brw->gen >= 7 && dispatch_width == 16) - fail("16-wide INTDIV unsupported\n"); + fail("SIMD16 INTDIV unsupported\n"); break; case SHADER_OPCODE_POW: break; @@ -1764,7 +1764,7 @@ fs_visitor::remove_dead_constants() c->prog_data.nr_params = new_nr_params; } else { - /* This should have been generated in the 8-wide pass already. */ + /* This should have been generated in the SIMD8 pass already. 
*/ assert(this->params_remap); } @@ -1883,7 +1883,7 @@ fs_visitor::setup_pull_constants() return; if (dispatch_width == 16) { - fail("Pull constants not supported in 16-wide\n"); + fail("Pull constants not supported in SIMD16\n"); return; } @@ -2557,7 +2557,7 @@ static void clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps, int first_grf, int grf_len) { - bool inst_16wide = (dispatch_width > 8 && + bool inst_simd16 = (dispatch_width > 8 && !inst->force_uncompressed && !inst->force_sechalf); @@ -2576,7 +2576,7 @@ clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps, if (grf >= first_grf && grf < first_grf + grf_len) { deps[grf - first_grf] = false; - if (inst_16wide) + if (inst_simd16) deps[grf - first_grf + 1] = false; } } @@ -2634,7 +2634,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst) return; } - bool scan_inst_16wide = (dispatch_width > 8 && + bool scan_inst_simd16 = (dispatch_width > 8 && !scan_inst->force_uncompressed && !scan_inst->force_sechalf); @@ -2651,7 +2651,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst) needs_dep[reg - first_write_grf]) { inst->insert_before(DEP_RESOLVE_MOV(reg)); needs_dep[reg - first_write_grf] = false; - if (scan_inst_16wide) + if (scan_inst_simd16) needs_dep[reg - first_write_grf + 1] = false; } } @@ -3062,7 +3062,7 @@ fs_visitor::setup_payload_gen6() c->source_depth_reg = c->nr_payload_regs; c->nr_payload_regs++; if (dispatch_width == 16) { - /* R28: interpolated depth if not 8-wide. */ + /* R28: interpolated depth if not SIMD8. */ c->nr_payload_regs++; } } @@ -3071,7 +3071,7 @@ fs_visitor::setup_payload_gen6() c->source_w_reg = c->nr_payload_regs; c->nr_payload_regs++; if (dispatch_width == 16) { - /* R30: interpolated W if not 8-wide. */ + /* R30: interpolated W if not SIMD8. 
*/ c->nr_payload_regs++; } } @@ -3089,7 +3089,7 @@ fs_visitor::setup_payload_gen6() c->sample_mask_reg = c->nr_payload_regs; c->nr_payload_regs++; if (dispatch_width == 16) { - /* R33: input coverage mask if not 8-wide. */ + /* R33: input coverage mask if not SIMD8. */ c->nr_payload_regs++; } } @@ -3333,16 +3333,16 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, fs_visitor v2(brw, c, prog, fp, 16); if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) { if (c->prog_data.nr_pull_params == 0) { - /* Try a 16-wide compile */ + /* Try a SIMD16 compile */ v2.import_uniforms(&v); if (!v2.run()) { - perf_debug("16-wide shader failed to compile, falling back to " - "8-wide at a 10-20%% performance cost: %s", v2.fail_msg); + perf_debug("SIMD16 shader failed to compile, falling back to " + "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg); } else { simd16_instructions = &v2.instructions; } } else { - perf_debug("Skipping 16-wide due to pull parameters.\n"); + perf_debug("Skipping SIMD16 due to pull parameters.\n"); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 2bbf687c590..e701fc524d2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -215,8 +215,8 @@ fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x) dst = vec16(dst); } - /* We do this 8 or 16-wide, but since the destination is UW we - * don't do compression in the 16-wide case. + /* We do this SIMD8 or SIMD16, but since the destination is UW we + * don't do compression in the SIMD16 case. 
*/ brw_push_insn_state(p); brw_set_compression_control(p, BRW_COMPRESSION_NONE); @@ -521,7 +521,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src } if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) { - /* The send-from-GRF for 16-wide texturing with a header has an extra + /* The send-from-GRF for SIMD16 texturing with a header has an extra * hardware register allocated to it, which we need to skip over (since * our coordinates in the payload are in the even-numbered registers, * and the header comes right before the first one). @@ -1302,13 +1302,13 @@ fs_generator::generate_code(exec_list *instructions) if (unlikely(INTEL_DEBUG & DEBUG_WM)) { if (shader) { - printf("Native code for fragment shader %d (%d-wide dispatch):\n", + printf("Native code for fragment shader %d (SIMD%d dispatch):\n", prog->Name, dispatch_width); } else if (fp) { - printf("Native code for fragment program %d (%d-wide dispatch):\n", + printf("Native code for fragment program %d (SIMD%d dispatch):\n", fp->Base.Id, dispatch_width); } else { - printf("Native code for blorp program (%d-wide dispatch):\n", + printf("Native code for blorp program (SIMD%d dispatch):\n", dispatch_width); } } @@ -1831,7 +1831,7 @@ fs_generator::generate_assembly(exec_list *simd8_instructions, brw_NOP(p); } - /* Save off the start of this 16-wide program */ + /* Save off the start of this SIMD16 program */ c->prog_data.prog_offset_16 = p->nr_insn * sizeof(struct brw_instruction); brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 8567afd3c16..f54a2defd44 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -78,7 +78,7 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width) /* The registers used to make up almost all values handled in the compiler * are a scalar value 
occupying a single register (or 2 registers in the - * case of 16-wide, which is handled by dividing base_reg_count by 2 and + * case of SIMD16, which is handled by dividing base_reg_count by 2 and * multiplying allocated register numbers by 2). Things that were * aggregates of scalar values at the GLSL level were split to scalar * values by split_virtual_grfs(). @@ -225,7 +225,7 @@ count_to_loop_end(fs_inst *do_inst) * nr_payload_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients. * * And we have payload_node_count nodes covering these registers in order - * (note that in 16-wide, a node is two registers). + * (note that in SIMD16, a node is two registers). */ void fs_visitor::setup_payload_interference(struct ra_graph *g, @@ -295,7 +295,7 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, break; case FS_OPCODE_LINTERP: - /* On gen6+ in 16-wide, there are 4 adjacent registers (so 2 nodes) + /* On gen6+ in SIMD16, there are 4 adjacent registers (so 2 nodes) * used by PLN's sourcing of the deltas, while we list only the first * two in the arguments (1 node). Pre-gen6, the deltas are computed * in normal VGRFs. @@ -420,7 +420,7 @@ bool fs_visitor::assign_regs(bool allow_spilling) { /* Most of this allocation was written for a reg_width of 1 - * (dispatch_width == 8). In extending to 16-wide, the code was + * (dispatch_width == 8). In extending to SIMD16, the code was * left in place and it was converted to have the hardware * registers it's allocating be contiguous physical pairs of regs * for reg_width == 2. 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 53cd0a1cf57..1727ef9593a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -116,7 +116,7 @@ fs_visitor::visit(ir_variable *ir) if (dispatch_width == 16) { if (!variable_storage(ir)) { - fail("Failed to find uniform '%s' in 16-wide\n", ir->name); + fail("Failed to find uniform '%s' in SIMD16\n", ir->name); } return; } @@ -461,7 +461,7 @@ fs_visitor::visit(ir_expression *ir) * enough. */ if (brw->gen >= 7 && dispatch_width == 16) - fail("16-wide explicit accumulator operands unsupported\n"); + fail("SIMD16 explicit accumulator operands unsupported\n"); struct brw_reg acc = retype(brw_acc_reg(), this->result.type); @@ -474,7 +474,7 @@ fs_visitor::visit(ir_expression *ir) break; case ir_binop_imul_high: { if (brw->gen >= 7 && dispatch_width == 16) - fail("16-wide explicit accumulator operands unsupported\n"); + fail("SIMD16 explicit accumulator operands unsupported\n"); struct brw_reg acc = retype(brw_acc_reg(), this->result.type); @@ -489,7 +489,7 @@ fs_visitor::visit(ir_expression *ir) break; case ir_binop_carry: { if (brw->gen >= 7 && dispatch_width == 16) - fail("16-wide explicit accumulator operands unsupported\n"); + fail("SIMD16 explicit accumulator operands unsupported\n"); struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD); @@ -499,7 +499,7 @@ fs_visitor::visit(ir_expression *ir) } case ir_binop_borrow: { if (brw->gen >= 7 && dispatch_width == 16) - fail("16-wide explicit accumulator operands unsupported\n"); + fail("SIMD16 explicit accumulator operands unsupported\n"); struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD); @@ -1251,7 +1251,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, if (ir->op == ir_tg4 || (ir->offset && ir->op != ir_txf)) { /* For general texture offsets (no txf workaround), we need a header to - * put them 
in. Note that for 16-wide we're making space for two actual + * put them in. Note that for SIMD16 we're making space for two actual * hardware registers here, so the emit will have to fix up for this. * * * ir4_tg4 needs to place its channel select in the header, @@ -1457,7 +1457,7 @@ fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate, }; if (dispatch_width == 16) { - fail("rectangle scale uniform setup not supported on 16-wide\n"); + fail("rectangle scale uniform setup not supported on SIMD16\n"); return coordinate; } @@ -2142,7 +2142,7 @@ void fs_visitor::visit(ir_if *ir) { if (brw->gen < 6 && dispatch_width == 16) { - fail("Can't support (non-uniform) control flow on 16-wide\n"); + fail("Can't support (non-uniform) control flow on SIMD16\n"); } /* Don't point the annotation at the if statement, because then it plus @@ -2185,7 +2185,7 @@ void fs_visitor::visit(ir_loop *ir) { if (brw->gen < 6 && dispatch_width == 16) { - fail("Can't support (non-uniform) control flow on 16-wide\n"); + fail("Can't support (non-uniform) control flow on SIMD16\n"); } this->base_ir = NULL; @@ -2693,7 +2693,7 @@ fs_visitor::emit_fb_writes() bool src0_alpha_to_render_target = false; if (dispatch_width == 16 && do_dual_src) { - fail("GL_ARB_blend_func_extended not yet supported in 16-wide."); + fail("GL_ARB_blend_func_extended not yet supported in SIMD16."); do_dual_src = false; } @@ -2747,7 +2747,7 @@ fs_visitor::emit_fb_writes() if (c->source_depth_to_render_target) { if (brw->gen == 6 && dispatch_width == 16) { /* For outputting oDepth on gen6, SIMD8 writes have to be - * used. This would require 8-wide moves of each half to + * used. This would require SIMD8 moves of each half to * message regs, kind of like pre-gen5 SIMD16 FB writes. * Just bail on doing so for now. 
*/ diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index baf67fb1ea2..a61bbab613b 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -431,8 +431,8 @@ public: * Returns how many cycles it takes the instruction to issue. * * Instructions in gen hardware are handled one simd4 vector at a time, - * with 1 cycle per vector dispatched. Thus 8-wide pixel shaders take 2 - * cycles to dispatch and 16-wide (compressed) instructions take 4. + * with 1 cycle per vector dispatched. Thus SIMD8 pixel shaders take 2 + * cycles to dispatch and SIMD16 (compressed) instructions take 4. */ virtual int issue_time(backend_instruction *inst) = 0; @@ -1157,7 +1157,7 @@ fs_instruction_scheduler::choose_instruction_to_schedule() } else { /* Before register allocation, we don't care about the latencies of * instructions. All we care about is reducing live intervals of - * variables so that we can avoid register spilling, or get 16-wide + * variables so that we can avoid register spilling, or get SIMD16 * shaders which naturally do a better job of hiding instruction * latency. */