X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_fs_reg_allocate.cpp;h=2494b117e57506e2c9d4336148600e17a914e97a;hb=a92e5f7cf63d496ad7830b5cea4bbab287c25b8e;hp=7c5414ac26cd4753d6a8b5860ea42ae489f20602;hpb=6034b9a5124475d300d0678bd2fb6160865fa972;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 7c5414ac26c..2494b117e57 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -26,9 +26,8 @@ */ #include "brw_fs.h" -#include "../glsl/glsl_types.h" -#include "../glsl/ir_optimization.h" -#include "../glsl/ir_print_visitor.h" +#include "glsl/glsl_types.h" +#include "glsl/ir_optimization.h" static void assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width) @@ -43,17 +42,17 @@ assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width) void fs_visitor::assign_regs_trivial() { - int hw_reg_mapping[this->virtual_grf_next + 1]; + int hw_reg_mapping[this->virtual_grf_count + 1]; int i; - int reg_width = c->dispatch_width / 8; + int reg_width = dispatch_width / 8; /* Note that compressed instructions require alignment to 2 registers. */ hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width); - for (i = 1; i <= this->virtual_grf_next; i++) { + for (i = 1; i <= this->virtual_grf_count; i++) { hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + this->virtual_grf_sizes[i - 1] * reg_width); } - this->grf_used = hw_reg_mapping[this->virtual_grf_next]; + this->grf_used = hw_reg_mapping[this->virtual_grf_count]; foreach_list(node, &this->instructions) { fs_inst *inst = (fs_inst *)node; @@ -61,23 +60,57 @@ fs_visitor::assign_regs_trivial() assign_reg(hw_reg_mapping, &inst->dst, reg_width); assign_reg(hw_reg_mapping, &inst->src[0], reg_width); assign_reg(hw_reg_mapping, &inst->src[1], reg_width); + assign_reg(hw_reg_mapping, &inst->src[2], reg_width); } - if (this->grf_used >= BRW_MAX_GRF) { + if (this->grf_used >= max_grf) { fail("Ran out of regs on trivial allocator (%d/%d)\n", - this->grf_used, BRW_MAX_GRF); + this->grf_used, max_grf); } } static void -brw_alloc_reg_set_for_classes(struct brw_context *brw, - int *class_sizes, - int class_count, - int reg_width, - int base_reg_count) +brw_alloc_reg_set(struct brw_context *brw, int reg_width) { - struct intel_context *intel = &brw->intel; + int base_reg_count = BRW_MAX_GRF / reg_width; + int index = reg_width - 1; + + /* The registers used to make up almost all values handled in the compiler + * are a scalar value occupying a single register (or 2 registers in the + * case of SIMD16, which is handled by dividing base_reg_count by 2 and + * multiplying allocated register numbers by 2). Things that were + * aggregates of scalar values at the GLSL level were split to scalar + * values by split_virtual_grfs(). + * + * However, texture SEND messages return a series of contiguous registers + * to write into. We currently always ask for 4 registers, but we may + * convert that to use less some day. + * + * Additionally, on gen5 we need aligned pairs of registers for the PLN + * instruction, and on gen4 we need 8 contiguous regs for workaround simd16 + * texturing. + * + * So we have a need for classes for 1, 2, 4, and 8 registers currently, + * and we add in '3' to make indexing the array easier for the common case + * (since we'll probably want it for texturing later). 
+ * + * And, on gen7 and newer, we do texturing SEND messages from GRFs, which + * means that we may need any size up to the sampler message size limit (11 + * regs). + */ + int class_count; + int class_sizes[BRW_MAX_MRF]; + + if (brw->gen >= 7) { + for (class_count = 0; class_count < MAX_SAMPLER_MESSAGE_SIZE; + class_count++) + class_sizes[class_count] = class_count + 1; + } else { + for (class_count = 0; class_count < 4; class_count++) + class_sizes[class_count] = class_count + 1; + class_sizes[class_count++] = 8; + } /* Compute the total number of registers across all classes. */ int ra_reg_count = 0; @@ -85,14 +118,12 @@ brw_alloc_reg_set_for_classes(struct brw_context *brw, ra_reg_count += base_reg_count - (class_sizes[i] - 1); } - ralloc_free(brw->wm.ra_reg_to_grf); - brw->wm.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count); - ralloc_free(brw->wm.regs); - brw->wm.regs = ra_alloc_reg_set(ra_reg_count); - ralloc_free(brw->wm.classes); - brw->wm.classes = ralloc_array(brw, int, class_count + 1); - - brw->wm.aligned_pairs_class = -1; + uint8_t *ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count); + struct ra_regs *regs = ra_alloc_reg_set(brw, ra_reg_count); + if (brw->gen >= 6) + ra_set_allocate_round_robin(regs); + int *classes = ralloc_array(brw, int, class_count); + int aligned_pairs_class = -1; /* Now, add the registers to their classes, and add the conflicts * between them and the base GRF registers (and also each other). @@ -102,7 +133,7 @@ brw_alloc_reg_set_for_classes(struct brw_context *brw, int pairs_reg_count = 0; for (int i = 0; i < class_count; i++) { int class_reg_count = base_reg_count - (class_sizes[i] - 1); - brw->wm.classes[i] = ra_alloc_reg_class(brw->wm.regs); + classes[i] = ra_alloc_reg_class(regs); /* Save this off for the aligned pair class at the end. */ if (class_sizes[i] == 2) { @@ -111,14 +142,14 @@ brw_alloc_reg_set_for_classes(struct brw_context *brw, } for (int j = 0; j < class_reg_count; j++) { - ra_class_add_reg(brw->wm.regs, brw->wm.classes[i], reg); + ra_class_add_reg(regs, classes[i], reg); - brw->wm.ra_reg_to_grf[reg] = j; + ra_reg_to_grf[reg] = j; for (int base_reg = j; base_reg < j + class_sizes[i]; base_reg++) { - ra_add_transitive_reg_conflict(brw->wm.regs, base_reg, reg); + ra_add_transitive_reg_conflict(regs, base_reg, reg); } reg++; @@ -129,88 +160,311 @@ brw_alloc_reg_set_for_classes(struct brw_context *brw, /* Add a special class for aligned pairs, which we'll put delta_x/y * in on gen5 so that we can do PLN. 
*/ - if (brw->has_pln && reg_width == 1 && intel->gen < 6) { - brw->wm.aligned_pairs_class = ra_alloc_reg_class(brw->wm.regs); + if (brw->has_pln && reg_width == 1 && brw->gen < 6) { + aligned_pairs_class = ra_alloc_reg_class(regs); for (int i = 0; i < pairs_reg_count; i++) { - if ((brw->wm.ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) { - ra_class_add_reg(brw->wm.regs, brw->wm.aligned_pairs_class, - pairs_base_reg + i); + if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) { + ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i); } } - class_count++; } - ra_set_finalize(brw->wm.regs); + ra_set_finalize(regs, NULL); + + brw->wm.reg_sets[index].regs = regs; + for (unsigned i = 0; i < ARRAY_SIZE(brw->wm.reg_sets[index].classes); i++) + brw->wm.reg_sets[index].classes[i] = -1; + for (int i = 0; i < class_count; i++) + brw->wm.reg_sets[index].classes[class_sizes[i] - 1] = classes[i]; + brw->wm.reg_sets[index].ra_reg_to_grf = ra_reg_to_grf; + brw->wm.reg_sets[index].aligned_pairs_class = aligned_pairs_class; } -bool -fs_visitor::assign_regs() +void +brw_fs_alloc_reg_sets(struct brw_context *brw) { - /* Most of this allocation was written for a reg_width of 1 - * (dispatch_width == 8). In extending to 16-wide, the code was - * left in place and it was converted to have the hardware - * registers it's allocating be contiguous physical pairs of regs - * for reg_width == 2. - */ - int reg_width = c->dispatch_width / 8; - int hw_reg_mapping[this->virtual_grf_next]; - int first_assigned_grf = ALIGN(this->first_non_payload_grf, reg_width); - int base_reg_count = (BRW_MAX_GRF - first_assigned_grf) / reg_width; - int class_sizes[base_reg_count]; - int class_count = 0; + brw_alloc_reg_set(brw, 1); + brw_alloc_reg_set(brw, 2); +} - calculate_live_intervals(); +int +count_to_loop_end(fs_inst *do_inst) +{ + int depth = 1; + int ip = 1; + for (fs_inst *inst = (fs_inst *)do_inst->next; + depth > 0; + inst = (fs_inst *)inst->next) { + switch (inst->opcode) { + case BRW_OPCODE_DO: + depth++; + break; + case BRW_OPCODE_WHILE: + depth--; + break; + default: + break; + } + ip++; + } + return ip; +} - /* Set up the register classes. - * - * The base registers store a scalar value. For texture samples, - * we get virtual GRFs composed of 4 contiguous hw register. For - * structures and arrays, we store them as contiguous larger things - * than that, though we should be able to do better most of the - * time. - */ - class_sizes[class_count++] = 1; - if (brw->has_pln && intel->gen < 6) { - /* Always set up the (unaligned) pairs for gen5, so we can find - * them for making the aligned pair class. +/** + * Sets up interference between thread payload registers and the virtual GRFs + * to be allocated for program temporaries. + * + * We want to be able to reallocate the payload for our virtual GRFs, notably + * because the setup coefficients for a full set of 16 FS inputs takes up 8 of + * our 128 registers. + * + * The layout of the payload registers is: + * + * 0..nr_payload_regs-1: fixed function setup (including bary coordinates). + * nr_payload_regs..nr_payload_regs+curb_read_lengh-1: uniform data + * nr_payload_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients. + * + * And we have payload_node_count nodes covering these registers in order + * (note that in SIMD16, a node is two registers). 
+ */ +void +fs_visitor::setup_payload_interference(struct ra_graph *g, + int payload_node_count, + int first_payload_node) +{ + int reg_width = dispatch_width / 8; + int loop_depth = 0; + int loop_end_ip = 0; + + int payload_last_use_ip[payload_node_count]; + memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip)); + int ip = 0; + foreach_list(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + switch (inst->opcode) { + case BRW_OPCODE_DO: + loop_depth++; + + /* Since payload regs are deffed only at the start of the shader + * execution, any uses of the payload within a loop mean the live + * interval extends to the end of the outermost loop. Find the ip of + * the end now. + */ + if (loop_depth == 1) + loop_end_ip = ip + count_to_loop_end(inst); + break; + case BRW_OPCODE_WHILE: + loop_depth--; + break; + default: + break; + } + + int use_ip; + if (loop_depth > 0) + use_ip = loop_end_ip; + else + use_ip = ip; + + /* Note that UNIFORM args have been turned into FIXED_HW_REG by + * assign_curbe_setup(), and interpolation uses fixed hardware regs from + * the start (see interp_reg()). */ - class_sizes[class_count++] = 2; + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width; + if (node_nr >= payload_node_count) + continue; + + payload_last_use_ip[node_nr] = use_ip; + } + } + + /* Special case instructions which have extra implied registers used. */ + switch (inst->opcode) { + case FS_OPCODE_FB_WRITE: + /* We could omit this for the !inst->header_present case, except that + * the simulator apparently incorrectly reads from g0/g1 instead of + * sideband. It also really freaks out driver developers to see g0 + * used in unusual places, so just always reserve it. + */ + payload_last_use_ip[0 / reg_width] = use_ip; + payload_last_use_ip[1 / reg_width] = use_ip; + break; + + case FS_OPCODE_LINTERP: + /* On gen6+ in SIMD16, there are 4 adjacent registers (so 2 nodes) + * used by PLN's sourcing of the deltas, while we list only the first + * two in the arguments (1 node). Pre-gen6, the deltas are computed + * in normal VGRFs. + */ + if (brw->gen >= 6) { + int delta_x_arg = 0; + if (inst->src[delta_x_arg].file == HW_REG && + inst->src[delta_x_arg].fixed_hw_reg.file == + BRW_GENERAL_REGISTER_FILE) { + int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr / + reg_width) + 1; + assert(sechalf_node < payload_node_count); + payload_last_use_ip[sechalf_node] = use_ip; + } + } + break; + + default: + break; + } + + ip++; } - for (int r = 0; r < this->virtual_grf_next; r++) { - int i; - for (i = 0; i < class_count; i++) { - if (class_sizes[i] == this->virtual_grf_sizes[r]) - break; + for (int i = 0; i < payload_node_count; i++) { + /* Mark the payload node as interfering with any virtual grf that is + * live between the start of the program and our last use of the payload + * node. + */ + for (int j = 0; j < this->virtual_grf_count; j++) { + /* Note that we use a <= comparison, unlike virtual_grf_interferes(), + * in order to not have to worry about the uniform issue described in + * calculate_live_intervals(). 
+ */ + if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) { + ra_add_node_interference(g, first_payload_node + i, j); + } } - if (i == class_count) { - if (this->virtual_grf_sizes[r] >= base_reg_count) { - fail("Object too large to register allocate.\n"); - } + } + + for (int i = 0; i < payload_node_count; i++) { + /* Mark each payload node as being allocated to its physical register. + * + * The alternative would be to have per-physical-register classes, which + * would just be silly. + */ + ra_set_node_reg(g, first_payload_node + i, i); + } +} + +/** + * Sets the mrf_used array to indicate which MRFs are used by the shader IR + * + * This is used in assign_regs() to decide which of the GRFs that we use as + * MRFs on gen7 get normally register allocated, and in register spilling to + * see if we can actually use MRFs to do spills without overwriting normal MRF + * contents. + */ +void +fs_visitor::get_used_mrfs(bool *mrf_used) +{ + int reg_width = dispatch_width / 8; - class_sizes[class_count++] = this->virtual_grf_sizes[r]; + memset(mrf_used, 0, BRW_MAX_MRF * sizeof(bool)); + + foreach_list(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + if (inst->dst.file == MRF) { + int reg = inst->dst.reg & ~BRW_MRF_COMPR4; + mrf_used[reg] = true; + if (reg_width == 2) { + if (inst->dst.reg & BRW_MRF_COMPR4) { + mrf_used[reg + 4] = true; + } else { + mrf_used[reg + 1] = true; + } + } + } + + if (inst->mlen > 0) { + for (int i = 0; i < implied_mrf_writes(inst); i++) { + mrf_used[inst->base_mrf + i] = true; + } } } +} - brw_alloc_reg_set_for_classes(brw, class_sizes, class_count, - reg_width, base_reg_count); - - struct ra_graph *g = ra_alloc_interference_graph(brw->wm.regs, - this->virtual_grf_next); - - for (int i = 0; i < this->virtual_grf_next; i++) { - for (int c = 0; c < class_count; c++) { - if (class_sizes[c] == this->virtual_grf_sizes[i]) { - if (brw->wm.aligned_pairs_class >= 0 && - this->delta_x.reg == i) { - ra_set_node_class(g, i, brw->wm.aligned_pairs_class); - } else { - ra_set_node_class(g, i, brw->wm.classes[c]); - } - break; - } +/** + * Sets interference between virtual GRFs and usage of the high GRFs for SEND + * messages (treated as MRFs in code generation). + */ +void +fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node) +{ + int reg_width = dispatch_width / 8; + + bool mrf_used[BRW_MAX_MRF]; + get_used_mrfs(mrf_used); + + for (int i = 0; i < BRW_MAX_MRF; i++) { + /* Mark each MRF reg node as being allocated to its physical register. + * + * The alternative would be to have per-physical-register classes, which + * would just be silly. + */ + ra_set_node_reg(g, first_mrf_node + i, + (GEN7_MRF_HACK_START + i) / reg_width); + + /* Since we don't have any live/dead analysis on the MRFs, just mark all + * that are used as conflicting with all virtual GRFs. + */ + if (mrf_used[i]) { + for (int j = 0; j < this->virtual_grf_count; j++) { + ra_add_node_interference(g, first_mrf_node + i, j); + } } + } +} + +bool +fs_visitor::assign_regs(bool allow_spilling) +{ + /* Most of this allocation was written for a reg_width of 1 + * (dispatch_width == 8). In extending to SIMD16, the code was + * left in place and it was converted to have the hardware + * registers it's allocating be contiguous physical pairs of regs + * for reg_width == 2. 
+ */ + int reg_width = dispatch_width / 8; + int hw_reg_mapping[this->virtual_grf_count]; + int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) / + reg_width); + int rsi = reg_width - 1; /* Which brw->wm.reg_sets[] to use */ + calculate_live_intervals(); + + int node_count = this->virtual_grf_count; + int first_payload_node = node_count; + node_count += payload_node_count; + int first_mrf_hack_node = node_count; + if (brw->gen >= 7) + node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START; + struct ra_graph *g = ra_alloc_interference_graph(brw->wm.reg_sets[rsi].regs, + node_count); + + for (int i = 0; i < this->virtual_grf_count; i++) { + unsigned size = this->virtual_grf_sizes[i]; + int c; + + assert(size <= ARRAY_SIZE(brw->wm.reg_sets[rsi].classes) && + "Register allocation relies on split_virtual_grfs()"); + c = brw->wm.reg_sets[rsi].classes[size - 1]; + + /* Special case: on pre-GEN6 hardware that supports PLN, the + * second operand of a PLN instruction needs to be an + * even-numbered register, so we have a special register class + * wm_aligned_pairs_class to handle this case. pre-GEN6 always + * uses this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the + * second operand of a PLN instruction (since it doesn't support + * any other interpolation modes). So all we need to do is find + * that register and set it to the appropriate class. + */ + if (brw->wm.reg_sets[rsi].aligned_pairs_class >= 0 && + this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) { + c = brw->wm.reg_sets[rsi].aligned_pairs_class; + } + + ra_set_node_class(g, i, c); for (int j = 0; j < i; j++) { if (virtual_grf_interferes(i, j)) { @@ -219,6 +473,21 @@ fs_visitor::assign_regs() } } + setup_payload_interference(g, payload_node_count, first_payload_node); + if (brw->gen >= 7) + setup_mrf_hack_interference(g, first_mrf_hack_node); + + /* Debug of register spilling: Go spill everything. */ + if (0) { + int reg = choose_spill_reg(g); + + if (reg != -1) { + spill_reg(reg); + ralloc_free(g); + return false; + } + } + if (!ra_allocate_no_spills(g)) { /* Failed to allocate registers. Spill a reg, and the caller will * loop back into here to try again. @@ -226,16 +495,12 @@ fs_visitor::assign_regs() int reg = choose_spill_reg(g); if (reg == -1) { - fail("no register to spill\n"); - } else if (intel->gen >= 7) { - fail("no spilling support on gen7 yet\n"); - } else if (c->dispatch_width == 16) { - fail("no spilling support on 16-wide yet\n"); - } else { - spill_reg(reg); + fail("no register to spill:\n"); + dump_instructions(); + } else if (allow_spilling) { + spill_reg(reg); } - ralloc_free(g); return false; @@ -245,12 +510,11 @@ fs_visitor::assign_regs() * regs in the register classes back down to real hardware reg * numbers. 
*/ - this->grf_used = first_assigned_grf; - for (int i = 0; i < this->virtual_grf_next; i++) { + this->grf_used = payload_node_count * reg_width; + for (int i = 0; i < this->virtual_grf_count; i++) { int reg = ra_get_node_reg(g, i); - hw_reg_mapping[i] = (first_assigned_grf + - brw->wm.ra_reg_to_grf[reg] * reg_width); + hw_reg_mapping[i] = brw->wm.reg_sets[rsi].ra_reg_to_grf[reg] * reg_width; this->grf_used = MAX2(this->grf_used, hw_reg_mapping[i] + this->virtual_grf_sizes[i] * reg_width); @@ -262,6 +526,7 @@ fs_visitor::assign_regs() assign_reg(hw_reg_mapping, &inst->dst, reg_width); assign_reg(hw_reg_mapping, &inst->src[0], reg_width); assign_reg(hw_reg_mapping, &inst->src[1], reg_width); + assign_reg(hw_reg_mapping, &inst->src[2], reg_width); } ralloc_free(g); @@ -270,25 +535,30 @@ fs_visitor::assign_regs() } void -fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, uint32_t spill_offset) +fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, uint32_t spill_offset, + int count) { - int size = virtual_grf_sizes[dst.reg]; - dst.reg_offset = 0; - - for (int chan = 0; chan < size; chan++) { - fs_inst *unspill_inst = new(mem_ctx) fs_inst(FS_OPCODE_UNSPILL, - dst); - dst.reg_offset++; - unspill_inst->offset = spill_offset + chan * REG_SIZE; + for (int i = 0; i < count; i++) { + /* The gen7 descriptor-based offset is 12 bits of HWORD units. */ + bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE; + + fs_inst *unspill_inst = + new(mem_ctx) fs_inst(gen7_read ? + SHADER_OPCODE_GEN7_SCRATCH_READ : + SHADER_OPCODE_GEN4_SCRATCH_READ, + dst); + unspill_inst->offset = spill_offset; unspill_inst->ir = inst->ir; unspill_inst->annotation = inst->annotation; - /* Choose a MRF that won't conflict with an MRF that's live across the - * spill. Nothing else will make it up to MRF 14/15. - */ - unspill_inst->base_mrf = 14; - unspill_inst->mlen = 1; /* header contains offset */ + if (!gen7_read) { + unspill_inst->base_mrf = 14; + unspill_inst->mlen = 1; /* header contains offset */ + } inst->insert_before(unspill_inst); + + dst.reg_offset++; + spill_offset += dispatch_width * sizeof(float); } } @@ -296,10 +566,10 @@ int fs_visitor::choose_spill_reg(struct ra_graph *g) { float loop_scale = 1.0; - float spill_costs[this->virtual_grf_next]; - bool no_spill[this->virtual_grf_next]; + float spill_costs[this->virtual_grf_count]; + bool no_spill[this->virtual_grf_count]; - for (int i = 0; i < this->virtual_grf_next; i++) { + for (int i = 0; i < this->virtual_grf_count; i++) { spill_costs[i] = 0.0; no_spill[i] = false; } @@ -313,14 +583,27 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) for (unsigned int i = 0; i < 3; i++) { if (inst->src[i].file == GRF) { - int size = virtual_grf_sizes[inst->src[i].reg]; - spill_costs[inst->src[i].reg] += size * loop_scale; + spill_costs[inst->src[i].reg] += loop_scale; + + /* Register spilling logic assumes full-width registers; smeared + * registers have a width of 1 so if we try to spill them we'll + * generate invalid assembly. This shouldn't be a problem because + * smeared registers are only used as short-term temporaries when + * loading pull constants, so spilling them is unlikely to reduce + * register pressure anyhow. 
+ */ + if (inst->src[i].smear >= 0) { + no_spill[inst->src[i].reg] = true; + } } } if (inst->dst.file == GRF) { - int size = virtual_grf_sizes[inst->dst.reg]; - spill_costs[inst->dst.reg] += size * loop_scale; + spill_costs[inst->dst.reg] += inst->regs_written * loop_scale; + + if (inst->dst.smear >= 0) { + no_spill[inst->dst.reg] = true; + } } switch (inst->opcode) { @@ -333,12 +616,13 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) loop_scale /= 10; break; - case FS_OPCODE_SPILL: + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: if (inst->src[0].file == GRF) no_spill[inst->src[0].reg] = true; break; - case FS_OPCODE_UNSPILL: + case SHADER_OPCODE_GEN4_SCRATCH_READ: + case SHADER_OPCODE_GEN7_SCRATCH_READ: if (inst->dst.file == GRF) no_spill[inst->dst.reg] = true; break; @@ -348,7 +632,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) } } - for (int i = 0; i < this->virtual_grf_next; i++) { + for (int i = 0; i < this->virtual_grf_count; i++) { if (!no_spill[i]) ra_set_node_spill_cost(g, i, spill_costs[i]); } @@ -359,10 +643,34 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) void fs_visitor::spill_reg(int spill_reg) { + int reg_size = dispatch_width * sizeof(float); int size = virtual_grf_sizes[spill_reg]; unsigned int spill_offset = c->last_scratch; assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */ - c->last_scratch += size * REG_SIZE; + int spill_base_mrf = dispatch_width > 8 ? 13 : 14; + + /* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done + * using up to 11 MRFs starting from either m1 or m2, and fb writes can use + * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or + * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst + * depth), starting from m1. In summary: We may not be able to spill in + * SIMD16 mode, because we'd stomp the FB writes. + */ + if (!spilled_any_registers) { + bool mrf_used[BRW_MAX_MRF]; + get_used_mrfs(mrf_used); + + for (int i = spill_base_mrf; i < BRW_MAX_MRF; i++) { + if (mrf_used[i]) { + fail("Register spilling not supported with m%d used", i); + return; + } + } + + spilled_any_registers = true; + } + + c->last_scratch += size * reg_size; /* Generate spill/unspill instructions for the objects being * spilled. Right now, we spill or unspill the whole thing to a @@ -375,21 +683,31 @@ fs_visitor::spill_reg(int spill_reg) for (unsigned int i = 0; i < 3; i++) { if (inst->src[i].file == GRF && inst->src[i].reg == spill_reg) { - inst->src[i].reg = virtual_grf_alloc(size); - emit_unspill(inst, inst->src[i], spill_offset); + int regs_read = inst->regs_read(this, i); + int subset_spill_offset = (spill_offset + + reg_size * inst->src[i].reg_offset); + + inst->src[i].reg = virtual_grf_alloc(regs_read); + inst->src[i].reg_offset = 0; + + emit_unspill(inst, inst->src[i], subset_spill_offset, regs_read); } } if (inst->dst.file == GRF && inst->dst.reg == spill_reg) { - inst->dst.reg = virtual_grf_alloc(size); - - /* Since we spill/unspill the whole thing even if we access - * just a component, we may need to unspill before the - * instruction we're spilling for. + int subset_spill_offset = (spill_offset + + reg_size * inst->dst.reg_offset); + inst->dst.reg = virtual_grf_alloc(inst->regs_written); + inst->dst.reg_offset = 0; + + /* If our write is going to affect just part of the + * inst->regs_written(), then we need to unspill the destination + * since we write back out all of the regs_written(). 
*/ - if (size != 1 || inst->predicated) { - emit_unspill(inst, inst->dst, spill_offset); + if (inst->predicate || inst->force_uncompressed || inst->force_sechalf) { + emit_unspill(inst, inst->dst, subset_spill_offset, + inst->regs_written); } fs_reg spill_src = inst->dst; @@ -398,19 +716,20 @@ fs_visitor::spill_reg(int spill_reg) spill_src.negate = false; spill_src.smear = -1; - for (int chan = 0; chan < size; chan++) { - fs_inst *spill_inst = new(mem_ctx) fs_inst(FS_OPCODE_SPILL, - reg_null_f, spill_src); + for (int chan = 0; chan < inst->regs_written; chan++) { + fs_inst *spill_inst = + new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE, + reg_null_f, spill_src); spill_src.reg_offset++; - spill_inst->offset = spill_offset + chan * REG_SIZE; + spill_inst->offset = subset_spill_offset + chan * reg_size; spill_inst->ir = inst->ir; spill_inst->annotation = inst->annotation; - spill_inst->base_mrf = 14; - spill_inst->mlen = 2; /* header, value */ + spill_inst->mlen = 1 + dispatch_width / 8; /* header, value */ + spill_inst->base_mrf = spill_base_mrf; inst->insert_after(spill_inst); } } } - this->live_intervals_valid = false; + invalidate_live_intervals(); }
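
Note (not part of the commit above): the comment block introduced in brw_alloc_reg_set() explains why the allocator needs register classes of size 1..MAX_SAMPLER_MESSAGE_SIZE on gen7+ and sizes 1, 2, 3, 4 and 8 on older generations, and how the total ra register count is derived from them. The standalone sketch below only mirrors that bookkeeping so the sizing math can be checked in isolation; it assumes BRW_MAX_GRF = 128 and MAX_SAMPLER_MESSAGE_SIZE = 11 (the values the driver uses elsewhere), and it is an illustration rather than driver code.

    /* Standalone sketch of the class-size / ra_reg_count math from
     * brw_alloc_reg_set().  Assumed constants: 128 GRFs, 11-register
     * sampler messages.  Not part of the Mesa diff above.
     */
    #include <stdio.h>

    #define BRW_MAX_GRF              128
    #define MAX_SAMPLER_MESSAGE_SIZE 11

    static int
    count_ra_regs(int gen, int reg_width)
    {
       int base_reg_count = BRW_MAX_GRF / reg_width;
       int class_sizes[MAX_SAMPLER_MESSAGE_SIZE];
       int class_count = 0;

       if (gen >= 7) {
          /* gen7+: texturing SENDs read from GRFs, so every size up to the
           * sampler message limit gets its own class.
           */
          for (class_count = 0; class_count < MAX_SAMPLER_MESSAGE_SIZE; class_count++)
             class_sizes[class_count] = class_count + 1;
       } else {
          /* Older gens: 1, 2, 3, 4 registers, plus 8 for the workaround
           * SIMD16 texturing case.
           */
          for (class_count = 0; class_count < 4; class_count++)
             class_sizes[class_count] = class_count + 1;
          class_sizes[class_count++] = 8;
       }

       /* A class of size N can be placed at base_reg_count - (N - 1) base
        * GRFs, and each possible placement is one register in the ra_set,
        * so larger classes contribute fewer ra registers.
        */
       int ra_reg_count = 0;
       for (int i = 0; i < class_count; i++)
          ra_reg_count += base_reg_count - (class_sizes[i] - 1);

       return ra_reg_count;
    }

    int
    main(void)
    {
       printf("gen7 SIMD8:  %d ra regs\n", count_ra_regs(7, 1));
       printf("gen7 SIMD16: %d ra regs\n", count_ra_regs(7, 2));
       printf("gen5 SIMD8:  %d ra regs\n", count_ra_regs(5, 1));
       return 0;
    }

Under those assumptions this prints 1353 ra registers for gen7 SIMD8, 649 for gen7 SIMD16 (base_reg_count is halved because each node covers a physical register pair), and 627 for gen5 SIMD8, which is why the commit precomputes one reg_set per reg_width in brw_fs_alloc_reg_sets() instead of rebuilding the set on every compile.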