else
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
+ /* The pull load message will load a vec4 (16 bytes). If we are loading
+ * a double this means we are only loading 2 elements worth of data.
+ * We also want to use a 32-bit data type for the dst of the load operation
+ * so other parts of the driver don't get confused about the size of the
+ * result.
+ */
int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
- fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
+ fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written),
+ BRW_REGISTER_TYPE_F);
fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
inst->regs_written = regs_written;
inst->mlen = 1 + bld.dispatch_width() / 8;
}
- bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * scale));
+ if (type_sz(dst.type) == 8) {
+ assert(scale == 1);
+ shuffle_32bit_load_result_to_64bit_data(
+ bld, retype(vec4_result, dst.type), vec4_result, 2);
+ }
+
+ vec4_result.type = dst.type;
+ bld.MOV(dst, offset(vec4_result, bld,
+ (const_offset & 0xf) / type_sz(vec4_result.type) * scale));
}
/**
return 4 * type_size_vec4(type);
}
+/* Attribute arrays are loaded as one vec4 per element (or matrix column),
+ * except for double-precision types, which are loaded as one dvec4.
+ */
+extern "C" int
+type_size_vs_input(const struct glsl_type *type)
+{
+   if (type->is_double()) {
+      /* NOTE(review): presumably type_size_dvec4() counts each dvec4 as two
+       * vec4-sized input slots — confirm against its definition.
+       */
+      return type_size_dvec4(type);
+   } else {
+      return type_size_vec4(type);
+   }
+}
+
/**
* Create a MOV to read the timestamp register.
*
case FS_OPCODE_TXB:
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXF_LZ:
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TG4:
case SHADER_OPCODE_TG4_OFFSET:
case SHADER_OPCODE_TXL:
+ case SHADER_OPCODE_TXL_LZ:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_LOD:
case SHADER_OPCODE_SAMPLEINFO:
}
fs_reg *
-fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
- bool origin_upper_left)
+fs_visitor::emit_fragcoord_interpolation()
{
assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
fs_reg wpos = *reg;
- bool flip = !origin_upper_left ^ key->render_to_fbo;
/* gl_FragCoord.x */
- if (pixel_center_integer) {
- bld.MOV(wpos, this->pixel_x);
- } else {
- bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
- }
+ bld.MOV(wpos, this->pixel_x);
wpos = offset(wpos, bld, 1);
/* gl_FragCoord.y */
- if (!flip && pixel_center_integer) {
- bld.MOV(wpos, this->pixel_y);
- } else {
- fs_reg pixel_y = this->pixel_y;
- float offset = (pixel_center_integer ? 0.0f : 0.5f);
-
- if (flip) {
- pixel_y.negate = true;
- offset += key->drawable_height - 1.0f;
- }
-
- bld.ADD(wpos, pixel_y, brw_imm_f(offset));
- }
+ bld.MOV(wpos, this->pixel_y);
wpos = offset(wpos, bld, 1);
/* gl_FragCoord.z */
inst->no_dd_clear = true;
inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
- mod_centroid && !key->persample_shading,
- mod_sample || key->persample_shading);
+ mod_centroid && !key->persample_interp,
+ mod_sample || key->persample_interp);
inst->predicate = BRW_PREDICATE_NORMAL;
inst->predicate_inverse = false;
if (devinfo->has_pln)
} else {
emit_linterp(*attr, fs_reg(interp), interpolation_mode,
- mod_centroid && !key->persample_shading,
- mod_sample || key->persample_shading);
+ mod_centroid && !key->persample_interp,
+ mod_sample || key->persample_interp);
}
if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
bld.MUL(*attr, *attr, this->pixel_w);
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+ brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
assert(dst.type == BRW_REGISTER_TYPE_F);
- if (key->compute_pos_offset) {
+ if (wm_prog_data->persample_dispatch) {
/* Convert int_sample_pos to floating point */
bld.MOV(dst, int_sample_pos);
/* Scale to the range [0, 1] */
fs_visitor::emit_samplemaskin_setup()
{
assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_key *key = (brw_wm_prog_key *) this->key;
+ brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
assert(devinfo->gen >= 6);
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
BRW_REGISTER_TYPE_D));
- if (key->persample_shading) {
+ if (wm_prog_data->persample_dispatch) {
/* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
* and a mask representing which sample is being processed by the
* current shader invocation.
void
fs_visitor::assign_curb_setup()
{
- if (dispatch_width == 8) {
- prog_data->dispatch_grf_start_reg = payload.num_regs;
- } else {
- if (stage == MESA_SHADER_FRAGMENT) {
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
- } else if (stage == MESA_SHADER_COMPUTE) {
- brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
- prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
- } else {
- unreachable("Unsupported shader type!");
- }
- }
-
prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
/* Map the offsets in the UNIFORM file to fixed HW regs. */
inst->src[i].nr +
inst->src[i].reg_offset;
- unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
+ /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
+ *
+ * VertStride must be used to cross GRF register boundaries. This
+ * rule implies that elements within a 'Width' cannot cross GRF
+ * boundaries.
+ *
+ * So, for registers that are large enough, we have to split the exec
+ * size in two and trust the compression state to sort it out.
+ */
+ unsigned total_size = inst->exec_size *
+ inst->src[i].stride *
+ type_sz(inst->src[i].type);
+
+ assert(total_size <= 2 * REG_SIZE);
+ const unsigned exec_size =
+ (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
+
+ unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
struct brw_reg reg =
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
inst->src[i].subreg_offset),
- inst->exec_size * inst->src[i].stride,
+ exec_size * inst->src[i].stride,
width, inst->src[i].stride);
reg.abs = inst->src[i].abs;
reg.negate = inst->src[i].negate;
assert(stage == MESA_SHADER_VERTEX);
/* Each attribute is 4 regs. */
- this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
+ this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
assert(vs_prog_data->base.urb_read_length <= 15);
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ /* Default to -1 meaning no location */
+ memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+ memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+
int chunk_start = -1;
/* First push 64-bit uniforms to ensure they are properly aligned */
if (!is_live[u] || !is_live_64bit[u])
continue;
- pull_constant_loc[u] = -1;
- push_constant_loc[u] = -1;
-
set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
push_constant_loc, pull_constant_loc,
&num_push_constants, &num_pull_constants,
if (!is_live[u] || is_live_64bit[u])
continue;
- pull_constant_loc[u] = -1;
- push_constant_loc[u] = -1;
-
set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
push_constant_loc, pull_constant_loc,
&num_push_constants, &num_pull_constants,
inst->src[i].file = VGRF;
inst->src[i].nr = dst.nr;
inst->src[i].reg_offset = 0;
- inst->src[i].set_smear(pull_index & 3);
+ inst->src[i].set_smear((pull_index & 3) * 4 /
+ type_sz(inst->src[i].type));
brw_mark_surface_used(prog_data, index);
}
fs_visitor::emit_repclear_shader()
{
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- int base_mrf = 1;
+ int base_mrf = 0;
int color_mrf = base_mrf + 2;
fs_inst *mov;
switch (op) {
case FS_OPCODE_TXB:
case SHADER_OPCODE_TXL:
+ if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
+ op = SHADER_OPCODE_TXL_LZ;
+ break;
+ }
bld.MOV(sources[length], lod);
length++;
break;
length++;
}
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
- length++;
+ if (devinfo->gen >= 9 && lod.is_zero()) {
+ op = SHADER_OPCODE_TXF_LZ;
+ } else {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+ length++;
+ }
for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
coordinate_done = true;
break;
+
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_UMS:
}
}
-/**
- * The \p rows array of registers represents a \p num_rows by \p num_columns
- * matrix in row-major order, write it in column-major order into the register
- * passed as destination. \p stride gives the separation between matrix
- * elements in the input in fs_builder::dispatch_width() units.
- */
-static void
-emit_transpose(const fs_builder &bld,
- const fs_reg &dst, const fs_reg *rows,
- unsigned num_rows, unsigned num_columns, unsigned stride)
-{
- fs_reg *const components = new fs_reg[num_rows * num_columns];
-
- for (unsigned i = 0; i < num_columns; ++i) {
- for (unsigned j = 0; j < num_rows; ++j)
- components[num_rows * i + j] = offset(rows[j], bld, stride * i);
- }
-
- bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
-
- delete[] components;
-}
-
bool
fs_visitor::lower_simd_width()
{
if (inst->src[j].file != BAD_FILE &&
!is_uniform(inst->src[j])) {
/* Get the i-th copy_width-wide chunk of the source. */
- const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
+ const fs_builder cbld = lbld.group(copy_width, 0);
+ const fs_reg src = offset(inst->src[j], cbld, i);
const unsigned src_size = inst->components_read(j);
- /* Use a trivial transposition to copy one every n
- * copy_width-wide components of the register into a
- * temporary passed as source to the lowered instruction.
+ /* Copy one every n copy_width-wide components of the
+ * register into a temporary passed as source to the lowered
+ * instruction.
*/
split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
- emit_transpose(lbld.group(copy_width, 0),
- split_inst.src[j], &src, 1, src_size, n);
+
+ for (unsigned k = 0; k < src_size; ++k)
+ cbld.MOV(offset(split_inst.src[j], lbld, k),
+ offset(src, cbld, n * k));
}
}
}
if (inst->regs_written) {
- /* Distance between useful channels in the temporaries, skipping
- * garbage if the lowered instruction is wider than the original.
- */
- const unsigned m = lower_width / copy_width;
+ const fs_builder lbld = ibld.group(lower_width, 0);
/* Interleave the components of the result from the lowered
- * instructions. We need to set exec_all() when copying more than
- * one half per component, because LOAD_PAYLOAD (in terms of which
- * emit_transpose is implemented) can only use the same channel
- * enable signals for all of its non-header sources.
+ * instructions.
*/
- emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
- .group(copy_width, 0),
- inst->dst, dsts, n, dst_size, m);
+ for (unsigned i = 0; i < dst_size; ++i) {
+ for (unsigned j = 0; j < n; ++j) {
+ const fs_builder cbld = ibld.group(copy_width, j);
+ cbld.MOV(offset(inst->dst, cbld, n * i + j),
+ offset(dsts[j], lbld, i));
+ }
+ }
}
inst->remove(block);
{
assert(stage == MESA_SHADER_FRAGMENT);
brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
unsigned barycentric_interp_modes =
(stage == MESA_SHADER_FRAGMENT) ?
}
}
- prog_data->uses_pos_offset = key->compute_pos_offset;
/* R31: MSAA position offsets. */
- if (prog_data->uses_pos_offset) {
+ if (prog_data->persample_dispatch &&
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+ /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select
+ * POSOFFSET_SAMPLE"
+ *
+ * So we can only really get sample positions if we are doing real
+ * per-sample dispatch. If we need gl_SamplePosition and we don't have
+ * persample dispatch, we hard-code it to 0.5.
+ */
+ prog_data->uses_pos_offset = true;
payload.sample_pos_reg = payload.num_regs;
payload.num_regs++;
}
payload.num_regs++;
}
- /* Use a maximum of 32 registers for push-model inputs. */
- const unsigned max_push_components = 32;
+ /* Use a maximum of 24 registers for push-model inputs. */
+ const unsigned max_push_components = 24;
/* If pushing our inputs would take too many registers, reduce the URB read
* length (which is in HWords, or 8 registers), and resort to pulling.
}
void
-fs_visitor::allocate_registers()
+fs_visitor::allocate_registers(bool allow_spilling)
{
bool allocated_without_spills;
SCHEDULE_PRE_LIFO,
};
+ bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
+
/* Try each scheduling heuristic to see if it can successfully register
* allocate without spilling. They should be ordered by decreasing
* performance but increasing likelihood of allocating.
assign_regs_trivial();
allocated_without_spills = true;
} else {
- allocated_without_spills = assign_regs(false);
+ allocated_without_spills = assign_regs(false, spill_all);
}
if (allocated_without_spills)
break;
/* Since we're out of heuristics, just go spill registers until we
* get an allocation.
*/
- while (!assign_regs(true)) {
+ while (!assign_regs(true, spill_all)) {
if (failed)
break;
}
}
+ assert(last_scratch == 0 || allow_spilling);
+
/* This must come after all optimization and register allocation, since
* it inserts dead code that happens to have side effects, and it does
* so based on the actual physical registers in use.
assign_vs_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
assign_tcs_single_patch_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
assign_tes_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
assign_gs_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
bool
-fs_visitor::run_fs(bool do_rep_send)
+fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
{
brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
assign_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(allow_spilling);
if (failed)
return false;
}
- if (dispatch_width == 8)
- wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
- else
- wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
-
return !failed;
}
assign_curb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
if (failed)
return false;
const nir_shader *src_shader,
struct gl_program *prog,
int shader_time_index8, int shader_time_index16,
+ bool allow_spilling,
bool use_rep_send,
unsigned *final_assembly_size,
char **error_str)
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
+ prog_data->persample_dispatch =
+ key->multisample_fbo &&
+ (key->persample_interp ||
+ (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+ SYSTEM_BIT_SAMPLE_POS)) ||
+ shader->info.fs.uses_sample_qualifier);
+
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
prog_data->barycentric_interp_modes =
brw_compute_barycentric_interp_modes(compiler->devinfo,
key->flat_shade,
- key->persample_shading,
+ key->persample_interp,
shader);
- fs_visitor v(compiler, log_data, mem_ctx, key,
- &prog_data->base, prog, shader, 8,
- shader_time_index8);
- if (!v.run_fs(false /* do_rep_send */)) {
+ cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
+ uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
+ unsigned simd8_grf_used = 0, simd16_grf_used = 0;
+
+ fs_visitor v8(compiler, log_data, mem_ctx, key,
+ &prog_data->base, prog, shader, 8,
+ shader_time_index8);
+ if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
if (error_str)
- *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+ *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
return NULL;
+ } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
+ simd8_cfg = v8.cfg;
+ simd8_grf_start = v8.payload.num_regs;
+ simd8_grf_used = v8.grf_used;
}
- cfg_t *simd16_cfg = NULL;
- fs_visitor v2(compiler, log_data, mem_ctx, key,
- &prog_data->base, prog, shader, 16,
- shader_time_index16);
- if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
- if (!v.simd16_unsupported) {
- /* Try a SIMD16 compile */
- v2.import_uniforms(&v);
- if (!v2.run_fs(use_rep_send)) {
- compiler->shader_perf_log(log_data,
- "SIMD16 shader failed to compile: %s",
- v2.fail_msg);
- } else {
- simd16_cfg = v2.cfg;
- }
+ if (!v8.simd16_unsupported &&
+ likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
+ /* Try a SIMD16 compile */
+ fs_visitor v16(compiler, log_data, mem_ctx, key,
+ &prog_data->base, prog, shader, 16,
+ shader_time_index16);
+ v16.import_uniforms(&v8);
+ if (!v16.run_fs(allow_spilling, use_rep_send)) {
+ compiler->shader_perf_log(log_data,
+ "SIMD16 shader failed to compile: %s",
+ v16.fail_msg);
+ } else {
+ simd16_cfg = v16.cfg;
+ simd16_grf_start = v16.payload.num_regs;
+ simd16_grf_used = v16.grf_used;
+ }
+ }
+
+ /* When the caller requests a repclear shader, they want SIMD16-only */
+ if (use_rep_send)
+ simd8_cfg = NULL;
+
+ /* Prior to Iron Lake, the PS had a single shader offset with a jump table
+ * at the top to select the shader. We've never implemented that.
+ * Instead, we just give them exactly one shader and we pick the widest one
+ * available.
+ */
+ if (compiler->devinfo->gen < 5 && simd16_cfg)
+ simd8_cfg = NULL;
+
+ if (prog_data->persample_dispatch) {
+ /* Starting with SandyBridge (where we first get MSAA), the different
+ * pixel dispatch combinations are grouped into classifications A
+ * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware
+ * generations, the only configurations supporting persample dispatch
+    * are those in which only one dispatch width is enabled.
+ *
+ * If computed depth is enabled, SNB only allows SIMD8 while IVB+
+ * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
+ */
+ if (compiler->devinfo->gen == 6 &&
+ prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
+ simd16_cfg = NULL;
+ } else if (simd16_cfg) {
+ simd8_cfg = NULL;
}
}
*/
brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
- cfg_t *simd8_cfg;
- int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
- if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
- simd8_cfg = NULL;
- prog_data->no_8 = true;
- } else {
- simd8_cfg = v.cfg;
- prog_data->no_8 = false;
- }
-
fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
- v.promoted_constants, v.runtime_check_aads_emit,
+ v8.promoted_constants, v8.runtime_check_aads_emit,
MESA_SHADER_FRAGMENT);
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
shader->info.name));
}
- if (simd8_cfg)
+ if (simd8_cfg) {
+ prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8);
- if (simd16_cfg)
- prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
+ prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+
+ if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
+ prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
+ prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
+ }
+ } else if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ g.generate_code(simd16_cfg, 16);
+ prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
+ }
return g.get_assembly(final_assembly_size);
}
} else {
cfg = v8.cfg;
prog_data->simd_size = 8;
+ prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
}
}
} else {
cfg = v16.cfg;
prog_data->simd_size = 16;
+ prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
}
}