const struct brw_stage_prog_data *stage_prog_data;
struct anv_pipeline_bind_map map;
struct brw_wm_prog_key key;
- uint32_t kernel = NO_KERNEL;
unsigned char sha1[20];
populate_wm_prog_key(&pipeline->device->info, info, extra, &key);
if (module->size > 0) {
anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint, spec_info);
- kernel = anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
+ pipeline->ps_ksp0 =
+ anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
}
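+ /* ps_ksp0 now doubles as the "needs compiling" flag: it is assumed to
+ * start out as NO_KERNEL, and anv_pipeline_cache_search is expected to
+ * return NO_KERNEL on a miss.
+ */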
- if (kernel == NO_KERNEL) {
+ if (pipeline->ps_ksp0 == NO_KERNEL) {
struct brw_wm_prog_data prog_data = { 0, };
struct anv_pipeline_binding surface_to_descriptor[256];
struct anv_pipeline_binding sampler_to_descriptor[256];
}
stage_prog_data = &prog_data.base;
- kernel = anv_pipeline_cache_upload_kernel(cache,
- module->size > 0 ? sha1 : NULL,
- shader_code, code_size,
+ pipeline->ps_ksp0 =
+ anv_pipeline_cache_upload_kernel(cache,
+ module->size > 0 ? sha1 : NULL,
+ shader_code, code_size,
&stage_prog_data, sizeof(prog_data),
&map);
ralloc_free(mem_ctx);
}
- const struct brw_wm_prog_data *wm_prog_data =
- (const struct brw_wm_prog_data *) stage_prog_data;
-
- if (wm_prog_data->no_8)
- pipeline->ps_simd8 = NO_KERNEL;
- else
- pipeline->ps_simd8 = kernel;
-
- if (wm_prog_data->no_8 || wm_prog_data->prog_offset_16) {
- pipeline->ps_simd16 = kernel + wm_prog_data->prog_offset_16;
- } else {
- pipeline->ps_simd16 = NO_KERNEL;
- }
-
- pipeline->ps_ksp2 = 0;
- pipeline->ps_grf_start2 = 0;
- if (pipeline->ps_simd8 != NO_KERNEL) {
- pipeline->ps_ksp0 = pipeline->ps_simd8;
- pipeline->ps_grf_start0 = wm_prog_data->base.dispatch_grf_start_reg;
- if (pipeline->ps_simd16 != NO_KERNEL) {
- pipeline->ps_ksp2 = pipeline->ps_simd16;
- pipeline->ps_grf_start2 = wm_prog_data->dispatch_grf_start_reg_16;
- }
- } else if (pipeline->ps_simd16 != NO_KERNEL) {
- pipeline->ps_ksp0 = pipeline->ps_simd16;
- pipeline->ps_grf_start0 = wm_prog_data->dispatch_grf_start_reg_16;
- }
-
anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_FRAGMENT,
stage_prog_data, &map);
struct anv_state blend_state;
uint32_t vs_simd8;
uint32_t vs_vec4;
- uint32_t ps_simd8;
- uint32_t ps_simd16;
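+ /* The SIMD8/SIMD16 bookkeeping now lives in brw_wm_prog_data, so the
+ * pipeline only needs to track the base fragment kernel pointer.
+ */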
uint32_t ps_ksp0;
- uint32_t ps_ksp2;
- uint32_t ps_grf_start0;
- uint32_t ps_grf_start2;
uint32_t gs_kernel;
uint32_t cs_simd;
POSOFFSET_SAMPLE : POSOFFSET_NONE;
ps._32PixelDispatchEnable = false;
- ps._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL;
- ps._8PixelDispatchEnable = pipeline->ps_simd8 != NO_KERNEL;
+ ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+ ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
- ps.DispatchGRFStartRegisterforConstantSetupData0 = pipeline->ps_grf_start0,
+ ps.DispatchGRFStartRegisterforConstantSetupData0 =
+ wm_prog_data->base.dispatch_grf_start_reg,
ps.DispatchGRFStartRegisterforConstantSetupData1 = 0,
- ps.DispatchGRFStartRegisterforConstantSetupData2 = pipeline->ps_grf_start2,
+ ps.DispatchGRFStartRegisterforConstantSetupData2 =
+ wm_prog_data->dispatch_grf_start_reg_2,
/* Haswell requires the sample mask to be set in this packet as well as
* in 3DSTATE_SAMPLE_MASK; the values should match. */
/* _NEW_BUFFERS, _NEW_MULTISAMPLE */
ps.KernelStartPointer1 = 0;
- ps.KernelStartPointer2 = pipeline->ps_ksp2;
+ ps.KernelStartPointer2 = pipeline->ps_ksp0 + wm_prog_data->prog_offset_2;
}
/* FIXME-GEN7: This needs a lot more work, cf gen7 upload_wm_state(). */
anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) {
ps.KernelStartPointer0 = pipeline->ps_ksp0;
ps.KernelStartPointer1 = 0;
- ps.KernelStartPointer2 = pipeline->ps_ksp2;
- ps._8PixelDispatchEnable = pipeline->ps_simd8 != NO_KERNEL;
- ps._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL;
+ ps.KernelStartPointer2 = pipeline->ps_ksp0 + wm_prog_data->prog_offset_2;
+ ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+ ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
ps._32PixelDispatchEnable = false;
ps.SingleProgramFlow = false;
ps.VectorMaskEnable = true;
ps.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_FRAGMENT];
ps.PerThreadScratchSpace = scratch_space(&wm_prog_data->base);
- ps.DispatchGRFStartRegisterForConstantSetupData0 = pipeline->ps_grf_start0;
+ ps.DispatchGRFStartRegisterForConstantSetupData0 =
+ wm_prog_data->base.dispatch_grf_start_reg;
ps.DispatchGRFStartRegisterForConstantSetupData1 = 0;
- ps.DispatchGRFStartRegisterForConstantSetupData2 = pipeline->ps_grf_start2;
+ ps.DispatchGRFStartRegisterForConstantSetupData2 =
+ wm_prog_data->dispatch_grf_start_reg_2;
}
bool per_sample_ps = pCreateInfo->pMultisampleState &&
GLuint num_varying_inputs;
- GLuint dispatch_grf_start_reg_16;
- GLuint reg_blocks;
- GLuint reg_blocks_16;
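+ /* reg_blocks_0 and base.dispatch_grf_start_reg describe the kernel at
+ * KSP[0]; reg_blocks_2, dispatch_grf_start_reg_2 and prog_offset_2
+ * describe the second kernel, which starts prog_offset_2 bytes into
+ * the program and is pointed to by KSP[2].
+ */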
+ uint8_t reg_blocks_0;
+ uint8_t reg_blocks_2;
+
+ uint8_t dispatch_grf_start_reg_2;
+ uint32_t prog_offset_2;
struct {
/** @{
bool computed_stencil;
bool early_fragment_tests;
- bool no_8;
+ bool dispatch_8;
+ bool dispatch_16;
bool dual_src_blend;
bool persample_dispatch;
bool uses_pos_offset;
bool uses_src_w;
bool uses_sample_mask;
bool pulls_bary;
- uint32_t prog_offset_16;
/**
* Mask of which interpolation modes are required by the fragment shader.
return false;
}
- if (dispatch_width == 8)
- wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
- else
- wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
-
return !failed;
}
shader);
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
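+ /* GRF start and usage are captured per compile because which kernel
+ * lands in slot 0 versus slot 2 is only decided after both the SIMD8
+ * and SIMD16 attempts have run.
+ */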
+ uint8_t simd8_grf_start, simd16_grf_start;
+ unsigned simd8_grf_used, simd16_grf_used;
fs_visitor v8(compiler, log_data, mem_ctx, key,
&prog_data->base, prog, shader, 8,
return NULL;
} else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
simd8_cfg = v8.cfg;
- prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
+ simd8_grf_start = v8.payload.num_regs;
+ simd8_grf_used = v8.grf_used;
}
if (!v8.simd16_unsupported &&
v16.fail_msg);
} else {
simd16_cfg = v16.cfg;
- prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
+ simd16_grf_start = v16.payload.num_regs;
+ simd16_grf_used = v16.grf_used;
}
}
if (compiler->devinfo->gen < 5 && simd16_cfg)
simd8_cfg = NULL;
+ if (prog_data->persample_dispatch) {
+ /* Starting with SandyBridge (where we first get MSAA), the different
+ * pixel dispatch combinations are grouped into classifications A
+ * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware
+ * generations, the only configurations supporting persample dispatch
+ * are those in which only one dispatch width is enabled.
+ *
+ * If computed depth is enabled, SNB only allows SIMD8, while IVB+
+ * allows SIMD8 or SIMD16, so we choose SIMD16 if available.
+ */
+ if (compiler->devinfo->gen == 6 &&
+ prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
+ simd16_cfg = NULL;
+ } else if (simd16_cfg) {
+ simd8_cfg = NULL;
+ }
+ }
+
/* We have to compute the flat inputs after the visitor is finished running
* because it relies on prog_data->urb_setup which is computed in
* fs_visitor::calculate_urb_setup().
}
if (simd8_cfg) {
+ prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8);
- prog_data->no_8 = false;
- } else {
- prog_data->no_8 = true;
+ prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+
+ if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
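+ /* generate_code() returns the offset at which this SIMD16 program
+ * begins in the assembly; state setup later adds it to the program's
+ * upload offset to form KSP[2].
+ */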
+ prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
+ prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
+ prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
+ }
+ } else if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ g.generate_code(simd16_cfg, 16);
+ prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
}
- if (simd16_cfg)
- prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
-
return g.get_assembly(final_assembly_size);
}
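Taken together, the new fields give state setup a single convention for
both kernel slots. A minimal consumer-side sketch, not code from the
patch: setup_ps_kernels and the prog_offset parameter are hypothetical,
and brw_wm_prog_data is assumed to be the struct as modified above.

#include <stdbool.h>
#include <stdint.h>

static void
setup_ps_kernels(const struct brw_wm_prog_data *prog_data,
                 uint32_t prog_offset, /* where the program was uploaded */
                 uint32_t *ksp0, uint32_t *ksp2,
                 bool *enable_8, bool *enable_16)
{
   *enable_8 = prog_data->dispatch_8;
   *enable_16 = prog_data->dispatch_16;

   /* The first kernel always starts at the program's upload offset; the
    * second kernel (SIMD16 when both widths are present) starts
    * prog_offset_2 bytes into the program.
    */
   *ksp0 = prog_offset;
   *ksp2 = prog_offset + prog_data->prog_offset_2;
}

This mirrors the gen6 through gen8 state-setup hunks below, which all
program both slots unconditionally.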
stage_prog_data->nr_pull_params = 0;
stage_prog_data->curb_read_length = 0;
stage_prog_data->dispatch_grf_start_reg = 2;
- wm_prog_data->dispatch_grf_start_reg_16 = 2;
+ wm_prog_data->dispatch_grf_start_reg_2 = 2;
grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
calculate_cfg();
sizeof(*wm), 32, &brw->wm.base.state_offset);
memset(wm, 0, sizeof(*wm));
- if (prog_data->prog_offset_16) {
+ if (prog_data->dispatch_8 && prog_data->dispatch_16) {
/* These two fields should be the same pre-gen6, which is why we
* only have one hardware field to program for both dispatch
* widths.
*/
assert(prog_data->base.dispatch_grf_start_reg ==
- prog_data->dispatch_grf_start_reg_16);
+ prog_data->dispatch_grf_start_reg_2);
}
/* BRW_NEW_PROGRAM_CACHE | BRW_NEW_FS_PROG_DATA */
- if (prog_data->no_8) {
- wm->wm5.enable_16_pix = 1;
- wm->thread0.grf_reg_count = prog_data->reg_blocks_16;
- wm->thread0.kernel_start_pointer =
- brw_program_reloc(brw,
- brw->wm.base.state_offset +
- offsetof(struct brw_wm_unit_state, thread0),
- brw->wm.base.prog_offset +
- prog_data->prog_offset_16 +
- (prog_data->reg_blocks_16 << 1)) >> 6;
-
- } else {
- wm->thread0.grf_reg_count = prog_data->reg_blocks;
- wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_16;
-
- wm->wm5.enable_8_pix = 1;
- if (prog_data->prog_offset_16)
- wm->wm5.enable_16_pix = 1;
+ wm->wm5.enable_8_pix = prog_data->dispatch_8;
+ wm->wm5.enable_16_pix = prog_data->dispatch_16;
+ if (prog_data->dispatch_8 || prog_data->dispatch_16) {
+ wm->thread0.grf_reg_count = prog_data->reg_blocks_0;
wm->thread0.kernel_start_pointer =
brw_program_reloc(brw,
brw->wm.base.state_offset +
offsetof(struct brw_wm_unit_state, thread0),
brw->wm.base.prog_offset +
(wm->thread0.grf_reg_count << 1)) >> 6;
+ }
+ if (prog_data->prog_offset_2) {
+ wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_2;
wm->wm9.kernel_start_pointer_2 =
brw_program_reloc(brw,
brw->wm.base.state_offset +
offsetof(struct brw_wm_unit_state, wm9),
brw->wm.base.prog_offset +
- prog_data->prog_offset_16 +
+ prog_data->prog_offset_2 +
(wm->wm9.grf_reg_count_2 << 1)) >> 6;
}
dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
- if (prog_data->prog_offset_16 || prog_data->no_8) {
+ if (prog_data->dispatch_8)
+ dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
+
+ if (prog_data->dispatch_16)
dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
- if (!prog_data->no_8 && !prog_data->persample_dispatch) {
- dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
- dw4 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
- dw4 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN6_WM_DISPATCH_START_GRF_SHIFT_2);
- ksp0 = stage_state->prog_offset;
- ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
- } else {
- dw4 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
- }
- }
- else {
- dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
- dw4 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset;
- }
+ dw4 |= prog_data->base.dispatch_grf_start_reg <<
+ GEN6_WM_DISPATCH_START_GRF_SHIFT_0;
+ dw4 |= prog_data->dispatch_grf_start_reg_2 <<
+ GEN6_WM_DISPATCH_START_GRF_SHIFT_2;
+
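+ /* The KSP and GRF-start fields for a disabled dispatch width are
+ * ignored by the hardware, so both kernels' fields can be programmed
+ * unconditionally.
+ */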
+ ksp0 = stage_state->prog_offset;
+ ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
if (dual_source_blend_enable)
dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE;
dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
else {
dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
-
- /* From the Sandy Bridge PRM, Vol 2 part 1, 7.7.1 ("Pixel Grouping
- * (Dispatch Size) Control"), p.334:
- *
- * Note: in the table below, the Valid column indicates which
- * products that combination is supported on. Combinations of
- * dispatch enables not listed in the table are not available on
- * any product.
- *
- * A: Valid on all products
- *
- * B: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
- * computed depth.
- *
- * D: Valid on all products, except when in non-1x PERSAMPLE mode
- * (applies to [DevSNB+] only). Not valid on [DevSNB] if 4x
- * PERPIXEL mode with pixel shader computed depth.
- *
- * E: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
- * computed depth.
- *
- * F: Valid on all products, except not valid on [DevSNB] if 4x
- * PERPIXEL mode with pixel shader computed depth.
- *
- * In the table that follows, the only entry with "A" in the Valid
- * column is the entry where only 8 pixel dispatch is enabled.
- * Therefore, when we are in PERPIXEL mode with pixel shader computed
- * depth, we need to disable SIMD16 dispatch.
- */
- if (dw5 & GEN6_WM_COMPUTED_DEPTH)
- dw5 &= ~GEN6_WM_16_DISPATCH_ENABLE;
}
} else {
dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
dw4 |= fast_clear_op;
- if (prog_data->prog_offset_16 || prog_data->no_8) {
+ if (prog_data->dispatch_16)
dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
- /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
- * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
- * is successfully compiled. In majority of the cases that bring us
- * better performance than 'SIMD8 only' dispatch.
- */
- if (!prog_data->no_8 && !prog_data->persample_dispatch) {
- dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
- dw5 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- dw5 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
- ksp0 = stage_state->prog_offset;
- ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
- } else {
- dw5 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
- }
- }
- else {
+ if (prog_data->dispatch_8)
dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
- dw5 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset;
- }
+
+ dw5 |= prog_data->base.dispatch_grf_start_reg <<
+ GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
+ dw5 |= prog_data->dispatch_grf_start_reg_2 <<
+ GEN7_PS_DISPATCH_START_GRF_SHIFT_2;
+
+ ksp0 = stage_state->prog_offset;
+ ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
BEGIN_BATCH(8);
OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
dw6 |= fast_clear_op;
- if (prog_data->prog_offset_16 || prog_data->no_8) {
+ if (prog_data->dispatch_8)
+ dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
+
+ if (prog_data->dispatch_16)
dw6 |= GEN7_PS_16_DISPATCH_ENABLE;
- /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
- * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
- * is successfully compiled. In majority of the cases that bring us
- * better performance than 'SIMD8 only' dispatch.
- */
- if (!prog_data->no_8 && !prog_data->persample_dispatch) {
- dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
- dw7 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- dw7 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
- ksp0 = stage_state->prog_offset;
- ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
- } else {
- dw7 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
-
- ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
- }
- } else {
- dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
- dw7 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset;
- }
+ dw7 |= prog_data->base.dispatch_grf_start_reg <<
+ GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
+ dw7 |= prog_data->dispatch_grf_start_reg_2 <<
+ GEN7_PS_DISPATCH_START_GRF_SHIFT_2;
+
+ ksp0 = stage_state->prog_offset;
+ ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
BEGIN_BATCH(12);
OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2));