#include <brw_vs.h>
#include <brw_gs.h>
+#include <brw_cs.h>
#include <mesa/main/shaderobj.h>
#include <mesa/main/fbobject.h>
return true;
}
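+/* Compile the linked compute shader with the scalar backend, upload the
+ * resulting kernel into the pipeline's program stream, and remember its
+ * offset in pipeline->cs_simd so the interface descriptor can point at it.
+ */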
+static bool
+brw_codegen_cs_prog(struct brw_context *brw,
+ struct gl_shader_program *prog,
+ struct brw_compute_program *cp,
+ struct brw_cs_prog_key *key, struct anv_pipeline *pipeline)
+{
+ struct gl_context *ctx = &brw->ctx;
+ const GLuint *program;
+ void *mem_ctx = ralloc_context(NULL);
+ GLuint program_size;
+ struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
+
+ struct gl_shader *cs = prog->_LinkedShaders[MESA_SHADER_COMPUTE];
+ assert(cs);
+
+ memset(prog_data, 0, sizeof(*prog_data));
+
+ /* Allocate the references to the uniforms that will end up in the
+ * prog_data associated with the compiled program, and which will be freed
+ * by the state cache.
+ */
+ int param_count = cs->num_uniform_components;
+
+ /* The backend also sometimes adds params for texture size. */
+ param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
+ prog_data->base.param =
+ rzalloc_array(NULL, const gl_constant_value *, param_count);
+ prog_data->base.pull_param =
+ rzalloc_array(NULL, const gl_constant_value *, param_count);
+ prog_data->base.nr_params = param_count;
+
+ program = brw_cs_emit(brw, mem_ctx, key, prog_data,
+ &cp->program, prog, &program_size);
+ if (program == NULL) {
+ ralloc_free(mem_ctx);
+ return false;
+ }
+
+ if (unlikely(INTEL_DEBUG & DEBUG_CS))
+ fprintf(stderr, "\n");
+
+ struct anv_state cs_state = anv_state_stream_alloc(&pipeline->program_stream,
+ program_size, 64);
+ memcpy(cs_state.map, program, program_size);
+
+ pipeline->cs_simd = cs_state.offset;
+
+ ralloc_free(mem_ctx);
+
+ return true;
+}
+
+static void
+brw_cs_populate_key(struct brw_context *brw,
+ struct brw_compute_program *bcp, struct brw_cs_prog_key *key)
+{
+ memset(key, 0, sizeof(*key));
+
+ /* The unique compute program ID */
+ key->program_string_id = bcp->id;
+}
+
static void
fail_on_compile_error(int status, const char *msg)
{
compiler->brw->is_baytrail = devinfo->is_baytrail;
compiler->brw->is_haswell = devinfo->is_haswell;
compiler->brw->is_cherryview = devinfo->is_cherryview;
+
+ /* We need this at least for CS, which will check brw->max_cs_threads
+ * against the work group size.
+ */
+ compiler->brw->max_vs_threads = devinfo->max_vs_threads;
+ compiler->brw->max_hs_threads = devinfo->max_hs_threads;
+ compiler->brw->max_ds_threads = devinfo->max_ds_threads;
+ compiler->brw->max_gs_threads = devinfo->max_gs_threads;
+ compiler->brw->max_wm_threads = devinfo->max_wm_threads;
+ compiler->brw->max_cs_threads = devinfo->max_cs_threads;
+ compiler->brw->urb.size = devinfo->urb.size;
+ compiler->brw->urb.min_vs_entries = devinfo->urb.min_vs_entries;
+ compiler->brw->urb.max_vs_entries = devinfo->urb.max_vs_entries;
+ compiler->brw->urb.max_hs_entries = devinfo->urb.max_hs_entries;
+ compiler->brw->urb.max_ds_entries = devinfo->urb.max_ds_entries;
+ compiler->brw->urb.max_gs_entries = devinfo->urb.max_gs_entries;
+
compiler->brw->intelScreen = compiler->screen;
compiler->screen->devinfo = &device->info;
pipeline->active_stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
}
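+ /* Compile the compute stage, if present, and hook up its prog_data and
+ * active stage bit, mirroring the graphics stages above.
+ */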
+ if (pipeline->shaders[VK_SHADER_STAGE_COMPUTE]) {
+ struct brw_cs_prog_key cs_key;
+ struct gl_compute_program *cp = (struct gl_compute_program *)
+ program->_LinkedShaders[MESA_SHADER_COMPUTE]->Program;
+ struct brw_compute_program *bcp = brw_compute_program(cp);
+
+ brw_cs_populate_key(brw, bcp, &cs_key);
+
+ success = brw_codegen_cs_prog(brw, program, bcp, &cs_key, pipeline);
+ fail_if(!success, "brw_codegen_cs_prog failed\n");
+ pipeline->prog_data[VK_SHADER_STAGE_COMPUTE] = &pipeline->cs_prog_data.base;
+ pipeline->active_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
brw->ctx.Driver.DeleteShaderProgram(&brw->ctx, program);
gen7_compute_urb_partition(pipeline);
struct anv_cmd_buffer *cmd_buffer = (struct anv_cmd_buffer *) cmdBuffer;
struct anv_pipeline *pipeline = (struct anv_pipeline *) _pipeline;
- cmd_buffer->pipeline = pipeline;
- cmd_buffer->vb_dirty |= pipeline->vb_used;
- cmd_buffer->dirty |= ANV_CMD_BUFFER_PIPELINE_DIRTY;
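+ /* Graphics and compute pipelines are tracked separately on the command
+ * buffer, each with its own dirty flags, so binding one does not disturb
+ * the other.
+ */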
+ switch (pipelineBindPoint) {
+ case VK_PIPELINE_BIND_POINT_COMPUTE:
+ cmd_buffer->compute_pipeline = pipeline;
+ cmd_buffer->compute_dirty |= ANV_CMD_BUFFER_PIPELINE_DIRTY;
+ break;
+
+ case VK_PIPELINE_BIND_POINT_GRAPHICS:
+ cmd_buffer->pipeline = pipeline;
+ cmd_buffer->vb_dirty |= pipeline->vb_used;
+ cmd_buffer->dirty |= ANV_CMD_BUFFER_PIPELINE_DIRTY;
+ break;
+
+ default:
+ assert(!"invalid bind point");
+ break;
+ }
}
void anv_CmdBindDynamicStateObject(
static VkResult
cmd_buffer_emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
- unsigned stage)
+ unsigned stage, struct anv_state *bt_state)
{
- struct anv_pipeline_layout *layout = cmd_buffer->pipeline->layout;
+ struct anv_pipeline_layout *layout;
uint32_t color_attachments, bias, size;
- struct anv_state bt_state;
+
+ if (stage == VK_SHADER_STAGE_COMPUTE)
+ layout = cmd_buffer->compute_pipeline->layout;
+ else
+ layout = cmd_buffer->pipeline->layout;
if (stage == VK_SHADER_STAGE_FRAGMENT) {
bias = MAX_RTS;
return VK_SUCCESS;
size = (bias + surface_count) * sizeof(uint32_t);
- bt_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer, size, 32);
- uint32_t *bt_map = bt_state.map;
+ *bt_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer, size, 32);
+ uint32_t *bt_map = bt_state->map;
- if (bt_state.map == NULL)
+ if (bt_state->map == NULL)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
- static const uint32_t binding_table_opcodes[] = {
- [VK_SHADER_STAGE_VERTEX] = 38,
- [VK_SHADER_STAGE_TESS_CONTROL] = 39,
- [VK_SHADER_STAGE_TESS_EVALUATION] = 40,
- [VK_SHADER_STAGE_GEOMETRY] = 41,
- [VK_SHADER_STAGE_FRAGMENT] = 42,
- [VK_SHADER_STAGE_COMPUTE] = 0,
- };
-
- anv_batch_emit(&cmd_buffer->batch,
- GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS,
- ._3DCommandSubOpcode = binding_table_opcodes[stage],
- .PointertoVSBindingTable = bt_state.offset);
-
for (uint32_t ca = 0; ca < color_attachments; ca++) {
const struct anv_surface_view *view =
cmd_buffer->framebuffer->color_attachments[ca];
}
static VkResult
-cmd_buffer_emit_samplers(struct anv_cmd_buffer *cmd_buffer, unsigned stage)
+cmd_buffer_emit_samplers(struct anv_cmd_buffer *cmd_buffer,
+ unsigned stage, struct anv_state *state)
{
- struct anv_pipeline_layout *layout = cmd_buffer->pipeline->layout;
- struct anv_state state;
+ struct anv_pipeline_layout *layout;
+ uint32_t sampler_count;
- if (!layout)
- return VK_SUCCESS;
-
- uint32_t sampler_count = layout->stage[stage].sampler_count;
+ if (stage == VK_SHADER_STAGE_COMPUTE)
+ layout = cmd_buffer->compute_pipeline->layout;
+ else
+ layout = cmd_buffer->pipeline->layout;
+ sampler_count = layout ? layout->stage[stage].sampler_count : 0;
if (sampler_count == 0)
return VK_SUCCESS;
uint32_t size = sampler_count * 16;
- state = anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream, size, 32);
+ *state = anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream, size, 32);
- if (state.map == NULL)
+ if (state->map == NULL)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
- static const uint32_t sampler_state_opcodes[] = {
- [VK_SHADER_STAGE_VERTEX] = 43,
- [VK_SHADER_STAGE_TESS_CONTROL] = 44, /* HS */
- [VK_SHADER_STAGE_TESS_EVALUATION] = 45, /* DS */
- [VK_SHADER_STAGE_GEOMETRY] = 46,
- [VK_SHADER_STAGE_FRAGMENT] = 47,
- [VK_SHADER_STAGE_COMPUTE] = 0,
- };
-
- anv_batch_emit(&cmd_buffer->batch,
- GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS,
- ._3DCommandSubOpcode = sampler_state_opcodes[stage],
- .PointertoVSSamplerState = state.offset);
-
for (uint32_t set = 0; set < layout->num_sets; set++) {
struct anv_descriptor_set_binding *d = &cmd_buffer->descriptors[set];
struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
if (!sampler)
continue;
- memcpy(state.map + (start + b) * 16,
+ memcpy(state->map + (start + b) * 16,
sampler->state, sizeof(sampler->state));
}
}
return VK_SUCCESS;
}
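+/* Emit samplers and the binding table for a single stage and point the
+ * stage's 3DSTATE_SAMPLER_STATE_POINTERS and 3DSTATE_BINDING_TABLE_POINTERS
+ * commands at them. Compute routes these offsets through an interface
+ * descriptor instead (see flush_compute_descriptor_set below), which is why
+ * its opcode entries are zero.
+ */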
+static VkResult
+flush_descriptor_set(struct anv_cmd_buffer *cmd_buffer, uint32_t stage)
+{
+ struct anv_state surfaces = { 0, }, samplers = { 0, };
+ VkResult result;
+
+ result = cmd_buffer_emit_samplers(cmd_buffer, stage, &samplers);
+ if (result != VK_SUCCESS)
+ return result;
+ result = cmd_buffer_emit_binding_table(cmd_buffer, stage, &surfaces);
+ if (result != VK_SUCCESS)
+ return result;
+
+ static const uint32_t sampler_state_opcodes[] = {
+ [VK_SHADER_STAGE_VERTEX] = 43,
+ [VK_SHADER_STAGE_TESS_CONTROL] = 44, /* HS */
+ [VK_SHADER_STAGE_TESS_EVALUATION] = 45, /* DS */
+ [VK_SHADER_STAGE_GEOMETRY] = 46,
+ [VK_SHADER_STAGE_FRAGMENT] = 47,
+ [VK_SHADER_STAGE_COMPUTE] = 0,
+ };
+
+ static const uint32_t binding_table_opcodes[] = {
+ [VK_SHADER_STAGE_VERTEX] = 38,
+ [VK_SHADER_STAGE_TESS_CONTROL] = 39,
+ [VK_SHADER_STAGE_TESS_EVALUATION] = 40,
+ [VK_SHADER_STAGE_GEOMETRY] = 41,
+ [VK_SHADER_STAGE_FRAGMENT] = 42,
+ [VK_SHADER_STAGE_COMPUTE] = 0,
+ };
+
+ if (samplers.alloc_size > 0) {
+ anv_batch_emit(&cmd_buffer->batch,
+ GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS,
+ ._3DCommandSubOpcode = sampler_state_opcodes[stage],
+ .PointertoVSSamplerState = samplers.offset);
+ }
+
+ if (surfaces.alloc_size > 0) {
+ anv_batch_emit(&cmd_buffer->batch,
+ GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS,
+ ._3DCommandSubOpcode = binding_table_opcodes[stage],
+ .PointertoVSBindingTable = surfaces.offset);
+ }
+
+ return VK_SUCCESS;
+}
+
static void
flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer)
{
VkResult result;
for_each_bit(s, dirty) {
- result = cmd_buffer_emit_binding_table(cmd_buffer, s);
- if (result != VK_SUCCESS)
- break;
-
- result = cmd_buffer_emit_samplers(cmd_buffer, s);
+ result = flush_descriptor_set(cmd_buffer, s);
if (result != VK_SUCCESS)
break;
}
/* Re-emit all active binding tables */
for_each_bit(s, cmd_buffer->pipeline->active_stages) {
- result = cmd_buffer_emit_binding_table(cmd_buffer, s);
- result = cmd_buffer_emit_samplers(cmd_buffer, s);
- }
+ result = flush_descriptor_set(cmd_buffer, s);
- /* It had better succeed this time */
- assert(result == VK_SUCCESS);
+ /* It had better succeed this time */
+ assert(result == VK_SUCCESS);
+ }
}
cmd_buffer->descriptors_dirty &= ~cmd_buffer->pipeline->active_stages;
return state;
}
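+/* Compute has no per-stage binding table or sampler pointer commands;
+ * instead the surface and sampler state offsets are packed into an
+ * INTERFACE_DESCRIPTOR_DATA structure that is loaded with
+ * MEDIA_INTERFACE_DESCRIPTOR_LOAD.
+ */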
+static VkResult
+flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_pipeline *pipeline = cmd_buffer->compute_pipeline;
+ struct anv_state surfaces = { 0, }, samplers = { 0, };
+ VkResult result;
+
+ result = cmd_buffer_emit_samplers(cmd_buffer,
+ VK_SHADER_STAGE_COMPUTE, &samplers);
+ if (result != VK_SUCCESS)
+ return result;
+ result = cmd_buffer_emit_binding_table(cmd_buffer,
+ VK_SHADER_STAGE_COMPUTE, &surfaces);
+ if (result != VK_SUCCESS)
+ return result;
+
+ struct GEN8_INTERFACE_DESCRIPTOR_DATA desc = {
+ .KernelStartPointer = pipeline->cs_simd,
+ .KernelStartPointerHigh = 0,
+ .BindingTablePointer = surfaces.offset,
+ .BindingTableEntryCount = 0,
+ .SamplerStatePointer = samplers.offset,
+ .SamplerCount = 0,
+ .NumberofThreadsinGPGPUThreadGroup = 0 /* FIXME: Really? */
+ };
+
+ uint32_t size = GEN8_INTERFACE_DESCRIPTOR_DATA_length * sizeof(uint32_t);
+ struct anv_state state =
+ anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
+
+ GEN8_INTERFACE_DESCRIPTOR_DATA_pack(NULL, state.map, &desc);
+
+ anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD,
+ .InterfaceDescriptorTotalLength = size,
+ .InterfaceDescriptorDataStartAddress = state.offset);
+
+ return VK_SUCCESS;
+}
+
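+/* Compute analog of anv_cmd_buffer_flush_state: select the GPGPU pipeline,
+ * re-emit the pipeline batch when it is dirty, and flush the compute
+ * descriptor set before a dispatch.
+ */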
+static void
+anv_cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_pipeline *pipeline = cmd_buffer->compute_pipeline;
+ VkResult result;
+
+ assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
+
+ if (cmd_buffer->current_pipeline != GPGPU) {
+ anv_batch_emit(&cmd_buffer->batch, GEN8_PIPELINE_SELECT,
+ .PipelineSelection = GPGPU);
+ cmd_buffer->current_pipeline = GPGPU;
+ }
+
+ if (cmd_buffer->compute_dirty & ANV_CMD_BUFFER_PIPELINE_DIRTY)
+ anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
+
+ if ((cmd_buffer->descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
+ (cmd_buffer->compute_dirty & ANV_CMD_BUFFER_PIPELINE_DIRTY)) {
+ result = flush_compute_descriptor_set(cmd_buffer);
+ if (result != VK_SUCCESS) {
+ result = anv_cmd_buffer_new_surface_state_bo(cmd_buffer);
+ assert(result == VK_SUCCESS);
+ result = flush_compute_descriptor_set(cmd_buffer);
+ assert(result == VK_SUCCESS);
+ }
+ cmd_buffer->descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
+ cmd_buffer->compute_dirty = 0;
+}
+
static void
anv_cmd_buffer_flush_state(struct anv_cmd_buffer *cmd_buffer)
{
uint32_t y,
uint32_t z)
{
- stub();
+ struct anv_cmd_buffer *cmd_buffer = (struct anv_cmd_buffer *) cmdBuffer;
+ uint32_t size = SIMD8; /* FIXME */
+ uint32_t right_mask = 0; /* FIXME */
+ uint32_t thread_width_max = 0; /* FIXME */
+
+ anv_cmd_buffer_flush_compute_state(cmd_buffer);
+
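+ /* GPGPU_WALKER dispatches an x * y * z grid of thread groups; the SIMD
+ * size, execution masks and thread counts are still placeholders (see the
+ * FIXMEs above) and need to be derived from the compiled shader.
+ */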
+ anv_batch_emit(&cmd_buffer->batch, GEN8_GPGPU_WALKER,
+
+ .InterfaceDescriptorOffset = 0,
+ .IndirectDataLength = 0,
+ .IndirectDataStartAddress = 0,
+
+ .SIMDSize = size,
+
+ .ThreadDepthCounterMaximum = 0,
+ .ThreadHeightCounterMaximum = 0,
+ .ThreadWidthCounterMaximum = thread_width_max,
+
+ .ThreadGroupIDStartingX = 0,
+ .ThreadGroupIDXDimension = x,
+ .ThreadGroupIDStartingY = 0,
+ .ThreadGroupIDYDimension = y,
+ .ThreadGroupIDStartingResumeZ = 0,
+ .ThreadGroupIDZDimension = z,
+ .RightExecutionMask = right_mask,
+ .BottomExecutionMask = 0xffffffff);
+
+ anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_STATE_FLUSH);
}
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
void anv_CmdDispatchIndirect(
VkCmdBuffer cmdBuffer,
- VkBuffer buffer,
+ VkBuffer _buffer,
VkDeviceSize offset)
{
- stub();
+ struct anv_cmd_buffer *cmd_buffer = (struct anv_cmd_buffer *) cmdBuffer;
+ struct anv_buffer *buffer = (struct anv_buffer *) _buffer;
+ struct anv_bo *bo = buffer->bo;
+ uint32_t bo_offset = buffer->offset + offset;
+
+ anv_cmd_buffer_flush_compute_state(cmd_buffer);
+
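+ /* Load the X/Y/Z group counts from the indirect buffer into the
+ * GPGPU_DISPATCHDIM registers; GPGPU_WALKER reads them because
+ * IndirectParameterEnable is set below.
+ */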
+ anv_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
+ anv_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
+ anv_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
+
+ uint32_t size = SIMD8; /* FIXME */
+ uint32_t right_mask = 0; /* FIXME */
+ uint32_t thread_width_max = 0; /* FIXME */
+
+ /* FIXME: We can't compute thread_width_max for an indirect dispatch;
+ * it looks like it depends on DIMX.
+ */
+
+ anv_batch_emit(&cmd_buffer->batch, GEN8_GPGPU_WALKER,
+ .IndirectParameterEnable = true,
+ .InterfaceDescriptorOffset = 0,
+ .IndirectDataLength = 0,
+ .IndirectDataStartAddress = 0,
+
+ .SIMDSize = size,
+
+ .ThreadDepthCounterMaximum = 0,
+ .ThreadHeightCounterMaximum = 0,
+ .ThreadWidthCounterMaximum = thread_width_max,
+
+ .RightExecutionMask = right_mask,
+ .BottomExecutionMask = 0xffffffff);
+
+ anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_STATE_FLUSH);
}
void anv_CmdSetEvent(