i965/cs: Setup surface binding for gl_NumWorkGroups
[mesa.git] / src / mesa / drivers / dri / i965 / brw_compute.c
/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <sys/errno.h>

#include "main/condrender.h"
#include "main/glheader.h"
#include "main/mtypes.h"
#include "main/state.h"
#include "brw_context.h"
#include "brw_draw.h"
#include "brw_state.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "brw_defines.h"


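/**
 * Emit a GPGPU_WALKER command to dispatch the current compute shader.
 *
 * For glDispatchComputeIndirect(), the X/Y/Z work group counts are first
 * loaded from the indirect buffer object into the GPGPU_DISPATCHDIM*
 * registers, and the walker is emitted with the indirect parameter bit
 * set so the hardware reads the counts from those registers.
 */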
static void
brw_emit_gpgpu_walker(struct brw_context *brw)
{
   const struct brw_cs_prog_data *prog_data = brw->cs.prog_data;

   const GLuint *num_groups = brw->compute.num_work_groups;
   uint32_t indirect_flag;

   if (brw->compute.num_work_groups_bo == NULL) {
      indirect_flag = 0;
   } else {
      GLintptr indirect_offset = brw->compute.num_work_groups_offset;
      drm_intel_bo *bo = brw->compute.num_work_groups_bo;

      indirect_flag = GEN7_GPGPU_INDIRECT_PARAMETER_ENABLE;

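      /* The group counts live as three consecutive GLuints in the indirect
       * buffer; load each into its dispatch dimension register.
       */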
      brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            indirect_offset + 0);
      brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            indirect_offset + 4);
      brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            indirect_offset + 8);
   }

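   /* Each hardware thread executes simd_size invocations, so a full thread
    * group takes ceil(group_size / simd_size) threads.
    */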
   const unsigned simd_size = prog_data->simd_size;
   unsigned group_size = prog_data->local_size[0] *
      prog_data->local_size[1] * prog_data->local_size[2];
   unsigned thread_width_max =
      (group_size + simd_size - 1) / simd_size;

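   /* If group_size is not a multiple of simd_size, the last thread of each
    * thread group covers only a partial set of invocations; the right
    * execution mask disables the excess channels of that thread.
    */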
   uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
   const unsigned right_non_aligned = group_size & (simd_size - 1);
   if (right_non_aligned != 0)
      right_mask >>= (simd_size - right_non_aligned);

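   /* GPGPU_WALKER is 11 dwords on Gen7 and 15 on Gen8+, which adds the
    * indirect data fields and an MBZ dword after each of the starting X/Y
    * fields.
    */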
   uint32_t dwords = brw->gen < 8 ? 11 : 15;
   BEGIN_BATCH(dwords);
   OUT_BATCH(GPGPU_WALKER << 16 | (dwords - 2) | indirect_flag);
   OUT_BATCH(0);
   if (brw->gen >= 8) {
      OUT_BATCH(0);             /* Indirect Data Length */
      OUT_BATCH(0);             /* Indirect Data Start Address */
   }
   assert(thread_width_max <= brw->max_cs_threads);
   OUT_BATCH(SET_FIELD(simd_size / 16, GPGPU_WALKER_SIMD_SIZE) |
             SET_FIELD(thread_width_max - 1, GPGPU_WALKER_THREAD_WIDTH_MAX));
   OUT_BATCH(0);                /* Thread Group ID Starting X */
   if (brw->gen >= 8)
      OUT_BATCH(0);             /* MBZ */
   OUT_BATCH(num_groups[0]);    /* Thread Group ID X Dimension */
   OUT_BATCH(0);                /* Thread Group ID Starting Y */
   if (brw->gen >= 8)
      OUT_BATCH(0);             /* MBZ */
   OUT_BATCH(num_groups[1]);    /* Thread Group ID Y Dimension */
   OUT_BATCH(0);                /* Thread Group ID Starting/Resume Z */
   OUT_BATCH(num_groups[2]);    /* Thread Group ID Z Dimension */
   OUT_BATCH(right_mask);       /* Right Execution Mask */
   OUT_BATCH(0xffffffff);       /* Bottom Execution Mask */
   ADVANCE_BATCH();

   BEGIN_BATCH(2);
   OUT_BATCH(MEDIA_STATE_FLUSH << 16 | (2 - 2));
   OUT_BATCH(0);
   ADVANCE_BATCH();
}


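/**
 * Shared path for direct and indirect dispatch: validate GL state, reserve
 * batch space, upload compute state, and emit the walker. If the resulting
 * batch would not fit in the aperture, roll back to the saved batch state,
 * flush, and retry once in a fresh batch.
 */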
static void
brw_dispatch_compute_common(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   int estimated_buffer_space_needed;
   bool fail_next = false;

   if (!_mesa_check_conditional_render(ctx))
      return;

   if (ctx->NewState)
      _mesa_update_state(ctx);

   brw_validate_textures(brw);

   const int sampler_state_size = 16; /* 16 bytes */
   estimated_buffer_space_needed = 512; /* batchbuffer commands */
   estimated_buffer_space_needed += (BRW_MAX_TEX_UNIT *
                                     (sampler_state_size +
                                      sizeof(struct gen5_sampler_default_color)));
   estimated_buffer_space_needed += 1024; /* push constants */
   estimated_buffer_space_needed += 512; /* misc. pad */

   /* Flush the batch if it's approaching full, so that we don't wrap while
    * we've got validated state that needs to be in the same batch as the
    * GPGPU_WALKER command.
    */
   intel_batchbuffer_require_space(brw, estimated_buffer_space_needed,
                                   RENDER_RING);
   intel_batchbuffer_save_state(brw);

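   /* Emit with batch wrapping disabled so the validated state and the
    * walker cannot be split across batches.
    */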
 retry:
   brw->no_batch_wrap = true;
   brw_upload_compute_state(brw);

   brw_emit_gpgpu_walker(brw);

   brw->no_batch_wrap = false;

   if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
      if (!fail_next) {
         intel_batchbuffer_reset_to_saved(brw);
         intel_batchbuffer_flush(brw);
         fail_next = true;
         goto retry;
      } else {
         if (intel_batchbuffer_flush(brw) == -ENOSPC) {
            static bool warned = false;

            if (!warned) {
               fprintf(stderr, "i965: Single compute shader dispatch "
                       "exceeded available aperture space\n");
               warned = true;
            }
         }
      }
   }

   /* Now that we know we haven't run out of aperture space, we can safely
    * reset the dirty bits.
    */
   brw_compute_state_finished(brw);

   if (brw->always_flush_batch)
      intel_batchbuffer_flush(brw);

   brw_state_cache_check_size(brw);

   /* Note: since compute shaders can't write to framebuffers, there's no need
    * to call brw_postdraw_set_buffers_need_resolve().
    */
}
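/* Direct dispatch: the group counts come straight from the API call, so no
 * indirect buffer is bound.
 */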
static void
brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups)
{
   struct brw_context *brw = brw_context(ctx);

   brw->compute.num_work_groups_bo = NULL;
   brw->compute.num_work_groups = num_groups;
   ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;

   brw_dispatch_compute_common(ctx);
}

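/* Indirect dispatch: resolve the bound GL_DISPATCH_INDIRECT_BUFFER to a
 * drm BO covering the three GLuint counts at 'indirect'. The CPU-visible
 * num_work_groups pointer is set to zeros, since the real counts are only
 * read on the GPU via the registers loaded in brw_emit_gpgpu_walker().
 */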
static void
brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect)
{
   struct brw_context *brw = brw_context(ctx);
   static const GLuint indirect_group_counts[3] = { 0, 0, 0 };
   struct gl_buffer_object *indirect_buffer = ctx->DispatchIndirectBuffer;
   drm_intel_bo *bo =
      intel_bufferobj_buffer(brw,
                             intel_buffer_object(indirect_buffer),
                             indirect, 3 * sizeof(GLuint));

   brw->compute.num_work_groups_bo = bo;
   brw->compute.num_work_groups_offset = indirect;
   brw->compute.num_work_groups = indirect_group_counts;
   ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;

   brw_dispatch_compute_common(ctx);
}

void
brw_init_compute_functions(struct dd_function_table *functions)
{
   functions->DispatchCompute = brw_dispatch_compute;
   functions->DispatchComputeIndirect = brw_dispatch_compute_indirect;
}