i965/vs: Unify URB entry size/read length calculations between backends.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "main/compiler.h"
34 #include "brw_context.h"
35 #include "brw_vs.h"
36 #include "brw_util.h"
37 #include "brw_state.h"
38 #include "program/prog_print.h"
39 #include "program/prog_parameter.h"
40 #include "brw_nir.h"
41
42 #include "util/ralloc.h"
43
44 bool
45 brw_codegen_vs_prog(struct brw_context *brw,
46 struct gl_shader_program *prog,
47 struct brw_vertex_program *vp,
48 struct brw_vs_prog_key *key)
49 {
50 GLuint program_size;
51 const GLuint *program;
52 struct brw_vs_prog_data prog_data;
53 struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base;
54 void *mem_ctx;
55 int i;
56 struct brw_shader *vs = NULL;
57 bool start_busy = false;
58 double start_time = 0;
59
60 if (!vp->program.Base.nir) {
61 /* Normally we generate NIR in LinkShader() or
62 * ProgramStringNotify(), but Mesa's fixed-function vertex program
63 * handling doesn't notify the driver at all. Just do it here, at
64 * the last minute, even though it's lame.
65 */
66 assert(vp->program.Base.Id == 0 && prog == NULL);
67 vp->program.Base.nir =
68 brw_create_nir(brw, NULL, &vp->program.Base, MESA_SHADER_VERTEX,
69 brw->intelScreen->compiler->scalar_vs);
70 }
71
72 if (prog)
73 vs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
74
75 memset(&prog_data, 0, sizeof(prog_data));
76
77 /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
78 if (!prog)
79 stage_prog_data->use_alt_mode = true;
80
81 mem_ctx = ralloc_context(NULL);
82
83 brw_assign_common_binding_table_offsets(MESA_SHADER_VERTEX,
84 brw->intelScreen->devinfo,
85 prog, &vp->program.Base,
86 &prog_data.base.base, 0);
87
88 /* Allocate the references to the uniforms that will end up in the
89 * prog_data associated with the compiled program, and which will be freed
90 * by the state cache.
91 */
92 int param_count = vp->program.Base.nir->num_uniforms;
93 if (!brw->intelScreen->compiler->scalar_vs)
94 param_count *= 4;
95
96 if (vs)
97 prog_data.base.base.nr_image_params = vs->base.NumImages;
98
99 /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
100 * planes as uniforms.
101 */
102 param_count += key->nr_userclip_plane_consts * 4;
103
104 stage_prog_data->param =
105 rzalloc_array(NULL, const gl_constant_value *, param_count);
106 stage_prog_data->pull_param =
107 rzalloc_array(NULL, const gl_constant_value *, param_count);
108 stage_prog_data->image_param =
109 rzalloc_array(NULL, struct brw_image_param,
110 stage_prog_data->nr_image_params);
111 stage_prog_data->nr_params = param_count;
112
113 if (prog) {
114 brw_nir_setup_glsl_uniforms(vp->program.Base.nir, prog, &vp->program.Base,
115 &prog_data.base.base,
116 brw->intelScreen->compiler->scalar_vs);
117 } else {
118 brw_nir_setup_arb_uniforms(vp->program.Base.nir, &vp->program.Base,
119 &prog_data.base.base);
120 }
121
122 GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
123 prog_data.inputs_read = vp->program.Base.InputsRead;
124
125 if (key->copy_edgeflag) {
126 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
127 prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
128 }
129
130 if (brw->gen < 6) {
131 /* Put dummy slots into the VUE for the SF to put the replaced
132 * point sprite coords in. We shouldn't need these dummy slots,
133 * which take up precious URB space, but it would mean that the SF
134 * doesn't get nice aligned pairs of input coords into output
135 * coords, which would be a pain to handle.
136 */
137 for (i = 0; i < 8; i++) {
138 if (key->point_coord_replace & (1 << i))
139 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
140 }
141
142 /* if back colors are written, allocate slots for front colors too */
143 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
144 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
145 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
146 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
147 }
148
149 /* In order for legacy clipping to work, we need to populate the clip
150 * distance varying slots whenever clipping is enabled, even if the vertex
151 * shader doesn't write to gl_ClipDistance.
152 */
153 if (key->nr_userclip_plane_consts > 0) {
154 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
155 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
156 }
157
158 brw_compute_vue_map(brw->intelScreen->devinfo,
159 &prog_data.base.vue_map, outputs_written,
160 prog ? prog->SeparateShader : false);
161
162 unsigned nr_attributes = _mesa_bitcount_64(prog_data.inputs_read);
163
164 /* gl_VertexID and gl_InstanceID are system values, but arrive via an
165 * incoming vertex attribute. So, add an extra slot.
166 */
167 if (vp->program.Base.SystemValuesRead &
168 (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
169 BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
170 nr_attributes++;
171 }
172
173 /* The BSpec says we always have to read at least one thing from the VF,
174 * and it appears that the hardware wedges otherwise.
175 */
176 if (nr_attributes == 0 && !brw->intelScreen->compiler->scalar_vs)
177 nr_attributes = 1;
178
179 prog_data.nr_attributes = nr_attributes;
180 prog_data.base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
181
182 /* Since vertex shaders reuse the same VUE entry for inputs and outputs
183 * (overwriting the original contents), we need to make sure the size is
184 * the larger of the two.
185 */
186 const unsigned vue_entries =
187 MAX2(nr_attributes, prog_data.base.vue_map.num_slots);
188
189 if (brw->gen == 6)
190 prog_data.base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
191 else
192 prog_data.base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
193
194 if (0) {
195 _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
196 true);
197 }
198
199 if (unlikely(brw->perf_debug)) {
200 start_busy = (brw->batch.last_bo &&
201 drm_intel_bo_busy(brw->batch.last_bo));
202 start_time = get_time();
203 }
204
205 if (unlikely(INTEL_DEBUG & DEBUG_VS))
206 brw_dump_ir("vertex", prog, &vs->base, &vp->program.Base);
207
208 int st_index = -1;
209 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
210 st_index = brw_get_shader_time_index(brw, prog, &vp->program.Base, ST_VS);
211
212 /* Emit GEN4 code.
213 */
214 program = brw_vs_emit(brw, mem_ctx, key, &prog_data,
215 &vp->program, prog, st_index, &program_size);
216 if (program == NULL) {
217 ralloc_free(mem_ctx);
218 return false;
219 }
220
221 if (unlikely(brw->perf_debug) && vs) {
222 if (vs->compiled_once) {
223 brw_vs_debug_recompile(brw, prog, key);
224 }
225 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
226 perf_debug("VS compile took %.03f ms and stalled the GPU\n",
227 (get_time() - start_time) * 1000);
228 }
229 vs->compiled_once = true;
230 }
231
232 /* Scratch space is used for register spilling */
233 if (prog_data.base.base.total_scratch) {
234 brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo,
235 prog_data.base.base.total_scratch *
236 brw->max_vs_threads);
237 }
238
239 brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG,
240 key, sizeof(struct brw_vs_prog_key),
241 program, program_size,
242 &prog_data, sizeof(prog_data),
243 &brw->vs.base.prog_offset, &brw->vs.prog_data);
244 ralloc_free(mem_ctx);
245
246 return true;
247 }
248
/**
 * Compare one integer field of an old and a new program key.
 *
 * When the two values differ, a perf_debug() line naming the field and
 * showing the old and new value is emitted.
 *
 * \param brw   driver context (used by the perf_debug() macro).
 * \param name  human-readable name of the key field.
 * \param a     value in the previously compiled key.
 * \param b     value in the key being compiled now.
 *
 * \return true if the values differ, false if they are equal.
 */
static bool
key_debug(struct brw_context *brw, const char *name, int a, int b)
{
   if (a == b) {
      return false;
   }

   perf_debug(" %s %d->%d\n", name, a, b);
   return true;
}
258
259 void
260 brw_vs_debug_recompile(struct brw_context *brw,
261 struct gl_shader_program *prog,
262 const struct brw_vs_prog_key *key)
263 {
264 struct brw_cache_item *c = NULL;
265 const struct brw_vs_prog_key *old_key = NULL;
266 bool found = false;
267
268 perf_debug("Recompiling vertex shader for program %d\n", prog->Name);
269
270 for (unsigned int i = 0; i < brw->cache.size; i++) {
271 for (c = brw->cache.items[i]; c; c = c->next) {
272 if (c->cache_id == BRW_CACHE_VS_PROG) {
273 old_key = c->key;
274
275 if (old_key->program_string_id == key->program_string_id)
276 break;
277 }
278 }
279 if (c)
280 break;
281 }
282
283 if (!c) {
284 perf_debug(" Didn't find previous compile in the shader cache for "
285 "debug\n");
286 return;
287 }
288
289 for (unsigned int i = 0; i < VERT_ATTRIB_MAX; i++) {
290 found |= key_debug(brw, "Vertex attrib w/a flags",
291 old_key->gl_attrib_wa_flags[i],
292 key->gl_attrib_wa_flags[i]);
293 }
294
295 found |= key_debug(brw, "legacy user clipping",
296 old_key->nr_userclip_plane_consts,
297 key->nr_userclip_plane_consts);
298
299 found |= key_debug(brw, "copy edgeflag",
300 old_key->copy_edgeflag, key->copy_edgeflag);
301 found |= key_debug(brw, "PointCoord replace",
302 old_key->point_coord_replace, key->point_coord_replace);
303 found |= key_debug(brw, "vertex color clamping",
304 old_key->clamp_vertex_color, key->clamp_vertex_color);
305
306 found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
307
308 if (!found) {
309 perf_debug(" Something else\n");
310 }
311 }
312
313 static bool
314 brw_vs_state_dirty(struct brw_context *brw)
315 {
316 return brw_state_dirty(brw,
317 _NEW_BUFFERS |
318 _NEW_LIGHT |
319 _NEW_POINT |
320 _NEW_POLYGON |
321 _NEW_TEXTURE |
322 _NEW_TRANSFORM,
323 BRW_NEW_VERTEX_PROGRAM |
324 BRW_NEW_VS_ATTRIB_WORKAROUNDS);
325 }
326
327 static void
328 brw_vs_populate_key(struct brw_context *brw,
329 struct brw_vs_prog_key *key)
330 {
331 struct gl_context *ctx = &brw->ctx;
332 /* BRW_NEW_VERTEX_PROGRAM */
333 struct brw_vertex_program *vp =
334 (struct brw_vertex_program *)brw->vertex_program;
335 struct gl_program *prog = (struct gl_program *) brw->vertex_program;
336 int i;
337
338 memset(key, 0, sizeof(*key));
339
340 /* Just upload the program verbatim for now. Always send it all
341 * the inputs it asks for, whether they are varying or not.
342 */
343 key->program_string_id = vp->id;
344
345 if (ctx->Transform.ClipPlanesEnabled != 0 &&
346 ctx->API == API_OPENGL_COMPAT &&
347 !vp->program.Base.UsesClipDistanceOut) {
348 key->nr_userclip_plane_consts =
349 _mesa_logbase2(ctx->Transform.ClipPlanesEnabled) + 1;
350 }
351
352 /* _NEW_POLYGON */
353 if (brw->gen < 6) {
354 key->copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
355 ctx->Polygon.BackMode != GL_FILL);
356 }
357
358 if (prog->OutputsWritten & (VARYING_BIT_COL0 | VARYING_BIT_COL1 |
359 VARYING_BIT_BFC0 | VARYING_BIT_BFC1)) {
360 /* _NEW_LIGHT | _NEW_BUFFERS */
361 key->clamp_vertex_color = ctx->Light._ClampVertexColor;
362 }
363
364 /* _NEW_POINT */
365 if (brw->gen < 6 && ctx->Point.PointSprite) {
366 for (i = 0; i < 8; i++) {
367 if (ctx->Point.CoordReplace[i])
368 key->point_coord_replace |= (1 << i);
369 }
370 }
371
372 /* _NEW_TEXTURE */
373 brw_populate_sampler_prog_key_data(ctx, prog, brw->vs.base.sampler_count,
374 &key->tex);
375
376 /* BRW_NEW_VS_ATTRIB_WORKAROUNDS */
377 memcpy(key->gl_attrib_wa_flags, brw->vb.attrib_wa_flags,
378 sizeof(brw->vb.attrib_wa_flags));
379 }
380
381 void
382 brw_upload_vs_prog(struct brw_context *brw)
383 {
384 struct gl_context *ctx = &brw->ctx;
385 struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
386 struct brw_vs_prog_key key;
387 /* BRW_NEW_VERTEX_PROGRAM */
388 struct brw_vertex_program *vp =
389 (struct brw_vertex_program *)brw->vertex_program;
390
391 if (!brw_vs_state_dirty(brw))
392 return;
393
394 brw_vs_populate_key(brw, &key);
395
396 if (!brw_search_cache(&brw->cache, BRW_CACHE_VS_PROG,
397 &key, sizeof(key),
398 &brw->vs.base.prog_offset, &brw->vs.prog_data)) {
399 bool success = brw_codegen_vs_prog(brw, current[MESA_SHADER_VERTEX],
400 vp, &key);
401 (void) success;
402 assert(success);
403 }
404 brw->vs.base.prog_data = &brw->vs.prog_data->base.base;
405 }
406
407 bool
408 brw_vs_precompile(struct gl_context *ctx,
409 struct gl_shader_program *shader_prog,
410 struct gl_program *prog)
411 {
412 struct brw_context *brw = brw_context(ctx);
413 struct brw_vs_prog_key key;
414 uint32_t old_prog_offset = brw->vs.base.prog_offset;
415 struct brw_vs_prog_data *old_prog_data = brw->vs.prog_data;
416 bool success;
417
418 struct gl_vertex_program *vp = (struct gl_vertex_program *) prog;
419 struct brw_vertex_program *bvp = brw_vertex_program(vp);
420
421 memset(&key, 0, sizeof(key));
422
423 brw_setup_tex_for_precompile(brw, &key.tex, prog);
424 key.program_string_id = bvp->id;
425 key.clamp_vertex_color =
426 (prog->OutputsWritten & (VARYING_BIT_COL0 | VARYING_BIT_COL1 |
427 VARYING_BIT_BFC0 | VARYING_BIT_BFC1));
428
429 success = brw_codegen_vs_prog(brw, shader_prog, bvp, &key);
430
431 brw->vs.base.prog_offset = old_prog_offset;
432 brw->vs.prog_data = old_prog_data;
433
434 return success;
435 }