intel/compiler: Do image load/store lowering to NIR
[mesa.git] / src / mesa / drivers / dri / i965 / brw_program.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32 #include <pthread.h>
33 #include "main/imports.h"
34 #include "main/glspirv.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "program/prog_to_nir.h"
38 #include "program/program.h"
39 #include "program/programopt.h"
40 #include "tnl/tnl.h"
41 #include "util/ralloc.h"
42 #include "compiler/glsl/ir.h"
43 #include "compiler/glsl/glsl_to_nir.h"
44
45 #include "brw_program.h"
46 #include "brw_context.h"
47 #include "compiler/brw_nir.h"
48 #include "brw_defines.h"
49 #include "intel_batchbuffer.h"
50
51 #include "brw_cs.h"
52 #include "brw_gs.h"
53 #include "brw_vs.h"
54 #include "brw_wm.h"
55
56 static bool
57 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
58 {
59 if (is_scalar) {
60 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
61 type_size_scalar_bytes);
62 return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
63 } else {
64 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
65 type_size_vec4_bytes);
66 return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
67 }
68 }
69
/**
 * Build a NIR shader for \p prog, ready for backend key-specific compilation.
 *
 * The input may come from three front ends: SPIR-V, GLSL IR (both via
 * \p shader_prog), or ARB/fixed-function Mesa IR (when \p shader_prog is
 * NULL).  The result has been through brw_preprocess_nir, image load/store
 * lowering, stage-specific patch-vertices / wpos lowering, and uniform
 * lowering for the chosen (\p is_scalar) backend.
 */
nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(shader_prog, stage, options);
      }
      assert (nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_lower_returns(nir);
      nir_validate_shader(nir);
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      /* ARB programs / fixed function: no gl_shader_program exists. */
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir);

   nir = brw_preprocess_nir(brw->screen->compiler, nir);

   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
       * a uniform if we don't.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         /* The pass reads the transform from state; make sure the state
          * parameter exists in the program's parameter list.
          */
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}
146
147 void
148 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
149 {
150 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
151
152 /* Copy the info we just generated back into the gl_program */
153 const char *prog_name = prog->info.name;
154 const char *prog_label = prog->info.label;
155 prog->info = nir->info;
156 prog->info.name = prog_name;
157 prog->info.label = prog_label;
158 }
159
/* Hand out a fresh, screen-unique program id (atomic so it is safe across
 * contexts sharing the screen).
 */
static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}
165
166 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
167 GLuint id, bool is_arb_asm)
168 {
169 struct brw_context *brw = brw_context(ctx);
170 struct brw_program *prog = rzalloc(NULL, struct brw_program);
171
172 if (prog) {
173 prog->id = get_new_program_id(brw->screen);
174
175 return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
176 }
177
178 return NULL;
179 }
180
/* dd_function_table::DeleteProgram — free a program, poisoning any stale
 * brw->programs[] pointers to it (see the long comment below for why).
 */
static void brwDeleteProgram( struct gl_context *ctx,
                              struct gl_program *prog )
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware! prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program. malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either. If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL. That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program( ctx, prog );
}
216
217
/* dd_function_table::ProgramStringNotify — (re)compile an ARB assembly or
 * fixed-function program after its source has been set.  Builds fresh NIR,
 * regathers shader info, and precompiles for the default key.  Only vertex
 * and fragment targets are expected here (see the default case).
 */
static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   /* Position-invariance is only meaningful for vertex programs. */
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      /* If the currently-bound FP was just replaced in place, the pointer
       * comparison in state upload won't notice — flag it explicitly.
       */
      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      newFP->id = get_new_program_id(brw->screen);

      /* ARB fragment programs always use the scalar backend. */
      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         /* Inject ftransform()-equivalent MVP code before translation. */
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end). Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}
282
/* dd_function_table::MemoryBarrier — translate GL barrier bits into the
 * PIPE_CONTROL flush/invalidate bits needed on Gen7–Gen11 and emit one
 * pipe control.  Data-cache flush + CS stall are always included.
 */
static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   /* The mapping below has only been validated for these generations. */
   assert(devinfo->gen >= 7 && devinfo->gen <= 11);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}
320
321 static void
322 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
323 {
324 struct brw_context *brw = brw_context(ctx);
325 const struct gen_device_info *devinfo = &brw->screen->devinfo;
326
327 if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
328 if (devinfo->gen >= 6) {
329 brw_emit_pipe_control_flush(brw,
330 PIPE_CONTROL_RENDER_TARGET_FLUSH |
331 PIPE_CONTROL_CS_STALL);
332 brw_emit_pipe_control_flush(brw,
333 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
334 } else {
335 brw_emit_pipe_control_flush(brw,
336 PIPE_CONTROL_RENDER_TARGET_FLUSH);
337 }
338 }
339 }
340
341 void
342 brw_get_scratch_bo(struct brw_context *brw,
343 struct brw_bo **scratch_bo, int size)
344 {
345 struct brw_bo *old_bo = *scratch_bo;
346
347 if (old_bo && old_bo->size < size) {
348 brw_bo_unreference(old_bo);
349 old_bo = NULL;
350 }
351
352 if (!old_bo) {
353 *scratch_bo =
354 brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
355 }
356 }
357
/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes times the maximum number of threads the hardware can run for that
 * stage (derived from the device info below).
 *
 * Existing scratch is kept if it is already large enough per-thread; the old
 * BO is released and a new one allocated otherwise.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch(stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       * "Scratch Space per slice is computed based on 4 sub-slices. SW must
       * allocate scratch space enough so that each slice has 4 slices
       * allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well. This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices
       * and we wish to view that there are 4 subslices per slice
       * instead of the actual number of subslices per slice.
       */
      if (devinfo->gen >= 9)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed. The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is. There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit. Even though there are only 10 EUs
          * per subslice, this is stored in 4 bits, so there's an effective
          * maximum value of 16 EUs. Similarly, although there are only 7
          * threads per EU, this is stored in a 3 bit number, giving an
          * effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
          * as if it had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}
451
/* Install the i965 program-related driver hooks into the dd_function_table.
 * Must run after _tnl_ has installed its defaults (asserted below), since we
 * wrap/override ProgramStringNotify.
 */
void brwInitFragProgFuncs( struct dd_function_table *functions )
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}
465
/* Accumulated per-entry INTEL_DEBUG=shader_time counters, mirrored from the
 * shader_time BO by brw_collect_shader_time() (three 64-bit values per entry).
 */
struct shader_times {
   uint64_t time;      /* cycles accumulated */
   uint64_t written;   /* count of time writes that landed */
   uint64_t reset;     /* count of writes lost to resets; used to rescale */
};
471
/* Allocate the shader-time BO and the per-entry bookkeeping arrays
 * (names/ids/types/cumulative), all sized for a fixed maximum of 2048
 * entries.  Each BO entry is 3 slots of BRW_SHADER_TIME_STRIDE bytes
 * (time / written / reset — see brw_collect_shader_time).
 */
void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
                   BRW_MEMZONE_OTHER);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}
488
/* qsort comparator for an array of uint64_t* — orders by the pointed-to
 * values, ascending.  Returns strictly -1/0/1 rather than a difference,
 * since a 64-bit difference would not fit the int result.
 */
static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *lhs = a;
   uint64_t * const *rhs = b;

   return (**lhs > **rhs) - (**lhs < **rhs);
}
503
/* Print one row of the shader-time report to stderr: stage tag, shader name,
 * optional numeric id (0 = suppressed, used for totals/fixed-function),
 * raw cycle count, Gcycles, and percentage of \p total.
 * NOTE(review): callers must guarantee total != 0 (brw_report_shader_time
 * checks this before any rows are printed).
 */
static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, " : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles) %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}
520
/* Dump the accumulated shader-time data to stderr: one line per entry,
 * sorted by descending-to-ascending cycle count via qsort, followed by
 * per-stage totals.  Entry times are rescaled to compensate for counter
 * resets (see the written/reset handling below).
 */
static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   /* scaled[] holds reset-compensated times; sorted[] holds pointers into
    * scaled[] so the sort can be undone back to entry indices for printing.
    */
   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

      /* Scale the recorded time up by the fraction of writes that were
       * lost to resets; guard against dividing by zero writes.
       */
      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type ID cycles spent %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointers times to a time to print. */
      int i = sorted[s] - scaled;

      /* Skip entries that never ran. */
      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}
649
/* Read the GPU-written shader-time counters out of the BO, fold them into
 * brw->shader_time.cumulative[], and zero the BO for the next interval.
 */
static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering. We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      /* Each entry is 3 slots of BRW_SHADER_TIME_STRIDE bytes; the counters
       * are 32-bit values at the start of each slot (hence the /4 indexing
       * on a uint32_t pointer).
       */
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO out to clear it out for our next collection.
    */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}
675
/* Collect the latest shader-time counters and, at most once per second,
 * print the accumulated report.  report_time == 0 means "never reported",
 * forcing an initial report.
 */
void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      /* Timestamp taken after reporting, so printing time doesn't eat
       * into the next interval.
       */
      brw->shader_time.report_time = get_time();
   }
}
687
/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this holds on to references to the underlying programs, which may
 * change their lifetimes compared to normal operation.
 *
 * \param type      which shader stage/dispatch-width bucket this entry is
 * \param is_glsl_sh true for GLSL shaders (reported by label or "glsl"),
 *                  false for ARB programs ("prog"); Id 0 is fixed function.
 * \return the newly-claimed entry index.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   /* brw_init_shader_time sized all arrays to max_entries. */
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      /* Copy the label so it outlives the program; ralloc'd off names. */
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}
718
/* Release the shader-time BO; the ralloc'd bookkeeping arrays are freed
 * with their brw context parent.
 */
void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}
725
/* Free the ralloc'd param/pull_param arrays hanging off a
 * brw_stage_prog_data.  Takes const void* to fit generic free-callback
 * signatures; the cast drops const deliberately.
 */
void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}
734
735 void
736 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
737 {
738 fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
739 stage, prog->Id, stage);
740 _mesa_print_program(prog);
741 }
742
743 void
744 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
745 struct brw_sampler_prog_key_data *tex,
746 struct gl_program *prog)
747 {
748 const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
749 unsigned sampler_count = util_last_bit(prog->SamplersUsed);
750 for (unsigned i = 0; i < sampler_count; i++) {
751 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
752 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
753 tex->swizzles[i] =
754 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
755 } else {
756 /* Color sampler: assume no swizzling. */
757 tex->swizzles[i] = SWIZZLE_XYZW;
758 }
759 }
760 }
761
/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused but also make sure that addition of small offsets to them will
 * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 *
 * Groups are laid out sequentially: textures, UBOs, SSBOs+ABOs, shader-time,
 * gather textures (pre-Gen8 only), images, pull constants, then the extra
 * YUV plane sections.  Returns the first offset past all assigned groups.
 */
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   /* SSBOs and atomic buffers share one contiguous section. */
   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         /* Gen8+ needs no separate gather section; alias the textures. */
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}
842
/* Store \p id into the program_string_id field of whichever per-stage key
 * variant \p key actually is, located via a per-stage offsetof table (the
 * field sits at a different offset in each key struct).
 */
void
brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
                    unsigned id)
{
   static const unsigned stage_offsets[] = {
      offsetof(struct brw_vs_prog_key, program_string_id),
      offsetof(struct brw_tcs_prog_key, program_string_id),
      offsetof(struct brw_tes_prog_key, program_string_id),
      offsetof(struct brw_gs_prog_key, program_string_id),
      offsetof(struct brw_wm_prog_key, program_string_id),
      offsetof(struct brw_cs_prog_key, program_string_id),
   };
   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
   *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
}
858
/* Fill \p prog_key with the default (precompile) key for the program's
 * stage by dispatching to the stage-specific populate helper.  \p sh_prog
 * is only consulted by the tessellation stages.
 */
void
brw_populate_default_key(const struct gen_device_info *devinfo,
                         union brw_any_prog_key *prog_key,
                         struct gl_shader_program *sh_prog,
                         struct gl_program *prog)
{
   switch (prog->info.stage) {
   case MESA_SHADER_VERTEX:
      brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
      break;
   case MESA_SHADER_TESS_CTRL:
      brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
      break;
   case MESA_SHADER_TESS_EVAL:
      brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
      break;
   case MESA_SHADER_GEOMETRY:
      brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
      break;
   case MESA_SHADER_FRAGMENT:
      brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
      break;
   case MESA_SHADER_COMPUTE:
      brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
      break;
   default:
      unreachable("Unsupported stage!");
   }
}