[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_fs.h"
26 #include "brw_cfg.h"
27 #include "brw_vs.h"
28 #include "brw_nir.h"
29 #include "brw_vec4_builder.h"
30 #include "brw_vec4_live_variables.h"
31 #include "brw_dead_control_flow.h"
32 #include "program/prog_parameter.h"
33
34 #define MAX_INSTRUCTION (1 << 30)
35
36 using namespace brw;
37
38 namespace brw {
39
40 void
41 src_reg::init()
42 {
43 memset(this, 0, sizeof(*this));
44
45 this->file = BAD_FILE;
46 }
47
48 src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
49 {
50 init();
51
52 this->file = file;
53 this->nr = nr;
54 if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
55 this->swizzle = brw_swizzle_for_size(type->vector_elements);
56 else
57 this->swizzle = BRW_SWIZZLE_XYZW;
58 if (type)
59 this->type = brw_type_for_base_type(type);
60 }
61
62 /** Generic unset register constructor. */
63 src_reg::src_reg()
64 {
65 init();
66 }
67
68 src_reg::src_reg(struct ::brw_reg reg) :
69 backend_reg(reg)
70 {
71 this->reg_offset = 0;
72 this->reladdr = NULL;
73 }
74
75 src_reg::src_reg(const dst_reg &reg) :
76 backend_reg(reg)
77 {
78 this->reladdr = reg.reladdr;
79 this->swizzle = brw_swizzle_for_mask(reg.writemask);
80 }
81
82 void
83 dst_reg::init()
84 {
85 memset(this, 0, sizeof(*this));
86 this->file = BAD_FILE;
87 this->writemask = WRITEMASK_XYZW;
88 }
89
90 dst_reg::dst_reg()
91 {
92 init();
93 }
94
95 dst_reg::dst_reg(enum brw_reg_file file, int nr)
96 {
97 init();
98
99 this->file = file;
100 this->nr = nr;
101 }
102
103 dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
104 unsigned writemask)
105 {
106 init();
107
108 this->file = file;
109 this->nr = nr;
110 this->type = brw_type_for_base_type(type);
111 this->writemask = writemask;
112 }
113
114 dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
115 unsigned writemask)
116 {
117 init();
118
119 this->file = file;
120 this->nr = nr;
121 this->type = type;
122 this->writemask = writemask;
123 }
124
125 dst_reg::dst_reg(struct ::brw_reg reg) :
126 backend_reg(reg)
127 {
128 this->reg_offset = 0;
129 this->reladdr = NULL;
130 }
131
132 dst_reg::dst_reg(const src_reg &reg) :
133 backend_reg(reg)
134 {
135 this->writemask = brw_mask_for_swizzle(reg.swizzle);
136 this->reladdr = reg.reladdr;
137 }
138
139 bool
140 dst_reg::equals(const dst_reg &r) const
141 {
142 return (this->backend_reg::equals(r) &&
143 (reladdr == r.reladdr ||
144 (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
145 }
146
147 bool
148 vec4_instruction::is_send_from_grf()
149 {
150 switch (opcode) {
151 case SHADER_OPCODE_SHADER_TIME_ADD:
152 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
153 case SHADER_OPCODE_UNTYPED_ATOMIC:
154 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
155 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
156 case SHADER_OPCODE_TYPED_ATOMIC:
157 case SHADER_OPCODE_TYPED_SURFACE_READ:
158 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
159 case VEC4_OPCODE_URB_READ:
160 case TCS_OPCODE_URB_WRITE:
161 case TCS_OPCODE_RELEASE_INPUT:
162 case SHADER_OPCODE_BARRIER:
163 return true;
164 default:
165 return false;
166 }
167 }
168
169 /**
170 * Returns true if this instruction's sources and destinations cannot
171 * safely be the same register.
172 *
173 * In most cases, a register can be written over safely by the same
174 * instruction that is its last use. For a single instruction, the
175 * sources are dereferenced before writing of the destination starts
176 * (naturally).
177 *
178 * However, there are a few cases where this can be problematic:
179 *
180 * - Virtual opcodes that translate to multiple instructions in the
181 * code generator: if src == dst and one instruction writes the
182 * destination before a later instruction reads the source, then
183 * src will have been clobbered.
184 *
185 * The register allocator uses this information to set up conflicts between
186 * GRF sources and the destination.
187 */
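/* Illustrative sketch (not from the original sources): if a virtual opcode
 * expands in the generator to something like
 *
 *    mov dst.x, <payload setup>
 *    add dst.y, src.x, src.y
 *
 * then allocating dst and src to the same GRF would let the first expanded
 * instruction clobber src.x before the second one reads it.
 */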
188 bool
189 vec4_instruction::has_source_and_destination_hazard() const
190 {
191 switch (opcode) {
192 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
193 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
194 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
195 return true;
196 default:
197 return false;
198 }
199 }
200
201 unsigned
202 vec4_instruction::regs_read(unsigned arg) const
203 {
204 if (src[arg].file == BAD_FILE)
205 return 0;
206
207 switch (opcode) {
208 case SHADER_OPCODE_SHADER_TIME_ADD:
209 case SHADER_OPCODE_UNTYPED_ATOMIC:
210 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
211 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
212 case SHADER_OPCODE_TYPED_ATOMIC:
213 case SHADER_OPCODE_TYPED_SURFACE_READ:
214 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
215 case TCS_OPCODE_URB_WRITE:
216 return arg == 0 ? mlen : 1;
217
218 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
219 return arg == 1 ? mlen : 1;
220
221 default:
222 return 1;
223 }
224 }
225
226 bool
227 vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
228 {
229 if (devinfo->gen == 6 && is_math())
230 return false;
231
232 if (is_send_from_grf())
233 return false;
234
235 if (!backend_instruction::can_do_source_mods())
236 return false;
237
238 return true;
239 }
240
241 bool
242 vec4_instruction::can_change_types() const
243 {
244 return dst.type == src[0].type &&
245 !src[0].abs && !src[0].negate && !saturate &&
246 (opcode == BRW_OPCODE_MOV ||
247 (opcode == BRW_OPCODE_SEL &&
248 dst.type == src[1].type &&
249 predicate != BRW_PREDICATE_NONE &&
250 !src[1].abs && !src[1].negate));
251 }
252
253 /**
254 * Returns how many MRFs an opcode will write over.
255 *
256 * Note that this is not the 0 or 1 implied writes in an actual gen
257 * instruction -- the generate_* functions generate additional MOVs
258 * for setup.
259 */
260 int
261 vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
262 {
263 if (inst->mlen == 0 || inst->is_send_from_grf())
264 return 0;
265
266 switch (inst->opcode) {
267 case SHADER_OPCODE_RCP:
268 case SHADER_OPCODE_RSQ:
269 case SHADER_OPCODE_SQRT:
270 case SHADER_OPCODE_EXP2:
271 case SHADER_OPCODE_LOG2:
272 case SHADER_OPCODE_SIN:
273 case SHADER_OPCODE_COS:
274 return 1;
275 case SHADER_OPCODE_INT_QUOTIENT:
276 case SHADER_OPCODE_INT_REMAINDER:
277 case SHADER_OPCODE_POW:
278 case TCS_OPCODE_THREAD_END:
279 return 2;
280 case VS_OPCODE_URB_WRITE:
281 return 1;
282 case VS_OPCODE_PULL_CONSTANT_LOAD:
283 return 2;
284 case SHADER_OPCODE_GEN4_SCRATCH_READ:
285 return 2;
286 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
287 return 3;
288 case GS_OPCODE_URB_WRITE:
289 case GS_OPCODE_URB_WRITE_ALLOCATE:
290 case GS_OPCODE_THREAD_END:
291 return 0;
292 case GS_OPCODE_FF_SYNC:
293 return 1;
294 case TCS_OPCODE_URB_WRITE:
295 return 0;
296 case SHADER_OPCODE_SHADER_TIME_ADD:
297 return 0;
298 case SHADER_OPCODE_TEX:
299 case SHADER_OPCODE_TXL:
300 case SHADER_OPCODE_TXD:
301 case SHADER_OPCODE_TXF:
302 case SHADER_OPCODE_TXF_CMS:
303 case SHADER_OPCODE_TXF_CMS_W:
304 case SHADER_OPCODE_TXF_MCS:
305 case SHADER_OPCODE_TXS:
306 case SHADER_OPCODE_TG4:
307 case SHADER_OPCODE_TG4_OFFSET:
308 case SHADER_OPCODE_SAMPLEINFO:
309 case VS_OPCODE_GET_BUFFER_SIZE:
310 return inst->header_size;
311 default:
312 unreachable("not reached");
313 }
314 }
315
316 bool
317 src_reg::equals(const src_reg &r) const
318 {
319 return (this->backend_reg::equals(r) &&
320 !reladdr && !r.reladdr);
321 }
322
323 bool
324 vec4_visitor::vectorize_mov(bblock_t *block, vec4_instruction *inst,
325 uint8_t imm[4], vec4_instruction *imm_inst[4],
326 int inst_count, unsigned writemask)
327 {
328 if (inst_count < 2)
329 return false;
330
331 unsigned vf;
332 memcpy(&vf, imm, sizeof(vf));
333 vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
334 mov->dst.type = BRW_REGISTER_TYPE_F;
335 mov->dst.writemask = writemask;
336 inst->insert_before(block, mov);
337
338 for (int i = 0; i < inst_count; i++) {
339 imm_inst[i]->remove(block);
340 }
341
342 return true;
343 }
344
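/* Collapses runs of partial-writemask MOVs of float immediates into a single
 * MOV of a packed vector-float (VF) immediate. Illustrative sketch (register
 * numbers and values are hypothetical):
 *
 *    mov vgrf3.x:F, 1.0F
 *    mov vgrf3.y:F, 2.0F
 *    mov vgrf3.z:F, 0.5F
 *
 * becomes
 *
 *    mov vgrf3.xyz:F, [1F, 2F, 0.5F, 0F]VF
 *
 * provided each immediate is representable as a restricted 8-bit VF value
 * (i.e. brw_float_to_vf() succeeds for it).
 */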
345 bool
346 vec4_visitor::opt_vector_float()
347 {
348 bool progress = false;
349
350 int last_reg = -1, last_reg_offset = -1;
351 enum brw_reg_file last_reg_file = BAD_FILE;
352
353 uint8_t imm[4] = { 0 };
354 int inst_count = 0;
355 vec4_instruction *imm_inst[4];
356 unsigned writemask = 0;
357
358 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
359 if (last_reg != inst->dst.nr ||
360 last_reg_offset != inst->dst.reg_offset ||
361 last_reg_file != inst->dst.file) {
362 progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count,
363 writemask);
364 inst_count = 0;
365 writemask = 0;
366 last_reg = inst->dst.nr;
367 last_reg_offset = inst->dst.reg_offset;
368 last_reg_file = inst->dst.file;
369
370 for (int i = 0; i < 4; i++) {
371 imm[i] = 0;
372 }
373 }
374
375 if (inst->opcode != BRW_OPCODE_MOV ||
376 inst->dst.writemask == WRITEMASK_XYZW ||
377 inst->src[0].file != IMM ||
378 inst->predicate != BRW_PREDICATE_NONE) {
379 progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count,
380 writemask);
381 inst_count = 0;
382 last_reg = -1;
383 continue;
384 }
385
386 int vf = brw_float_to_vf(inst->src[0].f);
387 if (vf == -1)
388 continue;
389
390 if ((inst->dst.writemask & WRITEMASK_X) != 0)
391 imm[0] = vf;
392 if ((inst->dst.writemask & WRITEMASK_Y) != 0)
393 imm[1] = vf;
394 if ((inst->dst.writemask & WRITEMASK_Z) != 0)
395 imm[2] = vf;
396 if ((inst->dst.writemask & WRITEMASK_W) != 0)
397 imm[3] = vf;
398
399 writemask |= inst->dst.writemask;
400 imm_inst[inst_count++] = inst;
401 }
402
403 if (progress)
404 invalidate_live_intervals();
405
406 return progress;
407 }
408
409 /* Replaces unused channels of a swizzle with channels that are used.
410 *
411 * For instance, this pass transforms
412 *
413 * mov vgrf4.yz, vgrf5.wxzy
414 *
415 * into
416 *
417 * mov vgrf4.yz, vgrf5.xxzx
418 *
419 * This eliminates false uses of some channels, letting dead code elimination
420 * remove the instructions that wrote them.
421 */
422 bool
423 vec4_visitor::opt_reduce_swizzle()
424 {
425 bool progress = false;
426
427 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
428 if (inst->dst.file == BAD_FILE ||
429 inst->dst.file == ARF ||
430 inst->dst.file == FIXED_GRF ||
431 inst->is_send_from_grf())
432 continue;
433
434 unsigned swizzle;
435
436 /* Determine which channels of the sources are read. */
437 switch (inst->opcode) {
438 case VEC4_OPCODE_PACK_BYTES:
439 case BRW_OPCODE_DP4:
440 case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
441 * but all four of src1.
442 */
443 swizzle = brw_swizzle_for_size(4);
444 break;
445 case BRW_OPCODE_DP3:
446 swizzle = brw_swizzle_for_size(3);
447 break;
448 case BRW_OPCODE_DP2:
449 swizzle = brw_swizzle_for_size(2);
450 break;
451 default:
452 swizzle = brw_swizzle_for_mask(inst->dst.writemask);
453 break;
454 }
455
456 /* Update sources' swizzles. */
457 for (int i = 0; i < 3; i++) {
458 if (inst->src[i].file != VGRF &&
459 inst->src[i].file != ATTR &&
460 inst->src[i].file != UNIFORM)
461 continue;
462
463 const unsigned new_swizzle =
464 brw_compose_swizzle(swizzle, inst->src[i].swizzle);
465 if (inst->src[i].swizzle != new_swizzle) {
466 inst->src[i].swizzle = new_swizzle;
467 progress = true;
468 }
469 }
470 }
471
472 if (progress)
473 invalidate_live_intervals();
474
475 return progress;
476 }
477
478 void
479 vec4_visitor::split_uniform_registers()
480 {
481 /* Prior to this, uniforms have been in an array sized according to
482 * the number of vector uniforms present, sparsely filled (so an
483 * aggregate results in reg indices being skipped over). Now we're
484 * going to cut those aggregates up so each .nr index is one
485 * vector. The goal is to make elimination of unused uniform
486 * components easier later.
487 */
488 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
489 for (int i = 0 ; i < 3; i++) {
490 if (inst->src[i].file != UNIFORM)
491 continue;
492
493 assert(!inst->src[i].reladdr);
494
495 inst->src[i].nr += inst->src[i].reg_offset;
496 inst->src[i].reg_offset = 0;
497 }
498 }
499 }
500
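/* Illustrative sketch (hypothetical uniform layout): if uniform vec4 0 only
 * has .xy live and uniform vec4 1 only has .x live, the packing below keeps
 * uniform 0 where it is and moves uniform 1 into channel 2 of the same push
 * constant slot, adjusting the swizzles of every instruction that reads it.
 */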
501 void
502 vec4_visitor::pack_uniform_registers()
503 {
504 uint8_t chans_used[this->uniforms];
505 int new_loc[this->uniforms];
506 int new_chan[this->uniforms];
507
508 memset(chans_used, 0, sizeof(chans_used));
509 memset(new_loc, 0, sizeof(new_loc));
510 memset(new_chan, 0, sizeof(new_chan));
511
512 /* Find which uniform vectors are actually used by the program. We
513 * expect unused vector elements when we've moved array access out
514 * to pull constants, and from some GLSL code generators like wine.
515 */
516 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
517 unsigned readmask;
518 switch (inst->opcode) {
519 case VEC4_OPCODE_PACK_BYTES:
520 case BRW_OPCODE_DP4:
521 case BRW_OPCODE_DPH:
522 readmask = 0xf;
523 break;
524 case BRW_OPCODE_DP3:
525 readmask = 0x7;
526 break;
527 case BRW_OPCODE_DP2:
528 readmask = 0x3;
529 break;
530 default:
531 readmask = inst->dst.writemask;
532 break;
533 }
534
535 for (int i = 0 ; i < 3; i++) {
536 if (inst->src[i].file != UNIFORM)
537 continue;
538
539 int reg = inst->src[i].nr;
540 for (int c = 0; c < 4; c++) {
541 if (!(readmask & (1 << c)))
542 continue;
543
544 chans_used[reg] = MAX2(chans_used[reg],
545 BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
546 }
547 }
548 }
549
550 int new_uniform_count = 0;
551
552 /* Now, figure out a packing of the live uniform vectors into our
553 * push constants.
554 */
555 for (int src = 0; src < uniforms; src++) {
556 int size = chans_used[src];
557
558 if (size == 0)
559 continue;
560
561 int dst;
562 /* Find the lowest place we can slot this uniform in. */
563 for (dst = 0; dst < src; dst++) {
564 if (chans_used[dst] + size <= 4)
565 break;
566 }
567
568 if (src == dst) {
569 new_loc[src] = dst;
570 new_chan[src] = 0;
571 } else {
572 new_loc[src] = dst;
573 new_chan[src] = chans_used[dst];
574
575 /* Move the references to the data */
576 for (int j = 0; j < size; j++) {
577 stage_prog_data->param[dst * 4 + new_chan[src] + j] =
578 stage_prog_data->param[src * 4 + j];
579 }
580
581 chans_used[dst] += size;
582 chans_used[src] = 0;
583 }
584
585 new_uniform_count = MAX2(new_uniform_count, dst + 1);
586 }
587
588 this->uniforms = new_uniform_count;
589
590 /* Now, update the instructions for our repacked uniforms. */
591 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
592 for (int i = 0 ; i < 3; i++) {
593 int src = inst->src[i].nr;
594
595 if (inst->src[i].file != UNIFORM)
596 continue;
597
598 inst->src[i].nr = new_loc[src];
599 inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
600 new_chan[src], new_chan[src]);
601 }
602 }
603 }
604
605 /**
606 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
607 *
608 * While GLSL IR also performs this optimization, we end up with it in
609 * our instruction stream for a couple of reasons. One is that we
610 * sometimes generate silly instructions, for example in array access
611 * where we'll generate "ADD offset, index, base" even if base is 0.
612 * The other is that GLSL IR's constant propagation doesn't track the
613 * components of aggregates, so some VS patterns (initialize matrix to
614 * 0, accumulate in vertex blending factors) end up breaking down to
615 * instructions involving 0.
616 */
617 bool
618 vec4_visitor::opt_algebraic()
619 {
620 bool progress = false;
621
622 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
623 switch (inst->opcode) {
624 case BRW_OPCODE_MOV:
625 if (inst->src[0].file != IMM)
626 break;
627
628 if (inst->saturate) {
629 if (inst->dst.type != inst->src[0].type)
630 assert(!"unimplemented: saturate mixed types");
631
632 if (brw_saturate_immediate(inst->dst.type,
633 &inst->src[0].as_brw_reg())) {
634 inst->saturate = false;
635 progress = true;
636 }
637 }
638 break;
639
640 case VEC4_OPCODE_UNPACK_UNIFORM:
641 if (inst->src[0].file != UNIFORM) {
642 inst->opcode = BRW_OPCODE_MOV;
643 progress = true;
644 }
645 break;
646
647 case BRW_OPCODE_ADD:
648 if (inst->src[1].is_zero()) {
649 inst->opcode = BRW_OPCODE_MOV;
650 inst->src[1] = src_reg();
651 progress = true;
652 }
653 break;
654
655 case BRW_OPCODE_MUL:
656 if (inst->src[1].is_zero()) {
657 inst->opcode = BRW_OPCODE_MOV;
658 switch (inst->src[0].type) {
659 case BRW_REGISTER_TYPE_F:
660 inst->src[0] = brw_imm_f(0.0f);
661 break;
662 case BRW_REGISTER_TYPE_D:
663 inst->src[0] = brw_imm_d(0);
664 break;
665 case BRW_REGISTER_TYPE_UD:
666 inst->src[0] = brw_imm_ud(0u);
667 break;
668 default:
669 unreachable("not reached");
670 }
671 inst->src[1] = src_reg();
672 progress = true;
673 } else if (inst->src[1].is_one()) {
674 inst->opcode = BRW_OPCODE_MOV;
675 inst->src[1] = src_reg();
676 progress = true;
677 } else if (inst->src[1].is_negative_one()) {
678 inst->opcode = BRW_OPCODE_MOV;
679 inst->src[0].negate = !inst->src[0].negate;
680 inst->src[1] = src_reg();
681 progress = true;
682 }
683 break;
684 case BRW_OPCODE_CMP:
685 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
686 inst->src[0].abs &&
687 inst->src[0].negate &&
688 inst->src[1].is_zero()) {
689 inst->src[0].abs = false;
690 inst->src[0].negate = false;
691 inst->conditional_mod = BRW_CONDITIONAL_Z;
692 progress = true;
693 break;
694 }
695 break;
696 case SHADER_OPCODE_BROADCAST:
697 if (is_uniform(inst->src[0]) ||
698 inst->src[1].is_zero()) {
699 inst->opcode = BRW_OPCODE_MOV;
700 inst->src[1] = src_reg();
701 inst->force_writemask_all = true;
702 progress = true;
703 }
704 break;
705
706 default:
707 break;
708 }
709 }
710
711 if (progress)
712 invalidate_live_intervals();
713
714 return progress;
715 }
716
717 /**
718 * Only a limited number of hardware registers may be used for push
719 * constants, so this turns access to the overflowed constants into
720 * pull constants.
721 */
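/* Illustrative sketch (hypothetical counts): with 70 uniform vec4s, the first
 * 64 (256 components, i.e. 32 registers) stay in the push constant buffer,
 * while vec4s 64..69 are appended to pull_param and every read of them is
 * replaced by a pull constant load into a temporary vec4.
 */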
722 void
723 vec4_visitor::move_push_constants_to_pull_constants()
724 {
725 int pull_constant_loc[this->uniforms];
726
727 /* Only allow 32 registers (256 uniform components) as push constants,
728 * which is the limit on gen6.
729 *
730 * If changing this value, note the limitation about total_regs in
731 * brw_curbe.c.
732 */
733 int max_uniform_components = 32 * 8;
734 if (this->uniforms * 4 <= max_uniform_components)
735 return;
736
737 /* Make some sort of choice as to which uniforms get sent to pull
738 * constants. We could potentially do something clever here like
739 * look for the most infrequently used uniform vec4s, but leave
740 * that for later.
741 */
742 for (int i = 0; i < this->uniforms * 4; i += 4) {
743 pull_constant_loc[i / 4] = -1;
744
745 if (i >= max_uniform_components) {
746 const gl_constant_value **values = &stage_prog_data->param[i];
747
748 /* Try to find an existing copy of this uniform in the pull
749 * constants if it was part of an array access already.
750 */
751 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
752 int matches;
753
754 for (matches = 0; matches < 4; matches++) {
755 if (stage_prog_data->pull_param[j + matches] != values[matches])
756 break;
757 }
758
759 if (matches == 4) {
760 pull_constant_loc[i / 4] = j / 4;
761 break;
762 }
763 }
764
765 if (pull_constant_loc[i / 4] == -1) {
766 assert(stage_prog_data->nr_pull_params % 4 == 0);
767 pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
768
769 for (int j = 0; j < 4; j++) {
770 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
771 values[j];
772 }
773 }
774 }
775 }
776
777 /* Now actually rewrite usage of the things we've moved to pull
778 * constants.
779 */
780 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
781 for (int i = 0 ; i < 3; i++) {
782 if (inst->src[i].file != UNIFORM ||
783 pull_constant_loc[inst->src[i].nr] == -1)
784 continue;
785
786 int uniform = inst->src[i].nr;
787
788 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
789
790 emit_pull_constant_load(block, inst, temp, inst->src[i],
791 pull_constant_loc[uniform], src_reg());
792
793 inst->src[i].file = temp.file;
794 inst->src[i].nr = temp.nr;
795 inst->src[i].reg_offset = temp.reg_offset;
796 inst->src[i].reladdr = NULL;
797 }
798 }
799
800 /* Repack push constants to remove the now-unused ones. */
801 pack_uniform_registers();
802 }
803
804 /* Conditions for which we want to avoid setting the dependency control bits */
805 bool
806 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
807 {
808 #define IS_DWORD(reg) \
809 (reg.type == BRW_REGISTER_TYPE_UD || \
810 reg.type == BRW_REGISTER_TYPE_D)
811
812 /* "When source or destination datatype is 64b or operation is integer DWord
813 * multiply, DepCtrl must not be used."
814 * May apply to future SoCs as well.
815 */
816 if (devinfo->is_cherryview) {
817 if (inst->opcode == BRW_OPCODE_MUL &&
818 IS_DWORD(inst->src[0]) &&
819 IS_DWORD(inst->src[1]))
820 return true;
821 }
822 #undef IS_DWORD
823
824 if (devinfo->gen >= 8) {
825 if (inst->opcode == BRW_OPCODE_F32TO16)
826 return true;
827 }
828
829 /*
830 * mlen:
831 * In the presence of send messages, totally interrupt dependency
832 * control. They're long enough that the chance of dependency
833 * control around them just doesn't matter.
834 *
835 * predicate:
836 * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
837 * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
838 * completes the scoreboard clear must have a non-zero execution mask. This
839 * means, if any kind of predication can change the execution mask or channel
840 * enable of the last instruction, the optimization must be avoided. This is
841 * to avoid instructions being shot down the pipeline when no writes are
842 * required.
843 *
844 * math:
845 * Dependency control does not work well over math instructions.
846 * NB: Discovered empirically
847 */
848 return (inst->mlen || inst->predicate || inst->is_math());
849 }
850
851 /**
852 * Sets the dependency control fields on instructions after register
853 * allocation and before the generator is run.
854 *
855 * When you have a sequence of instructions like:
856 *
857 * DP4 temp.x vertex uniform[0]
858 * DP4 temp.y vertex uniform[0]
859 * DP4 temp.z vertex uniform[0]
860 * DP4 temp.w vertex uniform[0]
861 *
862 * The hardware doesn't know that it can actually run the later instructions
863 * while the previous ones are in flight, producing stalls. However, we have
864 * manual fields we can set in the instructions that let it do so.
865 */
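/* Illustrative sketch of the result (not from the original sources): for the
 * DP4 sequence above, this pass lets the generator emit roughly
 *
 *    DP4 temp.x vertex uniform[0]  { NoDDClr }
 *    DP4 temp.y vertex uniform[0]  { NoDDClr, NoDDChk }
 *    DP4 temp.z vertex uniform[0]  { NoDDClr, NoDDChk }
 *    DP4 temp.w vertex uniform[0]  { NoDDChk }
 *
 * so that writes to disjoint channels of temp no longer serialize on the
 * scoreboard.
 */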
866 void
867 vec4_visitor::opt_set_dependency_control()
868 {
869 vec4_instruction *last_grf_write[BRW_MAX_GRF];
870 uint8_t grf_channels_written[BRW_MAX_GRF];
871 vec4_instruction *last_mrf_write[BRW_MAX_GRF];
872 uint8_t mrf_channels_written[BRW_MAX_GRF];
873
874 assert(prog_data->total_grf ||
875 !"Must be called after register allocation");
876
877 foreach_block (block, cfg) {
878 memset(last_grf_write, 0, sizeof(last_grf_write));
879 memset(last_mrf_write, 0, sizeof(last_mrf_write));
880
881 foreach_inst_in_block (vec4_instruction, inst, block) {
882 /* If we read from a register that we were doing dependency control
883 * on, don't do dependency control across the read.
884 */
885 for (int i = 0; i < 3; i++) {
886 int reg = inst->src[i].nr + inst->src[i].reg_offset;
887 if (inst->src[i].file == VGRF) {
888 last_grf_write[reg] = NULL;
889 } else if (inst->src[i].file == FIXED_GRF) {
890 memset(last_grf_write, 0, sizeof(last_grf_write));
891 break;
892 }
893 assert(inst->src[i].file != MRF);
894 }
895
896 if (is_dep_ctrl_unsafe(inst)) {
897 memset(last_grf_write, 0, sizeof(last_grf_write));
898 memset(last_mrf_write, 0, sizeof(last_mrf_write));
899 continue;
900 }
901
902 /* Now, see if we can do dependency control for this instruction
903 * against a previous one writing to its destination.
904 */
905 int reg = inst->dst.nr + inst->dst.reg_offset;
906 if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
907 if (last_grf_write[reg] &&
908 !(inst->dst.writemask & grf_channels_written[reg])) {
909 last_grf_write[reg]->no_dd_clear = true;
910 inst->no_dd_check = true;
911 } else {
912 grf_channels_written[reg] = 0;
913 }
914
915 last_grf_write[reg] = inst;
916 grf_channels_written[reg] |= inst->dst.writemask;
917 } else if (inst->dst.file == MRF) {
918 if (last_mrf_write[reg] &&
919 !(inst->dst.writemask & mrf_channels_written[reg])) {
920 last_mrf_write[reg]->no_dd_clear = true;
921 inst->no_dd_check = true;
922 } else {
923 mrf_channels_written[reg] = 0;
924 }
925
926 last_mrf_write[reg] = inst;
927 mrf_channels_written[reg] |= inst->dst.writemask;
928 }
929 }
930 }
931 }
932
933 bool
934 vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo,
935 int dst_writemask,
936 int swizzle,
937 int swizzle_mask)
938 {
939 /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
940 * or writemasking are not allowed.
941 */
942 if (devinfo->gen == 6 && is_math() &&
943 (swizzle != BRW_SWIZZLE_XYZW || dst_writemask != WRITEMASK_XYZW))
944 return false;
945
946 /* If this instruction sets anything not referenced by swizzle, then we'd
947 * totally break it when we reswizzle.
948 */
949 if (dst.writemask & ~swizzle_mask)
950 return false;
951
952 if (mlen > 0)
953 return false;
954
955 for (int i = 0; i < 3; i++) {
956 if (src[i].is_accumulator())
957 return false;
958 }
959
960 return true;
961 }
962
963 /**
964 * For any channels in the swizzle's source that were populated by this
965 * instruction, rewrite the instruction to put the appropriate result directly
966 * in those channels.
967 *
968  * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy c.yy_x
969 */
970 void
971 vec4_instruction::reswizzle(int dst_writemask, int swizzle)
972 {
973 /* Destination write mask doesn't correspond to source swizzle for the dot
974 * product and pack_bytes instructions.
975 */
976 if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
977 opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
978 opcode != VEC4_OPCODE_PACK_BYTES) {
979 for (int i = 0; i < 3; i++) {
980 if (src[i].file == BAD_FILE || src[i].file == IMM)
981 continue;
982
983 src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
984 }
985 }
986
987 /* Apply the specified swizzle and writemask to the original mask of
988 * written components.
989 */
990 dst.writemask = dst_writemask &
991 brw_apply_swizzle_to_mask(swizzle, dst.writemask);
992 }
993
994 /*
995  * Tries to reduce extra MOV instructions by taking temporary GRFs that get
996  * written and then immediately MOVed into another reg, and rewriting the
997  * original write so that it stores directly to the final destination instead.
998 */
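/* Illustrative sketch (register numbers are hypothetical):
 *
 *    add vgrf5.xyzw:F, vgrf1.xyzw:F, vgrf2.xyzw:F
 *    mov m4.xyzw:F, vgrf5.xyzw:F
 *
 * becomes, when vgrf5 has no later readers,
 *
 *    add m4.xyzw:F, vgrf1.xyzw:F, vgrf2.xyzw:F
 */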
999 bool
1000 vec4_visitor::opt_register_coalesce()
1001 {
1002 bool progress = false;
1003 int next_ip = 0;
1004
1005 calculate_live_intervals();
1006
1007 foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
1008 int ip = next_ip;
1009 next_ip++;
1010
1011 if (inst->opcode != BRW_OPCODE_MOV ||
1012 (inst->dst.file != VGRF && inst->dst.file != MRF) ||
1013 inst->predicate ||
1014 inst->src[0].file != VGRF ||
1015 inst->dst.type != inst->src[0].type ||
1016 inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
1017 continue;
1018
1019 /* Remove no-op MOVs */
1020 if (inst->dst.file == inst->src[0].file &&
1021 inst->dst.nr == inst->src[0].nr &&
1022 inst->dst.reg_offset == inst->src[0].reg_offset) {
1023 bool is_nop_mov = true;
1024
1025 for (unsigned c = 0; c < 4; c++) {
1026 if ((inst->dst.writemask & (1 << c)) == 0)
1027 continue;
1028
1029 if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
1030 is_nop_mov = false;
1031 break;
1032 }
1033 }
1034
1035 if (is_nop_mov) {
1036 inst->remove(block);
1037 progress = true;
1038 continue;
1039 }
1040 }
1041
1042 bool to_mrf = (inst->dst.file == MRF);
1043
1044 /* Can't coalesce this GRF if someone else was going to
1045 * read it later.
1046 */
1047 if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
1048 continue;
1049
1050 /* We need to check interference with the final destination between this
1051 * instruction and the earliest instruction involved in writing the GRF
1052 * we're eliminating. To do that, keep track of which of our source
1053 * channels we've seen initialized.
1054 */
1055 const unsigned chans_needed =
1056 brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
1057 inst->dst.writemask);
1058 unsigned chans_remaining = chans_needed;
1059
1060 /* Now walk up the instruction stream trying to see if we can rewrite
1061 * everything writing to the temporary to write into the destination
1062 * instead.
1063 */
1064 vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
1065 foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
1066 inst) {
1067 _scan_inst = scan_inst;
1068
1069 if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
1070 /* Found something writing to the reg we want to coalesce away. */
1071 if (to_mrf) {
1072 /* SEND instructions can't have MRF as a destination. */
1073 if (scan_inst->mlen)
1074 break;
1075
1076 if (devinfo->gen == 6) {
1077 /* gen6 math instructions must have the destination be
1078 * VGRF, so no compute-to-MRF for them.
1079 */
1080 if (scan_inst->is_math()) {
1081 break;
1082 }
1083 }
1084 }
1085
1086 /* This doesn't handle saturation on the instruction we
1087 * want to coalesce away if the register types do not match.
1088 * But if scan_inst is a non type-converting 'mov', we can fix
1089 * the types later.
1090 */
1091 if (inst->saturate &&
1092 inst->dst.type != scan_inst->dst.type &&
1093 !(scan_inst->opcode == BRW_OPCODE_MOV &&
1094 scan_inst->dst.type == scan_inst->src[0].type))
1095 break;
1096
1097 /* If we can't handle the swizzle, bail. */
1098 if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
1099 inst->src[0].swizzle,
1100 chans_needed)) {
1101 break;
1102 }
1103
1104 /* This doesn't handle coalescing of multiple registers. */
1105 if (scan_inst->regs_written > 1)
1106 break;
1107
1108 /* Mark which channels we found unconditional writes for. */
1109 if (!scan_inst->predicate)
1110 chans_remaining &= ~scan_inst->dst.writemask;
1111
1112 if (chans_remaining == 0)
1113 break;
1114 }
1115
1116 /* You can't read from an MRF, so if someone else reads our MRF's
1117 * source GRF that we wanted to rewrite, that stops us. If it's a
1118 * GRF we're trying to coalesce to, we don't actually handle
1119 * rewriting sources so bail in that case as well.
1120 */
1121 bool interfered = false;
1122 for (int i = 0; i < 3; i++) {
1123 if (inst->src[0].in_range(scan_inst->src[i],
1124 scan_inst->regs_read(i)))
1125 interfered = true;
1126 }
1127 if (interfered)
1128 break;
1129
1130 /* If somebody else writes the same channels of our destination here,
1131 * we can't coalesce before that.
1132 */
1133 if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written) &&
1134 (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
1135 break;
1136 }
1137
1138 /* Check for reads of the register we're trying to coalesce into. We
1139 * can't go rewriting instructions above that to put some other value
1140 * in the register instead.
1141 */
1142 if (to_mrf && scan_inst->mlen > 0) {
1143 if (inst->dst.nr >= scan_inst->base_mrf &&
1144 inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
1145 break;
1146 }
1147 } else {
1148 for (int i = 0; i < 3; i++) {
1149 if (inst->dst.in_range(scan_inst->src[i],
1150 scan_inst->regs_read(i)))
1151 interfered = true;
1152 }
1153 if (interfered)
1154 break;
1155 }
1156 }
1157
1158 if (chans_remaining == 0) {
1159 /* If we've made it here, we have an MOV we want to coalesce out, and
1160 * a scan_inst pointing to the earliest instruction involved in
1161 * computing the value. Now go rewrite the instruction stream
1162 * between the two.
1163 */
1164 vec4_instruction *scan_inst = _scan_inst;
1165 while (scan_inst != inst) {
1166 if (scan_inst->dst.file == VGRF &&
1167 scan_inst->dst.nr == inst->src[0].nr &&
1168 scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1169 scan_inst->reswizzle(inst->dst.writemask,
1170 inst->src[0].swizzle);
1171 scan_inst->dst.file = inst->dst.file;
1172 scan_inst->dst.nr = inst->dst.nr;
1173 scan_inst->dst.reg_offset = inst->dst.reg_offset;
1174 if (inst->saturate &&
1175 inst->dst.type != scan_inst->dst.type) {
1176 /* If we have reached this point, scan_inst is a non
1177 * type-converting 'mov' and we can modify its register types
1178 * to match the ones in inst. Otherwise, we could have an
1179 * incorrect saturation result.
1180 */
1181 scan_inst->dst.type = inst->dst.type;
1182 scan_inst->src[0].type = inst->src[0].type;
1183 }
1184 scan_inst->saturate |= inst->saturate;
1185 }
1186 scan_inst = (vec4_instruction *)scan_inst->next;
1187 }
1188 inst->remove(block);
1189 progress = true;
1190 }
1191 }
1192
1193 if (progress)
1194 invalidate_live_intervals();
1195
1196 return progress;
1197 }
1198
1199 /**
1200 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
1201 * flow. We could probably do better here with some form of divergence
1202 * analysis.
1203 */
1204 bool
1205 vec4_visitor::eliminate_find_live_channel()
1206 {
1207 bool progress = false;
1208 unsigned depth = 0;
1209
1210 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1211 switch (inst->opcode) {
1212 case BRW_OPCODE_IF:
1213 case BRW_OPCODE_DO:
1214 depth++;
1215 break;
1216
1217 case BRW_OPCODE_ENDIF:
1218 case BRW_OPCODE_WHILE:
1219 depth--;
1220 break;
1221
1222 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
1223 if (depth == 0) {
1224 inst->opcode = BRW_OPCODE_MOV;
1225 inst->src[0] = brw_imm_d(0);
1226 inst->force_writemask_all = true;
1227 progress = true;
1228 }
1229 break;
1230
1231 default:
1232 break;
1233 }
1234 }
1235
1236 return progress;
1237 }
1238
1239 /**
1240 * Splits virtual GRFs requesting more than one contiguous physical register.
1241 *
1242 * We initially create large virtual GRFs for temporary structures, arrays,
1243 * and matrices, so that the dereference visitor functions can add reg_offsets
1244 * to work their way down to the actual member being accessed. But when it
1245 * comes to optimization, we'd like to treat each register as individual
1246 * storage if possible.
1247 *
1248 * So far, the only thing that might prevent splitting is a send message from
1249 * a GRF on IVB.
1250 */
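/* Illustrative sketch (register numbers are hypothetical): a size-2 vgrf8
 * whose registers are only ever accessed one at a time is split so that
 * accesses with reg_offset 1 move to a freshly allocated size-1 VGRF, and
 * vgrf8 itself shrinks to size 1.
 */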
1251 void
1252 vec4_visitor::split_virtual_grfs()
1253 {
1254 int num_vars = this->alloc.count;
1255 int new_virtual_grf[num_vars];
1256 bool split_grf[num_vars];
1257
1258 memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
1259
1260 /* Try to split anything > 0 sized. */
1261 for (int i = 0; i < num_vars; i++) {
1262 split_grf[i] = this->alloc.sizes[i] != 1;
1263 }
1264
1265 /* Check that the instructions are compatible with the registers we're trying
1266 * to split.
1267 */
1268 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1269 if (inst->dst.file == VGRF && inst->regs_written > 1)
1270 split_grf[inst->dst.nr] = false;
1271
1272 for (int i = 0; i < 3; i++) {
1273 if (inst->src[i].file == VGRF && inst->regs_read(i) > 1)
1274 split_grf[inst->src[i].nr] = false;
1275 }
1276 }
1277
1278 /* Allocate new space for split regs. Note that the virtual
1279 * numbers will be contiguous.
1280 */
1281 for (int i = 0; i < num_vars; i++) {
1282 if (!split_grf[i])
1283 continue;
1284
1285 new_virtual_grf[i] = alloc.allocate(1);
1286 for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
1287 unsigned reg = alloc.allocate(1);
1288 assert(reg == new_virtual_grf[i] + j - 1);
1289 (void) reg;
1290 }
1291 this->alloc.sizes[i] = 1;
1292 }
1293
1294 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1295 if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
1296 inst->dst.reg_offset != 0) {
1297 inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
1298 inst->dst.reg_offset - 1);
1299 inst->dst.reg_offset = 0;
1300 }
1301 for (int i = 0; i < 3; i++) {
1302 if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
1303 inst->src[i].reg_offset != 0) {
1304 inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
1305 inst->src[i].reg_offset - 1);
1306 inst->src[i].reg_offset = 0;
1307 }
1308 }
1309 }
1310 invalidate_live_intervals();
1311 }
1312
1313 void
1314 vec4_visitor::dump_instruction(backend_instruction *be_inst)
1315 {
1316 dump_instruction(be_inst, stderr);
1317 }
1318
1319 void
1320 vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
1321 {
1322 vec4_instruction *inst = (vec4_instruction *)be_inst;
1323
1324 if (inst->predicate) {
1325 fprintf(file, "(%cf0.%d%s) ",
1326 inst->predicate_inverse ? '-' : '+',
1327 inst->flag_subreg,
1328 pred_ctrl_align16[inst->predicate]);
1329 }
1330
1331 fprintf(file, "%s", brw_instruction_name(inst->opcode));
1332 if (inst->saturate)
1333 fprintf(file, ".sat");
1334 if (inst->conditional_mod) {
1335 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
1336 if (!inst->predicate &&
1337 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
1338 inst->opcode != BRW_OPCODE_IF &&
1339 inst->opcode != BRW_OPCODE_WHILE))) {
1340 fprintf(file, ".f0.%d", inst->flag_subreg);
1341 }
1342 }
1343 fprintf(file, " ");
1344
1345 switch (inst->dst.file) {
1346 case VGRF:
1347 fprintf(file, "vgrf%d.%d", inst->dst.nr, inst->dst.reg_offset);
1348 break;
1349 case FIXED_GRF:
1350 fprintf(file, "g%d", inst->dst.nr);
1351 break;
1352 case MRF:
1353 fprintf(file, "m%d", inst->dst.nr);
1354 break;
1355 case ARF:
1356 switch (inst->dst.nr) {
1357 case BRW_ARF_NULL:
1358 fprintf(file, "null");
1359 break;
1360 case BRW_ARF_ADDRESS:
1361 fprintf(file, "a0.%d", inst->dst.subnr);
1362 break;
1363 case BRW_ARF_ACCUMULATOR:
1364 fprintf(file, "acc%d", inst->dst.subnr);
1365 break;
1366 case BRW_ARF_FLAG:
1367 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1368 break;
1369 default:
1370 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1371 break;
1372 }
1373 if (inst->dst.subnr)
1374 fprintf(file, "+%d", inst->dst.subnr);
1375 break;
1376 case BAD_FILE:
1377 fprintf(file, "(null)");
1378 break;
1379 case IMM:
1380 case ATTR:
1381 case UNIFORM:
1382 unreachable("not reached");
1383 }
1384 if (inst->dst.writemask != WRITEMASK_XYZW) {
1385 fprintf(file, ".");
1386 if (inst->dst.writemask & 1)
1387 fprintf(file, "x");
1388 if (inst->dst.writemask & 2)
1389 fprintf(file, "y");
1390 if (inst->dst.writemask & 4)
1391 fprintf(file, "z");
1392 if (inst->dst.writemask & 8)
1393 fprintf(file, "w");
1394 }
1395 fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));
1396
1397 if (inst->src[0].file != BAD_FILE)
1398 fprintf(file, ", ");
1399
1400 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1401 if (inst->src[i].negate)
1402 fprintf(file, "-");
1403 if (inst->src[i].abs)
1404 fprintf(file, "|");
1405 switch (inst->src[i].file) {
1406 case VGRF:
1407 fprintf(file, "vgrf%d", inst->src[i].nr);
1408 break;
1409 case FIXED_GRF:
1410 fprintf(file, "g%d", inst->src[i].nr);
1411 break;
1412 case ATTR:
1413 fprintf(file, "attr%d", inst->src[i].nr);
1414 break;
1415 case UNIFORM:
1416 fprintf(file, "u%d", inst->src[i].nr);
1417 break;
1418 case IMM:
1419 switch (inst->src[i].type) {
1420 case BRW_REGISTER_TYPE_F:
1421 fprintf(file, "%fF", inst->src[i].f);
1422 break;
1423 case BRW_REGISTER_TYPE_D:
1424 fprintf(file, "%dD", inst->src[i].d);
1425 break;
1426 case BRW_REGISTER_TYPE_UD:
1427 fprintf(file, "%uU", inst->src[i].ud);
1428 break;
1429 case BRW_REGISTER_TYPE_VF:
1430 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
1431 brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
1432 brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
1433 brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
1434 brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
1435 break;
1436 default:
1437 fprintf(file, "???");
1438 break;
1439 }
1440 break;
1441 case ARF:
1442 switch (inst->src[i].nr) {
1443 case BRW_ARF_NULL:
1444 fprintf(file, "null");
1445 break;
1446 case BRW_ARF_ADDRESS:
1447 fprintf(file, "a0.%d", inst->src[i].subnr);
1448 break;
1449 case BRW_ARF_ACCUMULATOR:
1450 fprintf(file, "acc%d", inst->src[i].subnr);
1451 break;
1452 case BRW_ARF_FLAG:
1453 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1454 break;
1455 default:
1456 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1457 break;
1458 }
1459 if (inst->src[i].subnr)
1460 fprintf(file, "+%d", inst->src[i].subnr);
1461 break;
1462 case BAD_FILE:
1463 fprintf(file, "(null)");
1464 break;
1465 case MRF:
1466 unreachable("not reached");
1467 }
1468
1469 /* Don't print .0; and only VGRFs have reg_offsets and sizes */
1470 if (inst->src[i].reg_offset != 0 &&
1471 inst->src[i].file == VGRF &&
1472 alloc.sizes[inst->src[i].nr] != 1)
1473 fprintf(file, ".%d", inst->src[i].reg_offset);
1474
1475 if (inst->src[i].file != IMM) {
1476 static const char *chans[4] = {"x", "y", "z", "w"};
1477 fprintf(file, ".");
1478 for (int c = 0; c < 4; c++) {
1479 fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
1480 }
1481 }
1482
1483 if (inst->src[i].abs)
1484 fprintf(file, "|");
1485
1486 if (inst->src[i].file != IMM) {
1487 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
1488 }
1489
1490 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1491 fprintf(file, ", ");
1492 }
1493
1494 if (inst->force_writemask_all)
1495 fprintf(file, " NoMask");
1496
1497 fprintf(file, "\n");
1498 }
1499
1500
1501 static inline struct brw_reg
1502 attribute_to_hw_reg(int attr, bool interleaved)
1503 {
1504 if (interleaved)
1505 return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
1506 else
1507 return brw_vec8_grf(attr, 0);
1508 }
1509
1510
1511 /**
1512 * Replace each register of type ATTR in this->instructions with a reference
1513 * to a fixed HW register.
1514 *
1515 * If interleaved is true, then each attribute takes up half a register, with
1516 * register N containing attribute 2*N in its first half and attribute 2*N+1
1517 * in its second half (this corresponds to the payload setup used by geometry
1518 * shaders in "single" or "dual instanced" dispatch mode). If interleaved is
1519 * false, then each attribute takes up a whole register, with register N
1520 * containing attribute N (this corresponds to the payload setup used by
1521 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
1522 */
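/* Illustrative mapping (assuming attribute_map[] assigns hardware slot 5 to
 * the attribute): with interleaved == true the source becomes the second half
 * of g2, i.e. stride(brw_vec4_grf(2, 4), 0, 4, 1); with interleaved == false
 * it simply becomes brw_vec8_grf(5, 0).
 */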
1523 void
1524 vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
1525 bool interleaved)
1526 {
1527 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1528 for (int i = 0; i < 3; i++) {
1529 if (inst->src[i].file != ATTR)
1530 continue;
1531
1532 int grf = attribute_map[inst->src[i].nr + inst->src[i].reg_offset];
1533
1534 /* All attributes used in the shader need to have been assigned a
1535 * hardware register by the caller
1536 */
1537 assert(grf != 0);
1538
1539 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1540 reg.swizzle = inst->src[i].swizzle;
1541 reg.type = inst->src[i].type;
1542 if (inst->src[i].abs)
1543 reg = brw_abs(reg);
1544 if (inst->src[i].negate)
1545 reg = negate(reg);
1546
1547 inst->src[i] = reg;
1548 }
1549 }
1550 }
1551
1552 int
1553 vec4_vs_visitor::setup_attributes(int payload_reg)
1554 {
1555 int nr_attributes;
1556 int attribute_map[VERT_ATTRIB_MAX + 2];
1557 memset(attribute_map, 0, sizeof(attribute_map));
1558
1559 nr_attributes = 0;
1560 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
1561 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
1562 attribute_map[i] = payload_reg + nr_attributes;
1563 nr_attributes++;
1564 }
1565 }
1566
1567 /* VertexID is stored by the VF as the last vertex element, but we
1568 * don't represent it with a flag in inputs_read, so we call it
1569 * VERT_ATTRIB_MAX.
1570 */
1571 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
1572 vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) {
1573 attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
1574 nr_attributes++;
1575 }
1576
1577 if (vs_prog_data->uses_drawid) {
1578 attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes;
1579 nr_attributes++;
1580 }
1581
1582 lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
1583
1584 return payload_reg + vs_prog_data->nr_attributes;
1585 }
1586
1587 int
1588 vec4_visitor::setup_uniforms(int reg)
1589 {
1590 prog_data->base.dispatch_grf_start_reg = reg;
1591
1592 /* The pre-gen6 VS requires that some push constants get loaded no
1593 * matter what, or the GPU would hang.
1594 */
1595 if (devinfo->gen < 6 && this->uniforms == 0) {
1596 stage_prog_data->param =
1597 reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
1598 for (unsigned int i = 0; i < 4; i++) {
1599 unsigned int slot = this->uniforms * 4 + i;
1600 static gl_constant_value zero = { 0.0 };
1601 stage_prog_data->param[slot] = &zero;
1602 }
1603
1604 this->uniforms++;
1605 reg++;
1606 } else {
1607 reg += ALIGN(uniforms, 2) / 2;
1608 }
1609
1610 stage_prog_data->nr_params = this->uniforms * 4;
1611
1612 prog_data->base.curb_read_length =
1613 reg - prog_data->base.dispatch_grf_start_reg;
1614
1615 return reg;
1616 }
1617
1618 void
1619 vec4_vs_visitor::setup_payload(void)
1620 {
1621 int reg = 0;
1622
1623 /* The payload always contains important data in g0, which contains
1624 * the URB handles that are passed on to the URB write at the end
1625 * of the thread. So, we always start push constants at g1.
1626 */
1627 reg++;
1628
1629 reg = setup_uniforms(reg);
1630
1631 reg = setup_attributes(reg);
1632
1633 this->first_non_payload_grf = reg;
1634 }
1635
1636 bool
1637 vec4_visitor::lower_minmax()
1638 {
1639 assert(devinfo->gen < 6);
1640
1641 bool progress = false;
1642
1643 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1644 const vec4_builder ibld(this, block, inst);
1645
1646 if (inst->opcode == BRW_OPCODE_SEL &&
1647 inst->predicate == BRW_PREDICATE_NONE) {
1648 /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
1649 * the original SEL.L/GE instruction
1650 */
1651 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
1652 inst->conditional_mod);
1653 inst->predicate = BRW_PREDICATE_NORMAL;
1654 inst->conditional_mod = BRW_CONDITIONAL_NONE;
1655
1656 progress = true;
1657 }
1658 }
1659
1660 if (progress)
1661 invalidate_live_intervals();
1662
1663 return progress;
1664 }
1665
1666 src_reg
1667 vec4_visitor::get_timestamp()
1668 {
1669 assert(devinfo->gen >= 7);
1670
1671 src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1672 BRW_ARF_TIMESTAMP,
1673 0,
1674 0,
1675 0,
1676 BRW_REGISTER_TYPE_UD,
1677 BRW_VERTICAL_STRIDE_0,
1678 BRW_WIDTH_4,
1679 BRW_HORIZONTAL_STRIDE_4,
1680 BRW_SWIZZLE_XYZW,
1681 WRITEMASK_XYZW));
1682
1683 dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
1684
1685 vec4_instruction *mov = emit(MOV(dst, ts));
1686 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1687 * even if it's not enabled in the dispatch.
1688 */
1689 mov->force_writemask_all = true;
1690
1691 return src_reg(dst);
1692 }
1693
1694 void
1695 vec4_visitor::emit_shader_time_begin()
1696 {
1697 current_annotation = "shader time start";
1698 shader_start_time = get_timestamp();
1699 }
1700
1701 void
1702 vec4_visitor::emit_shader_time_end()
1703 {
1704 current_annotation = "shader time end";
1705 src_reg shader_end_time = get_timestamp();
1706
1707
1708 /* Check that there weren't any timestamp reset events (assuming these
1709 * were the only two timestamp reads that happened).
1710 */
1711 src_reg reset_end = shader_end_time;
1712 reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
1713 vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u)));
1714 test->conditional_mod = BRW_CONDITIONAL_Z;
1715
1716 emit(IF(BRW_PREDICATE_NORMAL));
1717
1718 /* Take the current timestamp and get the delta. */
1719 shader_start_time.negate = true;
1720 dst_reg diff = dst_reg(this, glsl_type::uint_type);
1721 emit(ADD(diff, shader_start_time, shader_end_time));
1722
1723 /* If there were no instructions between the two timestamp gets, the diff
1724 * is 2 cycles. Remove that overhead, so I can forget about that when
1725 * trying to determine the time taken for single instructions.
1726 */
1727 emit(ADD(diff, src_reg(diff), brw_imm_ud(-2u)));
1728
1729 emit_shader_time_write(0, src_reg(diff));
1730 emit_shader_time_write(1, brw_imm_ud(1u));
1731 emit(BRW_OPCODE_ELSE);
1732 emit_shader_time_write(2, brw_imm_ud(1u));
1733 emit(BRW_OPCODE_ENDIF);
1734 }
1735
1736 void
1737 vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
1738 {
1739 dst_reg dst =
1740 dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
1741
1742 dst_reg offset = dst;
1743 dst_reg time = dst;
1744 time.reg_offset++;
1745
1746 offset.type = BRW_REGISTER_TYPE_UD;
1747 int index = shader_time_index * 3 + shader_time_subindex;
1748 emit(MOV(offset, brw_imm_d(index * SHADER_TIME_STRIDE)));
1749
1750 time.type = BRW_REGISTER_TYPE_UD;
1751 emit(MOV(time, value));
1752
1753 vec4_instruction *inst =
1754 emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
1755 inst->mlen = 2;
1756 }
1757
1758 void
1759 vec4_visitor::convert_to_hw_regs()
1760 {
1761 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1762 for (int i = 0; i < 3; i++) {
1763 struct src_reg &src = inst->src[i];
1764 struct brw_reg reg;
1765 switch (src.file) {
1766 case VGRF:
1767 reg = brw_vec8_grf(src.nr + src.reg_offset, 0);
1768 reg.type = src.type;
1769 reg.swizzle = src.swizzle;
1770 reg.abs = src.abs;
1771 reg.negate = src.negate;
1772 break;
1773
1774 case UNIFORM:
1775 reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
1776 (src.nr + src.reg_offset) / 2,
1777 ((src.nr + src.reg_offset) % 2) * 4),
1778 0, 4, 1);
1779 reg.type = src.type;
1780 reg.swizzle = src.swizzle;
1781 reg.abs = src.abs;
1782 reg.negate = src.negate;
1783
1784 /* This should have been moved to pull constants. */
1785 assert(!src.reladdr);
1786 break;
1787
1788 case ARF:
1789 case FIXED_GRF:
1790 case IMM:
1791 continue;
1792
1793 case BAD_FILE:
1794 /* Probably unused. */
1795 reg = brw_null_reg();
1796 break;
1797
1798 case MRF:
1799 case ATTR:
1800 unreachable("not reached");
1801 }
1802
1803 src = reg;
1804 }
1805
1806 if (inst->is_3src()) {
1807 /* 3-src instructions with scalar sources support arbitrary subnr,
1808 * but don't actually use swizzles. Convert swizzle into subnr.
1809 */
1810 for (int i = 0; i < 3; i++) {
1811 if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
1812 assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
1813 inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
1814 }
1815 }
1816 }
1817
1818 dst_reg &dst = inst->dst;
1819 struct brw_reg reg;
1820
1821 switch (inst->dst.file) {
1822 case VGRF:
1823 reg = brw_vec8_grf(dst.nr + dst.reg_offset, 0);
1824 reg.type = dst.type;
1825 reg.writemask = dst.writemask;
1826 break;
1827
1828 case MRF:
1829 assert(((dst.nr + dst.reg_offset) & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
1830 reg = brw_message_reg(dst.nr + dst.reg_offset);
1831 reg.type = dst.type;
1832 reg.writemask = dst.writemask;
1833 break;
1834
1835 case ARF:
1836 case FIXED_GRF:
1837 reg = dst.as_brw_reg();
1838 break;
1839
1840 case BAD_FILE:
1841 reg = brw_null_reg();
1842 break;
1843
1844 case IMM:
1845 case ATTR:
1846 case UNIFORM:
1847 unreachable("not reached");
1848 }
1849
1850 dst = reg;
1851 }
1852 }
1853
1854 bool
1855 vec4_visitor::run()
1856 {
1857 if (shader_time_index >= 0)
1858 emit_shader_time_begin();
1859
1860 emit_prolog();
1861
1862 emit_nir_code();
1863 if (failed)
1864 return false;
1865 base_ir = NULL;
1866
1867 emit_thread_end();
1868
1869 calculate_cfg();
1870
1871 /* Before any optimization, push array accesses out to scratch
1872 * space where we need them to be. This pass may allocate new
1873 * virtual GRFs, so we want to do it early. It also makes sure
1874 * that we have reladdr computations available for CSE, since we'll
1875 * often do repeated subexpressions for those.
1876 */
1877 move_grf_array_access_to_scratch();
1878 move_uniform_array_access_to_pull_constants();
1879
1880 pack_uniform_registers();
1881 move_push_constants_to_pull_constants();
1882 split_virtual_grfs();
1883
1884 #define OPT(pass, args...) ({ \
1885 pass_num++; \
1886 bool this_progress = pass(args); \
1887 \
1888 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
1889 char filename[64]; \
1890 snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \
1891 stage_abbrev, nir->info.name, iteration, pass_num); \
1892 \
1893 backend_shader::dump_instructions(filename); \
1894 } \
1895 \
1896 progress = progress || this_progress; \
1897 this_progress; \
1898 })
1899
1900
1901 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
1902 char filename[64];
1903 snprintf(filename, 64, "%s-%s-00-00-start",
1904 stage_abbrev, nir->info.name);
1905
1906 backend_shader::dump_instructions(filename);
1907 }
1908
1909 bool progress;
1910 int iteration = 0;
1911 int pass_num = 0;
1912 do {
1913 progress = false;
1914 pass_num = 0;
1915 iteration++;
1916
1917 OPT(opt_predicated_break, this);
1918 OPT(opt_reduce_swizzle);
1919 OPT(dead_code_eliminate);
1920 OPT(dead_control_flow_eliminate, this);
1921 OPT(opt_copy_propagation);
1922 OPT(opt_cmod_propagation);
1923 OPT(opt_cse);
1924 OPT(opt_algebraic);
1925 OPT(opt_register_coalesce);
1926 OPT(eliminate_find_live_channel);
1927 } while (progress);
1928
1929 pass_num = 0;
1930
1931 if (OPT(opt_vector_float)) {
1932 OPT(opt_cse);
1933 OPT(opt_copy_propagation, false);
1934 OPT(opt_copy_propagation, true);
1935 OPT(dead_code_eliminate);
1936 }
1937
1938 if (devinfo->gen <= 5 && OPT(lower_minmax)) {
1939 OPT(opt_cmod_propagation);
1940 OPT(opt_cse);
1941 OPT(opt_copy_propagation);
1942 OPT(dead_code_eliminate);
1943 }
1944
1945 if (failed)
1946 return false;
1947
1948 setup_payload();
1949
1950 if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
1951 /* Debug of register spilling: Go spill everything. */
1952 const int grf_count = alloc.count;
1953 float spill_costs[alloc.count];
1954 bool no_spill[alloc.count];
1955 evaluate_spill_costs(spill_costs, no_spill);
1956 for (int i = 0; i < grf_count; i++) {
1957 if (no_spill[i])
1958 continue;
1959 spill_reg(i);
1960 }
1961 }
1962
1963 bool allocated_without_spills = reg_allocate();
1964
1965 if (!allocated_without_spills) {
1966 compiler->shader_perf_log(log_data,
1967 "%s shader triggered register spilling. "
1968 "Try reducing the number of live vec4 values "
1969 "to improve performance.\n",
1970 stage_name);
1971
1972 while (!reg_allocate()) {
1973 if (failed)
1974 return false;
1975 }
1976 }
1977
1978 opt_schedule_instructions();
1979
1980 opt_set_dependency_control();
1981
1982 convert_to_hw_regs();
1983
1984 if (last_scratch > 0) {
1985 prog_data->base.total_scratch =
1986 brw_get_scratch_size(last_scratch * REG_SIZE);
1987 }
1988
1989 return !failed;
1990 }
1991
1992 } /* namespace brw */
1993
1994 extern "C" {
1995
1996 /**
1997 * Compile a vertex shader.
1998 *
1999 * Returns the final assembly and the program's size.
2000 */
2001 const unsigned *
2002 brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
2003 void *mem_ctx,
2004 const struct brw_vs_prog_key *key,
2005 struct brw_vs_prog_data *prog_data,
2006 const nir_shader *src_shader,
2007 gl_clip_plane *clip_planes,
2008 bool use_legacy_snorm_formula,
2009 int shader_time_index,
2010 unsigned *final_assembly_size,
2011 char **error_str)
2012 {
2013 const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
2014 nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
2015 shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
2016 is_scalar);
2017 brw_nir_lower_vs_inputs(shader, compiler->devinfo, is_scalar,
2018 use_legacy_snorm_formula, key->gl_attrib_wa_flags);
2019 brw_nir_lower_vue_outputs(shader, is_scalar);
2020 shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar);
2021
2022 const unsigned *assembly = NULL;
2023
2024 unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
2025
2026 /* gl_VertexID and gl_InstanceID are system values, but arrive via an
2027 * incoming vertex attribute. So, add an extra slot.
2028 */
2029 if (shader->info.system_values_read &
2030 (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
2031 BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
2032 BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
2033 BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
2034 nr_attributes++;
2035 }
2036
2037 /* gl_DrawID has its very own vec4 */
2038 if (shader->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
2039 nr_attributes++;
2040 }
2041
2042 /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
2043 * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
2044 * vec4 mode, the hardware appears to wedge unless we read something.
2045 */
2046 if (is_scalar)
2047 prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
2048 else
2049 prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);
2050
2051 prog_data->nr_attributes = nr_attributes;
2052
2053 /* Since vertex shaders reuse the same VUE entry for inputs and outputs
2054 * (overwriting the original contents), we need to make sure the size is
2055 * the larger of the two.
2056 */
2057 const unsigned vue_entries =
2058 MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots);
2059
2060 if (compiler->devinfo->gen == 6)
2061 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
2062 else
2063 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
2064
2065 if (is_scalar) {
2066 prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
2067
2068 fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
2069 NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
2070 shader, 8, shader_time_index);
2071 if (!v.run_vs(clip_planes)) {
2072 if (error_str)
2073 *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2074
2075 return NULL;
2076 }
2077
2078 fs_generator g(compiler, log_data, mem_ctx, (void *) key,
2079 &prog_data->base.base, v.promoted_constants,
2080 v.runtime_check_aads_emit, MESA_SHADER_VERTEX);
2081 if (INTEL_DEBUG & DEBUG_VS) {
2082 const char *debug_name =
2083 ralloc_asprintf(mem_ctx, "%s vertex shader %s",
2084 shader->info.label ? shader->info.label : "unnamed",
2085 shader->info.name);
2086
2087 g.enable_debug(debug_name);
2088 }
2089 g.generate_code(v.cfg, 8);
2090 assembly = g.get_assembly(final_assembly_size);
2091 }
2092
2093 if (!assembly) {
2094 prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
2095
2096 vec4_vs_visitor v(compiler, log_data, key, prog_data,
2097 shader, clip_planes, mem_ctx,
2098 shader_time_index, use_legacy_snorm_formula);
2099 if (!v.run()) {
2100 if (error_str)
2101 *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2102
2103 return NULL;
2104 }
2105
2106 assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
2107 shader, &prog_data->base, v.cfg,
2108 final_assembly_size);
2109 }
2110
2111 return assembly;
2112 }
2113
2114 } /* extern "C" */