i965: Use immediate storage in inherited brw_reg.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4.cpp
/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_vs.h"
#include "brw_nir.h"
#include "brw_vec4_live_variables.h"
#include "brw_dead_control_flow.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}
#include "main/context.h"

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   else
      this->swizzle = BRW_SWIZZLE_XYZW;
   if (type)
      this->type = brw_type_for_base_type(type);
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->f = f;
}

src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->ud = u;
}

src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->d = i;
}

src_reg::src_reg(uint8_t vf[4])
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->ud, vf, sizeof(unsigned));
}

src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->ud = (vf0 <<  0) |
              (vf1 <<  8) |
              (vf2 << 16) |
              (vf3 << 24);
}
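
/* A rough sketch of the 8-bit restricted-float (VF) encoding assumed by the
 * two constructors above: 1 sign bit, 3 exponent bits (bias 3) and 4
 * mantissa bits, with 0x00 meaning 0.0.  Under that assumption,
 *
 *    src_reg(0x00, 0x30, 0x40, 0x48)
 *
 * packs the vector (0.0f, 1.0f, 2.0f, 3.0f) into a single UD immediate.
 */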

src_reg::src_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}

src_reg::src_reg(const dst_reg &reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
   this->swizzle = brw_swizzle_for_mask(reg.writemask);
}
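
/* Note: brw_swizzle_for_mask() above is assumed to pick a swizzle that only
 * reads channels the writemask actually wrote, e.g. a .xz writemask maps to
 * the .xxzz swizzle (unset channels replicating the most recent written
 * channel), so converting a dst_reg to a src_reg never reads stale data.
 */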

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(register_file file, int reg, brw_reg_type type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = type;
   this->writemask = writemask;
}

dst_reg::dst_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}

dst_reg::dst_reg(const src_reg &reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = brw_mask_for_swizzle(reg.swizzle);
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}

bool
dst_reg::equals(const dst_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           writemask == r.writemask &&
           (reladdr == r.reladdr ||
            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
           (file != HW_REG ||
            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                   sizeof(fixed_hw_reg)) == 0));
}

bool
vec4_instruction::is_send_from_grf()
{
   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return true;
   default:
      return false;
   }
}

unsigned
vec4_instruction::regs_read(unsigned arg) const
{
   if (src[arg].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return arg == 0 ? mlen : 1;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      return arg == 1 ? mlen : 1;

   default:
      return 1;
   }
}

bool
vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
vec4_instruction::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate));
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0 || inst->is_send_from_grf())
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 3;
   case GS_OPCODE_URB_WRITE:
   case GS_OPCODE_URB_WRITE_ALLOCATE:
   case GS_OPCODE_THREAD_END:
      return 0;
   case GS_OPCODE_FF_SYNC:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_SAMPLEINFO:
   case VS_OPCODE_GET_BUFFER_SIZE:
      return inst->header_size;
   default:
      unreachable("not reached");
   }
}

bool
src_reg::equals(const src_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           swizzle == r.swizzle &&
           !reladdr && !r.reladdr &&
           (file != HW_REG ||
            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                   sizeof(fixed_hw_reg)) == 0) &&
           (file != IMM || d == r.d));
}

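/**
 * Collapses sequences of immediate scalar MOVs that write disjoint channels
 * of the same destination into a single MOV of a vector-float (VF)
 * immediate.
 *
 * A sketch of the transformation (assuming every value is representable in
 * the 8-bit VF format; brw_float_to_vf() returns -1 otherwise and the
 * sequence is left alone):
 *
 *    mov vgrf3.x:F, 1.0F
 *    mov vgrf3.y:F, 2.0F
 *    mov vgrf3.z:F, 3.0F
 *    mov vgrf3.w:F, 4.0F
 * ->
 *    mov vgrf3.xyzw:F, [1F, 2F, 3F, 4F]VF
 */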
bool
vec4_visitor::opt_vector_float()
{
   bool progress = false;

   int last_reg = -1, last_reg_offset = -1;
   enum register_file last_reg_file = BAD_FILE;

   int remaining_channels = 0;
   uint8_t imm[4];
   int inst_count = 0;
   vec4_instruction *imm_inst[4];

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (last_reg != inst->dst.reg ||
          last_reg_offset != inst->dst.reg_offset ||
          last_reg_file != inst->dst.file) {
         last_reg = inst->dst.reg;
         last_reg_offset = inst->dst.reg_offset;
         last_reg_file = inst->dst.file;
         remaining_channels = WRITEMASK_XYZW;

         inst_count = 0;
      }

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->dst.writemask == WRITEMASK_XYZW ||
          inst->src[0].file != IMM)
         continue;

      int vf = brw_float_to_vf(inst->src[0].f);
      if (vf == -1)
         continue;

      if ((inst->dst.writemask & WRITEMASK_X) != 0)
         imm[0] = vf;
      if ((inst->dst.writemask & WRITEMASK_Y) != 0)
         imm[1] = vf;
      if ((inst->dst.writemask & WRITEMASK_Z) != 0)
         imm[2] = vf;
      if ((inst->dst.writemask & WRITEMASK_W) != 0)
         imm[3] = vf;

      imm_inst[inst_count++] = inst;

      remaining_channels &= ~inst->dst.writemask;
      if (remaining_channels == 0) {
         vec4_instruction *mov = MOV(inst->dst, imm);
         mov->dst.type = BRW_REGISTER_TYPE_F;
         mov->dst.writemask = WRITEMASK_XYZW;
         inst->insert_after(block, mov);
         last_reg = -1;

         for (int i = 0; i < inst_count; i++) {
            imm_inst[i]->remove(block);
         }
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/* Replaces unused channels of a swizzle with channels that are used.
 *
 * For instance, this pass transforms
 *
 *    mov vgrf4.yz, vgrf5.wxzy
 *
 * into
 *
 *    mov vgrf4.yz, vgrf5.xxzx
 *
 * This eliminates false uses of some channels, letting dead code elimination
 * remove the instructions that wrote them.
 */
bool
vec4_visitor::opt_reduce_swizzle()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
          inst->is_send_from_grf())
         continue;

      unsigned swizzle;

      /* Determine which channels of the sources are read. */
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
                            *           but all four of src1.
                            */
         swizzle = brw_swizzle_for_size(4);
         break;
      case BRW_OPCODE_DP3:
         swizzle = brw_swizzle_for_size(3);
         break;
      case BRW_OPCODE_DP2:
         swizzle = brw_swizzle_for_size(2);
         break;
      default:
         swizzle = brw_swizzle_for_mask(inst->dst.writemask);
         break;
      }

      /* Update sources' swizzles. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != GRF &&
             inst->src[i].file != ATTR &&
             inst->src[i].file != UNIFORM)
            continue;

         const unsigned new_swizzle =
            brw_compose_swizzle(swizzle, inst->src[i].swizzle);
         if (inst->src[i].swizzle != new_swizzle) {
            inst->src[i].swizzle = new_swizzle;
            progress = true;
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over).  Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector.  The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}

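/**
 * Packs the live uniform vec4s together in the push-constant array so that
 * unused channels don't take up space.
 *
 * A sketch: if u0 uses only .xy and u1 uses only .x, u1's value is moved
 * into u0's free .z channel and every read of u1 is rewritten as a read of
 * u0 with its swizzle shifted over by two channels.
 */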
void
vec4_visitor::pack_uniform_registers()
{
   uint8_t chans_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(chans_used, 0, sizeof(chans_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program.  We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      unsigned readmask;
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH:
         readmask = 0xf;
         break;
      case BRW_OPCODE_DP3:
         readmask = 0x7;
         break;
      case BRW_OPCODE_DP2:
         readmask = 0x3;
         break;
      default:
         readmask = inst->dst.writemask;
         break;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int reg = inst->src[i].reg;
         for (int c = 0; c < 4; c++) {
            if (!(readmask & (1 << c)))
               continue;

            chans_used[reg] = MAX2(chans_used[reg],
                                   BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
         }
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      assert(src < uniform_array_size);
      int size = chans_used[src];

      if (size == 0)
         continue;

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (chans_used[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = chans_used[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
               stage_prog_data->param[src * 4 + j];
         }

         chans_used[dst] += size;
         chans_used[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];
         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
                                              new_chan[src], new_chan[src]);
      }
   }
}

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons.  One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case VEC4_OPCODE_UNPACK_UNIFORM:
         if (inst->src[0].file != UNIFORM) {
            inst->opcode = BRW_OPCODE_MOV;
            progress = true;
         }
         break;

      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               unreachable("not reached");
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      case BRW_OPCODE_CMP:
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case SHADER_OPCODE_RCP: {
         vec4_instruction *prev = (vec4_instruction *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(src_reg(prev->dst))) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         if (is_uniform(inst->src[0]) ||
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const gl_constant_value **values = &stage_prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (stage_prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(stage_prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(block, inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}

/* Conditions for which we want to avoid setting the dependency control bits */
bool
vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
{
#define IS_DWORD(reg) \
   (reg.type == BRW_REGISTER_TYPE_UD || \
    reg.type == BRW_REGISTER_TYPE_D)

   /* "When source or destination datatype is 64b or operation is integer DWord
    * multiply, DepCtrl must not be used."
    * May apply to future SoCs as well.
    */
   if (devinfo->is_cherryview) {
      if (inst->opcode == BRW_OPCODE_MUL &&
          IS_DWORD(inst->src[0]) &&
          IS_DWORD(inst->src[1]))
         return true;
   }
#undef IS_DWORD

   if (devinfo->gen >= 8) {
      if (inst->opcode == BRW_OPCODE_F32TO16)
         return true;
   }

   /*
    * mlen:
    * In the presence of send messages, totally interrupt dependency
    * control.  They're long enough that the chance of dependency
    * control around them just doesn't matter.
    *
    * predicate:
    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
    * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
    * completes the scoreboard clear must have a non-zero execution mask. This
    * means, if any kind of predication can change the execution mask or channel
    * enable of the last instruction, the optimization must be avoided. This is
    * to avoid instructions being shot down the pipeline when no writes are
    * required.
    *
    * math:
    * Dependency control does not work well over math instructions.
    * NB: Discovered empirically
    */
   return (inst->mlen || inst->predicate || inst->is_math());
}

/**
 * Sets the dependency control fields on instructions after register
 * allocation and before the generator is run.
 *
 * When you have a sequence of instructions like:
 *
 *   DP4 temp.x vertex uniform[0]
 *   DP4 temp.y vertex uniform[0]
 *   DP4 temp.z vertex uniform[0]
 *   DP4 temp.w vertex uniform[0]
 *
 * The hardware doesn't know that it can actually run the later instructions
 * while the previous ones are in flight, producing stalls.  However, we have
 * manual fields we can set in the instructions that let it do so.
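 *
 * A sketch of the intended result (the actual bits are emitted by the
 * generator from the no_dd_clear/no_dd_check flags set here):
 *
 *   DP4 temp.x vertex uniform[0]   { NoDDClr }
 *   DP4 temp.y vertex uniform[0]   { NoDDClr, NoDDChk }
 *   DP4 temp.z vertex uniform[0]   { NoDDClr, NoDDChk }
 *   DP4 temp.w vertex uniform[0]   { NoDDChk }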
 */
void
vec4_visitor::opt_set_dependency_control()
{
   vec4_instruction *last_grf_write[BRW_MAX_GRF];
   uint8_t grf_channels_written[BRW_MAX_GRF];
   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
   uint8_t mrf_channels_written[BRW_MAX_GRF];

   assert(prog_data->total_grf ||
          !"Must be called after register allocation");

   foreach_block (block, cfg) {
      memset(last_grf_write, 0, sizeof(last_grf_write));
      memset(last_mrf_write, 0, sizeof(last_mrf_write));

      foreach_inst_in_block (vec4_instruction, inst, block) {
         /* If we read from a register that we were doing dependency control
          * on, don't do dependency control across the read.
          */
         for (int i = 0; i < 3; i++) {
            int reg = inst->src[i].reg + inst->src[i].reg_offset;
            if (inst->src[i].file == GRF) {
               last_grf_write[reg] = NULL;
            } else if (inst->src[i].file == HW_REG) {
               memset(last_grf_write, 0, sizeof(last_grf_write));
               break;
            }
            assert(inst->src[i].file != MRF);
         }

         if (is_dep_ctrl_unsafe(inst)) {
            memset(last_grf_write, 0, sizeof(last_grf_write));
            memset(last_mrf_write, 0, sizeof(last_mrf_write));
            continue;
         }

         /* Now, see if we can do dependency control for this instruction
          * against a previous one writing to its destination.
          */
         int reg = inst->dst.reg + inst->dst.reg_offset;
         if (inst->dst.file == GRF) {
            if (last_grf_write[reg] &&
                !(inst->dst.writemask & grf_channels_written[reg])) {
               last_grf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               grf_channels_written[reg] = 0;
            }

            last_grf_write[reg] = inst;
            grf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == MRF) {
            if (last_mrf_write[reg] &&
                !(inst->dst.writemask & mrf_channels_written[reg])) {
               last_mrf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               mrf_channels_written[reg] = 0;
            }

            last_mrf_write[reg] = inst;
            mrf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == HW_REG) {
            if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
               memset(last_grf_write, 0, sizeof(last_grf_write));
            if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
               memset(last_mrf_write, 0, sizeof(last_mrf_write));
         }
      }
   }
}

bool
vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo,
                                int dst_writemask,
                                int swizzle,
                                int swizzle_mask)
{
   /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
    * or writemasking are not allowed.
    */
   if (devinfo->gen == 6 && is_math() &&
       (swizzle != BRW_SWIZZLE_XYZW || dst_writemask != WRITEMASK_XYZW))
      return false;

   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   if (mlen > 0)
      return false;

   /* We can't use swizzles on the accumulator and that's really the only
    * HW_REG we would care to reswizzle so just disallow them all.
    */
   for (int i = 0; i < 3; i++) {
      if (src[i].file == HW_REG)
         return false;
   }

   return true;
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy c.yy_x
 */
void
vec4_instruction::reswizzle(int dst_writemask, int swizzle)
{
   /* Destination write mask doesn't correspond to source swizzle for the dot
    * product and pack_bytes instructions.
    */
   if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
       opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
       opcode != VEC4_OPCODE_PACK_BYTES) {
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BAD_FILE || src[i].file == IMM)
            continue;

         src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
      }
   }

   /* Apply the specified swizzle and writemask to the original mask of
    * written components.
    */
   dst.writemask = dst_writemask &
                   brw_apply_swizzle_to_mask(swizzle, dst.writemask);
}

/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 * just written and then MOVed into another reg and making the original write
 * of the GRF write directly to the final destination instead.
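 *
 * A sketch of the rewrite:
 *
 *    mul vgrf4.xyzw:F, vgrf2.xyzw:F, vgrf3.xyzw:F
 *    mov m1.xyzw:F, vgrf4.xyzw:F
 * ->
 *    mul m1.xyzw:F, vgrf2.xyzw:F, vgrf3.xyzw:F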
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      /* Remove no-op MOVs */
      if (inst->dst.file == inst->src[0].file &&
          inst->dst.reg == inst->src[0].reg &&
          inst->dst.reg_offset == inst->src[0].reg_offset) {
         bool is_nop_mov = true;

         for (unsigned c = 0; c < 4; c++) {
            if ((inst->dst.writemask & (1 << c)) == 0)
               continue;

            if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
               is_nop_mov = false;
               break;
            }
         }

         if (is_nop_mov) {
            inst->remove(block);
            continue;
         }
      }

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* This doesn't handle saturation on the instruction we
             * want to coalesce away if the register types do not match.
             * But if scan_inst is a non type-converting 'mov', we can fix
             * the types later.
             */
            if (inst->saturate &&
                inst->dst.type != scan_inst->dst.type &&
                !(scan_inst->opcode == BRW_OPCODE_MOV &&
                  scan_inst->dst.type == scan_inst->src[0].type))
               break;

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
               interfered = true;
         }
         if (interfered)
            break;

         /* If somebody else writes the same channels of our destination here,
          * we can't coalesce before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written) &&
             (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               if (inst->saturate &&
                   inst->dst.type != scan_inst->dst.type) {
                  /* If we have reached this point, scan_inst is a non
                   * type-converting 'mov' and we can modify its register types
                   * to match the ones in inst.  Otherwise, we could have an
                   * incorrect saturation result.
                   */
                  scan_inst->dst.type = inst->dst.type;
                  scan_inst->src[0].type = inst->src[0].type;
               }
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
vec4_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = src_reg(0);
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.  But when it
 * comes to optimization, we'd like to treat each register as individual
 * storage if possible.
 *
 * So far, the only thing that might prevent splitting is a send message from
 * a GRF on IVB.
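 *
 * As a sketch, a size-3 vgrf1 keeps its offset-0 slot and gets two freshly
 * allocated size-1 registers for offsets 1 and 2; an access to vgrf1 at
 * reg_offset 2 is then rewritten to the second new register at reg_offset 0.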
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->alloc.sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're
    * trying to split.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && inst->regs_written > 1)
         split_grf[inst->dst.reg] = false;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && inst->regs_read(i) > 1)
            split_grf[inst->src[i].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      new_virtual_grf[i] = alloc.allocate(1);
      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
         unsigned reg = alloc.allocate(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->alloc.sizes[i] = 1;
   }

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   vec4_instruction *inst = (vec4_instruction *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d%s) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg,
              pred_ctrl_align16[inst->predicate]);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case IMM:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      fprintf(file, ".");
      if (inst->dst.writemask & 1)
         fprintf(file, "x");
      if (inst->dst.writemask & 2)
         fprintf(file, "y");
      if (inst->dst.writemask & 4)
         fprintf(file, "z");
      if (inst->dst.writemask & 8)
         fprintf(file, "w");
   }
   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));

   if (inst->src[0].file != BAD_FILE)
      fprintf(file, ", ");

   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg);
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%fF", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dD", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uU", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case MRF:
         unreachable("not reached");
      }

      /* Don't print .0; and only VGRFs have reg_offsets and sizes */
      if (inst->src[i].reg_offset != 0 &&
          inst->src[i].file == GRF &&
          alloc.sizes[inst->src[i].reg] != 1)
         fprintf(file, ".%d", inst->src[i].reg_offset);

      if (inst->src[i].file != IMM) {
         static const char *chans[4] = {"x", "y", "z", "w"};
         fprintf(file, ".");
         for (int c = 0; c < 4; c++) {
            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
         }
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   if (inst->force_writemask_all)
      fprintf(file, " NoMask");

   fprintf(file, "\n");
}


static inline struct brw_reg
attribute_to_hw_reg(int attr, bool interleaved)
{
   if (interleaved)
      return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
   else
      return brw_vec8_grf(attr, 0);
}
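
/* For example (assuming brw_vec4_grf()/brw_vec8_grf() behave as their names
 * suggest), attribute_to_hw_reg(5, true) yields the second vec4 of g2
 * (g2.4<0,4,1>), while attribute_to_hw_reg(5, false) yields all of g5.
 */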


/**
 * Replace each register of type ATTR in this->instructions with a reference
 * to a fixed HW register.
 *
 * If interleaved is true, then each attribute takes up half a register, with
 * register N containing attribute 2*N in its first half and attribute 2*N+1
 * in its second half (this corresponds to the payload setup used by geometry
 * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
 * false, then each attribute takes up a whole register, with register N
 * containing attribute N (this corresponds to the payload setup used by
 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
 */
void
vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
                                          bool interleaved)
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.type = inst->dst.type;
         reg.writemask = inst->dst.writemask;

         inst->dst.file = HW_REG;
         inst->dst.fixed_hw_reg = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg = reg;
      }
   }
}

int
vec4_vs_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 1];
   memset(attribute_map, 0, sizeof(attribute_map));

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
   }

   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);

   return payload_reg + vs_prog_data->nr_attributes;
}

int
vec4_visitor::setup_uniforms(int reg)
{
   prog_data->base.dispatch_grf_start_reg = reg;

   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (devinfo->gen < 6 && this->uniforms == 0) {
      assert(this->uniforms < this->uniform_array_size);

      stage_prog_data->param =
         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static gl_constant_value zero = { 0.0 };
         stage_prog_data->param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
      reg += ALIGN(uniforms, 2) / 2;
   }

   stage_prog_data->nr_params = this->uniforms * 4;

   prog_data->base.curb_read_length =
      reg - prog_data->base.dispatch_grf_start_reg;

   return reg;
}

void
vec4_vs_visitor::setup_payload(void)
{
   int reg = 0;

   /* The payload always contains important data in g0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread.  So, we always start push constants at g1.
    */
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_attributes(reg);

   this->first_non_payload_grf = reg;
}

src_reg
vec4_visitor::get_timestamp()
{
   assert(devinfo->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                0,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();


   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   emit_shader_time_write(0, src_reg(diff));
   emit_shader_time_write(1, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(2, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
{
   dst_reg dst =
      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));

   dst_reg offset = dst;
   dst_reg time = dst;
   time.reg_offset++;

   offset.type = BRW_REGISTER_TYPE_UD;
   int index = shader_time_index * 3 + shader_time_subindex;
   emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));

   time.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time, value));

   vec4_instruction *inst =
      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
   inst->mlen = 2;
}

void
vec4_visitor::convert_to_hw_regs()
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         struct src_reg &src = inst->src[i];
         struct brw_reg reg;
         switch (src.file) {
         case GRF:
            reg = brw_vec8_grf(src.reg + src.reg_offset, 0);
            reg.type = src.type;
            reg.swizzle = src.swizzle;
            reg.abs = src.abs;
            reg.negate = src.negate;
            break;

         case IMM:
            reg = brw_imm_reg(src.type);
            reg.ud = src.ud;
            break;

         case UNIFORM:
            reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
                                      (src.reg + src.reg_offset) / 2,
                                      ((src.reg + src.reg_offset) % 2) * 4),
                         0, 4, 1);
            reg.type = src.type;
            reg.swizzle = src.swizzle;
            reg.abs = src.abs;
            reg.negate = src.negate;

            /* This should have been moved to pull constants. */
            assert(!src.reladdr);
            break;

         case HW_REG:
            assert(src.type == src.fixed_hw_reg.type);
            continue;

         case BAD_FILE:
            /* Probably unused. */
            reg = brw_null_reg();
            break;

         case MRF:
         case ATTR:
            unreachable("not reached");
         }
         src.fixed_hw_reg = reg;
      }

      dst_reg &dst = inst->dst;
      struct brw_reg reg;

      switch (inst->dst.file) {
      case GRF:
         reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case MRF:
         assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) <
                BRW_MAX_MRF(devinfo->gen));
         reg = brw_message_reg(dst.reg + dst.reg_offset);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case HW_REG:
         assert(dst.type == dst.fixed_hw_reg.type);
         reg = dst.fixed_hw_reg;
         break;

      case BAD_FILE:
         reg = brw_null_reg();
         break;

      case IMM:
      case ATTR:
      case UNIFORM:
         unreachable("not reached");
      }

      dst.fixed_hw_reg = reg;
   }
}

bool
vec4_visitor::run()
{
   if (shader_time_index >= 0)
      emit_shader_time_begin();

   emit_prolog();

   emit_nir_code();
   if (failed)
      return false;
   base_ir = NULL;

   emit_thread_end();

   calculate_cfg();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   move_grf_array_access_to_scratch();
   move_uniform_array_access_to_pull_constants();

   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass,              \
                  stage_abbrev, nir->info.name, iteration, pass_num);  \
                                                                       \
         backend_shader::dump_instructions(filename);                  \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })


   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%s-00-start",
               stage_abbrev, nir->info.name);

      backend_shader::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_predicated_break, this);
      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
      OPT(eliminate_find_live_channel);
   } while (progress);

   pass_num = 0;

   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   bool allocated_without_spills = reg_allocate();

   if (!allocated_without_spills) {
      compiler->shader_perf_log(log_data,
                                "%s shader triggered register spilling.  "
                                "Try reducing the number of live vec4 values "
                                "to improve performance.\n",
                                stage_name);

      while (!reg_allocate()) {
         if (failed)
            return false;
      }
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   convert_to_hw_regs();

   if (last_scratch > 0) {
      prog_data->base.total_scratch =
         brw_get_scratch_size(last_scratch * REG_SIZE);
   }

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_vs_prog_key *key,
               struct brw_vs_prog_data *prog_data,
               const nir_shader *shader,
               gl_clip_plane *clip_planes,
               bool use_legacy_snorm_formula,
               int shader_time_index,
               unsigned *final_assembly_size,
               char **error_str)
{
   const unsigned *assembly = NULL;

   unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);

   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
    * incoming vertex attribute.  So, add an extra slot.
    */
   if (shader->info.system_values_read &
       (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
      nr_attributes++;
   }

   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
    * vec4 mode, the hardware appears to wedge unless we read something.
    */
   if (compiler->scalar_vs)
      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
   else
      prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);

   prog_data->nr_attributes = nr_attributes;

   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
    * (overwriting the original contents), we need to make sure the size is
    * the larger of the two.
    */
   const unsigned vue_entries =
      MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots);

   if (compiler->devinfo->gen == 6)
      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
   else
      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);

   if (compiler->scalar_vs) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

      fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
                   NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
                   shader, 8, shader_time_index);
      if (!v.run_vs(clip_planes)) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);

         return NULL;
      }

      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                     &prog_data->base.base, v.promoted_constants,
                     v.runtime_check_aads_emit, "VS");
      if (INTEL_DEBUG & DEBUG_VS) {
         const char *debug_name =
            ralloc_asprintf(mem_ctx, "%s vertex shader %s",
                            shader->info.label ? shader->info.label : "unnamed",
                            shader->info.name);

         g.enable_debug(debug_name);
      }
      g.generate_code(v.cfg, 8);
      assembly = g.get_assembly(final_assembly_size);
   }

   if (!assembly) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

      vec4_vs_visitor v(compiler, log_data, key, prog_data,
                        shader, clip_planes, mem_ctx,
                        shader_time_index, use_legacy_snorm_formula);
      if (!v.run()) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);

         return NULL;
      }

      assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
                                            shader, &prog_data->base, v.cfg,
                                            final_assembly_size);
   }

   return assembly;
}

} /* extern "C" */