i965/fs: Migrate opt_sampler_eot to the IR builder.
src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
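      /* For example (illustrative numbers): a SIMD16 float destination with
       * stride 1 covers MAX2(16 * 1, 1) * 4 = 64 bytes, so regs_written =
       * DIV_ROUND_UP(64, 32) = 2 registers; a stride-0 (scalar) destination
       * covers only 4 bytes and rounds up to a single register.
       */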
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
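   /* For example (illustrative numbers): two header registers followed by
    * three SIMD16 payload sources gives regs_written = 2 + 3 * (16 / 8) = 8.
    */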
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
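 *
 * For example (illustrative numbers): const_offset = 14 is split into
 * 14 & ~3 = 12, which is folded into vec4_offset below, and 14 & 3 = 2,
 * which is applied to the result via offset() (scaled as needed).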
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
 427 /* The caller always wants this uncompressed, to emit the minimal extra
 428 * dependencies and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
 705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
794 /* If there were no instructions between the two timestamp gets, the diff
795 * is 2 cycles. Remove that overhead, so I can forget about that when
796 * trying to determine the time taken for single instructions.
797 */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
927 /**
928 * Returns true if the instruction has a flag that means it won't
929 * update an entire destination register.
930 *
931 * For example, dead code elimination and live variable analysis want to know
932 * when a write to a variable screens off any preceding values that were in
933 * it.
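 *
 * Illustrative cases (not exhaustive): a predicated MOV under any opcode
 * other than SEL, or a byte-typed SIMD8 write that covers only 8 of a
 * register's 32 bytes, both count as partial writes.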
934 */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
942
943 int
944 fs_inst::regs_read(int arg) const
945 {
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return inst->mlen;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
 1132 /* For SIMD16, we need to follow on from the uniform setup of the SIMD8
 1133 * dispatch. This brings in those uniform definitions.
1134 */
1135 void
1136 fs_visitor::import_uniforms(fs_visitor *v)
1137 {
1138 this->push_constant_loc = v->push_constant_loc;
1139 this->pull_constant_loc = v->pull_constant_loc;
1140 this->uniforms = v->uniforms;
1141 this->param_size = v->param_size;
1142 }
1143
1144 fs_reg *
1145 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1146 bool origin_upper_left)
1147 {
1148 assert(stage == MESA_SHADER_FRAGMENT);
1149 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1150 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1151 fs_reg wpos = *reg;
1152 bool flip = !origin_upper_left ^ key->render_to_fbo;
1153
1154 /* gl_FragCoord.x */
1155 if (pixel_center_integer) {
1156 emit(MOV(wpos, this->pixel_x));
1157 } else {
1158 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1159 }
1160 wpos = offset(wpos, 1);
1161
1162 /* gl_FragCoord.y */
1163 if (!flip && pixel_center_integer) {
1164 emit(MOV(wpos, this->pixel_y));
1165 } else {
1166 fs_reg pixel_y = this->pixel_y;
1167 float offset = (pixel_center_integer ? 0.0 : 0.5);
1168
1169 if (flip) {
1170 pixel_y.negate = true;
1171 offset += key->drawable_height - 1.0;
1172 }
1173
1174 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1175 }
1176 wpos = offset(wpos, 1);
1177
1178 /* gl_FragCoord.z */
1179 if (devinfo->gen >= 6) {
1180 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1181 } else {
1182 emit(FS_OPCODE_LINTERP, wpos,
1183 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1184 interp_reg(VARYING_SLOT_POS, 2));
1185 }
1186 wpos = offset(wpos, 1);
1187
1188 /* gl_FragCoord.w: Already set up in emit_interpolation */
1189 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1190
1191 return reg;
1192 }
1193
1194 fs_inst *
1195 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1196 glsl_interp_qualifier interpolation_mode,
1197 bool is_centroid, bool is_sample)
1198 {
1199 brw_wm_barycentric_interp_mode barycoord_mode;
1200 if (devinfo->gen >= 6) {
1201 if (is_centroid) {
1202 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1203 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1204 else
1205 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1206 } else if (is_sample) {
1207 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1208 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1209 else
1210 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1211 } else {
1212 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1213 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1214 else
1215 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1216 }
1217 } else {
1218 /* On Ironlake and below, there is only one interpolation mode.
1219 * Centroid interpolation doesn't mean anything on this hardware --
1220 * there is no multisampling.
1221 */
1222 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1223 }
1224 return emit(FS_OPCODE_LINTERP, attr,
1225 this->delta_xy[barycoord_mode], interp);
1226 }
1227
1228 void
1229 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1230 const glsl_type *type,
1231 glsl_interp_qualifier interpolation_mode,
1232 int location, bool mod_centroid,
1233 bool mod_sample)
1234 {
1235 attr.type = brw_type_for_base_type(type->get_scalar_type());
1236
1237 assert(stage == MESA_SHADER_FRAGMENT);
1238 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1239 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1240
1241 unsigned int array_elements;
1242
1243 if (type->is_array()) {
1244 array_elements = type->length;
1245 if (array_elements == 0) {
1246 fail("dereferenced array '%s' has length 0\n", name);
1247 }
1248 type = type->fields.array;
1249 } else {
1250 array_elements = 1;
1251 }
1252
1253 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1254 bool is_gl_Color =
1255 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1256 if (key->flat_shade && is_gl_Color) {
1257 interpolation_mode = INTERP_QUALIFIER_FLAT;
1258 } else {
1259 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1260 }
1261 }
1262
1263 for (unsigned int i = 0; i < array_elements; i++) {
1264 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1265 if (prog_data->urb_setup[location] == -1) {
1266 /* If there's no incoming setup data for this slot, don't
1267 * emit interpolation for it.
1268 */
1269 attr = offset(attr, type->vector_elements);
1270 location++;
1271 continue;
1272 }
1273
1274 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1275 /* Constant interpolation (flat shading) case. The SF has
1276 * handed us defined values in only the constant offset
1277 * field of the setup reg.
1278 */
1279 for (unsigned int k = 0; k < type->vector_elements; k++) {
1280 struct brw_reg interp = interp_reg(location, k);
1281 interp = suboffset(interp, 3);
1282 interp.type = attr.type;
1283 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1284 attr = offset(attr, 1);
1285 }
1286 } else {
1287 /* Smooth/noperspective interpolation case. */
1288 for (unsigned int k = 0; k < type->vector_elements; k++) {
1289 struct brw_reg interp = interp_reg(location, k);
1290 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1291 /* Get the pixel/sample mask into f0 so that we know
1292 * which pixels are lit. Then, for each channel that is
1293 * unlit, replace the centroid data with non-centroid
1294 * data.
1295 */
1296 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1297
1298 fs_inst *inst;
1299 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1300 false, false);
1301 inst->predicate = BRW_PREDICATE_NORMAL;
1302 inst->predicate_inverse = true;
1303 if (devinfo->has_pln)
1304 inst->no_dd_clear = true;
1305
1306 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1307 mod_centroid && !key->persample_shading,
1308 mod_sample || key->persample_shading);
1309 inst->predicate = BRW_PREDICATE_NORMAL;
1310 inst->predicate_inverse = false;
1311 if (devinfo->has_pln)
1312 inst->no_dd_check = true;
1313
1314 } else {
1315 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1316 mod_centroid && !key->persample_shading,
1317 mod_sample || key->persample_shading);
1318 }
1319 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1320 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1321 }
1322 attr = offset(attr, 1);
1323 }
1324
1325 }
1326 location++;
1327 }
1328 }
1329 }
1330
1331 fs_reg *
1332 fs_visitor::emit_frontfacing_interpolation()
1333 {
1334 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1335
1336 if (devinfo->gen >= 6) {
1337 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1338 * a boolean result from this (~0/true or 0/false).
1339 *
1340 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1341 * this task in only one instruction:
1342 * - a negation source modifier will flip the bit; and
1343 * - a W -> D type conversion will sign extend the bit into the high
1344 * word of the destination.
1345 *
1346 * An ASR 15 fills the low word of the destination.
1347 */
1348 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1349 g0.negate = true;
1350
1351 emit(ASR(*reg, g0, fs_reg(15)));
1352 } else {
1353 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1354 * a boolean result from this (1/true or 0/false).
1355 *
1356 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1357 * the negation source modifier to flip it. Unfortunately the SHR
1358 * instruction only operates on UD (or D with an abs source modifier)
1359 * sources without negation.
1360 *
1361 * Instead, use ASR (which will give ~0/true or 0/false).
1362 */
1363 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1364 g1_6.negate = true;
1365
1366 emit(ASR(*reg, g1_6, fs_reg(31)));
1367 }
1368
1369 return reg;
1370 }
1371
1372 void
1373 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1374 {
1375 assert(stage == MESA_SHADER_FRAGMENT);
1376 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1377 assert(dst.type == BRW_REGISTER_TYPE_F);
1378
1379 if (key->compute_pos_offset) {
1380 /* Convert int_sample_pos to floating point */
1381 emit(MOV(dst, int_sample_pos));
1382 /* Scale to the range [0, 1] */
1383 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1384 }
1385 else {
1386 /* From ARB_sample_shading specification:
1387 * "When rendering to a non-multisample buffer, or if multisample
1388 * rasterization is disabled, gl_SamplePosition will always be
 1389 * (0.5, 0.5)."
1390 */
1391 emit(MOV(dst, fs_reg(0.5f)));
1392 }
1393 }
1394
1395 fs_reg *
1396 fs_visitor::emit_samplepos_setup()
1397 {
1398 assert(devinfo->gen >= 6);
1399
1400 this->current_annotation = "compute sample position";
1401 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1402 fs_reg pos = *reg;
1403 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1404 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1405
1406 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1407 * mode will be enabled.
1408 *
1409 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1410 * R31.1:0 Position Offset X/Y for Slot[3:0]
1411 * R31.3:2 Position Offset X/Y for Slot[7:4]
1412 * .....
1413 *
1414 * The X, Y sample positions come in as bytes in thread payload. So, read
1415 * the positions using vstride=16, width=8, hstride=2.
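 *
 * Concretely (illustrative reading of the layout quoted above): the payload
 * bytes are interleaved X0 Y0 X1 Y1 ..., so the <16;8,2>:B region starting
 * at byte 0 picks up the X offsets and the same region at suboffset 1 picks
 * up the Y offsets.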
1416 */
1417 struct brw_reg sample_pos_reg =
1418 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1419 BRW_REGISTER_TYPE_B), 16, 8, 2);
1420
1421 if (dispatch_width == 8) {
1422 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1423 } else {
1424 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1425 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1426 ->force_sechalf = true;
1427 }
1428 /* Compute gl_SamplePosition.x */
1429 compute_sample_position(pos, int_sample_x);
1430 pos = offset(pos, 1);
1431 if (dispatch_width == 8) {
1432 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1433 } else {
1434 emit(MOV(half(int_sample_y, 0),
1435 fs_reg(suboffset(sample_pos_reg, 1))));
1436 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1437 ->force_sechalf = true;
1438 }
1439 /* Compute gl_SamplePosition.y */
1440 compute_sample_position(pos, int_sample_y);
1441 return reg;
1442 }
1443
1444 fs_reg *
1445 fs_visitor::emit_sampleid_setup()
1446 {
1447 assert(stage == MESA_SHADER_FRAGMENT);
1448 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1449 assert(devinfo->gen >= 6);
1450
1451 this->current_annotation = "compute sample id";
1452 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1453
1454 if (key->compute_sample_id) {
1455 fs_reg t1 = vgrf(glsl_type::int_type);
1456 fs_reg t2 = vgrf(glsl_type::int_type);
1457 t2.type = BRW_REGISTER_TYPE_UW;
1458
1459 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1460 * 8x multisampling, subspan 0 will represent sample N (where N
1461 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1462 * 7. We can find the value of N by looking at R0.0 bits 7:6
1463 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1464 * (since samples are always delivered in pairs). That is, we
1465 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1466 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1467 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1468 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1469 * populating a temporary variable with the sequence (0, 1, 2, 3),
1470 * and then reading from it using vstride=1, width=4, hstride=0.
1471 * These computations hold good for 4x multisampling as well.
1472 *
1473 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1474 * the first four slots are sample 0 of subspan 0; the next four
1475 * are sample 1 of subspan 0; the third group is sample 0 of
1476 * subspan 1, and finally sample 1 of subspan 1.
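 *
 * For example (illustrative values): with SSPI = 1 in R0.0 bits 7:6,
 * t1 = (0x40 & 0xc0) >> 5 = 2, and the SIMD8 ADD below yields
 * 2 + (0, 0, 0, 0, 1, 1, 1, 1) = (2, 2, 2, 2, 3, 3, 3, 3): subspan 0
 * carries sample 2 and subspan 1 carries sample 3.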
1477 */
1478 fs_inst *inst;
1479 inst = emit(BRW_OPCODE_AND, t1,
1480 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1481 fs_reg(0xc0));
1482 inst->force_writemask_all = true;
1483 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1484 inst->force_writemask_all = true;
1485 /* This works for both SIMD8 and SIMD16 */
1486 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1487 inst->force_writemask_all = true;
1488 /* This special instruction takes care of setting vstride=1,
1489 * width=4, hstride=0 of t2 during an ADD instruction.
1490 */
1491 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1492 } else {
1493 /* As per GL_ARB_sample_shading specification:
1494 * "When rendering to a non-multisample buffer, or if multisample
1495 * rasterization is disabled, gl_SampleID will always be zero."
1496 */
1497 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1498 }
1499
1500 return reg;
1501 }
1502
1503 void
1504 fs_visitor::resolve_source_modifiers(fs_reg *src)
1505 {
1506 if (!src->abs && !src->negate)
1507 return;
1508
1509 fs_reg temp = retype(vgrf(1), src->type);
1510 emit(MOV(temp, *src));
1511 *src = temp;
1512 }
1513
1514 fs_reg
1515 fs_visitor::fix_math_operand(fs_reg src)
1516 {
1517 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1518 * might be able to do better by doing execsize = 1 math and then
1519 * expanding that result out, but we would need to be careful with
1520 * masking.
1521 *
1522 * The hardware ignores source modifiers (negate and abs) on math
1523 * instructions, so we also move to a temp to set those up.
1524 */
1525 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1526 !src.abs && !src.negate)
1527 return src;
1528
1529 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1530 * operands to math
1531 */
1532 if (devinfo->gen >= 7 && src.file != IMM)
1533 return src;
1534
1535 fs_reg expanded = vgrf(glsl_type::float_type);
1536 expanded.type = src.type;
1537 emit(BRW_OPCODE_MOV, expanded, src);
1538 return expanded;
1539 }
1540
1541 fs_inst *
1542 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1543 {
1544 switch (opcode) {
1545 case SHADER_OPCODE_RCP:
1546 case SHADER_OPCODE_RSQ:
1547 case SHADER_OPCODE_SQRT:
1548 case SHADER_OPCODE_EXP2:
1549 case SHADER_OPCODE_LOG2:
1550 case SHADER_OPCODE_SIN:
1551 case SHADER_OPCODE_COS:
1552 break;
1553 default:
1554 unreachable("not reached: bad math opcode");
1555 }
1556
1557 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1558 * might be able to do better by doing execsize = 1 math and then
1559 * expanding that result out, but we would need to be careful with
1560 * masking.
1561 *
1562 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1563 * instructions, so we also move to a temp to set those up.
1564 */
1565 if (devinfo->gen == 6 || devinfo->gen == 7)
1566 src = fix_math_operand(src);
1567
1568 fs_inst *inst = emit(opcode, dst, src);
1569
1570 if (devinfo->gen < 6) {
1571 inst->base_mrf = 2;
1572 inst->mlen = dispatch_width / 8;
1573 }
1574
1575 return inst;
1576 }
1577
1578 fs_inst *
1579 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1580 {
1581 int base_mrf = 2;
1582 fs_inst *inst;
1583
1584 if (devinfo->gen >= 8) {
1585 inst = emit(opcode, dst, src0, src1);
1586 } else if (devinfo->gen >= 6) {
1587 src0 = fix_math_operand(src0);
1588 src1 = fix_math_operand(src1);
1589
1590 inst = emit(opcode, dst, src0, src1);
1591 } else {
1592 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1593 * "Message Payload":
1594 *
1595 * "Operand0[7]. For the INT DIV functions, this operand is the
1596 * denominator."
1597 * ...
1598 * "Operand1[7]. For the INT DIV functions, this operand is the
1599 * numerator."
1600 */
1601 bool is_int_div = opcode != SHADER_OPCODE_POW;
1602 fs_reg &op0 = is_int_div ? src1 : src0;
1603 fs_reg &op1 = is_int_div ? src0 : src1;
1604
1605 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1606 inst = emit(opcode, dst, op0, reg_null_f);
1607
1608 inst->base_mrf = base_mrf;
1609 inst->mlen = 2 * dispatch_width / 8;
1610 }
1611 return inst;
1612 }
1613
1614 void
1615 fs_visitor::emit_discard_jump()
1616 {
1617 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1618
1619 /* For performance, after a discard, jump to the end of the
1620 * shader if all relevant channels have been discarded.
1621 */
1622 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1623 discard_jump->flag_subreg = 1;
1624
1625 discard_jump->predicate = (dispatch_width == 8)
1626 ? BRW_PREDICATE_ALIGN1_ANY8H
1627 : BRW_PREDICATE_ALIGN1_ANY16H;
1628 discard_jump->predicate_inverse = true;
1629 }
1630
1631 void
1632 fs_visitor::assign_curb_setup()
1633 {
1634 if (dispatch_width == 8) {
1635 prog_data->dispatch_grf_start_reg = payload.num_regs;
1636 } else {
1637 if (stage == MESA_SHADER_FRAGMENT) {
1638 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1639 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1640 } else if (stage == MESA_SHADER_COMPUTE) {
1641 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1642 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1643 } else {
1644 unreachable("Unsupported shader type!");
1645 }
1646 }
1647
1648 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1649
1650 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1651 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1652 for (unsigned int i = 0; i < inst->sources; i++) {
1653 if (inst->src[i].file == UNIFORM) {
1654 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1655 int constant_nr;
1656 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1657 constant_nr = push_constant_loc[uniform_nr];
1658 } else {
1659 /* Section 5.11 of the OpenGL 4.1 spec says:
1660 * "Out-of-bounds reads return undefined values, which include
1661 * values from other variables of the active program or zero."
1662 * Just return the first push constant.
1663 */
1664 constant_nr = 0;
1665 }
1666
1667 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1668 constant_nr / 8,
1669 constant_nr % 8);
1670
1671 inst->src[i].file = HW_REG;
1672 inst->src[i].fixed_hw_reg = byte_offset(
1673 retype(brw_reg, inst->src[i].type),
1674 inst->src[i].subreg_offset);
1675 }
1676 }
1677 }
1678 }
1679
1680 void
1681 fs_visitor::calculate_urb_setup()
1682 {
1683 assert(stage == MESA_SHADER_FRAGMENT);
1684 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1685 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1686
1687 memset(prog_data->urb_setup, -1,
1688 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1689
1690 int urb_next = 0;
1691 /* Figure out where each of the incoming setup attributes lands. */
1692 if (devinfo->gen >= 6) {
1693 if (_mesa_bitcount_64(prog->InputsRead &
1694 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1695 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1696 * first 16 varying inputs, so we can put them wherever we want.
1697 * Just put them in order.
1698 *
1699 * This is useful because it means that (a) inputs not used by the
1700 * fragment shader won't take up valuable register space, and (b) we
1701 * won't have to recompile the fragment shader if it gets paired with
1702 * a different vertex (or geometry) shader.
1703 */
1704 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1705 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1706 BITFIELD64_BIT(i)) {
1707 prog_data->urb_setup[i] = urb_next++;
1708 }
1709 }
1710 } else {
1711 /* We have enough input varyings that the SF/SBE pipeline stage can't
1712 * arbitrarily rearrange them to suit our whim; we have to put them
1713 * in an order that matches the output of the previous pipeline stage
1714 * (geometry or vertex shader).
1715 */
1716 struct brw_vue_map prev_stage_vue_map;
1717 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1718 key->input_slots_valid);
1719 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1720 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1721 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1722 slot++) {
1723 int varying = prev_stage_vue_map.slot_to_varying[slot];
1724 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1725 * unused.
1726 */
1727 if (varying != BRW_VARYING_SLOT_COUNT &&
1728 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1729 BITFIELD64_BIT(varying))) {
1730 prog_data->urb_setup[varying] = slot - first_slot;
1731 }
1732 }
1733 urb_next = prev_stage_vue_map.num_slots - first_slot;
1734 }
1735 } else {
1736 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1737 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1738 /* Point size is packed into the header, not as a general attribute */
1739 if (i == VARYING_SLOT_PSIZ)
1740 continue;
1741
1742 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1743 /* The back color slot is skipped when the front color is
1744 * also written to. In addition, some slots can be
1745 * written in the vertex shader and not read in the
1746 * fragment shader. So the register number must always be
1747 * incremented, mapped or not.
1748 */
1749 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1750 prog_data->urb_setup[i] = urb_next;
1751 urb_next++;
1752 }
1753 }
1754
1755 /*
 1756 * This is an FS-only attribute, and we did the interpolation for it in the
 1757 * SF thread, so count it here, too.
1758 *
1759 * See compile_sf_prog() for more info.
1760 */
1761 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1762 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1763 }
1764
1765 prog_data->num_varying_inputs = urb_next;
1766 }
1767
1768 void
1769 fs_visitor::assign_urb_setup()
1770 {
1771 assert(stage == MESA_SHADER_FRAGMENT);
1772 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1773
1774 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1775
1776 /* Offset all the urb_setup[] index by the actual position of the
1777 * setup regs, now that the location of the constants has been chosen.
1778 */
1779 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1780 if (inst->opcode == FS_OPCODE_LINTERP) {
1781 assert(inst->src[1].file == HW_REG);
1782 inst->src[1].fixed_hw_reg.nr += urb_start;
1783 }
1784
1785 if (inst->opcode == FS_OPCODE_CINTERP) {
1786 assert(inst->src[0].file == HW_REG);
1787 inst->src[0].fixed_hw_reg.nr += urb_start;
1788 }
1789 }
1790
1791 /* Each attribute is 4 setup channels, each of which is half a reg. */
1792 this->first_non_payload_grf =
1793 urb_start + prog_data->num_varying_inputs * 2;
1794 }
1795
1796 void
1797 fs_visitor::assign_vs_urb_setup()
1798 {
1799 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1800 int grf, count, slot, channel, attr;
1801
1802 assert(stage == MESA_SHADER_VERTEX);
1803 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1804 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1805 count++;
1806
1807 /* Each attribute is 4 regs. */
1808 this->first_non_payload_grf =
1809 payload.num_regs + prog_data->curb_read_length + count * 4;
1810
1811 unsigned vue_entries =
1812 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1813
1814 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1815 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1816
1817 assert(vs_prog_data->base.urb_read_length <= 15);
1818
1819 /* Rewrite all ATTR file references to the hw grf that they land in. */
1820 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1821 for (int i = 0; i < inst->sources; i++) {
1822 if (inst->src[i].file == ATTR) {
1823
1824 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1825 slot = count - 1;
1826 } else {
 1827 /* Attributes come in a contiguous block, ordered by their
1828 * gl_vert_attrib value. That means we can compute the slot
1829 * number for an attribute by masking out the enabled
1830 * attributes before it and counting the bits.
1831 */
1832 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1833 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1834 BITFIELD64_MASK(attr));
1835 }
1836
1837 channel = inst->src[i].reg_offset & 3;
1838
1839 grf = payload.num_regs +
1840 prog_data->curb_read_length +
1841 slot * 4 + channel;
1842
1843 inst->src[i].file = HW_REG;
1844 inst->src[i].fixed_hw_reg =
1845 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1846 }
1847 }
1848 }
1849 }
1850
1851 /**
1852 * Split large virtual GRFs into separate components if we can.
1853 *
1854 * This is mostly duplicated with what brw_fs_vector_splitting does,
1855 * but that's really conservative because it's afraid of doing
1856 * splitting that doesn't result in real progress after the rest of
1857 * the optimization phases, which would cause infinite looping in
1858 * optimization. We can do it once here, safely. This also has the
1859 * opportunity to split interpolated values, or maybe even uniforms,
1860 * which we don't have at the IR level.
1861 *
1862 * We want to split, because virtual GRFs are what we register
1863 * allocate and spill (due to contiguousness requirements for some
1864 * instructions), and they're what we naturally generate in the
1865 * codegen process, but most virtual GRFs don't actually need to be
1866 * contiguous sets of GRFs. If we split, we'll end up with reduced
1867 * live intervals and better dead code elimination and coalescing.
1868 */
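/* Illustrative sketch (the VGRF numbers and sizes below are made up): if
 * vgrf4 was allocated as four contiguous registers but every instruction
 * only ever touches vgrf4+0..1 or vgrf4+2..3 as independent two-register
 * chunks, no split point between the chunks gets cleared, so each chunk
 * ends up in its own two-register VGRF:
 *
 *    mov(16) vgrf4+0, ...        ->    mov(16) vgrf9+0, ...
 *    mov(16) vgrf4+2, ...        ->    mov(16) vgrf4+0, ...
 *
 * giving the two halves separate, shorter live intervals.
 */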
1869 void
1870 fs_visitor::split_virtual_grfs()
1871 {
1872 int num_vars = this->alloc.count;
1873
1874 /* Count the total number of registers */
1875 int reg_count = 0;
1876 int vgrf_to_reg[num_vars];
1877 for (int i = 0; i < num_vars; i++) {
1878 vgrf_to_reg[i] = reg_count;
1879 reg_count += alloc.sizes[i];
1880 }
1881
1882 /* An array of "split points". For each register slot, this indicates
1883 * if this slot can be separated from the previous slot. Every time an
1884 * instruction uses multiple elements of a register (as a source or
1885 * destination), we mark the used slots as inseparable. Then we go
1886 * through and split the registers into the smallest pieces we can.
1887 */
1888 bool split_points[reg_count];
1889 memset(split_points, 0, sizeof(split_points));
1890
1891 /* Mark all used registers as fully splittable */
1892 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1893 if (inst->dst.file == GRF) {
1894 int reg = vgrf_to_reg[inst->dst.reg];
1895 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1896 split_points[reg + j] = true;
1897 }
1898
1899 for (int i = 0; i < inst->sources; i++) {
1900 if (inst->src[i].file == GRF) {
1901 int reg = vgrf_to_reg[inst->src[i].reg];
1902 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1903 split_points[reg + j] = true;
1904 }
1905 }
1906 }
1907
1908 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1909 if (inst->dst.file == GRF) {
1910 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1911 for (int j = 1; j < inst->regs_written; j++)
1912 split_points[reg + j] = false;
1913 }
1914 for (int i = 0; i < inst->sources; i++) {
1915 if (inst->src[i].file == GRF) {
1916 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1917 for (int j = 1; j < inst->regs_read(i); j++)
1918 split_points[reg + j] = false;
1919 }
1920 }
1921 }
1922
1923 int new_virtual_grf[reg_count];
1924 int new_reg_offset[reg_count];
1925
1926 int reg = 0;
1927 for (int i = 0; i < num_vars; i++) {
1928       /* The first slot can never be a split point, so assert that as a quick sanity check. */
1929 assert(split_points[reg] == false);
1930
1931 /* j = 0 case */
1932 new_reg_offset[reg] = 0;
1933 reg++;
1934 int offset = 1;
1935
1936 /* j > 0 case */
1937 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1938          /* If this is a split point, allocate a new virtual GRF for the
1939           * preceding "offset" registers and reset the offset to 0.
1940           */
1941 if (split_points[reg]) {
1942 assert(offset <= MAX_VGRF_SIZE);
1943 int grf = alloc.allocate(offset);
1944 for (int k = reg - offset; k < reg; k++)
1945 new_virtual_grf[k] = grf;
1946 offset = 0;
1947 }
1948 new_reg_offset[reg] = offset;
1949 offset++;
1950 reg++;
1951 }
1952
1953 /* The last one gets the original register number */
1954 assert(offset <= MAX_VGRF_SIZE);
1955 alloc.sizes[i] = offset;
1956 for (int k = reg - offset; k < reg; k++)
1957 new_virtual_grf[k] = i;
1958 }
1959 assert(reg == reg_count);
1960
1961 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1962 if (inst->dst.file == GRF) {
1963 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1964 inst->dst.reg = new_virtual_grf[reg];
1965 inst->dst.reg_offset = new_reg_offset[reg];
1966 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1967 }
1968 for (int i = 0; i < inst->sources; i++) {
1969 if (inst->src[i].file == GRF) {
1970 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1971 inst->src[i].reg = new_virtual_grf[reg];
1972 inst->src[i].reg_offset = new_reg_offset[reg];
1973 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1974 }
1975 }
1976 }
1977 invalidate_live_intervals();
1978 }
1979
1980 /**
1981 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1982 *
1983 * During code generation, we create tons of temporary variables, many of
1984 * which get immediately killed and are never used again. Yet, in later
1985 * optimization and analysis passes, such as compute_live_intervals, we need
1986 * to loop over all the virtual GRFs. Compacting them can save a lot of
1987 * overhead.
1988 */
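/* Conceptually (VGRF numbers below are made up): if only vgrf0, vgrf1 and
 * vgrf5 out of six VGRFs are still referenced, remap_table becomes
 * {0, 1, -1, -1, -1, 2}, alloc.count drops to 3, and every remaining
 * reference to vgrf5 is rewritten to vgrf2.
 */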
1989 bool
1990 fs_visitor::compact_virtual_grfs()
1991 {
1992 bool progress = false;
1993 int remap_table[this->alloc.count];
1994 memset(remap_table, -1, sizeof(remap_table));
1995
1996 /* Mark which virtual GRFs are used. */
1997 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1998 if (inst->dst.file == GRF)
1999 remap_table[inst->dst.reg] = 0;
2000
2001 for (int i = 0; i < inst->sources; i++) {
2002 if (inst->src[i].file == GRF)
2003 remap_table[inst->src[i].reg] = 0;
2004 }
2005 }
2006
2007 /* Compact the GRF arrays. */
2008 int new_index = 0;
2009 for (unsigned i = 0; i < this->alloc.count; i++) {
2010 if (remap_table[i] == -1) {
2011 /* We just found an unused register. This means that we are
2012 * actually going to compact something.
2013 */
2014 progress = true;
2015 } else {
2016 remap_table[i] = new_index;
2017 alloc.sizes[new_index] = alloc.sizes[i];
2018 invalidate_live_intervals();
2019 ++new_index;
2020 }
2021 }
2022
2023 this->alloc.count = new_index;
2024
2025 /* Patch all the instructions to use the newly renumbered registers */
2026 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2027 if (inst->dst.file == GRF)
2028 inst->dst.reg = remap_table[inst->dst.reg];
2029
2030 for (int i = 0; i < inst->sources; i++) {
2031 if (inst->src[i].file == GRF)
2032 inst->src[i].reg = remap_table[inst->src[i].reg];
2033 }
2034 }
2035
2036 /* Patch all the references to delta_xy, since they're used in register
2037 * allocation. If they're unused, switch them to BAD_FILE so we don't
2038 * think some random VGRF is delta_xy.
2039 */
2040 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2041 if (delta_xy[i].file == GRF) {
2042 if (remap_table[delta_xy[i].reg] != -1) {
2043 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2044 } else {
2045 delta_xy[i].file = BAD_FILE;
2046 }
2047 }
2048 }
2049
2050 return progress;
2051 }
2052
2053 /**
2054 * Implements array access of uniforms by inserting a
2055 * PULL_CONSTANT_LOAD instruction.
2056 *
2057  * Unlike temporary GRF array access (which we don't support due to
2058  * the difficulty of doing relative addressing on instruction
2059 * destinations), we could potentially do array access of uniforms
2060 * that were loaded in GRF space as push constants. In real-world
2061 * usage we've seen, though, the arrays being used are always larger
2062 * than we could load as push constants, so just always move all
2063 * uniform array access out to a pull constant buffer.
2064 */
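/* Rough example of the access pattern this targets (GLSL-level, purely
 * illustrative):
 *
 *    uniform vec4 data[64];
 *    ... = data[i];        // "i" is not known at compile time
 *
 * The variably-indexed source shows up here as a UNIFORM register with a
 * reladdr, so the whole array is copied into the pull constant buffer and
 * demote_pull_constants() later turns the access into a
 * VARYING_PULL_CONSTANT_LOAD.
 */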
2065 void
2066 fs_visitor::move_uniform_array_access_to_pull_constants()
2067 {
2068 if (dispatch_width != 8)
2069 return;
2070
2071 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2072 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2073
2074 /* Walk through and find array access of uniforms. Put a copy of that
2075 * uniform in the pull constant buffer.
2076 *
2077 * Note that we don't move constant-indexed accesses to arrays. No
2078 * testing has been done of the performance impact of this choice.
2079 */
2080 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2081 for (int i = 0 ; i < inst->sources; i++) {
2082 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2083 continue;
2084
2085 int uniform = inst->src[i].reg;
2086
2087 /* If this array isn't already present in the pull constant buffer,
2088 * add it.
2089 */
2090 if (pull_constant_loc[uniform] == -1) {
2091 const gl_constant_value **values = &stage_prog_data->param[uniform];
2092
2093 assert(param_size[uniform]);
2094
2095 for (int j = 0; j < param_size[uniform]; j++) {
2096 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2097
2098 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2099 values[j];
2100 }
2101 }
2102 }
2103 }
2104 }
2105
2106 /**
2107 * Assign UNIFORM file registers to either push constants or pull constants.
2108 *
2109  * We allow a fragment shader to have more than the spec's minimum value
2110  * for the maximum number of fragment shader uniform components (64). If
2111  * there are too many of these, they'd fill up all of register space.
2112 * So, this will push some of them out to the pull constant buffer and
2113 * update the program to load them.
2114 */
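/* As a rough illustration (the numbers are made up): with 200 live
 * uniform components and the 128-component push limit below, the first
 * 128 live uniforms get push_constant_loc slots 0..127 and the remaining
 * 72 are appended to pull_param[] and addressed via pull_constant_loc[]
 * instead.
 */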
2115 void
2116 fs_visitor::assign_constant_locations()
2117 {
2118 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2119 if (dispatch_width != 8)
2120 return;
2121
2122 /* Find which UNIFORM registers are still in use. */
2123 bool is_live[uniforms];
2124 for (unsigned int i = 0; i < uniforms; i++) {
2125 is_live[i] = false;
2126 }
2127
2128 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file != UNIFORM)
2131 continue;
2132
2133 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2134 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2135 is_live[constant_nr] = true;
2136 }
2137 }
2138
2139 /* Only allow 16 registers (128 uniform components) as push constants.
2140 *
2141 * Just demote the end of the list. We could probably do better
2142 * here, demoting things that are rarely used in the program first.
2143 *
2144 * If changing this value, note the limitation about total_regs in
2145 * brw_curbe.c.
2146 */
2147 unsigned int max_push_components = 16 * 8;
2148 unsigned int num_push_constants = 0;
2149
2150 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2151
2152 for (unsigned int i = 0; i < uniforms; i++) {
2153 if (!is_live[i] || pull_constant_loc[i] != -1) {
2154 /* This UNIFORM register is either dead, or has already been demoted
2155 * to a pull const. Mark it as no longer living in the param[] array.
2156 */
2157 push_constant_loc[i] = -1;
2158 continue;
2159 }
2160
2161 if (num_push_constants < max_push_components) {
2162 /* Retain as a push constant. Record the location in the params[]
2163 * array.
2164 */
2165 push_constant_loc[i] = num_push_constants++;
2166 } else {
2167 /* Demote to a pull constant. */
2168 push_constant_loc[i] = -1;
2169
2170 int pull_index = stage_prog_data->nr_pull_params++;
2171 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2172 pull_constant_loc[i] = pull_index;
2173 }
2174 }
2175
2176 stage_prog_data->nr_params = num_push_constants;
2177
2178 /* Up until now, the param[] array has been indexed by reg + reg_offset
2179 * of UNIFORM registers. Condense it to only contain the uniforms we
2180 * chose to upload as push constants.
2181 */
2182 for (unsigned int i = 0; i < uniforms; i++) {
2183 int remapped = push_constant_loc[i];
2184
2185 if (remapped == -1)
2186 continue;
2187
2188 assert(remapped <= (int)i);
2189 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2190 }
2191 }
2192
2193 /**
2194 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2195 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2196 */
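/* Sketch of the constant-index case (the slot number is hypothetical):
 * for a uniform whose pull constant slot is 6, the pass emits an
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD of the containing vec4 (the byte
 * offset 6 * 4 rounded down to a 16-byte boundary, i.e. 16) into a
 * temporary VGRF, then rewrites the source to read that VGRF with
 * set_smear(6 & 3) selecting component 2 of the loaded vec4.
 */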
2197 void
2198 fs_visitor::demote_pull_constants()
2199 {
2200 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2201 for (int i = 0; i < inst->sources; i++) {
2202 if (inst->src[i].file != UNIFORM)
2203 continue;
2204
2205 int pull_index;
2206 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2207 if (location >= uniforms) /* Out of bounds access */
2208 pull_index = -1;
2209 else
2210 pull_index = pull_constant_loc[location];
2211
2212 if (pull_index == -1)
2213 continue;
2214
2215          /* Set up the annotation tracking for newly generated instructions. */
2216 base_ir = inst->ir;
2217 current_annotation = inst->annotation;
2218
2219 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2220 fs_reg dst = vgrf(glsl_type::float_type);
2221
2222 /* Generate a pull load into dst. */
2223 if (inst->src[i].reladdr) {
2224 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2225 surf_index,
2226 *inst->src[i].reladdr,
2227 pull_index);
2228 inst->insert_before(block, &list);
2229 inst->src[i].reladdr = NULL;
2230 } else {
2231 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2232 fs_inst *pull =
2233 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2234 dst, surf_index, offset);
2235 inst->insert_before(block, pull);
2236 inst->src[i].set_smear(pull_index & 3);
2237 }
2238
2239 /* Rewrite the instruction to use the temporary VGRF. */
2240 inst->src[i].file = GRF;
2241 inst->src[i].reg = dst.reg;
2242 inst->src[i].reg_offset = 0;
2243 inst->src[i].width = dispatch_width;
2244 }
2245 }
2246 invalidate_live_intervals();
2247 }
2248
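/**
 * Perform local algebraic simplifications on instructions with immediate
 * or trivially redundant operands: fold saturates of immediate MOVs,
 * strength-reduce MUL/ADD/MAD against 0.0, 1.0 and -1.0, constant-fold
 * when both operands are immediates, collapse SEL/LRP/OR with identical
 * sources into MOVs, turn an RCP of a SQRT into RSQ, and a few similar
 * patterns.
 */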
2249 bool
2250 fs_visitor::opt_algebraic()
2251 {
2252 bool progress = false;
2253
2254 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2255 switch (inst->opcode) {
2256 case BRW_OPCODE_MOV:
2257 if (inst->src[0].file != IMM)
2258 break;
2259
2260 if (inst->saturate) {
2261 if (inst->dst.type != inst->src[0].type)
2262 assert(!"unimplemented: saturate mixed types");
2263
2264 if (brw_saturate_immediate(inst->dst.type,
2265 &inst->src[0].fixed_hw_reg)) {
2266 inst->saturate = false;
2267 progress = true;
2268 }
2269 }
2270 break;
2271
2272 case BRW_OPCODE_MUL:
2273 if (inst->src[1].file != IMM)
2274 continue;
2275
2276 /* a * 1.0 = a */
2277 if (inst->src[1].is_one()) {
2278 inst->opcode = BRW_OPCODE_MOV;
2279 inst->src[1] = reg_undef;
2280 progress = true;
2281 break;
2282 }
2283
2284 /* a * -1.0 = -a */
2285 if (inst->src[1].is_negative_one()) {
2286 inst->opcode = BRW_OPCODE_MOV;
2287 inst->src[0].negate = !inst->src[0].negate;
2288 inst->src[1] = reg_undef;
2289 progress = true;
2290 break;
2291 }
2292
2293 /* a * 0.0 = 0.0 */
2294 if (inst->src[1].is_zero()) {
2295 inst->opcode = BRW_OPCODE_MOV;
2296 inst->src[0] = inst->src[1];
2297 inst->src[1] = reg_undef;
2298 progress = true;
2299 break;
2300 }
2301
2302 if (inst->src[0].file == IMM) {
2303 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2304 inst->opcode = BRW_OPCODE_MOV;
2305 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2306 inst->src[1] = reg_undef;
2307 progress = true;
2308 break;
2309 }
2310 break;
2311 case BRW_OPCODE_ADD:
2312 if (inst->src[1].file != IMM)
2313 continue;
2314
2315 /* a + 0.0 = a */
2316 if (inst->src[1].is_zero()) {
2317 inst->opcode = BRW_OPCODE_MOV;
2318 inst->src[1] = reg_undef;
2319 progress = true;
2320 break;
2321 }
2322
2323 if (inst->src[0].file == IMM) {
2324 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2325 inst->opcode = BRW_OPCODE_MOV;
2326 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2327 inst->src[1] = reg_undef;
2328 progress = true;
2329 break;
2330 }
2331 break;
2332 case BRW_OPCODE_OR:
2333 if (inst->src[0].equals(inst->src[1])) {
2334 inst->opcode = BRW_OPCODE_MOV;
2335 inst->src[1] = reg_undef;
2336 progress = true;
2337 break;
2338 }
2339 break;
2340 case BRW_OPCODE_LRP:
2341 if (inst->src[1].equals(inst->src[2])) {
2342 inst->opcode = BRW_OPCODE_MOV;
2343 inst->src[0] = inst->src[1];
2344 inst->src[1] = reg_undef;
2345 inst->src[2] = reg_undef;
2346 progress = true;
2347 break;
2348 }
2349 break;
2350 case BRW_OPCODE_CMP:
2351 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2352 inst->src[0].abs &&
2353 inst->src[0].negate &&
2354 inst->src[1].is_zero()) {
2355 inst->src[0].abs = false;
2356 inst->src[0].negate = false;
2357 inst->conditional_mod = BRW_CONDITIONAL_Z;
2358 progress = true;
2359 break;
2360 }
2361 break;
2362 case BRW_OPCODE_SEL:
2363 if (inst->src[0].equals(inst->src[1])) {
2364 inst->opcode = BRW_OPCODE_MOV;
2365 inst->src[1] = reg_undef;
2366 inst->predicate = BRW_PREDICATE_NONE;
2367 inst->predicate_inverse = false;
2368 progress = true;
2369 } else if (inst->saturate && inst->src[1].file == IMM) {
2370 switch (inst->conditional_mod) {
2371 case BRW_CONDITIONAL_LE:
2372 case BRW_CONDITIONAL_L:
2373 switch (inst->src[1].type) {
2374 case BRW_REGISTER_TYPE_F:
2375 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[1] = reg_undef;
2378 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2379 progress = true;
2380 }
2381 break;
2382 default:
2383 break;
2384 }
2385 break;
2386 case BRW_CONDITIONAL_GE:
2387 case BRW_CONDITIONAL_G:
2388 switch (inst->src[1].type) {
2389 case BRW_REGISTER_TYPE_F:
2390 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2391 inst->opcode = BRW_OPCODE_MOV;
2392 inst->src[1] = reg_undef;
2393 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2394 progress = true;
2395 }
2396 break;
2397 default:
2398 break;
2399 }
2400 default:
2401 break;
2402 }
2403 }
2404 break;
2405 case BRW_OPCODE_MAD:
2406 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2407 inst->opcode = BRW_OPCODE_MOV;
2408 inst->src[1] = reg_undef;
2409 inst->src[2] = reg_undef;
2410 progress = true;
2411 } else if (inst->src[0].is_zero()) {
2412 inst->opcode = BRW_OPCODE_MUL;
2413 inst->src[0] = inst->src[2];
2414 inst->src[2] = reg_undef;
2415 progress = true;
2416 } else if (inst->src[1].is_one()) {
2417 inst->opcode = BRW_OPCODE_ADD;
2418 inst->src[1] = inst->src[2];
2419 inst->src[2] = reg_undef;
2420 progress = true;
2421 } else if (inst->src[2].is_one()) {
2422 inst->opcode = BRW_OPCODE_ADD;
2423 inst->src[2] = reg_undef;
2424 progress = true;
2425 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2426 inst->opcode = BRW_OPCODE_ADD;
2427 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2428 inst->src[2] = reg_undef;
2429 progress = true;
2430 }
2431 break;
2432 case SHADER_OPCODE_RCP: {
2433 fs_inst *prev = (fs_inst *)inst->prev;
2434 if (prev->opcode == SHADER_OPCODE_SQRT) {
2435 if (inst->src[0].equals(prev->dst)) {
2436 inst->opcode = SHADER_OPCODE_RSQ;
2437 inst->src[0] = prev->src[0];
2438 progress = true;
2439 }
2440 }
2441 break;
2442 }
2443 case SHADER_OPCODE_BROADCAST:
2444 if (is_uniform(inst->src[0])) {
2445 inst->opcode = BRW_OPCODE_MOV;
2446 inst->sources = 1;
2447 inst->force_writemask_all = true;
2448 progress = true;
2449 } else if (inst->src[1].file == IMM) {
2450 inst->opcode = BRW_OPCODE_MOV;
2451 inst->src[0] = component(inst->src[0],
2452 inst->src[1].fixed_hw_reg.dw1.ud);
2453 inst->sources = 1;
2454 inst->force_writemask_all = true;
2455 progress = true;
2456 }
2457 break;
2458
2459 default:
2460 break;
2461 }
2462
2463 /* Swap if src[0] is immediate. */
2464 if (progress && inst->is_commutative()) {
2465 if (inst->src[0].file == IMM) {
2466 fs_reg tmp = inst->src[1];
2467 inst->src[1] = inst->src[0];
2468 inst->src[0] = tmp;
2469 }
2470 }
2471 }
2472 return progress;
2473 }
2474
2475 /**
2476 * Optimize sample messages that have constant zero values for the trailing
2477 * texture coordinates. We can just reduce the message length for these
2478 * instructions instead of reserving a register for it. Trailing parameters
2479 * that aren't sent default to zero anyway. This will cause the dead code
2480 * eliminator to remove the MOV instruction that would otherwise be emitted to
2481 * set up the zero value.
2482 */
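/* For instance (purely illustrative): a SIMD8 texturing message whose last
 * payload register holds an explicit LOD of 0.0 can have its mlen reduced
 * by one register; the hardware supplies 0 for the parameter that is no
 * longer sent, and dead code elimination then removes the MOV that loaded
 * the zero into the payload.
 */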
2483 bool
2484 fs_visitor::opt_zero_samples()
2485 {
2486 /* Gen4 infers the texturing opcode based on the message length so we can't
2487 * change it.
2488 */
2489 if (devinfo->gen < 5)
2490 return false;
2491
2492 bool progress = false;
2493
2494 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2495 if (!inst->is_tex())
2496 continue;
2497
2498 fs_inst *load_payload = (fs_inst *) inst->prev;
2499
2500 if (load_payload->is_head_sentinel() ||
2501 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2502 continue;
2503
2504 /* We don't want to remove the message header or the first parameter.
2505 * Removing the first parameter is not allowed, see the Haswell PRM
2506 * volume 7, page 149:
2507 *
2508 * "Parameter 0 is required except for the sampleinfo message, which
2509 * has no parameter 0"
2510 */
2511 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2512 load_payload->src[(inst->mlen - inst->header_size) /
2513 (dispatch_width / 8) +
2514 inst->header_size - 1].is_zero()) {
2515 inst->mlen -= dispatch_width / 8;
2516 progress = true;
2517 }
2518 }
2519
2520 if (progress)
2521 invalidate_live_intervals();
2522
2523 return progress;
2524 }
2525
2526 /**
2527 * Optimize sample messages which are followed by the final RT write.
2528 *
2529  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2530 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2531 * final texturing results copied to the framebuffer write payload and modify
2532 * them to write to the framebuffer directly.
2533 */
2534 bool
2535 fs_visitor::opt_sampler_eot()
2536 {
2537 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2538
2539 if (stage != MESA_SHADER_FRAGMENT)
2540 return false;
2541
2542 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2543 return false;
2544
2545 /* FINISHME: It should be possible to implement this optimization when there
2546 * are multiple drawbuffers.
2547 */
2548 if (key->nr_color_regions != 1)
2549 return false;
2550
2551 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2552 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2553 assert(fb_write->eot);
2554 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2555
2556 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2557
2558 /* There wasn't one; nothing to do. */
2559 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2560 return false;
2561
2562    /* This optimization doesn't seem to work for textureGather for some
2563 * reason. I can't find any documentation or known workarounds to indicate
2564 * that this is expected, but considering that it is probably pretty
2565 * unlikely that a shader would directly write out the results from
2566 * textureGather we might as well just disable it.
2567 */
2568 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2569 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2570 return false;
2571
2572 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2573 * It's very likely to be the previous instruction.
2574 */
2575 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2576 if (load_payload->is_head_sentinel() ||
2577 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2578 return false;
2579
2580 assert(!tex_inst->eot); /* We can't get here twice */
2581 assert((tex_inst->offset & (0xff << 24)) == 0);
2582
2583 tex_inst->offset |= fb_write->target << 24;
2584 tex_inst->eot = true;
2585 tex_inst->dst = bld.null_reg_ud();
2586 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2587
2588 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2589 * to create a new LOAD_PAYLOAD command with the same sources and a space
2590 * saved for the header. Using a new destination register not only makes sure
2591 * we have enough space, but it will make sure the dead code eliminator kills
2592 * the instruction that this will replace.
2593 */
2594 if (tex_inst->header_size != 0)
2595 return true;
2596
2597 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2598 load_payload->sources + 1);
2599 fs_reg *new_sources =
2600 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2601
2602 new_sources[0] = fs_reg();
2603 for (int i = 0; i < load_payload->sources; i++)
2604 new_sources[i+1] = load_payload->src[i];
2605
2606    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However,
2607     * it needs detailed information about the sources to figure out how
2608     * many registers to use. At this stage of optimization (after copy
2609     * propagation has run), the sources may no longer be the kinds of GRFs
2610     * that LOAD_PAYLOAD expects, so we need to emit the instruction
2611     * manually.
2612     */
2613 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2614 load_payload->exec_size,
2615 send_header,
2616 new_sources,
2617 load_payload->sources + 1);
2618
2619 new_load_payload->regs_written = load_payload->regs_written + 1;
2620 new_load_payload->header_size = 1;
2621 tex_inst->mlen++;
2622 tex_inst->header_size = 1;
2623 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2624 tex_inst->src[0] = send_header;
2625
2626 return true;
2627 }
2628
2629 bool
2630 fs_visitor::opt_register_renaming()
2631 {
2632 bool progress = false;
2633 int depth = 0;
2634
2635 int remap[alloc.count];
2636 memset(remap, -1, sizeof(int) * alloc.count);
2637
2638 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2639 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2640 depth++;
2641 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2642 inst->opcode == BRW_OPCODE_WHILE) {
2643 depth--;
2644 }
2645
2646 /* Rewrite instruction sources. */
2647 for (int i = 0; i < inst->sources; i++) {
2648 if (inst->src[i].file == GRF &&
2649 remap[inst->src[i].reg] != -1 &&
2650 remap[inst->src[i].reg] != inst->src[i].reg) {
2651 inst->src[i].reg = remap[inst->src[i].reg];
2652 progress = true;
2653 }
2654 }
2655
2656 const int dst = inst->dst.reg;
2657
2658 if (depth == 0 &&
2659 inst->dst.file == GRF &&
2660 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2661 !inst->is_partial_write()) {
2662 if (remap[dst] == -1) {
2663 remap[dst] = dst;
2664 } else {
2665 remap[dst] = alloc.allocate(inst->dst.width / 8);
2666 inst->dst.reg = remap[dst];
2667 progress = true;
2668 }
2669 } else if (inst->dst.file == GRF &&
2670 remap[dst] != -1 &&
2671 remap[dst] != dst) {
2672 inst->dst.reg = remap[dst];
2673 progress = true;
2674 }
2675 }
2676
2677 if (progress) {
2678 invalidate_live_intervals();
2679
2680 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2681 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2682 delta_xy[i].reg = remap[delta_xy[i].reg];
2683 }
2684 }
2685 }
2686
2687 return progress;
2688 }
2689
2690 /**
2691 * Remove redundant or useless discard jumps.
2692 *
2693 * For example, we can eliminate jumps in the following sequence:
2694 *
2695 * discard-jump (redundant with the next jump)
2696 * discard-jump (useless; jumps to the next instruction)
2697 * placeholder-halt
2698 */
2699 bool
2700 fs_visitor::opt_redundant_discard_jumps()
2701 {
2702 bool progress = false;
2703
2704 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2705
2706 fs_inst *placeholder_halt = NULL;
2707 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2708 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2709 placeholder_halt = inst;
2710 break;
2711 }
2712 }
2713
2714 if (!placeholder_halt)
2715 return false;
2716
2717 /* Delete any HALTs immediately before the placeholder halt. */
2718 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2719 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2720 prev = (fs_inst *) placeholder_halt->prev) {
2721 prev->remove(last_bblock);
2722 progress = true;
2723 }
2724
2725 if (progress)
2726 invalidate_live_intervals();
2727
2728 return progress;
2729 }
2730
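/**
 * On Gen4-6, try to fold a plain MOV from a GRF into an MRF back into the
 * instruction that produced the GRF value, making it write the MRF
 * directly and deleting the MOV. This is only done when the GRF isn't
 * read again later and nothing between the producer and the MOV clobbers
 * the MRF or reads the GRF.
 */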
2731 bool
2732 fs_visitor::compute_to_mrf()
2733 {
2734 bool progress = false;
2735 int next_ip = 0;
2736
2737 /* No MRFs on Gen >= 7. */
2738 if (devinfo->gen >= 7)
2739 return false;
2740
2741 calculate_live_intervals();
2742
2743 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2744 int ip = next_ip;
2745 next_ip++;
2746
2747 if (inst->opcode != BRW_OPCODE_MOV ||
2748 inst->is_partial_write() ||
2749 inst->dst.file != MRF || inst->src[0].file != GRF ||
2750 inst->dst.type != inst->src[0].type ||
2751 inst->src[0].abs || inst->src[0].negate ||
2752 !inst->src[0].is_contiguous() ||
2753 inst->src[0].subreg_offset)
2754 continue;
2755
2756 /* Work out which hardware MRF registers are written by this
2757 * instruction.
2758 */
2759 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2760 int mrf_high;
2761 if (inst->dst.reg & BRW_MRF_COMPR4) {
2762 mrf_high = mrf_low + 4;
2763 } else if (inst->exec_size == 16) {
2764 mrf_high = mrf_low + 1;
2765 } else {
2766 mrf_high = mrf_low;
2767 }
2768
2769 /* Can't compute-to-MRF this GRF if someone else was going to
2770 * read it later.
2771 */
2772 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2773 continue;
2774
2775 /* Found a move of a GRF to a MRF. Let's see if we can go
2776 * rewrite the thing that made this GRF to write into the MRF.
2777 */
2778 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2779 if (scan_inst->dst.file == GRF &&
2780 scan_inst->dst.reg == inst->src[0].reg) {
2781 /* Found the last thing to write our reg we want to turn
2782 * into a compute-to-MRF.
2783 */
2784
2785 /* If this one instruction didn't populate all the
2786 * channels, bail. We might be able to rewrite everything
2787 * that writes that reg, but it would require smarter
2788 * tracking to delay the rewriting until complete success.
2789 */
2790 if (scan_inst->is_partial_write())
2791 break;
2792
2793          /* Instructions that write more than one register would require us
2794           * to coalesce more than one MOV at a time.
2795 */
2796 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2797 break;
2798
2799 /* SEND instructions can't have MRF as a destination. */
2800 if (scan_inst->mlen)
2801 break;
2802
2803 if (devinfo->gen == 6) {
2804 /* gen6 math instructions must have the destination be
2805 * GRF, so no compute-to-MRF for them.
2806 */
2807 if (scan_inst->is_math()) {
2808 break;
2809 }
2810 }
2811
2812 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2813 /* Found the creator of our MRF's source value. */
2814 scan_inst->dst.file = MRF;
2815 scan_inst->dst.reg = inst->dst.reg;
2816 scan_inst->saturate |= inst->saturate;
2817 inst->remove(block);
2818 progress = true;
2819 }
2820 break;
2821 }
2822
2823 /* We don't handle control flow here. Most computation of
2824 * values that end up in MRFs are shortly before the MRF
2825 * write anyway.
2826 */
2827 if (block->start() == scan_inst)
2828 break;
2829
2830 /* You can't read from an MRF, so if someone else reads our
2831 * MRF's source GRF that we wanted to rewrite, that stops us.
2832 */
2833 bool interfered = false;
2834 for (int i = 0; i < scan_inst->sources; i++) {
2835 if (scan_inst->src[i].file == GRF &&
2836 scan_inst->src[i].reg == inst->src[0].reg &&
2837 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2838 interfered = true;
2839 }
2840 }
2841 if (interfered)
2842 break;
2843
2844 if (scan_inst->dst.file == MRF) {
2845 /* If somebody else writes our MRF here, we can't
2846 * compute-to-MRF before that.
2847 */
2848 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2849 int scan_mrf_high;
2850
2851 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2852 scan_mrf_high = scan_mrf_low + 4;
2853 } else if (scan_inst->exec_size == 16) {
2854 scan_mrf_high = scan_mrf_low + 1;
2855 } else {
2856 scan_mrf_high = scan_mrf_low;
2857 }
2858
2859 if (mrf_low == scan_mrf_low ||
2860 mrf_low == scan_mrf_high ||
2861 mrf_high == scan_mrf_low ||
2862 mrf_high == scan_mrf_high) {
2863 break;
2864 }
2865 }
2866
2867 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2868 /* Found a SEND instruction, which means that there are
2869 * live values in MRFs from base_mrf to base_mrf +
2870 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2871 * above it.
2872 */
2873 if (mrf_low >= scan_inst->base_mrf &&
2874 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2875 break;
2876 }
2877 if (mrf_high >= scan_inst->base_mrf &&
2878 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2879 break;
2880 }
2881 }
2882 }
2883 }
2884
2885 if (progress)
2886 invalidate_live_intervals();
2887
2888 return progress;
2889 }
2890
2891 /**
2892 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2893 * flow. We could probably do better here with some form of divergence
2894 * analysis.
2895 */
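/* Outside of any control flow the execution mask hasn't diverged, so the
 * pass simply assumes channel 0 is live and replaces the instruction with
 * a force_writemask_all MOV of the immediate 0.
 */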
2896 bool
2897 fs_visitor::eliminate_find_live_channel()
2898 {
2899 bool progress = false;
2900 unsigned depth = 0;
2901
2902 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2903 switch (inst->opcode) {
2904 case BRW_OPCODE_IF:
2905 case BRW_OPCODE_DO:
2906 depth++;
2907 break;
2908
2909 case BRW_OPCODE_ENDIF:
2910 case BRW_OPCODE_WHILE:
2911 depth--;
2912 break;
2913
2914 case FS_OPCODE_DISCARD_JUMP:
2915 /* This can potentially make control flow non-uniform until the end
2916 * of the program.
2917 */
2918 return progress;
2919
2920 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2921 if (depth == 0) {
2922 inst->opcode = BRW_OPCODE_MOV;
2923 inst->src[0] = fs_reg(0);
2924 inst->sources = 1;
2925 inst->force_writemask_all = true;
2926 progress = true;
2927 }
2928 break;
2929
2930 default:
2931 break;
2932 }
2933 }
2934
2935 return progress;
2936 }
2937
2938 /**
2939 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2940 * instructions to FS_OPCODE_REP_FB_WRITE.
2941 */
2942 void
2943 fs_visitor::emit_repclear_shader()
2944 {
2945 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2946 int base_mrf = 1;
2947 int color_mrf = base_mrf + 2;
2948
2949 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2950 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2951 mov->force_writemask_all = true;
2952
2953 fs_inst *write;
2954 if (key->nr_color_regions == 1) {
2955 write = emit(FS_OPCODE_REP_FB_WRITE);
2956 write->saturate = key->clamp_fragment_color;
2957 write->base_mrf = color_mrf;
2958 write->target = 0;
2959 write->header_size = 0;
2960 write->mlen = 1;
2961 } else {
2962 assume(key->nr_color_regions > 0);
2963 for (int i = 0; i < key->nr_color_regions; ++i) {
2964 write = emit(FS_OPCODE_REP_FB_WRITE);
2965 write->saturate = key->clamp_fragment_color;
2966 write->base_mrf = base_mrf;
2967 write->target = i;
2968 write->header_size = 2;
2969 write->mlen = 3;
2970 }
2971 }
2972 write->eot = true;
2973
2974 calculate_cfg();
2975
2976 assign_constant_locations();
2977 assign_curb_setup();
2978
2979 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2980 assert(mov->src[0].file == HW_REG);
2981 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2982 }
2983
2984 /**
2985 * Walks through basic blocks, looking for repeated MRF writes and
2986 * removing the later ones.
2987 */
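/* Typical case this catches (register numbers are illustrative): two
 * identical "mov(8) m3, vgrf5" instructions in the same block, with no
 * intervening write to m3 or vgrf5 and no intervening SEND touching that
 * MRF; the second MOV is removed.
 */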
2988 bool
2989 fs_visitor::remove_duplicate_mrf_writes()
2990 {
2991 fs_inst *last_mrf_move[16];
2992 bool progress = false;
2993
2994    /* We'd need to update the MRF tracking for compressed instructions; skip SIMD16. */
2995 if (dispatch_width == 16)
2996 return false;
2997
2998 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2999
3000 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3001 if (inst->is_control_flow()) {
3002 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3003 }
3004
3005 if (inst->opcode == BRW_OPCODE_MOV &&
3006 inst->dst.file == MRF) {
3007 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3008 if (prev_inst && inst->equals(prev_inst)) {
3009 inst->remove(block);
3010 progress = true;
3011 continue;
3012 }
3013 }
3014
3015 /* Clear out the last-write records for MRFs that were overwritten. */
3016 if (inst->dst.file == MRF) {
3017 last_mrf_move[inst->dst.reg] = NULL;
3018 }
3019
3020 if (inst->mlen > 0 && inst->base_mrf != -1) {
3021 /* Found a SEND instruction, which will include two or fewer
3022 * implied MRF writes. We could do better here.
3023 */
3024 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3025 last_mrf_move[inst->base_mrf + i] = NULL;
3026 }
3027 }
3028
3029 /* Clear out any MRF move records whose sources got overwritten. */
3030 if (inst->dst.file == GRF) {
3031 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3032 if (last_mrf_move[i] &&
3033 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3034 last_mrf_move[i] = NULL;
3035 }
3036 }
3037 }
3038
3039 if (inst->opcode == BRW_OPCODE_MOV &&
3040 inst->dst.file == MRF &&
3041 inst->src[0].file == GRF &&
3042 !inst->is_partial_write()) {
3043 last_mrf_move[inst->dst.reg] = inst;
3044 }
3045 }
3046
3047 if (progress)
3048 invalidate_live_intervals();
3049
3050 return progress;
3051 }
3052
3053 static void
3054 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3055 {
3056 /* Clear the flag for registers that actually got read (as expected). */
3057 for (int i = 0; i < inst->sources; i++) {
3058 int grf;
3059 if (inst->src[i].file == GRF) {
3060 grf = inst->src[i].reg;
3061 } else if (inst->src[i].file == HW_REG &&
3062 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3063 grf = inst->src[i].fixed_hw_reg.nr;
3064 } else {
3065 continue;
3066 }
3067
3068 if (grf >= first_grf &&
3069 grf < first_grf + grf_len) {
3070 deps[grf - first_grf] = false;
3071 if (inst->exec_size == 16)
3072 deps[grf - first_grf + 1] = false;
3073 }
3074 }
3075 }
3076
3077 /**
3078 * Implements this workaround for the original 965:
3079 *
3080 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3081 * check for post destination dependencies on this instruction, software
3082 * must ensure that there is no destination hazard for the case of ‘write
3083 * followed by a posted write’ shown in the following example.
3084 *
3085 * 1. mov r3 0
3086 * 2. send r3.xy <rest of send instruction>
3087 * 3. mov r2 r3
3088 *
3089 * Due to no post-destination dependency check on the ‘send’, the above
3090 * code sequence could have two instructions (1 and 2) in flight at the
3091 * same time that both consider ‘r3’ as the target of their final writes.
3092 */
3093 void
3094 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3095 fs_inst *inst)
3096 {
3097 int write_len = inst->regs_written;
3098 int first_write_grf = inst->dst.reg;
3099 bool needs_dep[BRW_MAX_MRF];
3100 assert(write_len < (int)sizeof(needs_dep) - 1);
3101
3102 memset(needs_dep, false, sizeof(needs_dep));
3103 memset(needs_dep, true, write_len);
3104
3105 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3106
3107 /* Walk backwards looking for writes to registers we're writing which
3108 * aren't read since being written. If we hit the start of the program,
3109 * we assume that there are no outstanding dependencies on entry to the
3110 * program.
3111 */
3112 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3113 /* If we hit control flow, assume that there *are* outstanding
3114 * dependencies, and force their cleanup before our instruction.
3115 */
3116 if (block->start() == scan_inst) {
3117 for (int i = 0; i < write_len; i++) {
3118 if (needs_dep[i]) {
3119 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3120 }
3121 }
3122 return;
3123 }
3124
3125 /* We insert our reads as late as possible on the assumption that any
3126 * instruction but a MOV that might have left us an outstanding
3127 * dependency has more latency than a MOV.
3128 */
3129 if (scan_inst->dst.file == GRF) {
3130 for (int i = 0; i < scan_inst->regs_written; i++) {
3131 int reg = scan_inst->dst.reg + i;
3132
3133 if (reg >= first_write_grf &&
3134 reg < first_write_grf + write_len &&
3135 needs_dep[reg - first_write_grf]) {
3136 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3137 needs_dep[reg - first_write_grf] = false;
3138 if (scan_inst->exec_size == 16)
3139 needs_dep[reg - first_write_grf + 1] = false;
3140 }
3141 }
3142 }
3143
3144 /* Clear the flag for registers that actually got read (as expected). */
3145 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3146
3147 /* Continue the loop only if we haven't resolved all the dependencies */
3148 int i;
3149 for (i = 0; i < write_len; i++) {
3150 if (needs_dep[i])
3151 break;
3152 }
3153 if (i == write_len)
3154 return;
3155 }
3156 }
3157
3158 /**
3159 * Implements this workaround for the original 965:
3160 *
3161 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3162 * used as a destination register until after it has been sourced by an
3163 * instruction with a different destination register.
3164 */
3165 void
3166 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3167 {
3168 int write_len = inst->regs_written;
3169 int first_write_grf = inst->dst.reg;
3170 bool needs_dep[BRW_MAX_MRF];
3171 assert(write_len < (int)sizeof(needs_dep) - 1);
3172
3173 memset(needs_dep, false, sizeof(needs_dep));
3174 memset(needs_dep, true, write_len);
3175 /* Walk forwards looking for writes to registers we're writing which aren't
3176 * read before being written.
3177 */
3178 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3179 /* If we hit control flow, force resolve all remaining dependencies. */
3180 if (block->end() == scan_inst) {
3181 for (int i = 0; i < write_len; i++) {
3182 if (needs_dep[i])
3183 scan_inst->insert_before(block,
3184 DEP_RESOLVE_MOV(first_write_grf + i));
3185 }
3186 return;
3187 }
3188
3189 /* Clear the flag for registers that actually got read (as expected). */
3190 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3191
3192 /* We insert our reads as late as possible since they're reading the
3193 * result of a SEND, which has massive latency.
3194 */
3195 if (scan_inst->dst.file == GRF &&
3196 scan_inst->dst.reg >= first_write_grf &&
3197 scan_inst->dst.reg < first_write_grf + write_len &&
3198 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3199 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3200 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3201 }
3202
3203 /* Continue the loop only if we haven't resolved all the dependencies */
3204 int i;
3205 for (i = 0; i < write_len; i++) {
3206 if (needs_dep[i])
3207 break;
3208 }
3209 if (i == write_len)
3210 return;
3211 }
3212 }
3213
3214 void
3215 fs_visitor::insert_gen4_send_dependency_workarounds()
3216 {
3217 if (devinfo->gen != 4 || devinfo->is_g4x)
3218 return;
3219
3220 bool progress = false;
3221
3222 /* Note that we're done with register allocation, so GRF fs_regs always
3223 * have a .reg_offset of 0.
3224 */
3225
3226 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3227 if (inst->mlen != 0 && inst->dst.file == GRF) {
3228 insert_gen4_pre_send_dependency_workarounds(block, inst);
3229 insert_gen4_post_send_dependency_workarounds(block, inst);
3230 progress = true;
3231 }
3232 }
3233
3234 if (progress)
3235 invalidate_live_intervals();
3236 }
3237
3238 /**
3239 * Turns the generic expression-style uniform pull constant load instruction
3240 * into a hardware-specific series of instructions for loading a pull
3241 * constant.
3242 *
3243 * The expression style allows the CSE pass before this to optimize out
3244 * repeated loads from the same offset, and gives the pre-register-allocation
3245 * scheduling full flexibility, while the conversion to native instructions
3246 * allows the post-register-allocation scheduler the best information
3247 * possible.
3248 *
3249 * Note that execution masking for setting up pull constant loads is special:
3250 * the channels that need to be written are unrelated to the current execution
3251 * mask, since a later instruction will use one of the result channels as a
3252 * source operand for all 8 or 16 of its channels.
3253 */
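/* Rough shape of the Gen7+ lowering (written with informal mnemonics;
 * register numbers are made up): a vec4-aligned byte offset immediate
 * such as 96u becomes the dword offset 24 written into a small payload
 * register:
 *
 *    uniform_pull_constant_load(8) vgrf7, surf_index, 96u
 * ->
 *    set_simd4x2_offset(8) vgrf9, 24u
 *    uniform_pull_constant_load_gen7(8) vgrf7, surf_index, vgrf9
 *
 * On Gen4-6 the load instead goes out through a fixed MRF (m14).
 */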
3254 void
3255 fs_visitor::lower_uniform_pull_constant_loads()
3256 {
3257 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3258 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3259 continue;
3260
3261 if (devinfo->gen >= 7) {
3262 /* The offset arg before was a vec4-aligned byte offset. We need to
3263 * turn it into a dword offset.
3264 */
3265 fs_reg const_offset_reg = inst->src[1];
3266 assert(const_offset_reg.file == IMM &&
3267 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3268 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3269 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3270
3271 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3272 * Reserve space for the register.
3273 */
3274 if (devinfo->gen >= 9) {
3275 payload.reg_offset++;
3276 alloc.sizes[payload.reg] = 2;
3277 }
3278
3279 /* This is actually going to be a MOV, but since only the first dword
3280 * is accessed, we have a special opcode to do just that one. Note
3281 * that this needs to be an operation that will be considered a def
3282 * by live variable analysis, or register allocation will explode.
3283 */
3284 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3285 8, payload, const_offset_reg);
3286 setup->force_writemask_all = true;
3287
3288 setup->ir = inst->ir;
3289 setup->annotation = inst->annotation;
3290 inst->insert_before(block, setup);
3291
3292 /* Similarly, this will only populate the first 4 channels of the
3293 * result register (since we only use smear values from 0-3), but we
3294 * don't tell the optimizer.
3295 */
3296 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3297 inst->src[1] = payload;
3298
3299 invalidate_live_intervals();
3300 } else {
3301 /* Before register allocation, we didn't tell the scheduler about the
3302 * MRF we use. We know it's safe to use this MRF because nothing
3303 * else does except for register spill/unspill, which generates and
3304 * uses its MRF within a single IR instruction.
3305 */
3306 inst->base_mrf = 14;
3307 inst->mlen = 1;
3308 }
3309 }
3310 }
3311
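/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a series of MOVs: the header
 * sources are copied as SIMD8 force_writemask_all moves, the remaining
 * sources as normal per-channel moves into consecutive destination
 * registers, with COMPR4 (or a pair of halved moves where COMPR4 isn't
 * available) handling the interleaved layout used by Gen <= 5 FB writes.
 */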
3312 bool
3313 fs_visitor::lower_load_payload()
3314 {
3315 bool progress = false;
3316
3317 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3318 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3319 continue;
3320
3321 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3322 assert(inst->saturate == false);
3323
3324 fs_reg dst = inst->dst;
3325
3326 /* Get rid of COMPR4. We'll add it back in if we need it */
3327 if (dst.file == MRF)
3328 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3329
3330 dst.width = 8;
3331 for (uint8_t i = 0; i < inst->header_size; i++) {
3332 if (inst->src[i].file != BAD_FILE) {
3333 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3334 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3335 mov_src.width = 8;
3336 fs_inst *mov = MOV(mov_dst, mov_src);
3337 mov->force_writemask_all = true;
3338 inst->insert_before(block, mov);
3339 }
3340 dst = offset(dst, 1);
3341 }
3342
3343 dst.width = inst->exec_size;
3344 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3345 inst->exec_size > 8) {
3346 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3347 * a straightforward copy. Instead, the result of the
3348 * LOAD_PAYLOAD is treated as interleaved and the first four
3349 * non-header sources are unpacked as:
3350 *
3351 * m + 0: r0
3352 * m + 1: g0
3353 * m + 2: b0
3354 * m + 3: a0
3355 * m + 4: r1
3356 * m + 5: g1
3357 * m + 6: b1
3358 * m + 7: a1
3359 *
3360 * This is used for gen <= 5 fb writes.
3361 */
3362 assert(inst->exec_size == 16);
3363 assert(inst->header_size + 4 <= inst->sources);
3364 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3365 if (inst->src[i].file != BAD_FILE) {
3366 if (devinfo->has_compr4) {
3367 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3368 compr4_dst.reg |= BRW_MRF_COMPR4;
3369
3370 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3371 mov->force_writemask_all = inst->force_writemask_all;
3372 inst->insert_before(block, mov);
3373 } else {
3374 /* Platform doesn't have COMPR4. We have to fake it */
3375 fs_reg mov_dst = retype(dst, inst->src[i].type);
3376 mov_dst.width = 8;
3377
3378 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3379 mov->force_writemask_all = inst->force_writemask_all;
3380 inst->insert_before(block, mov);
3381
3382 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3383 mov->force_writemask_all = inst->force_writemask_all;
3384 mov->force_sechalf = true;
3385 inst->insert_before(block, mov);
3386 }
3387 }
3388
3389 dst.reg++;
3390 }
3391
3392 /* The loop above only ever incremented us through the first set
3393 * of 4 registers. However, thanks to the magic of COMPR4, we
3394 * actually wrote to the first 8 registers, so we need to take
3395 * that into account now.
3396 */
3397 dst.reg += 4;
3398
3399 /* The COMPR4 code took care of the first 4 sources. We'll let
3400 * the regular path handle any remaining sources. Yes, we are
3401 * modifying the instruction but we're about to delete it so
3402 * this really doesn't hurt anything.
3403 */
3404 inst->header_size += 4;
3405 }
3406
3407 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3408 if (inst->src[i].file != BAD_FILE) {
3409 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3410 inst->src[i]);
3411 mov->force_writemask_all = inst->force_writemask_all;
3412 mov->force_sechalf = inst->force_sechalf;
3413 inst->insert_before(block, mov);
3414 }
3415 dst = offset(dst, 1);
3416 }
3417
3418 inst->remove(block);
3419 progress = true;
3420 }
3421
3422 if (progress)
3423 invalidate_live_intervals();
3424
3425 return progress;
3426 }
3427
3428 bool
3429 fs_visitor::lower_integer_multiplication()
3430 {
3431 bool progress = false;
3432
3433 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3434 * directly, but Cherryview cannot.
3435 */
3436 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3437 return false;
3438
3439 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3440 if (inst->opcode != BRW_OPCODE_MUL ||
3441 inst->dst.is_accumulator() ||
3442 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3443 inst->dst.type != BRW_REGISTER_TYPE_UD))
3444 continue;
3445
3446 #define insert(instr) inst->insert_before(block, instr)
3447
3448 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3449 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3450 * src1 are used.
3451 *
3452 * If multiplying by an immediate value that fits in 16-bits, do a
3453 * single MUL instruction with that value in the proper location.
3454 */
3455 if (inst->src[1].file == IMM &&
3456 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3457 if (devinfo->gen < 7) {
3458 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3459 inst->dst.type, dispatch_width);
3460 insert(MOV(imm, inst->src[1]));
3461 insert(MUL(inst->dst, imm, inst->src[0]));
3462 } else {
3463 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3464 }
3465 } else {
3466 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3467 * do 32-bit integer multiplication in one instruction, but instead
3468 * must do a sequence (which actually calculates a 64-bit result):
3469 *
3470 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3471 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3472 * mov(8) g2<1>D acc0<8,8,1>D
3473 *
3474 * But on Gen > 6, the ability to use second accumulator register
3475 * (acc1) for non-float data types was removed, preventing a simple
3476 * implementation in SIMD16. A 16-channel result can be calculated by
3477 * executing the three instructions twice in SIMD8, once with quarter
3478 * control of 1Q for the first eight channels and again with 2Q for
3479 * the second eight channels.
3480 *
3481 * Which accumulator register is implicitly accessed (by AccWrEnable
3482 * for instance) is determined by the quarter control. Unfortunately
3483 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3484 * implicit accumulator access by an instruction with 2Q will access
3485 * acc1 regardless of whether the data type is usable in acc1.
3486 *
3487 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3488 * integer data types.
3489 *
3490 * Since we only want the low 32-bits of the result, we can do two
3491 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3492 * adjust the high result and add them (like the mach is doing):
3493 *
3494 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3495 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3496 * shl(8) g9<1>D g8<8,8,1>D 16D
3497 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3498 *
3499 * We avoid the shl instruction by realizing that we only want to add
3500 * the low 16-bits of the "high" result to the high 16-bits of the
3501 * "low" result and using proper regioning on the add:
3502 *
3503 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3504 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3505 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3506 *
3507 * Since it does not use the (single) accumulator register, we can
3508 * schedule multi-component multiplications much better.
3509 */
3510
3511 if (inst->conditional_mod && inst->dst.is_null()) {
3512 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3513 inst->dst.type, dispatch_width);
3514 }
3515 fs_reg low = inst->dst;
3516 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3517 inst->dst.type, dispatch_width);
3518
3519          if (devinfo->gen >= 7) {
3520 fs_reg src1_0_w = inst->src[1];
3521 fs_reg src1_1_w = inst->src[1];
3522
3523 if (inst->src[1].file == IMM) {
3524 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3525 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3526 } else {
3527 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3528 src1_0_w.stride = 2;
3529
3530 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3531 src1_1_w.stride = 2;
3532 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3533 }
3534 insert(MUL(low, inst->src[0], src1_0_w));
3535 insert(MUL(high, inst->src[0], src1_1_w));
3536 } else {
3537 fs_reg src0_0_w = inst->src[0];
3538 fs_reg src0_1_w = inst->src[0];
3539
3540 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3541 src0_0_w.stride = 2;
3542
3543 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3544 src0_1_w.stride = 2;
3545 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3546
3547 insert(MUL(low, src0_0_w, inst->src[1]));
3548 insert(MUL(high, src0_1_w, inst->src[1]));
3549 }
3550
3551 fs_reg dst = inst->dst;
3552 dst.type = BRW_REGISTER_TYPE_UW;
3553 dst.subreg_offset = 2;
3554 dst.stride = 2;
3555
3556 high.type = BRW_REGISTER_TYPE_UW;
3557 high.stride = 2;
3558
3559 low.type = BRW_REGISTER_TYPE_UW;
3560 low.subreg_offset = 2;
3561 low.stride = 2;
3562
3563 insert(ADD(dst, low, high));
3564
3565 if (inst->conditional_mod) {
3566 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3567 fs_inst *mov = MOV(null, inst->dst);
3568 mov->conditional_mod = inst->conditional_mod;
3569 insert(mov);
3570 }
3571 }
3572 #undef insert
3573
3574 inst->remove(block);
3575 progress = true;
3576 }
3577
3578 if (progress)
3579 invalidate_live_intervals();
3580
3581 return progress;
3582 }
3583
3584 void
3585 fs_visitor::dump_instructions()
3586 {
3587 dump_instructions(NULL);
3588 }
3589
3590 void
3591 fs_visitor::dump_instructions(const char *name)
3592 {
3593 FILE *file = stderr;
3594 if (name && geteuid() != 0) {
3595 file = fopen(name, "w");
3596 if (!file)
3597 file = stderr;
3598 }
3599
3600 if (cfg) {
3601 calculate_register_pressure();
3602 int ip = 0, max_pressure = 0;
3603 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3604 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3605 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3606 dump_instruction(inst, file);
3607 ip++;
3608 }
3609 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3610 } else {
3611 int ip = 0;
3612 foreach_in_list(backend_instruction, inst, &instructions) {
3613 fprintf(file, "%4d: ", ip++);
3614 dump_instruction(inst, file);
3615 }
3616 }
3617
3618 if (file != stderr) {
3619 fclose(file);
3620 }
3621 }
3622
3623 void
3624 fs_visitor::dump_instruction(backend_instruction *be_inst)
3625 {
3626 dump_instruction(be_inst, stderr);
3627 }
3628
3629 void
3630 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3631 {
3632 fs_inst *inst = (fs_inst *)be_inst;
3633
3634 if (inst->predicate) {
3635 fprintf(file, "(%cf0.%d) ",
3636 inst->predicate_inverse ? '-' : '+',
3637 inst->flag_subreg);
3638 }
3639
3640 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3641 if (inst->saturate)
3642 fprintf(file, ".sat");
3643 if (inst->conditional_mod) {
3644 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3645 if (!inst->predicate &&
3646 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3647 inst->opcode != BRW_OPCODE_IF &&
3648 inst->opcode != BRW_OPCODE_WHILE))) {
3649 fprintf(file, ".f0.%d", inst->flag_subreg);
3650 }
3651 }
3652 fprintf(file, "(%d) ", inst->exec_size);
3653
3654 if (inst->mlen) {
3655 fprintf(file, "(mlen: %d) ", inst->mlen);
3656 }
3657
3658 switch (inst->dst.file) {
3659 case GRF:
3660 fprintf(file, "vgrf%d", inst->dst.reg);
3661 if (inst->dst.width != dispatch_width)
3662 fprintf(file, "@%d", inst->dst.width);
3663 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3664 inst->dst.subreg_offset)
3665 fprintf(file, "+%d.%d",
3666 inst->dst.reg_offset, inst->dst.subreg_offset);
3667 break;
3668 case MRF:
3669 fprintf(file, "m%d", inst->dst.reg);
3670 break;
3671 case BAD_FILE:
3672 fprintf(file, "(null)");
3673 break;
3674 case UNIFORM:
3675 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3676 break;
3677 case ATTR:
3678 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3679 break;
3680 case HW_REG:
3681 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3682 switch (inst->dst.fixed_hw_reg.nr) {
3683 case BRW_ARF_NULL:
3684 fprintf(file, "null");
3685 break;
3686 case BRW_ARF_ADDRESS:
3687 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3688 break;
3689 case BRW_ARF_ACCUMULATOR:
3690 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3691 break;
3692 case BRW_ARF_FLAG:
3693 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3694 inst->dst.fixed_hw_reg.subnr);
3695 break;
3696 default:
3697 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3698 inst->dst.fixed_hw_reg.subnr);
3699 break;
3700 }
3701 } else {
3702 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3703 }
3704 if (inst->dst.fixed_hw_reg.subnr)
3705 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3706 break;
3707 default:
3708 fprintf(file, "???");
3709 break;
3710 }
3711 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3712
3713 for (int i = 0; i < inst->sources; i++) {
3714 if (inst->src[i].negate)
3715 fprintf(file, "-");
3716 if (inst->src[i].abs)
3717 fprintf(file, "|");
3718 switch (inst->src[i].file) {
3719 case GRF:
3720 fprintf(file, "vgrf%d", inst->src[i].reg);
3721 if (inst->src[i].width != dispatch_width)
3722 fprintf(file, "@%d", inst->src[i].width);
3723 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3724 inst->src[i].subreg_offset)
3725 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3726 inst->src[i].subreg_offset);
3727 break;
3728 case MRF:
3729 fprintf(file, "***m%d***", inst->src[i].reg);
3730 break;
3731 case ATTR:
3732 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3733 break;
3734 case UNIFORM:
3735 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3736 if (inst->src[i].reladdr) {
3737 fprintf(file, "+reladdr");
3738 } else if (inst->src[i].subreg_offset) {
3739 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3740 inst->src[i].subreg_offset);
3741 }
3742 break;
3743 case BAD_FILE:
3744 fprintf(file, "(null)");
3745 break;
3746 case IMM:
3747 switch (inst->src[i].type) {
3748 case BRW_REGISTER_TYPE_F:
3749 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3750 break;
3751 case BRW_REGISTER_TYPE_W:
3752 case BRW_REGISTER_TYPE_D:
3753 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3754 break;
3755 case BRW_REGISTER_TYPE_UW:
3756 case BRW_REGISTER_TYPE_UD:
3757 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3758 break;
3759 case BRW_REGISTER_TYPE_VF:
3760 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3761 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3762 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3763 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3764 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3765 break;
3766 default:
3767 fprintf(file, "???");
3768 break;
3769 }
3770 break;
3771 case HW_REG:
3772 if (inst->src[i].fixed_hw_reg.negate)
3773 fprintf(file, "-");
3774 if (inst->src[i].fixed_hw_reg.abs)
3775 fprintf(file, "|");
3776 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3777 switch (inst->src[i].fixed_hw_reg.nr) {
3778 case BRW_ARF_NULL:
3779 fprintf(file, "null");
3780 break;
3781 case BRW_ARF_ADDRESS:
3782 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3783 break;
3784 case BRW_ARF_ACCUMULATOR:
3785 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3786 break;
3787 case BRW_ARF_FLAG:
3788 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3789 inst->src[i].fixed_hw_reg.subnr);
3790 break;
3791 default:
3792 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3793 inst->src[i].fixed_hw_reg.subnr);
3794 break;
3795 }
3796 } else {
3797 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3798 }
3799 if (inst->src[i].fixed_hw_reg.subnr)
3800 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3801 if (inst->src[i].fixed_hw_reg.abs)
3802 fprintf(file, "|");
3803 break;
3804 default:
3805 fprintf(file, "???");
3806 break;
3807 }
3808 if (inst->src[i].abs)
3809 fprintf(file, "|");
3810
3811 if (inst->src[i].file != IMM) {
3812 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3813 }
3814
3815 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3816 fprintf(file, ", ");
3817 }
3818
3819 fprintf(file, " ");
3820
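   /* An 8-wide instruction in a SIMD16 program only covers half of the
    * dispatched channels, so note which half it executes on.
    */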
3821 if (dispatch_width == 16 && inst->exec_size == 8) {
3822 if (inst->force_sechalf)
3823 fprintf(file, "2ndhalf ");
3824 else
3825 fprintf(file, "1sthalf ");
3826 }
3827
3828 fprintf(file, "\n");
3829 }
3830
3831 /**
3832 * Possibly returns an instruction that set up @param reg.
3833 *
3834 * Sometimes we want to take the result of some expression/variable
3835 * dereference tree and rewrite the instruction generating the result
3836 * of the tree. When processing the tree, we know that the
3837 * instructions generated are all writing temporaries that are dead
3838 * outside of this tree. So, if we have some instructions that write
3839 * a temporary, we're free to point that temp write somewhere else.
3840 *
3841 * Note that this doesn't guarantee that the instruction generated
3842 * only reg -- it might be the size=4 destination of a texture instruction.
3843 */
3844 fs_inst *
3845 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3846 fs_inst *end,
3847 const fs_reg &reg)
3848 {
3849 if (end == start ||
3850 end->is_partial_write() ||
3851 reg.reladdr ||
3852 !reg.equals(end->dst)) {
3853 return NULL;
3854 } else {
3855 return end;
3856 }
3857 }
3858
3859 void
3860 fs_visitor::setup_payload_gen6()
3861 {
3862 bool uses_depth =
3863 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3864 unsigned barycentric_interp_modes =
3865 (stage == MESA_SHADER_FRAGMENT) ?
3866 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3867
3868 assert(devinfo->gen >= 6);
3869
3870 /* R0-1: masks, pixel X/Y coordinates. */
3871 payload.num_regs = 2;
3872    /* R2: only for 32-pixel dispatch. */
3873
3874 /* R3-26: barycentric interpolation coordinates. These appear in the
3875 * same order that they appear in the brw_wm_barycentric_interp_mode
3876 * enum. Each set of coordinates occupies 2 registers if dispatch width
3877 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3878 * appear if they were enabled using the "Barycentric Interpolation
3879 * Mode" bits in WM_STATE.
3880 */
3881 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3882 if (barycentric_interp_modes & (1 << i)) {
3883 payload.barycentric_coord_reg[i] = payload.num_regs;
3884 payload.num_regs += 2;
3885 if (dispatch_width == 16) {
3886 payload.num_regs += 2;
3887 }
3888 }
3889 }
3890
3891 /* R27: interpolated depth if uses source depth */
3892 if (uses_depth) {
3893 payload.source_depth_reg = payload.num_regs;
3894 payload.num_regs++;
3895 if (dispatch_width == 16) {
3896 /* R28: interpolated depth if not SIMD8. */
3897 payload.num_regs++;
3898 }
3899 }
3900 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3901 if (uses_depth) {
3902 payload.source_w_reg = payload.num_regs;
3903 payload.num_regs++;
3904 if (dispatch_width == 16) {
3905 /* R30: interpolated W if not SIMD8. */
3906 payload.num_regs++;
3907 }
3908 }
3909
3910 if (stage == MESA_SHADER_FRAGMENT) {
3911 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3912 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3913 prog_data->uses_pos_offset = key->compute_pos_offset;
3914 /* R31: MSAA position offsets. */
3915 if (prog_data->uses_pos_offset) {
3916 payload.sample_pos_reg = payload.num_regs;
3917 payload.num_regs++;
3918 }
3919 }
3920
3921 /* R32: MSAA input coverage mask */
3922 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3923 assert(devinfo->gen >= 7);
3924 payload.sample_mask_in_reg = payload.num_regs;
3925 payload.num_regs++;
3926 if (dispatch_width == 16) {
3927 /* R33: input coverage mask if not SIMD8. */
3928 payload.num_regs++;
3929 }
3930 }
3931
3932 /* R34-: bary for 32-pixel. */
3933 /* R58-59: interp W for 32-pixel. */
3934
3935 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3936 source_depth_to_render_target = true;
3937 }
3938 }
3939
3940 void
3941 fs_visitor::setup_vs_payload()
3942 {
3943 /* R0: thread header, R1: urb handles */
3944 payload.num_regs = 2;
3945 }
3946
3947 void
3948 fs_visitor::setup_cs_payload()
3949 {
3950 assert(brw->gen >= 7);
3951
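   /* R0: CS thread payload header. */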
3952 payload.num_regs = 1;
3953 }
3954
3955 void
3956 fs_visitor::assign_binding_table_offsets()
3957 {
3958 assert(stage == MESA_SHADER_FRAGMENT);
3959 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3960 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3961 uint32_t next_binding_table_offset = 0;
3962
3963 /* If there are no color regions, we still perform an FB write to a null
3964 * renderbuffer, which we place at surface index 0.
3965 */
3966 prog_data->binding_table.render_target_start = next_binding_table_offset;
3967 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3968
3969 assign_common_binding_table_offsets(next_binding_table_offset);
3970 }
3971
3972 void
3973 fs_visitor::calculate_register_pressure()
3974 {
3975 invalidate_live_intervals();
3976 calculate_live_intervals();
3977
3978 unsigned num_instructions = 0;
3979 foreach_block(block, cfg)
3980 num_instructions += block->instructions.length();
3981
3982 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3983
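   /* Each VGRF contributes its size in registers to the pressure at every
    * instruction within its live range.
    */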
3984 for (unsigned reg = 0; reg < alloc.count; reg++) {
3985 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3986 regs_live_at_ip[ip] += alloc.sizes[reg];
3987 }
3988 }
3989
3990 void
3991 fs_visitor::optimize()
3992 {
3993 /* bld is the common builder object pointing at the end of the program we
3994 * used to translate it into i965 IR. For the optimization and lowering
3995 * passes coming next, any code added after the end of the program without
3996 * having explicitly called fs_builder::at() clearly points at a mistake.
3997 * Ideally optimization passes wouldn't be part of the visitor so they
3998     * wouldn't have access to bld at all, but they do, so, just in case some
3999     * pass forgets to ask for a location explicitly, set it to NULL here to
4000 * make it trip.
4001 */
4002 bld = bld.at(NULL, NULL);
4003
4004 split_virtual_grfs();
4005
4006 move_uniform_array_access_to_pull_constants();
4007 assign_constant_locations();
4008 demote_pull_constants();
4009
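/* Run a single optimization pass: bump the pass counter, accumulate its
 * progress and, when DEBUG_OPTIMIZER is set and the pass changed anything,
 * dump the resulting instructions to a file whose name includes the pass.
 * The macro evaluates to whether this particular pass made progress.
 */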
4010 #define OPT(pass, args...) ({ \
4011 pass_num++; \
4012 bool this_progress = pass(args); \
4013 \
4014 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4015 char filename[64]; \
4016 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4017 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4018 \
4019 backend_shader::dump_instructions(filename); \
4020 } \
4021 \
4022 progress = progress || this_progress; \
4023 this_progress; \
4024 })
4025
4026 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4027 char filename[64];
4028 snprintf(filename, 64, "%s%d-%04d-00-start",
4029 stage_abbrev, dispatch_width,
4030 shader_prog ? shader_prog->Name : 0);
4031
4032 backend_shader::dump_instructions(filename);
4033 }
4034
4035 bool progress;
4036 int iteration = 0;
4037 int pass_num = 0;
4038 do {
4039 progress = false;
4040 pass_num = 0;
4041 iteration++;
4042
4043 OPT(remove_duplicate_mrf_writes);
4044
4045 OPT(opt_algebraic);
4046 OPT(opt_cse);
4047 OPT(opt_copy_propagate);
4048 OPT(opt_peephole_predicated_break);
4049 OPT(opt_cmod_propagation);
4050 OPT(dead_code_eliminate);
4051 OPT(opt_peephole_sel);
4052 OPT(dead_control_flow_eliminate, this);
4053 OPT(opt_register_renaming);
4054 OPT(opt_redundant_discard_jumps);
4055 OPT(opt_saturate_propagation);
4056 OPT(opt_zero_samples);
4057 OPT(register_coalesce);
4058 OPT(compute_to_mrf);
4059 OPT(eliminate_find_live_channel);
4060
4061 OPT(compact_virtual_grfs);
4062 } while (progress);
4063
4064 pass_num = 0;
4065
4066 OPT(opt_sampler_eot);
4067
4068 if (OPT(lower_load_payload)) {
4069 split_virtual_grfs();
4070 OPT(register_coalesce);
4071 OPT(compute_to_mrf);
4072 OPT(dead_code_eliminate);
4073 }
4074
4075 OPT(opt_combine_constants);
4076 OPT(lower_integer_multiplication);
4077
4078 lower_uniform_pull_constant_loads();
4079 }
4080
4081 /**
4082  * A three-source instruction must have a GRF/MRF destination register.
4083 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4084 */
4085 void
4086 fs_visitor::fixup_3src_null_dest()
4087 {
4088 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4089 if (inst->is_3src() && inst->dst.is_null()) {
4090 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4091 inst->dst.type);
4092 }
4093 }
4094 }
4095
4096 void
4097 fs_visitor::allocate_registers()
4098 {
4099 bool allocated_without_spills;
4100
4101 static const enum instruction_scheduler_mode pre_modes[] = {
4102 SCHEDULE_PRE,
4103 SCHEDULE_PRE_NON_LIFO,
4104 SCHEDULE_PRE_LIFO,
4105 };
4106
4107 /* Try each scheduling heuristic to see if it can successfully register
4108 * allocate without spilling. They should be ordered by decreasing
4109 * performance but increasing likelihood of allocating.
4110 */
4111 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4112 schedule_instructions(pre_modes[i]);
4113
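      /* Debugging aid: flip this to use the trivial allocator instead of
       * the real register allocator.
       */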
4114 if (0) {
4115 assign_regs_trivial();
4116 allocated_without_spills = true;
4117 } else {
4118 allocated_without_spills = assign_regs(false);
4119 }
4120 if (allocated_without_spills)
4121 break;
4122 }
4123
4124 if (!allocated_without_spills) {
4125 /* We assume that any spilling is worse than just dropping back to
4126 * SIMD8. There's probably actually some intermediate point where
4127 * SIMD16 with a couple of spills is still better.
4128 */
4129 if (dispatch_width == 16) {
4130 fail("Failure to register allocate. Reduce number of "
4131 "live scalar values to avoid this.");
4132 } else {
4133 perf_debug("%s shader triggered register spilling. "
4134 "Try reducing the number of live scalar values to "
4135 "improve performance.\n", stage_name);
4136 }
4137
4138 /* Since we're out of heuristics, just go spill registers until we
4139 * get an allocation.
4140 */
4141 while (!assign_regs(true)) {
4142 if (failed)
4143 break;
4144 }
4145 }
4146
4147 /* This must come after all optimization and register allocation, since
4148 * it inserts dead code that happens to have side effects, and it does
4149 * so based on the actual physical registers in use.
4150 */
4151 insert_gen4_send_dependency_workarounds();
4152
4153 if (failed)
4154 return;
4155
4156 if (!allocated_without_spills)
4157 schedule_instructions(SCHEDULE_POST);
4158
4159 if (last_scratch > 0)
4160 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4161 }
4162
4163 bool
4164 fs_visitor::run_vs()
4165 {
4166 assert(stage == MESA_SHADER_VERTEX);
4167
4168 assign_common_binding_table_offsets(0);
4169 setup_vs_payload();
4170
4171 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4172 emit_shader_time_begin();
4173
4174 emit_nir_code();
4175
4176 if (failed)
4177 return false;
4178
4179 emit_urb_writes();
4180
4181 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4182 emit_shader_time_end();
4183
4184 calculate_cfg();
4185
4186 optimize();
4187
4188 assign_curb_setup();
4189 assign_vs_urb_setup();
4190
4191 fixup_3src_null_dest();
4192 allocate_registers();
4193
4194 return !failed;
4195 }
4196
4197 bool
4198 fs_visitor::run_fs()
4199 {
4200 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4201 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4202
4203 assert(stage == MESA_SHADER_FRAGMENT);
4204
4205 sanity_param_count = prog->Parameters->NumParameters;
4206
4207 assign_binding_table_offsets();
4208
4209 if (devinfo->gen >= 6)
4210 setup_payload_gen6();
4211 else
4212 setup_payload_gen4();
4213
4214 if (0) {
4215 emit_dummy_fs();
4216 } else if (brw->use_rep_send && dispatch_width == 16) {
4217 emit_repclear_shader();
4218 } else {
4219 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4220 emit_shader_time_begin();
4221
4222 calculate_urb_setup();
4223 if (prog->InputsRead > 0) {
4224 if (devinfo->gen < 6)
4225 emit_interpolation_setup_gen4();
4226 else
4227 emit_interpolation_setup_gen6();
4228 }
4229
4230 /* We handle discards by keeping track of the still-live pixels in f0.1.
4231 * Initialize it with the dispatched pixels.
4232 */
4233 if (wm_prog_data->uses_kill) {
4234 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4235 discard_init->flag_subreg = 1;
4236 }
4237
4238 /* Generate FS IR for main(). (the visitor only descends into
4239 * functions called "main").
4240 */
4241 emit_nir_code();
4242
4243 if (failed)
4244 return false;
4245
4246 if (wm_prog_data->uses_kill)
4247 emit(FS_OPCODE_PLACEHOLDER_HALT);
4248
4249 if (wm_key->alpha_test_func)
4250 emit_alpha_test();
4251
4252 emit_fb_writes();
4253
4254 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4255 emit_shader_time_end();
4256
4257 calculate_cfg();
4258
4259 optimize();
4260
4261 assign_curb_setup();
4262 assign_urb_setup();
4263
4264 fixup_3src_null_dest();
4265 allocate_registers();
4266
4267 if (failed)
4268 return false;
4269 }
4270
4271 if (dispatch_width == 8)
4272 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4273 else
4274 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4275
4276 /* If any state parameters were appended, then ParameterValues could have
4277 * been realloced, in which case the driver uniform storage set up by
4278 * _mesa_associate_uniform_storage() would point to freed memory. Make
4279 * sure that didn't happen.
4280 */
4281 assert(sanity_param_count == prog->Parameters->NumParameters);
4282
4283 return !failed;
4284 }
4285
4286 bool
4287 fs_visitor::run_cs()
4288 {
4289 assert(stage == MESA_SHADER_COMPUTE);
4290 assert(shader);
4291
4292 sanity_param_count = prog->Parameters->NumParameters;
4293
4294 assign_common_binding_table_offsets(0);
4295
4296 setup_cs_payload();
4297
4298 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4299 emit_shader_time_begin();
4300
4301 emit_nir_code();
4302
4303 if (failed)
4304 return false;
4305
4306 emit_cs_terminate();
4307
4308 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4309 emit_shader_time_end();
4310
4311 calculate_cfg();
4312
4313 optimize();
4314
4315 assign_curb_setup();
4316
4317 fixup_3src_null_dest();
4318 allocate_registers();
4319
4320 if (failed)
4321 return false;
4322
4323 /* If any state parameters were appended, then ParameterValues could have
4324 * been realloced, in which case the driver uniform storage set up by
4325 * _mesa_associate_uniform_storage() would point to freed memory. Make
4326 * sure that didn't happen.
4327 */
4328 assert(sanity_param_count == prog->Parameters->NumParameters);
4329
4330 return !failed;
4331 }
4332
4333 const unsigned *
4334 brw_wm_fs_emit(struct brw_context *brw,
4335 void *mem_ctx,
4336 const struct brw_wm_prog_key *key,
4337 struct brw_wm_prog_data *prog_data,
4338 struct gl_fragment_program *fp,
4339 struct gl_shader_program *prog,
4340 unsigned *final_assembly_size)
4341 {
4342 bool start_busy = false;
4343 double start_time = 0;
4344
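   /* Record whether the previous batch was still busy on the GPU so we can
    * tell below whether this compile stalled it.
    */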
4345 if (unlikely(brw->perf_debug)) {
4346 start_busy = (brw->batch.last_bo &&
4347 drm_intel_bo_busy(brw->batch.last_bo));
4348 start_time = get_time();
4349 }
4350
4351 struct brw_shader *shader = NULL;
4352 if (prog)
4353 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4354
4355 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4356 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4357
4358 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4359 */
4360 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4361 prog, &fp->Base, 8);
4362 if (!v.run_fs()) {
4363 if (prog) {
4364 prog->LinkStatus = false;
4365 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4366 }
4367
4368 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4369 v.fail_msg);
4370
4371 return NULL;
4372 }
4373
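   /* Additionally try a SIMD16 compile.  If it succeeds, keep its CFG so the
    * generator can emit it alongside (or instead of) the SIMD8 variant.
    */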
4374 cfg_t *simd16_cfg = NULL;
4375 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4376 prog, &fp->Base, 16);
4377 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4378 if (!v.simd16_unsupported) {
4379 /* Try a SIMD16 compile */
4380 v2.import_uniforms(&v);
4381 if (!v2.run_fs()) {
4382 perf_debug("SIMD16 shader failed to compile, falling back to "
4383 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4384 } else {
4385 simd16_cfg = v2.cfg;
4386 }
4387 } else {
4388 perf_debug("SIMD16 shader unsupported, falling back to "
4389 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4390 }
4391 }
4392
4393 cfg_t *simd8_cfg;
4394 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4395 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4396 simd8_cfg = NULL;
4397 prog_data->no_8 = true;
4398 } else {
4399 simd8_cfg = v.cfg;
4400 prog_data->no_8 = false;
4401 }
4402
4403 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4404 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4405
4406 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4407 char *name;
4408 if (prog)
4409 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4410 prog->Label ? prog->Label : "unnamed",
4411 prog->Name);
4412 else
4413 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4414
4415 g.enable_debug(name);
4416 }
4417
4418 if (simd8_cfg)
4419 g.generate_code(simd8_cfg, 8);
4420 if (simd16_cfg)
4421 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4422
4423 if (unlikely(brw->perf_debug) && shader) {
4424 if (shader->compiled_once)
4425 brw_wm_debug_recompile(brw, prog, key);
4426 shader->compiled_once = true;
4427
4428 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4429 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4430 (get_time() - start_time) * 1000);
4431 }
4432 }
4433
4434 return g.get_assembly(final_assembly_size);
4435 }
4436
4437 extern "C" bool
4438 brw_fs_precompile(struct gl_context *ctx,
4439 struct gl_shader_program *shader_prog,
4440 struct gl_program *prog)
4441 {
4442 struct brw_context *brw = brw_context(ctx);
4443 struct brw_wm_prog_key key;
4444
4445 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4446 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4447 bool program_uses_dfdy = fp->UsesDFdy;
4448
4449 memset(&key, 0, sizeof(key));
4450
4451 if (brw->gen < 6) {
4452 if (fp->UsesKill)
4453 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4454
4455 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4456 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4457
4458 /* Just assume depth testing. */
4459 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4460 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4461 }
4462
4463 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4464 BRW_FS_VARYING_INPUT_MASK) > 16)
4465 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4466
4467 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4468
4469 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4470 key.drawable_height = ctx->DrawBuffer->Height;
4471 }
4472
4473 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4474 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4475 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4476
4477 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4478 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4479 key.nr_color_regions > 1;
4480 }
4481
4482 key.program_string_id = bfp->id;
4483
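   /* Save the currently bound WM program state and restore it afterwards so
    * the precompile leaves it untouched.
    */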
4484 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4485 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4486
4487 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4488
4489 brw->wm.base.prog_offset = old_prog_offset;
4490 brw->wm.prog_data = old_prog_data;
4491
4492 return success;
4493 }
4494
4495 void
4496 brw_setup_tex_for_precompile(struct brw_context *brw,
4497 struct brw_sampler_prog_key_data *tex,
4498 struct gl_program *prog)
4499 {
4500 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4501 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4502 for (unsigned i = 0; i < sampler_count; i++) {
4503 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4504 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4505 tex->swizzles[i] =
4506 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4507 } else {
4508 /* Color sampler: assume no swizzling. */
4509 tex->swizzles[i] = SWIZZLE_XYZW;
4510 }
4511 }
4512 }