[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp (blob 9fca9914f2c7956a853b845ab500b63a2d781156)
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
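/* The ALU* macros below stamp out the fs_visitor helpers (MOV, ADD, MAD and
 * friends): each one simply allocates a new fs_inst for the matching BRW
 * opcode out of mem_ctx, and ALU2_ACC additionally marks the instruction as
 * writing the accumulator.
 */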
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
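/* Gathers 'sources' registers into one contiguous payload at dst. The first
 * header_size sources each occupy a single full register; every remaining
 * source is dst.width channels wide, which is what the regs_written
 * accounting below reflects.
 */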
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
427 /* The caller always wants this MOV uncompressed, both to emit the minimal
428 * extra dependencies and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
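/* Returns true if this LOAD_PAYLOAD is nothing more than a whole-register
 * copy of a single virtual GRF: the sources must line up register by
 * register starting at src[0], and the copied range must cover the entire
 * allocation of that GRF.
 */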
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
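/* Restricts the register to a single component: points subreg_offset at the
 * requested element and sets stride to 0 so that every channel reads the
 * same value.
 */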
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
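/* Returns the size of a GLSL type measured in scalar slots, the unit the
 * visitor uses when allocating virtual GRFs and uniform storage.
 */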
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
794 /* If there were no instructions between the two timestamp gets, the diff
795 * is 2 cycles. Remove that overhead, so I can forget about that when
796 * trying to determine the time taken for single instructions.
797 */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
927 /**
928 * Returns true if the instruction has a flag that means it won't
929 * update an entire destination register.
930 *
931 * For example, dead code elimination and live variable analysis want to know
932 * when a write to a variable screens off any preceding values that were in
933 * it.
934 */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
942
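/* Returns how many hardware registers source 'arg' of this instruction
 * reads. For the send-like opcodes handled below the payload source spans
 * 'mlen' registers; otherwise the count is derived from the source's width,
 * stride and type size.
 */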
943 int
944 fs_inst::regs_read(int arg) const
945 {
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return inst->mlen;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
1132 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1133 * This brings in those uniform definitions.
1134 */
1135 void
1136 fs_visitor::import_uniforms(fs_visitor *v)
1137 {
1138 this->push_constant_loc = v->push_constant_loc;
1139 this->pull_constant_loc = v->pull_constant_loc;
1140 this->uniforms = v->uniforms;
1141 this->param_size = v->param_size;
1142 }
1143
1144 fs_reg *
1145 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1146 bool origin_upper_left)
1147 {
1148 assert(stage == MESA_SHADER_FRAGMENT);
1149 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1150 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1151 fs_reg wpos = *reg;
1152 bool flip = !origin_upper_left ^ key->render_to_fbo;
1153
1154 /* gl_FragCoord.x */
1155 if (pixel_center_integer) {
1156 emit(MOV(wpos, this->pixel_x));
1157 } else {
1158 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1159 }
1160 wpos = offset(wpos, 1);
1161
1162 /* gl_FragCoord.y */
1163 if (!flip && pixel_center_integer) {
1164 emit(MOV(wpos, this->pixel_y));
1165 } else {
1166 fs_reg pixel_y = this->pixel_y;
1167 float offset = (pixel_center_integer ? 0.0 : 0.5);
1168
1169 if (flip) {
1170 pixel_y.negate = true;
1171 offset += key->drawable_height - 1.0;
1172 }
1173
1174 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1175 }
1176 wpos = offset(wpos, 1);
1177
1178 /* gl_FragCoord.z */
1179 if (devinfo->gen >= 6) {
1180 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1181 } else {
1182 emit(FS_OPCODE_LINTERP, wpos,
1183 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1184 interp_reg(VARYING_SLOT_POS, 2));
1185 }
1186 wpos = offset(wpos, 1);
1187
1188 /* gl_FragCoord.w: Already set up in emit_interpolation */
1189 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1190
1191 return reg;
1192 }
1193
1194 fs_inst *
1195 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1196 glsl_interp_qualifier interpolation_mode,
1197 bool is_centroid, bool is_sample)
1198 {
1199 brw_wm_barycentric_interp_mode barycoord_mode;
1200 if (devinfo->gen >= 6) {
1201 if (is_centroid) {
1202 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1203 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1204 else
1205 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1206 } else if (is_sample) {
1207 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1208 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1209 else
1210 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1211 } else {
1212 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1213 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1214 else
1215 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1216 }
1217 } else {
1218 /* On Ironlake and below, there is only one interpolation mode.
1219 * Centroid interpolation doesn't mean anything on this hardware --
1220 * there is no multisampling.
1221 */
1222 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1223 }
1224 return emit(FS_OPCODE_LINTERP, attr,
1225 this->delta_xy[barycoord_mode], interp);
1226 }
1227
1228 void
1229 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1230 const glsl_type *type,
1231 glsl_interp_qualifier interpolation_mode,
1232 int location, bool mod_centroid,
1233 bool mod_sample)
1234 {
1235 attr.type = brw_type_for_base_type(type->get_scalar_type());
1236
1237 assert(stage == MESA_SHADER_FRAGMENT);
1238 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1239 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1240
1241 unsigned int array_elements;
1242
1243 if (type->is_array()) {
1244 array_elements = type->length;
1245 if (array_elements == 0) {
1246 fail("dereferenced array '%s' has length 0\n", name);
1247 }
1248 type = type->fields.array;
1249 } else {
1250 array_elements = 1;
1251 }
1252
1253 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1254 bool is_gl_Color =
1255 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1256 if (key->flat_shade && is_gl_Color) {
1257 interpolation_mode = INTERP_QUALIFIER_FLAT;
1258 } else {
1259 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1260 }
1261 }
1262
1263 for (unsigned int i = 0; i < array_elements; i++) {
1264 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1265 if (prog_data->urb_setup[location] == -1) {
1266 /* If there's no incoming setup data for this slot, don't
1267 * emit interpolation for it.
1268 */
1269 attr = offset(attr, type->vector_elements);
1270 location++;
1271 continue;
1272 }
1273
1274 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1275 /* Constant interpolation (flat shading) case. The SF has
1276 * handed us defined values in only the constant offset
1277 * field of the setup reg.
1278 */
1279 for (unsigned int k = 0; k < type->vector_elements; k++) {
1280 struct brw_reg interp = interp_reg(location, k);
1281 interp = suboffset(interp, 3);
1282 interp.type = attr.type;
1283 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1284 attr = offset(attr, 1);
1285 }
1286 } else {
1287 /* Smooth/noperspective interpolation case. */
1288 for (unsigned int k = 0; k < type->vector_elements; k++) {
1289 struct brw_reg interp = interp_reg(location, k);
1290 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1291 /* Get the pixel/sample mask into f0 so that we know
1292 * which pixels are lit. Then, for each channel that is
1293 * unlit, replace the centroid data with non-centroid
1294 * data.
1295 */
1296 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1297
1298 fs_inst *inst;
1299 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1300 false, false);
1301 inst->predicate = BRW_PREDICATE_NORMAL;
1302 inst->predicate_inverse = true;
1303 if (devinfo->has_pln)
1304 inst->no_dd_clear = true;
1305
1306 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1307 mod_centroid && !key->persample_shading,
1308 mod_sample || key->persample_shading);
1309 inst->predicate = BRW_PREDICATE_NORMAL;
1310 inst->predicate_inverse = false;
1311 if (devinfo->has_pln)
1312 inst->no_dd_check = true;
1313
1314 } else {
1315 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1316 mod_centroid && !key->persample_shading,
1317 mod_sample || key->persample_shading);
1318 }
1319 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1320 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1321 }
1322 attr = offset(attr, 1);
1323 }
1324
1325 }
1326 location++;
1327 }
1328 }
1329 }
1330
1331 fs_reg *
1332 fs_visitor::emit_frontfacing_interpolation()
1333 {
1334 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1335
1336 if (devinfo->gen >= 6) {
1337 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1338 * a boolean result from this (~0/true or 0/false).
1339 *
1340 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1341 * this task in only one instruction:
1342 * - a negation source modifier will flip the bit; and
1343 * - a W -> D type conversion will sign extend the bit into the high
1344 * word of the destination.
1345 *
1346 * An ASR 15 fills the low word of the destination.
1347 */
1348 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1349 g0.negate = true;
1350
1351 emit(ASR(*reg, g0, fs_reg(15)));
1352 } else {
1353 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1354 * a boolean result from this (1/true or 0/false).
1355 *
1356 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1357 * the negation source modifier to flip it. Unfortunately the SHR
1358 * instruction only operates on UD (or D with an abs source modifier)
1359 * sources without negation.
1360 *
1361 * Instead, use ASR (which will give ~0/true or 0/false).
1362 */
1363 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1364 g1_6.negate = true;
1365
1366 emit(ASR(*reg, g1_6, fs_reg(31)));
1367 }
1368
1369 return reg;
1370 }
1371
1372 void
1373 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1374 {
1375 assert(stage == MESA_SHADER_FRAGMENT);
1376 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1377 assert(dst.type == BRW_REGISTER_TYPE_F);
1378
1379 if (key->compute_pos_offset) {
1380 /* Convert int_sample_pos to floating point */
1381 emit(MOV(dst, int_sample_pos));
1382 /* Scale to the range [0, 1] */
1383 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1384 }
1385 else {
1386 /* From ARB_sample_shading specification:
1387 * "When rendering to a non-multisample buffer, or if multisample
1388 * rasterization is disabled, gl_SamplePosition will always be
1389 * (0.5, 0.5)."
1390 */
1391 emit(MOV(dst, fs_reg(0.5f)));
1392 }
1393 }
1394
1395 fs_reg *
1396 fs_visitor::emit_samplepos_setup()
1397 {
1398 assert(devinfo->gen >= 6);
1399
1400 this->current_annotation = "compute sample position";
1401 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1402 fs_reg pos = *reg;
1403 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1404 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1405
1406 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1407 * mode will be enabled.
1408 *
1409 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1410 * R31.1:0 Position Offset X/Y for Slot[3:0]
1411 * R31.3:2 Position Offset X/Y for Slot[7:4]
1412 * .....
1413 *
1414 * The X, Y sample positions come in as bytes in thread payload. So, read
1415 * the positions using vstride=16, width=8, hstride=2.
1416 */
1417 struct brw_reg sample_pos_reg =
1418 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1419 BRW_REGISTER_TYPE_B), 16, 8, 2);
1420
1421 if (dispatch_width == 8) {
1422 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1423 } else {
1424 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1425 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1426 ->force_sechalf = true;
1427 }
1428 /* Compute gl_SamplePosition.x */
1429 compute_sample_position(pos, int_sample_x);
1430 pos = offset(pos, 1);
1431 if (dispatch_width == 8) {
1432 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1433 } else {
1434 emit(MOV(half(int_sample_y, 0),
1435 fs_reg(suboffset(sample_pos_reg, 1))));
1436 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1437 ->force_sechalf = true;
1438 }
1439 /* Compute gl_SamplePosition.y */
1440 compute_sample_position(pos, int_sample_y);
1441 return reg;
1442 }
1443
1444 fs_reg *
1445 fs_visitor::emit_sampleid_setup()
1446 {
1447 assert(stage == MESA_SHADER_FRAGMENT);
1448 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1449 assert(devinfo->gen >= 6);
1450
1451 this->current_annotation = "compute sample id";
1452 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1453
1454 if (key->compute_sample_id) {
1455 fs_reg t1 = vgrf(glsl_type::int_type);
1456 fs_reg t2 = vgrf(glsl_type::int_type);
1457 t2.type = BRW_REGISTER_TYPE_UW;
1458
1459 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1460 * 8x multisampling, subspan 0 will represent sample N (where N
1461 * is 0, 2, 4 or 6), subspan 1 will represent sample N + 1 (1, 3, 5 or
1462 * 7). We can find the value of N by looking at R0.0 bits 7:6
1463 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1464 * (since samples are always delivered in pairs). That is, we
1465 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1466 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1467 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1468 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1469 * populating a temporary variable with the sequence (0, 1, 2, 3),
1470 * and then reading from it using vstride=1, width=4, hstride=0.
1471 * These computations hold good for 4x multisampling as well.
1472 *
1473 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1474 * the first four slots are sample 0 of subspan 0; the next four
1475 * are sample 1 of subspan 0; the third group is sample 0 of
1476 * subspan 1, and finally sample 1 of subspan 1.
1477 */
1478 fs_inst *inst;
1479 inst = emit(BRW_OPCODE_AND, t1,
1480 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1481 fs_reg(0xc0));
1482 inst->force_writemask_all = true;
1483 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1484 inst->force_writemask_all = true;
1485 /* This works for both SIMD8 and SIMD16 */
1486 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1487 inst->force_writemask_all = true;
1488 /* This special instruction takes care of setting vstride=1,
1489 * width=4, hstride=0 of t2 during an ADD instruction.
1490 */
1491 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1492 } else {
1493 /* As per GL_ARB_sample_shading specification:
1494 * "When rendering to a non-multisample buffer, or if multisample
1495 * rasterization is disabled, gl_SampleID will always be zero."
1496 */
1497 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1498 }
1499
1500 return reg;
1501 }
1502
1503 void
1504 fs_visitor::resolve_source_modifiers(fs_reg *src)
1505 {
1506 if (!src->abs && !src->negate)
1507 return;
1508
1509 fs_reg temp = retype(vgrf(1), src->type);
1510 emit(MOV(temp, *src));
1511 *src = temp;
1512 }
1513
1514 fs_reg
1515 fs_visitor::fix_math_operand(fs_reg src)
1516 {
1517 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1518 * might be able to do better by doing execsize = 1 math and then
1519 * expanding that result out, but we would need to be careful with
1520 * masking.
1521 *
1522 * The hardware ignores source modifiers (negate and abs) on math
1523 * instructions, so we also move to a temp to set those up.
1524 */
1525 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1526 !src.abs && !src.negate)
1527 return src;
1528
1529 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1530 * operands to math
1531 */
1532 if (devinfo->gen >= 7 && src.file != IMM)
1533 return src;
1534
1535 fs_reg expanded = vgrf(glsl_type::float_type);
1536 expanded.type = src.type;
1537 emit(BRW_OPCODE_MOV, expanded, src);
1538 return expanded;
1539 }
1540
1541 fs_inst *
1542 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1543 {
1544 switch (opcode) {
1545 case SHADER_OPCODE_RCP:
1546 case SHADER_OPCODE_RSQ:
1547 case SHADER_OPCODE_SQRT:
1548 case SHADER_OPCODE_EXP2:
1549 case SHADER_OPCODE_LOG2:
1550 case SHADER_OPCODE_SIN:
1551 case SHADER_OPCODE_COS:
1552 break;
1553 default:
1554 unreachable("not reached: bad math opcode");
1555 }
1556
1557 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1558 * might be able to do better by doing execsize = 1 math and then
1559 * expanding that result out, but we would need to be careful with
1560 * masking.
1561 *
1562 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1563 * instructions, so we also move to a temp to set those up.
1564 */
1565 if (devinfo->gen == 6 || devinfo->gen == 7)
1566 src = fix_math_operand(src);
1567
1568 fs_inst *inst = emit(opcode, dst, src);
1569
1570 if (devinfo->gen < 6) {
1571 inst->base_mrf = 2;
1572 inst->mlen = dispatch_width / 8;
1573 }
1574
1575 return inst;
1576 }
1577
1578 fs_inst *
1579 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1580 {
1581 int base_mrf = 2;
1582 fs_inst *inst;
1583
1584 if (devinfo->gen >= 8) {
1585 inst = emit(opcode, dst, src0, src1);
1586 } else if (devinfo->gen >= 6) {
1587 src0 = fix_math_operand(src0);
1588 src1 = fix_math_operand(src1);
1589
1590 inst = emit(opcode, dst, src0, src1);
1591 } else {
1592 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1593 * "Message Payload":
1594 *
1595 * "Operand0[7]. For the INT DIV functions, this operand is the
1596 * denominator."
1597 * ...
1598 * "Operand1[7]. For the INT DIV functions, this operand is the
1599 * numerator."
1600 */
1601 bool is_int_div = opcode != SHADER_OPCODE_POW;
1602 fs_reg &op0 = is_int_div ? src1 : src0;
1603 fs_reg &op1 = is_int_div ? src0 : src1;
1604
1605 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1606 inst = emit(opcode, dst, op0, reg_null_f);
1607
1608 inst->base_mrf = base_mrf;
1609 inst->mlen = 2 * dispatch_width / 8;
1610 }
1611 return inst;
1612 }
1613
1614 void
1615 fs_visitor::emit_discard_jump()
1616 {
1617 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1618
1619 /* For performance, after a discard, jump to the end of the
1620 * shader if all relevant channels have been discarded.
1621 */
1622 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1623 discard_jump->flag_subreg = 1;
1624
1625 discard_jump->predicate = (dispatch_width == 8)
1626 ? BRW_PREDICATE_ALIGN1_ANY8H
1627 : BRW_PREDICATE_ALIGN1_ANY16H;
1628 discard_jump->predicate_inverse = true;
1629 }
1630
1631 void
1632 fs_visitor::assign_curb_setup()
1633 {
1634 if (dispatch_width == 8) {
1635 prog_data->dispatch_grf_start_reg = payload.num_regs;
1636 } else {
1637 if (stage == MESA_SHADER_FRAGMENT) {
1638 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1639 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1640 } else if (stage == MESA_SHADER_COMPUTE) {
1641 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1642 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1643 } else {
1644 unreachable("Unsupported shader type!");
1645 }
1646 }
1647
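/* Push constants are uploaded in whole registers (8 dwords each), so the
 * CURBE read length is the parameter count rounded up to a multiple of 8
 * and divided by 8.
 */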
1648 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1649
1650 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1651 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1652 for (unsigned int i = 0; i < inst->sources; i++) {
1653 if (inst->src[i].file == UNIFORM) {
1654 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1655 int constant_nr;
1656 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1657 constant_nr = push_constant_loc[uniform_nr];
1658 } else {
1659 /* Section 5.11 of the OpenGL 4.1 spec says:
1660 * "Out-of-bounds reads return undefined values, which include
1661 * values from other variables of the active program or zero."
1662 * Just return the first push constant.
1663 */
1664 constant_nr = 0;
1665 }
1666
1667 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1668 constant_nr / 8,
1669 constant_nr % 8);
1670
1671 inst->src[i].file = HW_REG;
1672 inst->src[i].fixed_hw_reg = byte_offset(
1673 retype(brw_reg, inst->src[i].type),
1674 inst->src[i].subreg_offset);
1675 }
1676 }
1677 }
1678 }
1679
1680 void
1681 fs_visitor::calculate_urb_setup()
1682 {
1683 assert(stage == MESA_SHADER_FRAGMENT);
1684 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1685 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1686
1687 memset(prog_data->urb_setup, -1,
1688 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1689
1690 int urb_next = 0;
1691 /* Figure out where each of the incoming setup attributes lands. */
1692 if (devinfo->gen >= 6) {
1693 if (_mesa_bitcount_64(prog->InputsRead &
1694 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1695 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1696 * first 16 varying inputs, so we can put them wherever we want.
1697 * Just put them in order.
1698 *
1699 * This is useful because it means that (a) inputs not used by the
1700 * fragment shader won't take up valuable register space, and (b) we
1701 * won't have to recompile the fragment shader if it gets paired with
1702 * a different vertex (or geometry) shader.
1703 */
1704 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1705 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1706 BITFIELD64_BIT(i)) {
1707 prog_data->urb_setup[i] = urb_next++;
1708 }
1709 }
1710 } else {
1711 /* We have enough input varyings that the SF/SBE pipeline stage can't
1712 * arbitrarily rearrange them to suit our whim; we have to put them
1713 * in an order that matches the output of the previous pipeline stage
1714 * (geometry or vertex shader).
1715 */
1716 struct brw_vue_map prev_stage_vue_map;
1717 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1718 key->input_slots_valid);
1719 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1720 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1721 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1722 slot++) {
1723 int varying = prev_stage_vue_map.slot_to_varying[slot];
1724 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1725 * unused.
1726 */
1727 if (varying != BRW_VARYING_SLOT_COUNT &&
1728 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1729 BITFIELD64_BIT(varying))) {
1730 prog_data->urb_setup[varying] = slot - first_slot;
1731 }
1732 }
1733 urb_next = prev_stage_vue_map.num_slots - first_slot;
1734 }
1735 } else {
1736 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1737 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1738 /* Point size is packed into the header, not as a general attribute */
1739 if (i == VARYING_SLOT_PSIZ)
1740 continue;
1741
1742 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1743 /* The back color slot is skipped when the front color is
1744 * also written to. In addition, some slots can be
1745 * written in the vertex shader and not read in the
1746 * fragment shader. So the register number must always be
1747 * incremented, mapped or not.
1748 */
1749 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1750 prog_data->urb_setup[i] = urb_next;
1751 urb_next++;
1752 }
1753 }
1754
1755 /*
1756 * It's an FS-only attribute, and we did interpolation for this attribute
1757 * in the SF thread. So count it here, too.
1758 *
1759 * See compile_sf_prog() for more info.
1760 */
1761 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1762 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1763 }
1764
1765 prog_data->num_varying_inputs = urb_next;
1766 }
1767
1768 void
1769 fs_visitor::assign_urb_setup()
1770 {
1771 assert(stage == MESA_SHADER_FRAGMENT);
1772 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1773
1774 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1775
1776 /* Offset all the urb_setup[] indices by the actual position of the
1777 * setup regs, now that the location of the constants has been chosen.
1778 */
1779 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1780 if (inst->opcode == FS_OPCODE_LINTERP) {
1781 assert(inst->src[1].file == HW_REG);
1782 inst->src[1].fixed_hw_reg.nr += urb_start;
1783 }
1784
1785 if (inst->opcode == FS_OPCODE_CINTERP) {
1786 assert(inst->src[0].file == HW_REG);
1787 inst->src[0].fixed_hw_reg.nr += urb_start;
1788 }
1789 }
1790
1791 /* Each attribute is 4 setup channels, each of which is half a reg. */
1792 this->first_non_payload_grf =
1793 urb_start + prog_data->num_varying_inputs * 2;
1794 }
1795
1796 void
1797 fs_visitor::assign_vs_urb_setup()
1798 {
1799 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1800 int grf, count, slot, channel, attr;
1801
1802 assert(stage == MESA_SHADER_VERTEX);
1803 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1804 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1805 count++;
1806
1807 /* Each attribute is 4 regs. */
1808 this->first_non_payload_grf =
1809 payload.num_regs + prog_data->curb_read_length + count * 4;
1810
1811 unsigned vue_entries =
1812 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1813
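/* The URB entry size is allocated in 4-slot (64 byte) units and the VS URB
 * read length counts pairs of slots (32 bytes), hence the rounding and
 * divides below.
 */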
1814 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1815 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1816
1817 assert(vs_prog_data->base.urb_read_length <= 15);
1818
1819 /* Rewrite all ATTR file references to the hw grf that they land in. */
1820 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1821 for (int i = 0; i < inst->sources; i++) {
1822 if (inst->src[i].file == ATTR) {
1823
1824 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1825 slot = count - 1;
1826 } else {
1827 * Attributes come in as a contiguous block, ordered by their
1828 * gl_vert_attrib value. That means we can compute the slot
1829 * number for an attribute by masking out the enabled
1830 * attributes before it and counting the bits.
1831 */
1832 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1833 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1834 BITFIELD64_MASK(attr));
1835 }
1836
1837 channel = inst->src[i].reg_offset & 3;
1838
1839 grf = payload.num_regs +
1840 prog_data->curb_read_length +
1841 slot * 4 + channel;
1842
1843 inst->src[i].file = HW_REG;
1844 inst->src[i].fixed_hw_reg =
1845 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1846 }
1847 }
1848 }
1849 }
1850
1851 /**
1852 * Split large virtual GRFs into separate components if we can.
1853 *
1854 * This is mostly duplicated with what brw_fs_vector_splitting does,
1855 * but that's really conservative because it's afraid of doing
1856 * splitting that doesn't result in real progress after the rest of
1857 * the optimization phases, which would cause infinite looping in
1858 * optimization. We can do it once here, safely. This also has the
1859 * opportunity to split interpolated values, or maybe even uniforms,
1860 * which we don't have at the IR level.
1861 *
1862 * We want to split, because virtual GRFs are what we register
1863 * allocate and spill (due to contiguousness requirements for some
1864 * instructions), and they're what we naturally generate in the
1865 * codegen process, but most virtual GRFs don't actually need to be
1866 * contiguous sets of GRFs. If we split, we'll end up with reduced
1867 * live intervals and better dead code elimination and coalescing.
1868 */
1869 void
1870 fs_visitor::split_virtual_grfs()
1871 {
1872 int num_vars = this->alloc.count;
1873
1874 /* Count the total number of registers */
1875 int reg_count = 0;
1876 int vgrf_to_reg[num_vars];
1877 for (int i = 0; i < num_vars; i++) {
1878 vgrf_to_reg[i] = reg_count;
1879 reg_count += alloc.sizes[i];
1880 }
1881
1882 /* An array of "split points". For each register slot, this indicates
1883 * if this slot can be separated from the previous slot. Every time an
1884 * instruction uses multiple elements of a register (as a source or
1885 * destination), we mark the used slots as inseparable. Then we go
1886 * through and split the registers into the smallest pieces we can.
1887 */
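/* Illustrative example: a size-4 VGRF whose registers are only ever accessed
 * one at a time keeps split points at slots 1, 2 and 3 and is broken into
 * four size-1 VGRFs.  If some instruction reads or writes two registers
 * starting at slot 2, the split point at slot 3 is cleared, so slots 2 and 3
 * stay together and the result is pieces of sizes 1, 1 and 2.
 */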
1888 bool split_points[reg_count];
1889 memset(split_points, 0, sizeof(split_points));
1890
1891 /* Mark all used registers as fully splittable */
1892 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1893 if (inst->dst.file == GRF) {
1894 int reg = vgrf_to_reg[inst->dst.reg];
1895 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1896 split_points[reg + j] = true;
1897 }
1898
1899 for (int i = 0; i < inst->sources; i++) {
1900 if (inst->src[i].file == GRF) {
1901 int reg = vgrf_to_reg[inst->src[i].reg];
1902 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1903 split_points[reg + j] = true;
1904 }
1905 }
1906 }
1907
1908 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1909 if (inst->dst.file == GRF) {
1910 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1911 for (int j = 1; j < inst->regs_written; j++)
1912 split_points[reg + j] = false;
1913 }
1914 for (int i = 0; i < inst->sources; i++) {
1915 if (inst->src[i].file == GRF) {
1916 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1917 for (int j = 1; j < inst->regs_read(i); j++)
1918 split_points[reg + j] = false;
1919 }
1920 }
1921 }
1922
1923 int new_virtual_grf[reg_count];
1924 int new_reg_offset[reg_count];
1925
1926 int reg = 0;
1927 for (int i = 0; i < num_vars; i++) {
1928 /* The first one should always be 0 as a quick sanity check. */
1929 assert(split_points[reg] == false);
1930
1931 /* j = 0 case */
1932 new_reg_offset[reg] = 0;
1933 reg++;
1934 int offset = 1;
1935
1936 /* j > 0 case */
1937 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1938 /* If this is a split point, reset the offset to 0 and allocate a
1939 * new virtual GRF for the preceding `offset` registers.
1940 */
1941 if (split_points[reg]) {
1942 assert(offset <= MAX_VGRF_SIZE);
1943 int grf = alloc.allocate(offset);
1944 for (int k = reg - offset; k < reg; k++)
1945 new_virtual_grf[k] = grf;
1946 offset = 0;
1947 }
1948 new_reg_offset[reg] = offset;
1949 offset++;
1950 reg++;
1951 }
1952
1953 /* The last one gets the original register number */
1954 assert(offset <= MAX_VGRF_SIZE);
1955 alloc.sizes[i] = offset;
1956 for (int k = reg - offset; k < reg; k++)
1957 new_virtual_grf[k] = i;
1958 }
1959 assert(reg == reg_count);
1960
1961 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1962 if (inst->dst.file == GRF) {
1963 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1964 inst->dst.reg = new_virtual_grf[reg];
1965 inst->dst.reg_offset = new_reg_offset[reg];
1966 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1967 }
1968 for (int i = 0; i < inst->sources; i++) {
1969 if (inst->src[i].file == GRF) {
1970 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1971 inst->src[i].reg = new_virtual_grf[reg];
1972 inst->src[i].reg_offset = new_reg_offset[reg];
1973 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1974 }
1975 }
1976 }
1977 invalidate_live_intervals();
1978 }
1979
1980 /**
1981 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1982 *
1983 * During code generation, we create tons of temporary variables, many of
1984 * which get immediately killed and are never used again. Yet, in later
1985 * optimization and analysis passes, such as compute_live_intervals, we need
1986 * to loop over all the virtual GRFs. Compacting them can save a lot of
1987 * overhead.
1988 */
1989 bool
1990 fs_visitor::compact_virtual_grfs()
1991 {
1992 bool progress = false;
1993 int remap_table[this->alloc.count];
1994 memset(remap_table, -1, sizeof(remap_table));
1995
1996 /* Mark which virtual GRFs are used. */
1997 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1998 if (inst->dst.file == GRF)
1999 remap_table[inst->dst.reg] = 0;
2000
2001 for (int i = 0; i < inst->sources; i++) {
2002 if (inst->src[i].file == GRF)
2003 remap_table[inst->src[i].reg] = 0;
2004 }
2005 }
2006
2007 /* Compact the GRF arrays. */
2008 int new_index = 0;
2009 for (unsigned i = 0; i < this->alloc.count; i++) {
2010 if (remap_table[i] == -1) {
2011 /* We just found an unused register. This means that we are
2012 * actually going to compact something.
2013 */
2014 progress = true;
2015 } else {
2016 remap_table[i] = new_index;
2017 alloc.sizes[new_index] = alloc.sizes[i];
2018 invalidate_live_intervals();
2019 ++new_index;
2020 }
2021 }
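/* Illustrative example: if VGRFs 0, 2 and 3 are used but VGRF 1 is dead,
 * remap_table ends up as { 0, -1, 1, 2 } and alloc.count shrinks from 4 to
 * 3; the loop below then renumbers every GRF reference through remap_table.
 */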
2022
2023 this->alloc.count = new_index;
2024
2025 /* Patch all the instructions to use the newly renumbered registers */
2026 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2027 if (inst->dst.file == GRF)
2028 inst->dst.reg = remap_table[inst->dst.reg];
2029
2030 for (int i = 0; i < inst->sources; i++) {
2031 if (inst->src[i].file == GRF)
2032 inst->src[i].reg = remap_table[inst->src[i].reg];
2033 }
2034 }
2035
2036 /* Patch all the references to delta_xy, since they're used in register
2037 * allocation. If they're unused, switch them to BAD_FILE so we don't
2038 * think some random VGRF is delta_xy.
2039 */
2040 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2041 if (delta_xy[i].file == GRF) {
2042 if (remap_table[delta_xy[i].reg] != -1) {
2043 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2044 } else {
2045 delta_xy[i].file = BAD_FILE;
2046 }
2047 }
2048 }
2049
2050 return progress;
2051 }
2052
2053 /**
2054 * Implements array access of uniforms by inserting a
2055 * PULL_CONSTANT_LOAD instruction.
2056 *
2057 * Unlike temporary GRF array access (where we don't support it due to
2058 * the difficulty of doing relative addressing on instruction
2059 * destinations), we could potentially do array access of uniforms
2060 * that were loaded in GRF space as push constants. In real-world
2061 * usage we've seen, though, the arrays being used are always larger
2062 * than we could load as push constants, so just always move all
2063 * uniform array access out to a pull constant buffer.
2064 */
2065 void
2066 fs_visitor::move_uniform_array_access_to_pull_constants()
2067 {
2068 if (dispatch_width != 8)
2069 return;
2070
2071 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2072 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2073
2074 /* Walk through and find array access of uniforms. Put a copy of that
2075 * uniform in the pull constant buffer.
2076 *
2077 * Note that we don't move constant-indexed accesses to arrays. No
2078 * testing has been done of the performance impact of this choice.
2079 */
2080 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2081 for (int i = 0 ; i < inst->sources; i++) {
2082 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2083 continue;
2084
2085 int uniform = inst->src[i].reg;
2086
2087 /* If this array isn't already present in the pull constant buffer,
2088 * add it.
2089 */
2090 if (pull_constant_loc[uniform] == -1) {
2091 const gl_constant_value **values = &stage_prog_data->param[uniform];
2092
2093 assert(param_size[uniform]);
2094
2095 for (int j = 0; j < param_size[uniform]; j++) {
2096 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2097
2098 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2099 values[j];
2100 }
2101 }
2102 }
2103 }
2104 }
2105
2106 /**
2107 * Assign UNIFORM file registers to either push constants or pull constants.
2108 *
2109 * We allow a fragment shader to have more uniform components than the
2110 * GL-specified minimum value of the maximum (64).  If there are too many
2111 * of them, they would fill up all of register space, so this pass pushes
2112 * some of them out to the pull constant buffer and updates the program to
2113 * load them from there.
2114 */
2115 void
2116 fs_visitor::assign_constant_locations()
2117 {
2118 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2119 if (dispatch_width != 8)
2120 return;
2121
2122 /* Find which UNIFORM registers are still in use. */
2123 bool is_live[uniforms];
2124 for (unsigned int i = 0; i < uniforms; i++) {
2125 is_live[i] = false;
2126 }
2127
2128 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file != UNIFORM)
2131 continue;
2132
2133 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2134 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2135 is_live[constant_nr] = true;
2136 }
2137 }
2138
2139 /* Only allow 16 registers (128 uniform components) as push constants.
2140 *
2141 * Just demote the end of the list. We could probably do better
2142 * here, demoting things that are rarely used in the program first.
2143 *
2144 * If changing this value, note the limitation about total_regs in
2145 * brw_curbe.c.
2146 */
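/* Illustrative example: a shader with 200 live uniform components that have
 * not already been pulled keeps the first 128 of them as push constants and
 * demotes the remaining 72 to the pull constant buffer below.
 */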
2147 unsigned int max_push_components = 16 * 8;
2148 unsigned int num_push_constants = 0;
2149
2150 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2151
2152 for (unsigned int i = 0; i < uniforms; i++) {
2153 if (!is_live[i] || pull_constant_loc[i] != -1) {
2154 /* This UNIFORM register is either dead, or has already been demoted
2155 * to a pull const. Mark it as no longer living in the param[] array.
2156 */
2157 push_constant_loc[i] = -1;
2158 continue;
2159 }
2160
2161 if (num_push_constants < max_push_components) {
2162 /* Retain as a push constant. Record the location in the params[]
2163 * array.
2164 */
2165 push_constant_loc[i] = num_push_constants++;
2166 } else {
2167 /* Demote to a pull constant. */
2168 push_constant_loc[i] = -1;
2169
2170 int pull_index = stage_prog_data->nr_pull_params++;
2171 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2172 pull_constant_loc[i] = pull_index;
2173 }
2174 }
2175
2176 stage_prog_data->nr_params = num_push_constants;
2177
2178 /* Up until now, the param[] array has been indexed by reg + reg_offset
2179 * of UNIFORM registers. Condense it to only contain the uniforms we
2180 * chose to upload as push constants.
2181 */
2182 for (unsigned int i = 0; i < uniforms; i++) {
2183 int remapped = push_constant_loc[i];
2184
2185 if (remapped == -1)
2186 continue;
2187
2188 assert(remapped <= (int)i);
2189 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2190 }
2191 }
2192
2193 /**
2194 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2195 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2196 */
2197 void
2198 fs_visitor::demote_pull_constants()
2199 {
2200 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2201 for (int i = 0; i < inst->sources; i++) {
2202 if (inst->src[i].file != UNIFORM)
2203 continue;
2204
2205 int pull_index;
2206 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2207 if (location >= uniforms) /* Out of bounds access */
2208 pull_index = -1;
2209 else
2210 pull_index = pull_constant_loc[location];
2211
2212 if (pull_index == -1)
2213 continue;
2214
2215 /* Set up the annotation tracking for newly generated instructions. */
2216 base_ir = inst->ir;
2217 current_annotation = inst->annotation;
2218
2219 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2220 fs_reg dst = vgrf(glsl_type::float_type);
2221
2222 /* Generate a pull load into dst. */
2223 if (inst->src[i].reladdr) {
2224 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2225 surf_index,
2226 *inst->src[i].reladdr,
2227 pull_index);
2228 inst->insert_before(block, &list);
2229 inst->src[i].reladdr = NULL;
2230 } else {
2231 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2232 fs_inst *pull =
2233 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2234 dst, surf_index, offset);
2235 inst->insert_before(block, pull);
2236 inst->src[i].set_smear(pull_index & 3);
2237 }
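/* Illustrative example for the non-reladdr case above: pull_index == 6 gives
 * a byte offset of 24 & ~15 == 16 (the containing vec4) and a smear of
 * 6 & 3 == 2, selecting the third dword of the loaded register.
 */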
2238
2239 /* Rewrite the instruction to use the temporary VGRF. */
2240 inst->src[i].file = GRF;
2241 inst->src[i].reg = dst.reg;
2242 inst->src[i].reg_offset = 0;
2243 inst->src[i].width = dispatch_width;
2244 }
2245 }
2246 invalidate_live_intervals();
2247 }
2248
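/**
 * Perform simple algebraic simplifications on individual instructions, for
 * example (illustrative pseudo-IR):
 *
 *    mul dst, a, 1.0F     ->  mov dst, a
 *    mul dst, a, 0.0F     ->  mov dst, 0.0F
 *    add dst, a, 0.0F     ->  mov dst, a
 *    sel dst, a, a        ->  mov dst, a
 *
 * along with a handful of CMP/LRP/MAD/BROADCAST special cases below.
 */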
2249 bool
2250 fs_visitor::opt_algebraic()
2251 {
2252 bool progress = false;
2253
2254 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2255 switch (inst->opcode) {
2256 case BRW_OPCODE_MOV:
2257 if (inst->src[0].file != IMM)
2258 break;
2259
2260 if (inst->saturate) {
2261 if (inst->dst.type != inst->src[0].type)
2262 assert(!"unimplemented: saturate mixed types");
2263
2264 if (brw_saturate_immediate(inst->dst.type,
2265 &inst->src[0].fixed_hw_reg)) {
2266 inst->saturate = false;
2267 progress = true;
2268 }
2269 }
2270 break;
2271
2272 case BRW_OPCODE_MUL:
2273 if (inst->src[1].file != IMM)
2274 continue;
2275
2276 /* a * 1.0 = a */
2277 if (inst->src[1].is_one()) {
2278 inst->opcode = BRW_OPCODE_MOV;
2279 inst->src[1] = reg_undef;
2280 progress = true;
2281 break;
2282 }
2283
2284 /* a * -1.0 = -a */
2285 if (inst->src[1].is_negative_one()) {
2286 inst->opcode = BRW_OPCODE_MOV;
2287 inst->src[0].negate = !inst->src[0].negate;
2288 inst->src[1] = reg_undef;
2289 progress = true;
2290 break;
2291 }
2292
2293 /* a * 0.0 = 0.0 */
2294 if (inst->src[1].is_zero()) {
2295 inst->opcode = BRW_OPCODE_MOV;
2296 inst->src[0] = inst->src[1];
2297 inst->src[1] = reg_undef;
2298 progress = true;
2299 break;
2300 }
2301
2302 if (inst->src[0].file == IMM) {
2303 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2304 inst->opcode = BRW_OPCODE_MOV;
2305 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2306 inst->src[1] = reg_undef;
2307 progress = true;
2308 break;
2309 }
2310 break;
2311 case BRW_OPCODE_ADD:
2312 if (inst->src[1].file != IMM)
2313 continue;
2314
2315 /* a + 0.0 = a */
2316 if (inst->src[1].is_zero()) {
2317 inst->opcode = BRW_OPCODE_MOV;
2318 inst->src[1] = reg_undef;
2319 progress = true;
2320 break;
2321 }
2322
2323 if (inst->src[0].file == IMM) {
2324 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2325 inst->opcode = BRW_OPCODE_MOV;
2326 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2327 inst->src[1] = reg_undef;
2328 progress = true;
2329 break;
2330 }
2331 break;
2332 case BRW_OPCODE_OR:
2333 if (inst->src[0].equals(inst->src[1])) {
2334 inst->opcode = BRW_OPCODE_MOV;
2335 inst->src[1] = reg_undef;
2336 progress = true;
2337 break;
2338 }
2339 break;
2340 case BRW_OPCODE_LRP:
2341 if (inst->src[1].equals(inst->src[2])) {
2342 inst->opcode = BRW_OPCODE_MOV;
2343 inst->src[0] = inst->src[1];
2344 inst->src[1] = reg_undef;
2345 inst->src[2] = reg_undef;
2346 progress = true;
2347 break;
2348 }
2349 break;
2350 case BRW_OPCODE_CMP:
2351 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2352 inst->src[0].abs &&
2353 inst->src[0].negate &&
2354 inst->src[1].is_zero()) {
2355 inst->src[0].abs = false;
2356 inst->src[0].negate = false;
2357 inst->conditional_mod = BRW_CONDITIONAL_Z;
2358 progress = true;
2359 break;
2360 }
2361 break;
2362 case BRW_OPCODE_SEL:
2363 if (inst->src[0].equals(inst->src[1])) {
2364 inst->opcode = BRW_OPCODE_MOV;
2365 inst->src[1] = reg_undef;
2366 inst->predicate = BRW_PREDICATE_NONE;
2367 inst->predicate_inverse = false;
2368 progress = true;
2369 } else if (inst->saturate && inst->src[1].file == IMM) {
2370 switch (inst->conditional_mod) {
2371 case BRW_CONDITIONAL_LE:
2372 case BRW_CONDITIONAL_L:
2373 switch (inst->src[1].type) {
2374 case BRW_REGISTER_TYPE_F:
2375 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[1] = reg_undef;
2378 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2379 progress = true;
2380 }
2381 break;
2382 default:
2383 break;
2384 }
2385 break;
2386 case BRW_CONDITIONAL_GE:
2387 case BRW_CONDITIONAL_G:
2388 switch (inst->src[1].type) {
2389 case BRW_REGISTER_TYPE_F:
2390 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2391 inst->opcode = BRW_OPCODE_MOV;
2392 inst->src[1] = reg_undef;
2393 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2394 progress = true;
2395 }
2396 break;
2397 default:
2398 break;
2399 }
2400 default:
2401 break;
2402 }
2403 }
2404 break;
2405 case BRW_OPCODE_MAD:
2406 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2407 inst->opcode = BRW_OPCODE_MOV;
2408 inst->src[1] = reg_undef;
2409 inst->src[2] = reg_undef;
2410 progress = true;
2411 } else if (inst->src[0].is_zero()) {
2412 inst->opcode = BRW_OPCODE_MUL;
2413 inst->src[0] = inst->src[2];
2414 inst->src[2] = reg_undef;
2415 progress = true;
2416 } else if (inst->src[1].is_one()) {
2417 inst->opcode = BRW_OPCODE_ADD;
2418 inst->src[1] = inst->src[2];
2419 inst->src[2] = reg_undef;
2420 progress = true;
2421 } else if (inst->src[2].is_one()) {
2422 inst->opcode = BRW_OPCODE_ADD;
2423 inst->src[2] = reg_undef;
2424 progress = true;
2425 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2426 inst->opcode = BRW_OPCODE_ADD;
2427 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2428 inst->src[2] = reg_undef;
2429 progress = true;
2430 }
2431 break;
2432 case SHADER_OPCODE_RCP: {
2433 fs_inst *prev = (fs_inst *)inst->prev;
2434 if (prev->opcode == SHADER_OPCODE_SQRT) {
2435 if (inst->src[0].equals(prev->dst)) {
2436 inst->opcode = SHADER_OPCODE_RSQ;
2437 inst->src[0] = prev->src[0];
2438 progress = true;
2439 }
2440 }
2441 break;
2442 }
2443 case SHADER_OPCODE_BROADCAST:
2444 if (is_uniform(inst->src[0])) {
2445 inst->opcode = BRW_OPCODE_MOV;
2446 inst->sources = 1;
2447 inst->force_writemask_all = true;
2448 progress = true;
2449 } else if (inst->src[1].file == IMM) {
2450 inst->opcode = BRW_OPCODE_MOV;
2451 inst->src[0] = component(inst->src[0],
2452 inst->src[1].fixed_hw_reg.dw1.ud);
2453 inst->sources = 1;
2454 inst->force_writemask_all = true;
2455 progress = true;
2456 }
2457 break;
2458
2459 default:
2460 break;
2461 }
2462
2463 /* If the simplification left an immediate in src[0] of a commutative operation, swap it into src[1]; immediates are only allowed as the second source. */
2464 if (progress && inst->is_commutative()) {
2465 if (inst->src[0].file == IMM) {
2466 fs_reg tmp = inst->src[1];
2467 inst->src[1] = inst->src[0];
2468 inst->src[0] = tmp;
2469 }
2470 }
2471 }
2472 return progress;
2473 }
2474
2475 /**
2476 * Optimize sample messages that have constant zero values for the trailing
2477 * texture coordinates. We can just reduce the message length for these
2478 * instructions instead of reserving a register for it. Trailing parameters
2479 * that aren't sent default to zero anyway. This will cause the dead code
2480 * eliminator to remove the MOV instruction that would otherwise be emitted to
2481 * set up the zero value.
2482 */
2483 bool
2484 fs_visitor::opt_zero_samples()
2485 {
2486 /* Gen4 infers the texturing opcode based on the message length so we can't
2487 * change it.
2488 */
2489 if (devinfo->gen < 5)
2490 return false;
2491
2492 bool progress = false;
2493
2494 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2495 if (!inst->is_tex())
2496 continue;
2497
2498 fs_inst *load_payload = (fs_inst *) inst->prev;
2499
2500 if (load_payload->is_head_sentinel() ||
2501 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2502 continue;
2503
2504 /* We don't want to remove the message header or the first parameter.
2505 * Removing the first parameter is not allowed, see the Haswell PRM
2506 * volume 7, page 149:
2507 *
2508 * "Parameter 0 is required except for the sampleinfo message, which
2509 * has no parameter 0"
2510 */
2511 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2512 load_payload->src[(inst->mlen - inst->header_size) /
2513 (dispatch_width / 8) +
2514 inst->header_size - 1].is_zero()) {
2515 inst->mlen -= dispatch_width / 8;
2516 progress = true;
2517 }
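/* Illustrative example: in SIMD8 each parameter occupies one register, so a
 * sample message with header_size == 1 and mlen == 5 whose last two
 * parameters are constant zero (and the ones before them are not) is trimmed
 * to mlen == 3; the header and parameter 0 are always kept.
 */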
2518 }
2519
2520 if (progress)
2521 invalidate_live_intervals();
2522
2523 return progress;
2524 }
2525
2526 /**
2527 * Optimize sample messages which are followed by the final RT write.
2528 *
2529 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2530 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2531 * final texturing results copied to the framebuffer write payload and modify
2532 * them to write to the framebuffer directly.
2533 */
2534 bool
2535 fs_visitor::opt_sampler_eot()
2536 {
2537 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2538
2539 if (stage != MESA_SHADER_FRAGMENT)
2540 return false;
2541
2542 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2543 return false;
2544
2545 /* FINISHME: It should be possible to implement this optimization when there
2546 * are multiple drawbuffers.
2547 */
2548 if (key->nr_color_regions != 1)
2549 return false;
2550
2551 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2552 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2553 assert(fb_write->eot);
2554 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2555
2556 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2557
2558 /* There wasn't one; nothing to do. */
2559 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2560 return false;
2561
2562 /* This optimization doesn't seem to work for textureGather for some
2563 * reason. I can't find any documentation or known workarounds to indicate
2564 * that this is expected, but considering that it is probably pretty
2565 * unlikely that a shader would directly write out the results from
2566 * textureGather we might as well just disable it.
2567 */
2568 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2569 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2570 return false;
2571
2572 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2573 * It's very likely to be the previous instruction.
2574 */
2575 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2576 if (load_payload->is_head_sentinel() ||
2577 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2578 return false;
2579
2580 assert(!tex_inst->eot); /* We can't get here twice */
2581 assert((tex_inst->offset & (0xff << 24)) == 0);
2582
2583 tex_inst->offset |= fb_write->target << 24;
2584 tex_inst->eot = true;
2585 tex_inst->dst = reg_null_ud;
2586 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2587
2588 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2589 * to create a new LOAD_PAYLOAD command with the same sources and a space
2590 * saved for the header. Using a new destination register not only makes sure
2591 * we have enough space, but it will make sure the dead code eliminator kills
2592 * the instruction that this will replace.
2593 */
2594 if (tex_inst->header_size != 0)
2595 return true;
2596
2597 fs_reg send_header = vgrf(load_payload->sources + 1);
2598 fs_reg *new_sources =
2599 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2600
2601 new_sources[0] = fs_reg();
2602 for (int i = 0; i < load_payload->sources; i++)
2603 new_sources[i+1] = load_payload->src[i];
2604
2605 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2606 * requires a lot of information about the sources to figure out how many
2607 * registers need to be used.  Given this stage in our
2608 * optimization, we may not have the appropriate GRFs required by
2609 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2610 * manually emit the instruction.
2611 */
2612 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2613 load_payload->exec_size,
2614 send_header,
2615 new_sources,
2616 load_payload->sources + 1);
2617
2618 new_load_payload->regs_written = load_payload->regs_written + 1;
2619 new_load_payload->header_size = 1;
2620 tex_inst->mlen++;
2621 tex_inst->header_size = 1;
2622 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2623 tex_inst->src[0] = send_header;
2624
2625 return true;
2626 }
2627
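/**
 * Rename each complete, top-level (outside of control flow) redefinition of a
 * virtual GRF to a freshly allocated VGRF and rewrite its subsequent uses to
 * match, so that unrelated reuses of the same register number get independent
 * live ranges.
 */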
2628 bool
2629 fs_visitor::opt_register_renaming()
2630 {
2631 bool progress = false;
2632 int depth = 0;
2633
2634 int remap[alloc.count];
2635 memset(remap, -1, sizeof(int) * alloc.count);
2636
2637 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2638 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2639 depth++;
2640 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2641 inst->opcode == BRW_OPCODE_WHILE) {
2642 depth--;
2643 }
2644
2645 /* Rewrite instruction sources. */
2646 for (int i = 0; i < inst->sources; i++) {
2647 if (inst->src[i].file == GRF &&
2648 remap[inst->src[i].reg] != -1 &&
2649 remap[inst->src[i].reg] != inst->src[i].reg) {
2650 inst->src[i].reg = remap[inst->src[i].reg];
2651 progress = true;
2652 }
2653 }
2654
2655 const int dst = inst->dst.reg;
2656
2657 if (depth == 0 &&
2658 inst->dst.file == GRF &&
2659 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2660 !inst->is_partial_write()) {
2661 if (remap[dst] == -1) {
2662 remap[dst] = dst;
2663 } else {
2664 remap[dst] = alloc.allocate(inst->dst.width / 8);
2665 inst->dst.reg = remap[dst];
2666 progress = true;
2667 }
2668 } else if (inst->dst.file == GRF &&
2669 remap[dst] != -1 &&
2670 remap[dst] != dst) {
2671 inst->dst.reg = remap[dst];
2672 progress = true;
2673 }
2674 }
2675
2676 if (progress) {
2677 invalidate_live_intervals();
2678
2679 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2680 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2681 delta_xy[i].reg = remap[delta_xy[i].reg];
2682 }
2683 }
2684 }
2685
2686 return progress;
2687 }
2688
2689 /**
2690 * Remove redundant or useless discard jumps.
2691 *
2692 * For example, we can eliminate jumps in the following sequence:
2693 *
2694 * discard-jump (redundant with the next jump)
2695 * discard-jump (useless; jumps to the next instruction)
2696 * placeholder-halt
2697 */
2698 bool
2699 fs_visitor::opt_redundant_discard_jumps()
2700 {
2701 bool progress = false;
2702
2703 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2704
2705 fs_inst *placeholder_halt = NULL;
2706 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2707 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2708 placeholder_halt = inst;
2709 break;
2710 }
2711 }
2712
2713 if (!placeholder_halt)
2714 return false;
2715
2716 /* Delete any HALTs immediately before the placeholder halt. */
2717 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2718 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2719 prev = (fs_inst *) placeholder_halt->prev) {
2720 prev->remove(last_bblock);
2721 progress = true;
2722 }
2723
2724 if (progress)
2725 invalidate_live_intervals();
2726
2727 return progress;
2728 }
2729
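/**
 * Rewrite MOVs from a GRF into an MRF so that the instruction which computed
 * the GRF value writes the MRF directly, e.g. (illustrative):
 *
 *    add vgrf4, vgrf2, vgrf3        add m2, vgrf2, vgrf3
 *    mov m2, vgrf4             ->    (the MOV is removed)
 *
 * provided nothing reads the GRF afterwards.  Only relevant on Gen4-6, where
 * message payloads live in MRFs.
 */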
2730 bool
2731 fs_visitor::compute_to_mrf()
2732 {
2733 bool progress = false;
2734 int next_ip = 0;
2735
2736 /* No MRFs on Gen >= 7. */
2737 if (devinfo->gen >= 7)
2738 return false;
2739
2740 calculate_live_intervals();
2741
2742 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2743 int ip = next_ip;
2744 next_ip++;
2745
2746 if (inst->opcode != BRW_OPCODE_MOV ||
2747 inst->is_partial_write() ||
2748 inst->dst.file != MRF || inst->src[0].file != GRF ||
2749 inst->dst.type != inst->src[0].type ||
2750 inst->src[0].abs || inst->src[0].negate ||
2751 !inst->src[0].is_contiguous() ||
2752 inst->src[0].subreg_offset)
2753 continue;
2754
2755 /* Work out which hardware MRF registers are written by this
2756 * instruction.
2757 */
2758 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2759 int mrf_high;
2760 if (inst->dst.reg & BRW_MRF_COMPR4) {
2761 mrf_high = mrf_low + 4;
2762 } else if (inst->exec_size == 16) {
2763 mrf_high = mrf_low + 1;
2764 } else {
2765 mrf_high = mrf_low;
2766 }
2767
2768 /* Can't compute-to-MRF this GRF if someone else was going to
2769 * read it later.
2770 */
2771 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2772 continue;
2773
2774 /* Found a move of a GRF to a MRF. Let's see if we can go
2775 * rewrite the thing that made this GRF to write into the MRF.
2776 */
2777 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2778 if (scan_inst->dst.file == GRF &&
2779 scan_inst->dst.reg == inst->src[0].reg) {
2780 /* Found the last thing to write our reg we want to turn
2781 * into a compute-to-MRF.
2782 */
2783
2784 /* If this one instruction didn't populate all the
2785 * channels, bail. We might be able to rewrite everything
2786 * that writes that reg, but it would require smarter
2787 * tracking to delay the rewriting until complete success.
2788 */
2789 if (scan_inst->is_partial_write())
2790 break;
2791
2792 /* Instructions that write more than one register would require us to
2793 * understand coalescing out more than one MOV at a time.
2794 */
2795 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2796 break;
2797
2798 /* SEND instructions can't have MRF as a destination. */
2799 if (scan_inst->mlen)
2800 break;
2801
2802 if (devinfo->gen == 6) {
2803 /* gen6 math instructions must have the destination be
2804 * GRF, so no compute-to-MRF for them.
2805 */
2806 if (scan_inst->is_math()) {
2807 break;
2808 }
2809 }
2810
2811 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2812 /* Found the creator of our MRF's source value. */
2813 scan_inst->dst.file = MRF;
2814 scan_inst->dst.reg = inst->dst.reg;
2815 scan_inst->saturate |= inst->saturate;
2816 inst->remove(block);
2817 progress = true;
2818 }
2819 break;
2820 }
2821
2822 /* We don't handle control flow here. Most computation of
2823 * values that end up in MRFs happens shortly before the MRF
2824 * write anyway.
2825 */
2826 if (block->start() == scan_inst)
2827 break;
2828
2829 /* You can't read from an MRF, so if someone else reads our
2830 * MRF's source GRF that we wanted to rewrite, that stops us.
2831 */
2832 bool interfered = false;
2833 for (int i = 0; i < scan_inst->sources; i++) {
2834 if (scan_inst->src[i].file == GRF &&
2835 scan_inst->src[i].reg == inst->src[0].reg &&
2836 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2837 interfered = true;
2838 }
2839 }
2840 if (interfered)
2841 break;
2842
2843 if (scan_inst->dst.file == MRF) {
2844 /* If somebody else writes our MRF here, we can't
2845 * compute-to-MRF before that.
2846 */
2847 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2848 int scan_mrf_high;
2849
2850 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2851 scan_mrf_high = scan_mrf_low + 4;
2852 } else if (scan_inst->exec_size == 16) {
2853 scan_mrf_high = scan_mrf_low + 1;
2854 } else {
2855 scan_mrf_high = scan_mrf_low;
2856 }
2857
2858 if (mrf_low == scan_mrf_low ||
2859 mrf_low == scan_mrf_high ||
2860 mrf_high == scan_mrf_low ||
2861 mrf_high == scan_mrf_high) {
2862 break;
2863 }
2864 }
2865
2866 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2867 /* Found a SEND instruction, which means that there are
2868 * live values in MRFs from base_mrf to base_mrf +
2869 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2870 * above it.
2871 */
2872 if (mrf_low >= scan_inst->base_mrf &&
2873 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2874 break;
2875 }
2876 if (mrf_high >= scan_inst->base_mrf &&
2877 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2878 break;
2879 }
2880 }
2881 }
2882 }
2883
2884 if (progress)
2885 invalidate_live_intervals();
2886
2887 return progress;
2888 }
2889
2890 /**
2891 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2892 * flow. We could probably do better here with some form of divergence
2893 * analysis.
2894 */
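/* Outside of control flow (and before any discard), the pass below assumes
 * all dispatched channels are still enabled, so FIND_LIVE_CHANNEL trivially
 * resolves to channel 0 and becomes an immediate move (illustrative):
 *
 *    find_live_channel dst   ->   mov dst, 0d
 */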
2895 bool
2896 fs_visitor::eliminate_find_live_channel()
2897 {
2898 bool progress = false;
2899 unsigned depth = 0;
2900
2901 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2902 switch (inst->opcode) {
2903 case BRW_OPCODE_IF:
2904 case BRW_OPCODE_DO:
2905 depth++;
2906 break;
2907
2908 case BRW_OPCODE_ENDIF:
2909 case BRW_OPCODE_WHILE:
2910 depth--;
2911 break;
2912
2913 case FS_OPCODE_DISCARD_JUMP:
2914 /* This can potentially make control flow non-uniform until the end
2915 * of the program.
2916 */
2917 return progress;
2918
2919 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2920 if (depth == 0) {
2921 inst->opcode = BRW_OPCODE_MOV;
2922 inst->src[0] = fs_reg(0);
2923 inst->sources = 1;
2924 inst->force_writemask_all = true;
2925 progress = true;
2926 }
2927 break;
2928
2929 default:
2930 break;
2931 }
2932 }
2933
2934 return progress;
2935 }
2936
2937 /**
2938 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2939 * instructions to FS_OPCODE_REP_FB_WRITE.
2940 */
2941 void
2942 fs_visitor::emit_repclear_shader()
2943 {
2944 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2945 int base_mrf = 1;
2946 int color_mrf = base_mrf + 2;
2947
2948 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2949 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2950 mov->force_writemask_all = true;
2951
2952 fs_inst *write;
2953 if (key->nr_color_regions == 1) {
2954 write = emit(FS_OPCODE_REP_FB_WRITE);
2955 write->saturate = key->clamp_fragment_color;
2956 write->base_mrf = color_mrf;
2957 write->target = 0;
2958 write->header_size = 0;
2959 write->mlen = 1;
2960 } else {
2961 assume(key->nr_color_regions > 0);
2962 for (int i = 0; i < key->nr_color_regions; ++i) {
2963 write = emit(FS_OPCODE_REP_FB_WRITE);
2964 write->saturate = key->clamp_fragment_color;
2965 write->base_mrf = base_mrf;
2966 write->target = i;
2967 write->header_size = 2;
2968 write->mlen = 3;
2969 }
2970 }
2971 write->eot = true;
2972
2973 calculate_cfg();
2974
2975 assign_constant_locations();
2976 assign_curb_setup();
2977
2978 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2979 assert(mov->src[0].file == HW_REG);
2980 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2981 }
2982
2983 /**
2984 * Walks through basic blocks, looking for repeated MRF writes and
2985 * removing the later ones.
2986 */
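/* Illustrative example: given two identical "mov m3, vgrf7" instructions with
 * no intervening write to m3 or vgrf7 and no control flow between them, the
 * second MOV is removed.
 */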
2987 bool
2988 fs_visitor::remove_duplicate_mrf_writes()
2989 {
2990 fs_inst *last_mrf_move[16];
2991 bool progress = false;
2992
2993 /* Bail on SIMD16: this pass would need to update the MRF tracking for compressed instructions. */
2994 if (dispatch_width == 16)
2995 return false;
2996
2997 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2998
2999 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3000 if (inst->is_control_flow()) {
3001 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3002 }
3003
3004 if (inst->opcode == BRW_OPCODE_MOV &&
3005 inst->dst.file == MRF) {
3006 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3007 if (prev_inst && inst->equals(prev_inst)) {
3008 inst->remove(block);
3009 progress = true;
3010 continue;
3011 }
3012 }
3013
3014 /* Clear out the last-write records for MRFs that were overwritten. */
3015 if (inst->dst.file == MRF) {
3016 last_mrf_move[inst->dst.reg] = NULL;
3017 }
3018
3019 if (inst->mlen > 0 && inst->base_mrf != -1) {
3020 /* Found a SEND instruction, which will include two or fewer
3021 * implied MRF writes. We could do better here.
3022 */
3023 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3024 last_mrf_move[inst->base_mrf + i] = NULL;
3025 }
3026 }
3027
3028 /* Clear out any MRF move records whose sources got overwritten. */
3029 if (inst->dst.file == GRF) {
3030 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3031 if (last_mrf_move[i] &&
3032 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3033 last_mrf_move[i] = NULL;
3034 }
3035 }
3036 }
3037
3038 if (inst->opcode == BRW_OPCODE_MOV &&
3039 inst->dst.file == MRF &&
3040 inst->src[0].file == GRF &&
3041 !inst->is_partial_write()) {
3042 last_mrf_move[inst->dst.reg] = inst;
3043 }
3044 }
3045
3046 if (progress)
3047 invalidate_live_intervals();
3048
3049 return progress;
3050 }
3051
3052 static void
3053 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3054 {
3055 /* Clear the flag for registers that actually got read (as expected). */
3056 for (int i = 0; i < inst->sources; i++) {
3057 int grf;
3058 if (inst->src[i].file == GRF) {
3059 grf = inst->src[i].reg;
3060 } else if (inst->src[i].file == HW_REG &&
3061 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3062 grf = inst->src[i].fixed_hw_reg.nr;
3063 } else {
3064 continue;
3065 }
3066
3067 if (grf >= first_grf &&
3068 grf < first_grf + grf_len) {
3069 deps[grf - first_grf] = false;
3070 if (inst->exec_size == 16)
3071 deps[grf - first_grf + 1] = false;
3072 }
3073 }
3074 }
3075
3076 /**
3077 * Implements this workaround for the original 965:
3078 *
3079 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3080 * check for post destination dependencies on this instruction, software
3081 * must ensure that there is no destination hazard for the case of ‘write
3082 * followed by a posted write’ shown in the following example.
3083 *
3084 * 1. mov r3 0
3085 * 2. send r3.xy <rest of send instruction>
3086 * 3. mov r2 r3
3087 *
3088 * Due to no post-destination dependency check on the ‘send’, the above
3089 * code sequence could have two instructions (1 and 2) in flight at the
3090 * same time that both consider ‘r3’ as the target of their final writes.
3091 */
3092 void
3093 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3094 fs_inst *inst)
3095 {
3096 int write_len = inst->regs_written;
3097 int first_write_grf = inst->dst.reg;
3098 bool needs_dep[BRW_MAX_MRF];
3099 assert(write_len < (int)sizeof(needs_dep) - 1);
3100
3101 memset(needs_dep, false, sizeof(needs_dep));
3102 memset(needs_dep, true, write_len);
3103
3104 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3105
3106 /* Walk backwards looking for writes to registers we're writing which
3107 * aren't read since being written. If we hit the start of the program,
3108 * we assume that there are no outstanding dependencies on entry to the
3109 * program.
3110 */
3111 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3112 /* If we hit control flow, assume that there *are* outstanding
3113 * dependencies, and force their cleanup before our instruction.
3114 */
3115 if (block->start() == scan_inst) {
3116 for (int i = 0; i < write_len; i++) {
3117 if (needs_dep[i]) {
3118 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3119 }
3120 }
3121 return;
3122 }
3123
3124 /* We insert our reads as late as possible on the assumption that any
3125 * instruction but a MOV that might have left us an outstanding
3126 * dependency has more latency than a MOV.
3127 */
3128 if (scan_inst->dst.file == GRF) {
3129 for (int i = 0; i < scan_inst->regs_written; i++) {
3130 int reg = scan_inst->dst.reg + i;
3131
3132 if (reg >= first_write_grf &&
3133 reg < first_write_grf + write_len &&
3134 needs_dep[reg - first_write_grf]) {
3135 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3136 needs_dep[reg - first_write_grf] = false;
3137 if (scan_inst->exec_size == 16)
3138 needs_dep[reg - first_write_grf + 1] = false;
3139 }
3140 }
3141 }
3142
3143 /* Clear the flag for registers that actually got read (as expected). */
3144 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3145
3146 /* Continue the loop only if we haven't resolved all the dependencies */
3147 int i;
3148 for (i = 0; i < write_len; i++) {
3149 if (needs_dep[i])
3150 break;
3151 }
3152 if (i == write_len)
3153 return;
3154 }
3155 }
3156
3157 /**
3158 * Implements this workaround for the original 965:
3159 *
3160 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3161 * used as a destination register until after it has been sourced by an
3162 * instruction with a different destination register."
3163 */
3164 void
3165 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3166 {
3167 int write_len = inst->regs_written;
3168 int first_write_grf = inst->dst.reg;
3169 bool needs_dep[BRW_MAX_MRF];
3170 assert(write_len < (int)sizeof(needs_dep) - 1);
3171
3172 memset(needs_dep, false, sizeof(needs_dep));
3173 memset(needs_dep, true, write_len);
3174 /* Walk forwards looking for writes to registers we're writing which aren't
3175 * read before being written.
3176 */
3177 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3178 /* If we hit control flow, force resolve all remaining dependencies. */
3179 if (block->end() == scan_inst) {
3180 for (int i = 0; i < write_len; i++) {
3181 if (needs_dep[i])
3182 scan_inst->insert_before(block,
3183 DEP_RESOLVE_MOV(first_write_grf + i));
3184 }
3185 return;
3186 }
3187
3188 /* Clear the flag for registers that actually got read (as expected). */
3189 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3190
3191 /* We insert our reads as late as possible since they're reading the
3192 * result of a SEND, which has massive latency.
3193 */
3194 if (scan_inst->dst.file == GRF &&
3195 scan_inst->dst.reg >= first_write_grf &&
3196 scan_inst->dst.reg < first_write_grf + write_len &&
3197 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3198 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3199 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3200 }
3201
3202 /* Continue the loop only if we haven't resolved all the dependencies */
3203 int i;
3204 for (i = 0; i < write_len; i++) {
3205 if (needs_dep[i])
3206 break;
3207 }
3208 if (i == write_len)
3209 return;
3210 }
3211 }
3212
3213 void
3214 fs_visitor::insert_gen4_send_dependency_workarounds()
3215 {
3216 if (devinfo->gen != 4 || devinfo->is_g4x)
3217 return;
3218
3219 bool progress = false;
3220
3221 /* Note that we're done with register allocation, so GRF fs_regs always
3222 * have a .reg_offset of 0.
3223 */
3224
3225 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3226 if (inst->mlen != 0 && inst->dst.file == GRF) {
3227 insert_gen4_pre_send_dependency_workarounds(block, inst);
3228 insert_gen4_post_send_dependency_workarounds(block, inst);
3229 progress = true;
3230 }
3231 }
3232
3233 if (progress)
3234 invalidate_live_intervals();
3235 }
3236
3237 /**
3238 * Turns the generic expression-style uniform pull constant load instruction
3239 * into a hardware-specific series of instructions for loading a pull
3240 * constant.
3241 *
3242 * The expression style allows the CSE pass before this to optimize out
3243 * repeated loads from the same offset, and gives the pre-register-allocation
3244 * scheduling full flexibility, while the conversion to native instructions
3245 * allows the post-register-allocation scheduler the best information
3246 * possible.
3247 *
3248 * Note that execution masking for setting up pull constant loads is special:
3249 * the channels that need to be written are unrelated to the current execution
3250 * mask, since a later instruction will use one of the result channels as a
3251 * source operand for all 8 or 16 of its channels.
3252 */
3253 void
3254 fs_visitor::lower_uniform_pull_constant_loads()
3255 {
3256 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3257 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3258 continue;
3259
3260 if (devinfo->gen >= 7) {
3261 /* The offset arg before was a vec4-aligned byte offset. We need to
3262 * turn it into a dword offset.
3263 */
3264 fs_reg const_offset_reg = inst->src[1];
3265 assert(const_offset_reg.file == IMM &&
3266 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3267 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3268 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3269
3270 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3271 * Reserve space for the register.
3272 */
3273 if (devinfo->gen >= 9) {
3274 payload.reg_offset++;
3275 alloc.sizes[payload.reg] = 2;
3276 }
3277
3278 /* This is actually going to be a MOV, but since only the first dword
3279 * is accessed, we have a special opcode to do just that one. Note
3280 * that this needs to be an operation that will be considered a def
3281 * by live variable analysis, or register allocation will explode.
3282 */
3283 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3284 8, payload, const_offset_reg);
3285 setup->force_writemask_all = true;
3286
3287 setup->ir = inst->ir;
3288 setup->annotation = inst->annotation;
3289 inst->insert_before(block, setup);
3290
3291 /* Similarly, this will only populate the first 4 channels of the
3292 * result register (since we only use smear values from 0-3), but we
3293 * don't tell the optimizer.
3294 */
3295 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3296 inst->src[1] = payload;
3297
3298 invalidate_live_intervals();
3299 } else {
3300 /* Before register allocation, we didn't tell the scheduler about the
3301 * MRF we use. We know it's safe to use this MRF because nothing
3302 * else does except for register spill/unspill, which generates and
3303 * uses its MRF within a single IR instruction.
3304 */
3305 inst->base_mrf = 14;
3306 inst->mlen = 1;
3307 }
3308 }
3309 }
3310
3311 bool
3312 fs_visitor::lower_load_payload()
3313 {
3314 bool progress = false;
3315
3316 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3317 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3318 continue;
3319
3320 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3321 assert(inst->saturate == false);
3322
3323 fs_reg dst = inst->dst;
3324
3325 /* Get rid of COMPR4. We'll add it back in if we need it */
3326 if (dst.file == MRF)
3327 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3328
3329 dst.width = 8;
3330 for (uint8_t i = 0; i < inst->header_size; i++) {
3331 if (inst->src[i].file != BAD_FILE) {
3332 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3333 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3334 mov_src.width = 8;
3335 fs_inst *mov = MOV(mov_dst, mov_src);
3336 mov->force_writemask_all = true;
3337 inst->insert_before(block, mov);
3338 }
3339 dst = offset(dst, 1);
3340 }
3341
3342 dst.width = inst->exec_size;
3343 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3344 inst->exec_size > 8) {
3345 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3346 * a straightforward copy. Instead, the result of the
3347 * LOAD_PAYLOAD is treated as interleaved and the first four
3348 * non-header sources are unpacked as:
3349 *
3350 * m + 0: r0
3351 * m + 1: g0
3352 * m + 2: b0
3353 * m + 3: a0
3354 * m + 4: r1
3355 * m + 5: g1
3356 * m + 6: b1
3357 * m + 7: a1
3358 *
3359 * This is used for gen <= 5 fb writes.
3360 */
3361 assert(inst->exec_size == 16);
3362 assert(inst->header_size + 4 <= inst->sources);
3363 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3364 if (inst->src[i].file != BAD_FILE) {
3365 if (devinfo->has_compr4) {
3366 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3367 compr4_dst.reg |= BRW_MRF_COMPR4;
3368
3369 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3370 mov->force_writemask_all = inst->force_writemask_all;
3371 inst->insert_before(block, mov);
3372 } else {
3373 /* Platform doesn't have COMPR4. We have to fake it */
3374 fs_reg mov_dst = retype(dst, inst->src[i].type);
3375 mov_dst.width = 8;
3376
3377 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3378 mov->force_writemask_all = inst->force_writemask_all;
3379 inst->insert_before(block, mov);
3380
3381 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3382 mov->force_writemask_all = inst->force_writemask_all;
3383 mov->force_sechalf = true;
3384 inst->insert_before(block, mov);
3385 }
3386 }
3387
3388 dst.reg++;
3389 }
3390
3391 /* The loop above only ever incremented us through the first set
3392 * of 4 registers. However, thanks to the magic of COMPR4, we
3393 * actually wrote to the first 8 registers, so we need to take
3394 * that into account now.
3395 */
3396 dst.reg += 4;
3397
3398 /* The COMPR4 code took care of the first 4 sources. We'll let
3399 * the regular path handle any remaining sources. Yes, we are
3400 * modifying the instruction but we're about to delete it so
3401 * this really doesn't hurt anything.
3402 */
3403 inst->header_size += 4;
3404 }
3405
3406 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3407 if (inst->src[i].file != BAD_FILE) {
3408 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3409 inst->src[i]);
3410 mov->force_writemask_all = inst->force_writemask_all;
3411 mov->force_sechalf = inst->force_sechalf;
3412 inst->insert_before(block, mov);
3413 }
3414 dst = offset(dst, 1);
3415 }
3416
3417 inst->remove(block);
3418 progress = true;
3419 }
3420
3421 if (progress)
3422 invalidate_live_intervals();
3423
3424 return progress;
3425 }
3426
3427 bool
3428 fs_visitor::lower_integer_multiplication()
3429 {
3430 bool progress = false;
3431
3432 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3433 * directly, but Cherryview cannot.
3434 */
3435 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3436 return false;
3437
3438 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3439 if (inst->opcode != BRW_OPCODE_MUL ||
3440 inst->dst.is_accumulator() ||
3441 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3442 inst->dst.type != BRW_REGISTER_TYPE_UD))
3443 continue;
3444
3445 #define insert(instr) inst->insert_before(block, instr)
3446
3447 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3448 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3449 * src1 are used.
3450 *
3451 * If multiplying by an immediate value that fits in 16-bits, do a
3452 * single MUL instruction with that value in the proper location.
3453 */
3454 if (inst->src[1].file == IMM &&
3455 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3456 if (devinfo->gen < 7) {
3457 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3458 inst->dst.type, dispatch_width);
3459 insert(MOV(imm, inst->src[1]));
3460 insert(MUL(inst->dst, imm, inst->src[0]));
3461 } else {
3462 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3463 }
3464 } else {
3465 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3466 * do 32-bit integer multiplication in one instruction, but instead
3467 * must do a sequence (which actually calculates a 64-bit result):
3468 *
3469 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3470 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3471 * mov(8) g2<1>D acc0<8,8,1>D
3472 *
3473 * But on Gen > 6, the ability to use the second accumulator register
3474 * (acc1) for non-float data types was removed, preventing a simple
3475 * implementation in SIMD16. A 16-channel result can be calculated by
3476 * executing the three instructions twice in SIMD8, once with quarter
3477 * control of 1Q for the first eight channels and again with 2Q for
3478 * the second eight channels.
3479 *
3480 * Which accumulator register is implicitly accessed (by AccWrEnable
3481 * for instance) is determined by the quarter control. Unfortunately
3482 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3483 * implicit accumulator access by an instruction with 2Q will access
3484 * acc1 regardless of whether the data type is usable in acc1.
3485 *
3486 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3487 * integer data types.
3488 *
3489 * Since we only want the low 32-bits of the result, we can do two
3490 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3491 * adjust the high result and add them (like the mach is doing):
3492 *
3493 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3494 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3495 * shl(8) g9<1>D g8<8,8,1>D 16D
3496 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3497 *
3498 * We avoid the shl instruction by realizing that we only want to add
3499 * the low 16-bits of the "high" result to the high 16-bits of the
3500 * "low" result and using proper regioning on the add:
3501 *
3502 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3503 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3504 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3505 *
3506 * Since it does not use the (single) accumulator register, we can
3507 * schedule multi-component multiplications much better.
3508 */
3509
3510 if (inst->conditional_mod && inst->dst.is_null()) {
3511 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3512 inst->dst.type, dispatch_width);
3513 }
3514 fs_reg low = inst->dst;
3515 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3516 inst->dst.type, dispatch_width);
3517
3518 if (devinfo->gen >= 7) {
3519 fs_reg src1_0_w = inst->src[1];
3520 fs_reg src1_1_w = inst->src[1];
3521
3522 if (inst->src[1].file == IMM) {
3523 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3524 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3525 } else {
3526 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3527 src1_0_w.stride = 2;
3528
3529 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3530 src1_1_w.stride = 2;
3531 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3532 }
3533 insert(MUL(low, inst->src[0], src1_0_w));
3534 insert(MUL(high, inst->src[0], src1_1_w));
3535 } else {
3536 fs_reg src0_0_w = inst->src[0];
3537 fs_reg src0_1_w = inst->src[0];
3538
3539 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3540 src0_0_w.stride = 2;
3541
3542 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3543 src0_1_w.stride = 2;
3544 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3545
3546 insert(MUL(low, src0_0_w, inst->src[1]));
3547 insert(MUL(high, src0_1_w, inst->src[1]));
3548 }
3549
3550 fs_reg dst = inst->dst;
3551 dst.type = BRW_REGISTER_TYPE_UW;
3552 dst.subreg_offset = 2;
3553 dst.stride = 2;
3554
3555 high.type = BRW_REGISTER_TYPE_UW;
3556 high.stride = 2;
3557
3558 low.type = BRW_REGISTER_TYPE_UW;
3559 low.subreg_offset = 2;
3560 low.stride = 2;
3561
3562 insert(ADD(dst, low, high));
3563
3564 if (inst->conditional_mod) {
3565 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3566 fs_inst *mov = MOV(null, inst->dst);
3567 mov->conditional_mod = inst->conditional_mod;
3568 insert(mov);
3569 }
3570 }
3571 #undef insert
3572
3573 inst->remove(block);
3574 progress = true;
3575 }
3576
3577 if (progress)
3578 invalidate_live_intervals();
3579
3580 return progress;
3581 }
3582
3583 void
3584 fs_visitor::dump_instructions()
3585 {
3586 dump_instructions(NULL);
3587 }
3588
3589 void
3590 fs_visitor::dump_instructions(const char *name)
3591 {
3592 FILE *file = stderr;
3593 if (name && geteuid() != 0) {
3594 file = fopen(name, "w");
3595 if (!file)
3596 file = stderr;
3597 }
3598
3599 if (cfg) {
3600 calculate_register_pressure();
3601 int ip = 0, max_pressure = 0;
3602 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3603 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3604 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3605 dump_instruction(inst, file);
3606 ip++;
3607 }
3608 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3609 } else {
3610 int ip = 0;
3611 foreach_in_list(backend_instruction, inst, &instructions) {
3612 fprintf(file, "%4d: ", ip++);
3613 dump_instruction(inst, file);
3614 }
3615 }
3616
3617 if (file != stderr) {
3618 fclose(file);
3619 }
3620 }
3621
3622 void
3623 fs_visitor::dump_instruction(backend_instruction *be_inst)
3624 {
3625 dump_instruction(be_inst, stderr);
3626 }
3627
3628 void
3629 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3630 {
3631 fs_inst *inst = (fs_inst *)be_inst;
3632
3633 if (inst->predicate) {
3634 fprintf(file, "(%cf0.%d) ",
3635 inst->predicate_inverse ? '-' : '+',
3636 inst->flag_subreg);
3637 }
3638
3639 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3640 if (inst->saturate)
3641 fprintf(file, ".sat");
3642 if (inst->conditional_mod) {
3643 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3644 if (!inst->predicate &&
3645 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3646 inst->opcode != BRW_OPCODE_IF &&
3647 inst->opcode != BRW_OPCODE_WHILE))) {
3648 fprintf(file, ".f0.%d", inst->flag_subreg);
3649 }
3650 }
3651 fprintf(file, "(%d) ", inst->exec_size);
3652
3653 if (inst->mlen) {
3654 fprintf(file, "(mlen: %d) ", inst->mlen);
3655 }
3656
3657 switch (inst->dst.file) {
3658 case GRF:
3659 fprintf(file, "vgrf%d", inst->dst.reg);
3660 if (inst->dst.width != dispatch_width)
3661 fprintf(file, "@%d", inst->dst.width);
3662 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3663 inst->dst.subreg_offset)
3664 fprintf(file, "+%d.%d",
3665 inst->dst.reg_offset, inst->dst.subreg_offset);
3666 break;
3667 case MRF:
3668 fprintf(file, "m%d", inst->dst.reg);
3669 break;
3670 case BAD_FILE:
3671 fprintf(file, "(null)");
3672 break;
3673 case UNIFORM:
3674 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3675 break;
3676 case ATTR:
3677 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3678 break;
3679 case HW_REG:
3680 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3681 switch (inst->dst.fixed_hw_reg.nr) {
3682 case BRW_ARF_NULL:
3683 fprintf(file, "null");
3684 break;
3685 case BRW_ARF_ADDRESS:
3686 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3687 break;
3688 case BRW_ARF_ACCUMULATOR:
3689 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3690 break;
3691 case BRW_ARF_FLAG:
3692 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3693 inst->dst.fixed_hw_reg.subnr);
3694 break;
3695 default:
3696 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3697 inst->dst.fixed_hw_reg.subnr);
3698 break;
3699 }
3700 } else {
3701 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3702 }
3703 if (inst->dst.fixed_hw_reg.subnr)
3704 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3705 break;
3706 default:
3707 fprintf(file, "???");
3708 break;
3709 }
3710 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3711
3712 for (int i = 0; i < inst->sources; i++) {
3713 if (inst->src[i].negate)
3714 fprintf(file, "-");
3715 if (inst->src[i].abs)
3716 fprintf(file, "|");
3717 switch (inst->src[i].file) {
3718 case GRF:
3719 fprintf(file, "vgrf%d", inst->src[i].reg);
3720 if (inst->src[i].width != dispatch_width)
3721 fprintf(file, "@%d", inst->src[i].width);
3722 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3723 inst->src[i].subreg_offset)
3724 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3725 inst->src[i].subreg_offset);
3726 break;
3727 case MRF:
3728 fprintf(file, "***m%d***", inst->src[i].reg);
3729 break;
3730 case ATTR:
3731 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3732 break;
3733 case UNIFORM:
3734 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3735 if (inst->src[i].reladdr) {
3736 fprintf(file, "+reladdr");
3737 } else if (inst->src[i].subreg_offset) {
3738 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3739 inst->src[i].subreg_offset);
3740 }
3741 break;
3742 case BAD_FILE:
3743 fprintf(file, "(null)");
3744 break;
3745 case IMM:
3746 switch (inst->src[i].type) {
3747 case BRW_REGISTER_TYPE_F:
3748 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3749 break;
3750 case BRW_REGISTER_TYPE_W:
3751 case BRW_REGISTER_TYPE_D:
3752 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3753 break;
3754 case BRW_REGISTER_TYPE_UW:
3755 case BRW_REGISTER_TYPE_UD:
3756 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3757 break;
3758 case BRW_REGISTER_TYPE_VF:
3759 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3760 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3761 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3762 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3763 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3764 break;
3765 default:
3766 fprintf(file, "???");
3767 break;
3768 }
3769 break;
3770 case HW_REG:
3771 if (inst->src[i].fixed_hw_reg.negate)
3772 fprintf(file, "-");
3773 if (inst->src[i].fixed_hw_reg.abs)
3774 fprintf(file, "|");
3775 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3776 switch (inst->src[i].fixed_hw_reg.nr) {
3777 case BRW_ARF_NULL:
3778 fprintf(file, "null");
3779 break;
3780 case BRW_ARF_ADDRESS:
3781 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3782 break;
3783 case BRW_ARF_ACCUMULATOR:
3784 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3785 break;
3786 case BRW_ARF_FLAG:
3787 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3788 inst->src[i].fixed_hw_reg.subnr);
3789 break;
3790 default:
3791 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3792 inst->src[i].fixed_hw_reg.subnr);
3793 break;
3794 }
3795 } else {
3796 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3797 }
3798 if (inst->src[i].fixed_hw_reg.subnr)
3799 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3800 if (inst->src[i].fixed_hw_reg.abs)
3801 fprintf(file, "|");
3802 break;
3803 default:
3804 fprintf(file, "???");
3805 break;
3806 }
3807 if (inst->src[i].abs)
3808 fprintf(file, "|");
3809
3810 if (inst->src[i].file != IMM) {
3811 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3812 }
3813
3814 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3815 fprintf(file, ", ");
3816 }
3817
3818 fprintf(file, " ");
3819
3820 if (dispatch_width == 16 && inst->exec_size == 8) {
3821 if (inst->force_sechalf)
3822 fprintf(file, "2ndhalf ");
3823 else
3824 fprintf(file, "1sthalf ");
3825 }
3826
3827 fprintf(file, "\n");
3828 }
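
/* As a rough illustration (not verbatim driver output), a predicated SIMD16
 * add would be printed by dump_instruction() along the lines of:
 *
 *    (+f0.0) add(16) vgrf7:F, vgrf3:F, u1:F
 *
 * i.e. predicate, opcode (plus ".sat"/conditional modifier if present),
 * execution size, then the destination and each source followed by a type
 * letter.
 */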
3829
3830 /**
3831 * Possibly returns an instruction that set up @param reg.
3832 *
3833 * Sometimes we want to take the result of some expression/variable
3834 * dereference tree and rewrite the instruction generating the result
3835 * of the tree. When processing the tree, we know that the
3836 * instructions generated are all writing temporaries that are dead
3837 * outside of this tree. So, if we have some instructions that write
3838 * a temporary, we're free to point that temp write somewhere else.
3839 *
3840 * Note that this doesn't guarantee that the returned instruction wrote
3841 * only reg -- it might be the size=4 destination of a texture instruction.
3842 */
3843 fs_inst *
3844 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3845 fs_inst *end,
3846 const fs_reg &reg)
3847 {
3848 if (end == start ||
3849 end->is_partial_write() ||
3850 reg.reladdr ||
3851 !reg.equals(end->dst)) {
3852 return NULL;
3853 } else {
3854 return end;
3855 }
3856 }
3857
3858 void
3859 fs_visitor::setup_payload_gen6()
3860 {
3861 bool uses_depth =
3862 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3863 unsigned barycentric_interp_modes =
3864 (stage == MESA_SHADER_FRAGMENT) ?
3865 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3866
3867 assert(devinfo->gen >= 6);
3868
3869 /* R0-1: masks, pixel X/Y coordinates. */
3870 payload.num_regs = 2;
3871 /* R2: only for 32-pixel dispatch. */
3872
3873 /* R3-26: barycentric interpolation coordinates. These appear in the
3874 * same order that they appear in the brw_wm_barycentric_interp_mode
3875 * enum. Each set of coordinates occupies 2 registers if dispatch width
3876 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3877 * appear if they were enabled using the "Barycentric Interpolation
3878 * Mode" bits in WM_STATE.
3879 */
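   /* Note that the packing below is tight: for example, with a single
    * barycentric mode enabled in SIMD8 its coordinates land at payload
    * registers 2-3 (2-5 in SIMD16), immediately after the R0-1 header.
    */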
3880 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3881 if (barycentric_interp_modes & (1 << i)) {
3882 payload.barycentric_coord_reg[i] = payload.num_regs;
3883 payload.num_regs += 2;
3884 if (dispatch_width == 16) {
3885 payload.num_regs += 2;
3886 }
3887 }
3888 }
3889
3890 /* R27: interpolated depth if uses source depth */
3891 if (uses_depth) {
3892 payload.source_depth_reg = payload.num_regs;
3893 payload.num_regs++;
3894 if (dispatch_width == 16) {
3895 /* R28: interpolated depth if not SIMD8. */
3896 payload.num_regs++;
3897 }
3898 }
3899 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3900 if (uses_depth) {
3901 payload.source_w_reg = payload.num_regs;
3902 payload.num_regs++;
3903 if (dispatch_width == 16) {
3904 /* R30: interpolated W if not SIMD8. */
3905 payload.num_regs++;
3906 }
3907 }
3908
3909 if (stage == MESA_SHADER_FRAGMENT) {
3910 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3911 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3912 prog_data->uses_pos_offset = key->compute_pos_offset;
3913 /* R31: MSAA position offsets. */
3914 if (prog_data->uses_pos_offset) {
3915 payload.sample_pos_reg = payload.num_regs;
3916 payload.num_regs++;
3917 }
3918 }
3919
3920 /* R32: MSAA input coverage mask */
3921 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3922 assert(devinfo->gen >= 7);
3923 payload.sample_mask_in_reg = payload.num_regs;
3924 payload.num_regs++;
3925 if (dispatch_width == 16) {
3926 /* R33: input coverage mask if not SIMD8. */
3927 payload.num_regs++;
3928 }
3929 }
3930
3931 /* R34-: bary for 32-pixel. */
3932 /* R58-59: interp W for 32-pixel. */
3933
3934 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3935 source_depth_to_render_target = true;
3936 }
3937 }
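
/* A worked example of the accounting above (illustrative only): a SIMD8
 * fragment shader using one barycentric mode plus source depth and W ends up
 * with num_regs = 2 (header) + 2 (barycentrics) + 1 (depth) + 1 (W) = 6,
 * i.e. source_depth_reg = 4 and source_w_reg = 5.
 */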
3938
3939 void
3940 fs_visitor::setup_vs_payload()
3941 {
3942 /* R0: thread header, R1: urb handles */
3943 payload.num_regs = 2;
3944 }
3945
3946 void
3947 fs_visitor::setup_cs_payload()
3948 {
3949 assert(brw->gen >= 7);
3950
3951 payload.num_regs = 1;
3952 }
3953
3954 void
3955 fs_visitor::assign_binding_table_offsets()
3956 {
3957 assert(stage == MESA_SHADER_FRAGMENT);
3958 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3959 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3960 uint32_t next_binding_table_offset = 0;
3961
3962 /* If there are no color regions, we still perform an FB write to a null
3963 * renderbuffer, which we place at surface index 0.
3964 */
3965 prog_data->binding_table.render_target_start = next_binding_table_offset;
3966 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3967
3968 assign_common_binding_table_offsets(next_binding_table_offset);
3969 }
3970
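/**
 * Estimate register pressure: for each instruction IP, sum the GRF sizes of
 * all virtual registers whose live interval covers that IP and record the
 * total in regs_live_at_ip[].
 */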
3971 void
3972 fs_visitor::calculate_register_pressure()
3973 {
3974 invalidate_live_intervals();
3975 calculate_live_intervals();
3976
3977 unsigned num_instructions = 0;
3978 foreach_block(block, cfg)
3979 num_instructions += block->instructions.length();
3980
3981 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3982
3983 for (unsigned reg = 0; reg < alloc.count; reg++) {
3984 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3985 regs_live_at_ip[ip] += alloc.sizes[reg];
3986 }
3987 }
3988
3989 void
3990 fs_visitor::optimize()
3991 {
3992 /* bld is the common builder object pointing at the end of the program we
3993 * used to translate it into i965 IR. For the optimization and lowering
3994 * passes coming next, any code added after the end of the program without
3995 * having explicitly called fs_builder::at() clearly points at a mistake.
3996 * Ideally optimization passes wouldn't be part of the visitor so they
3997 * wouldn't have access to bld at all, but they do, so just in case some
3998 * pass forgets to ask for a location explicitly, set bld to NULL here
3999 * so that any such use trips.
4000 */
4001 bld = bld.at(NULL, NULL);
4002
4003 split_virtual_grfs();
4004
4005 move_uniform_array_access_to_pull_constants();
4006 assign_constant_locations();
4007 demote_pull_constants();
4008
4009 #define OPT(pass, args...) ({ \
4010 pass_num++; \
4011 bool this_progress = pass(args); \
4012 \
4013 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4014 char filename[64]; \
4015 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4016 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4017 \
4018 backend_shader::dump_instructions(filename); \
4019 } \
4020 \
4021 progress = progress || this_progress; \
4022 this_progress; \
4023 })
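
/* When DEBUG_OPTIMIZER is set in INTEL_DEBUG, every pass that makes progress
 * dumps the IR to a file named from the format string above, e.g.
 * (hypothetical program name and counters) "FS8-0003-01-02-opt_copy_propagate":
 * stage abbreviation and dispatch width, GL program name, iteration, pass
 * number, pass name.
 */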
4024
4025 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4026 char filename[64];
4027 snprintf(filename, 64, "%s%d-%04d-00-start",
4028 stage_abbrev, dispatch_width,
4029 shader_prog ? shader_prog->Name : 0);
4030
4031 backend_shader::dump_instructions(filename);
4032 }
4033
4034 bool progress;
4035 int iteration = 0;
4036 int pass_num = 0;
4037 do {
4038 progress = false;
4039 pass_num = 0;
4040 iteration++;
4041
4042 OPT(remove_duplicate_mrf_writes);
4043
4044 OPT(opt_algebraic);
4045 OPT(opt_cse);
4046 OPT(opt_copy_propagate);
4047 OPT(opt_peephole_predicated_break);
4048 OPT(opt_cmod_propagation);
4049 OPT(dead_code_eliminate);
4050 OPT(opt_peephole_sel);
4051 OPT(dead_control_flow_eliminate, this);
4052 OPT(opt_register_renaming);
4053 OPT(opt_redundant_discard_jumps);
4054 OPT(opt_saturate_propagation);
4055 OPT(opt_zero_samples);
4056 OPT(register_coalesce);
4057 OPT(compute_to_mrf);
4058 OPT(eliminate_find_live_channel);
4059
4060 OPT(compact_virtual_grfs);
4061 } while (progress);
4062
4063 pass_num = 0;
4064
4065 OPT(opt_sampler_eot);
4066
4067 if (OPT(lower_load_payload)) {
4068 split_virtual_grfs();
4069 OPT(register_coalesce);
4070 OPT(compute_to_mrf);
4071 OPT(dead_code_eliminate);
4072 }
4073
4074 OPT(opt_combine_constants);
4075 OPT(lower_integer_multiplication);
4076
4077 lower_uniform_pull_constant_loads();
4078 }
4079
4080 /**
4081 * A three-source instruction must have a GRF/MRF destination register.
4082 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4083 */
4084 void
4085 fs_visitor::fixup_3src_null_dest()
4086 {
4087 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4088 if (inst->is_3src() && inst->dst.is_null()) {
4089 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4090 inst->dst.type);
4091 }
4092 }
4093 }
4094
4095 void
4096 fs_visitor::allocate_registers()
4097 {
4098 bool allocated_without_spills;
4099
4100 static const enum instruction_scheduler_mode pre_modes[] = {
4101 SCHEDULE_PRE,
4102 SCHEDULE_PRE_NON_LIFO,
4103 SCHEDULE_PRE_LIFO,
4104 };
4105
4106 /* Try each scheduling heuristic to see if it can successfully register
4107 * allocate without spilling. They should be ordered by decreasing
4108 * performance but increasing likelihood of allocating.
4109 */
4110 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4111 schedule_instructions(pre_modes[i]);
4112
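      /* Debugging escape hatch: flip this to take the trivial register
       * assignment path instead of the real allocator.
       */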
4113 if (0) {
4114 assign_regs_trivial();
4115 allocated_without_spills = true;
4116 } else {
4117 allocated_without_spills = assign_regs(false);
4118 }
4119 if (allocated_without_spills)
4120 break;
4121 }
4122
4123 if (!allocated_without_spills) {
4124 /* We assume that any spilling is worse than just dropping back to
4125 * SIMD8.  There's probably some intermediate point where SIMD16 with
4126 * a couple of spills is still better.
4127 */
4128 if (dispatch_width == 16) {
4129 fail("Failure to register allocate. Reduce number of "
4130 "live scalar values to avoid this.");
4131 } else {
4132 perf_debug("%s shader triggered register spilling. "
4133 "Try reducing the number of live scalar values to "
4134 "improve performance.\n", stage_name);
4135 }
4136
4137 /* Since we're out of heuristics, just go spill registers until we
4138 * get an allocation.
4139 */
4140 while (!assign_regs(true)) {
4141 if (failed)
4142 break;
4143 }
4144 }
4145
4146 /* This must come after all optimization and register allocation, since
4147 * it inserts dead code that happens to have side effects, and it does
4148 * so based on the actual physical registers in use.
4149 */
4150 insert_gen4_send_dependency_workarounds();
4151
4152 if (failed)
4153 return;
4154
4155 if (!allocated_without_spills)
4156 schedule_instructions(SCHEDULE_POST);
4157
4158 if (last_scratch > 0)
4159 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4160 }
4161
4162 bool
4163 fs_visitor::run_vs()
4164 {
4165 assert(stage == MESA_SHADER_VERTEX);
4166
4167 assign_common_binding_table_offsets(0);
4168 setup_vs_payload();
4169
4170 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4171 emit_shader_time_begin();
4172
4173 emit_nir_code();
4174
4175 if (failed)
4176 return false;
4177
4178 emit_urb_writes();
4179
4180 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4181 emit_shader_time_end();
4182
4183 calculate_cfg();
4184
4185 optimize();
4186
4187 assign_curb_setup();
4188 assign_vs_urb_setup();
4189
4190 fixup_3src_null_dest();
4191 allocate_registers();
4192
4193 return !failed;
4194 }
4195
4196 bool
4197 fs_visitor::run_fs()
4198 {
4199 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4200 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4201
4202 assert(stage == MESA_SHADER_FRAGMENT);
4203
4204 sanity_param_count = prog->Parameters->NumParameters;
4205
4206 assign_binding_table_offsets();
4207
4208 if (devinfo->gen >= 6)
4209 setup_payload_gen6();
4210 else
4211 setup_payload_gen4();
4212
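   /* Debugging escape hatch: flip this to emit a trivial shader instead of
    * compiling the real program.
    */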
4213 if (0) {
4214 emit_dummy_fs();
4215 } else if (brw->use_rep_send && dispatch_width == 16) {
4216 emit_repclear_shader();
4217 } else {
4218 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4219 emit_shader_time_begin();
4220
4221 calculate_urb_setup();
4222 if (prog->InputsRead > 0) {
4223 if (devinfo->gen < 6)
4224 emit_interpolation_setup_gen4();
4225 else
4226 emit_interpolation_setup_gen6();
4227 }
4228
4229 /* We handle discards by keeping track of the still-live pixels in f0.1.
4230 * Initialize it with the dispatched pixels.
4231 */
4232 if (wm_prog_data->uses_kill) {
4233 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4234 discard_init->flag_subreg = 1;
4235 }
4236
4237 /* Generate FS IR for main(). (the visitor only descends into
4238 * functions called "main").
4239 */
4240 emit_nir_code();
4241
4242 if (failed)
4243 return false;
4244
4245 if (wm_prog_data->uses_kill)
4246 emit(FS_OPCODE_PLACEHOLDER_HALT);
4247
4248 if (wm_key->alpha_test_func)
4249 emit_alpha_test();
4250
4251 emit_fb_writes();
4252
4253 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4254 emit_shader_time_end();
4255
4256 calculate_cfg();
4257
4258 optimize();
4259
4260 assign_curb_setup();
4261 assign_urb_setup();
4262
4263 fixup_3src_null_dest();
4264 allocate_registers();
4265
4266 if (failed)
4267 return false;
4268 }
4269
4270 if (dispatch_width == 8)
4271 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4272 else
4273 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4274
4275 /* If any state parameters were appended, then ParameterValues could have
4276 * been realloced, in which case the driver uniform storage set up by
4277 * _mesa_associate_uniform_storage() would point to freed memory. Make
4278 * sure that didn't happen.
4279 */
4280 assert(sanity_param_count == prog->Parameters->NumParameters);
4281
4282 return !failed;
4283 }
4284
4285 bool
4286 fs_visitor::run_cs()
4287 {
4288 assert(stage == MESA_SHADER_COMPUTE);
4289 assert(shader);
4290
4291 sanity_param_count = prog->Parameters->NumParameters;
4292
4293 assign_common_binding_table_offsets(0);
4294
4295 setup_cs_payload();
4296
4297 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4298 emit_shader_time_begin();
4299
4300 emit_nir_code();
4301
4302 if (failed)
4303 return false;
4304
4305 emit_cs_terminate();
4306
4307 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4308 emit_shader_time_end();
4309
4310 calculate_cfg();
4311
4312 optimize();
4313
4314 assign_curb_setup();
4315
4316 fixup_3src_null_dest();
4317 allocate_registers();
4318
4319 if (failed)
4320 return false;
4321
4322 /* If any state parameters were appended, then ParameterValues could have
4323 * been realloced, in which case the driver uniform storage set up by
4324 * _mesa_associate_uniform_storage() would point to freed memory. Make
4325 * sure that didn't happen.
4326 */
4327 assert(sanity_param_count == prog->Parameters->NumParameters);
4328
4329 return !failed;
4330 }
4331
4332 const unsigned *
4333 brw_wm_fs_emit(struct brw_context *brw,
4334 void *mem_ctx,
4335 const struct brw_wm_prog_key *key,
4336 struct brw_wm_prog_data *prog_data,
4337 struct gl_fragment_program *fp,
4338 struct gl_shader_program *prog,
4339 unsigned *final_assembly_size)
4340 {
4341 bool start_busy = false;
4342 double start_time = 0;
4343
4344 if (unlikely(brw->perf_debug)) {
4345 start_busy = (brw->batch.last_bo &&
4346 drm_intel_bo_busy(brw->batch.last_bo));
4347 start_time = get_time();
4348 }
4349
4350 struct brw_shader *shader = NULL;
4351 if (prog)
4352 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4353
4354 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4355 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4356
4357 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4358 */
4359 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4360 prog, &fp->Base, 8);
4361 if (!v.run_fs()) {
4362 if (prog) {
4363 prog->LinkStatus = false;
4364 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4365 }
4366
4367 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4368 v.fail_msg);
4369
4370 return NULL;
4371 }
4372
4373 cfg_t *simd16_cfg = NULL;
4374 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4375 prog, &fp->Base, 16);
4376 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4377 if (!v.simd16_unsupported) {
4378 /* Try a SIMD16 compile */
4379 v2.import_uniforms(&v);
4380 if (!v2.run_fs()) {
4381 perf_debug("SIMD16 shader failed to compile, falling back to "
4382 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4383 } else {
4384 simd16_cfg = v2.cfg;
4385 }
4386 } else {
4387 perf_debug("SIMD16 shader unsupported, falling back to "
4388 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4389 }
4390 }
4391
4392 cfg_t *simd8_cfg;
4393 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4394 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4395 simd8_cfg = NULL;
4396 prog_data->no_8 = true;
4397 } else {
4398 simd8_cfg = v.cfg;
4399 prog_data->no_8 = false;
4400 }
4401
4402 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4403 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4404
4405 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4406 char *name;
4407 if (prog)
4408 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4409 prog->Label ? prog->Label : "unnamed",
4410 prog->Name);
4411 else
4412 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4413
4414 g.enable_debug(name);
4415 }
4416
4417 if (simd8_cfg)
4418 g.generate_code(simd8_cfg, 8);
4419 if (simd16_cfg)
4420 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4421
4422 if (unlikely(brw->perf_debug) && shader) {
4423 if (shader->compiled_once)
4424 brw_wm_debug_recompile(brw, prog, key);
4425 shader->compiled_once = true;
4426
4427 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4428 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4429 (get_time() - start_time) * 1000);
4430 }
4431 }
4432
4433 return g.get_assembly(final_assembly_size);
4434 }
4435
4436 extern "C" bool
4437 brw_fs_precompile(struct gl_context *ctx,
4438 struct gl_shader_program *shader_prog,
4439 struct gl_program *prog)
4440 {
4441 struct brw_context *brw = brw_context(ctx);
4442 struct brw_wm_prog_key key;
4443
4444 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4445 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4446 bool program_uses_dfdy = fp->UsesDFdy;
4447
4448 memset(&key, 0, sizeof(key));
4449
4450 if (brw->gen < 6) {
4451 if (fp->UsesKill)
4452 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4453
4454 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4455 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4456
4457 /* Just assume depth testing. */
4458 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4459 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4460 }
4461
4462 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4463 BRW_FS_VARYING_INPUT_MASK) > 16)
4464 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4465
4466 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4467
4468 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4469 key.drawable_height = ctx->DrawBuffer->Height;
4470 }
4471
4472 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4473 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4474 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4475
4476 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4477 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4478 key.nr_color_regions > 1;
4479 }
4480
4481 key.program_string_id = bfp->id;
4482
4483 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4484 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4485
4486 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4487
4488 brw->wm.base.prog_offset = old_prog_offset;
4489 brw->wm.prog_data = old_prog_data;
4490
4491 return success;
4492 }
4493
4494 void
4495 brw_setup_tex_for_precompile(struct brw_context *brw,
4496 struct brw_sampler_prog_key_data *tex,
4497 struct gl_program *prog)
4498 {
4499 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4500 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4501 for (unsigned i = 0; i < sampler_count; i++) {
4502 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4503 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4504 tex->swizzles[i] =
4505 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4506 } else {
4507 /* Color sampler: assume no swizzling. */
4508 tex->swizzles[i] = SWIZZLE_XYZW;
4509 }
4510 }
4511 }