b46879b7beba628ddbfef39859bf1151c1fad7fd
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
83 src_reg src0, src_reg src1, src_reg src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
86 src0, src1, src2));
87 }
88
89
90 vec4_instruction *
91 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
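/* For reference, ALU2(ADD) above expands to an emitter helper of this
 * shape (a paraphrase of the macro expansion, not additional code):
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * These helpers only construct the instruction; callers still wrap them in
 * emit(), e.g. emit(ADD(dst, a, b)), to append them to the instruction list.
 */
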
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* The original gen4 hardware does type conversion to the destination
228 * type before comparison, producing garbage results for floating-point
229 * comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
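
/* Usage sketch (operand names are illustrative): for a GLSL dot() of two
 * vec3 values the visitor ends up calling
 *
 *    emit_dp(result_dst, op[0], op[1], 3);
 *
 * which picks BRW_OPCODE_DP3 from the table above; elements == 2 and 4
 * select DP2 and DP4 respectively.
 */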
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
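
/* Typical use, as in try_emit_mad() below: each operand of a three-source
 * instruction is run through this fixup first, e.g.
 *
 *    src_reg src0 = fix_3src_operand(this->result);
 *    ...
 *    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
 *
 * so that a uniform or immediate argument is replicated into a temporary
 * GRF that the MAD can legally read.
 */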
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 /* The gen6 math instruction ignores the source modifiers --
313 * swizzle, abs, negate, and at least some parts of the register
314 * region description.
315 *
316 * Rather than trying to enumerate all these cases, *always* expand the
317 * operand to a temp GRF for gen6.
318 *
319 * For gen7, keep the operand as-is, except if immediate, which gen7 still
320 * can't use.
321 */
322
323 if (brw->gen == 7 && src.file != IMM)
324 return src;
325
326 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
327 expanded.type = src.type;
328 emit(MOV(expanded, src));
329 return src_reg(expanded);
330 }
331
332 void
333 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
334 {
335 src = fix_math_operand(src);
336
337 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
338 /* The gen6 math instruction must be align1, so we can't do
339 * writemasks.
340 */
341 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
342
343 emit(opcode, temp_dst, src);
344
345 emit(MOV(dst, src_reg(temp_dst)));
346 } else {
347 emit(opcode, dst, src);
348 }
349 }
350
351 void
352 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
353 {
354 vec4_instruction *inst = emit(opcode, dst, src);
355 inst->base_mrf = 1;
356 inst->mlen = 1;
357 }
358
359 void
360 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
361 {
362 switch (opcode) {
363 case SHADER_OPCODE_RCP:
364 case SHADER_OPCODE_RSQ:
365 case SHADER_OPCODE_SQRT:
366 case SHADER_OPCODE_EXP2:
367 case SHADER_OPCODE_LOG2:
368 case SHADER_OPCODE_SIN:
369 case SHADER_OPCODE_COS:
370 break;
371 default:
372 unreachable("not reached: bad math opcode");
373 }
374
375 if (brw->gen >= 8) {
376 emit(opcode, dst, src);
377 } else if (brw->gen >= 6) {
378 emit_math1_gen6(opcode, dst, src);
379 } else {
380 emit_math1_gen4(opcode, dst, src);
381 }
382 }
383
384 void
385 vec4_visitor::emit_math2_gen6(enum opcode opcode,
386 dst_reg dst, src_reg src0, src_reg src1)
387 {
388 src0 = fix_math_operand(src0);
389 src1 = fix_math_operand(src1);
390
391 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
392 /* The gen6 math instruction must be align1, so we can't do
393 * writemasks.
394 */
395 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
396 temp_dst.type = dst.type;
397
398 emit(opcode, temp_dst, src0, src1);
399
400 emit(MOV(dst, src_reg(temp_dst)));
401 } else {
402 emit(opcode, dst, src0, src1);
403 }
404 }
405
406 void
407 vec4_visitor::emit_math2_gen4(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 vec4_instruction *inst = emit(opcode, dst, src0, src1);
411 inst->base_mrf = 1;
412 inst->mlen = 2;
413 }
414
415 void
416 vec4_visitor::emit_math(enum opcode opcode,
417 dst_reg dst, src_reg src0, src_reg src1)
418 {
419 switch (opcode) {
420 case SHADER_OPCODE_POW:
421 case SHADER_OPCODE_INT_QUOTIENT:
422 case SHADER_OPCODE_INT_REMAINDER:
423 break;
424 default:
425 unreachable("not reached: unsupported binary math opcode");
426 }
427
428 if (brw->gen >= 8) {
429 emit(opcode, dst, src0, src1);
430 } else if (brw->gen >= 6) {
431 emit_math2_gen6(opcode, dst, src0, src1);
432 } else {
433 emit_math2_gen4(opcode, dst, src0, src1);
434 }
435 }
436
437 void
438 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_pack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_UD);
445 assert(src0.type == BRW_REGISTER_TYPE_F);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the destination data type must be Word (W).
451 *
452 * The destination must be DWord-aligned and specify a horizontal stride
453 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
454 * each destination channel and the upper word is not modified.
455 *
456 * The above restriction implies that the f32to16 instruction must use
457 * align1 mode, because only in align1 mode is it possible to specify
458 * horizontal stride. We choose here to defy the hardware docs and emit
459 * align16 instructions.
460 *
461 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
462 * instructions. I was partially successful in that the code passed all
463 * tests. However, the code was dubiously correct and fragile, and the
464 * tests were not harsh enough to probe that frailty. Not trusting the
465 * code, I chose instead to remain in align16 mode in defiance of the hw
466 * docs).
467 *
468 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
469 * simulator, emitting a f32to16 in align16 mode with UD as destination
470 * data type is safe. The behavior differs from that specified in the PRM
471 * in that the upper word of each destination channel is cleared to 0.
472 */
473
474 dst_reg tmp_dst(this, glsl_type::uvec2_type);
475 src_reg tmp_src(tmp_dst);
476
477 #if 0
478 /* Verify the undocumented behavior on which the following instructions
479 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
480 * then the result of the bit-or instruction below will be incorrect.
481 *
482 * You should inspect the disasm output in order to verify that the MOV is
483 * not optimized away.
484 */
485 emit(MOV(tmp_dst, src_reg(0x12345678u)));
486 #endif
487
488 /* Give tmp the form below, where "." means untouched.
489 *
490 * w z y x w z y x
491 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
492 *
493 * That the upper word of each write-channel be 0 is required for the
494 * following bit-shift and bit-or instructions to work. Note that this
495 * relies on the undocumented hardware behavior mentioned above.
496 */
497 tmp_dst.writemask = WRITEMASK_XY;
498 emit(F32TO16(tmp_dst, src0));
499
500 /* Give the write-channels of dst the form:
501 * 0xhhhh0000
502 */
503 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
504 emit(SHL(dst, tmp_src, src_reg(16u)));
505
506 /* Finally, give the write-channels of dst the form of packHalf2x16's
507 * output:
508 * 0xhhhhllll
509 */
510 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
511 emit(OR(dst, src_reg(dst), tmp_src));
512 }
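
/* Putting the three instructions above together, every enabled channel of
 * dst ends up with (rounding details aside):
 *
 *    lo  = f32to16(src0.x)        // tmp.x = 0x0000llll
 *    hi  = f32to16(src0.y)        // tmp.y = 0x0000hhhh
 *    dst = (hi << 16) | lo        // 0xhhhhllll, i.e. packHalf2x16()
 *
 * The SHL reads tmp through a YYYY swizzle and the OR through an XXXX
 * swizzle, which is how the shuffle is expressed while staying in align16
 * mode.
 */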
513
514 void
515 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
516 {
517 if (brw->gen < 7) {
518 unreachable("ir_unop_unpack_half_2x16 should be lowered");
519 }
520
521 assert(dst.type == BRW_REGISTER_TYPE_F);
522 assert(src0.type == BRW_REGISTER_TYPE_UD);
523
524 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
525 *
526 * Because this instruction does not have a 16-bit floating-point type,
527 * the source data type must be Word (W). The destination type must be
528 * F (Float).
529 *
530 * To use W as the source data type, we must adjust horizontal strides,
531 * which is only possible in align1 mode. All my [chadv] attempts at
532 * emitting align1 instructions for unpackHalf2x16 failed to pass the
533 * Piglit tests, so I gave up.
534 *
535 * I've verified that, on gen7 hardware and the simulator, it is safe to
536 * emit f16to32 in align16 mode with UD as source data type.
537 */
538
539 dst_reg tmp_dst(this, glsl_type::uvec2_type);
540 src_reg tmp_src(tmp_dst);
541
542 tmp_dst.writemask = WRITEMASK_X;
543 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
544
545 tmp_dst.writemask = WRITEMASK_Y;
546 emit(SHR(tmp_dst, src0, src_reg(16u)));
547
548 dst.writemask = WRITEMASK_XY;
549 emit(F16TO32(dst, tmp_src));
550 }
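
/* In other words, for the two result channels this computes
 *
 *    dst.x = f16to32(src0 & 0xffff)    // low half, staged in tmp.x
 *    dst.y = f16to32(src0 >> 16)       // high half, staged in tmp.y
 *
 * which is the inverse of the packing sequence in emit_pack_half_2x16()
 * above.
 */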
551
552 void
553 vec4_visitor::visit_instructions(const exec_list *list)
554 {
555 foreach_in_list(ir_instruction, ir, list) {
556 base_ir = ir;
557 ir->accept(this);
558 }
559 }
560
561
562 static int
563 type_size(const struct glsl_type *type)
564 {
565 unsigned int i;
566 int size;
567
568 switch (type->base_type) {
569 case GLSL_TYPE_UINT:
570 case GLSL_TYPE_INT:
571 case GLSL_TYPE_FLOAT:
572 case GLSL_TYPE_BOOL:
573 if (type->is_matrix()) {
574 return type->matrix_columns;
575 } else {
576 /* Regardless of the size of the vector, it gets a vec4. This is bad
577 * packing for things like floats, but otherwise arrays become a
578 * mess. Hopefully a later pass over the code can pack scalars
579 * down if appropriate.
580 */
581 return 1;
582 }
583 case GLSL_TYPE_ARRAY:
584 assert(type->length > 0);
585 return type_size(type->fields.array) * type->length;
586 case GLSL_TYPE_STRUCT:
587 size = 0;
588 for (i = 0; i < type->length; i++) {
589 size += type_size(type->fields.structure[i].type);
590 }
591 return size;
592 case GLSL_TYPE_SAMPLER:
593 /* Samplers take up no register space, since they're baked in at
594 * link time.
595 */
596 return 0;
597 case GLSL_TYPE_ATOMIC_UINT:
598 return 0;
599 case GLSL_TYPE_IMAGE:
600 case GLSL_TYPE_VOID:
601 case GLSL_TYPE_ERROR:
602 case GLSL_TYPE_INTERFACE:
603 unreachable("not reached");
604 }
605
606 return 0;
607 }
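
/* Some consequences of the sizing rules above, in vec4 register units:
 * a float, vec2 or vec4 each take 1 slot; a mat3 takes 3 (one per column);
 * float[4] takes 4; and struct { vec3 a; float b; } takes 2. Scalars and
 * small vectors are thus padded out to a full vec4 slot, as the comment
 * above notes.
 */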
608
609 int
610 vec4_visitor::virtual_grf_alloc(int size)
611 {
612 if (virtual_grf_array_size <= virtual_grf_count) {
613 if (virtual_grf_array_size == 0)
614 virtual_grf_array_size = 16;
615 else
616 virtual_grf_array_size *= 2;
617 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
618 virtual_grf_array_size);
619 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
620 virtual_grf_array_size);
621 }
622 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
623 virtual_grf_reg_count += size;
624 virtual_grf_sizes[virtual_grf_count] = size;
625 return virtual_grf_count++;
626 }
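
/* Bookkeeping example: after virtual_grf_alloc(1) followed by
 * virtual_grf_alloc(4) (say, a vec4 temporary and then a mat4), the arrays
 * hold
 *
 *    virtual_grf_sizes   = { 1, 4 }
 *    virtual_grf_reg_map = { 0, 1 }   // first flat register of each GRF
 *    virtual_grf_reg_count = 5
 *
 * and the two calls return virtual GRF numbers 0 and 1.
 */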
627
628 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
629 {
630 init();
631
632 this->file = GRF;
633 this->reg = v->virtual_grf_alloc(type_size(type));
634
635 if (type->is_array() || type->is_record()) {
636 this->swizzle = BRW_SWIZZLE_NOOP;
637 } else {
638 this->swizzle = swizzle_for_size(type->vector_elements);
639 }
640
641 this->type = brw_type_for_base_type(type);
642 }
643
644 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
645 {
646 assert(size > 0);
647
648 init();
649
650 this->file = GRF;
651 this->reg = v->virtual_grf_alloc(type_size(type) * size);
652
653 this->swizzle = BRW_SWIZZLE_NOOP;
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
659 {
660 init();
661
662 this->file = GRF;
663 this->reg = v->virtual_grf_alloc(type_size(type));
664
665 if (type->is_array() || type->is_record()) {
666 this->writemask = WRITEMASK_XYZW;
667 } else {
668 this->writemask = (1 << type->vector_elements) - 1;
669 }
670
671 this->type = brw_type_for_base_type(type);
672 }
673
674 /* Our support for uniforms is piggy-backed on the struct
675 * gl_fragment_program, because that's where the values actually
676 * get stored, rather than in some global gl_shader_program uniform
677 * store.
678 */
679 void
680 vec4_visitor::setup_uniform_values(ir_variable *ir)
681 {
682 int namelen = strlen(ir->name);
683
684 /* The data for our (non-builtin) uniforms is stored in a series of
685 * gl_uniform_driver_storage structs for each subcomponent that
686 * glGetUniformLocation() could name. We know it's been set up in the same
687 * order we'd walk the type, so walk the list of storage and find anything
688 * with our name, or the prefix of a component that starts with our name.
689 */
690 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
691 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
692
693 if (strncmp(ir->name, storage->name, namelen) != 0 ||
694 (storage->name[namelen] != 0 &&
695 storage->name[namelen] != '.' &&
696 storage->name[namelen] != '[')) {
697 continue;
698 }
699
700 gl_constant_value *components = storage->storage;
701 unsigned vector_count = (MAX2(storage->array_elements, 1) *
702 storage->type->matrix_columns);
703
704 for (unsigned s = 0; s < vector_count; s++) {
705 assert(uniforms < uniform_array_size);
706 uniform_vector_size[uniforms] = storage->type->vector_elements;
707
708 int i;
709 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
710 stage_prog_data->param[uniforms * 4 + i] = components;
711 components++;
712 }
713 for (; i < 4; i++) {
714 static gl_constant_value zero = { 0.0 };
715 stage_prog_data->param[uniforms * 4 + i] = &zero;
716 }
717
718 uniforms++;
719 }
720 }
721 }
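
/* A concrete (hypothetical) example: a user uniform declared "mat2 m[3]"
 * has vector_count = 3 * 2 = 6, so it fills six consecutive uniform slots.
 * Each slot records vector_elements = 2 live components, and the remaining
 * param[] entries for that slot point at the shared zero constant, so every
 * uniform slot always spans a full vec4 worth of param pointers.
 */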
722
723 void
724 vec4_visitor::setup_uniform_clipplane_values()
725 {
726 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
727
728 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
729 assert(this->uniforms < uniform_array_size);
730 this->uniform_vector_size[this->uniforms] = 4;
731 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
732 this->userplane[i].type = BRW_REGISTER_TYPE_F;
733 for (int j = 0; j < 4; ++j) {
734 stage_prog_data->param[this->uniforms * 4 + j] =
735 (gl_constant_value *) &clip_planes[i][j];
736 }
737 ++this->uniforms;
738 }
739 }
740
741 /* Our support for builtin uniforms is even scarier than non-builtin.
742 * It sits on top of the PROG_STATE_VAR parameters that are
743 * automatically updated from GL context state.
744 */
745 void
746 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
747 {
748 const ir_state_slot *const slots = ir->get_state_slots();
749 assert(slots != NULL);
750
751 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
752 /* This state reference has already been setup by ir_to_mesa,
753 * but we'll get the same index back here. We can reference
754 * ParameterValues directly, since unlike brw_fs.cpp, we never
755 * add new state references during compile.
756 */
757 int index = _mesa_add_state_reference(this->prog->Parameters,
758 (gl_state_index *)slots[i].tokens);
759 gl_constant_value *values =
760 &this->prog->Parameters->ParameterValues[index][0];
761
762 assert(this->uniforms < uniform_array_size);
763 this->uniform_vector_size[this->uniforms] = 0;
764 /* Add each of the unique swizzled channels of the element.
765 * This will end up matching the size of the glsl_type of this field.
766 */
767 int last_swiz = -1;
768 for (unsigned int j = 0; j < 4; j++) {
769 int swiz = GET_SWZ(slots[i].swizzle, j);
770 last_swiz = swiz;
771
772 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
773 assert(this->uniforms < uniform_array_size);
774 if (swiz <= last_swiz)
775 this->uniform_vector_size[this->uniforms]++;
776 }
777 this->uniforms++;
778 }
779 }
780
781 dst_reg *
782 vec4_visitor::variable_storage(ir_variable *var)
783 {
784 return (dst_reg *)hash_table_find(this->variable_ht, var);
785 }
786
787 void
788 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
789 enum brw_predicate *predicate)
790 {
791 ir_expression *expr = ir->as_expression();
792
793 *predicate = BRW_PREDICATE_NORMAL;
794
795 if (expr && expr->operation != ir_binop_ubo_load) {
796 src_reg op[3];
797 vec4_instruction *inst;
798
799 assert(expr->get_num_operands() <= 3);
800 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
801 expr->operands[i]->accept(this);
802 op[i] = this->result;
803
804 resolve_ud_negate(&op[i]);
805 }
806
807 switch (expr->operation) {
808 case ir_unop_logic_not:
809 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
810 inst->conditional_mod = BRW_CONDITIONAL_Z;
811 break;
812
813 case ir_binop_logic_xor:
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 break;
817
818 case ir_binop_logic_or:
819 inst = emit(OR(dst_null_d(), op[0], op[1]));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 break;
822
823 case ir_binop_logic_and:
824 inst = emit(AND(dst_null_d(), op[0], op[1]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 break;
827
828 case ir_unop_f2b:
829 if (brw->gen >= 6) {
830 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
831 } else {
832 inst = emit(MOV(dst_null_f(), op[0]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 }
835 break;
836
837 case ir_unop_i2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_d(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_binop_all_equal:
847 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
848 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
849 break;
850
851 case ir_binop_any_nequal:
852 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
853 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
854 break;
855
856 case ir_unop_any:
857 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
858 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
859 break;
860
861 case ir_binop_greater:
862 case ir_binop_gequal:
863 case ir_binop_less:
864 case ir_binop_lequal:
865 case ir_binop_equal:
866 case ir_binop_nequal:
867 emit(CMP(dst_null_d(), op[0], op[1],
868 brw_conditional_for_comparison(expr->operation)));
869 break;
870
871 case ir_triop_csel: {
872 /* Expand the boolean condition into the flag register. */
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875
876 /* Select which boolean to return. */
877 dst_reg temp(this, expr->operands[1]->type);
878 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
879 inst->predicate = BRW_PREDICATE_NORMAL;
880
881 /* Expand the result to a condition code. */
882 inst = emit(MOV(dst_null_d(), src_reg(temp)));
883 inst->conditional_mod = BRW_CONDITIONAL_NZ;
884 break;
885 }
886
887 default:
888 unreachable("not reached");
889 }
890 return;
891 }
892
893 ir->accept(this);
894
895 resolve_ud_negate(&this->result);
896
897 if (brw->gen >= 6) {
898 vec4_instruction *inst = emit(AND(dst_null_d(),
899 this->result, src_reg(1)));
900 inst->conditional_mod = BRW_CONDITIONAL_NZ;
901 } else {
902 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904 }
905 }
906
907 /**
908 * Emit a gen6 IF statement with the comparison folded into the IF
909 * instruction.
910 */
911 void
912 vec4_visitor::emit_if_gen6(ir_if *ir)
913 {
914 ir_expression *expr = ir->condition->as_expression();
915
916 if (expr && expr->operation != ir_binop_ubo_load) {
917 src_reg op[3];
918 dst_reg temp;
919
920 assert(expr->get_num_operands() <= 3);
921 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
922 expr->operands[i]->accept(this);
923 op[i] = this->result;
924 }
925
926 switch (expr->operation) {
927 case ir_unop_logic_not:
928 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
929 return;
930
931 case ir_binop_logic_xor:
932 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_binop_logic_or:
936 temp = dst_reg(this, glsl_type::bool_type);
937 emit(OR(temp, op[0], op[1]));
938 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
939 return;
940
941 case ir_binop_logic_and:
942 temp = dst_reg(this, glsl_type::bool_type);
943 emit(AND(temp, op[0], op[1]));
944 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
945 return;
946
947 case ir_unop_f2b:
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
949 return;
950
951 case ir_unop_i2b:
952 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_binop_greater:
956 case ir_binop_gequal:
957 case ir_binop_less:
958 case ir_binop_lequal:
959 case ir_binop_equal:
960 case ir_binop_nequal:
961 emit(IF(op[0], op[1],
962 brw_conditional_for_comparison(expr->operation)));
963 return;
964
965 case ir_binop_all_equal:
966 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
967 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
968 return;
969
970 case ir_binop_any_nequal:
971 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
972 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
973 return;
974
975 case ir_unop_any:
976 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
977 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
978 return;
979
980 case ir_triop_csel: {
981 /* Expand the boolean condition into the flag register. */
982 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
983 inst->conditional_mod = BRW_CONDITIONAL_NZ;
984
985 /* Select which boolean to return. */
986 dst_reg temp(this, expr->operands[1]->type);
987 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
988 inst->predicate = BRW_PREDICATE_NORMAL;
989
990 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
991 return;
992 }
993
994 default:
995 unreachable("not reached");
996 }
997 return;
998 }
999
1000 ir->condition->accept(this);
1001
1002 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1003 }
1004
1005 void
1006 vec4_visitor::visit(ir_variable *ir)
1007 {
1008 dst_reg *reg = NULL;
1009
1010 if (variable_storage(ir))
1011 return;
1012
1013 switch (ir->data.mode) {
1014 case ir_var_shader_in:
1015 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1016 break;
1017
1018 case ir_var_shader_out:
1019 reg = new(mem_ctx) dst_reg(this, ir->type);
1020
1021 for (int i = 0; i < type_size(ir->type); i++) {
1022 output_reg[ir->data.location + i] = *reg;
1023 output_reg[ir->data.location + i].reg_offset = i;
1024 output_reg[ir->data.location + i].type =
1025 brw_type_for_base_type(ir->type->get_scalar_type());
1026 output_reg_annotation[ir->data.location + i] = ir->name;
1027 }
1028 break;
1029
1030 case ir_var_auto:
1031 case ir_var_temporary:
1032 reg = new(mem_ctx) dst_reg(this, ir->type);
1033 break;
1034
1035 case ir_var_uniform:
1036 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1037
1038 /* Thanks to the lower_ubo_reference pass, we will see only
1039 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1040 * variables, so no need for them to be in variable_ht.
1041 *
1042 * Some uniforms, such as samplers and atomic counters, have no actual
1043 * storage, so we should ignore them.
1044 */
1045 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1046 return;
1047
1048 /* Track how big the whole uniform variable is, in case we need to put a
1049 * copy of its data into pull constants for array access.
1050 */
1051 assert(this->uniforms < uniform_array_size);
1052 this->uniform_size[this->uniforms] = type_size(ir->type);
1053
1054 if (!strncmp(ir->name, "gl_", 3)) {
1055 setup_builtin_uniform_values(ir);
1056 } else {
1057 setup_uniform_values(ir);
1058 }
1059 break;
1060
1061 case ir_var_system_value:
1062 reg = make_reg_for_system_value(ir);
1063 break;
1064
1065 default:
1066 unreachable("not reached");
1067 }
1068
1069 reg->type = brw_type_for_base_type(ir->type);
1070 hash_table_insert(this->variable_ht, reg, ir);
1071 }
1072
1073 void
1074 vec4_visitor::visit(ir_loop *ir)
1075 {
1076 /* We don't want debugging output to print the whole body of the
1077 * loop as the annotation.
1078 */
1079 this->base_ir = NULL;
1080
1081 emit(BRW_OPCODE_DO);
1082
1083 visit_instructions(&ir->body_instructions);
1084
1085 emit(BRW_OPCODE_WHILE);
1086 }
1087
1088 void
1089 vec4_visitor::visit(ir_loop_jump *ir)
1090 {
1091 switch (ir->mode) {
1092 case ir_loop_jump::jump_break:
1093 emit(BRW_OPCODE_BREAK);
1094 break;
1095 case ir_loop_jump::jump_continue:
1096 emit(BRW_OPCODE_CONTINUE);
1097 break;
1098 }
1099 }
1100
1101
1102 void
1103 vec4_visitor::visit(ir_function_signature *)
1104 {
1105 unreachable("not reached");
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_function *ir)
1110 {
1111 /* Ignore function bodies other than main() -- we shouldn't see calls to
1112 * them since they should all be inlined.
1113 */
1114 if (strcmp(ir->name, "main") == 0) {
1115 const ir_function_signature *sig;
1116 exec_list empty;
1117
1118 sig = ir->matching_signature(NULL, &empty, false);
1119
1120 assert(sig);
1121
1122 visit_instructions(&sig->body);
1123 }
1124 }
1125
1126 bool
1127 vec4_visitor::try_emit_mad(ir_expression *ir)
1128 {
1129 /* 3-src instructions were introduced in gen6. */
1130 if (brw->gen < 6)
1131 return false;
1132
1133 /* MAD can only handle floating-point data. */
1134 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1135 return false;
1136
1137 ir_rvalue *nonmul = ir->operands[1];
1138 ir_expression *mul = ir->operands[0]->as_expression();
1139
1140 if (!mul || mul->operation != ir_binop_mul) {
1141 nonmul = ir->operands[0];
1142 mul = ir->operands[1]->as_expression();
1143
1144 if (!mul || mul->operation != ir_binop_mul)
1145 return false;
1146 }
1147
1148 nonmul->accept(this);
1149 src_reg src0 = fix_3src_operand(this->result);
1150
1151 mul->operands[0]->accept(this);
1152 src_reg src1 = fix_3src_operand(this->result);
1153
1154 mul->operands[1]->accept(this);
1155 src_reg src2 = fix_3src_operand(this->result);
1156
1157 this->result = src_reg(this, ir->type);
1158 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1159
1160 return true;
1161 }
1162
1163 bool
1164 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1165 {
1166 /* This optimization relies on CMP setting the destination to 0 when
1167 * false. Early hardware only sets the least significant bit, and
1168 * leaves the other bits undefined. So we can't use it.
1169 */
1170 if (brw->gen < 6)
1171 return false;
1172
1173 ir_expression *const cmp = ir->operands[0]->as_expression();
1174
1175 if (cmp == NULL)
1176 return false;
1177
1178 switch (cmp->operation) {
1179 case ir_binop_less:
1180 case ir_binop_greater:
1181 case ir_binop_lequal:
1182 case ir_binop_gequal:
1183 case ir_binop_equal:
1184 case ir_binop_nequal:
1185 break;
1186
1187 default:
1188 return false;
1189 }
1190
1191 cmp->operands[0]->accept(this);
1192 const src_reg cmp_src0 = this->result;
1193
1194 cmp->operands[1]->accept(this);
1195 const src_reg cmp_src1 = this->result;
1196
1197 this->result = src_reg(this, ir->type);
1198
1199 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1200 brw_conditional_for_comparison(cmp->operation)));
1201
1202 /* If the comparison is false, this->result will just happen to be zero.
1203 */
1204 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1205 this->result, src_reg(1.0f));
1206 inst->predicate = BRW_PREDICATE_NORMAL;
1207 inst->predicate_inverse = true;
1208
1209 return true;
1210 }
1211
1212 void
1213 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1214 src_reg src0, src_reg src1)
1215 {
1216 vec4_instruction *inst;
1217
1218 if (brw->gen >= 6) {
1219 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1220 inst->conditional_mod = conditionalmod;
1221 } else {
1222 emit(CMP(dst, src0, src1, conditionalmod));
1223
1224 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1225 inst->predicate = BRW_PREDICATE_NORMAL;
1226 }
1227 }
1228
1229 void
1230 vec4_visitor::emit_lrp(const dst_reg &dst,
1231 const src_reg &x, const src_reg &y, const src_reg &a)
1232 {
1233 if (brw->gen >= 6) {
1234 /* Note that the instruction's argument order is reversed from GLSL
1235 * and the IR.
1236 */
1237 emit(LRP(dst,
1238 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1239 } else {
1240 /* Earlier generations don't support three source operations, so we
1241 * need to emit x*(1-a) + y*a.
1242 */
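/* Concretely, mix(x, y, a) = x * (1 - a) + y * a, so the sequence below
 * computes
 *
 *    y_times_a           = y * a
 *    one_minus_a         = 1 - a          (ADD of -a and 1.0f)
 *    x_times_one_minus_a = x * (1 - a)
 *    dst                 = x * (1 - a) + y * a
 */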
1243 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1244 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1245 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1246 y_times_a.writemask = dst.writemask;
1247 one_minus_a.writemask = dst.writemask;
1248 x_times_one_minus_a.writemask = dst.writemask;
1249
1250 emit(MUL(y_times_a, y, a));
1251 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1252 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1253 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1254 }
1255 }
1256
1257 void
1258 vec4_visitor::visit(ir_expression *ir)
1259 {
1260 unsigned int operand;
1261 src_reg op[Elements(ir->operands)];
1262 vec4_instruction *inst;
1263
1264 if (ir->operation == ir_binop_add) {
1265 if (try_emit_mad(ir))
1266 return;
1267 }
1268
1269 if (ir->operation == ir_unop_b2f) {
1270 if (try_emit_b2f_of_compare(ir))
1271 return;
1272 }
1273
1274 /* Storage for our result. Ideally for an assignment we'd be using
1275 * the actual storage for the result here, instead.
1276 */
1277 dst_reg result_dst(this, ir->type);
1278 src_reg result_src(result_dst);
1279
1280 if (ir->operation == ir_triop_csel) {
1281 ir->operands[1]->accept(this);
1282 op[1] = this->result;
1283 ir->operands[2]->accept(this);
1284 op[2] = this->result;
1285
1286 enum brw_predicate predicate;
1287 emit_bool_to_cond_code(ir->operands[0], &predicate);
1288 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1289 inst->predicate = predicate;
1290 this->result = result_src;
1291 return;
1292 }
1293
1294 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1295 this->result.file = BAD_FILE;
1296 ir->operands[operand]->accept(this);
1297 if (this->result.file == BAD_FILE) {
1298 fprintf(stderr, "Failed to get tree for expression operand:\n");
1299 ir->operands[operand]->fprint(stderr);
1300 exit(1);
1301 }
1302 op[operand] = this->result;
1303
1304 /* Matrix expression operands should have been broken down to vector
1305 * operations already.
1306 */
1307 assert(!ir->operands[operand]->type->is_matrix());
1308 }
1309
1310 /* If nothing special happens, this is the result. */
1311 this->result = result_src;
1312
1313 switch (ir->operation) {
1314 case ir_unop_logic_not:
1315 if (ctx->Const.UniformBooleanTrue != 1) {
1316 emit(NOT(result_dst, op[0]));
1317 } else {
1318 emit(XOR(result_dst, op[0], src_reg(1u)));
1319 }
1320 break;
1321 case ir_unop_neg:
1322 op[0].negate = !op[0].negate;
1323 emit(MOV(result_dst, op[0]));
1324 break;
1325 case ir_unop_abs:
1326 op[0].abs = true;
1327 op[0].negate = false;
1328 emit(MOV(result_dst, op[0]));
1329 break;
1330
1331 case ir_unop_sign:
1332 if (ir->type->is_float()) {
1333 /* AND(val, 0x80000000) gives the sign bit.
1334 *
1335 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1336 * zero.
1337 */
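/* Worked example (IEEE-754 single precision): for val = -2.5f
 * (0xc0200000) the AND keeps 0x80000000, and the predicated OR with
 * 0x3f800000 yields 0xbf800000 = -1.0f; for val = +2.5f it yields +1.0f.
 * When val == 0 the CMP's NZ condition is false, the OR is skipped, and
 * the result stays (signed) zero.
 */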
1338 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1339
1340 op[0].type = BRW_REGISTER_TYPE_UD;
1341 result_dst.type = BRW_REGISTER_TYPE_UD;
1342 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1343
1344 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1345 inst->predicate = BRW_PREDICATE_NORMAL;
1346
1347 this->result.type = BRW_REGISTER_TYPE_F;
1348 } else {
1349 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1350 * -> non-negative val generates 0x00000000.
1351 * Predicated OR sets 1 if val is positive.
1352 */
1353 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1354
1355 emit(ASR(result_dst, op[0], src_reg(31)));
1356
1357 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1358 inst->predicate = BRW_PREDICATE_NORMAL;
1359 }
1360 break;
1361
1362 case ir_unop_rcp:
1363 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1364 break;
1365
1366 case ir_unop_exp2:
1367 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1368 break;
1369 case ir_unop_log2:
1370 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1371 break;
1372 case ir_unop_exp:
1373 case ir_unop_log:
1374 unreachable("not reached: should be handled by ir_explog_to_explog2");
1375 case ir_unop_sin:
1376 case ir_unop_sin_reduced:
1377 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1378 break;
1379 case ir_unop_cos:
1380 case ir_unop_cos_reduced:
1381 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1382 break;
1383
1384 case ir_unop_dFdx:
1385 case ir_unop_dFdx_coarse:
1386 case ir_unop_dFdx_fine:
1387 case ir_unop_dFdy:
1388 case ir_unop_dFdy_coarse:
1389 case ir_unop_dFdy_fine:
1390 unreachable("derivatives not valid in vertex shader");
1391
1392 case ir_unop_bitfield_reverse:
1393 emit(BFREV(result_dst, op[0]));
1394 break;
1395 case ir_unop_bit_count:
1396 emit(CBIT(result_dst, op[0]));
1397 break;
1398 case ir_unop_find_msb: {
1399 src_reg temp = src_reg(this, glsl_type::uint_type);
1400
1401 inst = emit(FBH(dst_reg(temp), op[0]));
1402 inst->dst.writemask = WRITEMASK_XYZW;
1403
1404 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1405 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1406 * subtract the result from 31 to convert the MSB count into an LSB count.
1407 */
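/* For example, findMSB(0x00000100u) should be 8: FBH reports 23 (counting
 * from the MSB side), and 31 - 23 = 8. The predicated ADD below performs
 * that subtraction as (-result) + 31, and is skipped when FBH returned the
 * all-ones "no bit found" value.
 */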
1408
1409 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1410 temp.swizzle = BRW_SWIZZLE_NOOP;
1411 emit(MOV(result_dst, temp));
1412
1413 src_reg src_tmp = src_reg(result_dst);
1414 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1415
1416 src_tmp.negate = true;
1417 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1418 inst->predicate = BRW_PREDICATE_NORMAL;
1419 break;
1420 }
1421 case ir_unop_find_lsb:
1422 emit(FBL(result_dst, op[0]));
1423 break;
1424 case ir_unop_saturate:
1425 inst = emit(MOV(result_dst, op[0]));
1426 inst->saturate = true;
1427 break;
1428
1429 case ir_unop_noise:
1430 unreachable("not reached: should be handled by lower_noise");
1431
1432 case ir_binop_add:
1433 emit(ADD(result_dst, op[0], op[1]));
1434 break;
1435 case ir_binop_sub:
1436 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1437
1438 case ir_binop_mul:
1439 if (brw->gen < 8 && ir->type->is_integer()) {
1440 /* For integer multiplication, the MUL uses the low 16 bits of one of
1441 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1442 * accumulates the contribution of the upper 16 bits of that
1443 * operand. If we can determine that one of the args is in the low
1444 * 16 bits, though, we can just emit a single MUL.
1445 */
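/* Roughly, writing that operand as a = (a_hi << 16) | a_lo, the product is
 * a * b = a_lo * b + ((a_hi * b) << 16); the MUL supplies the first term
 * and the MACH the second. When the operand is a constant that fits in 16
 * bits, a_hi is zero and the single MUL below is sufficient.
 */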
1446 if (ir->operands[0]->is_uint16_constant()) {
1447 if (brw->gen < 7)
1448 emit(MUL(result_dst, op[0], op[1]));
1449 else
1450 emit(MUL(result_dst, op[1], op[0]));
1451 } else if (ir->operands[1]->is_uint16_constant()) {
1452 if (brw->gen < 7)
1453 emit(MUL(result_dst, op[1], op[0]));
1454 else
1455 emit(MUL(result_dst, op[0], op[1]));
1456 } else {
1457 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1458
1459 emit(MUL(acc, op[0], op[1]));
1460 emit(MACH(dst_null_d(), op[0], op[1]));
1461 emit(MOV(result_dst, src_reg(acc)));
1462 }
1463 } else {
1464 emit(MUL(result_dst, op[0], op[1]));
1465 }
1466 break;
1467 case ir_binop_imul_high: {
1468 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1469
1470 emit(MUL(acc, op[0], op[1]));
1471 emit(MACH(result_dst, op[0], op[1]));
1472 break;
1473 }
1474 case ir_binop_div:
1475 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1476 assert(ir->type->is_integer());
1477 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1478 break;
1479 case ir_binop_carry: {
1480 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1481
1482 emit(ADDC(dst_null_ud(), op[0], op[1]));
1483 emit(MOV(result_dst, src_reg(acc)));
1484 break;
1485 }
1486 case ir_binop_borrow: {
1487 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1488
1489 emit(SUBB(dst_null_ud(), op[0], op[1]));
1490 emit(MOV(result_dst, src_reg(acc)));
1491 break;
1492 }
1493 case ir_binop_mod:
1494 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1495 assert(ir->type->is_integer());
1496 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1497 break;
1498
1499 case ir_binop_less:
1500 case ir_binop_greater:
1501 case ir_binop_lequal:
1502 case ir_binop_gequal:
1503 case ir_binop_equal:
1504 case ir_binop_nequal: {
1505 emit(CMP(result_dst, op[0], op[1],
1506 brw_conditional_for_comparison(ir->operation)));
1507 if (ctx->Const.UniformBooleanTrue == 1) {
1508 emit(AND(result_dst, result_src, src_reg(1u)));
1509 }
1510 break;
1511 }
1512
1513 case ir_binop_all_equal:
1514 /* "==" operator producing a scalar boolean. */
1515 if (ir->operands[0]->type->is_vector() ||
1516 ir->operands[1]->type->is_vector()) {
1517 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1518 emit(MOV(result_dst, src_reg(0)));
1519 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1520 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1521 } else {
1522 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1523 if (ctx->Const.UniformBooleanTrue == 1) {
1524 emit(AND(result_dst, result_src, src_reg(1u)));
1525 }
1526 }
1527 break;
1528 case ir_binop_any_nequal:
1529 /* "!=" operator producing a scalar boolean. */
1530 if (ir->operands[0]->type->is_vector() ||
1531 ir->operands[1]->type->is_vector()) {
1532 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1533
1534 emit(MOV(result_dst, src_reg(0)));
1535 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1536 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1537 } else {
1538 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1539 if (ctx->Const.UniformBooleanTrue == 1) {
1540 emit(AND(result_dst, result_src, src_reg(1u)));
1541 }
1542 }
1543 break;
1544
1545 case ir_unop_any:
1546 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1547 emit(MOV(result_dst, src_reg(0)));
1548
1549 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1550 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1551 break;
1552
1553 case ir_binop_logic_xor:
1554 emit(XOR(result_dst, op[0], op[1]));
1555 break;
1556
1557 case ir_binop_logic_or:
1558 emit(OR(result_dst, op[0], op[1]));
1559 break;
1560
1561 case ir_binop_logic_and:
1562 emit(AND(result_dst, op[0], op[1]));
1563 break;
1564
1565 case ir_binop_dot:
1566 assert(ir->operands[0]->type->is_vector());
1567 assert(ir->operands[0]->type == ir->operands[1]->type);
1568 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1569 break;
1570
1571 case ir_unop_sqrt:
1572 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1573 break;
1574 case ir_unop_rsq:
1575 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1576 break;
1577
1578 case ir_unop_bitcast_i2f:
1579 case ir_unop_bitcast_u2f:
1580 this->result = op[0];
1581 this->result.type = BRW_REGISTER_TYPE_F;
1582 break;
1583
1584 case ir_unop_bitcast_f2i:
1585 this->result = op[0];
1586 this->result.type = BRW_REGISTER_TYPE_D;
1587 break;
1588
1589 case ir_unop_bitcast_f2u:
1590 this->result = op[0];
1591 this->result.type = BRW_REGISTER_TYPE_UD;
1592 break;
1593
1594 case ir_unop_i2f:
1595 case ir_unop_i2u:
1596 case ir_unop_u2i:
1597 case ir_unop_u2f:
1598 case ir_unop_f2i:
1599 case ir_unop_f2u:
1600 emit(MOV(result_dst, op[0]));
1601 break;
1602 case ir_unop_b2i:
1603 if (ctx->Const.UniformBooleanTrue != 1) {
1604 emit(AND(result_dst, op[0], src_reg(1u)));
1605 } else {
1606 emit(MOV(result_dst, op[0]));
1607 }
1608 break;
1609 case ir_unop_b2f:
1610 if (ctx->Const.UniformBooleanTrue != 1) {
1611 op[0].type = BRW_REGISTER_TYPE_UD;
1612 result_dst.type = BRW_REGISTER_TYPE_UD;
1613 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1614 result_dst.type = BRW_REGISTER_TYPE_F;
1615 } else {
1616 emit(MOV(result_dst, op[0]));
1617 }
1618 break;
1619 case ir_unop_f2b:
1620 case ir_unop_i2b:
1621 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1622 if (ctx->Const.UniformBooleanTrue == 1) {
1623 emit(AND(result_dst, result_src, src_reg(1u)));
1624 }
1625 break;
1626
1627 case ir_unop_trunc:
1628 emit(RNDZ(result_dst, op[0]));
1629 break;
1630 case ir_unop_ceil:
1631 op[0].negate = !op[0].negate;
1632 inst = emit(RNDD(result_dst, op[0]));
1633 this->result.negate = true;
1634 break;
1635 case ir_unop_floor:
1636 inst = emit(RNDD(result_dst, op[0]));
1637 break;
1638 case ir_unop_fract:
1639 inst = emit(FRC(result_dst, op[0]));
1640 break;
1641 case ir_unop_round_even:
1642 emit(RNDE(result_dst, op[0]));
1643 break;
1644
1645 case ir_binop_min:
1646 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1647 break;
1648 case ir_binop_max:
1649 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1650 break;
1651
1652 case ir_binop_pow:
1653 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1654 break;
1655
1656 case ir_unop_bit_not:
1657 inst = emit(NOT(result_dst, op[0]));
1658 break;
1659 case ir_binop_bit_and:
1660 inst = emit(AND(result_dst, op[0], op[1]));
1661 break;
1662 case ir_binop_bit_xor:
1663 inst = emit(XOR(result_dst, op[0], op[1]));
1664 break;
1665 case ir_binop_bit_or:
1666 inst = emit(OR(result_dst, op[0], op[1]));
1667 break;
1668
1669 case ir_binop_lshift:
1670 inst = emit(SHL(result_dst, op[0], op[1]));
1671 break;
1672
1673 case ir_binop_rshift:
1674 if (ir->type->base_type == GLSL_TYPE_INT)
1675 inst = emit(ASR(result_dst, op[0], op[1]));
1676 else
1677 inst = emit(SHR(result_dst, op[0], op[1]));
1678 break;
1679
1680 case ir_binop_bfm:
1681 emit(BFI1(result_dst, op[0], op[1]));
1682 break;
1683
1684 case ir_binop_ubo_load: {
1685 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1686 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1687 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1688 src_reg offset;
1689
1690 /* Now, load the vector from that offset. */
1691 assert(ir->type->is_vector() || ir->type->is_scalar());
1692
1693 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1694 packed_consts.type = result.type;
1695 src_reg surf_index;
1696
1697 if (const_uniform_block) {
1698 /* The block index is a constant, so just emit the binding table entry
1699 * as an immediate.
1700 */
1701 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1702 const_uniform_block->value.u[0]);
1703 } else {
1704 /* The block index is not a constant. Evaluate the index expression
1705 * per-channel and add the base UBO index; the generator will select
1706 * a value from any live channel.
1707 */
1708 surf_index = src_reg(this, glsl_type::uint_type);
1709 emit(ADD(dst_reg(surf_index), op[0],
1710 src_reg(prog_data->base.binding_table.ubo_start)));
1711
1712 /* Assume this may touch any UBO. It would be nice to provide
1713 * a tighter bound, but the array information is already lowered away.
1714 */
1715 brw_mark_surface_used(&prog_data->base,
1716 prog_data->base.binding_table.ubo_start +
1717 shader_prog->NumUniformBlocks - 1);
1718 }
1719
1720 if (const_offset_ir) {
1721 if (brw->gen >= 8) {
1722 /* Store the offset in a GRF so we can send-from-GRF. */
1723 offset = src_reg(this, glsl_type::int_type);
1724 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1725 } else {
1726 /* Immediates are fine on older generations since they'll be moved
1727 * to a (potentially fake) MRF at the generator level.
1728 */
1729 offset = src_reg(const_offset / 16);
1730 }
1731 } else {
1732 offset = src_reg(this, glsl_type::uint_type);
1733 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1734 }
1735
1736 if (brw->gen >= 7) {
1737 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1738 grf_offset.type = offset.type;
1739
1740 emit(MOV(grf_offset, offset));
1741
1742 emit(new(mem_ctx) vec4_instruction(this,
1743 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1744 dst_reg(packed_consts),
1745 surf_index,
1746 src_reg(grf_offset)));
1747 } else {
1748 vec4_instruction *pull =
1749 emit(new(mem_ctx) vec4_instruction(this,
1750 VS_OPCODE_PULL_CONSTANT_LOAD,
1751 dst_reg(packed_consts),
1752 surf_index,
1753 offset));
1754 pull->base_mrf = 14;
1755 pull->mlen = 1;
1756 }
1757
1758 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1759 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1760 const_offset % 16 / 4,
1761 const_offset % 16 / 4,
1762 const_offset % 16 / 4);
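/* For example, a float UBO member at byte offset 20 lives in DWord 1 of
 * its 16-byte block: swizzle_for_size(1) gives XXXX, and adding
 * BRW_SWIZZLE4(1, 1, 1, 1) turns that into YYYY, so every channel reads
 * the second component of the fetched vec4.
 */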
1763
1764 /* UBO bools are any nonzero int. We need to convert them to use the
1765 * value of true stored in ctx->Const.UniformBooleanTrue.
1766 */
1767 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1768 emit(CMP(result_dst, packed_consts, src_reg(0u),
1769 BRW_CONDITIONAL_NZ));
1770 if (ctx->Const.UniformBooleanTrue == 1) {
1771 emit(AND(result_dst, result, src_reg(1u)));
1772 }
1773 } else {
1774 emit(MOV(result_dst, packed_consts));
1775 }
1776 break;
1777 }
1778
1779 case ir_binop_vector_extract:
1780 unreachable("should have been lowered by vec_index_to_cond_assign");
1781
1782 case ir_triop_fma:
1783 op[0] = fix_3src_operand(op[0]);
1784 op[1] = fix_3src_operand(op[1]);
1785 op[2] = fix_3src_operand(op[2]);
1786 /* Note that the instruction's argument order is reversed from GLSL
1787 * and the IR.
1788 */
1789 emit(MAD(result_dst, op[2], op[1], op[0]));
1790 break;
1791
1792 case ir_triop_lrp:
1793 emit_lrp(result_dst, op[0], op[1], op[2]);
1794 break;
1795
1796 case ir_triop_csel:
1797 unreachable("already handled above");
1798 break;
1799
1800 case ir_triop_bfi:
1801 op[0] = fix_3src_operand(op[0]);
1802 op[1] = fix_3src_operand(op[1]);
1803 op[2] = fix_3src_operand(op[2]);
1804 emit(BFI2(result_dst, op[0], op[1], op[2]));
1805 break;
1806
1807 case ir_triop_bitfield_extract:
1808 op[0] = fix_3src_operand(op[0]);
1809 op[1] = fix_3src_operand(op[1]);
1810 op[2] = fix_3src_operand(op[2]);
1811 /* Note that the instruction's argument order is reversed from GLSL
1812 * and the IR.
1813 */
1814 emit(BFE(result_dst, op[2], op[1], op[0]));
1815 break;
1816
1817 case ir_triop_vector_insert:
1818 unreachable("should have been lowered by lower_vector_insert");
1819
1820 case ir_quadop_bitfield_insert:
1821 unreachable("not reached: should be handled by "
1822 "bitfield_insert_to_bfm_bfi\n");
1823
1824 case ir_quadop_vector:
1825 unreachable("not reached: should be handled by lower_quadop_vector");
1826
1827 case ir_unop_pack_half_2x16:
1828 emit_pack_half_2x16(result_dst, op[0]);
1829 break;
1830 case ir_unop_unpack_half_2x16:
1831 emit_unpack_half_2x16(result_dst, op[0]);
1832 break;
1833 case ir_unop_pack_snorm_2x16:
1834 case ir_unop_pack_snorm_4x8:
1835 case ir_unop_pack_unorm_2x16:
1836 case ir_unop_pack_unorm_4x8:
1837 case ir_unop_unpack_snorm_2x16:
1838 case ir_unop_unpack_snorm_4x8:
1839 case ir_unop_unpack_unorm_2x16:
1840 case ir_unop_unpack_unorm_4x8:
1841 unreachable("not reached: should be handled by lower_packing_builtins");
1842 case ir_unop_unpack_half_2x16_split_x:
1843 case ir_unop_unpack_half_2x16_split_y:
1844 case ir_binop_pack_half_2x16_split:
1845 case ir_unop_interpolate_at_centroid:
1846 case ir_binop_interpolate_at_sample:
1847 case ir_binop_interpolate_at_offset:
1848 unreachable("not reached: should not occur in vertex shader");
1849 case ir_binop_ldexp:
1850 unreachable("not reached: should be handled by ldexp_to_arith()");
1851 }
1852 }
1853
1854
1855 void
1856 vec4_visitor::visit(ir_swizzle *ir)
1857 {
1858 src_reg src;
1859 int i = 0;
1860 int swizzle[4];
1861
1862 /* Note that this is only swizzles in expressions, not those on the left
1863 * hand side of an assignment, which do write masking. See ir_assignment
1864 * for that.
1865 */
1866
1867 ir->val->accept(this);
1868 src = this->result;
1869 assert(src.file != BAD_FILE);
1870
1871 for (i = 0; i < ir->type->vector_elements; i++) {
1872 switch (i) {
1873 case 0:
1874 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1875 break;
1876 case 1:
1877 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1878 break;
1879 case 2:
1880 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1881 break;
1882 case 3:
1883 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1884 break;
1885 }
1886 }
1887 for (; i < 4; i++) {
1888 /* Replicate the last channel out. */
1889 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1890 }
1891
1892 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1893
1894 this->result = src;
1895 }
1896
1897 void
1898 vec4_visitor::visit(ir_dereference_variable *ir)
1899 {
1900 const struct glsl_type *type = ir->type;
1901 dst_reg *reg = variable_storage(ir->var);
1902
1903 if (!reg) {
1904 fail("Failed to find variable storage for %s\n", ir->var->name);
1905 this->result = src_reg(brw_null_reg());
1906 return;
1907 }
1908
1909 this->result = src_reg(*reg);
1910
1911 /* System values get their swizzle from the dst_reg writemask */
1912 if (ir->var->data.mode == ir_var_system_value)
1913 return;
1914
1915 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1916 this->result.swizzle = swizzle_for_size(type->vector_elements);
1917 }
1918
1919
1920 int
1921 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1922 {
1923 /* Under normal circumstances array elements are stored consecutively, so
1924 * the stride is equal to the size of the array element.
1925 */
1926 return type_size(ir->type);
1927 }
1928
1929
1930 void
1931 vec4_visitor::visit(ir_dereference_array *ir)
1932 {
1933 ir_constant *constant_index;
1934 src_reg src;
1935 int array_stride = compute_array_stride(ir);
1936
1937 constant_index = ir->array_index->constant_expression_value();
1938
1939 ir->array->accept(this);
1940 src = this->result;
1941
1942 if (constant_index) {
1943 src.reg_offset += constant_index->value.i[0] * array_stride;
1944 } else {
1945 /* Variable index array dereference. It eats the "vec4" of the
1946 * base of the array and an index that offsets the Mesa register
1947 * index.
1948 */
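/* For example, indexing an array of mat4s has an array_stride of 4, so
 * index i contributes 4 * i vec4 registers to the reladdr below.
 */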
1949 ir->array_index->accept(this);
1950
1951 src_reg index_reg;
1952
1953 if (array_stride == 1) {
1954 index_reg = this->result;
1955 } else {
1956 index_reg = src_reg(this, glsl_type::int_type);
1957
1958 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1959 }
1960
1961 if (src.reladdr) {
1962 src_reg temp = src_reg(this, glsl_type::int_type);
1963
1964 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1965
1966 index_reg = temp;
1967 }
1968
1969 src.reladdr = ralloc(mem_ctx, src_reg);
1970 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1971 }
1972
1973 /* If the type is smaller than a vec4, replicate the last channel out. */
1974 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1975 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1976 else
1977 src.swizzle = BRW_SWIZZLE_NOOP;
1978 src.type = brw_type_for_base_type(ir->type);
1979
1980 this->result = src;
1981 }
1982
1983 void
1984 vec4_visitor::visit(ir_dereference_record *ir)
1985 {
1986 unsigned int i;
1987 const glsl_type *struct_type = ir->record->type;
1988 int offset = 0;
1989
1990 ir->record->accept(this);
1991
1992 for (i = 0; i < struct_type->length; i++) {
1993 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1994 break;
1995 offset += type_size(struct_type->fields.structure[i].type);
1996 }
1997
1998 /* If the type is smaller than a vec4, replicate the last channel out. */
1999 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2000 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2001 else
2002 this->result.swizzle = BRW_SWIZZLE_NOOP;
2003 this->result.type = brw_type_for_base_type(ir->type);
2004
2005 this->result.reg_offset += offset;
2006 }
2007
2008 /**
2009 * We want to be careful in assignment setup to hit the actual storage
2010 * instead of potentially using a temporary like we might with the
2011 * ir_dereference handler.
2012 */
2013 static dst_reg
2014 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2015 {
2016 /* The LHS must be a dereference. If the LHS is a variable indexed array
2017 * access of a vector, it must be separated into a series of conditional moves
2018 * before reaching this point (see ir_vec_index_to_cond_assign).
2019 */
2020 assert(ir->as_dereference());
2021 ir_dereference_array *deref_array = ir->as_dereference_array();
2022 if (deref_array) {
2023 assert(!deref_array->array->type->is_vector());
2024 }
2025
2026 /* Use the rvalue deref handler for the most part. We'll ignore
2027 * swizzles in it and write swizzles using writemask, though.
2028 */
2029 ir->accept(v);
2030 return dst_reg(v->result);
2031 }
2032
2033 void
2034 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2035 const struct glsl_type *type,
2036 enum brw_predicate predicate)
2037 {
2038 if (type->base_type == GLSL_TYPE_STRUCT) {
2039 for (unsigned int i = 0; i < type->length; i++) {
2040 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2041 }
2042 return;
2043 }
2044
2045 if (type->is_array()) {
2046 for (unsigned int i = 0; i < type->length; i++) {
2047 emit_block_move(dst, src, type->fields.array, predicate);
2048 }
2049 return;
2050 }
2051
2052 if (type->is_matrix()) {
2053 const struct glsl_type *vec_type;
2054
2055 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2056 type->vector_elements, 1);
2057
2058 for (int i = 0; i < type->matrix_columns; i++) {
2059 emit_block_move(dst, src, vec_type, predicate);
2060 }
2061 return;
2062 }
2063
2064 assert(type->is_scalar() || type->is_vector());
2065
2066 dst->type = brw_type_for_base_type(type);
2067 src->type = dst->type;
2068
2069 dst->writemask = (1 << type->vector_elements) - 1;
2070
2071 src->swizzle = swizzle_for_size(type->vector_elements);
2072
2073 vec4_instruction *inst = emit(MOV(*dst, *src));
2074 inst->predicate = predicate;
2075
2076 dst->reg_offset++;
2077 src->reg_offset++;
2078 }
2079
2080
2081 /* If the RHS processing resulted in an instruction generating a
2082 * temporary value, and it would be easy to rewrite the instruction to
2083 * generate its result right into the LHS instead, do so. This ends
2084 * up reliably removing instructions where it can be tricky to do so
2085 * later without real UD chain information.
2086 */
2087 bool
2088 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2089 dst_reg dst,
2090 src_reg src,
2091 vec4_instruction *pre_rhs_inst,
2092 vec4_instruction *last_rhs_inst)
2093 {
2094 /* This could be supported, but it would take more smarts. */
2095 if (ir->condition)
2096 return false;
2097
2098 if (pre_rhs_inst == last_rhs_inst)
2099 return false; /* No instructions generated to work with. */
2100
2101 /* Make sure the last instruction generated our source reg. */
2102 if (src.file != GRF ||
2103 src.file != last_rhs_inst->dst.file ||
2104 src.reg != last_rhs_inst->dst.reg ||
2105 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2106 src.reladdr ||
2107 src.abs ||
2108 src.negate ||
2109 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2110 return false;
2111
2112 /* Check that the last instruction fully initialized the channels
2113 * we want to use, in the order we want to use them. We could
2114 * potentially reswizzle the operands of many instructions so that
2115 * we could handle out of order channels, but don't yet.
2116 */
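/* For example, if the RHS result would be read back as .yx we bail out,
 * since destination channel 0 would need source channel 1.
 */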
2117
2118 for (unsigned i = 0; i < 4; i++) {
2119 if (dst.writemask & (1 << i)) {
2120 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2121 return false;
2122
2123 if (BRW_GET_SWZ(src.swizzle, i) != i)
2124 return false;
2125 }
2126 }
2127
2128 /* Success! Rewrite the instruction. */
2129 last_rhs_inst->dst.file = dst.file;
2130 last_rhs_inst->dst.reg = dst.reg;
2131 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2132 last_rhs_inst->dst.reladdr = dst.reladdr;
2133 last_rhs_inst->dst.writemask &= dst.writemask;
2134
2135 return true;
2136 }
2137
2138 void
2139 vec4_visitor::visit(ir_assignment *ir)
2140 {
2141 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2142 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2143
2144 if (!ir->lhs->type->is_scalar() &&
2145 !ir->lhs->type->is_vector()) {
2146 ir->rhs->accept(this);
2147 src_reg src = this->result;
2148
2149 if (ir->condition) {
2150 emit_bool_to_cond_code(ir->condition, &predicate);
2151 }
2152
2153 /* emit_block_move doesn't account for swizzles in the source register.
2154 * This should be ok, since the source register is a structure or an
2155 * array, and those can't be swizzled. But double-check to be sure.
2156 */
2157 assert(src.swizzle ==
2158 (ir->rhs->type->is_matrix()
2159 ? swizzle_for_size(ir->rhs->type->vector_elements)
2160 : BRW_SWIZZLE_NOOP));
2161
2162 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2163 return;
2164 }
2165
2166 /* Now we're down to just a scalar/vector with writemasks. */
2167 int i;
2168
2169 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2170 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2171
2172 ir->rhs->accept(this);
2173
2174 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2175
2176 src_reg src = this->result;
2177
2178 int swizzles[4];
2179 int first_enabled_chan = 0;
2180 int src_chan = 0;
2181
2182 assert(ir->lhs->type->is_vector() ||
2183 ir->lhs->type->is_scalar());
2184 dst.writemask = ir->write_mask;
2185
2186 for (int i = 0; i < 4; i++) {
2187 if (dst.writemask & (1 << i)) {
2188 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2189 break;
2190 }
2191 }
2192
2193 /* Swizzle a small RHS vector into the channels being written.
2194 *
2195 * GLSL IR treats write_mask as dictating how many channels are
2196 * present on the RHS, while in our instructions we need those
2197 * channels to appear in the slots of the vec4 they're written to.
2198 */
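/* For example, for "v.zw = foo.xy" the RHS arrives swizzled .xyyy and is
 * remapped to .yyxy, so channel z reads x and channel w reads y.
 */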
2199 for (int i = 0; i < 4; i++) {
2200 if (dst.writemask & (1 << i))
2201 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2202 else
2203 swizzles[i] = first_enabled_chan;
2204 }
2205 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2206 swizzles[2], swizzles[3]);
2207
2208 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2209 return;
2210 }
2211
2212 if (ir->condition) {
2213 emit_bool_to_cond_code(ir->condition, &predicate);
2214 }
2215
2216 for (i = 0; i < type_size(ir->lhs->type); i++) {
2217 vec4_instruction *inst = emit(MOV(dst, src));
2218 inst->predicate = predicate;
2219
2220 dst.reg_offset++;
2221 src.reg_offset++;
2222 }
2223 }
2224
2225 void
2226 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2227 {
2228 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2229 foreach_in_list(ir_constant, field_value, &ir->components) {
2230 emit_constant_values(dst, field_value);
2231 }
2232 return;
2233 }
2234
2235 if (ir->type->is_array()) {
2236 for (unsigned int i = 0; i < ir->type->length; i++) {
2237 emit_constant_values(dst, ir->array_elements[i]);
2238 }
2239 return;
2240 }
2241
2242 if (ir->type->is_matrix()) {
2243 for (int i = 0; i < ir->type->matrix_columns; i++) {
2244 float *vec = &ir->value.f[i * ir->type->vector_elements];
2245
2246 for (int j = 0; j < ir->type->vector_elements; j++) {
2247 dst->writemask = 1 << j;
2248 dst->type = BRW_REGISTER_TYPE_F;
2249
2250 emit(MOV(*dst, src_reg(vec[j])));
2251 }
2252 dst->reg_offset++;
2253 }
2254 return;
2255 }
2256
2257 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2258
2259 for (int i = 0; i < ir->type->vector_elements; i++) {
2260 if (!(remaining_writemask & (1 << i)))
2261 continue;
2262
2263 dst->writemask = 1 << i;
2264 dst->type = brw_type_for_base_type(ir->type);
2265
2266 /* Find other components that match the one we're about to
2267 * write. Emits fewer instructions for things like vec4(0.5,
2268 * 1.5, 1.5, 1.5).
2269 */
2270 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2271 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2272 if (ir->value.b[i] == ir->value.b[j])
2273 dst->writemask |= (1 << j);
2274 } else {
2275 /* u, i, and f storage all line up, so no need for a
2276 * switch case for comparing each type.
2277 */
2278 if (ir->value.u[i] == ir->value.u[j])
2279 dst->writemask |= (1 << j);
2280 }
2281 }
2282
2283 switch (ir->type->base_type) {
2284 case GLSL_TYPE_FLOAT:
2285 emit(MOV(*dst, src_reg(ir->value.f[i])));
2286 break;
2287 case GLSL_TYPE_INT:
2288 emit(MOV(*dst, src_reg(ir->value.i[i])));
2289 break;
2290 case GLSL_TYPE_UINT:
2291 emit(MOV(*dst, src_reg(ir->value.u[i])));
2292 break;
2293 case GLSL_TYPE_BOOL:
2294 emit(MOV(*dst,
2295 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2296 : 0u)));
2297 break;
2298 default:
2299 unreachable("Non-float/uint/int/bool constant");
2300 }
2301
2302 remaining_writemask &= ~dst->writemask;
2303 }
2304 dst->reg_offset++;
2305 }
2306
2307 void
2308 vec4_visitor::visit(ir_constant *ir)
2309 {
2310 dst_reg dst = dst_reg(this, ir->type);
2311 this->result = src_reg(dst);
2312
2313 emit_constant_values(&dst, ir);
2314 }
2315
2316 void
2317 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2318 {
2319 ir_dereference *deref = static_cast<ir_dereference *>(
2320 ir->actual_parameters.get_head());
2321 ir_variable *location = deref->variable_referenced();
2322 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2323 location->data.binding);
2324
2325 /* Calculate the surface offset */
2326 src_reg offset(this, glsl_type::uint_type);
2327 ir_dereference_array *deref_array = deref->as_dereference_array();
2328 if (deref_array) {
2329 deref_array->array_index->accept(this);
2330
2331 src_reg tmp(this, glsl_type::uint_type);
2332 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2333 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2334 } else {
2335 offset = location->data.atomic.offset;
2336 }
2337
2338 /* Emit the appropriate machine instruction */
2339 const char *callee = ir->callee->function_name();
2340 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2341
2342 if (!strcmp("__intrinsic_atomic_read", callee)) {
2343 emit_untyped_surface_read(surf_index, dst, offset);
2344
2345 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2346 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2347 src_reg(), src_reg());
2348
2349 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2350 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2351 src_reg(), src_reg());
2352 }
2353 }
2354
2355 void
2356 vec4_visitor::visit(ir_call *ir)
2357 {
2358 const char *callee = ir->callee->function_name();
2359
2360 if (!strcmp("__intrinsic_atomic_read", callee) ||
2361 !strcmp("__intrinsic_atomic_increment", callee) ||
2362 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2363 visit_atomic_counter_intrinsic(ir);
2364 } else {
2365 unreachable("Unsupported intrinsic.");
2366 }
2367 }
2368
2369 src_reg
2370 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2371 {
2372 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2373 inst->base_mrf = 2;
2374 inst->mlen = 1;
2375 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2376 inst->dst.writemask = WRITEMASK_XYZW;
2377
2378 inst->src[1] = sampler;
2379
2380 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2381 int param_base = inst->base_mrf;
2382 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2383 int zero_mask = 0xf & ~coord_mask;
2384
2385 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2386 coordinate));
2387
2388 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2389 src_reg(0)));
2390
2391 emit(inst);
2392 return src_reg(inst->dst);
2393 }
2394
2395 static bool
2396 is_high_sampler(struct brw_context *brw, src_reg sampler)
2397 {
2398 if (brw->gen < 8 && !brw->is_haswell)
2399 return false;
2400
2401 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2402 }
2403
2404 void
2405 vec4_visitor::visit(ir_texture *ir)
2406 {
2407 uint32_t sampler =
2408 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2409
2410 ir_rvalue *nonconst_sampler_index =
2411 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2412
2413 /* Handle non-constant sampler array indexing */
2414 src_reg sampler_reg;
2415 if (nonconst_sampler_index) {
2416 /* The highest sampler which may be used by this operation is
2417 * the last element of the array. Mark it here, because the generator
2418 * doesn't have enough information to determine the bound.
2419 */
2420 uint32_t array_size = ir->sampler->as_dereference_array()
2421 ->array->type->array_size();
2422
2423 uint32_t max_used = sampler + array_size - 1;
2424 if (ir->op == ir_tg4 && brw->gen < 8) {
2425 max_used += prog_data->base.binding_table.gather_texture_start;
2426 } else {
2427 max_used += prog_data->base.binding_table.texture_start;
2428 }
2429
2430 brw_mark_surface_used(&prog_data->base, max_used);
2431
2432 /* Emit code to evaluate the actual indexing expression */
2433 nonconst_sampler_index->accept(this);
2434 dst_reg temp(this, glsl_type::uint_type);
2435 emit(ADD(temp, this->result, src_reg(sampler)))
2436 ->force_writemask_all = true;
2437 sampler_reg = src_reg(temp);
2438 } else {
2439 /* Single sampler, or constant array index; the indexing expression
2440 * is just an immediate.
2441 */
2442 sampler_reg = src_reg(sampler);
2443 }
2444
2445 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2446 * emitting anything other than setting up the constant result.
2447 */
2448 if (ir->op == ir_tg4) {
2449 ir_constant *chan = ir->lod_info.component->as_constant();
2450 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2451 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2452 dst_reg result(this, ir->type);
2453 this->result = src_reg(result);
2454 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2455 return;
2456 }
2457 }
2458
2459 /* Should be lowered by do_lower_texture_projection */
2460 assert(!ir->projector);
2461
2462 /* Should be lowered */
2463 assert(!ir->offset || !ir->offset->type->is_array());
2464
2465 /* Generate code to compute all the subexpression trees. This has to be
2466 * done before loading any values into MRFs for the sampler message since
2467 * generating these values may involve SEND messages that need the MRFs.
2468 */
2469 src_reg coordinate;
2470 if (ir->coordinate) {
2471 ir->coordinate->accept(this);
2472 coordinate = this->result;
2473 }
2474
2475 src_reg shadow_comparitor;
2476 if (ir->shadow_comparitor) {
2477 ir->shadow_comparitor->accept(this);
2478 shadow_comparitor = this->result;
2479 }
2480
2481 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2482 src_reg offset_value;
2483 if (has_nonconstant_offset) {
2484 ir->offset->accept(this);
2485 offset_value = src_reg(this->result);
2486 }
2487
2488 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2489 src_reg lod, dPdx, dPdy, sample_index, mcs;
2490 switch (ir->op) {
2491 case ir_tex:
2492 lod = src_reg(0.0f);
2493 lod_type = glsl_type::float_type;
2494 break;
2495 case ir_txf:
2496 case ir_txl:
2497 case ir_txs:
2498 ir->lod_info.lod->accept(this);
2499 lod = this->result;
2500 lod_type = ir->lod_info.lod->type;
2501 break;
2502 case ir_query_levels:
2503 lod = src_reg(0);
2504 lod_type = glsl_type::int_type;
2505 break;
2506 case ir_txf_ms:
2507 ir->lod_info.sample_index->accept(this);
2508 sample_index = this->result;
2509 sample_index_type = ir->lod_info.sample_index->type;
2510
2511 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2512 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2513 else
2514 mcs = src_reg(0u);
2515 break;
2516 case ir_txd:
2517 ir->lod_info.grad.dPdx->accept(this);
2518 dPdx = this->result;
2519
2520 ir->lod_info.grad.dPdy->accept(this);
2521 dPdy = this->result;
2522
2523 lod_type = ir->lod_info.grad.dPdx->type;
2524 break;
2525 case ir_txb:
2526 case ir_lod:
2527 case ir_tg4:
2528 break;
2529 }
2530
2531 enum opcode opcode;
2532 switch (ir->op) {
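/* There are no implicit derivatives in the vertex shader, so plain
 * texture() (ir_tex) is implemented as TXL with the explicit LOD of 0.0
 * set up above.
 */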
2533 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2534 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2535 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2536 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2537 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2538 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2539 case ir_tg4: opcode = has_nonconstant_offset
2540 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2541 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2542 case ir_txb:
2543 unreachable("TXB is not valid for vertex shaders.");
2544 case ir_lod:
2545 unreachable("LOD is not valid for vertex shaders.");
2546 default:
2547 unreachable("Unrecognized tex op");
2548 }
2549
2550 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2551
2552 if (ir->offset != NULL && !has_nonconstant_offset) {
2553 inst->texture_offset =
2554 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2555 ir->offset->type->vector_elements);
2556 }
2557
2558 /* Stuff the channel select bits in the top of the texture offset */
2559 if (ir->op == ir_tg4)
2560 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2561
2562 /* The message header is necessary for:
2563 * - Gen4 (always)
2564 * - Texel offsets
2565 * - Gather channel selection
2566 * - Sampler indices too large to fit in a 4-bit value.
2567 */
2568 inst->header_present =
2569 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2570 is_high_sampler(brw, sampler_reg);
2571 inst->base_mrf = 2;
2572 inst->mlen = inst->header_present + 1; /* always at least one */
2573 inst->dst = dst_reg(this, ir->type);
2574 inst->dst.writemask = WRITEMASK_XYZW;
2575 inst->shadow_compare = ir->shadow_comparitor != NULL;
2576
2577 inst->src[1] = sampler_reg;
2578
2579 /* MRF for the first parameter */
2580 int param_base = inst->base_mrf + inst->header_present;
2581
2582 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2583 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2584 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2585 } else {
2586 /* Load the coordinate */
2587 /* FINISHME: gl_clamp_mask and saturate */
2588 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2589 int zero_mask = 0xf & ~coord_mask;
2590
2591 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2592 coordinate));
2593
2594 if (zero_mask != 0) {
2595 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2596 src_reg(0)));
2597 }
2598 /* Load the shadow comparitor */
2599 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2600 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2601 WRITEMASK_X),
2602 shadow_comparitor));
2603 inst->mlen++;
2604 }
2605
2606 /* Load the LOD info */
2607 if (ir->op == ir_tex || ir->op == ir_txl) {
2608 int mrf, writemask;
2609 if (brw->gen >= 5) {
2610 mrf = param_base + 1;
2611 if (ir->shadow_comparitor) {
2612 writemask = WRITEMASK_Y;
2613 /* mlen already incremented */
2614 } else {
2615 writemask = WRITEMASK_X;
2616 inst->mlen++;
2617 }
2618 } else /* brw->gen == 4 */ {
2619 mrf = param_base;
2620 writemask = WRITEMASK_W;
2621 }
2622 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2623 } else if (ir->op == ir_txf) {
2624 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2625 } else if (ir->op == ir_txf_ms) {
2626 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2627 sample_index));
2628 if (brw->gen >= 7) {
2629 /* MCS data is in the first channel of `mcs`, but we need to get it into
2630 * the .y channel of the second vec4 of params, so replicate .x across
2631 * the whole vec4 and then mask off everything except .y
2632 */
2633 mcs.swizzle = BRW_SWIZZLE_XXXX;
2634 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2635 mcs));
2636 }
2637 inst->mlen++;
2638 } else if (ir->op == ir_txd) {
2639 const glsl_type *type = lod_type;
2640
2641 if (brw->gen >= 5) {
2642 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2643 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2644 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2645 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2646 inst->mlen++;
2647
2648 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2649 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2650 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2651 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2652 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2653 inst->mlen++;
2654
2655 if (ir->shadow_comparitor) {
2656 emit(MOV(dst_reg(MRF, param_base + 2,
2657 ir->shadow_comparitor->type, WRITEMASK_Z),
2658 shadow_comparitor));
2659 }
2660 }
2661 } else /* brw->gen == 4 */ {
2662 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2663 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2664 inst->mlen += 2;
2665 }
2666 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2667 if (ir->shadow_comparitor) {
2668 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2669 shadow_comparitor));
2670 }
2671
2672 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2673 offset_value));
2674 inst->mlen++;
2675 }
2676 }
2677
2678 emit(inst);
2679
2680 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2681 * faces * layers, but the spec requires layers.
2682 */
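/* For example, a cube map array with 4 layers reports 24 in .z, and the
 * INT_QUOTIENT by 6 below recovers the 4 layers the spec requires.
 */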
2683 if (ir->op == ir_txs) {
2684 glsl_type const *type = ir->sampler->type;
2685 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2686 type->sampler_array) {
2687 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2688 writemask(inst->dst, WRITEMASK_Z),
2689 src_reg(inst->dst), src_reg(6));
2690 }
2691 }
2692
2693 if (brw->gen == 6 && ir->op == ir_tg4) {
2694 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2695 }
2696
2697 swizzle_result(ir, src_reg(inst->dst), sampler);
2698 }
2699
2700 /**
2701 * Apply workarounds for Gen6 gather with UINT/SINT
2702 */
2703 void
2704 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2705 {
2706 if (!wa)
2707 return;
2708
2709 int width = (wa & WA_8BIT) ? 8 : 16;
2710 dst_reg dst_f = dst;
2711 dst_f.type = BRW_REGISTER_TYPE_F;
2712
2713 /* Convert from UNORM to UINT */
2714 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2715 emit(MOV(dst, src_reg(dst_f)));
2716
2717 if (wa & WA_SIGN) {
2718 /* Reinterpret the UINT value as a signed INT value by
2719 * shifting the sign bit into place, then shifting back
2720 * preserving sign.
2721 */
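/* For example, with an 8-bit result: SHL by 24 moves bit 7 into the sign
 * position, and ASR by 24 sign-extends it back down.
 */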
2722 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2723 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2724 }
2725 }
2726
2727 /**
2728 * Set up the gather channel based on the swizzle, for gather4.
2729 */
2730 uint32_t
2731 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2732 {
2733 ir_constant *chan = ir->lod_info.component->as_constant();
2734 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2735 switch (swiz) {
2736 case SWIZZLE_X: return 0;
2737 case SWIZZLE_Y:
2738 /* gather4 sampler is broken for green channel on RG32F --
2739 * we must ask for blue instead.
2740 */
2741 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2742 return 2;
2743 return 1;
2744 case SWIZZLE_Z: return 2;
2745 case SWIZZLE_W: return 3;
2746 default:
2747 unreachable("Not reached"); /* zero, one swizzles handled already */
2748 }
2749 }
2750
2751 void
2752 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2753 {
2754 int s = key->tex.swizzles[sampler];
2755
2756 this->result = src_reg(this, ir->type);
2757 dst_reg swizzled_result(this->result);
2758
2759 if (ir->op == ir_query_levels) {
2760 /* # levels is in .w */
2761 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2762 emit(MOV(swizzled_result, orig_val));
2763 return;
2764 }
2765
2766 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2767 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2768 emit(MOV(swizzled_result, orig_val));
2769 return;
2770 }
2771
2772
2773 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2774 int swizzle[4] = {0};
2775
2776 for (int i = 0; i < 4; i++) {
2777 switch (GET_SWZ(s, i)) {
2778 case SWIZZLE_ZERO:
2779 zero_mask |= (1 << i);
2780 break;
2781 case SWIZZLE_ONE:
2782 one_mask |= (1 << i);
2783 break;
2784 default:
2785 copy_mask |= (1 << i);
2786 swizzle[i] = GET_SWZ(s, i);
2787 break;
2788 }
2789 }
2790
2791 if (copy_mask) {
2792 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2793 swizzled_result.writemask = copy_mask;
2794 emit(MOV(swizzled_result, orig_val));
2795 }
2796
2797 if (zero_mask) {
2798 swizzled_result.writemask = zero_mask;
2799 emit(MOV(swizzled_result, src_reg(0.0f)));
2800 }
2801
2802 if (one_mask) {
2803 swizzled_result.writemask = one_mask;
2804 emit(MOV(swizzled_result, src_reg(1.0f)));
2805 }
2806 }
2807
2808 void
2809 vec4_visitor::visit(ir_return *)
2810 {
2811 unreachable("not reached");
2812 }
2813
2814 void
2815 vec4_visitor::visit(ir_discard *)
2816 {
2817 unreachable("not reached");
2818 }
2819
2820 void
2821 vec4_visitor::visit(ir_if *ir)
2822 {
2823 /* Don't point the annotation at the if statement, because then it plus
2824 * the then and else blocks get printed.
2825 */
2826 this->base_ir = ir->condition;
2827
2828 if (brw->gen == 6) {
2829 emit_if_gen6(ir);
2830 } else {
2831 enum brw_predicate predicate;
2832 emit_bool_to_cond_code(ir->condition, &predicate);
2833 emit(IF(predicate));
2834 }
2835
2836 visit_instructions(&ir->then_instructions);
2837
2838 if (!ir->else_instructions.is_empty()) {
2839 this->base_ir = ir->condition;
2840 emit(BRW_OPCODE_ELSE);
2841
2842 visit_instructions(&ir->else_instructions);
2843 }
2844
2845 this->base_ir = ir->condition;
2846 emit(BRW_OPCODE_ENDIF);
2847 }
2848
2849 void
2850 vec4_visitor::visit(ir_emit_vertex *)
2851 {
2852 unreachable("not reached");
2853 }
2854
2855 void
2856 vec4_visitor::visit(ir_end_primitive *)
2857 {
2858 unreachable("not reached");
2859 }
2860
2861 void
2862 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2863 dst_reg dst, src_reg offset,
2864 src_reg src0, src_reg src1)
2865 {
2866 unsigned mlen = 0;
2867
2868 /* Set the atomic operation offset. */
2869 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2870 mlen++;
2871
2872 /* Set the atomic operation arguments. */
2873 if (src0.file != BAD_FILE) {
2874 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2875 mlen++;
2876 }
2877
2878 if (src1.file != BAD_FILE) {
2879 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2880 mlen++;
2881 }
2882
2883 /* Emit the instruction. Note that this maps to the normal SIMD8
2884 * untyped atomic message on Ivy Bridge, but that's OK because
2885 * unused channels will be masked out.
2886 */
2887 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2888 src_reg(atomic_op), src_reg(surf_index));
2889 inst->base_mrf = 0;
2890 inst->mlen = mlen;
2891 }
2892
2893 void
2894 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2895 src_reg offset)
2896 {
2897 /* Set the surface read offset. */
2898 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2899
2900 /* Emit the instruction. Note that this maps to the normal SIMD8
2901 * untyped surface read message, but that's OK because unused
2902 * channels will be masked out.
2903 */
2904 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2905 dst, src_reg(surf_index));
2906 inst->base_mrf = 0;
2907 inst->mlen = 1;
2908 }
2909
2910 void
2911 vec4_visitor::emit_ndc_computation()
2912 {
2913 /* Get the position */
2914 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2915
2916 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2917 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2918 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2919
2920 current_annotation = "NDC";
2921 dst_reg ndc_w = ndc;
2922 ndc_w.writemask = WRITEMASK_W;
2923 src_reg pos_w = pos;
2924 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2925 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2926
2927 dst_reg ndc_xyz = ndc;
2928 ndc_xyz.writemask = WRITEMASK_XYZ;
2929
2930 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2931 }
2932
2933 void
2934 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2935 {
2936 if (brw->gen < 6 &&
2937 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2938 key->userclip_active || brw->has_negative_rhw_bug)) {
2939 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2940 dst_reg header1_w = header1;
2941 header1_w.writemask = WRITEMASK_W;
2942
2943 emit(MOV(header1, 0u));
2944
2945 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2946 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2947
2948 current_annotation = "Point size";
2949 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2950 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2951 }
2952
2953 if (key->userclip_active) {
2954 current_annotation = "Clipping flags";
2955 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2956 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2957
2958 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2959 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2960 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2961
2962 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2963 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2964 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2965 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2966 }
2967
2968 /* i965 clipping workaround:
2969 * 1) Test for negative rhw
2970 * 2) If set,
2971 * set ndc = (0,0,0,0)
2972 * set ucp[6] = 1
2973 *
2974 * Later, clipping will detect ucp[6] and ensure the primitive is
2975 * clipped against all fixed planes.
2976 */
2977 if (brw->has_negative_rhw_bug) {
2978 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2979 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2980 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2981 vec4_instruction *inst;
2982 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2983 inst->predicate = BRW_PREDICATE_NORMAL;
2984 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2985 inst->predicate = BRW_PREDICATE_NORMAL;
2986 }
2987
2988 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2989 } else if (brw->gen < 6) {
2990 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2991 } else {
2992 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2993 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2994 dst_reg reg_w = reg;
2995 reg_w.writemask = WRITEMASK_W;
2996 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
2997 }
2998 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2999 dst_reg reg_y = reg;
3000 reg_y.writemask = WRITEMASK_Y;
3001 reg_y.type = BRW_REGISTER_TYPE_D;
3002 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3003 }
3004 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3005 dst_reg reg_z = reg;
3006 reg_z.writemask = WRITEMASK_Z;
3007 reg_z.type = BRW_REGISTER_TYPE_D;
3008 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3009 }
3010 }
3011 }
3012
3013 void
3014 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3015 {
3016 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3017 *
3018 * "If a linked set of shaders forming the vertex stage contains no
3019 * static write to gl_ClipVertex or gl_ClipDistance, but the
3020 * application has requested clipping against user clip planes through
3021 * the API, then the coordinate written to gl_Position is used for
3022 * comparison against the user clip planes."
3023 *
3024 * This function is only called if the shader didn't write to
3025 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3026 * if the user wrote to it; otherwise we use gl_Position.
3027 */
3028 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3029 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3030 clip_vertex = VARYING_SLOT_POS;
3031 }
3032
3033 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3034 ++i) {
3035 reg.writemask = 1 << i;
3036 emit(DP4(reg,
3037 src_reg(output_reg[clip_vertex]),
3038 src_reg(this->userplane[i + offset])));
3039 }
3040 }
3041
3042 void
3043 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3044 {
3045 assert(varying < VARYING_SLOT_MAX);
3046 reg.type = output_reg[varying].type;
3047 current_annotation = output_reg_annotation[varying];
3048 /* Copy the register, saturating if necessary */
3049 vec4_instruction *inst = emit(MOV(reg,
3050 src_reg(output_reg[varying])));
3051 if ((varying == VARYING_SLOT_COL0 ||
3052 varying == VARYING_SLOT_COL1 ||
3053 varying == VARYING_SLOT_BFC0 ||
3054 varying == VARYING_SLOT_BFC1) &&
3055 key->clamp_vertex_color) {
3056 inst->saturate = true;
3057 }
3058 }
3059
3060 void
3061 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3062 {
3063 reg.type = BRW_REGISTER_TYPE_F;
3064
3065 switch (varying) {
3066 case VARYING_SLOT_PSIZ:
3067 {
3068 /* PSIZ is always in slot 0, and is coupled with other flags. */
3069 current_annotation = "indices, point width, clip flags";
3070 emit_psiz_and_flags(reg);
3071 break;
3072 }
3073 case BRW_VARYING_SLOT_NDC:
3074 current_annotation = "NDC";
3075 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3076 break;
3077 case VARYING_SLOT_POS:
3078 current_annotation = "gl_Position";
3079 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3080 break;
3081 case VARYING_SLOT_EDGE:
3082 /* This is present when doing unfilled polygons. We're supposed to copy
3083 * the edge flag from the user-provided vertex array
3084 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3085 * of that attribute (starts as 1.0f). This is then used in clipping to
3086 * determine which edges should be drawn as wireframe.
3087 */
3088 current_annotation = "edge flag";
3089 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3090 glsl_type::float_type, WRITEMASK_XYZW))));
3091 break;
3092 case BRW_VARYING_SLOT_PAD:
3093 /* No need to write to this slot */
3094 break;
3095 default:
3096 emit_generic_urb_slot(reg, varying);
3097 break;
3098 }
3099 }
3100
3101 static int
3102 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3103 {
3104 if (brw->gen >= 6) {
3105 /* URB data written (does not include the message header reg) must
3106 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3107 * section 5.4.3.2.2: URB_INTERLEAVED.
3108 *
3109 * URB entries are allocated on a multiple of 1024 bits, so an
3110 * extra 128 bits written here to make the end align to 256 is
3111 * no problem.
3112 */
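/* For example, a header plus three slots gives mlen == 4 (even), which is
 * bumped to 5 so the post-header payload is an even number of registers.
 */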
3113 if ((mlen % 2) != 1)
3114 mlen++;
3115 }
3116
3117 return mlen;
3118 }
3119
3120
3121 /**
3122 * Generates the VUE payload plus the necessary URB write instructions to
3123 * output it.
3124 *
3125 * The VUE layout is documented in Volume 2a.
3126 */
3127 void
3128 vec4_visitor::emit_vertex()
3129 {
3130 /* MRF 0 is reserved for the debugger, so start with message header
3131 * in MRF 1.
3132 */
3133 int base_mrf = 1;
3134 int mrf = base_mrf;
3135 /* In the process of generating our URB write message contents, we
3136 * may need to unspill a register or load from an array. Those
3137 * reads would use MRFs 14-15.
3138 */
3139 int max_usable_mrf = 13;
3140
3141 /* The following assertion verifies that max_usable_mrf causes an
3142 * even-numbered amount of URB write data, which will meet gen6's
3143 * requirements for length alignment.
3144 */
3145 assert((max_usable_mrf - base_mrf) % 2 == 0);
3146
3147 /* First mrf is the g0-based message header containing URB handles and
3148 * such.
3149 */
3150 emit_urb_write_header(mrf++);
3151
3152 if (brw->gen < 6) {
3153 emit_ndc_computation();
3154 }
3155
3156 /* Lower legacy ff and ClipVertex clipping to clip distances */
3157 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3158 current_annotation = "user clip distances";
3159
3160 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3161 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3162
3163 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3164 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3165 }
3166
3167 /* We may need to split this up into several URB writes, so do them in a
3168 * loop.
3169 */
3170 int slot = 0;
3171 bool complete = false;
3172 do {
3173 /* URB offset is in URB row increments, and each of our MRFs is half of
3174 * one of those, since we're doing interleaved writes.
3175 */
3176 int offset = slot / 2;
3177
3178 mrf = base_mrf + 1;
3179 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3180 emit_urb_slot(dst_reg(MRF, mrf++),
3181 prog_data->vue_map.slot_to_varying[slot]);
3182
3183 /* If this was max_usable_mrf, we can't fit anything more into this
3184 * URB WRITE.
3185 */
3186 if (mrf > max_usable_mrf) {
3187 slot++;
3188 break;
3189 }
3190 }
3191
3192 complete = slot >= prog_data->vue_map.num_slots;
3193 current_annotation = "URB write";
3194 vec4_instruction *inst = emit_urb_write_opcode(complete);
3195 inst->base_mrf = base_mrf;
3196 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3197 inst->offset += offset;
3198 } while (!complete);
3199 }
3200
3201
3202 src_reg
3203 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3204 src_reg *reladdr, int reg_offset)
3205 {
3206 /* Because we store the values to scratch interleaved like our
3207 * vertex data, we need to scale the vec4 index by 2.
3208 */
3209 int message_header_scale = 2;
3210
3211 /* Pre-gen6, the message header uses byte offsets instead of vec4
3212 * (16-byte) offset units.
3213 */
3214 if (brw->gen < 6)
3215 message_header_scale *= 16;
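/* For example, a reg_offset of 3 becomes 3 * 2 = 6 in the interleaved
 * units described above, or 3 * 2 * 16 = 96 bytes pre-gen6.
 */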
3216
3217 if (reladdr) {
3218 src_reg index = src_reg(this, glsl_type::int_type);
3219
3220 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3221 src_reg(reg_offset)));
3222 emit_before(block, inst, MUL(dst_reg(index), index,
3223 src_reg(message_header_scale)));
3224
3225 return index;
3226 } else {
3227 return src_reg(reg_offset * message_header_scale);
3228 }
3229 }
3230
3231 src_reg
3232 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3233 src_reg *reladdr, int reg_offset)
3234 {
3235 if (reladdr) {
3236 src_reg index = src_reg(this, glsl_type::int_type);
3237
3238 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3239 src_reg(reg_offset)));
3240
3241 /* Pre-gen6, the message header uses byte offsets instead of vec4
3242 * (16-byte) offset units.
3243 */
3244 if (brw->gen < 6) {
3245 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3246 }
3247
3248 return index;
3249 } else if (brw->gen >= 8) {
3250 /* Store the offset in a GRF so we can send-from-GRF. */
3251 src_reg offset = src_reg(this, glsl_type::int_type);
3252 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3253 return offset;
3254 } else {
3255 int message_header_scale = brw->gen < 6 ? 16 : 1;
3256 return src_reg(reg_offset * message_header_scale);
3257 }
3258 }
3259
3260 /**
3261 * Emits an instruction before @inst to load the value named by @orig_src
3262 * from scratch space at @base_offset to @temp.
3263 *
3264 * @base_offset is measured in 32-byte units (the size of a register).
3265 */
3266 void
3267 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3268 dst_reg temp, src_reg orig_src,
3269 int base_offset)
3270 {
3271 int reg_offset = base_offset + orig_src.reg_offset;
3272 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3273 reg_offset);
3274
3275 emit_before(block, inst, SCRATCH_READ(temp, index));
3276 }
3277
3278 /**
3279 * Emits an instruction after @inst to store the value to be written
3280 * to @orig_dst to scratch space at @base_offset, from @temp.
3281 *
3282 * @base_offset is measured in 32-byte units (the size of a register).
3283 */
3284 void
3285 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3286 int base_offset)
3287 {
3288 int reg_offset = base_offset + inst->dst.reg_offset;
3289 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3290 reg_offset);
3291
3292 /* Create a temporary register to store *inst's result in.
3293 *
3294 * We have to be careful in MOVing from our temporary result register in
3295 * the scratch write. If we swizzle from channels of the temporary that
3296 * weren't initialized, it will confuse live interval analysis, which will
3297 * make spilling fail to make progress.
3298 */
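/* For example, for an instruction that only writes .y, temp is read back
 * with swizzle .yyyy so that no undefined channel is ever sourced.
 */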
3299 src_reg temp = src_reg(this, glsl_type::vec4_type);
3300 temp.type = inst->dst.type;
3301 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3302 int swizzles[4];
3303 for (int i = 0; i < 4; i++)
3304 if (inst->dst.writemask & (1 << i))
3305 swizzles[i] = i;
3306 else
3307 swizzles[i] = first_writemask_chan;
3308 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3309 swizzles[2], swizzles[3]);
3310
3311 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3312 inst->dst.writemask));
3313 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3314 write->predicate = inst->predicate;
3315 write->ir = inst->ir;
3316 write->annotation = inst->annotation;
3317 inst->insert_after(block, write);
3318
3319 inst->dst.file = temp.file;
3320 inst->dst.reg = temp.reg;
3321 inst->dst.reg_offset = temp.reg_offset;
3322 inst->dst.reladdr = NULL;
3323 }
3324
3325 /**
3326 * We can't generally support array access in GRF space, because a
3327 * single instruction's destination can only span 2 contiguous
3328 * registers. So, we send all GRF arrays that get variable index
3329 * access to scratch space.
3330 */
3331 void
3332 vec4_visitor::move_grf_array_access_to_scratch()
3333 {
3334 int scratch_loc[this->virtual_grf_count];
3335 memset(scratch_loc, -1, sizeof(scratch_loc));
3336
3337 /* First, calculate the set of virtual GRFs that need to be punted
3338 * to scratch due to having any array access on them, and where in
3339 * scratch.
3340 */
3341 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3342 if (inst->dst.file == GRF && inst->dst.reladdr &&
3343 scratch_loc[inst->dst.reg] == -1) {
3344 scratch_loc[inst->dst.reg] = c->last_scratch;
3345 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3346 }
3347
3348 for (int i = 0; i < 3; i++) {
3349 src_reg *src = &inst->src[i];
3350
3351 if (src->file == GRF && src->reladdr &&
3352 scratch_loc[src->reg] == -1) {
3353 scratch_loc[src->reg] = c->last_scratch;
3354 c->last_scratch += this->virtual_grf_sizes[src->reg];
3355 }
3356 }
3357 }
3358
3359 /* Now, for anything that will be accessed through scratch, rewrite
3360 * it to load/store. Note that this is a _safe list walk, because
3361 * we may generate a new scratch_write instruction after the one
3362 * we're processing.
3363 */
3364 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3365 /* Set up the annotation tracking for new generated instructions. */
3366 base_ir = inst->ir;
3367 current_annotation = inst->annotation;
3368
3369 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3370 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3371 }
3372
3373 for (int i = 0; i < 3; i++) {
3374 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3375 continue;
3376
3377 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3378
3379 emit_scratch_read(block, inst, temp, inst->src[i],
3380 scratch_loc[inst->src[i].reg]);
3381
3382 inst->src[i].file = temp.file;
3383 inst->src[i].reg = temp.reg;
3384 inst->src[i].reg_offset = temp.reg_offset;
3385 inst->src[i].reladdr = NULL;
3386 }
3387 }
3388 }
3389
3390 /**
3391 * Emits an instruction before @inst to load the value named by @orig_src
3392 * from the pull constant buffer (surface) at @base_offset to @temp.
3393 */
3394 void
3395 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3396 dst_reg temp, src_reg orig_src,
3397 int base_offset)
3398 {
3399 int reg_offset = base_offset + orig_src.reg_offset;
3400 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3401 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3402 reg_offset);
3403 vec4_instruction *load;
3404
3405 if (brw->gen >= 7) {
3406 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3407 grf_offset.type = offset.type;
3408 emit_before(block, inst, MOV(grf_offset, offset));
3409
3410 load = new(mem_ctx) vec4_instruction(this,
3411 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3412 temp, index, src_reg(grf_offset));
3413 } else {
3414 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3415 temp, index, offset);
3416 load->base_mrf = 14;
3417 load->mlen = 1;
3418 }
3419 emit_before(block, inst, load);
3420 }
3421
3422 /**
3423 * Implements array access of uniforms by inserting a
3424 * PULL_CONSTANT_LOAD instruction.
3425 *
3426 * Unlike temporary GRF array access (where we don't support it due to
3427 * the difficulty of doing relative addressing on instruction
3428 * destinations), we could potentially do array access of uniforms
3429 * that were loaded in GRF space as push constants. In real-world
3430 * usage we've seen, though, the arrays being used are always larger
3431 * than we could load as push constants, so just always move all
3432 * uniform array access out to a pull constant buffer.
3433 */
3434 void
3435 vec4_visitor::move_uniform_array_access_to_pull_constants()
3436 {
3437 int pull_constant_loc[this->uniforms];
3438 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3439
3440 /* Walk through and find array access of uniforms. Put a copy of that
3441 * uniform in the pull constant buffer.
3442 *
3443 * Note that we don't move constant-indexed accesses to arrays. No
3444 * testing has been done of the performance impact of this choice.
3445 */
3446 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3447 for (int i = 0; i < 3; i++) {
3448 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3449 continue;
3450
3451 int uniform = inst->src[i].reg;
3452
3453 /* If this array isn't already present in the pull constant buffer,
3454 * add it.
3455 */
3456 if (pull_constant_loc[uniform] == -1) {
3457 const gl_constant_value **values =
3458 &stage_prog_data->param[uniform * 4];
3459
3460 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3461
3462 assert(uniform < uniform_array_size);
3463 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3464 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3465 = values[j];
3466 }
3467 }
3468
3469 /* Set up the annotation tracking for new generated instructions. */
3470 base_ir = inst->ir;
3471 current_annotation = inst->annotation;
3472
3473 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3474
3475 emit_pull_constant_load(block, inst, temp, inst->src[i],
3476 pull_constant_loc[uniform]);
3477
3478 inst->src[i].file = temp.file;
3479 inst->src[i].reg = temp.reg;
3480 inst->src[i].reg_offset = temp.reg_offset;
3481 inst->src[i].reladdr = NULL;
3482 }
3483 }
3484
3485 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3486 * no need to track them as larger-than-vec4 objects. This will be
3487 * relied on in cutting out unused uniform vectors from push
3488 * constants.
3489 */
3490 split_uniform_registers();
3491 }
3492
3493 void
3494 vec4_visitor::resolve_ud_negate(src_reg *reg)
3495 {
3496 if (reg->type != BRW_REGISTER_TYPE_UD ||
3497 !reg->negate)
3498 return;
3499
3500 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3501 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3502 *reg = temp;
3503 }
3504
3505 vec4_visitor::vec4_visitor(struct brw_context *brw,
3506 struct brw_vec4_compile *c,
3507 struct gl_program *prog,
3508 const struct brw_vec4_prog_key *key,
3509 struct brw_vec4_prog_data *prog_data,
3510 struct gl_shader_program *shader_prog,
3511 gl_shader_stage stage,
3512 void *mem_ctx,
3513 bool debug_flag,
3514 bool no_spills,
3515 shader_time_shader_type st_base,
3516 shader_time_shader_type st_written,
3517 shader_time_shader_type st_reset)
3518 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3519 c(c),
3520 key(key),
3521 prog_data(prog_data),
3522 sanity_param_count(0),
3523 fail_msg(NULL),
3524 first_non_payload_grf(0),
3525 need_all_constants_in_pull_buffer(false),
3526 debug_flag(debug_flag),
3527 no_spills(no_spills),
3528 st_base(st_base),
3529 st_written(st_written),
3530 st_reset(st_reset)
3531 {
3532 this->mem_ctx = mem_ctx;
3533 this->failed = false;
3534
3535 this->base_ir = NULL;
3536 this->current_annotation = NULL;
3537 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3538
3539 this->variable_ht = hash_table_ctor(0,
3540 hash_table_pointer_hash,
3541 hash_table_pointer_compare);
3542
3543 this->virtual_grf_start = NULL;
3544 this->virtual_grf_end = NULL;
3545 this->virtual_grf_sizes = NULL;
3546 this->virtual_grf_count = 0;
3547 this->virtual_grf_reg_map = NULL;
3548 this->virtual_grf_reg_count = 0;
3549 this->virtual_grf_array_size = 0;
3550 this->live_intervals_valid = false;
3551
3552 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3553
3554 this->uniforms = 0;
3555
3556 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3557 * at least one. See setup_uniforms() in brw_vec4.cpp.
3558 */
3559 this->uniform_array_size = 1;
3560 if (prog_data) {
3561 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3562 }
3563
3564 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3565 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3566 }
3567
3568 vec4_visitor::~vec4_visitor()
3569 {
3570 hash_table_dtor(this->variable_ht);
3571 }
3572
3573
3574 void
3575 vec4_visitor::fail(const char *format, ...)
3576 {
3577 va_list va;
3578 char *msg;
3579
3580 if (failed)
3581 return;
3582
3583 failed = true;
3584
3585 va_start(va, format);
3586 msg = ralloc_vasprintf(mem_ctx, format, va);
3587 va_end(va);
3588 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3589
3590 this->fail_msg = msg;
3591
3592 if (debug_flag) {
3593 fprintf(stderr, "%s", msg);
3594 }
3595 }
3596
3597 } /* namespace brw */