i965/vec4: Add support for ir_unop_saturate
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
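/* For illustration, each ALU macro above stamps out a small builder method.
 * ALU2(ADD), for example, expands to roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * These builders only construct the instruction; callers append it to the
 * instruction stream with emit(), e.g. emit(ADD(dst, a, b)).
 */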
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
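/* For illustration, callers wrap each operand before emitting a three-source
 * instruction, as in the ir_triop_fma handling later in this file (sketch):
 *
 *    op[0] = fix_3src_operand(op[0]);
 *    op[1] = fix_3src_operand(op[1]);
 *    op[2] = fix_3src_operand(op[2]);
 *    emit(MAD(result_dst, op[2], op[1], op[0]));
 *
 * so that any UNIFORM or IMM operand has already been copied to a GRF the
 * three-source instruction can read.
 */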
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
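/* For reference, the F32TO16/SHL/OR sequence above produces the same result
 * as a scalar packHalf2x16 would (a sketch; f32_to_f16() is a hypothetical
 * float-to-half helper standing in for the F32TO16 instruction):
 *
 *    uint32_t lo = f32_to_f16(src.x);     // tmp.x = 0x0000llll
 *    uint32_t hi = f32_to_f16(src.y);     // tmp.y = 0x0000hhhh
 *    uint32_t dst = (hi << 16) | lo;      // 0xhhhhllll
 */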
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
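/* For reference, the AND/SHR/F16TO32 sequence above mirrors the scalar
 * unpacking (a sketch; f16_to_f32() is a hypothetical half-to-float helper
 * standing in for the F16TO32 instruction):
 *
 *    float x = f16_to_f32(packed & 0xffff);   // low word  -> dst.x
 *    float y = f16_to_f32(packed >> 16);      // high word -> dst.y
 */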
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
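/* For illustration, some sizes type_size() returns, in vec4 slots: float,
 * vec2, vec3 and vec4 each take 1 slot; mat4 takes 4 (one per column);
 * vec4[10] takes 10; struct { vec3 a; float b; } takes 2 (one per member).
 */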
606
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been setup by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[2];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 2);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 default:
856 unreachable("not reached");
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 unreachable("not reached");
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 unreachable("not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *)
1058 {
1059 unreachable("not reached");
1060 }
1061
1062 void
1063 vec4_visitor::visit(ir_function *ir)
1064 {
1065 /* Ignore function bodies other than main() -- we shouldn't see calls to
1066 * them since they should all be inlined.
1067 */
1068 if (strcmp(ir->name, "main") == 0) {
1069 const ir_function_signature *sig;
1070 exec_list empty;
1071
1072 sig = ir->matching_signature(NULL, &empty, false);
1073
1074 assert(sig);
1075
1076 visit_instructions(&sig->body);
1077 }
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_sat(ir_expression *ir)
1082 {
1083 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1084 if (!sat_src)
1085 return false;
1086
1087 sat_src->accept(this);
1088 src_reg src = this->result;
1089
1090 this->result = src_reg(this, ir->type);
1091 vec4_instruction *inst;
1092 inst = emit(MOV(dst_reg(this->result), src));
1093 inst->saturate = true;
1094
1095 return true;
1096 }
1097
1098 bool
1099 vec4_visitor::try_emit_mad(ir_expression *ir)
1100 {
1101 /* 3-src instructions were introduced in gen6. */
1102 if (brw->gen < 6)
1103 return false;
1104
1105 /* MAD can only handle floating-point data. */
1106 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1107 return false;
1108
1109 ir_rvalue *nonmul = ir->operands[1];
1110 ir_expression *mul = ir->operands[0]->as_expression();
1111
1112 if (!mul || mul->operation != ir_binop_mul) {
1113 nonmul = ir->operands[0];
1114 mul = ir->operands[1]->as_expression();
1115
1116 if (!mul || mul->operation != ir_binop_mul)
1117 return false;
1118 }
1119
1120 nonmul->accept(this);
1121 src_reg src0 = fix_3src_operand(this->result);
1122
1123 mul->operands[0]->accept(this);
1124 src_reg src1 = fix_3src_operand(this->result);
1125
1126 mul->operands[1]->accept(this);
1127 src_reg src2 = fix_3src_operand(this->result);
1128
1129 this->result = src_reg(this, ir->type);
1130 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1131
1132 return true;
1133 }
1134
1135 bool
1136 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1137 {
1138 /* This optimization relies on CMP setting the destination to 0 when
1139 * false. Early hardware only sets the least significant bit, and
1140 * leaves the other bits undefined. So we can't use it.
1141 */
1142 if (brw->gen < 6)
1143 return false;
1144
1145 ir_expression *const cmp = ir->operands[0]->as_expression();
1146
1147 if (cmp == NULL)
1148 return false;
1149
1150 switch (cmp->operation) {
1151 case ir_binop_less:
1152 case ir_binop_greater:
1153 case ir_binop_lequal:
1154 case ir_binop_gequal:
1155 case ir_binop_equal:
1156 case ir_binop_nequal:
1157 break;
1158
1159 default:
1160 return false;
1161 }
1162
1163 cmp->operands[0]->accept(this);
1164 const src_reg cmp_src0 = this->result;
1165
1166 cmp->operands[1]->accept(this);
1167 const src_reg cmp_src1 = this->result;
1168
1169 this->result = src_reg(this, ir->type);
1170
1171 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1172 brw_conditional_for_comparison(cmp->operation)));
1173
1174 /* If the comparison is false, this->result will just happen to be zero.
1175 */
1176 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1177 this->result, src_reg(1.0f));
1178 inst->predicate = BRW_PREDICATE_NORMAL;
1179 inst->predicate_inverse = true;
1180
1181 return true;
1182 }
1183
1184 void
1185 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1186 src_reg src0, src_reg src1)
1187 {
1188 vec4_instruction *inst;
1189
1190 if (brw->gen >= 6) {
1191 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1192 inst->conditional_mod = conditionalmod;
1193 } else {
1194 emit(CMP(dst, src0, src1, conditionalmod));
1195
1196 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1197 inst->predicate = BRW_PREDICATE_NORMAL;
1198 }
1199 }
1200
1201 void
1202 vec4_visitor::emit_lrp(const dst_reg &dst,
1203 const src_reg &x, const src_reg &y, const src_reg &a)
1204 {
1205 if (brw->gen >= 6) {
1206 /* Note that the instruction's argument order is reversed from GLSL
1207 * and the IR.
1208 */
1209 emit(LRP(dst,
1210 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1211 } else {
1212 /* Earlier generations don't support three source operations, so we
1213 * need to emit x*(1-a) + y*a.
1214 */
1215 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1216 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1217 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1218 y_times_a.writemask = dst.writemask;
1219 one_minus_a.writemask = dst.writemask;
1220 x_times_one_minus_a.writemask = dst.writemask;
1221
1222 emit(MUL(y_times_a, y, a));
1223 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1224 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1225 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1226 }
1227 }
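/* For reference, both paths above evaluate GLSL's mix():
 *
 *    lrp(x, y, a) = x*(1-a) + y*a = x + a*(y - x)
 *
 * The gen6+ path hands (a, y, x) to the hardware LRP instruction, while the
 * gen4/5 fallback spells out the MUL/ADD expansion of x*(1-a) + y*a.
 */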
1228
1229 void
1230 vec4_visitor::visit(ir_expression *ir)
1231 {
1232 unsigned int operand;
1233 src_reg op[Elements(ir->operands)];
1234 src_reg result_src;
1235 dst_reg result_dst;
1236 vec4_instruction *inst;
1237
1238 if (try_emit_sat(ir))
1239 return;
1240
1241 if (ir->operation == ir_binop_add) {
1242 if (try_emit_mad(ir))
1243 return;
1244 }
1245
1246 if (ir->operation == ir_unop_b2f) {
1247 if (try_emit_b2f_of_compare(ir))
1248 return;
1249 }
1250
1251 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1252 this->result.file = BAD_FILE;
1253 ir->operands[operand]->accept(this);
1254 if (this->result.file == BAD_FILE) {
1255 fprintf(stderr, "Failed to get tree for expression operand:\n");
1256 ir->operands[operand]->fprint(stderr);
1257 exit(1);
1258 }
1259 op[operand] = this->result;
1260
1261 /* Matrix expression operands should have been broken down to vector
1262 * operations already.
1263 */
1264 assert(!ir->operands[operand]->type->is_matrix());
1265 }
1266
1267 int vector_elements = ir->operands[0]->type->vector_elements;
1268 if (ir->operands[1]) {
1269 vector_elements = MAX2(vector_elements,
1270 ir->operands[1]->type->vector_elements);
1271 }
1272
1273 this->result.file = BAD_FILE;
1274
1275 /* Storage for our result. Ideally for an assignment we'd be using
1276 * the actual storage for the result here, instead.
1277 */
1278 result_src = src_reg(this, ir->type);
1279 /* convenience for the emit functions below. */
1280 result_dst = dst_reg(result_src);
1281 /* If nothing special happens, this is the result. */
1282 this->result = result_src;
1283 /* Limit writes to the channels that will be used by result_src later.
1284 * This does limit this temp's use as a temporary for multi-instruction
1285 * sequences.
1286 */
1287 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1288
1289 switch (ir->operation) {
1290 case ir_unop_logic_not:
1291 if (ctx->Const.UniformBooleanTrue != 1) {
1292 emit(NOT(result_dst, op[0]));
1293 } else {
1294 emit(XOR(result_dst, op[0], src_reg(1)));
1295 }
1296 break;
1297 case ir_unop_neg:
1298 op[0].negate = !op[0].negate;
1299 emit(MOV(result_dst, op[0]));
1300 break;
1301 case ir_unop_abs:
1302 op[0].abs = true;
1303 op[0].negate = false;
1304 emit(MOV(result_dst, op[0]));
1305 break;
1306
1307 case ir_unop_sign:
1308 if (ir->type->is_float()) {
1309 /* AND(val, 0x80000000) gives the sign bit.
1310 *
1311 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1312 * zero.
1313 */
1314 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1315
1316 op[0].type = BRW_REGISTER_TYPE_UD;
1317 result_dst.type = BRW_REGISTER_TYPE_UD;
1318 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1319
1320 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1321 inst->predicate = BRW_PREDICATE_NORMAL;
1322
1323 this->result.type = BRW_REGISTER_TYPE_F;
1324 } else {
1325 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1326 * -> non-negative val generates 0x00000000.
1327 * Predicated OR sets 1 if val is positive.
1328 */
1329 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1330
1331 emit(ASR(result_dst, op[0], src_reg(31)));
1332
1333 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1334 inst->predicate = BRW_PREDICATE_NORMAL;
1335 }
1336 break;
1337
1338 case ir_unop_rcp:
1339 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1340 break;
1341
1342 case ir_unop_exp2:
1343 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1344 break;
1345 case ir_unop_log2:
1346 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1347 break;
1348 case ir_unop_exp:
1349 case ir_unop_log:
1350 unreachable("not reached: should be handled by ir_explog_to_explog2");
1351 case ir_unop_sin:
1352 case ir_unop_sin_reduced:
1353 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1354 break;
1355 case ir_unop_cos:
1356 case ir_unop_cos_reduced:
1357 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1358 break;
1359
1360 case ir_unop_dFdx:
1361 case ir_unop_dFdx_coarse:
1362 case ir_unop_dFdx_fine:
1363 case ir_unop_dFdy:
1364 case ir_unop_dFdy_coarse:
1365 case ir_unop_dFdy_fine:
1366 unreachable("derivatives not valid in vertex shader");
1367
1368 case ir_unop_bitfield_reverse:
1369 emit(BFREV(result_dst, op[0]));
1370 break;
1371 case ir_unop_bit_count:
1372 emit(CBIT(result_dst, op[0]));
1373 break;
1374 case ir_unop_find_msb: {
1375 src_reg temp = src_reg(this, glsl_type::uint_type);
1376
1377 inst = emit(FBH(dst_reg(temp), op[0]));
1378 inst->dst.writemask = WRITEMASK_XYZW;
1379
1380 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1381 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1382 * subtract the result from 31 to convert the MSB count into an LSB count.
1383 */
1384
1385 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1386 temp.swizzle = BRW_SWIZZLE_NOOP;
1387 emit(MOV(result_dst, temp));
1388
1389 src_reg src_tmp = src_reg(result_dst);
1390 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1391
1392 src_tmp.negate = true;
1393 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1394 inst->predicate = BRW_PREDICATE_NORMAL;
1395 break;
1396 }
1397 case ir_unop_find_lsb:
1398 emit(FBL(result_dst, op[0]));
1399 break;
1400 case ir_unop_saturate:
1401 inst = emit(MOV(result_dst, op[0]));
1402 inst->saturate = true;
1403 break;
1404
1405 case ir_unop_noise:
1406 unreachable("not reached: should be handled by lower_noise");
1407
1408 case ir_binop_add:
1409 emit(ADD(result_dst, op[0], op[1]));
1410 break;
1411 case ir_binop_sub:
1412 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1413
1414 case ir_binop_mul:
1415 if (brw->gen < 8 && ir->type->is_integer()) {
1416 /* For integer multiplication, the MUL uses the low 16 bits of one of
1417 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1418 * accumulates in the contribution of the upper 16 bits of that
1419 * operand. If we can determine that one of the args is in the low
1420 * 16 bits, though, we can just emit a single MUL.
1421 */
1422 if (ir->operands[0]->is_uint16_constant()) {
1423 if (brw->gen < 7)
1424 emit(MUL(result_dst, op[0], op[1]));
1425 else
1426 emit(MUL(result_dst, op[1], op[0]));
1427 } else if (ir->operands[1]->is_uint16_constant()) {
1428 if (brw->gen < 7)
1429 emit(MUL(result_dst, op[1], op[0]));
1430 else
1431 emit(MUL(result_dst, op[0], op[1]));
1432 } else {
1433 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1434
1435 emit(MUL(acc, op[0], op[1]));
1436 emit(MACH(dst_null_d(), op[0], op[1]));
1437 emit(MOV(result_dst, src_reg(acc)));
1438 }
1439 } else {
1440 emit(MUL(result_dst, op[0], op[1]));
1441 }
1442 break;
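   /* For reference, the general integer path above boils down to (sketch):
    *
    *    MUL  acc,  op0, op1   // partial product from the low 16 bits of one operand
    *    MACH null, op0, op1   // folds in that operand's upper 16 bits via the accumulator
    *    MOV  dst,  acc        // copy the completed 32-bit product out of the accumulator
    */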
1443 case ir_binop_imul_high: {
1444 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1445
1446 emit(MUL(acc, op[0], op[1]));
1447 emit(MACH(result_dst, op[0], op[1]));
1448 break;
1449 }
1450 case ir_binop_div:
1451 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1452 assert(ir->type->is_integer());
1453 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1454 break;
1455 case ir_binop_carry: {
1456 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1457
1458 emit(ADDC(dst_null_ud(), op[0], op[1]));
1459 emit(MOV(result_dst, src_reg(acc)));
1460 break;
1461 }
1462 case ir_binop_borrow: {
1463 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1464
1465 emit(SUBB(dst_null_ud(), op[0], op[1]));
1466 emit(MOV(result_dst, src_reg(acc)));
1467 break;
1468 }
1469 case ir_binop_mod:
1470 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1471 assert(ir->type->is_integer());
1472 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1473 break;
1474
1475 case ir_binop_less:
1476 case ir_binop_greater:
1477 case ir_binop_lequal:
1478 case ir_binop_gequal:
1479 case ir_binop_equal:
1480 case ir_binop_nequal: {
1481 emit(CMP(result_dst, op[0], op[1],
1482 brw_conditional_for_comparison(ir->operation)));
1483 if (ctx->Const.UniformBooleanTrue == 1) {
1484 emit(AND(result_dst, result_src, src_reg(1)));
1485 }
1486 break;
1487 }
1488
1489 case ir_binop_all_equal:
1490 /* "==" operator producing a scalar boolean. */
1491 if (ir->operands[0]->type->is_vector() ||
1492 ir->operands[1]->type->is_vector()) {
1493 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1494 emit(MOV(result_dst, src_reg(0)));
1495 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1496 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1497 } else {
1498 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1499 if (ctx->Const.UniformBooleanTrue == 1) {
1500 emit(AND(result_dst, result_src, src_reg(1)));
1501 }
1502 }
1503 break;
1504 case ir_binop_any_nequal:
1505 /* "!=" operator producing a scalar boolean. */
1506 if (ir->operands[0]->type->is_vector() ||
1507 ir->operands[1]->type->is_vector()) {
1508 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1509
1510 emit(MOV(result_dst, src_reg(0)));
1511 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1512 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1513 } else {
1514 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1515 if (ctx->Const.UniformBooleanTrue == 1) {
1516 emit(AND(result_dst, result_src, src_reg(1)));
1517 }
1518 }
1519 break;
1520
1521 case ir_unop_any:
1522 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1523 emit(MOV(result_dst, src_reg(0)));
1524
1525 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1526 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1527 break;
1528
1529 case ir_binop_logic_xor:
1530 emit(XOR(result_dst, op[0], op[1]));
1531 break;
1532
1533 case ir_binop_logic_or:
1534 emit(OR(result_dst, op[0], op[1]));
1535 break;
1536
1537 case ir_binop_logic_and:
1538 emit(AND(result_dst, op[0], op[1]));
1539 break;
1540
1541 case ir_binop_dot:
1542 assert(ir->operands[0]->type->is_vector());
1543 assert(ir->operands[0]->type == ir->operands[1]->type);
1544 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1545 break;
1546
1547 case ir_unop_sqrt:
1548 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1549 break;
1550 case ir_unop_rsq:
1551 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1552 break;
1553
1554 case ir_unop_bitcast_i2f:
1555 case ir_unop_bitcast_u2f:
1556 this->result = op[0];
1557 this->result.type = BRW_REGISTER_TYPE_F;
1558 break;
1559
1560 case ir_unop_bitcast_f2i:
1561 this->result = op[0];
1562 this->result.type = BRW_REGISTER_TYPE_D;
1563 break;
1564
1565 case ir_unop_bitcast_f2u:
1566 this->result = op[0];
1567 this->result.type = BRW_REGISTER_TYPE_UD;
1568 break;
1569
1570 case ir_unop_i2f:
1571 case ir_unop_i2u:
1572 case ir_unop_u2i:
1573 case ir_unop_u2f:
1574 case ir_unop_f2i:
1575 case ir_unop_f2u:
1576 emit(MOV(result_dst, op[0]));
1577 break;
1578 case ir_unop_b2i:
1579 if (ctx->Const.UniformBooleanTrue != 1) {
1580 emit(AND(result_dst, op[0], src_reg(1)));
1581 } else {
1582 emit(MOV(result_dst, op[0]));
1583 }
1584 break;
1585 case ir_unop_b2f:
1586 if (ctx->Const.UniformBooleanTrue != 1) {
1587 op[0].type = BRW_REGISTER_TYPE_UD;
1588 result_dst.type = BRW_REGISTER_TYPE_UD;
1589 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1590 result_dst.type = BRW_REGISTER_TYPE_F;
1591 } else {
1592 emit(MOV(result_dst, op[0]));
1593 }
1594 break;
1595 case ir_unop_f2b:
1596 case ir_unop_i2b:
1597 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1598 if (ctx->Const.UniformBooleanTrue == 1) {
1599 emit(AND(result_dst, result_src, src_reg(1)));
1600 }
1601 break;
1602
1603 case ir_unop_trunc:
1604 emit(RNDZ(result_dst, op[0]));
1605 break;
1606 case ir_unop_ceil:
1607 op[0].negate = !op[0].negate;
1608 inst = emit(RNDD(result_dst, op[0]));
1609 this->result.negate = true;
1610 break;
1611 case ir_unop_floor:
1612 inst = emit(RNDD(result_dst, op[0]));
1613 break;
1614 case ir_unop_fract:
1615 inst = emit(FRC(result_dst, op[0]));
1616 break;
1617 case ir_unop_round_even:
1618 emit(RNDE(result_dst, op[0]));
1619 break;
1620
1621 case ir_binop_min:
1622 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1623 break;
1624 case ir_binop_max:
1625 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1626 break;
1627
1628 case ir_binop_pow:
1629 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1630 break;
1631
1632 case ir_unop_bit_not:
1633 inst = emit(NOT(result_dst, op[0]));
1634 break;
1635 case ir_binop_bit_and:
1636 inst = emit(AND(result_dst, op[0], op[1]));
1637 break;
1638 case ir_binop_bit_xor:
1639 inst = emit(XOR(result_dst, op[0], op[1]));
1640 break;
1641 case ir_binop_bit_or:
1642 inst = emit(OR(result_dst, op[0], op[1]));
1643 break;
1644
1645 case ir_binop_lshift:
1646 inst = emit(SHL(result_dst, op[0], op[1]));
1647 break;
1648
1649 case ir_binop_rshift:
1650 if (ir->type->base_type == GLSL_TYPE_INT)
1651 inst = emit(ASR(result_dst, op[0], op[1]));
1652 else
1653 inst = emit(SHR(result_dst, op[0], op[1]));
1654 break;
1655
1656 case ir_binop_bfm:
1657 emit(BFI1(result_dst, op[0], op[1]));
1658 break;
1659
1660 case ir_binop_ubo_load: {
1661 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1662 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1663 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1664 src_reg offset;
1665
1666 /* Now, load the vector from that offset. */
1667 assert(ir->type->is_vector() || ir->type->is_scalar());
1668
1669 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1670 packed_consts.type = result.type;
1671 src_reg surf_index;
1672
1673 if (const_uniform_block) {
1674 /* The block index is a constant, so just emit the binding table entry
1675 * as an immediate.
1676 */
1677 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1678 const_uniform_block->value.u[0]);
1679 } else {
1680 /* The block index is not a constant. Evaluate the index expression
1681 * per-channel and add the base UBO index; the generator will select
1682 * a value from any live channel.
1683 */
1684 surf_index = src_reg(this, glsl_type::uint_type);
1685 emit(ADD(dst_reg(surf_index), op[0],
1686 src_reg(prog_data->base.binding_table.ubo_start)));
1687
1688 /* Assume this may touch any UBO. It would be nice to provide
1689 * a tighter bound, but the array information is already lowered away.
1690 */
1691 brw_mark_surface_used(&prog_data->base,
1692 prog_data->base.binding_table.ubo_start +
1693 shader_prog->NumUniformBlocks - 1);
1694 }
1695
1696 if (const_offset_ir) {
1697 if (brw->gen >= 8) {
1698 /* Store the offset in a GRF so we can send-from-GRF. */
1699 offset = src_reg(this, glsl_type::int_type);
1700 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1701 } else {
1702 /* Immediates are fine on older generations since they'll be moved
1703 * to a (potentially fake) MRF at the generator level.
1704 */
1705 offset = src_reg(const_offset / 16);
1706 }
1707 } else {
1708 offset = src_reg(this, glsl_type::uint_type);
1709 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1710 }
1711
1712 if (brw->gen >= 7) {
1713 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1714 grf_offset.type = offset.type;
1715
1716 emit(MOV(grf_offset, offset));
1717
1718 emit(new(mem_ctx) vec4_instruction(this,
1719 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1720 dst_reg(packed_consts),
1721 surf_index,
1722 src_reg(grf_offset)));
1723 } else {
1724 vec4_instruction *pull =
1725 emit(new(mem_ctx) vec4_instruction(this,
1726 VS_OPCODE_PULL_CONSTANT_LOAD,
1727 dst_reg(packed_consts),
1728 surf_index,
1729 offset));
1730 pull->base_mrf = 14;
1731 pull->mlen = 1;
1732 }
1733
1734 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1735 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1736 const_offset % 16 / 4,
1737 const_offset % 16 / 4,
1738 const_offset % 16 / 4);
1739
1740 /* UBO bools are any nonzero int. We need to convert them to use the
1741 * value of true stored in ctx->Const.UniformBooleanTrue.
1742 */
1743 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1744 emit(CMP(result_dst, packed_consts, src_reg(0u),
1745 BRW_CONDITIONAL_NZ));
1746 if (ctx->Const.UniformBooleanTrue == 1) {
1747 emit(AND(result_dst, result, src_reg(1)));
1748 }
1749 } else {
1750 emit(MOV(result_dst, packed_consts));
1751 }
1752 break;
1753 }
1754
1755 case ir_binop_vector_extract:
1756 unreachable("should have been lowered by vec_index_to_cond_assign");
1757
1758 case ir_triop_fma:
1759 op[0] = fix_3src_operand(op[0]);
1760 op[1] = fix_3src_operand(op[1]);
1761 op[2] = fix_3src_operand(op[2]);
1762 /* Note that the instruction's argument order is reversed from GLSL
1763 * and the IR.
1764 */
1765 emit(MAD(result_dst, op[2], op[1], op[0]));
1766 break;
1767
1768 case ir_triop_lrp:
1769 emit_lrp(result_dst, op[0], op[1], op[2]);
1770 break;
1771
1772 case ir_triop_csel:
1773 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1774 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1775 inst->predicate = BRW_PREDICATE_NORMAL;
1776 break;
1777
1778 case ir_triop_bfi:
1779 op[0] = fix_3src_operand(op[0]);
1780 op[1] = fix_3src_operand(op[1]);
1781 op[2] = fix_3src_operand(op[2]);
1782 emit(BFI2(result_dst, op[0], op[1], op[2]));
1783 break;
1784
1785 case ir_triop_bitfield_extract:
1786 op[0] = fix_3src_operand(op[0]);
1787 op[1] = fix_3src_operand(op[1]);
1788 op[2] = fix_3src_operand(op[2]);
1789 /* Note that the instruction's argument order is reversed from GLSL
1790 * and the IR.
1791 */
1792 emit(BFE(result_dst, op[2], op[1], op[0]));
1793 break;
1794
1795 case ir_triop_vector_insert:
1796 unreachable("should have been lowered by lower_vector_insert");
1797
1798 case ir_quadop_bitfield_insert:
1799 unreachable("not reached: should be handled by "
1800 "bitfield_insert_to_bfm_bfi\n");
1801
1802 case ir_quadop_vector:
1803 unreachable("not reached: should be handled by lower_quadop_vector");
1804
1805 case ir_unop_pack_half_2x16:
1806 emit_pack_half_2x16(result_dst, op[0]);
1807 break;
1808 case ir_unop_unpack_half_2x16:
1809 emit_unpack_half_2x16(result_dst, op[0]);
1810 break;
1811 case ir_unop_pack_snorm_2x16:
1812 case ir_unop_pack_snorm_4x8:
1813 case ir_unop_pack_unorm_2x16:
1814 case ir_unop_pack_unorm_4x8:
1815 case ir_unop_unpack_snorm_2x16:
1816 case ir_unop_unpack_snorm_4x8:
1817 case ir_unop_unpack_unorm_2x16:
1818 case ir_unop_unpack_unorm_4x8:
1819 unreachable("not reached: should be handled by lower_packing_builtins");
1820 case ir_unop_unpack_half_2x16_split_x:
1821 case ir_unop_unpack_half_2x16_split_y:
1822 case ir_binop_pack_half_2x16_split:
1823 case ir_unop_interpolate_at_centroid:
1824 case ir_binop_interpolate_at_sample:
1825 case ir_binop_interpolate_at_offset:
1826 unreachable("not reached: should not occur in vertex shader");
1827 case ir_binop_ldexp:
1828 unreachable("not reached: should be handled by ldexp_to_arith()");
1829 }
1830 }
1831
1832
1833 void
1834 vec4_visitor::visit(ir_swizzle *ir)
1835 {
1836 src_reg src;
1837 int i = 0;
1838 int swizzle[4];
1839
1840 /* Note that this is only swizzles in expressions, not those on the left
1841 * hand side of an assignment, which do write masking. See ir_assignment
1842 * for that.
1843 */
1844
1845 ir->val->accept(this);
1846 src = this->result;
1847 assert(src.file != BAD_FILE);
1848
1849 for (i = 0; i < ir->type->vector_elements; i++) {
1850 switch (i) {
1851 case 0:
1852 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1853 break;
1854 case 1:
1855 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1856 break;
1857 case 2:
1858 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1859 break;
1860 case 3:
1861 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1862 break;
1863 }
1864 }
1865 for (; i < 4; i++) {
1866 /* Replicate the last channel out. */
1867 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1868 }
1869
1870 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1871
1872 this->result = src;
1873 }
1874
1875 void
1876 vec4_visitor::visit(ir_dereference_variable *ir)
1877 {
1878 const struct glsl_type *type = ir->type;
1879 dst_reg *reg = variable_storage(ir->var);
1880
1881 if (!reg) {
1882 fail("Failed to find variable storage for %s\n", ir->var->name);
1883 this->result = src_reg(brw_null_reg());
1884 return;
1885 }
1886
1887 this->result = src_reg(*reg);
1888
1889 /* System values get their swizzle from the dst_reg writemask */
1890 if (ir->var->data.mode == ir_var_system_value)
1891 return;
1892
1893 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1894 this->result.swizzle = swizzle_for_size(type->vector_elements);
1895 }
1896
1897
1898 int
1899 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1900 {
1901 /* Under normal circumstances array elements are stored consecutively, so
1902 * the stride is equal to the size of the array element.
1903 */
1904 return type_size(ir->type);
1905 }
1906
1907
1908 void
1909 vec4_visitor::visit(ir_dereference_array *ir)
1910 {
1911 ir_constant *constant_index;
1912 src_reg src;
1913 int array_stride = compute_array_stride(ir);
1914
1915 constant_index = ir->array_index->constant_expression_value();
1916
1917 ir->array->accept(this);
1918 src = this->result;
1919
1920 if (constant_index) {
1921 src.reg_offset += constant_index->value.i[0] * array_stride;
1922 } else {
1923 /* Variable index array dereference. It eats the "vec4" of the
1924 * base of the array and an index that offsets the Mesa register
1925 * index.
1926 */
1927 ir->array_index->accept(this);
1928
1929 src_reg index_reg;
1930
1931 if (array_stride == 1) {
1932 index_reg = this->result;
1933 } else {
1934 index_reg = src_reg(this, glsl_type::int_type);
1935
1936 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1937 }
1938
1939 if (src.reladdr) {
1940 src_reg temp = src_reg(this, glsl_type::int_type);
1941
1942 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1943
1944 index_reg = temp;
1945 }
1946
1947 src.reladdr = ralloc(mem_ctx, src_reg);
1948 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1949 }
1950
1951 /* If the type is smaller than a vec4, replicate the last channel out. */
1952 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1953 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1954 else
1955 src.swizzle = BRW_SWIZZLE_NOOP;
1956 src.type = brw_type_for_base_type(ir->type);
1957
1958 this->result = src;
1959 }
1960
1961 void
1962 vec4_visitor::visit(ir_dereference_record *ir)
1963 {
1964 unsigned int i;
1965 const glsl_type *struct_type = ir->record->type;
1966 int offset = 0;
1967
1968 ir->record->accept(this);
1969
1970 for (i = 0; i < struct_type->length; i++) {
1971 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1972 break;
1973 offset += type_size(struct_type->fields.structure[i].type);
1974 }
1975
1976 /* If the type is smaller than a vec4, replicate the last channel out. */
1977 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1978 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1979 else
1980 this->result.swizzle = BRW_SWIZZLE_NOOP;
1981 this->result.type = brw_type_for_base_type(ir->type);
1982
1983 this->result.reg_offset += offset;
1984 }
1985
1986 /**
1987 * We want to be careful in assignment setup to hit the actual storage
1988 * instead of potentially using a temporary like we might with the
1989 * ir_dereference handler.
1990 */
1991 static dst_reg
1992 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1993 {
1994 /* The LHS must be a dereference. If the LHS is a variable indexed array
1995 * access of a vector, it must be separated into a series of conditional moves
1996 * before reaching this point (see ir_vec_index_to_cond_assign).
1997 */
1998 assert(ir->as_dereference());
1999 ir_dereference_array *deref_array = ir->as_dereference_array();
2000 if (deref_array) {
2001 assert(!deref_array->array->type->is_vector());
2002 }
2003
2004 /* Use the rvalue deref handler for the most part. We'll ignore
2005 * swizzles in it and write swizzles using writemask, though.
2006 */
2007 ir->accept(v);
2008 return dst_reg(v->result);
2009 }
2010
2011 void
2012 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2013 const struct glsl_type *type,
2014 enum brw_predicate predicate)
2015 {
2016 if (type->base_type == GLSL_TYPE_STRUCT) {
2017 for (unsigned int i = 0; i < type->length; i++) {
2018 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2019 }
2020 return;
2021 }
2022
2023 if (type->is_array()) {
2024 for (unsigned int i = 0; i < type->length; i++) {
2025 emit_block_move(dst, src, type->fields.array, predicate);
2026 }
2027 return;
2028 }
2029
2030 if (type->is_matrix()) {
2031 const struct glsl_type *vec_type;
2032
2033 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2034 type->vector_elements, 1);
2035
2036 for (int i = 0; i < type->matrix_columns; i++) {
2037 emit_block_move(dst, src, vec_type, predicate);
2038 }
2039 return;
2040 }
2041
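/* Illustrative walk-through (not in the original source): moving a
 * mat3 recurses into three vec3 moves; each one below emits a MOV
 * with writemask .xyz and swizzle xyzz (predicated when the
 * assignment has a condition), then advances both reg_offset fields
 * by one vec4 register.
 */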
2042 assert(type->is_scalar() || type->is_vector());
2043
2044 dst->type = brw_type_for_base_type(type);
2045 src->type = dst->type;
2046
2047 dst->writemask = (1 << type->vector_elements) - 1;
2048
2049 src->swizzle = swizzle_for_size(type->vector_elements);
2050
2051 vec4_instruction *inst = emit(MOV(*dst, *src));
2052 inst->predicate = predicate;
2053
2054 dst->reg_offset++;
2055 src->reg_offset++;
2056 }
2057
2058
2059 /* If the RHS processing resulted in an instruction generating a
2060 * temporary value, and it would be easy to rewrite the instruction to
2061 * generate its result right into the LHS instead, do so. This ends
2062 * up reliably removing instructions where it can be tricky to do so
2063 * later without real UD chain information.
2064 */
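/* Hypothetical example (not in the original source): for "x = a + b"
 * the RHS visit emits "ADD tmp, a, b" and the assignment would then
 * add "MOV x, tmp".  Retargeting the ADD's destination to x drops the
 * MOV without needing use/def chain information.
 */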
2065 bool
2066 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2067 dst_reg dst,
2068 src_reg src,
2069 vec4_instruction *pre_rhs_inst,
2070 vec4_instruction *last_rhs_inst)
2071 {
2072 /* This could be supported, but it would take more smarts. */
2073 if (ir->condition)
2074 return false;
2075
2076 if (pre_rhs_inst == last_rhs_inst)
2077 return false; /* No instructions generated to work with. */
2078
2079 /* Make sure the last instruction generated our source reg. */
2080 if (src.file != GRF ||
2081 src.file != last_rhs_inst->dst.file ||
2082 src.reg != last_rhs_inst->dst.reg ||
2083 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2084 src.reladdr ||
2085 src.abs ||
2086 src.negate ||
2087 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2088 return false;
2089
2090 /* Check that the last instruction fully initialized the channels
2091 * we want to use, in the order we want to use them. We could
2092 * potentially reswizzle the operands of many instructions so that
2093 * we could handle out-of-order channels, but we don't do that yet.
2094 */
2095
2096 for (unsigned i = 0; i < 4; i++) {
2097 if (dst.writemask & (1 << i)) {
2098 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2099 return false;
2100
2101 if (BRW_GET_SWZ(src.swizzle, i) != i)
2102 return false;
2103 }
2104 }
2105
2106 /* Success! Rewrite the instruction. */
2107 last_rhs_inst->dst.file = dst.file;
2108 last_rhs_inst->dst.reg = dst.reg;
2109 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2110 last_rhs_inst->dst.reladdr = dst.reladdr;
2111 last_rhs_inst->dst.writemask &= dst.writemask;
2112
2113 return true;
2114 }
2115
2116 void
2117 vec4_visitor::visit(ir_assignment *ir)
2118 {
2119 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2120 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2121
2122 if (!ir->lhs->type->is_scalar() &&
2123 !ir->lhs->type->is_vector()) {
2124 ir->rhs->accept(this);
2125 src_reg src = this->result;
2126
2127 if (ir->condition) {
2128 emit_bool_to_cond_code(ir->condition, &predicate);
2129 }
2130
2131 /* emit_block_move doesn't account for swizzles in the source register.
2132 * This should be ok, since the source register is a structure or an
2133 * array, and those can't be swizzled. But double-check to be sure.
2134 */
2135 assert(src.swizzle ==
2136 (ir->rhs->type->is_matrix()
2137 ? swizzle_for_size(ir->rhs->type->vector_elements)
2138 : BRW_SWIZZLE_NOOP));
2139
2140 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2141 return;
2142 }
2143
2144 /* Now we're down to just a scalar/vector with writemasks. */
2145 int i;
2146
2147 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2148 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2149
2150 ir->rhs->accept(this);
2151
2152 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2153
2154 src_reg src = this->result;
2155
2156 int swizzles[4];
2157 int first_enabled_chan = 0;
2158 int src_chan = 0;
2159
2160 assert(ir->lhs->type->is_vector() ||
2161 ir->lhs->type->is_scalar());
2162 dst.writemask = ir->write_mask;
2163
2164 for (int i = 0; i < 4; i++) {
2165 if (dst.writemask & (1 << i)) {
2166 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2167 break;
2168 }
2169 }
2170
2171 /* Swizzle a small RHS vector into the channels being written.
2172 *
2173 * glsl ir treats write_mask as dictating how many channels are
2174 * present on the RHS while in our instructions we need to make
2175 * those channels appear in the slots of the vec4 they're written to.
2176 */
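/* Worked example (illustrative, not in the original source): for
 * "v.zw = foo.xy" the write_mask is zw and the RHS arrives with
 * swizzle xyyy, so the loop below builds swizzles[] = {y, y, x, y};
 * the final MOV with a .zw writemask then lands foo.x in v.z and
 * foo.y in v.w.
 */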
2177 for (int i = 0; i < 4; i++) {
2178 if (dst.writemask & (1 << i))
2179 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2180 else
2181 swizzles[i] = first_enabled_chan;
2182 }
2183 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2184 swizzles[2], swizzles[3]);
2185
2186 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2187 return;
2188 }
2189
2190 if (ir->condition) {
2191 emit_bool_to_cond_code(ir->condition, &predicate);
2192 }
2193
2194 for (i = 0; i < type_size(ir->lhs->type); i++) {
2195 vec4_instruction *inst = emit(MOV(dst, src));
2196 inst->predicate = predicate;
2197
2198 dst.reg_offset++;
2199 src.reg_offset++;
2200 }
2201 }
2202
2203 void
2204 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2205 {
2206 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2207 foreach_in_list(ir_constant, field_value, &ir->components) {
2208 emit_constant_values(dst, field_value);
2209 }
2210 return;
2211 }
2212
2213 if (ir->type->is_array()) {
2214 for (unsigned int i = 0; i < ir->type->length; i++) {
2215 emit_constant_values(dst, ir->array_elements[i]);
2216 }
2217 return;
2218 }
2219
2220 if (ir->type->is_matrix()) {
2221 for (int i = 0; i < ir->type->matrix_columns; i++) {
2222 float *vec = &ir->value.f[i * ir->type->vector_elements];
2223
2224 for (int j = 0; j < ir->type->vector_elements; j++) {
2225 dst->writemask = 1 << j;
2226 dst->type = BRW_REGISTER_TYPE_F;
2227
2228 emit(MOV(*dst, src_reg(vec[j])));
2229 }
2230 dst->reg_offset++;
2231 }
2232 return;
2233 }
2234
2235 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2236
2237 for (int i = 0; i < ir->type->vector_elements; i++) {
2238 if (!(remaining_writemask & (1 << i)))
2239 continue;
2240
2241 dst->writemask = 1 << i;
2242 dst->type = brw_type_for_base_type(ir->type);
2243
2244 /* Find other components that match the one we're about to
2245 * write. Emits fewer instructions for things like vec4(0.5,
2246 * 1.5, 1.5, 1.5).
2247 */
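/* E.g. (illustrative) vec4(0.5, 1.5, 1.5, 1.5) becomes two MOVs: one
 * with writemask .x for 0.5 and one with writemask .yzw for 1.5.
 */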
2248 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2249 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2250 if (ir->value.b[i] == ir->value.b[j])
2251 dst->writemask |= (1 << j);
2252 } else {
2253 /* u, i, and f storage all line up, so no need for a
2254 * switch case for comparing each type.
2255 */
2256 if (ir->value.u[i] == ir->value.u[j])
2257 dst->writemask |= (1 << j);
2258 }
2259 }
2260
2261 switch (ir->type->base_type) {
2262 case GLSL_TYPE_FLOAT:
2263 emit(MOV(*dst, src_reg(ir->value.f[i])));
2264 break;
2265 case GLSL_TYPE_INT:
2266 emit(MOV(*dst, src_reg(ir->value.i[i])));
2267 break;
2268 case GLSL_TYPE_UINT:
2269 emit(MOV(*dst, src_reg(ir->value.u[i])));
2270 break;
2271 case GLSL_TYPE_BOOL:
2272 emit(MOV(*dst,
2273 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2274 : 0)));
2275 break;
2276 default:
2277 unreachable("Non-float/uint/int/bool constant");
2278 }
2279
2280 remaining_writemask &= ~dst->writemask;
2281 }
2282 dst->reg_offset++;
2283 }
2284
2285 void
2286 vec4_visitor::visit(ir_constant *ir)
2287 {
2288 dst_reg dst = dst_reg(this, ir->type);
2289 this->result = src_reg(dst);
2290
2291 emit_constant_values(&dst, ir);
2292 }
2293
2294 void
2295 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2296 {
2297 ir_dereference *deref = static_cast<ir_dereference *>(
2298 ir->actual_parameters.get_head());
2299 ir_variable *location = deref->variable_referenced();
2300 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2301 location->data.binding);
2302
2303 /* Calculate the surface offset */
2304 src_reg offset(this, glsl_type::uint_type);
2305 ir_dereference_array *deref_array = deref->as_dereference_array();
2306 if (deref_array) {
2307 deref_array->array_index->accept(this);
2308
2309 src_reg tmp(this, glsl_type::uint_type);
2310 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2311 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2312 } else {
2313 offset = location->data.atomic.offset;
2314 }
2315
2316 /* Emit the appropriate machine instruction */
2317 const char *callee = ir->callee->function_name();
2318 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2319
2320 if (!strcmp("__intrinsic_atomic_read", callee)) {
2321 emit_untyped_surface_read(surf_index, dst, offset);
2322
2323 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2324 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2325 src_reg(), src_reg());
2326
2327 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2328 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2329 src_reg(), src_reg());
2330 }
2331 }
2332
2333 void
2334 vec4_visitor::visit(ir_call *ir)
2335 {
2336 const char *callee = ir->callee->function_name();
2337
2338 if (!strcmp("__intrinsic_atomic_read", callee) ||
2339 !strcmp("__intrinsic_atomic_increment", callee) ||
2340 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2341 visit_atomic_counter_intrinsic(ir);
2342 } else {
2343 unreachable("Unsupported intrinsic.");
2344 }
2345 }
2346
2347 src_reg
2348 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2349 {
2350 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2351 inst->base_mrf = 2;
2352 inst->mlen = 1;
2353 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2354 inst->dst.writemask = WRITEMASK_XYZW;
2355
2356 inst->src[1] = sampler;
2357
2358 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2359 int param_base = inst->base_mrf;
2360 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2361 int zero_mask = 0xf & ~coord_mask;
2362
2363 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2364 coordinate));
2365
2366 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2367 src_reg(0)));
2368
2369 emit(inst);
2370 return src_reg(inst->dst);
2371 }
2372
2373 static bool
2374 is_high_sampler(struct brw_context *brw, src_reg sampler)
2375 {
2376 if (brw->gen < 8 && !brw->is_haswell)
2377 return false;
2378
2379 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2380 }
2381
2382 void
2383 vec4_visitor::visit(ir_texture *ir)
2384 {
2385 uint32_t sampler =
2386 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2387
2388 ir_rvalue *nonconst_sampler_index =
2389 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2390
2391 /* Handle non-constant sampler array indexing */
2392 src_reg sampler_reg;
2393 if (nonconst_sampler_index) {
2394 /* The highest sampler which may be used by this operation is
2395 * the last element of the array. Mark it here, because the generator
2396 * doesn't have enough information to determine the bound.
2397 */
2398 uint32_t array_size = ir->sampler->as_dereference_array()
2399 ->array->type->array_size();
2400
2401 uint32_t max_used = sampler + array_size - 1;
2402 if (ir->op == ir_tg4 && brw->gen < 8) {
2403 max_used += prog_data->base.binding_table.gather_texture_start;
2404 } else {
2405 max_used += prog_data->base.binding_table.texture_start;
2406 }
2407
2408 brw_mark_surface_used(&prog_data->base, max_used);
2409
2410 /* Emit code to evaluate the actual indexing expression */
2411 nonconst_sampler_index->accept(this);
2412 dst_reg temp(this, glsl_type::uint_type);
2413 emit(ADD(temp, this->result, src_reg(sampler)))
2414 ->force_writemask_all = true;
2415 sampler_reg = src_reg(temp);
2416 } else {
2417 /* Single sampler, or constant array index; the indexing expression
2418 * is just an immediate.
2419 */
2420 sampler_reg = src_reg(sampler);
2421 }
2422
2423 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2424 * emitting anything other than setting up the constant result.
2425 */
2426 if (ir->op == ir_tg4) {
2427 ir_constant *chan = ir->lod_info.component->as_constant();
2428 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2429 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2430 dst_reg result(this, ir->type);
2431 this->result = src_reg(result);
2432 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2433 return;
2434 }
2435 }
2436
2437 /* Should be lowered by do_lower_texture_projection */
2438 assert(!ir->projector);
2439
2440 /* Should be lowered */
2441 assert(!ir->offset || !ir->offset->type->is_array());
2442
2443 /* Generate code to compute all the subexpression trees. This has to be
2444 * done before loading any values into MRFs for the sampler message since
2445 * generating these values may involve SEND messages that need the MRFs.
2446 */
2447 src_reg coordinate;
2448 if (ir->coordinate) {
2449 ir->coordinate->accept(this);
2450 coordinate = this->result;
2451 }
2452
2453 src_reg shadow_comparitor;
2454 if (ir->shadow_comparitor) {
2455 ir->shadow_comparitor->accept(this);
2456 shadow_comparitor = this->result;
2457 }
2458
2459 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2460 src_reg offset_value;
2461 if (has_nonconstant_offset) {
2462 ir->offset->accept(this);
2463 offset_value = src_reg(this->result);
2464 }
2465
2466 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2467 src_reg lod, dPdx, dPdy, sample_index, mcs;
2468 switch (ir->op) {
2469 case ir_tex:
2470 lod = src_reg(0.0f);
2471 lod_type = glsl_type::float_type;
2472 break;
2473 case ir_txf:
2474 case ir_txl:
2475 case ir_txs:
2476 ir->lod_info.lod->accept(this);
2477 lod = this->result;
2478 lod_type = ir->lod_info.lod->type;
2479 break;
2480 case ir_query_levels:
2481 lod = src_reg(0);
2482 lod_type = glsl_type::int_type;
2483 break;
2484 case ir_txf_ms:
2485 ir->lod_info.sample_index->accept(this);
2486 sample_index = this->result;
2487 sample_index_type = ir->lod_info.sample_index->type;
2488
2489 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2490 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2491 else
2492 mcs = src_reg(0u);
2493 break;
2494 case ir_txd:
2495 ir->lod_info.grad.dPdx->accept(this);
2496 dPdx = this->result;
2497
2498 ir->lod_info.grad.dPdy->accept(this);
2499 dPdy = this->result;
2500
2501 lod_type = ir->lod_info.grad.dPdx->type;
2502 break;
2503 case ir_txb:
2504 case ir_lod:
2505 case ir_tg4:
2506 break;
2507 }
2508
2509 enum opcode opcode;
2510 switch (ir->op) {
2511 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2512 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2513 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2514 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2515 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2516 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2517 case ir_tg4: opcode = has_nonconstant_offset
2518 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2519 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2520 case ir_txb:
2521 unreachable("TXB is not valid for vertex shaders.");
2522 case ir_lod:
2523 unreachable("LOD is not valid for vertex shaders.");
2524 default:
2525 unreachable("Unrecognized tex op");
2526 }
2527
2528 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2529
2530 if (ir->offset != NULL && ir->op != ir_txf)
2531 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2532
2533 /* Stuff the channel select bits in the top of the texture offset */
2534 if (ir->op == ir_tg4)
2535 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2536
2537 /* The message header is necessary for:
2538 * - Gen4 (always)
2539 * - Texel offsets
2540 * - Gather channel selection
2541 * - Sampler indices too large to fit in a 4-bit value.
2542 */
2543 inst->header_present =
2544 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2545 is_high_sampler(brw, sampler_reg);
2546 inst->base_mrf = 2;
2547 inst->mlen = inst->header_present + 1; /* always at least one */
2548 inst->dst = dst_reg(this, ir->type);
2549 inst->dst.writemask = WRITEMASK_XYZW;
2550 inst->shadow_compare = ir->shadow_comparitor != NULL;
2551
2552 inst->src[1] = sampler_reg;
2553
2554 /* MRF for the first parameter */
2555 int param_base = inst->base_mrf + inst->header_present;
2556
2557 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2558 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2559 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2560 } else {
2561 /* Load the coordinate */
2562 /* FINISHME: gl_clamp_mask and saturate */
2563 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2564 int zero_mask = 0xf & ~coord_mask;
2565
2566 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2567 coordinate));
2568
2569 if (zero_mask != 0) {
2570 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2571 src_reg(0)));
2572 }
2573 /* Load the shadow comparitor */
2574 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2575 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2576 WRITEMASK_X),
2577 shadow_comparitor));
2578 inst->mlen++;
2579 }
2580
2581 /* Load the LOD info */
2582 if (ir->op == ir_tex || ir->op == ir_txl) {
2583 int mrf, writemask;
2584 if (brw->gen >= 5) {
2585 mrf = param_base + 1;
2586 if (ir->shadow_comparitor) {
2587 writemask = WRITEMASK_Y;
2588 /* mlen already incremented */
2589 } else {
2590 writemask = WRITEMASK_X;
2591 inst->mlen++;
2592 }
2593 } else /* brw->gen == 4 */ {
2594 mrf = param_base;
2595 writemask = WRITEMASK_W;
2596 }
2597 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2598 } else if (ir->op == ir_txf) {
2599 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2600 } else if (ir->op == ir_txf_ms) {
2601 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2602 sample_index));
2603 if (brw->gen >= 7) {
2604 /* MCS data is in the first channel of `mcs`, but we need to get it into
2605 * the .y channel of the second vec4 of params, so replicate .x across
2606 * the whole vec4 and then mask off everything except .y
2607 */
2608 mcs.swizzle = BRW_SWIZZLE_XXXX;
2609 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2610 mcs));
}
2611 inst->mlen++;
2612 } else if (ir->op == ir_txd) {
2613 const glsl_type *type = lod_type;
2614
2615 if (brw->gen >= 5) {
2616 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2617 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2618 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2619 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2620 inst->mlen++;
2621
2622 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2623 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2624 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2625 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2626 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2627 inst->mlen++;
2628
2629 if (ir->shadow_comparitor) {
2630 emit(MOV(dst_reg(MRF, param_base + 2,
2631 ir->shadow_comparitor->type, WRITEMASK_Z),
2632 shadow_comparitor));
2633 }
2634 }
2635 } else /* brw->gen == 4 */ {
2636 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2637 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2638 inst->mlen += 2;
2639 }
2640 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2641 if (ir->shadow_comparitor) {
2642 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2643 shadow_comparitor));
2644 }
2645
2646 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2647 offset_value));
2648 inst->mlen++;
2649 }
2650 }
2651
2652 emit(inst);
2653
2654 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2655 * faces * layers, but the spec requires just layers.
2656 */
2657 if (ir->op == ir_txs) {
2658 glsl_type const *type = ir->sampler->type;
2659 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2660 type->sampler_array) {
2661 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2662 writemask(inst->dst, WRITEMASK_Z),
2663 src_reg(inst->dst), src_reg(6));
2664 }
2665 }
2666
2667 if (brw->gen == 6 && ir->op == ir_tg4) {
2668 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2669 }
2670
2671 swizzle_result(ir, src_reg(inst->dst), sampler);
2672 }
2673
2674 /**
2675 * Apply workarounds for Gen6 gather with UINT/SINT
2676 */
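/* Illustrative example (assumed behavior, not from the original
 * source): for an 8-bit signed format the gathered texel comes back
 * as UNORM, so a stored -1 (0xff) reads as 1.0.  The MUL by 255
 * recovers 255, and the SHL/ASR pair by (32 - 8) sign-extends it back
 * to -1.
 */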
2677 void
2678 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2679 {
2680 if (!wa)
2681 return;
2682
2683 int width = (wa & WA_8BIT) ? 8 : 16;
2684 dst_reg dst_f = dst;
2685 dst_f.type = BRW_REGISTER_TYPE_F;
2686
2687 /* Convert from UNORM to UINT */
2688 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2689 emit(MOV(dst, src_reg(dst_f)));
2690
2691 if (wa & WA_SIGN) {
2692 /* Reinterpret the UINT value as a signed INT value by
2693 * shifting the sign bit into place, then shifting back
2694 * preserving sign.
2695 */
2696 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2697 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2698 }
2699 }
2700
2701 /**
2702 * Set up the gather channel based on the swizzle, for gather4.
2703 */
2704 uint32_t
2705 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2706 {
2707 ir_constant *chan = ir->lod_info.component->as_constant();
2708 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2709 switch (swiz) {
2710 case SWIZZLE_X: return 0;
2711 case SWIZZLE_Y:
2712 /* gather4 sampler is broken for green channel on RG32F --
2713 * we must ask for blue instead.
2714 */
2715 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2716 return 2;
2717 return 1;
2718 case SWIZZLE_Z: return 2;
2719 case SWIZZLE_W: return 3;
2720 default:
2721 unreachable("Not reached"); /* zero, one swizzles handled already */
2722 }
2723 }
2724
2725 void
2726 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2727 {
2728 int s = key->tex.swizzles[sampler];
2729
2730 this->result = src_reg(this, ir->type);
2731 dst_reg swizzled_result(this->result);
2732
2733 if (ir->op == ir_query_levels) {
2734 /* # levels is in .w */
2735 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2736 emit(MOV(swizzled_result, orig_val));
2737 return;
2738 }
2739
2740 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2741 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2742 emit(MOV(swizzled_result, orig_val));
2743 return;
2744 }
2745
2746
2747 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2748 int swizzle[4] = {0};
2749
2750 for (int i = 0; i < 4; i++) {
2751 switch (GET_SWZ(s, i)) {
2752 case SWIZZLE_ZERO:
2753 zero_mask |= (1 << i);
2754 break;
2755 case SWIZZLE_ONE:
2756 one_mask |= (1 << i);
2757 break;
2758 default:
2759 copy_mask |= (1 << i);
2760 swizzle[i] = GET_SWZ(s, i);
2761 break;
2762 }
2763 }
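/* Illustrative example: a texture swizzle of (R, G, ZERO, ONE) yields
 * copy_mask = .xy (with source swizzle xyxx), zero_mask = .z and
 * one_mask = .w, so the code below emits three MOVs.
 */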
2764
2765 if (copy_mask) {
2766 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2767 swizzled_result.writemask = copy_mask;
2768 emit(MOV(swizzled_result, orig_val));
2769 }
2770
2771 if (zero_mask) {
2772 swizzled_result.writemask = zero_mask;
2773 emit(MOV(swizzled_result, src_reg(0.0f)));
2774 }
2775
2776 if (one_mask) {
2777 swizzled_result.writemask = one_mask;
2778 emit(MOV(swizzled_result, src_reg(1.0f)));
2779 }
2780 }
2781
2782 void
2783 vec4_visitor::visit(ir_return *)
2784 {
2785 unreachable("not reached");
2786 }
2787
2788 void
2789 vec4_visitor::visit(ir_discard *)
2790 {
2791 unreachable("not reached");
2792 }
2793
2794 void
2795 vec4_visitor::visit(ir_if *ir)
2796 {
2797 /* Don't point the annotation at the if statement, because then it plus
2798 * the then and else blocks get printed.
2799 */
2800 this->base_ir = ir->condition;
2801
2802 if (brw->gen == 6) {
2803 emit_if_gen6(ir);
2804 } else {
2805 enum brw_predicate predicate;
2806 emit_bool_to_cond_code(ir->condition, &predicate);
2807 emit(IF(predicate));
2808 }
2809
2810 visit_instructions(&ir->then_instructions);
2811
2812 if (!ir->else_instructions.is_empty()) {
2813 this->base_ir = ir->condition;
2814 emit(BRW_OPCODE_ELSE);
2815
2816 visit_instructions(&ir->else_instructions);
2817 }
2818
2819 this->base_ir = ir->condition;
2820 emit(BRW_OPCODE_ENDIF);
2821 }
2822
2823 void
2824 vec4_visitor::visit(ir_emit_vertex *)
2825 {
2826 unreachable("not reached");
2827 }
2828
2829 void
2830 vec4_visitor::visit(ir_end_primitive *)
2831 {
2832 unreachable("not reached");
2833 }
2834
2835 void
2836 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2837 dst_reg dst, src_reg offset,
2838 src_reg src0, src_reg src1)
2839 {
2840 unsigned mlen = 0;
2841
2842 /* Set the atomic operation offset. */
2843 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2844 mlen++;
2845
2846 /* Set the atomic operation arguments. */
2847 if (src0.file != BAD_FILE) {
2848 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2849 mlen++;
2850 }
2851
2852 if (src1.file != BAD_FILE) {
2853 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2854 mlen++;
2855 }
2856
2857 /* Emit the instruction. Note that this maps to the normal SIMD8
2858 * untyped atomic message on Ivy Bridge, but that's OK because
2859 * unused channels will be masked out.
2860 */
2861 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2862 src_reg(atomic_op), src_reg(surf_index));
2863 inst->base_mrf = 0;
2864 inst->mlen = mlen;
2865 }
2866
2867 void
2868 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2869 src_reg offset)
2870 {
2871 /* Set the surface read offset. */
2872 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2873
2874 /* Emit the instruction. Note that this maps to the normal SIMD8
2875 * untyped surface read message, but that's OK because unused
2876 * channels will be masked out.
2877 */
2878 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2879 dst, src_reg(surf_index));
2880 inst->base_mrf = 0;
2881 inst->mlen = 1;
2882 }
2883
2884 void
2885 vec4_visitor::emit_ndc_computation()
2886 {
2887 /* Get the position */
2888 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2889
2890 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2891 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2892 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2893
2894 current_annotation = "NDC";
2895 dst_reg ndc_w = ndc;
2896 ndc_w.writemask = WRITEMASK_W;
2897 src_reg pos_w = pos;
2898 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2899 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2900
2901 dst_reg ndc_xyz = ndc;
2902 ndc_xyz.writemask = WRITEMASK_XYZ;
2903
2904 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2905 }
2906
2907 void
2908 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2909 {
2910 if (brw->gen < 6 &&
2911 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2912 key->userclip_active || brw->has_negative_rhw_bug)) {
2913 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2914 dst_reg header1_w = header1;
2915 header1_w.writemask = WRITEMASK_W;
2916
2917 emit(MOV(header1, 0u));
2918
2919 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2920 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2921
2922 current_annotation = "Point size";
2923 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2924 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2925 }
2926
2927 if (key->userclip_active) {
2928 current_annotation = "Clipping flags";
2929 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2930 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2931
2932 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2933 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2934 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2935
2936 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2937 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2938 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2939 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2940 }
2941
2942 /* i965 clipping workaround:
2943 * 1) Test for -ve rhw
2944 * 2) If set,
2945 * set ndc = (0,0,0,0)
2946 * set ucp[6] = 1
2947 *
2948 * Later, clipping will detect ucp[6] and ensure the primitive is
2949 * clipped against all fixed planes.
2950 */
2951 if (brw->has_negative_rhw_bug) {
2952 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2953 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2954 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2955 vec4_instruction *inst;
2956 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2957 inst->predicate = BRW_PREDICATE_NORMAL;
2958 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2959 inst->predicate = BRW_PREDICATE_NORMAL;
2960 }
2961
2962 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2963 } else if (brw->gen < 6) {
2964 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2965 } else {
2966 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2967 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2968 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2969 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2970 }
2971 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2972 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2973 src_reg(output_reg[VARYING_SLOT_LAYER])));
2974 }
2975 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2976 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2977 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2978 }
2979 }
2980 }
2981
2982 void
2983 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2984 {
2985 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2986 *
2987 * "If a linked set of shaders forming the vertex stage contains no
2988 * static write to gl_ClipVertex or gl_ClipDistance, but the
2989 * application has requested clipping against user clip planes through
2990 * the API, then the coordinate written to gl_Position is used for
2991 * comparison against the user clip planes."
2992 *
2993 * This function is only called if the shader didn't write to
2994 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2995 * if the user wrote to it; otherwise we use gl_Position.
2996 */
2997 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2998 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2999 clip_vertex = VARYING_SLOT_POS;
3000 }
3001
3002 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3003 ++i) {
3004 reg.writemask = 1 << i;
3005 emit(DP4(reg,
3006 src_reg(output_reg[clip_vertex]),
3007 src_reg(this->userplane[i + offset])));
3008 }
3009 }
3010
3011 void
3012 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3013 {
3014 assert (varying < VARYING_SLOT_MAX);
3015 reg.type = output_reg[varying].type;
3016 current_annotation = output_reg_annotation[varying];
3017 /* Copy the register, saturating if necessary */
3018 vec4_instruction *inst = emit(MOV(reg,
3019 src_reg(output_reg[varying])));
3020 if ((varying == VARYING_SLOT_COL0 ||
3021 varying == VARYING_SLOT_COL1 ||
3022 varying == VARYING_SLOT_BFC0 ||
3023 varying == VARYING_SLOT_BFC1) &&
3024 key->clamp_vertex_color) {
3025 inst->saturate = true;
3026 }
3027 }
3028
3029 void
3030 vec4_visitor::emit_urb_slot(int mrf, int varying)
3031 {
3032 struct brw_reg hw_reg = brw_message_reg(mrf);
3033 dst_reg reg = dst_reg(MRF, mrf);
3034 reg.type = BRW_REGISTER_TYPE_F;
3035
3036 switch (varying) {
3037 case VARYING_SLOT_PSIZ:
3038 /* PSIZ is always in slot 0, and is coupled with other flags. */
3039 current_annotation = "indices, point width, clip flags";
3040 emit_psiz_and_flags(hw_reg);
3041 break;
3042 case BRW_VARYING_SLOT_NDC:
3043 current_annotation = "NDC";
3044 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3045 break;
3046 case VARYING_SLOT_POS:
3047 current_annotation = "gl_Position";
3048 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3049 break;
3050 case VARYING_SLOT_EDGE:
3051 /* This is present when doing unfilled polygons. We're supposed to copy
3052 * the edge flag from the user-provided vertex array
3053 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3054 * of that attribute (starts as 1.0f). This is then used in clipping to
3055 * determine which edges should be drawn as wireframe.
3056 */
3057 current_annotation = "edge flag";
3058 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3059 glsl_type::float_type, WRITEMASK_XYZW))));
3060 break;
3061 case BRW_VARYING_SLOT_PAD:
3062 /* No need to write to this slot */
3063 break;
3064 default:
3065 emit_generic_urb_slot(reg, varying);
3066 break;
3067 }
3068 }
3069
3070 static int
3071 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3072 {
3073 if (brw->gen >= 6) {
3074 /* URB data written (does not include the message header reg) must
3075 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3076 * section 5.4.3.2.2: URB_INTERLEAVED.
3077 *
3078 * URB entries are allocated on a multiple of 1024 bits, so an
3079 * extra 128 bits written here to make the end align to 256 is
3080 * no problem.
3081 */
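/* Illustrative example: 1 header MRF plus 3 payload MRFs (mlen == 4)
 * is only 1.5 rows of payload, so mlen is bumped to 5 to round the
 * payload up to a full 2 rows (256-bit aligned).
 */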
3082 if ((mlen % 2) != 1)
3083 mlen++;
3084 }
3085
3086 return mlen;
3087 }
3088
3089
3090 /**
3091 * Generates the VUE payload plus the necessary URB write instructions to
3092 * output it.
3093 *
3094 * The VUE layout is documented in Volume 2a.
3095 */
3096 void
3097 vec4_visitor::emit_vertex()
3098 {
3099 /* MRF 0 is reserved for the debugger, so start with message header
3100 * in MRF 1.
3101 */
3102 int base_mrf = 1;
3103 int mrf = base_mrf;
3104 /* In the process of generating our URB write message contents, we
3105 * may need to unspill a register or load from an array. Those
3106 * reads would use MRFs 14-15.
3107 */
3108 int max_usable_mrf = 13;
3109
3110 /* The following assertion verifies that max_usable_mrf causes an
3111 * even-numbered amount of URB write data, which will meet gen6's
3112 * requirements for length alignment.
3113 */
3114 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3115
3116 /* First mrf is the g0-based message header containing URB handles and
3117 * such.
3118 */
3119 emit_urb_write_header(mrf++);
3120
3121 if (brw->gen < 6) {
3122 emit_ndc_computation();
3123 }
3124
3125 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3126 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3127 current_annotation = "user clip distances";
3128
3129 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3130 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3131
3132 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3133 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3134 }
3135
3136 /* We may need to split this up into several URB writes, so do them in a
3137 * loop.
3138 */
3139 int slot = 0;
3140 bool complete = false;
3141 do {
3142 /* URB offset is in URB row increments, and each of our MRFs is half of
3143 * one of those, since we're doing interleaved writes.
3144 */
3145 int offset = slot / 2;
3146
3147 mrf = base_mrf + 1;
3148 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3149 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3150
3151 /* If this was max_usable_mrf, we can't fit anything more into this
3152 * URB WRITE.
3153 */
3154 if (mrf > max_usable_mrf) {
3155 slot++;
3156 break;
3157 }
3158 }
3159
3160 complete = slot >= prog_data->vue_map.num_slots;
3161 current_annotation = "URB write";
3162 vec4_instruction *inst = emit_urb_write_opcode(complete);
3163 inst->base_mrf = base_mrf;
3164 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3165 inst->offset += offset;
3166 } while(!complete);
3167 }
3168
3169
3170 src_reg
3171 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3172 src_reg *reladdr, int reg_offset)
3173 {
3174 /* Because we store the values to scratch interleaved like our
3175 * vertex data, we need to scale the vec4 index by 2.
3176 */
3177 int message_header_scale = 2;
3178
3179 /* Pre-gen6, the message header uses byte offsets instead of vec4
3180 * (16-byte) offset units.
3181 */
3182 if (brw->gen < 6)
3183 message_header_scale *= 16;
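/* Illustrative values: a constant reg_offset of 3 becomes the
 * immediate 6 on gen6+ (interleaved vec4 rows) and 96 on older parts,
 * where the header expects a byte offset (3 * 2 * 16).
 */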
3184
3185 if (reladdr) {
3186 src_reg index = src_reg(this, glsl_type::int_type);
3187
3188 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3189 emit_before(inst, MUL(dst_reg(index),
3190 index, src_reg(message_header_scale)));
3191
3192 return index;
3193 } else {
3194 return src_reg(reg_offset * message_header_scale);
3195 }
3196 }
3197
3198 src_reg
3199 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3200 src_reg *reladdr, int reg_offset)
3201 {
3202 if (reladdr) {
3203 src_reg index = src_reg(this, glsl_type::int_type);
3204
3205 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3206
3207 /* Pre-gen6, the message header uses byte offsets instead of vec4
3208 * (16-byte) offset units.
3209 */
3210 if (brw->gen < 6) {
3211 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3212 }
3213
3214 return index;
3215 } else if (brw->gen >= 8) {
3216 /* Store the offset in a GRF so we can send-from-GRF. */
3217 src_reg offset = src_reg(this, glsl_type::int_type);
3218 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3219 return offset;
3220 } else {
3221 int message_header_scale = brw->gen < 6 ? 16 : 1;
3222 return src_reg(reg_offset * message_header_scale);
3223 }
3224 }
3225
3226 /**
3227 * Emits an instruction before @inst to load the value named by @orig_src
3228 * from scratch space at @base_offset to @temp.
3229 *
3230 * @base_offset is measured in 32-byte units (the size of a register).
3231 */
3232 void
3233 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3234 dst_reg temp, src_reg orig_src,
3235 int base_offset)
3236 {
3237 int reg_offset = base_offset + orig_src.reg_offset;
3238 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3239
3240 emit_before(inst, SCRATCH_READ(temp, index));
3241 }
3242
3243 /**
3244 * Emits an instruction after @inst to store the value to be written
3245 * to @orig_dst to scratch space at @base_offset, from @temp.
3246 *
3247 * @base_offset is measured in 32-byte units (the size of a register).
3248 */
3249 void
3250 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3251 {
3252 int reg_offset = base_offset + inst->dst.reg_offset;
3253 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3254
3255 /* Create a temporary register to store *inst's result in.
3256 *
3257 * We have to be careful in MOVing from our temporary result register in
3258 * the scratch write. If we swizzle from channels of the temporary that
3259 * weren't initialized, it will confuse live interval analysis, which will
3260 * make spilling fail to make progress.
3261 */
3262 src_reg temp = src_reg(this, glsl_type::vec4_type);
3263 temp.type = inst->dst.type;
3264 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3265 int swizzles[4];
3266 for (int i = 0; i < 4; i++)
3267 if (inst->dst.writemask & (1 << i))
3268 swizzles[i] = i;
3269 else
3270 swizzles[i] = first_writemask_chan;
3271 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3272 swizzles[2], swizzles[3]);
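/* Illustrative example: if inst only writes .xz, the temporary is
 * read back with swizzle xxzx, so the scratch write never sources the
 * uninitialized y/w channels.
 */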
3273
3274 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3275 inst->dst.writemask));
3276 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3277 write->predicate = inst->predicate;
3278 write->ir = inst->ir;
3279 write->annotation = inst->annotation;
3280 inst->insert_after(write);
3281
3282 inst->dst.file = temp.file;
3283 inst->dst.reg = temp.reg;
3284 inst->dst.reg_offset = temp.reg_offset;
3285 inst->dst.reladdr = NULL;
3286 }
3287
3288 /**
3289 * We can't generally support array access in GRF space, because a
3290 * single instruction's destination can only span 2 contiguous
3291 * registers. So, we send all GRF arrays that get variable index
3292 * access to scratch space.
3293 */
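/* Hypothetical before/after (not in the original source): a write
 * like "MOV grf5[reladdr], x" becomes "MOV tmp, x" plus a
 * SCRATCH_WRITE of tmp at grf5's scratch location, and any source
 * with a reladdr first gets a SCRATCH_READ into a fresh temporary.
 */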
3294 void
3295 vec4_visitor::move_grf_array_access_to_scratch()
3296 {
3297 int scratch_loc[this->virtual_grf_count];
3298
3299 for (int i = 0; i < this->virtual_grf_count; i++) {
3300 scratch_loc[i] = -1;
3301 }
3302
3303 /* First, calculate the set of virtual GRFs that need to be punted
3304 * to scratch due to having any array access on them, and where in
3305 * scratch.
3306 */
3307 foreach_in_list(vec4_instruction, inst, &instructions) {
3308 if (inst->dst.file == GRF && inst->dst.reladdr &&
3309 scratch_loc[inst->dst.reg] == -1) {
3310 scratch_loc[inst->dst.reg] = c->last_scratch;
3311 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3312 }
3313
3314 for (int i = 0 ; i < 3; i++) {
3315 src_reg *src = &inst->src[i];
3316
3317 if (src->file == GRF && src->reladdr &&
3318 scratch_loc[src->reg] == -1) {
3319 scratch_loc[src->reg] = c->last_scratch;
3320 c->last_scratch += this->virtual_grf_sizes[src->reg];
3321 }
3322 }
3323 }
3324
3325 /* Now, for anything that will be accessed through scratch, rewrite
3326 * it to load/store. Note that this is a _safe list walk, because
3327 * we may generate a new scratch_write instruction after the one
3328 * we're processing.
3329 */
3330 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3331 /* Set up the annotation tracking for newly generated instructions. */
3332 base_ir = inst->ir;
3333 current_annotation = inst->annotation;
3334
3335 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3336 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3337 }
3338
3339 for (int i = 0 ; i < 3; i++) {
3340 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3341 continue;
3342
3343 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3344
3345 emit_scratch_read(inst, temp, inst->src[i],
3346 scratch_loc[inst->src[i].reg]);
3347
3348 inst->src[i].file = temp.file;
3349 inst->src[i].reg = temp.reg;
3350 inst->src[i].reg_offset = temp.reg_offset;
3351 inst->src[i].reladdr = NULL;
3352 }
3353 }
3354 }
3355
3356 /**
3357 * Emits an instruction before @inst to load the value named by @orig_src
3358 * from the pull constant buffer (surface) at @base_offset to @temp.
3359 */
3360 void
3361 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3362 dst_reg temp, src_reg orig_src,
3363 int base_offset)
3364 {
3365 int reg_offset = base_offset + orig_src.reg_offset;
3366 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3367 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3368 vec4_instruction *load;
3369
3370 if (brw->gen >= 7) {
3371 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3372 grf_offset.type = offset.type;
3373 emit_before(inst, MOV(grf_offset, offset));
3374
3375 load = new(mem_ctx) vec4_instruction(this,
3376 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3377 temp, index, src_reg(grf_offset));
3378 } else {
3379 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3380 temp, index, offset);
3381 load->base_mrf = 14;
3382 load->mlen = 1;
3383 }
3384 emit_before(inst, load);
3385 }
3386
3387 /**
3388 * Implements array access of uniforms by inserting a
3389 * PULL_CONSTANT_LOAD instruction.
3390 *
3391 * Unlike temporary GRF array access (where we don't support it due to
3392 * the difficulty of doing relative addressing on instruction
3393 * destinations), we could potentially do array access of uniforms
3394 * that were loaded in GRF space as push constants. In real-world
3395 * usage we've seen, though, the arrays being used are always larger
3396 * than we could load as push constants, so just always move all
3397 * uniform array access out to a pull constant buffer.
3398 */
3399 void
3400 vec4_visitor::move_uniform_array_access_to_pull_constants()
3401 {
3402 int pull_constant_loc[this->uniforms];
3403
3404 for (int i = 0; i < this->uniforms; i++) {
3405 pull_constant_loc[i] = -1;
3406 }
3407
3408 /* Walk through and find array access of uniforms. Put a copy of that
3409 * uniform in the pull constant buffer.
3410 *
3411 * Note that we don't move constant-indexed accesses to arrays. No
3412 * testing has been done of the performance impact of this choice.
3413 */
3414 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3415 for (int i = 0 ; i < 3; i++) {
3416 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3417 continue;
3418
3419 int uniform = inst->src[i].reg;
3420
3421 /* If this array isn't already present in the pull constant buffer,
3422 * add it.
3423 */
3424 if (pull_constant_loc[uniform] == -1) {
3425 const gl_constant_value **values =
3426 &stage_prog_data->param[uniform * 4];
3427
3428 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3429
3430 assert(uniform < uniform_array_size);
3431 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3432 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3433 = values[j];
3434 }
3435 }
3436
3437 /* Set up the annotation tracking for newly generated instructions. */
3438 base_ir = inst->ir;
3439 current_annotation = inst->annotation;
3440
3441 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3442
3443 emit_pull_constant_load(inst, temp, inst->src[i],
3444 pull_constant_loc[uniform]);
3445
3446 inst->src[i].file = temp.file;
3447 inst->src[i].reg = temp.reg;
3448 inst->src[i].reg_offset = temp.reg_offset;
3449 inst->src[i].reladdr = NULL;
3450 }
3451 }
3452
3453 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3454 * no need to track them as larger-than-vec4 objects. This will be
3455 * relied on in cutting out unused uniform vectors from push
3456 * constants.
3457 */
3458 split_uniform_registers();
3459 }
3460
3461 void
3462 vec4_visitor::resolve_ud_negate(src_reg *reg)
3463 {
3464 if (reg->type != BRW_REGISTER_TYPE_UD ||
3465 !reg->negate)
3466 return;
3467
3468 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3469 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3470 *reg = temp;
3471 }
3472
3473 vec4_visitor::vec4_visitor(struct brw_context *brw,
3474 struct brw_vec4_compile *c,
3475 struct gl_program *prog,
3476 const struct brw_vec4_prog_key *key,
3477 struct brw_vec4_prog_data *prog_data,
3478 struct gl_shader_program *shader_prog,
3479 gl_shader_stage stage,
3480 void *mem_ctx,
3481 bool debug_flag,
3482 bool no_spills,
3483 shader_time_shader_type st_base,
3484 shader_time_shader_type st_written,
3485 shader_time_shader_type st_reset)
3486 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3487 c(c),
3488 key(key),
3489 prog_data(prog_data),
3490 sanity_param_count(0),
3491 fail_msg(NULL),
3492 first_non_payload_grf(0),
3493 need_all_constants_in_pull_buffer(false),
3494 debug_flag(debug_flag),
3495 no_spills(no_spills),
3496 st_base(st_base),
3497 st_written(st_written),
3498 st_reset(st_reset)
3499 {
3500 this->mem_ctx = mem_ctx;
3501 this->failed = false;
3502
3503 this->base_ir = NULL;
3504 this->current_annotation = NULL;
3505 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3506
3507 this->variable_ht = hash_table_ctor(0,
3508 hash_table_pointer_hash,
3509 hash_table_pointer_compare);
3510
3511 this->virtual_grf_start = NULL;
3512 this->virtual_grf_end = NULL;
3513 this->virtual_grf_sizes = NULL;
3514 this->virtual_grf_count = 0;
3515 this->virtual_grf_reg_map = NULL;
3516 this->virtual_grf_reg_count = 0;
3517 this->virtual_grf_array_size = 0;
3518 this->live_intervals_valid = false;
3519
3520 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3521
3522 this->uniforms = 0;
3523
3524 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3525 * at least one. See setup_uniforms() in brw_vec4.cpp.
3526 */
3527 this->uniform_array_size = 1;
3528 if (prog_data) {
3529 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3530 }
3531
3532 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3533 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3534 }
3535
3536 vec4_visitor::~vec4_visitor()
3537 {
3538 hash_table_dtor(this->variable_ht);
3539 }
3540
3541
3542 void
3543 vec4_visitor::fail(const char *format, ...)
3544 {
3545 va_list va;
3546 char *msg;
3547
3548 if (failed)
3549 return;
3550
3551 failed = true;
3552
3553 va_start(va, format);
3554 msg = ralloc_vasprintf(mem_ctx, format, va);
3555 va_end(va);
3556 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3557
3558 this->fail_msg = msg;
3559
3560 if (debug_flag) {
3561 fprintf(stderr, "%s", msg);
3562 }
3563 }
3564
3565 } /* namespace brw */