i965: Handle ir_triop_csel in emit_bool_to_cond_code().
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
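/* A usage sketch (hedged; the operand names are hypothetical, but the pattern
 * mirrors emit_minmax() later in this file):
 *
 *    emit(CMP(dst_null_d(), a, b, BRW_CONDITIONAL_GE));
 *    inst = emit(BRW_OPCODE_SEL, dst, a, b);
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 *
 * i.e. run the comparison for its flag side effect, then let a predicated
 * instruction consume the result.
 */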
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
 226     * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
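   /* Illustrative sketch (operand names are hypothetical): a caller lowering
    * fma(u, a, b) with a uniform first argument does
    *
    *    src_reg op0 = fix_3src_operand(u);   // may emit MOV tmp, u
    *    src_reg op1 = fix_3src_operand(a);   // plain GRFs come back unchanged
    *    src_reg op2 = fix_3src_operand(b);
    *    emit(MAD(dst, op2, op1, op0));
    *
    * which is the pattern the ir_triop_fma case below follows.
    */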
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
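   /* As a concrete (hypothetical) example: on gen6, exp2(-x) becomes
    *
    *    mov       tmp, -x
    *    math.exp2 dst, tmp
    *
    * so the negate is resolved by the MOV instead of being silently dropped
    * by the math unit; on gen7 only immediate operands get this treatment.
    */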
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
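   /* In rough pseudocode, the sequence below computes
    *
    *    lo  = f32to16(src0.x);            // tmp.x
    *    hi  = f32to16(src0.y);            // tmp.y
    *    dst = (hi << 16) | lo;            // SHL + OR
    *
    * which is packHalf2x16's defined result.
    */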
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
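   /* Roughly, the three instructions below compute
    *
    *    tmp.x  = src0 & 0xffff;           // low half  -> result .x
    *    tmp.y  = src0 >> 16;              // high half -> result .y
    *    dst.xy = f16to32(tmp.xy);
    */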
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
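	 /* For example, float, vec2 and vec4 all count as one slot here; a
	  * mat3 returns matrix_columns (3) above, and an array like vec2[8]
	  * yields 8 via the GLSL_TYPE_ARRAY case below.
	  */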
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
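   /* For instance, a (hypothetical) declaration
    *
    *    uniform struct { vec4 a; vec4 b; } s;
    *
    * yields storage entries named "s.a" and "s.b", both of which pass the
    * name-prefix-plus-'.'/'[' test below when ir->name is "s".
    */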
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been setup by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[3];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 3);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 case ir_triop_csel: {
856 /* Expand the boolean condition into the flag register. */
857 inst = emit(MOV(dst_null_d(), op[0]));
858 inst->conditional_mod = BRW_CONDITIONAL_NZ;
859
860 /* Select which boolean to return. */
861 dst_reg temp(this, expr->operands[1]->type);
862 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
863 inst->predicate = BRW_PREDICATE_NORMAL;
864
865 /* Expand the result to a condition code. */
866 inst = emit(MOV(dst_null_d(), src_reg(temp)));
867 inst->conditional_mod = BRW_CONDITIONAL_NZ;
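         /* Putting the three steps together, a (hypothetical) condition like
          *
          *    if (b ? x : y) ...
          *
          * with bool b, x, y lowers to roughly
          *
          *    mov.nz.f0  null, b      // condition -> flag
          *    (+f0) sel  tmp, x, y    // pick one of the booleans
          *    mov.nz.f0  null, tmp    // selected value -> flag for the branch
          */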
868 break;
869 }
870
871 default:
872 unreachable("not reached");
873 }
874 return;
875 }
876
877 ir->accept(this);
878
879 resolve_ud_negate(&this->result);
880
881 if (brw->gen >= 6) {
882 vec4_instruction *inst = emit(AND(dst_null_d(),
883 this->result, src_reg(1)));
884 inst->conditional_mod = BRW_CONDITIONAL_NZ;
885 } else {
886 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
887 inst->conditional_mod = BRW_CONDITIONAL_NZ;
888 }
889 }
890
891 /**
892 * Emit a gen6 IF statement with the comparison folded into the IF
893 * instruction.
894 */
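/* For a condition like "a < b" this produces, roughly,
 *
 *    if.l.f0  a, b
 *
 * via the two-source IF() helper above, rather than the separate flag-setting
 * compare plus predicated IF that the pre-gen6 path needs.
 */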
895 void
896 vec4_visitor::emit_if_gen6(ir_if *ir)
897 {
898 ir_expression *expr = ir->condition->as_expression();
899
900 if (expr) {
901 src_reg op[2];
902 dst_reg temp;
903
904 assert(expr->get_num_operands() <= 2);
905 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
906 expr->operands[i]->accept(this);
907 op[i] = this->result;
908 }
909
910 switch (expr->operation) {
911 case ir_unop_logic_not:
912 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
913 return;
914
915 case ir_binop_logic_xor:
916 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_binop_logic_or:
920 temp = dst_reg(this, glsl_type::bool_type);
921 emit(OR(temp, op[0], op[1]));
922 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
923 return;
924
925 case ir_binop_logic_and:
926 temp = dst_reg(this, glsl_type::bool_type);
927 emit(AND(temp, op[0], op[1]));
928 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
929 return;
930
931 case ir_unop_f2b:
932 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_unop_i2b:
936 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
937 return;
938
939 case ir_binop_greater:
940 case ir_binop_gequal:
941 case ir_binop_less:
942 case ir_binop_lequal:
943 case ir_binop_equal:
944 case ir_binop_nequal:
945 emit(IF(op[0], op[1],
946 brw_conditional_for_comparison(expr->operation)));
947 return;
948
949 case ir_binop_all_equal:
950 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
951 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
952 return;
953
954 case ir_binop_any_nequal:
955 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
956 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
957 return;
958
959 case ir_unop_any:
960 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
961 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
962 return;
963
964 default:
965 unreachable("not reached");
966 }
967 return;
968 }
969
970 ir->condition->accept(this);
971
972 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
973 }
974
975 void
976 vec4_visitor::visit(ir_variable *ir)
977 {
978 dst_reg *reg = NULL;
979
980 if (variable_storage(ir))
981 return;
982
983 switch (ir->data.mode) {
984 case ir_var_shader_in:
985 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
986 break;
987
988 case ir_var_shader_out:
989 reg = new(mem_ctx) dst_reg(this, ir->type);
990
991 for (int i = 0; i < type_size(ir->type); i++) {
992 output_reg[ir->data.location + i] = *reg;
993 output_reg[ir->data.location + i].reg_offset = i;
994 output_reg[ir->data.location + i].type =
995 brw_type_for_base_type(ir->type->get_scalar_type());
996 output_reg_annotation[ir->data.location + i] = ir->name;
997 }
998 break;
999
1000 case ir_var_auto:
1001 case ir_var_temporary:
1002 reg = new(mem_ctx) dst_reg(this, ir->type);
1003 break;
1004
1005 case ir_var_uniform:
1006 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1007
1008 /* Thanks to the lower_ubo_reference pass, we will see only
1009 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1010 * variables, so no need for them to be in variable_ht.
1011 *
1012 * Atomic counters take no uniform storage, no need to do
1013 * anything here.
1014 */
1015 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1016 return;
1017
1018 /* Track how big the whole uniform variable is, in case we need to put a
1019 * copy of its data into pull constants for array access.
1020 */
1021 assert(this->uniforms < uniform_array_size);
1022 this->uniform_size[this->uniforms] = type_size(ir->type);
1023
1024 if (!strncmp(ir->name, "gl_", 3)) {
1025 setup_builtin_uniform_values(ir);
1026 } else {
1027 setup_uniform_values(ir);
1028 }
1029 break;
1030
1031 case ir_var_system_value:
1032 reg = make_reg_for_system_value(ir);
1033 break;
1034
1035 default:
1036 unreachable("not reached");
1037 }
1038
1039 reg->type = brw_type_for_base_type(ir->type);
1040 hash_table_insert(this->variable_ht, reg, ir);
1041 }
1042
1043 void
1044 vec4_visitor::visit(ir_loop *ir)
1045 {
1046 /* We don't want debugging output to print the whole body of the
1047 * loop as the annotation.
1048 */
1049 this->base_ir = NULL;
1050
1051 emit(BRW_OPCODE_DO);
1052
1053 visit_instructions(&ir->body_instructions);
1054
1055 emit(BRW_OPCODE_WHILE);
1056 }
1057
1058 void
1059 vec4_visitor::visit(ir_loop_jump *ir)
1060 {
1061 switch (ir->mode) {
1062 case ir_loop_jump::jump_break:
1063 emit(BRW_OPCODE_BREAK);
1064 break;
1065 case ir_loop_jump::jump_continue:
1066 emit(BRW_OPCODE_CONTINUE);
1067 break;
1068 }
1069 }
1070
1071
1072 void
1073 vec4_visitor::visit(ir_function_signature *)
1074 {
1075 unreachable("not reached");
1076 }
1077
1078 void
1079 vec4_visitor::visit(ir_function *ir)
1080 {
1081 /* Ignore function bodies other than main() -- we shouldn't see calls to
1082 * them since they should all be inlined.
1083 */
1084 if (strcmp(ir->name, "main") == 0) {
1085 const ir_function_signature *sig;
1086 exec_list empty;
1087
1088 sig = ir->matching_signature(NULL, &empty, false);
1089
1090 assert(sig);
1091
1092 visit_instructions(&sig->body);
1093 }
1094 }
1095
1096 bool
1097 vec4_visitor::try_emit_mad(ir_expression *ir)
1098 {
1099 /* 3-src instructions were introduced in gen6. */
1100 if (brw->gen < 6)
1101 return false;
1102
1103 /* MAD can only handle floating-point data. */
1104 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1105 return false;
1106
1107 ir_rvalue *nonmul = ir->operands[1];
1108 ir_expression *mul = ir->operands[0]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul) {
1111 nonmul = ir->operands[0];
1112 mul = ir->operands[1]->as_expression();
1113
1114 if (!mul || mul->operation != ir_binop_mul)
1115 return false;
1116 }
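   /* At this point, for an expression of the form (a * b) + c or c + (a * b),
    * "mul" is the ir_binop_mul subtree and "nonmul" is the other operand, so
    * the MAD emitted below computes nonmul + mul0 * mul1.
    */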
1117
1118 nonmul->accept(this);
1119 src_reg src0 = fix_3src_operand(this->result);
1120
1121 mul->operands[0]->accept(this);
1122 src_reg src1 = fix_3src_operand(this->result);
1123
1124 mul->operands[1]->accept(this);
1125 src_reg src2 = fix_3src_operand(this->result);
1126
1127 this->result = src_reg(this, ir->type);
1128 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1129
1130 return true;
1131 }
1132
1133 bool
1134 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1135 {
1136 /* This optimization relies on CMP setting the destination to 0 when
1137 * false. Early hardware only sets the least significant bit, and
1138 * leaves the other bits undefined. So we can't use it.
1139 */
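   /* A hedged sketch of the output for b2f(x < y):
    *
    *    cmp.l.f0  result, x, y          // failing channels are written as 0.0
    *    (-f0) sel result, result, 1.0f
    *
    * Passing channels end up as 1.0f and failing channels keep the 0.0 the
    * CMP already wrote, so no separate AND/convert step is needed.
    */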
1140 if (brw->gen < 6)
1141 return false;
1142
1143 ir_expression *const cmp = ir->operands[0]->as_expression();
1144
1145 if (cmp == NULL)
1146 return false;
1147
1148 switch (cmp->operation) {
1149 case ir_binop_less:
1150 case ir_binop_greater:
1151 case ir_binop_lequal:
1152 case ir_binop_gequal:
1153 case ir_binop_equal:
1154 case ir_binop_nequal:
1155 break;
1156
1157 default:
1158 return false;
1159 }
1160
1161 cmp->operands[0]->accept(this);
1162 const src_reg cmp_src0 = this->result;
1163
1164 cmp->operands[1]->accept(this);
1165 const src_reg cmp_src1 = this->result;
1166
1167 this->result = src_reg(this, ir->type);
1168
1169 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1170 brw_conditional_for_comparison(cmp->operation)));
1171
1172 /* If the comparison is false, this->result will just happen to be zero.
1173 */
1174 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1175 this->result, src_reg(1.0f));
1176 inst->predicate = BRW_PREDICATE_NORMAL;
1177 inst->predicate_inverse = true;
1178
1179 return true;
1180 }
1181
1182 void
1183 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1184 src_reg src0, src_reg src1)
1185 {
1186 vec4_instruction *inst;
1187
1188 if (brw->gen >= 6) {
1189 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1190 inst->conditional_mod = conditionalmod;
1191 } else {
1192 emit(CMP(dst, src0, src1, conditionalmod));
1193
1194 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1195 inst->predicate = BRW_PREDICATE_NORMAL;
1196 }
1197 }
1198
1199 void
1200 vec4_visitor::emit_lrp(const dst_reg &dst,
1201 const src_reg &x, const src_reg &y, const src_reg &a)
1202 {
1203 if (brw->gen >= 6) {
1204 /* Note that the instruction's argument order is reversed from GLSL
1205 * and the IR.
1206 */
1207 emit(LRP(dst,
1208 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1209 } else {
1210 /* Earlier generations don't support three source operations, so we
1211 * need to emit x*(1-a) + y*a.
1212 */
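      /* This is the usual mix() identity,
       *
       *    lrp(x, y, a) = x*(1 - a) + y*a,
       *
       * expanded into a MUL/ADD/MUL/ADD chain, whereas the gen6+ path above
       * hands a, y and x directly to the LRP instruction.
       */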
1213 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1214 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1215 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1216 y_times_a.writemask = dst.writemask;
1217 one_minus_a.writemask = dst.writemask;
1218 x_times_one_minus_a.writemask = dst.writemask;
1219
1220 emit(MUL(y_times_a, y, a));
1221 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1222 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1223 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1224 }
1225 }
1226
1227 void
1228 vec4_visitor::visit(ir_expression *ir)
1229 {
1230 unsigned int operand;
1231 src_reg op[Elements(ir->operands)];
1232 src_reg result_src;
1233 dst_reg result_dst;
1234 vec4_instruction *inst;
1235
1236 if (ir->operation == ir_binop_add) {
1237 if (try_emit_mad(ir))
1238 return;
1239 }
1240
1241 if (ir->operation == ir_unop_b2f) {
1242 if (try_emit_b2f_of_compare(ir))
1243 return;
1244 }
1245
1246 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1247 this->result.file = BAD_FILE;
1248 ir->operands[operand]->accept(this);
1249 if (this->result.file == BAD_FILE) {
1250 fprintf(stderr, "Failed to get tree for expression operand:\n");
1251 ir->operands[operand]->fprint(stderr);
1252 exit(1);
1253 }
1254 op[operand] = this->result;
1255
1256 /* Matrix expression operands should have been broken down to vector
1257 * operations already.
1258 */
1259 assert(!ir->operands[operand]->type->is_matrix());
1260 }
1261
1262 int vector_elements = ir->operands[0]->type->vector_elements;
1263 if (ir->operands[1]) {
1264 vector_elements = MAX2(vector_elements,
1265 ir->operands[1]->type->vector_elements);
1266 }
1267
1268 this->result.file = BAD_FILE;
1269
1270 /* Storage for our result. Ideally for an assignment we'd be using
1271 * the actual storage for the result here, instead.
1272 */
1273 result_src = src_reg(this, ir->type);
1274 /* convenience for the emit functions below. */
1275 result_dst = dst_reg(result_src);
1276 /* If nothing special happens, this is the result. */
1277 this->result = result_src;
1278 /* Limit writes to the channels that will be used by result_src later.
1279 * This does limit this temp's use as a temporary for multi-instruction
1280 * sequences.
1281 */
1282 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1283
1284 switch (ir->operation) {
1285 case ir_unop_logic_not:
1286 if (ctx->Const.UniformBooleanTrue != 1) {
1287 emit(NOT(result_dst, op[0]));
1288 } else {
1289 emit(XOR(result_dst, op[0], src_reg(1)));
1290 }
1291 break;
1292 case ir_unop_neg:
1293 op[0].negate = !op[0].negate;
1294 emit(MOV(result_dst, op[0]));
1295 break;
1296 case ir_unop_abs:
1297 op[0].abs = true;
1298 op[0].negate = false;
1299 emit(MOV(result_dst, op[0]));
1300 break;
1301
1302 case ir_unop_sign:
1303 if (ir->type->is_float()) {
1304 /* AND(val, 0x80000000) gives the sign bit.
1305 *
1306 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1307 * zero.
1308 */
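         /* Worked example: for val == -2.5f (0xc0200000),
          *
          *    AND with 0x80000000            -> 0x80000000
          *    (val != 0) OR with 0x3f800000  -> 0xbf800000 == -1.0f
          *
          * For val == 0.0f the CMP leaves the flag clear, the predicated OR
          * does not write, and the result stays 0x00000000 == 0.0f.
          */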
1309 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1310
1311 op[0].type = BRW_REGISTER_TYPE_UD;
1312 result_dst.type = BRW_REGISTER_TYPE_UD;
1313 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1314
1315 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1316 inst->predicate = BRW_PREDICATE_NORMAL;
1317
1318 this->result.type = BRW_REGISTER_TYPE_F;
1319 } else {
1320 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1321 * -> non-negative val generates 0x00000000.
1322 * Predicated OR sets 1 if val is positive.
1323 */
1324 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1325
1326 emit(ASR(result_dst, op[0], src_reg(31)));
1327
1328 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1329 inst->predicate = BRW_PREDICATE_NORMAL;
1330 }
1331 break;
1332
1333 case ir_unop_rcp:
1334 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1335 break;
1336
1337 case ir_unop_exp2:
1338 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1339 break;
1340 case ir_unop_log2:
1341 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1342 break;
1343 case ir_unop_exp:
1344 case ir_unop_log:
1345 unreachable("not reached: should be handled by ir_explog_to_explog2");
1346 case ir_unop_sin:
1347 case ir_unop_sin_reduced:
1348 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1349 break;
1350 case ir_unop_cos:
1351 case ir_unop_cos_reduced:
1352 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1353 break;
1354
1355 case ir_unop_dFdx:
1356 case ir_unop_dFdx_coarse:
1357 case ir_unop_dFdx_fine:
1358 case ir_unop_dFdy:
1359 case ir_unop_dFdy_coarse:
1360 case ir_unop_dFdy_fine:
1361 unreachable("derivatives not valid in vertex shader");
1362
1363 case ir_unop_bitfield_reverse:
1364 emit(BFREV(result_dst, op[0]));
1365 break;
1366 case ir_unop_bit_count:
1367 emit(CBIT(result_dst, op[0]));
1368 break;
1369 case ir_unop_find_msb: {
1370 src_reg temp = src_reg(this, glsl_type::uint_type);
1371
1372 inst = emit(FBH(dst_reg(temp), op[0]));
1373 inst->dst.writemask = WRITEMASK_XYZW;
1374
1375 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1376 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1377 * subtract the result from 31 to convert the MSB count into an LSB count.
1378 */
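      /* Worked example (treating the input as unsigned): findMSB(0x00000100).
       * FBH counts 23 leading zero bits, the CMP against -1 passes, and the
       * predicated ADD computes 31 - 23 = 8, the bit index GLSL expects.  For
       * an input of 0, FBH returns 0xffffffff (-1 as D), the CMP fails, the
       * ADD is skipped, and the result stays -1, matching findMSB(0) == -1.
       */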
1379
1380 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1381 temp.swizzle = BRW_SWIZZLE_NOOP;
1382 emit(MOV(result_dst, temp));
1383
1384 src_reg src_tmp = src_reg(result_dst);
1385 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1386
1387 src_tmp.negate = true;
1388 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1389 inst->predicate = BRW_PREDICATE_NORMAL;
1390 break;
1391 }
1392 case ir_unop_find_lsb:
1393 emit(FBL(result_dst, op[0]));
1394 break;
1395 case ir_unop_saturate:
1396 inst = emit(MOV(result_dst, op[0]));
1397 inst->saturate = true;
1398 break;
1399
1400 case ir_unop_noise:
1401 unreachable("not reached: should be handled by lower_noise");
1402
1403 case ir_binop_add:
1404 emit(ADD(result_dst, op[0], op[1]));
1405 break;
1406 case ir_binop_sub:
1407 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1408
1409 case ir_binop_mul:
1410 if (brw->gen < 8 && ir->type->is_integer()) {
1411 /* For integer multiplication, the MUL uses the low 16 bits of one of
1412 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1413 * accumulates in the contribution of the upper 16 bits of that
1414 * operand. If we can determine that one of the args is in the low
1415 * 16 bits, though, we can just emit a single MUL.
1416 */
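	 /* For example, "a * 7" (7 fits in 16 bits) becomes a single
	  *
	  *    mul  dst, a, 7       (operands swapped before IVB)
	  *
	  * while a general "a * b" needs the full
	  *
	  *    mul  acc, a, b
	  *    mach null, a, b
	  *    mov  dst, acc
	  *
	  * sequence emitted in the else branch below.
	  */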
1417 if (ir->operands[0]->is_uint16_constant()) {
1418 if (brw->gen < 7)
1419 emit(MUL(result_dst, op[0], op[1]));
1420 else
1421 emit(MUL(result_dst, op[1], op[0]));
1422 } else if (ir->operands[1]->is_uint16_constant()) {
1423 if (brw->gen < 7)
1424 emit(MUL(result_dst, op[1], op[0]));
1425 else
1426 emit(MUL(result_dst, op[0], op[1]));
1427 } else {
1428 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1429
1430 emit(MUL(acc, op[0], op[1]));
1431 emit(MACH(dst_null_d(), op[0], op[1]));
1432 emit(MOV(result_dst, src_reg(acc)));
1433 }
1434 } else {
1435 emit(MUL(result_dst, op[0], op[1]));
1436 }
1437 break;
1438 case ir_binop_imul_high: {
1439 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1440
1441 emit(MUL(acc, op[0], op[1]));
1442 emit(MACH(result_dst, op[0], op[1]));
1443 break;
1444 }
1445 case ir_binop_div:
1446 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1447 assert(ir->type->is_integer());
1448 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1449 break;
1450 case ir_binop_carry: {
1451 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1452
1453 emit(ADDC(dst_null_ud(), op[0], op[1]));
1454 emit(MOV(result_dst, src_reg(acc)));
1455 break;
1456 }
1457 case ir_binop_borrow: {
1458 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1459
1460 emit(SUBB(dst_null_ud(), op[0], op[1]));
1461 emit(MOV(result_dst, src_reg(acc)));
1462 break;
1463 }
1464 case ir_binop_mod:
1465 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1466 assert(ir->type->is_integer());
1467 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1468 break;
1469
1470 case ir_binop_less:
1471 case ir_binop_greater:
1472 case ir_binop_lequal:
1473 case ir_binop_gequal:
1474 case ir_binop_equal:
1475 case ir_binop_nequal: {
1476 emit(CMP(result_dst, op[0], op[1],
1477 brw_conditional_for_comparison(ir->operation)));
1478 if (ctx->Const.UniformBooleanTrue == 1) {
1479 emit(AND(result_dst, result_src, src_reg(1)));
1480 }
1481 break;
1482 }
1483
1484 case ir_binop_all_equal:
1485 /* "==" operator producing a scalar boolean. */
1486 if (ir->operands[0]->type->is_vector() ||
1487 ir->operands[1]->type->is_vector()) {
1488 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1489 emit(MOV(result_dst, src_reg(0)));
1490 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1491 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1492 } else {
1493 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1494 if (ctx->Const.UniformBooleanTrue == 1) {
1495 emit(AND(result_dst, result_src, src_reg(1)));
1496 }
1497 }
1498 break;
1499 case ir_binop_any_nequal:
1500 /* "!=" operator producing a scalar boolean. */
1501 if (ir->operands[0]->type->is_vector() ||
1502 ir->operands[1]->type->is_vector()) {
1503 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1504
1505 emit(MOV(result_dst, src_reg(0)));
1506 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1507 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1508 } else {
1509 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1510 if (ctx->Const.UniformBooleanTrue == 1) {
1511 emit(AND(result_dst, result_src, src_reg(1)));
1512 }
1513 }
1514 break;
1515
1516 case ir_unop_any:
1517 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1518 emit(MOV(result_dst, src_reg(0)));
1519
1520 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1521 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1522 break;
1523
1524 case ir_binop_logic_xor:
1525 emit(XOR(result_dst, op[0], op[1]));
1526 break;
1527
1528 case ir_binop_logic_or:
1529 emit(OR(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_logic_and:
1533 emit(AND(result_dst, op[0], op[1]));
1534 break;
1535
1536 case ir_binop_dot:
1537 assert(ir->operands[0]->type->is_vector());
1538 assert(ir->operands[0]->type == ir->operands[1]->type);
1539 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1540 break;
1541
1542 case ir_unop_sqrt:
1543 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1544 break;
1545 case ir_unop_rsq:
1546 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1547 break;
1548
1549 case ir_unop_bitcast_i2f:
1550 case ir_unop_bitcast_u2f:
1551 this->result = op[0];
1552 this->result.type = BRW_REGISTER_TYPE_F;
1553 break;
1554
1555 case ir_unop_bitcast_f2i:
1556 this->result = op[0];
1557 this->result.type = BRW_REGISTER_TYPE_D;
1558 break;
1559
1560 case ir_unop_bitcast_f2u:
1561 this->result = op[0];
1562 this->result.type = BRW_REGISTER_TYPE_UD;
1563 break;
1564
1565 case ir_unop_i2f:
1566 case ir_unop_i2u:
1567 case ir_unop_u2i:
1568 case ir_unop_u2f:
1569 case ir_unop_f2i:
1570 case ir_unop_f2u:
1571 emit(MOV(result_dst, op[0]));
1572 break;
1573 case ir_unop_b2i:
1574 if (ctx->Const.UniformBooleanTrue != 1) {
1575 emit(AND(result_dst, op[0], src_reg(1)));
1576 } else {
1577 emit(MOV(result_dst, op[0]));
1578 }
1579 break;
1580 case ir_unop_b2f:
1581 if (ctx->Const.UniformBooleanTrue != 1) {
1582 op[0].type = BRW_REGISTER_TYPE_UD;
1583 result_dst.type = BRW_REGISTER_TYPE_UD;
1584 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1585 result_dst.type = BRW_REGISTER_TYPE_F;
1586 } else {
1587 emit(MOV(result_dst, op[0]));
1588 }
1589 break;
1590 case ir_unop_f2b:
1591 case ir_unop_i2b:
1592 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1593 if (ctx->Const.UniformBooleanTrue == 1) {
1594 emit(AND(result_dst, result_src, src_reg(1)));
1595 }
1596 break;
1597
1598 case ir_unop_trunc:
1599 emit(RNDZ(result_dst, op[0]));
1600 break;
1601 case ir_unop_ceil:
1602 op[0].negate = !op[0].negate;
1603 inst = emit(RNDD(result_dst, op[0]));
1604 this->result.negate = true;
1605 break;
1606 case ir_unop_floor:
1607 inst = emit(RNDD(result_dst, op[0]));
1608 break;
1609 case ir_unop_fract:
1610 inst = emit(FRC(result_dst, op[0]));
1611 break;
1612 case ir_unop_round_even:
1613 emit(RNDE(result_dst, op[0]));
1614 break;
1615
1616 case ir_binop_min:
1617 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1618 break;
1619 case ir_binop_max:
1620 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1621 break;
1622
1623 case ir_binop_pow:
1624 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1625 break;
1626
1627 case ir_unop_bit_not:
1628 inst = emit(NOT(result_dst, op[0]));
1629 break;
1630 case ir_binop_bit_and:
1631 inst = emit(AND(result_dst, op[0], op[1]));
1632 break;
1633 case ir_binop_bit_xor:
1634 inst = emit(XOR(result_dst, op[0], op[1]));
1635 break;
1636 case ir_binop_bit_or:
1637 inst = emit(OR(result_dst, op[0], op[1]));
1638 break;
1639
1640 case ir_binop_lshift:
1641 inst = emit(SHL(result_dst, op[0], op[1]));
1642 break;
1643
1644 case ir_binop_rshift:
1645 if (ir->type->base_type == GLSL_TYPE_INT)
1646 inst = emit(ASR(result_dst, op[0], op[1]));
1647 else
1648 inst = emit(SHR(result_dst, op[0], op[1]));
1649 break;
1650
1651 case ir_binop_bfm:
1652 emit(BFI1(result_dst, op[0], op[1]));
1653 break;
1654
1655 case ir_binop_ubo_load: {
1656 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1657 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1658 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1659 src_reg offset;
1660
1661 /* Now, load the vector from that offset. */
1662 assert(ir->type->is_vector() || ir->type->is_scalar());
1663
1664 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1665 packed_consts.type = result.type;
1666 src_reg surf_index;
1667
1668 if (const_uniform_block) {
1669 /* The block index is a constant, so just emit the binding table entry
1670 * as an immediate.
1671 */
1672 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1673 const_uniform_block->value.u[0]);
1674 } else {
1675 /* The block index is not a constant. Evaluate the index expression
1676 * per-channel and add the base UBO index; the generator will select
1677 * a value from any live channel.
1678 */
1679 surf_index = src_reg(this, glsl_type::uint_type);
1680 emit(ADD(dst_reg(surf_index), op[0],
1681 src_reg(prog_data->base.binding_table.ubo_start)));
1682
1683 /* Assume this may touch any UBO. It would be nice to provide
1684 * a tighter bound, but the array information is already lowered away.
1685 */
1686 brw_mark_surface_used(&prog_data->base,
1687 prog_data->base.binding_table.ubo_start +
1688 shader_prog->NumUniformBlocks - 1);
1689 }
1690
1691 if (const_offset_ir) {
1692 if (brw->gen >= 8) {
1693 /* Store the offset in a GRF so we can send-from-GRF. */
1694 offset = src_reg(this, glsl_type::int_type);
1695 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1696 } else {
1697 /* Immediates are fine on older generations since they'll be moved
1698 * to a (potentially fake) MRF at the generator level.
1699 */
1700 offset = src_reg(const_offset / 16);
1701 }
1702 } else {
1703 offset = src_reg(this, glsl_type::uint_type);
1704 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1705 }
1706
1707 if (brw->gen >= 7) {
1708 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1709 grf_offset.type = offset.type;
1710
1711 emit(MOV(grf_offset, offset));
1712
1713 emit(new(mem_ctx) vec4_instruction(this,
1714 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1715 dst_reg(packed_consts),
1716 surf_index,
1717 src_reg(grf_offset)));
1718 } else {
1719 vec4_instruction *pull =
1720 emit(new(mem_ctx) vec4_instruction(this,
1721 VS_OPCODE_PULL_CONSTANT_LOAD,
1722 dst_reg(packed_consts),
1723 surf_index,
1724 offset));
1725 pull->base_mrf = 14;
1726 pull->mlen = 1;
1727 }
1728
1729 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1730 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1731 const_offset % 16 / 4,
1732 const_offset % 16 / 4,
1733 const_offset % 16 / 4);
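      /* The pull-constant load always fetches the 16-byte-aligned vec4 that
       * contains the requested offset, so the component is picked by
       * swizzling.  E.g. (hypothetically) const_offset == 20 reads the vec4
       * at byte 16, and 20 % 16 / 4 == 1 shifts the base swizzle over by one
       * component so a scalar is taken from .y.
       */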
1734
1735 /* UBO bools are any nonzero int. We need to convert them to use the
1736 * value of true stored in ctx->Const.UniformBooleanTrue.
1737 */
1738 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1739 emit(CMP(result_dst, packed_consts, src_reg(0u),
1740 BRW_CONDITIONAL_NZ));
1741 if (ctx->Const.UniformBooleanTrue == 1) {
1742 emit(AND(result_dst, result, src_reg(1)));
1743 }
1744 } else {
1745 emit(MOV(result_dst, packed_consts));
1746 }
1747 break;
1748 }
1749
1750 case ir_binop_vector_extract:
1751 unreachable("should have been lowered by vec_index_to_cond_assign");
1752
1753 case ir_triop_fma:
1754 op[0] = fix_3src_operand(op[0]);
1755 op[1] = fix_3src_operand(op[1]);
1756 op[2] = fix_3src_operand(op[2]);
1757 /* Note that the instruction's argument order is reversed from GLSL
1758 * and the IR.
1759 */
1760 emit(MAD(result_dst, op[2], op[1], op[0]));
1761 break;
1762
1763 case ir_triop_lrp:
1764 emit_lrp(result_dst, op[0], op[1], op[2]);
1765 break;
1766
1767 case ir_triop_csel:
1768 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1769 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1770 inst->predicate = BRW_PREDICATE_NORMAL;
1771 break;
1772
1773 case ir_triop_bfi:
1774 op[0] = fix_3src_operand(op[0]);
1775 op[1] = fix_3src_operand(op[1]);
1776 op[2] = fix_3src_operand(op[2]);
1777 emit(BFI2(result_dst, op[0], op[1], op[2]));
1778 break;
1779
1780 case ir_triop_bitfield_extract:
1781 op[0] = fix_3src_operand(op[0]);
1782 op[1] = fix_3src_operand(op[1]);
1783 op[2] = fix_3src_operand(op[2]);
1784 /* Note that the instruction's argument order is reversed from GLSL
1785 * and the IR.
1786 */
1787 emit(BFE(result_dst, op[2], op[1], op[0]));
1788 break;
1789
1790 case ir_triop_vector_insert:
1791 unreachable("should have been lowered by lower_vector_insert");
1792
1793 case ir_quadop_bitfield_insert:
1794 unreachable("not reached: should be handled by "
1795 "bitfield_insert_to_bfm_bfi\n");
1796
1797 case ir_quadop_vector:
1798 unreachable("not reached: should be handled by lower_quadop_vector");
1799
1800 case ir_unop_pack_half_2x16:
1801 emit_pack_half_2x16(result_dst, op[0]);
1802 break;
1803 case ir_unop_unpack_half_2x16:
1804 emit_unpack_half_2x16(result_dst, op[0]);
1805 break;
1806 case ir_unop_pack_snorm_2x16:
1807 case ir_unop_pack_snorm_4x8:
1808 case ir_unop_pack_unorm_2x16:
1809 case ir_unop_pack_unorm_4x8:
1810 case ir_unop_unpack_snorm_2x16:
1811 case ir_unop_unpack_snorm_4x8:
1812 case ir_unop_unpack_unorm_2x16:
1813 case ir_unop_unpack_unorm_4x8:
1814 unreachable("not reached: should be handled by lower_packing_builtins");
1815 case ir_unop_unpack_half_2x16_split_x:
1816 case ir_unop_unpack_half_2x16_split_y:
1817 case ir_binop_pack_half_2x16_split:
1818 case ir_unop_interpolate_at_centroid:
1819 case ir_binop_interpolate_at_sample:
1820 case ir_binop_interpolate_at_offset:
1821 unreachable("not reached: should not occur in vertex shader");
1822 case ir_binop_ldexp:
1823 unreachable("not reached: should be handled by ldexp_to_arith()");
1824 }
1825 }
1826
1827
1828 void
1829 vec4_visitor::visit(ir_swizzle *ir)
1830 {
1831 src_reg src;
1832 int i = 0;
1833 int swizzle[4];
1834
1835 /* Note that this is only swizzles in expressions, not those on the left
1836 * hand side of an assignment, which do write masking. See ir_assignment
1837 * for that.
1838 */
1839
1840 ir->val->accept(this);
1841 src = this->result;
1842 assert(src.file != BAD_FILE);
1843
1844 for (i = 0; i < ir->type->vector_elements; i++) {
1845 switch (i) {
1846 case 0:
1847 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1848 break;
1849 case 1:
1850 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1851 break;
1852 case 2:
1853 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1854 break;
1855 case 3:
1856 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1857 break;
1858 }
1859 }
1860 for (; i < 4; i++) {
1861 /* Replicate the last channel out. */
1862 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1863 }
1864
1865 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1866
1867 this->result = src;
1868 }
1869
1870 void
1871 vec4_visitor::visit(ir_dereference_variable *ir)
1872 {
1873 const struct glsl_type *type = ir->type;
1874 dst_reg *reg = variable_storage(ir->var);
1875
1876 if (!reg) {
1877 fail("Failed to find variable storage for %s\n", ir->var->name);
1878 this->result = src_reg(brw_null_reg());
1879 return;
1880 }
1881
1882 this->result = src_reg(*reg);
1883
1884 /* System values get their swizzle from the dst_reg writemask */
1885 if (ir->var->data.mode == ir_var_system_value)
1886 return;
1887
1888 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1889 this->result.swizzle = swizzle_for_size(type->vector_elements);
1890 }
1891
1892
1893 int
1894 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1895 {
1896 /* Under normal circumstances array elements are stored consecutively, so
1897 * the stride is equal to the size of the array element.
1898 */
1899 return type_size(ir->type);
1900 }
1901
1902
1903 void
1904 vec4_visitor::visit(ir_dereference_array *ir)
1905 {
1906 ir_constant *constant_index;
1907 src_reg src;
1908 int array_stride = compute_array_stride(ir);
1909
1910 constant_index = ir->array_index->constant_expression_value();
1911
1912 ir->array->accept(this);
1913 src = this->result;
1914
1915 if (constant_index) {
1916 src.reg_offset += constant_index->value.i[0] * array_stride;
1917 } else {
1918 /* Variable index array dereference. It eats the "vec4" of the
1919 * base of the array and an index that offsets the Mesa register
1920 * index.
1921 */
1922 ir->array_index->accept(this);
1923
1924 src_reg index_reg;
1925
1926 if (array_stride == 1) {
1927 index_reg = this->result;
1928 } else {
1929 index_reg = src_reg(this, glsl_type::int_type);
1930
1931 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1932 }
1933
1934 if (src.reladdr) {
1935 src_reg temp = src_reg(this, glsl_type::int_type);
1936
1937 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1938
1939 index_reg = temp;
1940 }
1941
1942 src.reladdr = ralloc(mem_ctx, src_reg);
1943 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1944 }
1945
1946 /* If the type is smaller than a vec4, replicate the last channel out. */
1947 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1948 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1949 else
1950 src.swizzle = BRW_SWIZZLE_NOOP;
1951 src.type = brw_type_for_base_type(ir->type);
1952
1953 this->result = src;
1954 }
1955
1956 void
1957 vec4_visitor::visit(ir_dereference_record *ir)
1958 {
1959 unsigned int i;
1960 const glsl_type *struct_type = ir->record->type;
1961 int offset = 0;
1962
1963 ir->record->accept(this);
1964
1965 for (i = 0; i < struct_type->length; i++) {
1966 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1967 break;
1968 offset += type_size(struct_type->fields.structure[i].type);
1969 }
1970
1971 /* If the type is smaller than a vec4, replicate the last channel out. */
1972 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1973 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1974 else
1975 this->result.swizzle = BRW_SWIZZLE_NOOP;
1976 this->result.type = brw_type_for_base_type(ir->type);
1977
1978 this->result.reg_offset += offset;
1979 }
1980
1981 /**
1982 * We want to be careful in assignment setup to hit the actual storage
1983 * instead of potentially using a temporary like we might with the
1984 * ir_dereference handler.
1985 */
1986 static dst_reg
1987 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1988 {
1989 /* The LHS must be a dereference. If the LHS is a variable indexed array
1990 * access of a vector, it must be separated into a series of conditional moves
1991 * before reaching this point (see ir_vec_index_to_cond_assign).
1992 */
1993 assert(ir->as_dereference());
1994 ir_dereference_array *deref_array = ir->as_dereference_array();
1995 if (deref_array) {
1996 assert(!deref_array->array->type->is_vector());
1997 }
1998
1999 /* Use the rvalue deref handler for the most part. We ignore its
2000 * swizzles here; LHS swizzles are expressed through the writemask instead.
2001 */
2002 ir->accept(v);
2003 return dst_reg(v->result);
2004 }
2005
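/* Copy an aggregate value by recursing through struct fields, array
* elements and matrix columns, emitting one (optionally predicated) MOV
* per scalar/vector leaf and advancing both reg_offsets as it goes.
*/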
2006 void
2007 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2008 const struct glsl_type *type,
2009 enum brw_predicate predicate)
2010 {
2011 if (type->base_type == GLSL_TYPE_STRUCT) {
2012 for (unsigned int i = 0; i < type->length; i++) {
2013 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2014 }
2015 return;
2016 }
2017
2018 if (type->is_array()) {
2019 for (unsigned int i = 0; i < type->length; i++) {
2020 emit_block_move(dst, src, type->fields.array, predicate);
2021 }
2022 return;
2023 }
2024
2025 if (type->is_matrix()) {
2026 const struct glsl_type *vec_type;
2027
2028 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2029 type->vector_elements, 1);
2030
2031 for (int i = 0; i < type->matrix_columns; i++) {
2032 emit_block_move(dst, src, vec_type, predicate);
2033 }
2034 return;
2035 }
2036
2037 assert(type->is_scalar() || type->is_vector());
2038
2039 dst->type = brw_type_for_base_type(type);
2040 src->type = dst->type;
2041
2042 dst->writemask = (1 << type->vector_elements) - 1;
2043
2044 src->swizzle = swizzle_for_size(type->vector_elements);
2045
2046 vec4_instruction *inst = emit(MOV(*dst, *src));
2047 inst->predicate = predicate;
2048
2049 dst->reg_offset++;
2050 src->reg_offset++;
2051 }
2052
2053
2054 /* If the RHS processing resulted in an instruction generating a
2055 * temporary value, and it would be easy to rewrite the instruction to
2056 * generate its result right into the LHS instead, do so. This ends
2057 * up reliably removing instructions where it can be tricky to do so
2058 * later without real UD chain information.
2059 */
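/* For example, an ADD that wrote a temporary followed by a MOV of that
* temporary into the LHS becomes a single ADD writing the LHS directly,
* with its writemask narrowed to the channels actually assigned.
*/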
2060 bool
2061 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2062 dst_reg dst,
2063 src_reg src,
2064 vec4_instruction *pre_rhs_inst,
2065 vec4_instruction *last_rhs_inst)
2066 {
2067 /* This could be supported, but it would take more smarts. */
2068 if (ir->condition)
2069 return false;
2070
2071 if (pre_rhs_inst == last_rhs_inst)
2072 return false; /* No instructions generated to work with. */
2073
2074 /* Make sure the last instruction generated our source reg. */
2075 if (src.file != GRF ||
2076 src.file != last_rhs_inst->dst.file ||
2077 src.reg != last_rhs_inst->dst.reg ||
2078 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2079 src.reladdr ||
2080 src.abs ||
2081 src.negate ||
2082 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2083 return false;
2084
2085 /* Check that the last instruction fully initialized the channels
2086 * we want to use, in the order we want to use them. We could
2087 * potentially reswizzle the operands of many instructions to handle
2088 * out-of-order channels, but we don't do that yet.
2089 */
2090
2091 for (unsigned i = 0; i < 4; i++) {
2092 if (dst.writemask & (1 << i)) {
2093 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2094 return false;
2095
2096 if (BRW_GET_SWZ(src.swizzle, i) != i)
2097 return false;
2098 }
2099 }
2100
2101 /* Success! Rewrite the instruction. */
2102 last_rhs_inst->dst.file = dst.file;
2103 last_rhs_inst->dst.reg = dst.reg;
2104 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2105 last_rhs_inst->dst.reladdr = dst.reladdr;
2106 last_rhs_inst->dst.writemask &= dst.writemask;
2107
2108 return true;
2109 }
2110
2111 void
2112 vec4_visitor::visit(ir_assignment *ir)
2113 {
2114 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2115 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2116
2117 if (!ir->lhs->type->is_scalar() &&
2118 !ir->lhs->type->is_vector()) {
2119 ir->rhs->accept(this);
2120 src_reg src = this->result;
2121
2122 if (ir->condition) {
2123 emit_bool_to_cond_code(ir->condition, &predicate);
2124 }
2125
2126 /* emit_block_move doesn't account for swizzles in the source register.
2127 * This should be ok, since the source register is a structure or an
2128 * array, and those can't be swizzled. But double-check to be sure.
2129 */
2130 assert(src.swizzle ==
2131 (ir->rhs->type->is_matrix()
2132 ? swizzle_for_size(ir->rhs->type->vector_elements)
2133 : BRW_SWIZZLE_NOOP));
2134
2135 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2136 return;
2137 }
2138
2139 /* Now we're down to just a scalar/vector with writemasks. */
2140 int i;
2141
2142 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2143 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2144
2145 ir->rhs->accept(this);
2146
2147 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2148
2149 src_reg src = this->result;
2150
2151 int swizzles[4];
2152 int first_enabled_chan = 0;
2153 int src_chan = 0;
2154
2155 assert(ir->lhs->type->is_vector() ||
2156 ir->lhs->type->is_scalar());
2157 dst.writemask = ir->write_mask;
2158
2159 for (int i = 0; i < 4; i++) {
2160 if (dst.writemask & (1 << i)) {
2161 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2162 break;
2163 }
2164 }
2165
2166 /* Swizzle a small RHS vector into the channels being written.
2167 *
2168 * GLSL IR treats write_mask as dictating how many channels are
2169 * present on the RHS, while our instructions need those channels
2170 * to appear in the slots of the vec4 they're written to.
2171 */
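/* For example, an assignment to .yw with a two-component RHS puts RHS
* channel 0 into slot y and channel 1 into slot w; the unwritten slots
* get a don't-care value (the first enabled channel), since the writemask
* excludes them anyway.
*/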
2172 for (int i = 0; i < 4; i++) {
2173 if (dst.writemask & (1 << i))
2174 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2175 else
2176 swizzles[i] = first_enabled_chan;
2177 }
2178 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2179 swizzles[2], swizzles[3]);
2180
2181 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2182 return;
2183 }
2184
2185 if (ir->condition) {
2186 emit_bool_to_cond_code(ir->condition, &predicate);
2187 }
2188
2189 for (i = 0; i < type_size(ir->lhs->type); i++) {
2190 vec4_instruction *inst = emit(MOV(dst, src));
2191 inst->predicate = predicate;
2192
2193 dst.reg_offset++;
2194 src.reg_offset++;
2195 }
2196 }
2197
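/* Emit immediate MOVs for an ir_constant, recursing through structs,
* arrays and matrices. For scalar/vector leaves, components with equal
* values share one writemasked MOV (e.g. vec4(0.5, 1.5, 1.5, 1.5) needs
* only two MOVs).
*/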
2198 void
2199 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2200 {
2201 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2202 foreach_in_list(ir_constant, field_value, &ir->components) {
2203 emit_constant_values(dst, field_value);
2204 }
2205 return;
2206 }
2207
2208 if (ir->type->is_array()) {
2209 for (unsigned int i = 0; i < ir->type->length; i++) {
2210 emit_constant_values(dst, ir->array_elements[i]);
2211 }
2212 return;
2213 }
2214
2215 if (ir->type->is_matrix()) {
2216 for (int i = 0; i < ir->type->matrix_columns; i++) {
2217 float *vec = &ir->value.f[i * ir->type->vector_elements];
2218
2219 for (int j = 0; j < ir->type->vector_elements; j++) {
2220 dst->writemask = 1 << j;
2221 dst->type = BRW_REGISTER_TYPE_F;
2222
2223 emit(MOV(*dst, src_reg(vec[j])));
2224 }
2225 dst->reg_offset++;
2226 }
2227 return;
2228 }
2229
2230 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2231
2232 for (int i = 0; i < ir->type->vector_elements; i++) {
2233 if (!(remaining_writemask & (1 << i)))
2234 continue;
2235
2236 dst->writemask = 1 << i;
2237 dst->type = brw_type_for_base_type(ir->type);
2238
2239 /* Find other components that match the one we're about to
2240 * write. Emits fewer instructions for things like vec4(0.5,
2241 * 1.5, 1.5, 1.5).
2242 */
2243 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2244 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2245 if (ir->value.b[i] == ir->value.b[j])
2246 dst->writemask |= (1 << j);
2247 } else {
2248 /* u, i, and f storage all line up, so no need for a
2249 * switch case for comparing each type.
2250 */
2251 if (ir->value.u[i] == ir->value.u[j])
2252 dst->writemask |= (1 << j);
2253 }
2254 }
2255
2256 switch (ir->type->base_type) {
2257 case GLSL_TYPE_FLOAT:
2258 emit(MOV(*dst, src_reg(ir->value.f[i])));
2259 break;
2260 case GLSL_TYPE_INT:
2261 emit(MOV(*dst, src_reg(ir->value.i[i])));
2262 break;
2263 case GLSL_TYPE_UINT:
2264 emit(MOV(*dst, src_reg(ir->value.u[i])));
2265 break;
2266 case GLSL_TYPE_BOOL:
2267 emit(MOV(*dst,
2268 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2269 : 0)));
2270 break;
2271 default:
2272 unreachable("Non-float/uint/int/bool constant");
2273 }
2274
2275 remaining_writemask &= ~dst->writemask;
2276 }
2277 dst->reg_offset++;
2278 }
2279
2280 void
2281 vec4_visitor::visit(ir_constant *ir)
2282 {
2283 dst_reg dst = dst_reg(this, ir->type);
2284 this->result = src_reg(dst);
2285
2286 emit_constant_values(&dst, ir);
2287 }
2288
2289 void
2290 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2291 {
2292 ir_dereference *deref = static_cast<ir_dereference *>(
2293 ir->actual_parameters.get_head());
2294 ir_variable *location = deref->variable_referenced();
2295 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2296 location->data.binding);
2297
2298 /* Calculate the surface offset */
2299 src_reg offset(this, glsl_type::uint_type);
2300 ir_dereference_array *deref_array = deref->as_dereference_array();
2301 if (deref_array) {
2302 deref_array->array_index->accept(this);
2303
2304 src_reg tmp(this, glsl_type::uint_type);
2305 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2306 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2307 } else {
2308 offset = location->data.atomic.offset;
2309 }
2310
2311 /* Emit the appropriate machine instruction */
2312 const char *callee = ir->callee->function_name();
2313 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2314
2315 if (!strcmp("__intrinsic_atomic_read", callee)) {
2316 emit_untyped_surface_read(surf_index, dst, offset);
2317
2318 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2319 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2320 src_reg(), src_reg());
2321
2322 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2323 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2324 src_reg(), src_reg());
2325 }
2326 }
2327
2328 void
2329 vec4_visitor::visit(ir_call *ir)
2330 {
2331 const char *callee = ir->callee->function_name();
2332
2333 if (!strcmp("__intrinsic_atomic_read", callee) ||
2334 !strcmp("__intrinsic_atomic_increment", callee) ||
2335 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2336 visit_atomic_counter_intrinsic(ir);
2337 } else {
2338 unreachable("Unsupported intrinsic.");
2339 }
2340 }
2341
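/* Fetch the MCS (multisample control surface) word for a compressed
* multisample surface by sending a TXF_MCS message with just the texel
* coordinate; the result is later packed into the TXF_CMS payload.
*/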
2342 src_reg
2343 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2344 {
2345 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2346 inst->base_mrf = 2;
2347 inst->mlen = 1;
2348 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2349 inst->dst.writemask = WRITEMASK_XYZW;
2350
2351 inst->src[1] = sampler;
2352
2353 /* Parameters are u, v, r, lod; lod will always be zero due to API restrictions. */
2354 int param_base = inst->base_mrf;
2355 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2356 int zero_mask = 0xf & ~coord_mask;
2357
2358 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2359 coordinate));
2360
2361 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2362 src_reg(0)));
2363
2364 emit(inst);
2365 return src_reg(inst->dst);
2366 }
2367
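/* Return true when the sampler index might not fit in the 4-bit sampler
* field of the message descriptor (a non-immediate index, or an immediate
* of 16 or more) and therefore needs the message header. This can only
* happen on Haswell and Gen8+.
*/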
2368 static bool
2369 is_high_sampler(struct brw_context *brw, src_reg sampler)
2370 {
2371 if (brw->gen < 8 && !brw->is_haswell)
2372 return false;
2373
2374 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2375 }
2376
2377 void
2378 vec4_visitor::visit(ir_texture *ir)
2379 {
2380 uint32_t sampler =
2381 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2382
2383 ir_rvalue *nonconst_sampler_index =
2384 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2385
2386 /* Handle non-constant sampler array indexing */
2387 src_reg sampler_reg;
2388 if (nonconst_sampler_index) {
2389 /* The highest sampler which may be used by this operation is
2390 * the last element of the array. Mark it here, because the generator
2391 * doesn't have enough information to determine the bound.
2392 */
2393 uint32_t array_size = ir->sampler->as_dereference_array()
2394 ->array->type->array_size();
2395
2396 uint32_t max_used = sampler + array_size - 1;
2397 if (ir->op == ir_tg4 && brw->gen < 8) {
2398 max_used += prog_data->base.binding_table.gather_texture_start;
2399 } else {
2400 max_used += prog_data->base.binding_table.texture_start;
2401 }
2402
2403 brw_mark_surface_used(&prog_data->base, max_used);
2404
2405 /* Emit code to evaluate the actual indexing expression */
2406 nonconst_sampler_index->accept(this);
2407 dst_reg temp(this, glsl_type::uint_type);
2408 emit(ADD(temp, this->result, src_reg(sampler)))
2409 ->force_writemask_all = true;
2410 sampler_reg = src_reg(temp);
2411 } else {
2412 /* Single sampler, or constant array index; the indexing expression
2413 * is just an immediate.
2414 */
2415 sampler_reg = src_reg(sampler);
2416 }
2417
2418 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2419 * emitting anything other than setting up the constant result.
2420 */
2421 if (ir->op == ir_tg4) {
2422 ir_constant *chan = ir->lod_info.component->as_constant();
2423 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2424 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2425 dst_reg result(this, ir->type);
2426 this->result = src_reg(result);
2427 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2428 return;
2429 }
2430 }
2431
2432 /* Should be lowered by do_lower_texture_projection */
2433 assert(!ir->projector);
2434
2435 /* Should be lowered */
2436 assert(!ir->offset || !ir->offset->type->is_array());
2437
2438 /* Generate code to compute all the subexpression trees. This has to be
2439 * done before loading any values into MRFs for the sampler message since
2440 * generating these values may involve SEND messages that need the MRFs.
2441 */
2442 src_reg coordinate;
2443 if (ir->coordinate) {
2444 ir->coordinate->accept(this);
2445 coordinate = this->result;
2446 }
2447
2448 src_reg shadow_comparitor;
2449 if (ir->shadow_comparitor) {
2450 ir->shadow_comparitor->accept(this);
2451 shadow_comparitor = this->result;
2452 }
2453
2454 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2455 src_reg offset_value;
2456 if (has_nonconstant_offset) {
2457 ir->offset->accept(this);
2458 offset_value = src_reg(this->result);
2459 }
2460
2461 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2462 src_reg lod, dPdx, dPdy, sample_index, mcs;
2463 switch (ir->op) {
2464 case ir_tex:
2465 lod = src_reg(0.0f);
2466 lod_type = glsl_type::float_type;
2467 break;
2468 case ir_txf:
2469 case ir_txl:
2470 case ir_txs:
2471 ir->lod_info.lod->accept(this);
2472 lod = this->result;
2473 lod_type = ir->lod_info.lod->type;
2474 break;
2475 case ir_query_levels:
2476 lod = src_reg(0);
2477 lod_type = glsl_type::int_type;
2478 break;
2479 case ir_txf_ms:
2480 ir->lod_info.sample_index->accept(this);
2481 sample_index = this->result;
2482 sample_index_type = ir->lod_info.sample_index->type;
2483
2484 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2485 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2486 else
2487 mcs = src_reg(0u);
2488 break;
2489 case ir_txd:
2490 ir->lod_info.grad.dPdx->accept(this);
2491 dPdx = this->result;
2492
2493 ir->lod_info.grad.dPdy->accept(this);
2494 dPdy = this->result;
2495
2496 lod_type = ir->lod_info.grad.dPdx->type;
2497 break;
2498 case ir_txb:
2499 case ir_lod:
2500 case ir_tg4:
2501 break;
2502 }
2503
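/* Map the IR texturing op onto a hardware message. Note that ir_tex uses
* TXL with the explicit 0.0 LOD set up above, since implicit-derivative
* sampling isn't available in a vertex shader.
*/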
2504 enum opcode opcode;
2505 switch (ir->op) {
2506 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2507 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2508 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2509 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2510 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2511 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2512 case ir_tg4: opcode = has_nonconstant_offset
2513 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2514 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2515 case ir_txb:
2516 unreachable("TXB is not valid for vertex shaders.");
2517 case ir_lod:
2518 unreachable("LOD is not valid for vertex shaders.");
2519 default:
2520 unreachable("Unrecognized tex op");
2521 }
2522
2523 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2524
2525 if (ir->offset != NULL && ir->op != ir_txf)
2526 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2527
2528 /* Stuff the channel select bits in the top of the texture offset */
2529 if (ir->op == ir_tg4)
2530 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2531
2532 /* The message header is necessary for:
2533 * - Gen4 (always)
2534 * - Texel offsets
2535 * - Gather channel selection
2536 * - Sampler indices too large to fit in a 4-bit value.
2537 */
2538 inst->header_present =
2539 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2540 is_high_sampler(brw, sampler_reg);
2541 inst->base_mrf = 2;
2542 inst->mlen = inst->header_present + 1; /* always at least one */
2543 inst->dst = dst_reg(this, ir->type);
2544 inst->dst.writemask = WRITEMASK_XYZW;
2545 inst->shadow_compare = ir->shadow_comparitor != NULL;
2546
2547 inst->src[1] = sampler_reg;
2548
2549 /* MRF for the first parameter */
2550 int param_base = inst->base_mrf + inst->header_present;
2551
2552 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2553 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2554 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2555 } else {
2556 /* Load the coordinate */
2557 /* FINISHME: gl_clamp_mask and saturate */
2558 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2559 int zero_mask = 0xf & ~coord_mask;
2560
2561 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2562 coordinate));
2563
2564 if (zero_mask != 0) {
2565 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2566 src_reg(0)));
2567 }
2568 /* Load the shadow comparitor */
2569 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2570 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2571 WRITEMASK_X),
2572 shadow_comparitor));
2573 inst->mlen++;
2574 }
2575
2576 /* Load the LOD info */
2577 if (ir->op == ir_tex || ir->op == ir_txl) {
2578 int mrf, writemask;
2579 if (brw->gen >= 5) {
2580 mrf = param_base + 1;
2581 if (ir->shadow_comparitor) {
2582 writemask = WRITEMASK_Y;
2583 /* mlen already incremented */
2584 } else {
2585 writemask = WRITEMASK_X;
2586 inst->mlen++;
2587 }
2588 } else /* brw->gen == 4 */ {
2589 mrf = param_base;
2590 writemask = WRITEMASK_W;
2591 }
2592 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2593 } else if (ir->op == ir_txf) {
2594 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2595 } else if (ir->op == ir_txf_ms) {
2596 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2597 sample_index));
2598 if (brw->gen >= 7) {
2599 /* MCS data is in the first channel of `mcs`, but we need to get it into
2600 * the .y channel of the second vec4 of params, so replicate .x across
2601 * the whole vec4 and then mask off everything except .y
2602 */
2603 mcs.swizzle = BRW_SWIZZLE_XXXX;
2604 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2605 mcs));
2606 }
2607 inst->mlen++;
2608 } else if (ir->op == ir_txd) {
2609 const glsl_type *type = lod_type;
2610
2611 if (brw->gen >= 5) {
2612 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2613 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2614 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2615 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2616 inst->mlen++;
2617
2618 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2619 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2620 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2621 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2622 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2623 inst->mlen++;
2624
2625 if (ir->shadow_comparitor) {
2626 emit(MOV(dst_reg(MRF, param_base + 2,
2627 ir->shadow_comparitor->type, WRITEMASK_Z),
2628 shadow_comparitor));
2629 }
2630 }
2631 } else /* brw->gen == 4 */ {
2632 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2633 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2634 inst->mlen += 2;
2635 }
2636 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2637 if (ir->shadow_comparitor) {
2638 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2639 shadow_comparitor));
2640 }
2641
2642 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2643 offset_value));
2644 inst->mlen++;
2645 }
2646 }
2647
2648 emit(inst);
2649
2650 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2651 * faces * layers, but the spec requires just layers.
2652 */
2653 if (ir->op == ir_txs) {
2654 glsl_type const *type = ir->sampler->type;
2655 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2656 type->sampler_array) {
2657 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2658 writemask(inst->dst, WRITEMASK_Z),
2659 src_reg(inst->dst), src_reg(6));
2660 }
2661 }
2662
2663 if (brw->gen == 6 && ir->op == ir_tg4) {
2664 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2665 }
2666
2667 swizzle_result(ir, src_reg(inst->dst), sampler);
2668 }
2669
2670 /**
2671 * Apply workarounds for Gen6 gather with UINT/SINT
2672 */
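/* The sampler returns UNORM data for these formats, so rescale by
* (2^width - 1) back to an integer, and for signed formats sign-extend by
* shifting the sign bit up to bit 31 and arithmetic-shifting it back down.
*/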
2673 void
2674 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2675 {
2676 if (!wa)
2677 return;
2678
2679 int width = (wa & WA_8BIT) ? 8 : 16;
2680 dst_reg dst_f = dst;
2681 dst_f.type = BRW_REGISTER_TYPE_F;
2682
2683 /* Convert from UNORM to UINT */
2684 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2685 emit(MOV(dst, src_reg(dst_f)));
2686
2687 if (wa & WA_SIGN) {
2688 /* Reinterpret the UINT value as a signed INT value by
2689 * shifting the sign bit into place, then shifting back
2690 * preserving sign.
2691 */
2692 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2693 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2694 }
2695 }
2696
2697 /**
2698 * Set up the gather channel based on the swizzle, for gather4.
2699 */
2700 uint32_t
2701 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2702 {
2703 ir_constant *chan = ir->lod_info.component->as_constant();
2704 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2705 switch (swiz) {
2706 case SWIZZLE_X: return 0;
2707 case SWIZZLE_Y:
2708 /* gather4 sampler is broken for green channel on RG32F --
2709 * we must ask for blue instead.
2710 */
2711 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2712 return 2;
2713 return 1;
2714 case SWIZZLE_Z: return 2;
2715 case SWIZZLE_W: return 3;
2716 default:
2717 unreachable("Not reached"); /* zero, one swizzles handled already */
2718 }
2719 }
2720
2721 void
2722 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2723 {
2724 int s = key->tex.swizzles[sampler];
2725
2726 this->result = src_reg(this, ir->type);
2727 dst_reg swizzled_result(this->result);
2728
2729 if (ir->op == ir_query_levels) {
2730 /* # levels is in .w */
2731 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2732 emit(MOV(swizzled_result, orig_val));
2733 return;
2734 }
2735
2736 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2737 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2738 emit(MOV(swizzled_result, orig_val));
2739 return;
2740 }
2741
2742
2743 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2744 int swizzle[4] = {0};
2745
2746 for (int i = 0; i < 4; i++) {
2747 switch (GET_SWZ(s, i)) {
2748 case SWIZZLE_ZERO:
2749 zero_mask |= (1 << i);
2750 break;
2751 case SWIZZLE_ONE:
2752 one_mask |= (1 << i);
2753 break;
2754 default:
2755 copy_mask |= (1 << i);
2756 swizzle[i] = GET_SWZ(s, i);
2757 break;
2758 }
2759 }
2760
2761 if (copy_mask) {
2762 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2763 swizzled_result.writemask = copy_mask;
2764 emit(MOV(swizzled_result, orig_val));
2765 }
2766
2767 if (zero_mask) {
2768 swizzled_result.writemask = zero_mask;
2769 emit(MOV(swizzled_result, src_reg(0.0f)));
2770 }
2771
2772 if (one_mask) {
2773 swizzled_result.writemask = one_mask;
2774 emit(MOV(swizzled_result, src_reg(1.0f)));
2775 }
2776 }
2777
2778 void
2779 vec4_visitor::visit(ir_return *)
2780 {
2781 unreachable("not reached");
2782 }
2783
2784 void
2785 vec4_visitor::visit(ir_discard *)
2786 {
2787 unreachable("not reached");
2788 }
2789
2790 void
2791 vec4_visitor::visit(ir_if *ir)
2792 {
2793 /* Don't point the annotation at the if statement, because printing it
2794 * would then dump the condition plus the entire then and else blocks.
2795 */
2796 this->base_ir = ir->condition;
2797
2798 if (brw->gen == 6) {
2799 emit_if_gen6(ir);
2800 } else {
2801 enum brw_predicate predicate;
2802 emit_bool_to_cond_code(ir->condition, &predicate);
2803 emit(IF(predicate));
2804 }
2805
2806 visit_instructions(&ir->then_instructions);
2807
2808 if (!ir->else_instructions.is_empty()) {
2809 this->base_ir = ir->condition;
2810 emit(BRW_OPCODE_ELSE);
2811
2812 visit_instructions(&ir->else_instructions);
2813 }
2814
2815 this->base_ir = ir->condition;
2816 emit(BRW_OPCODE_ENDIF);
2817 }
2818
2819 void
2820 vec4_visitor::visit(ir_emit_vertex *)
2821 {
2822 unreachable("not reached");
2823 }
2824
2825 void
2826 vec4_visitor::visit(ir_end_primitive *)
2827 {
2828 unreachable("not reached");
2829 }
2830
2831 void
2832 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2833 dst_reg dst, src_reg offset,
2834 src_reg src0, src_reg src1)
2835 {
2836 unsigned mlen = 0;
2837
2838 /* Set the atomic operation offset. */
2839 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2840 mlen++;
2841
2842 /* Set the atomic operation arguments. */
2843 if (src0.file != BAD_FILE) {
2844 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2845 mlen++;
2846 }
2847
2848 if (src1.file != BAD_FILE) {
2849 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2850 mlen++;
2851 }
2852
2853 /* Emit the instruction. Note that this maps to the normal SIMD8
2854 * untyped atomic message on Ivy Bridge, but that's OK because
2855 * unused channels will be masked out.
2856 */
2857 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2858 src_reg(atomic_op), src_reg(surf_index));
2859 inst->base_mrf = 0;
2860 inst->mlen = mlen;
2861 }
2862
2863 void
2864 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2865 src_reg offset)
2866 {
2867 /* Set the surface read offset. */
2868 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2869
2870 /* Emit the instruction. Note that this maps to the normal SIMD8
2871 * untyped surface read message, but that's OK because unused
2872 * channels will be masked out.
2873 */
2874 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2875 dst, src_reg(surf_index));
2876 inst->base_mrf = 0;
2877 inst->mlen = 1;
2878 }
2879
2880 void
2881 vec4_visitor::emit_ndc_computation()
2882 {
2883 /* Get the position */
2884 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2885
2886 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2887 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2888 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2889
2890 current_annotation = "NDC";
2891 dst_reg ndc_w = ndc;
2892 ndc_w.writemask = WRITEMASK_W;
2893 src_reg pos_w = pos;
2894 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2895 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2896
2897 dst_reg ndc_xyz = ndc;
2898 ndc_xyz.writemask = WRITEMASK_XYZ;
2899
2900 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2901 }
2902
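/* Fill out the PSIZ/flags slot of the VUE header. On Gen4-5 this packs
* the point size, user clip flags and the negative-rhw workaround bit into
* one header dword; on Gen6+ the point size, layer and viewport each get
* their own channel of the slot.
*/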
2903 void
2904 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2905 {
2906 if (brw->gen < 6 &&
2907 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2908 key->userclip_active || brw->has_negative_rhw_bug)) {
2909 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2910 dst_reg header1_w = header1;
2911 header1_w.writemask = WRITEMASK_W;
2912
2913 emit(MOV(header1, 0u));
2914
2915 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2916 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2917
2918 current_annotation = "Point size";
2919 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2920 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2921 }
2922
2923 if (key->userclip_active) {
2924 current_annotation = "Clipping flags";
2925 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2926 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2927
2928 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2929 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2930 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2931
2932 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2933 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2934 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2935 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2936 }
2937
2938 /* i965 clipping workaround:
2939 * 1) Test for -ve rhw
2940 * 2) If set,
2941 * set ndc = (0,0,0,0)
2942 * set ucp[6] = 1
2943 *
2944 * Later, clipping will detect ucp[6] and ensure the primitive is
2945 * clipped against all fixed planes.
2946 */
2947 if (brw->has_negative_rhw_bug) {
2948 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2949 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2950 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2951 vec4_instruction *inst;
2952 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2953 inst->predicate = BRW_PREDICATE_NORMAL;
2954 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2955 inst->predicate = BRW_PREDICATE_NORMAL;
2956 }
2957
2958 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2959 } else if (brw->gen < 6) {
2960 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2961 } else {
2962 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2963 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2964 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2965 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2966 }
2967 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2968 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2969 src_reg(output_reg[VARYING_SLOT_LAYER])));
2970 }
2971 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2972 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2973 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2974 }
2975 }
2976 }
2977
2978 void
2979 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2980 {
2981 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2982 *
2983 * "If a linked set of shaders forming the vertex stage contains no
2984 * static write to gl_ClipVertex or gl_ClipDistance, but the
2985 * application has requested clipping against user clip planes through
2986 * the API, then the coordinate written to gl_Position is used for
2987 * comparison against the user clip planes."
2988 *
2989 * This function is only called if the shader didn't write to
2990 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2991 * if the user wrote to it; otherwise we use gl_Position.
2992 */
2993 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2994 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2995 clip_vertex = VARYING_SLOT_POS;
2996 }
2997
2998 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2999 ++i) {
3000 reg.writemask = 1 << i;
3001 emit(DP4(reg,
3002 src_reg(output_reg[clip_vertex]),
3003 src_reg(this->userplane[i + offset])));
3004 }
3005 }
3006
3007 void
3008 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3009 {
3010 assert (varying < VARYING_SLOT_MAX);
3011 reg.type = output_reg[varying].type;
3012 current_annotation = output_reg_annotation[varying];
3013 /* Copy the register, saturating if necessary */
3014 vec4_instruction *inst = emit(MOV(reg,
3015 src_reg(output_reg[varying])));
3016 if ((varying == VARYING_SLOT_COL0 ||
3017 varying == VARYING_SLOT_COL1 ||
3018 varying == VARYING_SLOT_BFC0 ||
3019 varying == VARYING_SLOT_BFC1) &&
3020 key->clamp_vertex_color) {
3021 inst->saturate = true;
3022 }
3023 }
3024
3025 void
3026 vec4_visitor::emit_urb_slot(int mrf, int varying)
3027 {
3028 struct brw_reg hw_reg = brw_message_reg(mrf);
3029 dst_reg reg = dst_reg(MRF, mrf);
3030 reg.type = BRW_REGISTER_TYPE_F;
3031
3032 switch (varying) {
3033 case VARYING_SLOT_PSIZ:
3034 /* PSIZ is always in slot 0, and is coupled with other flags. */
3035 current_annotation = "indices, point width, clip flags";
3036 emit_psiz_and_flags(hw_reg);
3037 break;
3038 case BRW_VARYING_SLOT_NDC:
3039 current_annotation = "NDC";
3040 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3041 break;
3042 case VARYING_SLOT_POS:
3043 current_annotation = "gl_Position";
3044 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3045 break;
3046 case VARYING_SLOT_EDGE:
3047 /* This is present when doing unfilled polygons. We're supposed to copy
3048 * the edge flag from the user-provided vertex array
3049 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3050 * of that attribute (starts as 1.0f). This is then used in clipping to
3051 * determine which edges should be drawn as wireframe.
3052 */
3053 current_annotation = "edge flag";
3054 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3055 glsl_type::float_type, WRITEMASK_XYZW))));
3056 break;
3057 case BRW_VARYING_SLOT_PAD:
3058 /* No need to write to this slot */
3059 break;
3060 default:
3061 emit_generic_urb_slot(reg, varying);
3062 break;
3063 }
3064 }
3065
3066 static int
3067 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3068 {
3069 if (brw->gen >= 6) {
3070 /* URB data written (does not include the message header reg) must
3071 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3072 * section 5.4.3.2.2: URB_INTERLEAVED.
3073 *
3074 * URB entries are allocated on a multiple of 1024 bits, so an
3075 * extra 128 bits written here to make the end align to 256 is
3076 * no problem.
3077 */
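/* mlen counts the message header as well, so the data length is mlen - 1;
* bumping an even mlen to odd (e.g. 4 -> 5) pads the data portion out to
* an even number of registers.
*/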
3078 if ((mlen % 2) != 1)
3079 mlen++;
3080 }
3081
3082 return mlen;
3083 }
3084
3085
3086 /**
3087 * Generates the VUE payload plus the necessary URB write instructions to
3088 * output it.
3089 *
3090 * The VUE layout is documented in Volume 2a.
3091 */
3092 void
3093 vec4_visitor::emit_vertex()
3094 {
3095 /* MRF 0 is reserved for the debugger, so start with message header
3096 * in MRF 1.
3097 */
3098 int base_mrf = 1;
3099 int mrf = base_mrf;
3100 /* In the process of generating our URB write message contents, we
3101 * may need to unspill a register or load from an array. Those
3102 * reads would use MRFs 14-15.
3103 */
3104 int max_usable_mrf = 13;
3105
3106 /* The following assertion verifies that max_usable_mrf causes an
3107 * even-numbered amount of URB write data, which will meet gen6's
3108 * requirements for length alignment.
3109 */
3110 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3111
3112 /* First mrf is the g0-based message header containing URB handles and
3113 * such.
3114 */
3115 emit_urb_write_header(mrf++);
3116
3117 if (brw->gen < 6) {
3118 emit_ndc_computation();
3119 }
3120
3121 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3122 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3123 current_annotation = "user clip distances";
3124
3125 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3126 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3127
3128 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3129 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3130 }
3131
3132 /* We may need to split this up into several URB writes, so do them in a
3133 * loop.
3134 */
3135 int slot = 0;
3136 bool complete = false;
3137 do {
3138 /* URB offset is in URB row increments, and each of our MRFs is half of
3139 * one of those, since we're doing interleaved writes.
3140 */
3141 int offset = slot / 2;
3142
3143 mrf = base_mrf + 1;
3144 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3145 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3146
3147 /* If this was max_usable_mrf, we can't fit anything more into this
3148 * URB WRITE.
3149 */
3150 if (mrf > max_usable_mrf) {
3151 slot++;
3152 break;
3153 }
3154 }
3155
3156 complete = slot >= prog_data->vue_map.num_slots;
3157 current_annotation = "URB write";
3158 vec4_instruction *inst = emit_urb_write_opcode(complete);
3159 inst->base_mrf = base_mrf;
3160 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3161 inst->offset += offset;
3162 } while (!complete);
3163 }
3164
3165
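/* Build the message offset for a scratch access at vec4 index reg_offset,
* optionally plus a relative index. Scratch storage is interleaved like
* vertex data, so the vec4 index is scaled by 2; pre-Gen6 the header takes
* byte offsets, so the total scale becomes 32.
*/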
3166 src_reg
3167 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3168 src_reg *reladdr, int reg_offset)
3169 {
3170 /* Because we store the values to scratch interleaved like our
3171 * vertex data, we need to scale the vec4 index by 2.
3172 */
3173 int message_header_scale = 2;
3174
3175 /* Pre-gen6, the message header uses byte offsets instead of vec4
3176 * (16-byte) offset units.
3177 */
3178 if (brw->gen < 6)
3179 message_header_scale *= 16;
3180
3181 if (reladdr) {
3182 src_reg index = src_reg(this, glsl_type::int_type);
3183
3184 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3185 emit_before(inst, MUL(dst_reg(index),
3186 index, src_reg(message_header_scale)));
3187
3188 return index;
3189 } else {
3190 return src_reg(reg_offset * message_header_scale);
3191 }
3192 }
3193
3194 src_reg
3195 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3196 src_reg *reladdr, int reg_offset)
3197 {
3198 if (reladdr) {
3199 src_reg index = src_reg(this, glsl_type::int_type);
3200
3201 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3202
3203 /* Pre-gen6, the message header uses byte offsets instead of vec4
3204 * (16-byte) offset units.
3205 */
3206 if (brw->gen < 6) {
3207 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3208 }
3209
3210 return index;
3211 } else if (brw->gen >= 8) {
3212 /* Store the offset in a GRF so we can send-from-GRF. */
3213 src_reg offset = src_reg(this, glsl_type::int_type);
3214 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3215 return offset;
3216 } else {
3217 int message_header_scale = brw->gen < 6 ? 16 : 1;
3218 return src_reg(reg_offset * message_header_scale);
3219 }
3220 }
3221
3222 /**
3223 * Emits an instruction before @inst to load the value named by @orig_src
3224 * from scratch space at @base_offset to @temp.
3225 *
3226 * @base_offset is measured in 32-byte units (the size of a register).
3227 */
3228 void
3229 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3230 dst_reg temp, src_reg orig_src,
3231 int base_offset)
3232 {
3233 int reg_offset = base_offset + orig_src.reg_offset;
3234 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3235
3236 emit_before(inst, SCRATCH_READ(temp, index));
3237 }
3238
3239 /**
3240 * Emits an instruction after @inst to store the value to be written
3241 * to @orig_dst to scratch space at @base_offset, from @temp.
3242 *
3243 * @base_offset is measured in 32-byte units (the size of a register).
3244 */
3245 void
3246 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3247 {
3248 int reg_offset = base_offset + inst->dst.reg_offset;
3249 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3250
3251 /* Create a temporary register to store *inst's result in.
3252 *
3253 * We have to be careful in MOVing from our temporary result register in
3254 * the scratch write. If we swizzle from channels of the temporary that
3255 * weren't initialized, it will confuse live interval analysis, which will
3256 * make spilling fail to make progress.
3257 */
3258 src_reg temp = src_reg(this, glsl_type::vec4_type);
3259 temp.type = inst->dst.type;
3260 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3261 int swizzles[4];
3262 for (int i = 0; i < 4; i++)
3263 if (inst->dst.writemask & (1 << i))
3264 swizzles[i] = i;
3265 else
3266 swizzles[i] = first_writemask_chan;
3267 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3268 swizzles[2], swizzles[3]);
3269
3270 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3271 inst->dst.writemask));
3272 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3273 write->predicate = inst->predicate;
3274 write->ir = inst->ir;
3275 write->annotation = inst->annotation;
3276 inst->insert_after(write);
3277
3278 inst->dst.file = temp.file;
3279 inst->dst.reg = temp.reg;
3280 inst->dst.reg_offset = temp.reg_offset;
3281 inst->dst.reladdr = NULL;
3282 }
3283
3284 /**
3285 * We can't generally support array access in GRF space, because a
3286 * single instruction's destination can only span 2 contiguous
3287 * registers. So, we send all GRF arrays that get variable index
3288 * access to scratch space.
3289 */
3290 void
3291 vec4_visitor::move_grf_array_access_to_scratch()
3292 {
3293 int scratch_loc[this->virtual_grf_count];
3294
3295 for (int i = 0; i < this->virtual_grf_count; i++) {
3296 scratch_loc[i] = -1;
3297 }
3298
3299 /* First, calculate the set of virtual GRFs that need to be punted
3300 * to scratch due to having any array access on them, and where in
3301 * scratch.
3302 */
3303 foreach_in_list(vec4_instruction, inst, &instructions) {
3304 if (inst->dst.file == GRF && inst->dst.reladdr &&
3305 scratch_loc[inst->dst.reg] == -1) {
3306 scratch_loc[inst->dst.reg] = c->last_scratch;
3307 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3308 }
3309
3310 for (int i = 0 ; i < 3; i++) {
3311 src_reg *src = &inst->src[i];
3312
3313 if (src->file == GRF && src->reladdr &&
3314 scratch_loc[src->reg] == -1) {
3315 scratch_loc[src->reg] = c->last_scratch;
3316 c->last_scratch += this->virtual_grf_sizes[src->reg];
3317 }
3318 }
3319 }
3320
3321 /* Now, for anything that will be accessed through scratch, rewrite
3322 * it to load/store. Note that this is a _safe list walk, because
3323 * we may generate a new scratch_write instruction after the one
3324 * we're processing.
3325 */
3326 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3327 /* Set up the annotation tracking for newly generated instructions. */
3328 base_ir = inst->ir;
3329 current_annotation = inst->annotation;
3330
3331 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3332 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3333 }
3334
3335 for (int i = 0 ; i < 3; i++) {
3336 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3337 continue;
3338
3339 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3340
3341 emit_scratch_read(inst, temp, inst->src[i],
3342 scratch_loc[inst->src[i].reg]);
3343
3344 inst->src[i].file = temp.file;
3345 inst->src[i].reg = temp.reg;
3346 inst->src[i].reg_offset = temp.reg_offset;
3347 inst->src[i].reladdr = NULL;
3348 }
3349 }
3350 }
3351
3352 /**
3353 * Emits an instruction before @inst to load the value named by @orig_src
3354 * from the pull constant buffer (surface) at @base_offset to @temp.
3355 */
3356 void
3357 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3358 dst_reg temp, src_reg orig_src,
3359 int base_offset)
3360 {
3361 int reg_offset = base_offset + orig_src.reg_offset;
3362 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3363 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3364 vec4_instruction *load;
3365
3366 if (brw->gen >= 7) {
3367 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3368 grf_offset.type = offset.type;
3369 emit_before(inst, MOV(grf_offset, offset));
3370
3371 load = new(mem_ctx) vec4_instruction(this,
3372 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3373 temp, index, src_reg(grf_offset));
3374 } else {
3375 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3376 temp, index, offset);
3377 load->base_mrf = 14;
3378 load->mlen = 1;
3379 }
3380 emit_before(inst, load);
3381 }
3382
3383 /**
3384 * Implements array access of uniforms by inserting a
3385 * PULL_CONSTANT_LOAD instruction.
3386 *
3387 * Unlike temporary GRF array access (where we don't support it due to
3388 * the difficulty of doing relative addressing on instruction
3389 * destinations), we could potentially do array access of uniforms
3390 * that were loaded in GRF space as push constants. In real-world
3391 * usage we've seen, though, the arrays being used are always larger
3392 * than we could load as push constants, so just always move all
3393 * uniform array access out to a pull constant buffer.
3394 */
3395 void
3396 vec4_visitor::move_uniform_array_access_to_pull_constants()
3397 {
3398 int pull_constant_loc[this->uniforms];
3399
3400 for (int i = 0; i < this->uniforms; i++) {
3401 pull_constant_loc[i] = -1;
3402 }
3403
3404 /* Walk through and find array access of uniforms. Put a copy of that
3405 * uniform in the pull constant buffer.
3406 *
3407 * Note that we don't move constant-indexed accesses to arrays. No
3408 * testing has been done of the performance impact of this choice.
3409 */
3410 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3411 for (int i = 0 ; i < 3; i++) {
3412 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3413 continue;
3414
3415 int uniform = inst->src[i].reg;
3416
3417 /* If this array isn't already present in the pull constant buffer,
3418 * add it.
3419 */
3420 if (pull_constant_loc[uniform] == -1) {
3421 const gl_constant_value **values =
3422 &stage_prog_data->param[uniform * 4];
3423
3424 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3425
3426 assert(uniform < uniform_array_size);
3427 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3428 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3429 = values[j];
3430 }
3431 }
3432
3433 /* Set up the annotation tracking for newly generated instructions. */
3434 base_ir = inst->ir;
3435 current_annotation = inst->annotation;
3436
3437 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3438
3439 emit_pull_constant_load(inst, temp, inst->src[i],
3440 pull_constant_loc[uniform]);
3441
3442 inst->src[i].file = temp.file;
3443 inst->src[i].reg = temp.reg;
3444 inst->src[i].reg_offset = temp.reg_offset;
3445 inst->src[i].reladdr = NULL;
3446 }
3447 }
3448
3449 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3450 * no need to track them as larger-than-vec4 objects. This will be
3451 * relied on in cutting out unused uniform vectors from push
3452 * constants.
3453 */
3454 split_uniform_registers();
3455 }
3456
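/* A negate modifier on a UD source is resolved by MOVing the negated
* value into a uvec4 temporary, so later uses see a plain source without
* the modifier.
*/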
3457 void
3458 vec4_visitor::resolve_ud_negate(src_reg *reg)
3459 {
3460 if (reg->type != BRW_REGISTER_TYPE_UD ||
3461 !reg->negate)
3462 return;
3463
3464 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3465 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3466 *reg = temp;
3467 }
3468
3469 vec4_visitor::vec4_visitor(struct brw_context *brw,
3470 struct brw_vec4_compile *c,
3471 struct gl_program *prog,
3472 const struct brw_vec4_prog_key *key,
3473 struct brw_vec4_prog_data *prog_data,
3474 struct gl_shader_program *shader_prog,
3475 gl_shader_stage stage,
3476 void *mem_ctx,
3477 bool debug_flag,
3478 bool no_spills,
3479 shader_time_shader_type st_base,
3480 shader_time_shader_type st_written,
3481 shader_time_shader_type st_reset)
3482 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3483 c(c),
3484 key(key),
3485 prog_data(prog_data),
3486 sanity_param_count(0),
3487 fail_msg(NULL),
3488 first_non_payload_grf(0),
3489 need_all_constants_in_pull_buffer(false),
3490 debug_flag(debug_flag),
3491 no_spills(no_spills),
3492 st_base(st_base),
3493 st_written(st_written),
3494 st_reset(st_reset)
3495 {
3496 this->mem_ctx = mem_ctx;
3497 this->failed = false;
3498
3499 this->base_ir = NULL;
3500 this->current_annotation = NULL;
3501 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3502
3503 this->variable_ht = hash_table_ctor(0,
3504 hash_table_pointer_hash,
3505 hash_table_pointer_compare);
3506
3507 this->virtual_grf_start = NULL;
3508 this->virtual_grf_end = NULL;
3509 this->virtual_grf_sizes = NULL;
3510 this->virtual_grf_count = 0;
3511 this->virtual_grf_reg_map = NULL;
3512 this->virtual_grf_reg_count = 0;
3513 this->virtual_grf_array_size = 0;
3514 this->live_intervals_valid = false;
3515
3516 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3517
3518 this->uniforms = 0;
3519
3520 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3521 * at least one. See setup_uniforms() in brw_vec4.cpp.
3522 */
3523 this->uniform_array_size = 1;
3524 if (prog_data) {
3525 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3526 }
3527
3528 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3529 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3530 }
3531
3532 vec4_visitor::~vec4_visitor()
3533 {
3534 hash_table_dtor(this->variable_ht);
3535 }
3536
3537
3538 void
3539 vec4_visitor::fail(const char *format, ...)
3540 {
3541 va_list va;
3542 char *msg;
3543
3544 if (failed)
3545 return;
3546
3547 failed = true;
3548
3549 va_start(va, format);
3550 msg = ralloc_vasprintf(mem_ctx, format, va);
3551 va_end(va);
3552 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3553
3554 this->fail_msg = msg;
3555
3556 if (debug_flag) {
3557 fprintf(stderr, "%s", msg);
3558 }
3559 }
3560
3561 } /* namespace brw */