i965/vec4: Emit shader w/a for Gen6 gather
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
/* Construct a vec4 IR instruction.  All bookkeeping fields are reset to
 * benign defaults; the IR node and annotation are copied from the visitor's
 * current state so debug output can attribute this instruction to its
 * source.
 */
vec4_instruction::vec4_instruction(vec4_visitor *v,
                                   enum opcode opcode, dst_reg dst,
                                   src_reg src0, src_reg src1, src_reg src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->sampler = 0;
   this->texture_offset = 0;
   this->target = 0;
   this->shadow_compare = false;
   /* Inherit debug/annotation context from whatever the visitor is
    * currently translating.
    */
   this->ir = v->base_ir;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_present = false;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->annotation = v->current_annotation;
}
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
/* Helper macros that define vec4_visitor factory methods for simple one-,
 * two-, and three-source ALU opcodes.  Note that these construct the
 * instruction but do NOT emit it; callers pass the result to emit().
 */
#define ALU1(op)							\
   vec4_instruction *							\
   vec4_visitor::op(dst_reg dst, src_reg src0)				\
   {									\
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
					   src0);			\
   }

#define ALU2(op)							\
   vec4_instruction *							\
   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
   {									\
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
					   src0, src1);			\
   }

#define ALU3(op)							\
   vec4_instruction *							\
   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
   {									\
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
					   src0, src1, src2);		\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2(ADDC)
ALU2(SUBB)
166
167 /** Gen4 predicated IF. */
168 vec4_instruction *
169 vec4_visitor::IF(uint32_t predicate)
170 {
171 vec4_instruction *inst;
172
173 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
174 inst->predicate = predicate;
175
176 return inst;
177 }
178
179 /** Gen6 IF with embedded comparison. */
180 vec4_instruction *
181 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
182 {
183 assert(brw->gen == 6);
184
185 vec4_instruction *inst;
186
187 resolve_ud_negate(&src0);
188 resolve_ud_negate(&src1);
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
191 src0, src1);
192 inst->conditional_mod = condition;
193
194 return inst;
195 }
196
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
{
   vec4_instruction *inst;

   /* Original gen4 does type conversion to the destination type
    * before comparison, producing garbage results for floating
    * point comparisons.  Retype the destination to the source type so the
    * comparison happens in the right type.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
	 dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
225
226 vec4_instruction *
227 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
228 {
229 vec4_instruction *inst;
230
231 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
232 dst, index);
233 inst->base_mrf = 14;
234 inst->mlen = 2;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
245 dst, src, index);
246 inst->base_mrf = 13;
247 inst->mlen = 3;
248
249 return inst;
250 }
251
252 void
253 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
254 {
255 static enum opcode dot_opcodes[] = {
256 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
257 };
258
259 emit(dot_opcodes[elements - 2], dst, src0, src1);
260 }
261
262 src_reg
263 vec4_visitor::fix_3src_operand(src_reg src)
264 {
265 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
266 * able to use vertical stride of zero to replicate the vec4 uniform, like
267 *
268 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
269 *
270 * But you can't, since vertical stride is always four in three-source
271 * instructions. Instead, insert a MOV instruction to do the replication so
272 * that the three-source instruction can consume it.
273 */
274
275 /* The MOV is only needed if the source is a uniform or immediate. */
276 if (src.file != UNIFORM && src.file != IMM)
277 return src;
278
279 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
280 expanded.type = src.type;
281 emit(MOV(expanded, src));
282 return src_reg(expanded);
283 }
284
285 src_reg
286 vec4_visitor::fix_math_operand(src_reg src)
287 {
288 /* The gen6 math instruction ignores the source modifiers --
289 * swizzle, abs, negate, and at least some parts of the register
290 * region description.
291 *
292 * Rather than trying to enumerate all these cases, *always* expand the
293 * operand to a temp GRF for gen6.
294 *
295 * For gen7, keep the operand as-is, except if immediate, which gen7 still
296 * can't use.
297 */
298
299 if (brw->gen == 7 && src.file != IMM)
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(MOV(expanded, src));
305 return src_reg(expanded);
306 }
307
308 void
309 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
310 {
311 src = fix_math_operand(src);
312
313 if (dst.writemask != WRITEMASK_XYZW) {
314 /* The gen6 math instruction must be align1, so we can't do
315 * writemasks.
316 */
317 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
318
319 emit(opcode, temp_dst, src);
320
321 emit(MOV(dst, src_reg(temp_dst)));
322 } else {
323 emit(opcode, dst, src);
324 }
325 }
326
327 void
328 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
329 {
330 vec4_instruction *inst = emit(opcode, dst, src);
331 inst->base_mrf = 1;
332 inst->mlen = 1;
333 }
334
335 void
336 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
337 {
338 switch (opcode) {
339 case SHADER_OPCODE_RCP:
340 case SHADER_OPCODE_RSQ:
341 case SHADER_OPCODE_SQRT:
342 case SHADER_OPCODE_EXP2:
343 case SHADER_OPCODE_LOG2:
344 case SHADER_OPCODE_SIN:
345 case SHADER_OPCODE_COS:
346 break;
347 default:
348 assert(!"not reached: bad math opcode");
349 return;
350 }
351
352 if (brw->gen >= 6) {
353 return emit_math1_gen6(opcode, dst, src);
354 } else {
355 return emit_math1_gen4(opcode, dst, src);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen6(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 src0 = fix_math_operand(src0);
364 src1 = fix_math_operand(src1);
365
366 if (dst.writemask != WRITEMASK_XYZW) {
367 /* The gen6 math instruction must be align1, so we can't do
368 * writemasks.
369 */
370 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
371 temp_dst.type = dst.type;
372
373 emit(opcode, temp_dst, src0, src1);
374
375 emit(MOV(dst, src_reg(temp_dst)));
376 } else {
377 emit(opcode, dst, src0, src1);
378 }
379 }
380
381 void
382 vec4_visitor::emit_math2_gen4(enum opcode opcode,
383 dst_reg dst, src_reg src0, src_reg src1)
384 {
385 vec4_instruction *inst = emit(opcode, dst, src0, src1);
386 inst->base_mrf = 1;
387 inst->mlen = 2;
388 }
389
390 void
391 vec4_visitor::emit_math(enum opcode opcode,
392 dst_reg dst, src_reg src0, src_reg src1)
393 {
394 switch (opcode) {
395 case SHADER_OPCODE_POW:
396 case SHADER_OPCODE_INT_QUOTIENT:
397 case SHADER_OPCODE_INT_REMAINDER:
398 break;
399 default:
400 assert(!"not reached: unsupported binary math opcode");
401 return;
402 }
403
404 if (brw->gen >= 6) {
405 return emit_math2_gen6(opcode, dst, src0, src1);
406 } else {
407 return emit_math2_gen4(opcode, dst, src0, src1);
408 }
409 }
410
/* Lower ir_unop_pack_half_2x16 into F32TO16 plus shift/or, relying on the
 * (undocumented) gen7 behavior described below.
 */
void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   /* NOTE(review): this guard is assert-only; in a release (NDEBUG) build
    * a pre-gen7 caller would fall through — presumably unreachable because
    * the operation is lowered earlier for those parts; confirm.
    */
   if (brw->gen < 7)
      assert(!"ir_unop_pack_half_2x16 should be lowered");

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty.  Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, src_reg(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z         y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = SWIZZLE_Y;
   emit(SHL(dst, tmp_src, src_reg(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = SWIZZLE_X;
   emit(OR(dst, src_reg(dst), tmp_src));
}
486
/* Lower ir_unop_unpack_half_2x16 into two masked/shifted halves followed by
 * an F16TO32 conversion in align16 mode.
 */
void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   /* NOTE(review): assert-only guard; in NDEBUG builds a pre-gen7 caller
    * would fall through — presumably unreachable because the operation is
    * lowered earlier for those parts; confirm.
    */
   if (brw->gen < 7)
      assert(!"ir_unop_unpack_half_2x16 should be lowered");

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   /* tmp.x = low half, tmp.y = high half of the packed input. */
   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, src_reg(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, src_reg(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}
523
524 void
525 vec4_visitor::visit_instructions(const exec_list *list)
526 {
527 foreach_list(node, list) {
528 ir_instruction *ir = (ir_instruction *)node;
529
530 base_ir = ir;
531 ir->accept(this);
532 }
533 }
534
535
536 static int
537 type_size(const struct glsl_type *type)
538 {
539 unsigned int i;
540 int size;
541
542 switch (type->base_type) {
543 case GLSL_TYPE_UINT:
544 case GLSL_TYPE_INT:
545 case GLSL_TYPE_FLOAT:
546 case GLSL_TYPE_BOOL:
547 if (type->is_matrix()) {
548 return type->matrix_columns;
549 } else {
550 /* Regardless of size of vector, it gets a vec4. This is bad
551 * packing for things like floats, but otherwise arrays become a
552 * mess. Hopefully a later pass over the code can pack scalars
553 * down if appropriate.
554 */
555 return 1;
556 }
557 case GLSL_TYPE_ARRAY:
558 assert(type->length > 0);
559 return type_size(type->fields.array) * type->length;
560 case GLSL_TYPE_STRUCT:
561 size = 0;
562 for (i = 0; i < type->length; i++) {
563 size += type_size(type->fields.structure[i].type);
564 }
565 return size;
566 case GLSL_TYPE_SAMPLER:
567 /* Samplers take up one slot in UNIFORMS[], but they're baked in
568 * at link time.
569 */
570 return 1;
571 case GLSL_TYPE_ATOMIC_UINT:
572 return 0;
573 case GLSL_TYPE_VOID:
574 case GLSL_TYPE_ERROR:
575 case GLSL_TYPE_INTERFACE:
576 assert(0);
577 break;
578 }
579
580 return 0;
581 }
582
583 int
584 vec4_visitor::virtual_grf_alloc(int size)
585 {
586 if (virtual_grf_array_size <= virtual_grf_count) {
587 if (virtual_grf_array_size == 0)
588 virtual_grf_array_size = 16;
589 else
590 virtual_grf_array_size *= 2;
591 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
592 virtual_grf_array_size);
593 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
594 virtual_grf_array_size);
595 }
596 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
597 virtual_grf_reg_count += size;
598 virtual_grf_sizes[virtual_grf_count] = size;
599 return virtual_grf_count++;
600 }
601
602 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
603 {
604 init();
605
606 this->file = GRF;
607 this->reg = v->virtual_grf_alloc(type_size(type));
608
609 if (type->is_array() || type->is_record()) {
610 this->swizzle = BRW_SWIZZLE_NOOP;
611 } else {
612 this->swizzle = swizzle_for_size(type->vector_elements);
613 }
614
615 this->type = brw_type_for_base_type(type);
616 }
617
618 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
619 {
620 init();
621
622 this->file = GRF;
623 this->reg = v->virtual_grf_alloc(type_size(type));
624
625 if (type->is_array() || type->is_record()) {
626 this->writemask = WRITEMASK_XYZW;
627 } else {
628 this->writemask = (1 << type->vector_elements) - 1;
629 }
630
631 this->type = brw_type_for_base_type(type);
632 }
633
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
vec4_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      /* Match "name" exactly, or "name." / "name[" prefixes (struct fields
       * and array elements of this uniform); skip everything else.
       */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
	  (storage->name[namelen] != 0 &&
	   storage->name[namelen] != '.' &&
	   storage->name[namelen] != '[')) {
	 continue;
      }

      gl_constant_value *components = storage->storage;
      /* One vec4 slot per array element per matrix column. */
      unsigned vector_count = (MAX2(storage->array_elements, 1) *
			       storage->type->matrix_columns);

      for (unsigned s = 0; s < vector_count; s++) {
	 uniform_vector_size[uniforms] = storage->type->vector_elements;

	 int i;
	 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
	    prog_data->param[uniforms * 4 + i] = &components->f;
	    components++;
	 }
	 /* Pad unused trailing channels with a pointer to a shared zero so
	  * every slot of param[] is valid to dereference.
	  */
	 for (; i < 4; i++) {
	    static float zero = 0;
	    prog_data->param[uniforms * 4 + i] = &zero;
	 }

	 uniforms++;
      }
   }
}
681
682 void
683 vec4_visitor::setup_uniform_clipplane_values()
684 {
685 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
686
687 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
688 this->uniform_vector_size[this->uniforms] = 4;
689 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
690 this->userplane[i].type = BRW_REGISTER_TYPE_F;
691 for (int j = 0; j < 4; ++j) {
692 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
693 }
694 ++this->uniforms;
695 }
696 }
697
698 /* Our support for builtin uniforms is even scarier than non-builtin.
699 * It sits on top of the PROG_STATE_VAR parameters that are
700 * automatically updated from GL context state.
701 */
702 void
703 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
704 {
705 const ir_state_slot *const slots = ir->state_slots;
706 assert(ir->state_slots != NULL);
707
708 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
709 /* This state reference has already been setup by ir_to_mesa,
710 * but we'll get the same index back here. We can reference
711 * ParameterValues directly, since unlike brw_fs.cpp, we never
712 * add new state references during compile.
713 */
714 int index = _mesa_add_state_reference(this->prog->Parameters,
715 (gl_state_index *)slots[i].tokens);
716 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
717
718 this->uniform_vector_size[this->uniforms] = 0;
719 /* Add each of the unique swizzled channels of the element.
720 * This will end up matching the size of the glsl_type of this field.
721 */
722 int last_swiz = -1;
723 for (unsigned int j = 0; j < 4; j++) {
724 int swiz = GET_SWZ(slots[i].swizzle, j);
725 last_swiz = swiz;
726
727 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
728 if (swiz <= last_swiz)
729 this->uniform_vector_size[this->uniforms]++;
730 }
731 this->uniforms++;
732 }
733 }
734
735 dst_reg *
736 vec4_visitor::variable_storage(ir_variable *var)
737 {
738 return (dst_reg *)hash_table_find(this->variable_ht, var);
739 }
740
/* Emit instructions that evaluate the boolean rvalue @ir into the flag
 * register, and store in *predicate how a subsequent predicated instruction
 * should interpret those flags (NORMAL, or an ALL4H/ANY4H reduction).
 */
void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
{
   ir_expression *expr = ir->as_expression();

   *predicate = BRW_PREDICATE_NORMAL;

   /* If the condition is an expression we recognize, fold the comparison
    * into the flag-setting instruction directly.
    */
   if (expr) {
      src_reg op[2];
      vec4_instruction *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 expr->operands[i]->accept(this);
	 op[i] = this->result;

	 resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 /* Test the low bit and predicate on it being zero. */
	 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(XOR(dst_null_d(), op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(OR(dst_null_d(), op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(AND(dst_null_d(), op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (brw->gen >= 6) {
	    emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
	 } else {
	    inst = emit(MOV(dst_null_f(), op[0]));
	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 }
	 break;

      case ir_unop_i2b:
	 if (brw->gen >= 6) {
	    emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 } else {
	    inst = emit(MOV(dst_null_d(), op[0]));
	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 }
	 break;

      case ir_binop_all_equal:
	 /* All four channels must compare equal. */
	 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
	 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
	 break;

      case ir_binop_any_nequal:
	 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
	 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
	 break;

      case ir_unop_any:
	 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
	 break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
	 emit(CMP(dst_null_d(), op[0], op[1],
		  brw_conditional_for_comparison(expr->operation)));
	 break;

      default:
	 assert(!"not reached");
	 break;
      }
      return;
   }

   /* Fallback: evaluate the rvalue and test its low bit (or whole value on
    * pre-gen6, where a MOV with conditional mod is used instead).
    */
   ir->accept(this);

   resolve_ud_negate(&this->result);

   if (brw->gen >= 6) {
      vec4_instruction *inst = emit(AND(dst_null_d(),
					this->result, src_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
844
/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
vec4_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   /* When the condition is a recognized expression, fold it directly into
    * the IF's embedded comparison instead of materializing a boolean.
    */
   if (expr) {
      src_reg op[2];
      dst_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
	 return;

      case ir_binop_logic_xor:
	 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
	 return;

      case ir_binop_logic_or:
	 /* OR/AND need an explicit combine first; IF only takes two
	  * sources.
	  */
	 temp = dst_reg(this, glsl_type::bool_type);
	 emit(OR(temp, op[0], op[1]));
	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
	 return;

      case ir_binop_logic_and:
	 temp = dst_reg(this, glsl_type::bool_type);
	 emit(AND(temp, op[0], op[1]));
	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
	 return;

      case ir_unop_f2b:
	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 return;

      case ir_unop_i2b:
	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
	 emit(IF(op[0], op[1],
		 brw_conditional_for_comparison(expr->operation)));
	 return;

      case ir_binop_all_equal:
	 /* Vector comparisons need a CMP plus a predicated IF reducing the
	  * per-channel flags (ALL4H = all channels, ANY4H = any channel).
	  */
	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
	 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
	 return;

      case ir_binop_any_nequal:
	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
	 return;

      case ir_unop_any:
	 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
	 return;

      default:
	 assert(!"not reached");
	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 return;
      }
      return;
   }

   /* Otherwise evaluate the condition to a value and test it for nonzero. */
   ir->condition->accept(this);

   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
}
930
931 dst_reg
932 with_writemask(dst_reg const & r, int mask)
933 {
934 dst_reg result = r;
935 result.writemask = mask;
936 return result;
937 }
938
939
/* Allocate storage for an IR variable according to its mode and record the
 * mapping so later dereferences can find it via variable_storage().
 */
void
vec4_visitor::visit(ir_variable *ir)
{
   dst_reg *reg = NULL;

   /* Already assigned storage on a previous visit. */
   if (variable_storage(ir))
      return;

   switch (ir->data.mode) {
   case ir_var_shader_in:
      reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
      break;

   case ir_var_shader_out:
      reg = new(mem_ctx) dst_reg(this, ir->type);

      /* Record one output_reg entry per vec4 slot the type occupies, all
       * referring into the same GRF at successive offsets.
       */
      for (int i = 0; i < type_size(ir->type); i++) {
	 output_reg[ir->data.location + i] = *reg;
	 output_reg[ir->data.location + i].reg_offset = i;
	 output_reg[ir->data.location + i].type =
            brw_type_for_base_type(ir->type->get_scalar_type());
	 output_reg_annotation[ir->data.location + i] = ir->name;
      }
      break;

   case ir_var_auto:
   case ir_var_temporary:
      reg = new(mem_ctx) dst_reg(this, ir->type);
      break;

   case ir_var_uniform:
      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       *
       * Atomic counters take no uniform storage, no need to do
       * anything here.
       */
      if (ir->is_in_uniform_block() || ir->type->contains_atomic())
         return;

      /* Track how big the whole uniform variable is, in case we need to put a
       * copy of its data into pull constants for array access.
       */
      this->uniform_size[this->uniforms] = type_size(ir->type);

      /* "gl_"-prefixed uniforms are driver builtins backed by GL state. */
      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir);
      }
      break;

   case ir_var_system_value:
      reg = make_reg_for_system_value(ir);
      break;

   default:
      assert(!"not reached");
   }

   /* Remember the storage so variable_storage(ir) finds it later. */
   reg->type = brw_type_for_base_type(ir->type);
   hash_table_insert(this->variable_ht, reg, ir);
}
1006
/* Translate a loop into the EU's structured DO/WHILE control flow,
 * visiting the body instructions in between.
 */
void
vec4_visitor::visit(ir_loop *ir)
{
   /* We don't want debugging output to print the whole body of the
    * loop as the annotation.
    */
   this->base_ir = NULL;

   emit(BRW_OPCODE_DO);

   visit_instructions(&ir->body_instructions);

   emit(BRW_OPCODE_WHILE);
}
1021
1022 void
1023 vec4_visitor::visit(ir_loop_jump *ir)
1024 {
1025 switch (ir->mode) {
1026 case ir_loop_jump::jump_break:
1027 emit(BRW_OPCODE_BREAK);
1028 break;
1029 case ir_loop_jump::jump_continue:
1030 emit(BRW_OPCODE_CONTINUE);
1031 break;
1032 }
1033 }
1034
1035
1036 void
1037 vec4_visitor::visit(ir_function_signature *ir)
1038 {
1039 assert(0);
1040 (void)ir;
1041 }
1042
1043 void
1044 vec4_visitor::visit(ir_function *ir)
1045 {
1046 /* Ignore function bodies other than main() -- we shouldn't see calls to
1047 * them since they should all be inlined.
1048 */
1049 if (strcmp(ir->name, "main") == 0) {
1050 const ir_function_signature *sig;
1051 exec_list empty;
1052
1053 sig = ir->matching_signature(NULL, &empty);
1054
1055 assert(sig);
1056
1057 visit_instructions(&sig->body);
1058 }
1059 }
1060
1061 bool
1062 vec4_visitor::try_emit_sat(ir_expression *ir)
1063 {
1064 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1065 if (!sat_src)
1066 return false;
1067
1068 sat_src->accept(this);
1069 src_reg src = this->result;
1070
1071 this->result = src_reg(this, ir->type);
1072 vec4_instruction *inst;
1073 inst = emit(MOV(dst_reg(this->result), src));
1074 inst->saturate = true;
1075
1076 return true;
1077 }
1078
1079 bool
1080 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1081 {
1082 /* 3-src instructions were introduced in gen6. */
1083 if (brw->gen < 6)
1084 return false;
1085
1086 /* MAD can only handle floating-point data. */
1087 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1088 return false;
1089
1090 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1091 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1092
1093 if (!mul || mul->operation != ir_binop_mul)
1094 return false;
1095
1096 nonmul->accept(this);
1097 src_reg src0 = fix_3src_operand(this->result);
1098
1099 mul->operands[0]->accept(this);
1100 src_reg src1 = fix_3src_operand(this->result);
1101
1102 mul->operands[1]->accept(this);
1103 src_reg src2 = fix_3src_operand(this->result);
1104
1105 this->result = src_reg(this, ir->type);
1106 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1107
1108 return true;
1109 }
1110
1111 void
1112 vec4_visitor::emit_bool_comparison(unsigned int op,
1113 dst_reg dst, src_reg src0, src_reg src1)
1114 {
1115 /* original gen4 does destination conversion before comparison. */
1116 if (brw->gen < 5)
1117 dst.type = src0.type;
1118
1119 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1120
1121 dst.type = BRW_REGISTER_TYPE_D;
1122 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1123 }
1124
1125 void
1126 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1127 src_reg src0, src_reg src1)
1128 {
1129 vec4_instruction *inst;
1130
1131 if (brw->gen >= 6) {
1132 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1133 inst->conditional_mod = conditionalmod;
1134 } else {
1135 emit(CMP(dst, src0, src1, conditionalmod));
1136
1137 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1138 inst->predicate = BRW_PREDICATE_NORMAL;
1139 }
1140 }
1141
1142 static bool
1143 is_16bit_constant(ir_rvalue *rvalue)
1144 {
1145 ir_constant *constant = rvalue->as_constant();
1146 if (!constant)
1147 return false;
1148
1149 if (constant->type != glsl_type::int_type &&
1150 constant->type != glsl_type::uint_type)
1151 return false;
1152
1153 return constant->value.u[0] < (1 << 16);
1154 }
1155
/**
 * Emit vec4 instructions for a GLSL IR expression tree node.
 *
 * Each operand is evaluated by recursive accept() and captured from
 * this->result; the big switch then lowers the operation to one or more
 * hardware instructions, writing into a freshly allocated temporary whose
 * writemask is limited to the result's channel count.  Operations that the
 * GLSL IR lowering passes should have removed trip asserts.
 */
void
vec4_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   src_reg op[Elements(ir->operands)];
   src_reg result_src;
   dst_reg result_dst;
   vec4_instruction *inst;

   /* Fold a saturate() wrapper into the inner expression if possible. */
   if (try_emit_sat(ir))
      return;

   /* Fuse mul+add into MAD when one operand of the add is a multiply. */
   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
	 return;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      this->result.file = BAD_FILE;
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 printf("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->print();
	 exit(1);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
   }

   /* NOTE(review): vector_elements is computed here but never read later in
    * this function — looks vestigial; confirm before removing.
    */
   int vector_elements = ir->operands[0]->type->vector_elements;
   if (ir->operands[1]) {
      vector_elements = MAX2(vector_elements,
                             ir->operands[1]->type->vector_elements);
   }

   this->result.file = BAD_FILE;

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = src_reg(this, ir->type);
   /* convenience for the emit functions below. */
   result_dst = dst_reg(result_src);
   /* If nothing special happens, this is the result. */
   this->result = result_src;
   /* Limit writes to the channels that will be used by result_src later.
    * This does limit this temp's use as a temporary for multi-instruction
    * sequences.
    */
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(XOR(result_dst, op[0], src_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      emit(MOV(result_dst, op[0]));
      break;

   case ir_unop_sign:
      if (ir->type->is_float()) {
         /* AND(val, 0x80000000) gives the sign bit.
          *
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
         emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));

         op[0].type = BRW_REGISTER_TYPE_UD;
         result_dst.type = BRW_REGISTER_TYPE_UD;
         emit(AND(result_dst, op[0], src_reg(0x80000000u)));

         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;

         this->result.type = BRW_REGISTER_TYPE_F;
      } else {
         /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
          *               -> non-negative val generates 0x00000000.
          *  Predicated OR sets 1 if val is positive.
          */
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));

         emit(ASR(result_dst, op[0], src_reg(31)));

         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }
      break;

   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
      break;

   case ir_unop_dFdx:
   case ir_unop_dFdy:
      assert(!"derivatives not valid in vertex shader");
      break;

   case ir_unop_bitfield_reverse:
      emit(BFREV(result_dst, op[0]));
      break;
   case ir_unop_bit_count:
      emit(CBIT(result_dst, op[0]));
      break;
   case ir_unop_find_msb: {
      src_reg temp = src_reg(this, glsl_type::uint_type);

      inst = emit(FBH(dst_reg(temp), op[0]));
      inst->dst.writemask = WRITEMASK_XYZW;

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */

      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
      temp.swizzle = BRW_SWIZZLE_NOOP;
      emit(MOV(result_dst, temp));

      src_reg src_tmp = src_reg(result_dst);
      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));

      /* Predicated 31 - result, skipped when FBH returned the error value. */
      src_tmp.negate = true;
      inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }
   case ir_unop_find_lsb:
      emit(FBL(result_dst, op[0]));
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_binop_add:
      emit(ADD(result_dst, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      if (brw->gen < 8 && ir->type->is_integer()) {
	 /* For integer multiplication, the MUL uses the low 16 bits of one of
	  * the operands (src0 through SNB, src1 on IVB and later).  The MACH
	  * accumulates in the contribution of the upper 16 bits of that
	  * operand. If we can determine that one of the args is in the low
	  * 16 bits, though, we can just emit a single MUL.
	  */
	 if (is_16bit_constant(ir->operands[0])) {
	    if (brw->gen < 7)
	       emit(MUL(result_dst, op[0], op[1]));
	    else
	       emit(MUL(result_dst, op[1], op[0]));
	 } else if (is_16bit_constant(ir->operands[1])) {
	    if (brw->gen < 7)
	       emit(MUL(result_dst, op[1], op[0]));
	    else
	       emit(MUL(result_dst, op[0], op[1]));
	 } else {
	    /* Full 32-bit multiply: MUL+MACH pair through the accumulator. */
	    struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);

	    emit(MUL(acc, op[0], op[1]));
	    emit(MACH(dst_null_d(), op[0], op[1]));
	    emit(MOV(result_dst, src_reg(acc)));
	 }
      } else {
	 emit(MUL(result_dst, op[0], op[1]));
      }
      break;
   case ir_binop_imul_high: {
      struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);

      emit(MUL(acc, op[0], op[1]));
      emit(MACH(result_dst, op[0], op[1]));
      break;
   }
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
      break;
   case ir_binop_carry: {
      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(ADDC(dst_null_ud(), op[0], op[1]));
      emit(MOV(result_dst, src_reg(acc)));
      break;
   }
   case ir_binop_borrow: {
      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(SUBB(dst_null_ud(), op[0], op[1]));
      emit(MOV(result_dst, src_reg(acc)));
      break;
   }
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal: {
      /* CMP writes all-ones on true; AND with 1 canonicalizes to 0/1. */
      emit(CMP(result_dst, op[0], op[1],
	       brw_conditional_for_comparison(ir->operation)));
      emit(AND(result_dst, result_src, src_reg(0x1)));
      break;
   }

   case ir_binop_all_equal:
      /* "==" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
	  ir->operands[1]->type->is_vector()) {
	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
	 emit(MOV(result_dst, src_reg(0)));
	 inst = emit(MOV(result_dst, src_reg(1)));
	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      } else {
	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
	 emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;
   case ir_binop_any_nequal:
      /* "!=" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
	  ir->operands[1]->type->is_vector()) {
	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));

	 emit(MOV(result_dst, src_reg(0)));
	 inst = emit(MOV(result_dst, src_reg(1)));
	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      } else {
	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
	 emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;

   case ir_unop_any:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      emit(MOV(result_dst, src_reg(0)));

      inst = emit(MOV(result_dst, src_reg(1)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;

   case ir_binop_logic_xor:
      emit(XOR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(result_dst, op[0], op[1]));
      break;

   case ir_binop_dot:
      assert(ir->operands[0]->type->is_vector());
      assert(ir->operands[0]->type == ir->operands[1]->type);
      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
      break;
   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
      break;

   /* Bitcasts are free: just retype the source and forward it. */
   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_F;
      break;

   case ir_unop_bitcast_f2i:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_D;
      break;

   case ir_unop_bitcast_f2u:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_UD;
      break;

   /* Value conversions are handled by the type-converting MOV. */
   case ir_unop_i2f:
   case ir_unop_i2u:
   case ir_unop_u2i:
   case ir_unop_u2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
   case ir_unop_f2u:
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b: {
      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
      emit(AND(result_dst, result_src, src_reg(1)));
      break;
   }

   case ir_unop_trunc:
      emit(RNDZ(result_dst, op[0]));
      break;
   case ir_unop_ceil:
      /* ceil(x) = -floor(-x): negate in, round down, negate the result. */
      op[0].negate = !op[0].negate;
      inst = emit(RNDD(result_dst, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(RNDD(result_dst, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(FRC(result_dst, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(result_dst, op[0]));
      break;

   case ir_binop_min:
      emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
      break;
   case ir_binop_max:
      emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
      break;

   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(NOT(result_dst, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(AND(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(XOR(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_lshift:
      inst = emit(SHL(result_dst, op[0], op[1]));
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
	 inst = emit(ASR(result_dst, op[0], op[1]));
      else
	 inst = emit(SHR(result_dst, op[0], op[1]));
      break;

   case ir_binop_bfm:
      emit(BFI1(result_dst, op[0], op[1]));
      break;

   case ir_binop_ubo_load: {
      ir_constant *uniform_block = ir->operands[0]->as_constant();
      ir_constant *const_offset_ir = ir->operands[1]->as_constant();
      unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
      src_reg offset;

      /* Now, load the vector from that offset. */
      assert(ir->type->is_vector() || ir->type->is_scalar());

      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
      packed_consts.type = result.type;
      src_reg surf_index =
         src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
      if (const_offset_ir) {
         if (brw->gen >= 8) {
            /* Store the offset in a GRF so we can send-from-GRF. */
            offset = src_reg(this, glsl_type::int_type);
            emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
         } else {
            /* Immediates are fine on older generations since they'll be moved
             * to a (potentially fake) MRF at the generator level.
             */
            offset = src_reg(const_offset / 16);
         }
      } else {
         offset = src_reg(this, glsl_type::uint_type);
         emit(SHR(dst_reg(offset), op[1], src_reg(4)));
      }

      vec4_instruction *pull =
	 emit(new(mem_ctx) vec4_instruction(this,
					    VS_OPCODE_PULL_CONSTANT_LOAD,
					    dst_reg(packed_consts),
					    surf_index,
					    offset));
      pull->base_mrf = 14;
      pull->mlen = 1;

      /* Pick the dwords of the fetched vec4 the scalar/vector lives in. */
      packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4);

      /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
      if (ir->type->base_type == GLSL_TYPE_BOOL) {
	 emit(CMP(result_dst, packed_consts, src_reg(0u),
		  BRW_CONDITIONAL_NZ));
	 emit(AND(result_dst, result, src_reg(0x1)));
      } else {
	 emit(MOV(result_dst, packed_consts));
      }
      break;
   }

   case ir_binop_vector_extract:
      assert(!"should have been lowered by vec_index_to_cond_assign");
      break;

   case ir_triop_fma:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(MAD(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_lrp:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(LRP(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_csel:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case ir_triop_bfi:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      emit(BFI2(result_dst, op[0], op[1], op[2]));
      break;

   case ir_triop_bitfield_extract:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(BFE(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_vector_insert:
      assert(!"should have been lowered by lower_vector_insert");
      break;

   case ir_quadop_bitfield_insert:
      assert(!"not reached: should be handled by "
              "bitfield_insert_to_bfm_bfi\n");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_pack_half_2x16:
      emit_pack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_unpack_half_2x16:
      emit_unpack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
      assert(!"not reached: should be handled by lower_packing_builtins");
      break;
   case ir_unop_unpack_half_2x16_split_x:
   case ir_unop_unpack_half_2x16_split_y:
   case ir_binop_pack_half_2x16_split:
      assert(!"not reached: should not occur in vertex shader");
      break;
   case ir_binop_ldexp:
      assert(!"not reached: should be handled by ldexp_to_arith()");
      break;
   }
}
1696
1697
1698 void
1699 vec4_visitor::visit(ir_swizzle *ir)
1700 {
1701 src_reg src;
1702 int i = 0;
1703 int swizzle[4];
1704
1705 /* Note that this is only swizzles in expressions, not those on the left
1706 * hand side of an assignment, which do write masking. See ir_assignment
1707 * for that.
1708 */
1709
1710 ir->val->accept(this);
1711 src = this->result;
1712 assert(src.file != BAD_FILE);
1713
1714 for (i = 0; i < ir->type->vector_elements; i++) {
1715 switch (i) {
1716 case 0:
1717 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1718 break;
1719 case 1:
1720 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1721 break;
1722 case 2:
1723 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1724 break;
1725 case 3:
1726 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1727 break;
1728 }
1729 }
1730 for (; i < 4; i++) {
1731 /* Replicate the last channel out. */
1732 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1733 }
1734
1735 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1736
1737 this->result = src;
1738 }
1739
1740 void
1741 vec4_visitor::visit(ir_dereference_variable *ir)
1742 {
1743 const struct glsl_type *type = ir->type;
1744 dst_reg *reg = variable_storage(ir->var);
1745
1746 if (!reg) {
1747 fail("Failed to find variable storage for %s\n", ir->var->name);
1748 this->result = src_reg(brw_null_reg());
1749 return;
1750 }
1751
1752 this->result = src_reg(*reg);
1753
1754 /* System values get their swizzle from the dst_reg writemask */
1755 if (ir->var->data.mode == ir_var_system_value)
1756 return;
1757
1758 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1759 this->result.swizzle = swizzle_for_size(type->vector_elements);
1760 }
1761
1762
1763 int
1764 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1765 {
1766 /* Under normal circumstances array elements are stored consecutively, so
1767 * the stride is equal to the size of the array element.
1768 */
1769 return type_size(ir->type);
1770 }
1771
1772
/**
 * Visit an array dereference in rvalue position.
 *
 * A constant index is folded directly into reg_offset; a variable index is
 * materialized as a reladdr register, chained additively onto any reladdr
 * already present on the base.
 */
void
vec4_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   src_reg src;
   int array_stride = compute_array_stride(ir);

   /* Try to fold the index to a compile-time constant first. */
   constant_index = ir->array_index->constant_expression_value();

   ir->array->accept(this);
   src = this->result;

   if (constant_index) {
      src.reg_offset += constant_index->value.i[0] * array_stride;
   } else {
      /* Variable index array dereference.  It eats the "vec4" of the
       * base of the array and an index that offsets the Mesa register
       * index.
       */
      ir->array_index->accept(this);

      src_reg index_reg;

      if (array_stride == 1) {
	 index_reg = this->result;
      } else {
	 /* Scale the element index up to a register offset. */
	 index_reg = src_reg(this, glsl_type::int_type);

	 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
      }

      /* If the base already carried a relative offset, add ours to it. */
      if (src.reladdr) {
	 src_reg temp = src_reg(this, glsl_type::int_type);

	 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));

	 index_reg = temp;
      }

      src.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
      src.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      src.swizzle = BRW_SWIZZLE_NOOP;
   src.type = brw_type_for_base_type(ir->type);

   this->result = src;
}
1825
1826 void
1827 vec4_visitor::visit(ir_dereference_record *ir)
1828 {
1829 unsigned int i;
1830 const glsl_type *struct_type = ir->record->type;
1831 int offset = 0;
1832
1833 ir->record->accept(this);
1834
1835 for (i = 0; i < struct_type->length; i++) {
1836 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1837 break;
1838 offset += type_size(struct_type->fields.structure[i].type);
1839 }
1840
1841 /* If the type is smaller than a vec4, replicate the last channel out. */
1842 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1843 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1844 else
1845 this->result.swizzle = BRW_SWIZZLE_NOOP;
1846 this->result.type = brw_type_for_base_type(ir->type);
1847
1848 this->result.reg_offset += offset;
1849 }
1850
1851 /**
1852 * We want to be careful in assignment setup to hit the actual storage
1853 * instead of potentially using a temporary like we might with the
1854 * ir_dereference handler.
1855 */
1856 static dst_reg
1857 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1858 {
1859 /* The LHS must be a dereference. If the LHS is a variable indexed array
1860 * access of a vector, it must be separated into a series conditional moves
1861 * before reaching this point (see ir_vec_index_to_cond_assign).
1862 */
1863 assert(ir->as_dereference());
1864 ir_dereference_array *deref_array = ir->as_dereference_array();
1865 if (deref_array) {
1866 assert(!deref_array->array->type->is_vector());
1867 }
1868
1869 /* Use the rvalue deref handler for the most part. We'll ignore
1870 * swizzles in it and write swizzles using writemask, though.
1871 */
1872 ir->accept(v);
1873 return dst_reg(v->result);
1874 }
1875
/**
 * Emit MOVs copying a whole aggregate value from *src to *dst.
 *
 * Recurses through structs, arrays, and matrices down to scalar/vector
 * leaves, emitting one (optionally predicated) MOV per vec4-sized chunk.
 * Advances dst->reg_offset and src->reg_offset as it goes, so on return
 * the caller's registers point just past the moved data.
 */
void
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
			      const struct glsl_type *type, uint32_t predicate)
{
   if (type->base_type == GLSL_TYPE_STRUCT) {
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
      }
      return;
   }

   if (type->is_array()) {
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_block_move(dst, src, type->fields.array, predicate);
      }
      return;
   }

   if (type->is_matrix()) {
      /* Move a matrix one float column-vector at a time. */
      const struct glsl_type *vec_type;

      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
					 type->vector_elements, 1);

      for (int i = 0; i < type->matrix_columns; i++) {
	 emit_block_move(dst, src, vec_type, predicate);
      }
      return;
   }

   assert(type->is_scalar() || type->is_vector());

   dst->type = brw_type_for_base_type(type);
   src->type = dst->type;

   /* Only write the channels this leaf type actually has. */
   dst->writemask = (1 << type->vector_elements) - 1;

   src->swizzle = swizzle_for_size(type->vector_elements);

   vec4_instruction *inst = emit(MOV(*dst, *src));
   inst->predicate = predicate;

   dst->reg_offset++;
   src->reg_offset++;
}
1921
1922
/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
/**
 * \param ir            the assignment being lowered
 * \param dst           resolved LHS storage (writemask already set)
 * \param src           the RHS result, with its final swizzle applied
 * \param pre_rhs_inst  last instruction emitted before the RHS was visited
 * \param last_rhs_inst last instruction emitted while visiting the RHS
 * \return true if last_rhs_inst was retargeted at dst (no MOV needed)
 */
bool
vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
				     dst_reg dst,
				     src_reg src,
				     vec4_instruction *pre_rhs_inst,
				     vec4_instruction *last_rhs_inst)
{
   /* This could be supported, but it would take more smarts. */
   if (ir->condition)
      return false;

   if (pre_rhs_inst == last_rhs_inst)
      return false; /* No instructions generated to work with. */

   /* Make sure the last instruction generated our source reg. */
   if (src.file != GRF ||
       src.file != last_rhs_inst->dst.file ||
       src.reg != last_rhs_inst->dst.reg ||
       src.reg_offset != last_rhs_inst->dst.reg_offset ||
       src.reladdr ||
       src.abs ||
       src.negate ||
       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
      return false;

   /* Check that that last instruction fully initialized the channels
    * we want to use, in the order we want to use them.  We could
    * potentially reswizzle the operands of many instructions so that
    * we could handle out of order channels, but don't yet.
    */

   for (unsigned i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
	    return false;

	 if (BRW_GET_SWZ(src.swizzle, i) != i)
	    return false;
      }
   }

   /* Success!  Rewrite the instruction. */
   last_rhs_inst->dst.file = dst.file;
   last_rhs_inst->dst.reg = dst.reg;
   last_rhs_inst->dst.reg_offset = dst.reg_offset;
   last_rhs_inst->dst.reladdr = dst.reladdr;
   /* Narrow the write to only the channels the assignment touches. */
   last_rhs_inst->dst.writemask &= dst.writemask;

   return true;
}
1979
/**
 * Lower an assignment: aggregates go through emit_block_move, while
 * scalars/vectors get their RHS swizzled into the written channels and
 * either folded into the last RHS instruction or copied with predicated
 * MOVs.
 */
void
vec4_visitor::visit(ir_assignment *ir)
{
   dst_reg dst = get_assignment_lhs(ir->lhs, this);
   uint32_t predicate = BRW_PREDICATE_NONE;

   if (!ir->lhs->type->is_scalar() &&
       !ir->lhs->type->is_vector()) {
      /* Aggregate (struct/array/matrix) assignment: block move. */
      ir->rhs->accept(this);
      src_reg src = this->result;

      if (ir->condition) {
	 emit_bool_to_cond_code(ir->condition, &predicate);
      }

      /* emit_block_move doesn't account for swizzles in the source register.
       * This should be ok, since the source register is a structure or an
       * array, and those can't be swizzled.  But double-check to be sure.
       */
      assert(src.swizzle ==
             (ir->rhs->type->is_matrix()
              ? swizzle_for_size(ir->rhs->type->vector_elements)
              : BRW_SWIZZLE_NOOP));

      emit_block_move(&dst, &src, ir->rhs->type, predicate);
      return;
   }

   /* Now we're down to just a scalar/vector with writemasks. */
   int i;

   /* Bracket the RHS visit so we can tell which instructions it emitted. */
   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   ir->rhs->accept(this);

   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   src_reg src = this->result;

   int swizzles[4];
   int first_enabled_chan = 0;
   int src_chan = 0;

   assert(ir->lhs->type->is_vector() ||
	  ir->lhs->type->is_scalar());
   dst.writemask = ir->write_mask;

   /* Remember the source channel feeding the first written channel, to
    * fill the unwritten slots of the swizzle below.
    */
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
	 break;
      }
   }

   /* Swizzle a small RHS vector into the channels being written.
    *
    * glsl ir treats write_mask as dictating how many channels are
    * present on the RHS while in our instructions we need to make
    * those channels appear in the slots of the vec4 they're written to.
    */
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i))
	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
      else
	 swizzles[i] = first_enabled_chan;
   }
   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
			      swizzles[2], swizzles[3]);

   /* If the RHS result can be written straight into dst, we're done. */
   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
      return;
   }

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition, &predicate);
   }

   for (i = 0; i < type_size(ir->lhs->type); i++) {
      vec4_instruction *inst = emit(MOV(dst, src));
      inst->predicate = predicate;

      dst.reg_offset++;
      src.reg_offset++;
   }
}
2066
/**
 * Emit immediate MOVs that materialize the constant *ir into *dst.
 *
 * Recurses through structs, arrays, and matrices; for scalar/vector leaves
 * it coalesces channels that hold the same value into a single writemasked
 * MOV.  Advances dst->reg_offset past each vec4-sized chunk written.
 */
void
vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
{
   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
      foreach_list(node, &ir->components) {
	 ir_constant *field_value = (ir_constant *)node;

	 emit_constant_values(dst, field_value);
      }
      return;
   }

   if (ir->type->is_array()) {
      for (unsigned int i = 0; i < ir->type->length; i++) {
	 emit_constant_values(dst, ir->array_elements[i]);
      }
      return;
   }

   if (ir->type->is_matrix()) {
      /* Matrices are always float; write one column per register. */
      for (int i = 0; i < ir->type->matrix_columns; i++) {
	 float *vec = &ir->value.f[i * ir->type->vector_elements];

	 for (int j = 0; j < ir->type->vector_elements; j++) {
	    dst->writemask = 1 << j;
	    dst->type = BRW_REGISTER_TYPE_F;

	    emit(MOV(*dst, src_reg(vec[j])));
	 }
	 dst->reg_offset++;
      }
      return;
   }

   int remaining_writemask = (1 << ir->type->vector_elements) - 1;

   for (int i = 0; i < ir->type->vector_elements; i++) {
      /* Skip channels already covered by an earlier coalesced MOV. */
      if (!(remaining_writemask & (1 << i)))
	 continue;

      dst->writemask = 1 << i;
      dst->type = brw_type_for_base_type(ir->type);

      /* Find other components that match the one we're about to
       * write.  Emits fewer instructions for things like vec4(0.5,
       * 1.5, 1.5, 1.5).
       */
      for (int j = i + 1; j < ir->type->vector_elements; j++) {
	 if (ir->type->base_type == GLSL_TYPE_BOOL) {
	    if (ir->value.b[i] == ir->value.b[j])
	       dst->writemask |= (1 << j);
	 } else {
	    /* u, i, and f storage all line up, so no need for a
	     * switch case for comparing each type.
	     */
	    if (ir->value.u[i] == ir->value.u[j])
	       dst->writemask |= (1 << j);
	 }
      }

      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
	 emit(MOV(*dst, src_reg(ir->value.f[i])));
	 break;
      case GLSL_TYPE_INT:
	 emit(MOV(*dst, src_reg(ir->value.i[i])));
	 break;
      case GLSL_TYPE_UINT:
	 emit(MOV(*dst, src_reg(ir->value.u[i])));
	 break;
      case GLSL_TYPE_BOOL:
	 emit(MOV(*dst, src_reg(ir->value.b[i])));
	 break;
      default:
	 assert(!"Non-float/uint/int/bool constant");
	 break;
      }

      remaining_writemask &= ~dst->writemask;
   }
   dst->reg_offset++;
}
2149
2150 void
2151 vec4_visitor::visit(ir_constant *ir)
2152 {
2153 dst_reg dst = dst_reg(this, ir->type);
2154 this->result = src_reg(dst);
2155
2156 emit_constant_values(&dst, ir);
2157 }
2158
/**
 * Lower one of the atomic-counter built-in calls to surface messages.
 *
 * The first actual parameter is the counter dereference; its variable
 * carries the binding-table buffer index and byte offset for the counter.
 */
void
vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
{
   ir_dereference *deref = static_cast<ir_dereference *>(
      ir->actual_parameters.get_head());
   ir_variable *location = deref->variable_referenced();
   unsigned surf_index = (prog_data->base.binding_table.abo_start +
                          location->data.atomic.buffer_index);

   /* Calculate the surface offset */
   src_reg offset(this, glsl_type::uint_type);
   ir_dereference_array *deref_array = deref->as_dereference_array();
   if (deref_array) {
      /* Array of counters: offset = index * counter size + base offset. */
      deref_array->array_index->accept(this);

      src_reg tmp(this, glsl_type::uint_type);
      emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
      emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
   } else {
      offset = location->data.atomic.offset;
   }

   /* Emit the appropriate machine instruction */
   const char *callee = ir->callee->function_name();
   dst_reg dst = get_assignment_lhs(ir->return_deref, this);

   if (!strcmp("__intrinsic_atomic_read", callee)) {
      emit_untyped_surface_read(surf_index, dst, offset);

   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
                          src_reg(), src_reg());

   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
                          src_reg(), src_reg());
   }
}
2197
2198 void
2199 vec4_visitor::visit(ir_call *ir)
2200 {
2201 const char *callee = ir->callee->function_name();
2202
2203 if (!strcmp("__intrinsic_atomic_read", callee) ||
2204 !strcmp("__intrinsic_atomic_increment", callee) ||
2205 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2206 visit_atomic_counter_intrinsic(ir);
2207 } else {
2208 assert(!"Unsupported intrinsic.");
2209 }
2210 }
2211
2212 src_reg
2213 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2214 {
2215 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2216 inst->base_mrf = 2;
2217 inst->mlen = 1;
2218 inst->sampler = sampler;
2219 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2220 inst->dst.writemask = WRITEMASK_XYZW;
2221
2222 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2223 int param_base = inst->base_mrf;
2224 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2225 int zero_mask = 0xf & ~coord_mask;
2226
2227 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2228 coordinate));
2229
2230 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2231 src_reg(0)));
2232
2233 emit(inst);
2234 return src_reg(inst->dst);
2235 }
2236
/* Translate an ir_texture expression into a sampler SEND message: evaluate
 * all operand subtrees first, then lay out the message payload in MRFs
 * according to the operation and hardware generation, and finally apply any
 * post-sample fixups (cube-array txs, Gen6 gather w/a, texture swizzle).
 */
void
vec4_visitor::visit(ir_texture *ir)
{
   int sampler =
      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);

   /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
    * emitting anything other than setting up the constant result.
    */
   if (ir->op == ir_tg4) {
      ir_constant *chan = ir->lod_info.component->as_constant();
      int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
         dst_reg result(this, ir->type);
         this->result = src_reg(result);
         emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
         return;
      }
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Should be lowered */
   assert(!ir->offset || !ir->offset->type->is_array());

   /* Generate code to compute all the subexpression trees.  This has to be
    * done before loading any values into MRFs for the sampler message since
    * generating these values may involve SEND messages that need the MRFs.
    */
   src_reg coordinate;
   if (ir->coordinate) {
      ir->coordinate->accept(this);
      coordinate = this->result;
   }

   src_reg shadow_comparitor;
   if (ir->shadow_comparitor) {
      ir->shadow_comparitor->accept(this);
      shadow_comparitor = this->result;
   }

   /* A constant offset is baked into the message header below; only a
    * non-constant offset needs a live register (tg4 w/ offsets).
    */
   bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
   src_reg offset_value;
   if (has_nonconstant_offset) {
      ir->offset->accept(this);
      offset_value = src_reg(this->result);
   }

   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
   src_reg lod, dPdx, dPdy, sample_index, mcs;
   switch (ir->op) {
   case ir_tex:
      /* VS has no implicit derivatives, so plain tex samples LOD 0. */
      lod = src_reg(0.0f);
      lod_type = glsl_type::float_type;
      break;
   case ir_txf:
   case ir_txl:
   case ir_txs:
      ir->lod_info.lod->accept(this);
      lod = this->result;
      lod_type = ir->lod_info.lod->type;
      break;
   case ir_query_levels:
      lod = src_reg(0);
      lod_type = glsl_type::int_type;
      break;
   case ir_txf_ms:
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;
      sample_index_type = ir->lod_info.sample_index->type;

      /* Fetch MCS data only when the surface actually uses CMS layout;
       * otherwise a zero placeholder keeps the payload layout uniform.
       */
      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
         mcs = emit_mcs_fetch(ir, coordinate, sampler);
      else
         mcs = src_reg(0u);
      break;
   case ir_txd:
      ir->lod_info.grad.dPdx->accept(this);
      dPdx = this->result;

      ir->lod_info.grad.dPdy->accept(this);
      dPdy = this->result;

      lod_type = ir->lod_info.grad.dPdx->type;
      break;
   case ir_txb:
   case ir_lod:
   case ir_tg4:
      break;
   }

   /* Select the hardware opcode for the operation. */
   vec4_instruction *inst = NULL;
   switch (ir->op) {
   case ir_tex:
   case ir_txl:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
      break;
   case ir_txd:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
      break;
   case ir_txf:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
      break;
   case ir_txf_ms:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
      break;
   case ir_txs:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
      break;
   case ir_tg4:
      if (has_nonconstant_offset)
         inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
      else
         inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
      break;
   case ir_query_levels:
      /* Level count is returned in the .w channel of a TXS result. */
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
      break;
   case ir_txb:
      assert(!"TXB is not valid for vertex shaders.");
      break;
   case ir_lod:
      assert(!"LOD is not valid for vertex shaders.");
      break;
   default:
      assert(!"Unrecognized tex op");
   }

   /* txf takes its offset in the coordinate itself, not the header. */
   if (ir->offset != NULL && ir->op != ir_txf)
      inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());

   /* Stuff the channel select bits in the top of the texture offset */
   if (ir->op == ir_tg4)
      inst->texture_offset |= gather_channel(ir, sampler) << 16;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    */
   inst->header_present =
      brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
      sampler >= 16;
   inst->base_mrf = 2;
   inst->mlen = inst->header_present + 1; /* always at least one */
   inst->sampler = sampler;
   inst->dst = dst_reg(this, ir->type);
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = ir->shadow_comparitor != NULL;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_present;

   if (ir->op == ir_txs || ir->op == ir_query_levels) {
      /* txs only sends the LOD; its position in the message differs by gen. */
      int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
               coordinate));

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
                  src_reg(0)));
      }
      /* Load the shadow comparitor */
      if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
	 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
			  WRITEMASK_X),
		  shadow_comparitor));
	 inst->mlen++;
      }

      /* Load the LOD info */
      if (ir->op == ir_tex || ir->op == ir_txl) {
	 int mrf, writemask;
	 if (brw->gen >= 5) {
	    mrf = param_base + 1;
	    if (ir->shadow_comparitor) {
	       writemask = WRITEMASK_Y;
	       /* mlen already incremented */
	    } else {
	       writemask = WRITEMASK_X;
	       inst->mlen++;
	    }
	 } else /* brw->gen == 4 */ {
	    mrf = param_base;
	    writemask = WRITEMASK_W;
	 }
	 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
      } else if (ir->op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
      } else if (ir->op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
                  sample_index));
         /* NOTE: the braceless if below guards ONLY the swizzle assignment;
          * the MOV and mlen++ run unconditionally.  On gen < 7 `mcs` is the
          * immediate 0u set above, so writing it into .y is harmless.
          */
         if (brw->gen >= 7)
            /* MCS data is in the first channel of `mcs`, but we need to get it into
             * the .y channel of the second vec4 of params, so replicate .x across
             * the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                  mcs));
         inst->mlen++;
      } else if (ir->op == ir_txd) {
	 const glsl_type *type = lod_type;

	 if (brw->gen >= 5) {
            /* Gen5+ packs the gradients interleaved: (dx.x, dy.x, dx.y, dy.y)
             * in one MRF, with the .z pair in the next MRF when needed.
             */
	    dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
	    dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
	    inst->mlen++;

	    if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
	       dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
	       dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
	       inst->mlen++;

               if (ir->shadow_comparitor) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   ir->shadow_comparitor->type, WRITEMASK_Z),
                           shadow_comparitor));
               }
	    }
	 } else /* brw->gen == 4 */ {
	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
	    inst->mlen += 2;
	 }
      } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
         if (ir->shadow_comparitor) {
            emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
                     shadow_comparitor));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (ir->op == ir_txs) {
      glsl_type const *type = ir->sampler->type;
      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
          type->sampler_array) {
         emit_math(SHADER_OPCODE_INT_QUOTIENT,
                   with_writemask(inst->dst, WRITEMASK_Z),
                   src_reg(inst->dst), src_reg(6));
      }
   }

   /* Gen6 gather of integer formats returns mangled results; fix up in the
    * shader (see emit_gen6_gather_wa).
    */
   if (brw->gen == 6 && ir->op == ir_tg4) {
      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
   }

   swizzle_result(ir, src_reg(inst->dst), sampler);
}
2508
2509 /**
2510 * Apply workarounds for Gen6 gather with UINT/SINT
2511 */
2512 void
2513 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2514 {
2515 if (!wa)
2516 return;
2517
2518 int width = (wa & WA_8BIT) ? 8 : 16;
2519 dst_reg dst_f = dst;
2520 dst_f.type = BRW_REGISTER_TYPE_F;
2521
2522 /* Convert from UNORM to UINT */
2523 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2524 emit(MOV(dst, src_reg(dst_f)));
2525
2526 if (wa & WA_SIGN) {
2527 /* Reinterpret the UINT value as a signed INT value by
2528 * shifting the sign bit into place, then shifting back
2529 * preserving sign.
2530 */
2531 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2532 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2533 }
2534 }
2535
2536 /**
2537 * Set up the gather channel based on the swizzle, for gather4.
2538 */
2539 uint32_t
2540 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2541 {
2542 ir_constant *chan = ir->lod_info.component->as_constant();
2543 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2544 switch (swiz) {
2545 case SWIZZLE_X: return 0;
2546 case SWIZZLE_Y:
2547 /* gather4 sampler is broken for green channel on RG32F --
2548 * we must ask for blue instead.
2549 */
2550 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2551 return 2;
2552 return 1;
2553 case SWIZZLE_Z: return 2;
2554 case SWIZZLE_W: return 3;
2555 default:
2556 assert(!"Not reached"); /* zero, one swizzles handled already */
2557 return 0;
2558 }
2559 }
2560
2561 void
2562 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2563 {
2564 int s = key->tex.swizzles[sampler];
2565
2566 this->result = src_reg(this, ir->type);
2567 dst_reg swizzled_result(this->result);
2568
2569 if (ir->op == ir_query_levels) {
2570 /* # levels is in .w */
2571 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2572 emit(MOV(swizzled_result, orig_val));
2573 return;
2574 }
2575
2576 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2577 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2578 emit(MOV(swizzled_result, orig_val));
2579 return;
2580 }
2581
2582
2583 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2584 int swizzle[4] = {0};
2585
2586 for (int i = 0; i < 4; i++) {
2587 switch (GET_SWZ(s, i)) {
2588 case SWIZZLE_ZERO:
2589 zero_mask |= (1 << i);
2590 break;
2591 case SWIZZLE_ONE:
2592 one_mask |= (1 << i);
2593 break;
2594 default:
2595 copy_mask |= (1 << i);
2596 swizzle[i] = GET_SWZ(s, i);
2597 break;
2598 }
2599 }
2600
2601 if (copy_mask) {
2602 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2603 swizzled_result.writemask = copy_mask;
2604 emit(MOV(swizzled_result, orig_val));
2605 }
2606
2607 if (zero_mask) {
2608 swizzled_result.writemask = zero_mask;
2609 emit(MOV(swizzled_result, src_reg(0.0f)));
2610 }
2611
2612 if (one_mask) {
2613 swizzled_result.writemask = one_mask;
2614 emit(MOV(swizzled_result, src_reg(1.0f)));
2615 }
2616 }
2617
void
vec4_visitor::visit(ir_return *ir)
{
   /* Returns are expected to be lowered out of the IR before this visitor
    * runs, so hitting one here is a lowering bug.
    */
   assert(!"not reached");
}
2623
void
vec4_visitor::visit(ir_discard *ir)
{
   /* discard is a fragment-shader-only construct; it can never appear in
    * the IR handled by this visitor.
    */
   assert(!"not reached");
}
2629
/* Emit structured control flow (IF/ELSE/ENDIF) for an ir_if node, visiting
 * the then/else instruction lists in between.
 */
void
vec4_visitor::visit(ir_if *ir)
{
   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (brw->gen == 6) {
      /* Gen6 has a dedicated IF-with-embedded-compare form. */
      emit_if_gen6(ir);
   } else {
      /* Elsewhere, evaluate the condition into a flag and predicate the IF. */
      uint32_t predicate;
      emit_bool_to_cond_code(ir->condition, &predicate);
      emit(IF(predicate));
   }

   visit_instructions(&ir->then_instructions);

   if (!ir->else_instructions.is_empty()) {
      this->base_ir = ir->condition;
      emit(BRW_OPCODE_ELSE);

      visit_instructions(&ir->else_instructions);
   }

   this->base_ir = ir->condition;
   emit(BRW_OPCODE_ENDIF);
}
2658
void
vec4_visitor::visit(ir_emit_vertex *)
{
   /* EmitVertex() is not supported by this visitor; presumably handled by a
    * geometry-shader subclass override -- confirm against the class hierarchy.
    */
   assert(!"not reached");
}
2664
void
vec4_visitor::visit(ir_end_primitive *)
{
   /* EndPrimitive() is not supported by this visitor; presumably handled by a
    * geometry-shader subclass override -- confirm against the class hierarchy.
    */
   assert(!"not reached");
}
2670
/* Emit an untyped atomic operation on the surface at @surf_index.  The
 * payload is built incrementally in MRFs starting at 0: the offset first,
 * then up to two operands (skipped when their file is BAD_FILE).
 */
void
vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                  dst_reg dst, src_reg offset,
                                  src_reg src0, src_reg src1)
{
   unsigned mlen = 0;

   /* Set the atomic operation offset. */
   emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
   mlen++;

   /* Set the atomic operation arguments. */
   if (src0.file != BAD_FILE) {
      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
      mlen++;
   }

   if (src1.file != BAD_FILE) {
      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
      mlen++;
   }

   /* Emit the instruction.  Note that this maps to the normal SIMD8
    * untyped atomic message on Ivy Bridge, but that's OK because
    * unused channels will be masked out.
    */
   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
                                 src_reg(atomic_op), src_reg(surf_index));
   inst->base_mrf = 0;
   inst->mlen = mlen;
}
2702
2703 void
2704 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2705 src_reg offset)
2706 {
2707 /* Set the surface read offset. */
2708 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2709
2710 /* Emit the instruction. Note that this maps to the normal SIMD8
2711 * untyped surface read message, but that's OK because unused
2712 * channels will be masked out.
2713 */
2714 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2715 dst, src_reg(surf_index));
2716 inst->base_mrf = 0;
2717 inst->mlen = 1;
2718 }
2719
/* Compute the normalized device coordinates from the clip-space position
 * and store them in output_reg[BRW_VARYING_SLOT_NDC]:
 * ndc = (x/w, y/w, z/w, 1/w).
 */
void
vec4_visitor::emit_ndc_computation()
{
   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC] = ndc;

   current_annotation = "NDC";
   /* ndc.w = 1 / pos.w */
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   /* ndc.xyz = pos.xyz * (1 / pos.w), reusing the reciprocal just written
    * to ndc.w.
    */
   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}
2742
/* Fill in the first VUE header register: point size, clip flags, and (on
 * Gen6+) layer/viewport indices.  The layout differs sharply between
 * pre-Gen6 (packed flags word) and Gen6+ (dedicated channels).
 */
void
vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
{
   if (brw->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        key->userclip_active || brw->has_negative_rhw_bug)) {
      /* Pre-Gen6: build the packed header dword in a temporary, then copy
       * it out once at the end.
       */
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, 0u));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);

	 current_annotation = "Point size";
         /* Convert the float point size to the header's fixed-point field:
          * scale by 2^11 and mask to bits [18:8].
          */
	 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
	 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
      }

      if (key->userclip_active) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);

         /* Set a flag bit for each clip distance that came out negative;
          * distances 4..7 are shifted up past the first four.
          */
         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
         emit(SHL(flags1, src_reg(flags1), src_reg(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (brw->gen < 6) {
      /* Pre-Gen6 with nothing to report: zero the header dword. */
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
   } else {
      /* Gen6+: each field lives in its own channel of the header reg. */
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         emit(MOV(brw_writemask(reg, WRITEMASK_W),
                  src_reg(output_reg[VARYING_SLOT_PSIZ])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_LAYER])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
      }
   }
}
2817
/* Compute up to four user clip distances into the channels of @reg, using
 * DP4 of the clip vertex against each enabled user plane.  @offset selects
 * which group of four planes (0 or 4) this call covers.
 */
void
vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
{
   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
    *
    *     "If a linked set of shaders forming the vertex stage contains no
    *     static write to gl_ClipVertex or gl_ClipDistance, but the
    *     application has requested clipping against user clip planes through
    *     the API, then the coordinate written to gl_Position is used for
    *     comparison against the user clip planes."
    *
    * This function is only called if the shader didn't write to
    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
    * if the user wrote to it; otherwise we use gl_Position.
    */
   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
   if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
      clip_vertex = VARYING_SLOT_POS;
   }

   /* One DP4 per enabled plane; each result lands in its own channel. */
   for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
        ++i) {
      reg.writemask = 1 << i;
      emit(DP4(reg,
               src_reg(output_reg[clip_vertex]),
               src_reg(this->userplane[i + offset])));
   }
}
2846
2847 void
2848 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2849 {
2850 assert (varying < VARYING_SLOT_MAX);
2851 reg.type = output_reg[varying].type;
2852 current_annotation = output_reg_annotation[varying];
2853 /* Copy the register, saturating if necessary */
2854 vec4_instruction *inst = emit(MOV(reg,
2855 src_reg(output_reg[varying])));
2856 if ((varying == VARYING_SLOT_COL0 ||
2857 varying == VARYING_SLOT_COL1 ||
2858 varying == VARYING_SLOT_BFC0 ||
2859 varying == VARYING_SLOT_BFC1) &&
2860 key->clamp_vertex_color) {
2861 inst->saturate = true;
2862 }
2863 }
2864
/* Write one VUE slot into MRF @mrf.  A few slots have special encodings
 * (PSIZ header, NDC, position, edge flag, padding); everything else goes
 * through emit_generic_urb_slot().
 */
void
vec4_visitor::emit_urb_slot(int mrf, int varying)
{
   struct brw_reg hw_reg = brw_message_reg(mrf);
   dst_reg reg = dst_reg(MRF, mrf);
   reg.type = BRW_REGISTER_TYPE_F;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(hw_reg);
      break;
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
      break;
   case VARYING_SLOT_EDGE:
      /* This is present when doing unfilled polygons.  We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f).  This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      emit_generic_urb_slot(reg, varying);
      break;
   }
}
2905
2906 static int
2907 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2908 {
2909 if (brw->gen >= 6) {
2910 /* URB data written (does not include the message header reg) must
2911 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2912 * section 5.4.3.2.2: URB_INTERLEAVED.
2913 *
2914 * URB entries are allocated on a multiple of 1024 bits, so an
2915 * extra 128 bits written here to make the end align to 256 is
2916 * no problem.
2917 */
2918 if ((mlen % 2) != 1)
2919 mlen++;
2920 }
2921
2922 return mlen;
2923 }
2924
2925
2926 /**
2927 * Generates the VUE payload plus the necessary URB write instructions to
2928 * output it.
2929 *
2930 * The VUE layout is documented in Volume 2a.
2931 */
2932 void
2933 vec4_visitor::emit_vertex()
2934 {
2935 /* MRF 0 is reserved for the debugger, so start with message header
2936 * in MRF 1.
2937 */
2938 int base_mrf = 1;
2939 int mrf = base_mrf;
2940 /* In the process of generating our URB write message contents, we
2941 * may need to unspill a register or load from an array. Those
2942 * reads would use MRFs 14-15.
2943 */
2944 int max_usable_mrf = 13;
2945
2946 /* The following assertion verifies that max_usable_mrf causes an
2947 * even-numbered amount of URB write data, which will meet gen6's
2948 * requirements for length alignment.
2949 */
2950 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2951
2952 /* First mrf is the g0-based message header containing URB handles and
2953 * such.
2954 */
2955 emit_urb_write_header(mrf++);
2956
2957 if (brw->gen < 6) {
2958 emit_ndc_computation();
2959 }
2960
2961 /* Lower legacy ff and ClipVertex clipping to clip distances */
2962 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2963 current_annotation = "user clip distances";
2964
2965 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2966 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2967
2968 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2969 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2970 }
2971
2972 /* We may need to split this up into several URB writes, so do them in a
2973 * loop.
2974 */
2975 int slot = 0;
2976 bool complete = false;
2977 do {
2978 /* URB offset is in URB row increments, and each of our MRFs is half of
2979 * one of those, since we're doing interleaved writes.
2980 */
2981 int offset = slot / 2;
2982
2983 mrf = base_mrf + 1;
2984 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2985 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2986
2987 /* If this was max_usable_mrf, we can't fit anything more into this
2988 * URB WRITE.
2989 */
2990 if (mrf > max_usable_mrf) {
2991 slot++;
2992 break;
2993 }
2994 }
2995
2996 complete = slot >= prog_data->vue_map.num_slots;
2997 current_annotation = "URB write";
2998 vec4_instruction *inst = emit_urb_write_opcode(complete);
2999 inst->base_mrf = base_mrf;
3000 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3001 inst->offset += offset;
3002 } while(!complete);
3003 }
3004
3005
3006 src_reg
3007 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3008 src_reg *reladdr, int reg_offset)
3009 {
3010 /* Because we store the values to scratch interleaved like our
3011 * vertex data, we need to scale the vec4 index by 2.
3012 */
3013 int message_header_scale = 2;
3014
3015 /* Pre-gen6, the message header uses byte offsets instead of vec4
3016 * (16-byte) offset units.
3017 */
3018 if (brw->gen < 6)
3019 message_header_scale *= 16;
3020
3021 if (reladdr) {
3022 src_reg index = src_reg(this, glsl_type::int_type);
3023
3024 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3025 emit_before(inst, MUL(dst_reg(index),
3026 index, src_reg(message_header_scale)));
3027
3028 return index;
3029 } else {
3030 return src_reg(reg_offset * message_header_scale);
3031 }
3032 }
3033
/* Return the offset operand for a pull-constant load of @reg_offset
 * (optionally indexed by *@reladdr), in the units the message expects for
 * this hardware generation.
 */
src_reg
vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
                                       src_reg *reladdr, int reg_offset)
{
   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));

      /* Pre-gen6, the message header uses byte offsets instead of vec4
       * (16-byte) offset units.
       */
      if (brw->gen < 6) {
         emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
      }

      return index;
   } else if (brw->gen >= 8) {
      /* Store the offset in a GRF so we can send-from-GRF. */
      src_reg offset = src_reg(this, glsl_type::int_type);
      emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
      return offset;
   } else {
      /* Constant offset: same byte-vs-vec4 unit scaling as above. */
      int message_header_scale = brw->gen < 6 ? 16 : 1;
      return src_reg(reg_offset * message_header_scale);
   }
}
3061
3062 /**
3063 * Emits an instruction before @inst to load the value named by @orig_src
3064 * from scratch space at @base_offset to @temp.
3065 *
3066 * @base_offset is measured in 32-byte units (the size of a register).
3067 */
3068 void
3069 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3070 dst_reg temp, src_reg orig_src,
3071 int base_offset)
3072 {
3073 int reg_offset = base_offset + orig_src.reg_offset;
3074 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3075
3076 emit_before(inst, SCRATCH_READ(temp, index));
3077 }
3078
3079 /**
3080 * Emits an instruction after @inst to store the value to be written
3081 * to @orig_dst to scratch space at @base_offset, from @temp.
3082 *
3083 * @base_offset is measured in 32-byte units (the size of a register).
3084 */
3085 void
3086 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3087 {
3088 int reg_offset = base_offset + inst->dst.reg_offset;
3089 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3090
3091 /* Create a temporary register to store *inst's result in.
3092 *
3093 * We have to be careful in MOVing from our temporary result register in
3094 * the scratch write. If we swizzle from channels of the temporary that
3095 * weren't initialized, it will confuse live interval analysis, which will
3096 * make spilling fail to make progress.
3097 */
3098 src_reg temp = src_reg(this, glsl_type::vec4_type);
3099 temp.type = inst->dst.type;
3100 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3101 int swizzles[4];
3102 for (int i = 0; i < 4; i++)
3103 if (inst->dst.writemask & (1 << i))
3104 swizzles[i] = i;
3105 else
3106 swizzles[i] = first_writemask_chan;
3107 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3108 swizzles[2], swizzles[3]);
3109
3110 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3111 inst->dst.writemask));
3112 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3113 write->predicate = inst->predicate;
3114 write->ir = inst->ir;
3115 write->annotation = inst->annotation;
3116 inst->insert_after(write);
3117
3118 inst->dst.file = temp.file;
3119 inst->dst.reg = temp.reg;
3120 inst->dst.reg_offset = temp.reg_offset;
3121 inst->dst.reladdr = NULL;
3122 }
3123
3124 /**
3125 * We can't generally support array access in GRF space, because a
3126 * single instruction's destination can only span 2 contiguous
3127 * registers. So, we send all GRF arrays that get variable index
3128 * access to scratch space.
3129 */
3130 void
3131 vec4_visitor::move_grf_array_access_to_scratch()
3132 {
3133 int scratch_loc[this->virtual_grf_count];
3134
3135 for (int i = 0; i < this->virtual_grf_count; i++) {
3136 scratch_loc[i] = -1;
3137 }
3138
3139 /* First, calculate the set of virtual GRFs that need to be punted
3140 * to scratch due to having any array access on them, and where in
3141 * scratch.
3142 */
3143 foreach_list(node, &this->instructions) {
3144 vec4_instruction *inst = (vec4_instruction *)node;
3145
3146 if (inst->dst.file == GRF && inst->dst.reladdr &&
3147 scratch_loc[inst->dst.reg] == -1) {
3148 scratch_loc[inst->dst.reg] = c->last_scratch;
3149 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3150 }
3151
3152 for (int i = 0 ; i < 3; i++) {
3153 src_reg *src = &inst->src[i];
3154
3155 if (src->file == GRF && src->reladdr &&
3156 scratch_loc[src->reg] == -1) {
3157 scratch_loc[src->reg] = c->last_scratch;
3158 c->last_scratch += this->virtual_grf_sizes[src->reg];
3159 }
3160 }
3161 }
3162
3163 /* Now, for anything that will be accessed through scratch, rewrite
3164 * it to load/store. Note that this is a _safe list walk, because
3165 * we may generate a new scratch_write instruction after the one
3166 * we're processing.
3167 */
3168 foreach_list_safe(node, &this->instructions) {
3169 vec4_instruction *inst = (vec4_instruction *)node;
3170
3171 /* Set up the annotation tracking for new generated instructions. */
3172 base_ir = inst->ir;
3173 current_annotation = inst->annotation;
3174
3175 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3176 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3177 }
3178
3179 for (int i = 0 ; i < 3; i++) {
3180 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3181 continue;
3182
3183 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3184
3185 emit_scratch_read(inst, temp, inst->src[i],
3186 scratch_loc[inst->src[i].reg]);
3187
3188 inst->src[i].file = temp.file;
3189 inst->src[i].reg = temp.reg;
3190 inst->src[i].reg_offset = temp.reg_offset;
3191 inst->src[i].reladdr = NULL;
3192 }
3193 }
3194 }
3195
/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
				      dst_reg temp, src_reg orig_src,
				      int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   /* Binding table index of the pull constant surface. */
   src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
   /* Offset source; includes the reladdr computation when present. */
   src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
   vec4_instruction *load;

   if (brw->gen >= 7) {
      /* The Gen7 opcode takes its offset as a register operand, so
       * materialize it into a GRF first (typed to match the offset).
       */
      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
      grf_offset.type = offset.type;
      emit_before(inst, MOV(grf_offset, offset));

      load = new(mem_ctx) vec4_instruction(this,
                                           VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           temp, index, src_reg(grf_offset));
   } else {
      /* Pre-Gen7 sends the offset through an MRF message payload. */
      load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
                                           temp, index, offset);
      load->base_mrf = 14;
      load->mlen = 1;
   }
   emit_before(inst, load);
}
3226
/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   /* Pull-constant slot (in vec4 units) for each uniform, or -1 if it
    * hasn't been copied to the pull buffer yet.
    */
   int pull_constant_loc[this->uniforms];

   for (int i = 0; i < this->uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            /* Uniform params are stored 4 float pointers per vec4 slot. */
            const float **values = &prog_data->param[uniform * 4];

            pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;

            /* Copy the whole (possibly multi-vec4) uniform array into the
             * pull parameter list.
             */
            for (int j = 0; j < uniform_size[uniform] * 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++]
                  = values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         /* Load the value before @inst, then rewrite the source to read
          * the loaded temporary with no reladdr.
          */
         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}
3300
3301 void
3302 vec4_visitor::resolve_ud_negate(src_reg *reg)
3303 {
3304 if (reg->type != BRW_REGISTER_TYPE_UD ||
3305 !reg->negate)
3306 return;
3307
3308 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3309 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3310 *reg = temp;
3311 }
3312
vec4_visitor::vec4_visitor(struct brw_context *brw,
                           struct brw_vec4_compile *c,
                           struct gl_program *prog,
                           const struct brw_vec4_prog_key *key,
                           struct brw_vec4_prog_data *prog_data,
                           struct gl_shader_program *shader_prog,
                           struct brw_shader *shader,
                           void *mem_ctx,
                           bool debug_flag,
                           bool no_spills,
                           shader_time_shader_type st_base,
                           shader_time_shader_type st_written,
                           shader_time_shader_type st_reset)
   : sanity_param_count(0),
     fail_msg(NULL),
     first_non_payload_grf(0),
     need_all_constants_in_pull_buffer(false),
     debug_flag(debug_flag),
     no_spills(no_spills),
     st_base(st_base),
     st_written(st_written),
     st_reset(st_reset)
{
   this->brw = brw;
   this->ctx = &brw->ctx;
   this->shader_prog = shader_prog;
   this->shader = shader;

   this->mem_ctx = mem_ctx;
   this->failed = false;

   /* Annotation state used to label instructions emitted while visiting
    * the IR (see base_ir/current_annotation updates in the passes above).
    */
   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   this->c = c;
   this->prog = prog;
   this->key = key;
   this->prog_data = prog_data;
   this->stage_prog_data = &prog_data->base;

   /* Pointer-keyed hash table used to track variables; freed in the
    * destructor (it is not ralloc'd off mem_ctx).
    */
   this->variable_ht = hash_table_ctor(0,
				       hash_table_pointer_hash,
				       hash_table_pointer_compare);

   /* Virtual GRF bookkeeping starts empty and is grown as registers are
    * allocated.
    */
   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->virtual_grf_sizes = NULL;
   this->virtual_grf_count = 0;
   this->virtual_grf_reg_map = NULL;
   this->virtual_grf_reg_count = 0;
   this->virtual_grf_array_size = 0;
   this->live_intervals_valid = false;

   /* Gen7+ reserves the top of the GRF file for MRF use, so fewer GRFs
    * are available.  NOTE(review): inferred from the GEN7_MRF_HACK_START
    * name -- confirm against its definition.
    */
   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;
}
3371
vec4_visitor::~vec4_visitor()
{
   /* Free the variable hash table created in the constructor; everything
    * else is ralloc'd and owned by mem_ctx.
    */
   hash_table_dtor(this->variable_ht);
}
3376
3377
3378 void
3379 vec4_visitor::fail(const char *format, ...)
3380 {
3381 va_list va;
3382 char *msg;
3383
3384 if (failed)
3385 return;
3386
3387 failed = true;
3388
3389 va_start(va, format);
3390 msg = ralloc_vasprintf(mem_ctx, format, va);
3391 va_end(va);
3392 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3393
3394 this->fail_msg = msg;
3395
3396 if (debug_flag) {
3397 fprintf(stderr, "%s", msg);
3398 }
3399 }
3400
3401 } /* namespace brw */