16a188fd30e068f09960d10b18fcc131422cc8e6
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU3(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
131 { \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6+ IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen >= 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
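/* Helpers for the gen4 scratch-space access messages. base_mrf and mlen
 * describe the fixed block of message registers each send consumes: two
 * MRFs starting at m14 for reads, three starting at m13 for writes.
 */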
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
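/* Emit a DP2, DP3, or DP4 depending on how many components participate;
 * "elements" must be in the range [2, 4].
 */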
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * That the upper word of each write-channel be 0 is required for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
477 tmp_src.swizzle = SWIZZLE_Y;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
484 tmp_src.swizzle = SWIZZLE_X;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
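/* Return how many vec4 slots a GLSL type occupies in this backend's
 * register and uniform layout.
 */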
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 /* Regardless of size of vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_ATOMIC_UINT:
573 return 0;
574 case GLSL_TYPE_VOID:
575 case GLSL_TYPE_ERROR:
576 case GLSL_TYPE_INTERFACE:
577 assert(0);
578 break;
579 }
580
581 return 0;
582 }
583
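/* Allocate a virtual GRF "size" vec4 registers long, doubling the tracking
 * arrays whenever they fill up, and return its index.
 */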
584 int
585 vec4_visitor::virtual_grf_alloc(int size)
586 {
587 if (virtual_grf_array_size <= virtual_grf_count) {
588 if (virtual_grf_array_size == 0)
589 virtual_grf_array_size = 16;
590 else
591 virtual_grf_array_size *= 2;
592 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
593 virtual_grf_array_size);
594 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
595 virtual_grf_array_size);
596 }
597 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
598 virtual_grf_reg_count += size;
599 virtual_grf_sizes[virtual_grf_count] = size;
600 return virtual_grf_count++;
601 }
602
603 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
604 {
605 init();
606
607 this->file = GRF;
608 this->reg = v->virtual_grf_alloc(type_size(type));
609
610 if (type->is_array() || type->is_record()) {
611 this->swizzle = BRW_SWIZZLE_NOOP;
612 } else {
613 this->swizzle = swizzle_for_size(type->vector_elements);
614 }
615
616 this->type = brw_type_for_base_type(type);
617 }
618
619 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->writemask = WRITEMASK_XYZW;
628 } else {
629 this->writemask = (1 << type->vector_elements) - 1;
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 /* Our support for uniforms is piggy-backed on the struct
636 * gl_program, because that's where the values actually
637 * get stored, rather than in some global gl_shader_program uniform
638 * store.
639 */
640 void
641 vec4_visitor::setup_uniform_values(ir_variable *ir)
642 {
643 int namelen = strlen(ir->name);
644
645 /* The data for our (non-builtin) uniforms is stored in a series of
646 * gl_uniform_driver_storage structs for each subcomponent that
647 * glGetUniformLocation() could name. We know it's been set up in the same
648 * order we'd walk the type, so walk the list of storage and find anything
649 * with our name, or the prefix of a component that starts with our name.
650 */
651 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
652 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
653
654 if (strncmp(ir->name, storage->name, namelen) != 0 ||
655 (storage->name[namelen] != 0 &&
656 storage->name[namelen] != '.' &&
657 storage->name[namelen] != '[')) {
658 continue;
659 }
660
661 gl_constant_value *components = storage->storage;
662 unsigned vector_count = (MAX2(storage->array_elements, 1) *
663 storage->type->matrix_columns);
664
665 for (unsigned s = 0; s < vector_count; s++) {
666 uniform_vector_size[uniforms] = storage->type->vector_elements;
667
668 int i;
669 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
670 prog_data->param[uniforms * 4 + i] = &components->f;
671 components++;
672 }
673 for (; i < 4; i++) {
674 static float zero = 0;
675 prog_data->param[uniforms * 4 + i] = &zero;
676 }
677
678 uniforms++;
679 }
680 }
681 }
682
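/* Upload the active user clip planes as vec4 uniforms, recording the
 * uniform register for each plane in this->userplane[].
 */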
683 void
684 vec4_visitor::setup_uniform_clipplane_values()
685 {
686 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
687
688 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
689 this->uniform_vector_size[this->uniforms] = 4;
690 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
691 this->userplane[i].type = BRW_REGISTER_TYPE_F;
692 for (int j = 0; j < 4; ++j) {
693 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
694 }
695 ++this->uniforms;
696 }
697 }
698
699 /* Our support for builtin uniforms is even scarier than non-builtin.
700 * It sits on top of the PROG_STATE_VAR parameters that are
701 * automatically updated from GL context state.
702 */
703 void
704 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
705 {
706 const ir_state_slot *const slots = ir->state_slots;
707 assert(ir->state_slots != NULL);
708
709 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
710 /* This state reference has already been set up by ir_to_mesa,
711 * but we'll get the same index back here. We can reference
712 * ParameterValues directly, since unlike brw_fs.cpp, we never
713 * add new state references during compile.
714 */
715 int index = _mesa_add_state_reference(this->prog->Parameters,
716 (gl_state_index *)slots[i].tokens);
717 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
718
719 this->uniform_vector_size[this->uniforms] = 0;
720 /* Add each of the unique swizzled channels of the element.
721 * This will end up matching the size of the glsl_type of this field.
722 */
723 int last_swiz = -1;
724 for (unsigned int j = 0; j < 4; j++) {
725 int swiz = GET_SWZ(slots[i].swizzle, j);
726 last_swiz = swiz;
727
728 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
729 if (swiz <= last_swiz)
730 this->uniform_vector_size[this->uniforms]++;
731 }
732 this->uniforms++;
733 }
734 }
735
736 dst_reg *
737 vec4_visitor::variable_storage(ir_variable *var)
738 {
739 return (dst_reg *)hash_table_find(this->variable_ht, var);
740 }
741
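/* Evaluate a boolean rvalue so that the flag register holds its value, and
 * report which predicate the caller should put on the following predicated
 * instruction.
 */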
742 void
743 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
744 {
745 ir_expression *expr = ir->as_expression();
746
747 *predicate = BRW_PREDICATE_NORMAL;
748
749 if (expr) {
750 src_reg op[2];
751 vec4_instruction *inst;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757
758 resolve_ud_negate(&op[i]);
759 }
760
761 switch (expr->operation) {
762 case ir_unop_logic_not:
763 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
764 inst->conditional_mod = BRW_CONDITIONAL_Z;
765 break;
766
767 case ir_binop_logic_xor:
768 inst = emit(XOR(dst_null_d(), op[0], op[1]));
769 inst->conditional_mod = BRW_CONDITIONAL_NZ;
770 break;
771
772 case ir_binop_logic_or:
773 inst = emit(OR(dst_null_d(), op[0], op[1]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 break;
776
777 case ir_binop_logic_and:
778 inst = emit(AND(dst_null_d(), op[0], op[1]));
779 inst->conditional_mod = BRW_CONDITIONAL_NZ;
780 break;
781
782 case ir_unop_f2b:
783 if (brw->gen >= 6) {
784 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
785 } else {
786 inst = emit(MOV(dst_null_f(), op[0]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 }
789 break;
790
791 case ir_unop_i2b:
792 if (brw->gen >= 6) {
793 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
794 } else {
795 inst = emit(MOV(dst_null_d(), op[0]));
796 inst->conditional_mod = BRW_CONDITIONAL_NZ;
797 }
798 break;
799
800 case ir_binop_all_equal:
801 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
802 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
803 break;
804
805 case ir_binop_any_nequal:
806 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
807 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
808 break;
809
810 case ir_unop_any:
811 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
812 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
813 break;
814
815 case ir_binop_greater:
816 case ir_binop_gequal:
817 case ir_binop_less:
818 case ir_binop_lequal:
819 case ir_binop_equal:
820 case ir_binop_nequal:
821 emit(CMP(dst_null_d(), op[0], op[1],
822 brw_conditional_for_comparison(expr->operation)));
823 break;
824
825 default:
826 assert(!"not reached");
827 break;
828 }
829 return;
830 }
831
832 ir->accept(this);
833
834 resolve_ud_negate(&this->result);
835
836 if (brw->gen >= 6) {
837 vec4_instruction *inst = emit(AND(dst_null_d(),
838 this->result, src_reg(1)));
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 } else {
841 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 }
845
846 /**
847 * Emit a gen6 IF statement with the comparison folded into the IF
848 * instruction.
849 */
850 void
851 vec4_visitor::emit_if_gen6(ir_if *ir)
852 {
853 ir_expression *expr = ir->condition->as_expression();
854
855 if (expr) {
856 src_reg op[2];
857 dst_reg temp;
858
859 assert(expr->get_num_operands() <= 2);
860 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
861 expr->operands[i]->accept(this);
862 op[i] = this->result;
863 }
864
865 switch (expr->operation) {
866 case ir_unop_logic_not:
867 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
868 return;
869
870 case ir_binop_logic_xor:
871 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
872 return;
873
874 case ir_binop_logic_or:
875 temp = dst_reg(this, glsl_type::bool_type);
876 emit(OR(temp, op[0], op[1]));
877 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
878 return;
879
880 case ir_binop_logic_and:
881 temp = dst_reg(this, glsl_type::bool_type);
882 emit(AND(temp, op[0], op[1]));
883 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_f2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_unop_i2b:
891 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_greater:
895 case ir_binop_gequal:
896 case ir_binop_less:
897 case ir_binop_lequal:
898 case ir_binop_equal:
899 case ir_binop_nequal:
900 emit(IF(op[0], op[1],
901 brw_conditional_for_comparison(expr->operation)));
902 return;
903
904 case ir_binop_all_equal:
905 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
906 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
907 return;
908
909 case ir_binop_any_nequal:
910 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
911 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
912 return;
913
914 case ir_unop_any:
915 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
917 return;
918
919 default:
920 assert(!"not reached");
921 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
922 return;
923 }
924 return;
925 }
926
927 ir->condition->accept(this);
928
929 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
930 }
931
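/* Convenience helper: return a copy of a destination register with only the
 * given writemask enabled.
 */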
932 dst_reg
933 with_writemask(dst_reg const & r, int mask)
934 {
935 dst_reg result = r;
936 result.writemask = mask;
937 return result;
938 }
939
940
941 void
942 vec4_visitor::visit(ir_variable *ir)
943 {
944 dst_reg *reg = NULL;
945
946 if (variable_storage(ir))
947 return;
948
949 switch (ir->mode) {
950 case ir_var_shader_in:
951 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
952 break;
953
954 case ir_var_shader_out:
955 reg = new(mem_ctx) dst_reg(this, ir->type);
956
957 for (int i = 0; i < type_size(ir->type); i++) {
958 output_reg[ir->location + i] = *reg;
959 output_reg[ir->location + i].reg_offset = i;
960 output_reg[ir->location + i].type =
961 brw_type_for_base_type(ir->type->get_scalar_type());
962 output_reg_annotation[ir->location + i] = ir->name;
963 }
964 break;
965
966 case ir_var_auto:
967 case ir_var_temporary:
968 reg = new(mem_ctx) dst_reg(this, ir->type);
969 break;
970
971 case ir_var_uniform:
972 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
973
974 /* Thanks to the lower_ubo_reference pass, we will see only
975 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
976 * variables, so no need for them to be in variable_ht.
977 */
978 if (ir->is_in_uniform_block())
979 return;
980
981 /* Track how big the whole uniform variable is, in case we need to put a
982 * copy of its data into pull constants for array access.
983 */
984 this->uniform_size[this->uniforms] = type_size(ir->type);
985
986 if (!strncmp(ir->name, "gl_", 3)) {
987 setup_builtin_uniform_values(ir);
988 } else {
989 setup_uniform_values(ir);
990 }
991 break;
992
993 case ir_var_system_value:
994 reg = make_reg_for_system_value(ir);
995 break;
996
997 default:
998 assert(!"not reached");
999 }
1000
1001 reg->type = brw_type_for_base_type(ir->type);
1002 hash_table_insert(this->variable_ht, reg, ir);
1003 }
1004
1005 void
1006 vec4_visitor::visit(ir_loop *ir)
1007 {
1008 dst_reg counter;
1009
1010 /* We don't want debugging output to print the whole body of the
1011 * loop as the annotation.
1012 */
1013 this->base_ir = NULL;
1014
1015 if (ir->counter != NULL) {
1016 this->base_ir = ir->counter;
1017 ir->counter->accept(this);
1018 counter = *(variable_storage(ir->counter));
1019
1020 if (ir->from != NULL) {
1021 this->base_ir = ir->from;
1022 ir->from->accept(this);
1023
1024 emit(MOV(counter, this->result));
1025 }
1026 }
1027
1028 emit(BRW_OPCODE_DO);
1029
1030 if (ir->to) {
1031 this->base_ir = ir->to;
1032 ir->to->accept(this);
1033
1034 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1035 brw_conditional_for_comparison(ir->cmp)));
1036
1037 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1038 inst->predicate = BRW_PREDICATE_NORMAL;
1039 }
1040
1041 visit_instructions(&ir->body_instructions);
1042
1043
1044 if (ir->increment) {
1045 this->base_ir = ir->increment;
1046 ir->increment->accept(this);
1047 emit(ADD(counter, src_reg(counter), this->result));
1048 }
1049
1050 emit(BRW_OPCODE_WHILE);
1051 }
1052
1053 void
1054 vec4_visitor::visit(ir_loop_jump *ir)
1055 {
1056 switch (ir->mode) {
1057 case ir_loop_jump::jump_break:
1058 emit(BRW_OPCODE_BREAK);
1059 break;
1060 case ir_loop_jump::jump_continue:
1061 emit(BRW_OPCODE_CONTINUE);
1062 break;
1063 }
1064 }
1065
1066
1067 void
1068 vec4_visitor::visit(ir_function_signature *ir)
1069 {
1070 assert(0);
1071 (void)ir;
1072 }
1073
1074 void
1075 vec4_visitor::visit(ir_function *ir)
1076 {
1077 /* Ignore function bodies other than main() -- we shouldn't see calls to
1078 * them since they should all be inlined.
1079 */
1080 if (strcmp(ir->name, "main") == 0) {
1081 const ir_function_signature *sig;
1082 exec_list empty;
1083
1084 sig = ir->matching_signature(NULL, &empty);
1085
1086 assert(sig);
1087
1088 visit_instructions(&sig->body);
1089 }
1090 }
1091
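/* If this expression is a saturated rvalue, emit the inner value followed by
 * a saturating MOV into a fresh register and return true so the caller can
 * skip its normal handling.
 */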
1092 bool
1093 vec4_visitor::try_emit_sat(ir_expression *ir)
1094 {
1095 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1096 if (!sat_src)
1097 return false;
1098
1099 sat_src->accept(this);
1100 src_reg src = this->result;
1101
1102 this->result = src_reg(this, ir->type);
1103 vec4_instruction *inst;
1104 inst = emit(MOV(dst_reg(this->result), src));
1105 inst->saturate = true;
1106
1107 return true;
1108 }
1109
1110 bool
1111 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1112 {
1113 /* 3-src instructions were introduced in gen6. */
1114 if (brw->gen < 6)
1115 return false;
1116
1117 /* MAD can only handle floating-point data. */
1118 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1119 return false;
1120
1121 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1122 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1123
1124 if (!mul || mul->operation != ir_binop_mul)
1125 return false;
1126
1127 nonmul->accept(this);
1128 src_reg src0 = fix_3src_operand(this->result);
1129
1130 mul->operands[0]->accept(this);
1131 src_reg src1 = fix_3src_operand(this->result);
1132
1133 mul->operands[1]->accept(this);
1134 src_reg src2 = fix_3src_operand(this->result);
1135
1136 this->result = src_reg(this, ir->type);
1137 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1138
1139 return true;
1140 }
1141
1142 void
1143 vec4_visitor::emit_bool_comparison(unsigned int op,
1144 dst_reg dst, src_reg src0, src_reg src1)
1145 {
1146 /* original gen4 does destination conversion before comparison. */
1147 if (brw->gen < 5)
1148 dst.type = src0.type;
1149
1150 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1151
1152 dst.type = BRW_REGISTER_TYPE_D;
1153 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1154 }
1155
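/* Emit a min/max. Gen6+ folds the comparison into SEL with a conditional
 * mod; gen4/5 need an explicit CMP followed by a predicated SEL.
 */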
1156 void
1157 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1158 src_reg src0, src_reg src1)
1159 {
1160 vec4_instruction *inst;
1161
1162 if (brw->gen >= 6) {
1163 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1164 inst->conditional_mod = conditionalmod;
1165 } else {
1166 emit(CMP(dst, src0, src1, conditionalmod));
1167
1168 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1169 inst->predicate = BRW_PREDICATE_NORMAL;
1170 }
1171 }
1172
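/* Check whether an rvalue is an integer constant that fits in 16 bits, in
 * which case an integer multiply can use a single MUL instead of MUL+MACH.
 */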
1173 static bool
1174 is_16bit_constant(ir_rvalue *rvalue)
1175 {
1176 ir_constant *constant = rvalue->as_constant();
1177 if (!constant)
1178 return false;
1179
1180 if (constant->type != glsl_type::int_type &&
1181 constant->type != glsl_type::uint_type)
1182 return false;
1183
1184 return constant->value.u[0] < (1 << 16);
1185 }
1186
1187 void
1188 vec4_visitor::visit(ir_expression *ir)
1189 {
1190 unsigned int operand;
1191 src_reg op[Elements(ir->operands)];
1192 src_reg result_src;
1193 dst_reg result_dst;
1194 vec4_instruction *inst;
1195
1196 if (try_emit_sat(ir))
1197 return;
1198
1199 if (ir->operation == ir_binop_add) {
1200 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1201 return;
1202 }
1203
1204 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1205 this->result.file = BAD_FILE;
1206 ir->operands[operand]->accept(this);
1207 if (this->result.file == BAD_FILE) {
1208 printf("Failed to get tree for expression operand:\n");
1209 ir->operands[operand]->print();
1210 exit(1);
1211 }
1212 op[operand] = this->result;
1213
1214 /* Matrix expression operands should have been broken down to vector
1215 * operations already.
1216 */
1217 assert(!ir->operands[operand]->type->is_matrix());
1218 }
1219
1220 int vector_elements = ir->operands[0]->type->vector_elements;
1221 if (ir->operands[1]) {
1222 vector_elements = MAX2(vector_elements,
1223 ir->operands[1]->type->vector_elements);
1224 }
1225
1226 this->result.file = BAD_FILE;
1227
1228 /* Storage for our result. Ideally for an assignment we'd be using
1229 * the actual storage for the result here, instead.
1230 */
1231 result_src = src_reg(this, ir->type);
1232 /* convenience for the emit functions below. */
1233 result_dst = dst_reg(result_src);
1234 /* If nothing special happens, this is the result. */
1235 this->result = result_src;
1236 /* Limit writes to the channels that will be used by result_src later.
1237 * This does limit this temp's use as a temporary for multi-instruction
1238 * sequences.
1239 */
1240 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1241
1242 switch (ir->operation) {
1243 case ir_unop_logic_not:
1244 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1245 * the one's complement of the whole register, not just bit 0.
1246 */
1247 emit(XOR(result_dst, op[0], src_reg(1)));
1248 break;
1249 case ir_unop_neg:
1250 op[0].negate = !op[0].negate;
1251 emit(MOV(result_dst, op[0]));
1252 break;
1253 case ir_unop_abs:
1254 op[0].abs = true;
1255 op[0].negate = false;
1256 emit(MOV(result_dst, op[0]));
1257 break;
1258
1259 case ir_unop_sign:
1260 emit(MOV(result_dst, src_reg(0.0f)));
1261
1262 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1263 inst = emit(MOV(result_dst, src_reg(1.0f)));
1264 inst->predicate = BRW_PREDICATE_NORMAL;
1265
1266 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1267 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1268 inst->predicate = BRW_PREDICATE_NORMAL;
1269
1270 break;
1271
1272 case ir_unop_rcp:
1273 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1274 break;
1275
1276 case ir_unop_exp2:
1277 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1278 break;
1279 case ir_unop_log2:
1280 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1281 break;
1282 case ir_unop_exp:
1283 case ir_unop_log:
1284 assert(!"not reached: should be handled by ir_explog_to_explog2");
1285 break;
1286 case ir_unop_sin:
1287 case ir_unop_sin_reduced:
1288 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1289 break;
1290 case ir_unop_cos:
1291 case ir_unop_cos_reduced:
1292 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1293 break;
1294
1295 case ir_unop_dFdx:
1296 case ir_unop_dFdy:
1297 assert(!"derivatives not valid in vertex shader");
1298 break;
1299
1300 case ir_unop_bitfield_reverse:
1301 emit(BFREV(result_dst, op[0]));
1302 break;
1303 case ir_unop_bit_count:
1304 emit(CBIT(result_dst, op[0]));
1305 break;
1306 case ir_unop_find_msb: {
1307 src_reg temp = src_reg(this, glsl_type::uint_type);
1308
1309 inst = emit(FBH(dst_reg(temp), op[0]));
1310 inst->dst.writemask = WRITEMASK_XYZW;
1311
1312 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1313 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1314 * subtract the result from 31 to convert the MSB count into an LSB count.
1315 */
1316
1317 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1318 temp.swizzle = BRW_SWIZZLE_NOOP;
1319 emit(MOV(result_dst, temp));
1320
1321 src_reg src_tmp = src_reg(result_dst);
1322 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1323
1324 src_tmp.negate = true;
1325 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1326 inst->predicate = BRW_PREDICATE_NORMAL;
1327 break;
1328 }
1329 case ir_unop_find_lsb:
1330 emit(FBL(result_dst, op[0]));
1331 break;
1332
1333 case ir_unop_noise:
1334 assert(!"not reached: should be handled by lower_noise");
1335 break;
1336
1337 case ir_binop_add:
1338 emit(ADD(result_dst, op[0], op[1]));
1339 break;
1340 case ir_binop_sub:
1341 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1342 break;
1343
1344 case ir_binop_mul:
1345 if (ir->type->is_integer()) {
1346 /* For integer multiplication, the MUL uses the low 16 bits of one of
1347 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1348 * accumulates the contribution of the upper 16 bits of that
1349 * operand. If we can determine that one of the args is in the low
1350 * 16 bits, though, we can just emit a single MUL.
1351 */
1352 if (is_16bit_constant(ir->operands[0])) {
1353 if (brw->gen < 7)
1354 emit(MUL(result_dst, op[0], op[1]));
1355 else
1356 emit(MUL(result_dst, op[1], op[0]));
1357 } else if (is_16bit_constant(ir->operands[1])) {
1358 if (brw->gen < 7)
1359 emit(MUL(result_dst, op[1], op[0]));
1360 else
1361 emit(MUL(result_dst, op[0], op[1]));
1362 } else {
1363 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1364
1365 emit(MUL(acc, op[0], op[1]));
1366 emit(MACH(dst_null_d(), op[0], op[1]));
1367 emit(MOV(result_dst, src_reg(acc)));
1368 }
1369 } else {
1370 emit(MUL(result_dst, op[0], op[1]));
1371 }
1372 break;
1373 case ir_binop_imul_high: {
1374 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1375
1376 emit(MUL(acc, op[0], op[1]));
1377 emit(MACH(result_dst, op[0], op[1]));
1378 break;
1379 }
1380 case ir_binop_div:
1381 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1382 assert(ir->type->is_integer());
1383 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1384 break;
1385 case ir_binop_carry: {
1386 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1387
1388 emit(ADDC(dst_null_ud(), op[0], op[1]));
1389 emit(MOV(result_dst, src_reg(acc)));
1390 break;
1391 }
1392 case ir_binop_borrow: {
1393 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1394
1395 emit(SUBB(dst_null_ud(), op[0], op[1]));
1396 emit(MOV(result_dst, src_reg(acc)));
1397 break;
1398 }
1399 case ir_binop_mod:
1400 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1401 assert(ir->type->is_integer());
1402 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1403 break;
1404
1405 case ir_binop_less:
1406 case ir_binop_greater:
1407 case ir_binop_lequal:
1408 case ir_binop_gequal:
1409 case ir_binop_equal:
1410 case ir_binop_nequal: {
1411 emit(CMP(result_dst, op[0], op[1],
1412 brw_conditional_for_comparison(ir->operation)));
1413 emit(AND(result_dst, result_src, src_reg(0x1)));
1414 break;
1415 }
1416
1417 case ir_binop_all_equal:
1418 /* "==" operator producing a scalar boolean. */
1419 if (ir->operands[0]->type->is_vector() ||
1420 ir->operands[1]->type->is_vector()) {
1421 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1422 emit(MOV(result_dst, src_reg(0)));
1423 inst = emit(MOV(result_dst, src_reg(1)));
1424 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1425 } else {
1426 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1427 emit(AND(result_dst, result_src, src_reg(0x1)));
1428 }
1429 break;
1430 case ir_binop_any_nequal:
1431 /* "!=" operator producing a scalar boolean. */
1432 if (ir->operands[0]->type->is_vector() ||
1433 ir->operands[1]->type->is_vector()) {
1434 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1435
1436 emit(MOV(result_dst, src_reg(0)));
1437 inst = emit(MOV(result_dst, src_reg(1)));
1438 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1439 } else {
1440 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1441 emit(AND(result_dst, result_src, src_reg(0x1)));
1442 }
1443 break;
1444
1445 case ir_unop_any:
1446 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1447 emit(MOV(result_dst, src_reg(0)));
1448
1449 inst = emit(MOV(result_dst, src_reg(1)));
1450 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1451 break;
1452
1453 case ir_binop_logic_xor:
1454 emit(XOR(result_dst, op[0], op[1]));
1455 break;
1456
1457 case ir_binop_logic_or:
1458 emit(OR(result_dst, op[0], op[1]));
1459 break;
1460
1461 case ir_binop_logic_and:
1462 emit(AND(result_dst, op[0], op[1]));
1463 break;
1464
1465 case ir_binop_dot:
1466 assert(ir->operands[0]->type->is_vector());
1467 assert(ir->operands[0]->type == ir->operands[1]->type);
1468 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1469 break;
1470
1471 case ir_unop_sqrt:
1472 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1473 break;
1474 case ir_unop_rsq:
1475 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1476 break;
1477
1478 case ir_unop_bitcast_i2f:
1479 case ir_unop_bitcast_u2f:
1480 this->result = op[0];
1481 this->result.type = BRW_REGISTER_TYPE_F;
1482 break;
1483
1484 case ir_unop_bitcast_f2i:
1485 this->result = op[0];
1486 this->result.type = BRW_REGISTER_TYPE_D;
1487 break;
1488
1489 case ir_unop_bitcast_f2u:
1490 this->result = op[0];
1491 this->result.type = BRW_REGISTER_TYPE_UD;
1492 break;
1493
1494 case ir_unop_i2f:
1495 case ir_unop_i2u:
1496 case ir_unop_u2i:
1497 case ir_unop_u2f:
1498 case ir_unop_b2f:
1499 case ir_unop_b2i:
1500 case ir_unop_f2i:
1501 case ir_unop_f2u:
1502 emit(MOV(result_dst, op[0]));
1503 break;
1504 case ir_unop_f2b:
1505 case ir_unop_i2b: {
1506 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1507 emit(AND(result_dst, result_src, src_reg(1)));
1508 break;
1509 }
1510
1511 case ir_unop_trunc:
1512 emit(RNDZ(result_dst, op[0]));
1513 break;
1514 case ir_unop_ceil:
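/* Implement ceil(x) as -floor(-x): negate the operand, round down with
 * RNDD, then negate the result.
 */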
1515 op[0].negate = !op[0].negate;
1516 inst = emit(RNDD(result_dst, op[0]));
1517 this->result.negate = true;
1518 break;
1519 case ir_unop_floor:
1520 inst = emit(RNDD(result_dst, op[0]));
1521 break;
1522 case ir_unop_fract:
1523 inst = emit(FRC(result_dst, op[0]));
1524 break;
1525 case ir_unop_round_even:
1526 emit(RNDE(result_dst, op[0]));
1527 break;
1528
1529 case ir_binop_min:
1530 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1531 break;
1532 case ir_binop_max:
1533 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1534 break;
1535
1536 case ir_binop_pow:
1537 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1538 break;
1539
1540 case ir_unop_bit_not:
1541 inst = emit(NOT(result_dst, op[0]));
1542 break;
1543 case ir_binop_bit_and:
1544 inst = emit(AND(result_dst, op[0], op[1]));
1545 break;
1546 case ir_binop_bit_xor:
1547 inst = emit(XOR(result_dst, op[0], op[1]));
1548 break;
1549 case ir_binop_bit_or:
1550 inst = emit(OR(result_dst, op[0], op[1]));
1551 break;
1552
1553 case ir_binop_lshift:
1554 inst = emit(SHL(result_dst, op[0], op[1]));
1555 break;
1556
1557 case ir_binop_rshift:
1558 if (ir->type->base_type == GLSL_TYPE_INT)
1559 inst = emit(ASR(result_dst, op[0], op[1]));
1560 else
1561 inst = emit(SHR(result_dst, op[0], op[1]));
1562 break;
1563
1564 case ir_binop_bfm:
1565 emit(BFI1(result_dst, op[0], op[1]));
1566 break;
1567
1568 case ir_binop_ubo_load: {
1569 ir_constant *uniform_block = ir->operands[0]->as_constant();
1570 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1571 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1572 src_reg offset = op[1];
1573
1574 /* Now, load the vector from that offset. */
1575 assert(ir->type->is_vector() || ir->type->is_scalar());
1576
1577 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1578 packed_consts.type = result.type;
1579 src_reg surf_index =
1580 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1581 if (const_offset_ir) {
1582 offset = src_reg(const_offset / 16);
1583 } else {
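/* Convert the dynamic byte offset into units of 16-byte vec4s. */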
1584 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1585 }
1586
1587 vec4_instruction *pull =
1588 emit(new(mem_ctx) vec4_instruction(this,
1589 VS_OPCODE_PULL_CONSTANT_LOAD,
1590 dst_reg(packed_consts),
1591 surf_index,
1592 offset));
1593 pull->base_mrf = 14;
1594 pull->mlen = 1;
1595
1596 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1597 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1598 const_offset % 16 / 4,
1599 const_offset % 16 / 4,
1600 const_offset % 16 / 4);
1601
1602 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1603 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1604 emit(CMP(result_dst, packed_consts, src_reg(0u),
1605 BRW_CONDITIONAL_NZ));
1606 emit(AND(result_dst, result, src_reg(0x1)));
1607 } else {
1608 emit(MOV(result_dst, packed_consts));
1609 }
1610 break;
1611 }
1612
1613 case ir_binop_vector_extract:
1614 assert(!"should have been lowered by vec_index_to_cond_assign");
1615 break;
1616
1617 case ir_triop_fma:
1618 op[0] = fix_3src_operand(op[0]);
1619 op[1] = fix_3src_operand(op[1]);
1620 op[2] = fix_3src_operand(op[2]);
1621 /* Note that the instruction's argument order is reversed from GLSL
1622 * and the IR.
1623 */
1624 emit(MAD(result_dst, op[2], op[1], op[0]));
1625 break;
1626
1627 case ir_triop_lrp:
1628 op[0] = fix_3src_operand(op[0]);
1629 op[1] = fix_3src_operand(op[1]);
1630 op[2] = fix_3src_operand(op[2]);
1631 /* Note that the instruction's argument order is reversed from GLSL
1632 * and the IR.
1633 */
1634 emit(LRP(result_dst, op[2], op[1], op[0]));
1635 break;
1636
1637 case ir_triop_csel:
1638 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1639 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1640 inst->predicate = BRW_PREDICATE_NORMAL;
1641 break;
1642
1643 case ir_triop_bfi:
1644 op[0] = fix_3src_operand(op[0]);
1645 op[1] = fix_3src_operand(op[1]);
1646 op[2] = fix_3src_operand(op[2]);
1647 emit(BFI2(result_dst, op[0], op[1], op[2]));
1648 break;
1649
1650 case ir_triop_bitfield_extract:
1651 op[0] = fix_3src_operand(op[0]);
1652 op[1] = fix_3src_operand(op[1]);
1653 op[2] = fix_3src_operand(op[2]);
1654 /* Note that the instruction's argument order is reversed from GLSL
1655 * and the IR.
1656 */
1657 emit(BFE(result_dst, op[2], op[1], op[0]));
1658 break;
1659
1660 case ir_triop_vector_insert:
1661 assert(!"should have been lowered by lower_vector_insert");
1662 break;
1663
1664 case ir_quadop_bitfield_insert:
1665 assert(!"not reached: should be handled by "
1666 "bitfield_insert_to_bfm_bfi\n");
1667 break;
1668
1669 case ir_quadop_vector:
1670 assert(!"not reached: should be handled by lower_quadop_vector");
1671 break;
1672
1673 case ir_unop_pack_half_2x16:
1674 emit_pack_half_2x16(result_dst, op[0]);
1675 break;
1676 case ir_unop_unpack_half_2x16:
1677 emit_unpack_half_2x16(result_dst, op[0]);
1678 break;
1679 case ir_unop_pack_snorm_2x16:
1680 case ir_unop_pack_snorm_4x8:
1681 case ir_unop_pack_unorm_2x16:
1682 case ir_unop_pack_unorm_4x8:
1683 case ir_unop_unpack_snorm_2x16:
1684 case ir_unop_unpack_snorm_4x8:
1685 case ir_unop_unpack_unorm_2x16:
1686 case ir_unop_unpack_unorm_4x8:
1687 assert(!"not reached: should be handled by lower_packing_builtins");
1688 break;
1689 case ir_unop_unpack_half_2x16_split_x:
1690 case ir_unop_unpack_half_2x16_split_y:
1691 case ir_binop_pack_half_2x16_split:
1692 assert(!"not reached: should not occur in vertex shader");
1693 break;
1694 case ir_binop_ldexp:
1695 assert(!"not reached: should be handled by ldexp_to_arith()");
1696 break;
1697 }
1698 }
1699
1700
1701 void
1702 vec4_visitor::visit(ir_swizzle *ir)
1703 {
1704 src_reg src;
1705 int i = 0;
1706 int swizzle[4];
1707
1708 /* Note that this is only swizzles in expressions, not those on the left
1709 * hand side of an assignment, which do write masking. See ir_assignment
1710 * for that.
1711 */
1712
1713 ir->val->accept(this);
1714 src = this->result;
1715 assert(src.file != BAD_FILE);
1716
1717 for (i = 0; i < ir->type->vector_elements; i++) {
1718 switch (i) {
1719 case 0:
1720 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1721 break;
1722 case 1:
1723 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1724 break;
1725 case 2:
1726 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1727 break;
1728 case 3:
1729 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1730 break;
1731 }
1732 }
1733 for (; i < 4; i++) {
1734 /* Replicate the last channel out. */
1735 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1736 }
1737
1738 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1739
1740 this->result = src;
1741 }
1742
1743 void
1744 vec4_visitor::visit(ir_dereference_variable *ir)
1745 {
1746 const struct glsl_type *type = ir->type;
1747 dst_reg *reg = variable_storage(ir->var);
1748
1749 if (!reg) {
1750 fail("Failed to find variable storage for %s\n", ir->var->name);
1751 this->result = src_reg(brw_null_reg());
1752 return;
1753 }
1754
1755 this->result = src_reg(*reg);
1756
1757 /* System values get their swizzle from the dst_reg writemask */
1758 if (ir->var->mode == ir_var_system_value)
1759 return;
1760
1761 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1762 this->result.swizzle = swizzle_for_size(type->vector_elements);
1763 }
1764
1765
1766 int
1767 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1768 {
1769 /* Under normal circumstances array elements are stored consecutively, so
1770 * the stride is equal to the size of the array element.
1771 */
1772 return type_size(ir->type);
1773 }
1774
1775
1776 void
1777 vec4_visitor::visit(ir_dereference_array *ir)
1778 {
1779 ir_constant *constant_index;
1780 src_reg src;
1781 int array_stride = compute_array_stride(ir);
1782
1783 constant_index = ir->array_index->constant_expression_value();
1784
1785 ir->array->accept(this);
1786 src = this->result;
1787
1788 if (constant_index) {
1789 src.reg_offset += constant_index->value.i[0] * array_stride;
1790 } else {
1791 /* Variable index array dereference. It eats the "vec4" of the
1792 * base of the array and an index that offsets the Mesa register
1793 * index.
1794 */
1795 ir->array_index->accept(this);
1796
1797 src_reg index_reg;
1798
1799 if (array_stride == 1) {
1800 index_reg = this->result;
1801 } else {
1802 index_reg = src_reg(this, glsl_type::int_type);
1803
1804 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1805 }
1806
1807 if (src.reladdr) {
1808 src_reg temp = src_reg(this, glsl_type::int_type);
1809
1810 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1811
1812 index_reg = temp;
1813 }
1814
1815 src.reladdr = ralloc(mem_ctx, src_reg);
1816 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1817 }
1818
1819 /* If the type is smaller than a vec4, replicate the last channel out. */
1820 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1821 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1822 else
1823 src.swizzle = BRW_SWIZZLE_NOOP;
1824 src.type = brw_type_for_base_type(ir->type);
1825
1826 this->result = src;
1827 }
1828
1829 void
1830 vec4_visitor::visit(ir_dereference_record *ir)
1831 {
1832 unsigned int i;
1833 const glsl_type *struct_type = ir->record->type;
1834 int offset = 0;
1835
1836 ir->record->accept(this);
1837
1838 for (i = 0; i < struct_type->length; i++) {
1839 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1840 break;
1841 offset += type_size(struct_type->fields.structure[i].type);
1842 }
1843
1844 /* If the type is smaller than a vec4, replicate the last channel out. */
1845 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1846 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1847 else
1848 this->result.swizzle = BRW_SWIZZLE_NOOP;
1849 this->result.type = brw_type_for_base_type(ir->type);
1850
1851 this->result.reg_offset += offset;
1852 }
1853
1854 /**
1855 * We want to be careful in assignment setup to hit the actual storage
1856 * instead of potentially using a temporary like we might with the
1857 * ir_dereference handler.
1858 */
1859 static dst_reg
1860 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1861 {
1862 /* The LHS must be a dereference. If the LHS is a variable indexed array
1863 * access of a vector, it must be separated into a series of conditional moves
1864 * before reaching this point (see ir_vec_index_to_cond_assign).
1865 */
1866 assert(ir->as_dereference());
1867 ir_dereference_array *deref_array = ir->as_dereference_array();
1868 if (deref_array) {
1869 assert(!deref_array->array->type->is_vector());
1870 }
1871
1872 /* Use the rvalue deref handler for the most part. We'll ignore
1873 * swizzles in it and write swizzles using writemask, though.
1874 */
1875 ir->accept(v);
1876 return dst_reg(v->result);
1877 }
1878
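/* Copy a whole aggregate value (struct, array, or matrix) by recursing down
 * to its scalar/vector pieces and emitting one predicated MOV per vec4,
 * advancing dst and src as it goes.
 */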
1879 void
1880 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1881 const struct glsl_type *type, uint32_t predicate)
1882 {
1883 if (type->base_type == GLSL_TYPE_STRUCT) {
1884 for (unsigned int i = 0; i < type->length; i++) {
1885 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1886 }
1887 return;
1888 }
1889
1890 if (type->is_array()) {
1891 for (unsigned int i = 0; i < type->length; i++) {
1892 emit_block_move(dst, src, type->fields.array, predicate);
1893 }
1894 return;
1895 }
1896
1897 if (type->is_matrix()) {
1898 const struct glsl_type *vec_type;
1899
1900 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1901 type->vector_elements, 1);
1902
1903 for (int i = 0; i < type->matrix_columns; i++) {
1904 emit_block_move(dst, src, vec_type, predicate);
1905 }
1906 return;
1907 }
1908
1909 assert(type->is_scalar() || type->is_vector());
1910
1911 dst->type = brw_type_for_base_type(type);
1912 src->type = dst->type;
1913
1914 dst->writemask = (1 << type->vector_elements) - 1;
1915
1916 src->swizzle = swizzle_for_size(type->vector_elements);
1917
1918 vec4_instruction *inst = emit(MOV(*dst, *src));
1919 inst->predicate = predicate;
1920
1921 dst->reg_offset++;
1922 src->reg_offset++;
1923 }
1924
1925
1926 /* If the RHS processing resulted in an instruction generating a
1927 * temporary value, and it would be easy to rewrite the instruction to
1928 * generate its result right into the LHS instead, do so. This ends
1929 * up reliably removing instructions where it can be tricky to do so
1930 * later without real UD chain information.
1931 */
1932 bool
1933 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1934 dst_reg dst,
1935 src_reg src,
1936 vec4_instruction *pre_rhs_inst,
1937 vec4_instruction *last_rhs_inst)
1938 {
1939 /* This could be supported, but it would take more smarts. */
1940 if (ir->condition)
1941 return false;
1942
1943 if (pre_rhs_inst == last_rhs_inst)
1944 return false; /* No instructions generated to work with. */
1945
1946 /* Make sure the last instruction generated our source reg. */
1947 if (src.file != GRF ||
1948 src.file != last_rhs_inst->dst.file ||
1949 src.reg != last_rhs_inst->dst.reg ||
1950 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1951 src.reladdr ||
1952 src.abs ||
1953 src.negate ||
1954 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1955 return false;
1956
1957 * Check that the last instruction fully initialized the channels
1958 * we want to use, in the order we want to use them. We could
1959 * potentially reswizzle the operands of many instructions so that
1960 * we could handle out of order channels, but don't yet.
1961 */
1962
1963 for (unsigned i = 0; i < 4; i++) {
1964 if (dst.writemask & (1 << i)) {
1965 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1966 return false;
1967
1968 if (BRW_GET_SWZ(src.swizzle, i) != i)
1969 return false;
1970 }
1971 }
1972
1973 /* Success! Rewrite the instruction. */
1974 last_rhs_inst->dst.file = dst.file;
1975 last_rhs_inst->dst.reg = dst.reg;
1976 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1977 last_rhs_inst->dst.reladdr = dst.reladdr;
1978 last_rhs_inst->dst.writemask &= dst.writemask;
1979
1980 return true;
1981 }
1982
1983 void
1984 vec4_visitor::visit(ir_assignment *ir)
1985 {
1986 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1987 uint32_t predicate = BRW_PREDICATE_NONE;
1988
1989 if (!ir->lhs->type->is_scalar() &&
1990 !ir->lhs->type->is_vector()) {
1991 ir->rhs->accept(this);
1992 src_reg src = this->result;
1993
1994 if (ir->condition) {
1995 emit_bool_to_cond_code(ir->condition, &predicate);
1996 }
1997
1998 /* emit_block_move doesn't account for swizzles in the source register.
1999 * This should be ok, since the source register is a structure or an
2000 * array, and those can't be swizzled. But double-check to be sure.
2001 */
2002 assert(src.swizzle ==
2003 (ir->rhs->type->is_matrix()
2004 ? swizzle_for_size(ir->rhs->type->vector_elements)
2005 : BRW_SWIZZLE_NOOP));
2006
2007 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2008 return;
2009 }
2010
2011 /* Now we're down to just a scalar/vector with writemasks. */
2012 int i;
2013
2014 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2015 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2016
2017 ir->rhs->accept(this);
2018
2019 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2020
2021 src_reg src = this->result;
2022
2023 int swizzles[4];
2024 int first_enabled_chan = 0;
2025 int src_chan = 0;
2026
2027 assert(ir->lhs->type->is_vector() ||
2028 ir->lhs->type->is_scalar());
2029 dst.writemask = ir->write_mask;
2030
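/* Note the source channel feeding the first enabled destination channel;
 * the unwritten channels below reuse it so every component of the final
 * swizzle refers to a valid RHS channel.
 */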
2031 for (int i = 0; i < 4; i++) {
2032 if (dst.writemask & (1 << i)) {
2033 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2034 break;
2035 }
2036 }
2037
2038 /* Swizzle a small RHS vector into the channels being written.
2039 *
2040 * GLSL IR treats write_mask as dictating how many channels are
2041 * present on the RHS, while in our instructions we need to make
2042 * those channels appear in the slots of the vec4 they're written to.
2043 */
2044 for (int i = 0; i < 4; i++) {
2045 if (dst.writemask & (1 << i))
2046 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2047 else
2048 swizzles[i] = first_enabled_chan;
2049 }
2050 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2051 swizzles[2], swizzles[3]);
2052
2053 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2054 return;
2055 }
2056
2057 if (ir->condition) {
2058 emit_bool_to_cond_code(ir->condition, &predicate);
2059 }
2060
2061 for (i = 0; i < type_size(ir->lhs->type); i++) {
2062 vec4_instruction *inst = emit(MOV(dst, src));
2063 inst->predicate = predicate;
2064
2065 dst.reg_offset++;
2066 src.reg_offset++;
2067 }
2068 }
2069
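/* Emits immediate MOVs for a constant value, recursing through aggregate
 * types. Within a vector, components that share a value are written by a
 * single MOV with a combined writemask, so vec4(0.5, 1.5, 1.5, 1.5) takes
 * two MOVs rather than four.
 */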
2070 void
2071 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2072 {
2073 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2074 foreach_list(node, &ir->components) {
2075 ir_constant *field_value = (ir_constant *)node;
2076
2077 emit_constant_values(dst, field_value);
2078 }
2079 return;
2080 }
2081
2082 if (ir->type->is_array()) {
2083 for (unsigned int i = 0; i < ir->type->length; i++) {
2084 emit_constant_values(dst, ir->array_elements[i]);
2085 }
2086 return;
2087 }
2088
2089 if (ir->type->is_matrix()) {
2090 for (int i = 0; i < ir->type->matrix_columns; i++) {
2091 float *vec = &ir->value.f[i * ir->type->vector_elements];
2092
2093 for (int j = 0; j < ir->type->vector_elements; j++) {
2094 dst->writemask = 1 << j;
2095 dst->type = BRW_REGISTER_TYPE_F;
2096
2097 emit(MOV(*dst, src_reg(vec[j])));
2098 }
2099 dst->reg_offset++;
2100 }
2101 return;
2102 }
2103
2104 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2105
2106 for (int i = 0; i < ir->type->vector_elements; i++) {
2107 if (!(remaining_writemask & (1 << i)))
2108 continue;
2109
2110 dst->writemask = 1 << i;
2111 dst->type = brw_type_for_base_type(ir->type);
2112
2113 /* Find other components that match the one we're about to
2114 * write. Emits fewer instructions for things like vec4(0.5,
2115 * 1.5, 1.5, 1.5).
2116 */
2117 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2118 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2119 if (ir->value.b[i] == ir->value.b[j])
2120 dst->writemask |= (1 << j);
2121 } else {
2122 /* u, i, and f storage all line up, so no need for a
2123 * switch case for comparing each type.
2124 */
2125 if (ir->value.u[i] == ir->value.u[j])
2126 dst->writemask |= (1 << j);
2127 }
2128 }
2129
2130 switch (ir->type->base_type) {
2131 case GLSL_TYPE_FLOAT:
2132 emit(MOV(*dst, src_reg(ir->value.f[i])));
2133 break;
2134 case GLSL_TYPE_INT:
2135 emit(MOV(*dst, src_reg(ir->value.i[i])));
2136 break;
2137 case GLSL_TYPE_UINT:
2138 emit(MOV(*dst, src_reg(ir->value.u[i])));
2139 break;
2140 case GLSL_TYPE_BOOL:
2141 emit(MOV(*dst, src_reg(ir->value.b[i])));
2142 break;
2143 default:
2144 assert(!"Non-float/uint/int/bool constant");
2145 break;
2146 }
2147
2148 remaining_writemask &= ~dst->writemask;
2149 }
2150 dst->reg_offset++;
2151 }
2152
2153 void
2154 vec4_visitor::visit(ir_constant *ir)
2155 {
2156 dst_reg dst = dst_reg(this, ir->type);
2157 this->result = src_reg(dst);
2158
2159 emit_constant_values(&dst, ir);
2160 }
2161
2162 void
2163 vec4_visitor::visit(ir_call *ir)
2164 {
2165 assert(!"not reached");
2166 }
2167
2168 void
2169 vec4_visitor::visit(ir_texture *ir)
2170 {
2171 int sampler =
2172 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2173
2174 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2175 * emitting anything other than setting up the constant result.
2176 */
2177 if (ir->op == ir_tg4) {
2178 ir_constant *chan = ir->lod_info.component->as_constant();
2179 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2180 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2181 dst_reg result(this, ir->type);
2182 this->result = src_reg(result);
2183 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2184 return;
2185 }
2186 }
2187
2188 /* Should be lowered by do_lower_texture_projection */
2189 assert(!ir->projector);
2190
2191 /* Should be lowered */
2192 assert(!ir->offset || !ir->offset->type->is_array());
2193
2194 /* Generate code to compute all the subexpression trees. This has to be
2195 * done before loading any values into MRFs for the sampler message since
2196 * generating these values may involve SEND messages that need the MRFs.
2197 */
2198 src_reg coordinate;
2199 if (ir->coordinate) {
2200 ir->coordinate->accept(this);
2201 coordinate = this->result;
2202 }
2203
2204 src_reg shadow_comparitor;
2205 if (ir->shadow_comparitor) {
2206 ir->shadow_comparitor->accept(this);
2207 shadow_comparitor = this->result;
2208 }
2209
2210 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2211 src_reg offset_value;
2212 if (has_nonconstant_offset) {
2213 ir->offset->accept(this);
2214 offset_value = src_reg(this->result);
2215 }
2216
2217 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2218 src_reg lod, dPdx, dPdy, sample_index;
2219 switch (ir->op) {
2220 case ir_tex:
2221 lod = src_reg(0.0f);
2222 lod_type = glsl_type::float_type;
2223 break;
2224 case ir_txf:
2225 case ir_txl:
2226 case ir_txs:
2227 ir->lod_info.lod->accept(this);
2228 lod = this->result;
2229 lod_type = ir->lod_info.lod->type;
2230 break;
2231 case ir_query_levels:
2232 lod = src_reg(0);
2233 lod_type = glsl_type::int_type;
2234 break;
2235 case ir_txf_ms:
2236 ir->lod_info.sample_index->accept(this);
2237 sample_index = this->result;
2238 sample_index_type = ir->lod_info.sample_index->type;
2239 break;
2240 case ir_txd:
2241 ir->lod_info.grad.dPdx->accept(this);
2242 dPdx = this->result;
2243
2244 ir->lod_info.grad.dPdy->accept(this);
2245 dPdy = this->result;
2246
2247 lod_type = ir->lod_info.grad.dPdx->type;
2248 break;
2249 case ir_txb:
2250 case ir_lod:
2251 case ir_tg4:
2252 break;
2253 }
2254
2255 vec4_instruction *inst = NULL;
2256 switch (ir->op) {
2257 case ir_tex:
2258 case ir_txl:
2259 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2260 break;
2261 case ir_txd:
2262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2263 break;
2264 case ir_txf:
2265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2266 break;
2267 case ir_txf_ms:
2268 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2269 break;
2270 case ir_txs:
2271 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2272 break;
2273 case ir_tg4:
2274 if (has_nonconstant_offset)
2275 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2276 else
2277 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2278 break;
2279 case ir_query_levels:
2280 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2281 break;
2282 case ir_txb:
2283 assert(!"TXB is not valid for vertex shaders.");
2284 break;
2285 case ir_lod:
2286 assert(!"LOD is not valid for vertex shaders.");
2287 break;
2288 default:
2289 assert(!"Unrecognized tex op");
2290 }
2291
2292 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2293
2294 /* Texel offsets and tg4 channel selects go in the message header; Gen4 also requires headers. */
2295 inst->header_present = use_texture_offset || brw->gen < 5 || ir->op == ir_tg4;
2296 inst->base_mrf = 2;
2297 inst->mlen = inst->header_present + 1; /* always at least one */
2298 inst->sampler = sampler;
2299 inst->dst = dst_reg(this, ir->type);
2300 inst->dst.writemask = WRITEMASK_XYZW;
2301 inst->shadow_compare = ir->shadow_comparitor != NULL;
2302
2303 if (use_texture_offset)
2304 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2305
2306 /* Stuff the channel select bits in the top of the texture offset */
2307 if (ir->op == ir_tg4)
2308 inst->texture_offset |= gather_channel(ir, sampler)<<16;
2309
2310 /* MRF for the first parameter */
2311 int param_base = inst->base_mrf + inst->header_present;
2312
2313 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2314 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2315 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2316 } else {
2317 /* Load the coordinate */
2318 /* FINISHME: gl_clamp_mask and saturate */
2319 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2320 int zero_mask = 0xf & ~coord_mask;
2321
2322 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2323 coordinate));
2324
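/* Zero the coordinate channels that the coordinate type doesn't supply. */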
2325 if (zero_mask != 0) {
2326 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2327 src_reg(0)));
2328 }
2329 /* Load the shadow comparitor */
2330 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2331 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2332 WRITEMASK_X),
2333 shadow_comparitor));
2334 inst->mlen++;
2335 }
2336
2337 /* Load the LOD info */
2338 if (ir->op == ir_tex || ir->op == ir_txl) {
2339 int mrf, writemask;
2340 if (brw->gen >= 5) {
2341 mrf = param_base + 1;
2342 if (ir->shadow_comparitor) {
2343 writemask = WRITEMASK_Y;
2344 /* mlen already incremented */
2345 } else {
2346 writemask = WRITEMASK_X;
2347 inst->mlen++;
2348 }
2349 } else /* brw->gen == 4 */ {
2350 mrf = param_base;
2351 writemask = WRITEMASK_W;
2352 }
2353 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2354 } else if (ir->op == ir_txf) {
2355 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2356 } else if (ir->op == ir_txf_ms) {
2357 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2358 sample_index));
2359 inst->mlen++;
2360
2361 /* On Gen7, there is an additional MCS parameter here after SI,
2362 * but we don't bother to emit it since it's always zero. If
2363 * we start supporting texturing from CMS surfaces, this will have
2364 * to change.
2365 */
2366 } else if (ir->op == ir_txd) {
2367 const glsl_type *type = lod_type;
2368
2369 if (brw->gen >= 5) {
2370 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2371 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2372 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2373 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2374 inst->mlen++;
2375
2376 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2377 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2378 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2379 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2380 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2381 inst->mlen++;
2382
2383 if (ir->shadow_comparitor) {
2384 emit(MOV(dst_reg(MRF, param_base + 2,
2385 ir->shadow_comparitor->type, WRITEMASK_Z),
2386 shadow_comparitor));
2387 }
2388 }
2389 } else /* brw->gen == 4 */ {
2390 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2391 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2392 inst->mlen += 2;
2393 }
2394 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2395 if (ir->shadow_comparitor) {
2396 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2397 shadow_comparitor));
2398 }
2399
2400 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2401 offset_value));
2402 inst->mlen++;
2403 }
2404 }
2405
2406 emit(inst);
2407
2408 /* Fix up the number of layers (Z) for cube arrays: the hardware returns
2409 * faces * layers, but the spec requires just the layer count.
2410 */
2411 if (ir->op == ir_txs) {
2412 glsl_type const *type = ir->sampler->type;
2413 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2414 type->sampler_array) {
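/* The divisor of 6 is the number of faces in a cube, turning the
 * faces * layers count from the hardware back into a layer count.
 */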
2415 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2416 with_writemask(inst->dst, WRITEMASK_Z),
2417 src_reg(inst->dst), src_reg(6));
2418 }
2419 }
2420
2421 swizzle_result(ir, src_reg(inst->dst), sampler);
2422 }
2423
2424 /**
2425 * Set up the gather channel based on the swizzle, for gather4.
2426 */
2427 uint32_t
2428 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2429 {
2430 ir_constant *chan = ir->lod_info.component->as_constant();
2431 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2432 switch (swiz) {
2433 case SWIZZLE_X: return 0;
2434 case SWIZZLE_Y:
2435 /* gather4 sampler is broken for green channel on RG32F --
2436 * we must ask for blue instead.
2437 */
2438 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2439 return 2;
2440 return 1;
2441 case SWIZZLE_Z: return 2;
2442 case SWIZZLE_W: return 3;
2443 default:
2444 assert(!"Not reached"); /* zero, one swizzles handled already */
2445 return 0;
2446 }
2447 }
2448
2449 void
2450 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2451 {
2452 int s = key->tex.swizzles[sampler];
2453
2454 this->result = src_reg(this, ir->type);
2455 dst_reg swizzled_result(this->result);
2456
2457 if (ir->op == ir_query_levels) {
2458 /* # levels is in .w */
2459 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2460 emit(MOV(swizzled_result, orig_val));
2461 return;
2462 }
2463
2464 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2465 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2466 emit(MOV(swizzled_result, orig_val));
2467 return;
2468 }
2469
2470
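/* Split the swizzle into channels copied from the sampler result, channels
 * forced to zero and channels forced to one, then emit at most one MOV per
 * group. For example, a swizzle of (R, G, ZERO, ONE) gives copy_mask 0x3,
 * zero_mask 0x4 and one_mask 0x8.
 */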
2471 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2472 int swizzle[4] = {0};
2473
2474 for (int i = 0; i < 4; i++) {
2475 switch (GET_SWZ(s, i)) {
2476 case SWIZZLE_ZERO:
2477 zero_mask |= (1 << i);
2478 break;
2479 case SWIZZLE_ONE:
2480 one_mask |= (1 << i);
2481 break;
2482 default:
2483 copy_mask |= (1 << i);
2484 swizzle[i] = GET_SWZ(s, i);
2485 break;
2486 }
2487 }
2488
2489 if (copy_mask) {
2490 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2491 swizzled_result.writemask = copy_mask;
2492 emit(MOV(swizzled_result, orig_val));
2493 }
2494
2495 if (zero_mask) {
2496 swizzled_result.writemask = zero_mask;
2497 emit(MOV(swizzled_result, src_reg(0.0f)));
2498 }
2499
2500 if (one_mask) {
2501 swizzled_result.writemask = one_mask;
2502 emit(MOV(swizzled_result, src_reg(1.0f)));
2503 }
2504 }
2505
2506 void
2507 vec4_visitor::visit(ir_return *ir)
2508 {
2509 assert(!"not reached");
2510 }
2511
2512 void
2513 vec4_visitor::visit(ir_discard *ir)
2514 {
2515 assert(!"not reached");
2516 }
2517
2518 void
2519 vec4_visitor::visit(ir_if *ir)
2520 {
2521 /* Don't point the annotation at the if statement, because then it plus
2522 * the then and else blocks get printed.
2523 */
2524 this->base_ir = ir->condition;
2525
2526 if (brw->gen == 6) {
2527 emit_if_gen6(ir);
2528 } else {
2529 uint32_t predicate;
2530 emit_bool_to_cond_code(ir->condition, &predicate);
2531 emit(IF(predicate));
2532 }
2533
2534 visit_instructions(&ir->then_instructions);
2535
2536 if (!ir->else_instructions.is_empty()) {
2537 this->base_ir = ir->condition;
2538 emit(BRW_OPCODE_ELSE);
2539
2540 visit_instructions(&ir->else_instructions);
2541 }
2542
2543 this->base_ir = ir->condition;
2544 emit(BRW_OPCODE_ENDIF);
2545 }
2546
2547 void
2548 vec4_visitor::visit(ir_emit_vertex *)
2549 {
2550 assert(!"not reached");
2551 }
2552
2553 void
2554 vec4_visitor::visit(ir_end_primitive *)
2555 {
2556 assert(!"not reached");
2557 }
2558
2559 void
2560 vec4_visitor::emit_ndc_computation()
2561 {
2562 /* Get the position */
2563 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2564
2565 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2566 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2567 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2568
2569 current_annotation = "NDC";
2570 dst_reg ndc_w = ndc;
2571 ndc_w.writemask = WRITEMASK_W;
2572 src_reg pos_w = pos;
2573 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2574 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2575
2576 dst_reg ndc_xyz = ndc;
2577 ndc_xyz.writemask = WRITEMASK_XYZ;
2578
2579 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2580 }
2581
2582 void
2583 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2584 {
2585 if (brw->gen < 6 &&
2586 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2587 key->userclip_active || brw->has_negative_rhw_bug)) {
2588 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2589 dst_reg header1_w = header1;
2590 header1_w.writemask = WRITEMASK_W;
2591
2592 emit(MOV(header1, 0u));
2593
2594 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2595 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2596
2597 current_annotation = "Point size";
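/* Scale by 2048 and keep bits 8..18, which stores the point size times 8
 * in an 11-bit field starting at bit 8 of header1.w.
 */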
2598 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2599 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2600 }
2601
2602 if (key->userclip_active) {
2603 current_annotation = "Clipping flags";
2604 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2605 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2606
2607 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2608 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2609 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2610
2611 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2612 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2613 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2614 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2615 }
2616
2617 /* i965 clipping workaround:
2618 * 1) Test for negative RHW
2619 * 2) If set,
2620 * set ndc = (0,0,0,0)
2621 * set ucp[6] = 1
2622 *
2623 * Later, clipping will detect ucp[6] and ensure the primitive is
2624 * clipped against all fixed planes.
2625 */
2626 if (brw->has_negative_rhw_bug) {
2627 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2628 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2629 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2630 vec4_instruction *inst;
2631 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2632 inst->predicate = BRW_PREDICATE_NORMAL;
2633 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2634 inst->predicate = BRW_PREDICATE_NORMAL;
2635 }
2636
2637 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2638 } else if (brw->gen < 6) {
2639 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2640 } else {
2641 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2642 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2643 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2644 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2645 }
2646 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2647 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2648 src_reg(output_reg[VARYING_SLOT_LAYER])));
2649 }
2650 }
2651 }
2652
2653 void
2654 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2655 {
2656 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2657 *
2658 * "If a linked set of shaders forming the vertex stage contains no
2659 * static write to gl_ClipVertex or gl_ClipDistance, but the
2660 * application has requested clipping against user clip planes through
2661 * the API, then the coordinate written to gl_Position is used for
2662 * comparison against the user clip planes."
2663 *
2664 * This function is only called if the shader didn't write to
2665 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2666 * if the user wrote to it; otherwise we use gl_Position.
2667 */
2668 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2669 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2670 clip_vertex = VARYING_SLOT_POS;
2671 }
2672
2673 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
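/* Each call fills one vec4 of clip distances: the DP4 of the chosen clip
 * vertex against up to four user clip planes, starting at plane 'offset'.
 */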
2674 ++i) {
2675 reg.writemask = 1 << i;
2676 emit(DP4(reg,
2677 src_reg(output_reg[clip_vertex]),
2678 src_reg(this->userplane[i + offset])));
2679 }
2680 }
2681
2682 void
2683 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2684 {
2685 assert(varying < VARYING_SLOT_MAX);
2686 reg.type = output_reg[varying].type;
2687 current_annotation = output_reg_annotation[varying];
2688 /* Copy the register, saturating if necessary */
2689 vec4_instruction *inst = emit(MOV(reg,
2690 src_reg(output_reg[varying])));
2691 if ((varying == VARYING_SLOT_COL0 ||
2692 varying == VARYING_SLOT_COL1 ||
2693 varying == VARYING_SLOT_BFC0 ||
2694 varying == VARYING_SLOT_BFC1) &&
2695 key->clamp_vertex_color) {
2696 inst->saturate = true;
2697 }
2698 }
2699
2700 void
2701 vec4_visitor::emit_urb_slot(int mrf, int varying)
2702 {
2703 struct brw_reg hw_reg = brw_message_reg(mrf);
2704 dst_reg reg = dst_reg(MRF, mrf);
2705 reg.type = BRW_REGISTER_TYPE_F;
2706
2707 switch (varying) {
2708 case VARYING_SLOT_PSIZ:
2709 /* PSIZ is always in slot 0, and is coupled with other flags. */
2710 current_annotation = "indices, point width, clip flags";
2711 emit_psiz_and_flags(hw_reg);
2712 break;
2713 case BRW_VARYING_SLOT_NDC:
2714 current_annotation = "NDC";
2715 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2716 break;
2717 case VARYING_SLOT_POS:
2718 current_annotation = "gl_Position";
2719 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2720 break;
2721 case VARYING_SLOT_EDGE:
2722 /* This is present when doing unfilled polygons. We're supposed to copy
2723 * the edge flag from the user-provided vertex array
2724 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2725 * of that attribute (starts as 1.0f). This is then used in clipping to
2726 * determine which edges should be drawn as wireframe.
2727 */
2728 current_annotation = "edge flag";
2729 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2730 glsl_type::float_type, WRITEMASK_XYZW))));
2731 break;
2732 case BRW_VARYING_SLOT_PAD:
2733 /* No need to write to this slot */
2734 break;
2735 default:
2736 emit_generic_urb_slot(reg, varying);
2737 break;
2738 }
2739 }
2740
2741 static int
2742 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2743 {
2744 if (brw->gen >= 6) {
2745 /* URB data written (does not include the message header reg) must
2746 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2747 * section 5.4.3.2.2: URB_INTERLEAVED.
2748 *
2749 * URB entries are allocated on a multiple of 1024 bits, so an
2750 * extra 128 bits written here to make the end align to 256 is
2751 * no problem.
2752 */
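/* mlen as passed in still counts the message header register, so the
 * payload is mlen - 1; rounding mlen up to an odd value keeps the payload
 * an even number of registers (e.g. header + 1 slot = mlen 2, padded to 3).
 */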
2753 if ((mlen % 2) != 1)
2754 mlen++;
2755 }
2756
2757 return mlen;
2758 }
2759
2760
2761 /**
2762 * Generates the VUE payload plus the necessary URB write instructions to
2763 * output it.
2764 *
2765 * The VUE layout is documented in Volume 2a.
2766 */
2767 void
2768 vec4_visitor::emit_vertex()
2769 {
2770 /* MRF 0 is reserved for the debugger, so start with message header
2771 * in MRF 1.
2772 */
2773 int base_mrf = 1;
2774 int mrf = base_mrf;
2775 /* In the process of generating our URB write message contents, we
2776 * may need to unspill a register or load from an array. Those
2777 * reads would use MRFs 14-15.
2778 */
2779 int max_usable_mrf = 13;
2780
2781 /* The following assertion verifies that max_usable_mrf causes an
2782 * even-numbered amount of URB write data, which will meet gen6's
2783 * requirements for length alignment.
2784 */
2785 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2786
2787 /* First mrf is the g0-based message header containing URB handles and
2788 * such.
2789 */
2790 emit_urb_write_header(mrf++);
2791
2792 if (brw->gen < 6) {
2793 emit_ndc_computation();
2794 }
2795
2796 /* Lower legacy ff and ClipVertex clipping to clip distances */
2797 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2798 current_annotation = "user clip distances";
2799
2800 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2801 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2802
2803 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2804 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2805 }
2806
2807 /* We may need to split this up into several URB writes, so do them in a
2808 * loop.
2809 */
2810 int slot = 0;
2811 bool complete = false;
2812 do {
2813 /* URB offset is in URB row increments, and each of our MRFs is half of
2814 * one of those, since we're doing interleaved writes.
2815 */
2816 int offset = slot / 2;
2817
2818 mrf = base_mrf + 1;
2819 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2820 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2821
2822 /* If this was max_usable_mrf, we can't fit anything more into this
2823 * URB WRITE.
2824 */
2825 if (mrf > max_usable_mrf) {
2826 slot++;
2827 break;
2828 }
2829 }
2830
2831 complete = slot >= prog_data->vue_map.num_slots;
2832 current_annotation = "URB write";
2833 vec4_instruction *inst = emit_urb_write_opcode(complete);
2834 inst->base_mrf = base_mrf;
2835 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2836 inst->offset += offset;
2837 } while (!complete);
2838 }
2839
2840
2841 src_reg
2842 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2843 src_reg *reladdr, int reg_offset)
2844 {
2845 /* Because we store the values to scratch interleaved like our
2846 * vertex data, we need to scale the vec4 index by 2.
2847 */
2848 int message_header_scale = 2;
2849
2850 /* Pre-gen6, the message header uses byte offsets instead of vec4
2851 * (16-byte) offset units.
2852 */
2853 if (brw->gen < 6)
2854 message_header_scale *= 16;
2855
2856 if (reladdr) {
2857 src_reg index = src_reg(this, glsl_type::int_type);
2858
2859 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2860 emit_before(inst, MUL(dst_reg(index),
2861 index, src_reg(message_header_scale)));
2862
2863 return index;
2864 } else {
2865 return src_reg(reg_offset * message_header_scale);
2866 }
2867 }
2868
2869 src_reg
2870 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2871 src_reg *reladdr, int reg_offset)
2872 {
2873 if (reladdr) {
2874 src_reg index = src_reg(this, glsl_type::int_type);
2875
2876 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2877
2878 /* Pre-gen6, the message header uses byte offsets instead of vec4
2879 * (16-byte) offset units.
2880 */
2881 if (brw->gen < 6) {
2882 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2883 }
2884
2885 return index;
2886 } else {
2887 int message_header_scale = brw->gen < 6 ? 16 : 1;
2888 return src_reg(reg_offset * message_header_scale);
2889 }
2890 }
2891
2892 /**
2893 * Emits an instruction before @inst to load the value named by @orig_src
2894 * from scratch space at @base_offset to @temp.
2895 *
2896 * @base_offset is measured in 32-byte units (the size of a register).
2897 */
2898 void
2899 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2900 dst_reg temp, src_reg orig_src,
2901 int base_offset)
2902 {
2903 int reg_offset = base_offset + orig_src.reg_offset;
2904 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2905
2906 emit_before(inst, SCRATCH_READ(temp, index));
2907 }
2908
2909 /**
2910 * Emits an instruction after @inst to store the value to be written
2911 * to @orig_dst to scratch space at @base_offset, from @temp.
2912 *
2913 * @base_offset is measured in 32-byte units (the size of a register).
2914 */
2915 void
2916 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2917 {
2918 int reg_offset = base_offset + inst->dst.reg_offset;
2919 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2920
2921 /* Create a temporary register to store *inst's result in.
2922 *
2923 * We have to be careful in MOVing from our temporary result register in
2924 * the scratch write. If we swizzle from channels of the temporary that
2925 * weren't initialized, it will confuse live interval analysis, which will
2926 * make spilling fail to make progress.
2927 */
2928 src_reg temp = src_reg(this, glsl_type::vec4_type);
2929 temp.type = inst->dst.type;
2930 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2931 int swizzles[4];
2932 for (int i = 0; i < 4; i++)
2933 if (inst->dst.writemask & (1 << i))
2934 swizzles[i] = i;
2935 else
2936 swizzles[i] = first_writemask_chan;
2937 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2938 swizzles[2], swizzles[3]);
2939
2940 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2941 inst->dst.writemask));
2942 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2943 write->predicate = inst->predicate;
2944 write->ir = inst->ir;
2945 write->annotation = inst->annotation;
2946 inst->insert_after(write);
2947
2948 inst->dst.file = temp.file;
2949 inst->dst.reg = temp.reg;
2950 inst->dst.reg_offset = temp.reg_offset;
2951 inst->dst.reladdr = NULL;
2952 }
2953
2954 /**
2955 * We can't generally support array access in GRF space, because a
2956 * single instruction's destination can only span 2 contiguous
2957 * registers. So, we send all GRF arrays that get variable index
2958 * access to scratch space.
2959 */
2960 void
2961 vec4_visitor::move_grf_array_access_to_scratch()
2962 {
2963 int scratch_loc[this->virtual_grf_count];
2964
2965 for (int i = 0; i < this->virtual_grf_count; i++) {
2966 scratch_loc[i] = -1;
2967 }
2968
2969 /* First, calculate the set of virtual GRFs that need to be punted
2970 * to scratch due to having any array access on them, and where in
2971 * scratch.
2972 */
2973 foreach_list(node, &this->instructions) {
2974 vec4_instruction *inst = (vec4_instruction *)node;
2975
2976 if (inst->dst.file == GRF && inst->dst.reladdr &&
2977 scratch_loc[inst->dst.reg] == -1) {
2978 scratch_loc[inst->dst.reg] = c->last_scratch;
2979 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2980 }
2981
2982 for (int i = 0 ; i < 3; i++) {
2983 src_reg *src = &inst->src[i];
2984
2985 if (src->file == GRF && src->reladdr &&
2986 scratch_loc[src->reg] == -1) {
2987 scratch_loc[src->reg] = c->last_scratch;
2988 c->last_scratch += this->virtual_grf_sizes[src->reg];
2989 }
2990 }
2991 }
2992
2993 /* Now, for anything that will be accessed through scratch, rewrite
2994 * it to load/store. Note that this is a _safe list walk, because
2995 * we may generate a new scratch_write instruction after the one
2996 * we're processing.
2997 */
2998 foreach_list_safe(node, &this->instructions) {
2999 vec4_instruction *inst = (vec4_instruction *)node;
3000
3001 /* Set up the annotation tracking for newly generated instructions. */
3002 base_ir = inst->ir;
3003 current_annotation = inst->annotation;
3004
3005 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3006 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3007 }
3008
3009 for (int i = 0 ; i < 3; i++) {
3010 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3011 continue;
3012
3013 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3014
3015 emit_scratch_read(inst, temp, inst->src[i],
3016 scratch_loc[inst->src[i].reg]);
3017
3018 inst->src[i].file = temp.file;
3019 inst->src[i].reg = temp.reg;
3020 inst->src[i].reg_offset = temp.reg_offset;
3021 inst->src[i].reladdr = NULL;
3022 }
3023 }
3024 }
3025
3026 /**
3027 * Emits an instruction before @inst to load the value named by @orig_src
3028 * from the pull constant buffer (surface) at @base_offset to @temp.
3029 */
3030 void
3031 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3032 dst_reg temp, src_reg orig_src,
3033 int base_offset)
3034 {
3035 int reg_offset = base_offset + orig_src.reg_offset;
3036 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3037 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3038 vec4_instruction *load;
3039
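/* Gen7+ takes the offset as a plain register source (moved into grf_offset
 * below); earlier generations use an MRF-based message, so base_mrf and
 * mlen are set instead.
 */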
3040 if (brw->gen >= 7) {
3041 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3042 grf_offset.type = offset.type;
3043 emit_before(inst, MOV(grf_offset, offset));
3044
3045 load = new(mem_ctx) vec4_instruction(this,
3046 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3047 temp, index, src_reg(grf_offset));
3048 } else {
3049 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3050 temp, index, offset);
3051 load->base_mrf = 14;
3052 load->mlen = 1;
3053 }
3054 emit_before(inst, load);
3055 }
3056
3057 /**
3058 * Implements array access of uniforms by inserting a
3059 * PULL_CONSTANT_LOAD instruction.
3060 *
3061 * Unlike temporary GRF array access (which we don't support, due to
3062 * the difficulty of doing relative addressing on instruction
3063 * destinations), we could potentially do array access of uniforms
3064 * that were loaded in GRF space as push constants. In real-world
3065 * usage we've seen, though, the arrays being used are always larger
3066 * than we could load as push constants, so just always move all
3067 * uniform array access out to a pull constant buffer.
3068 */
3069 void
3070 vec4_visitor::move_uniform_array_access_to_pull_constants()
3071 {
3072 int pull_constant_loc[this->uniforms];
3073
3074 for (int i = 0; i < this->uniforms; i++) {
3075 pull_constant_loc[i] = -1;
3076 }
3077
3078 /* Walk through and find array access of uniforms. Put a copy of that
3079 * uniform in the pull constant buffer.
3080 *
3081 * Note that we don't move constant-indexed accesses to arrays. No
3082 * testing has been done of the performance impact of this choice.
3083 */
3084 foreach_list_safe(node, &this->instructions) {
3085 vec4_instruction *inst = (vec4_instruction *)node;
3086
3087 for (int i = 0 ; i < 3; i++) {
3088 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3089 continue;
3090
3091 int uniform = inst->src[i].reg;
3092
3093 /* If this array isn't already present in the pull constant buffer,
3094 * add it.
3095 */
3096 if (pull_constant_loc[uniform] == -1) {
3097 const float **values = &prog_data->param[uniform * 4];
3098
3099 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3100
3101 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3102 prog_data->pull_param[prog_data->nr_pull_params++]
3103 = values[j];
3104 }
3105 }
3106
3107 /* Set up the annotation tracking for newly generated instructions. */
3108 base_ir = inst->ir;
3109 current_annotation = inst->annotation;
3110
3111 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3112
3113 emit_pull_constant_load(inst, temp, inst->src[i],
3114 pull_constant_loc[uniform]);
3115
3116 inst->src[i].file = temp.file;
3117 inst->src[i].reg = temp.reg;
3118 inst->src[i].reg_offset = temp.reg_offset;
3119 inst->src[i].reladdr = NULL;
3120 }
3121 }
3122
3123 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3124 * no need to track them as larger-than-vec4 objects. This will be
3125 * relied on in cutting out unused uniform vectors from push
3126 * constants.
3127 */
3128 split_uniform_registers();
3129 }
3130
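/* If an unsigned source has the negate modifier set, resolve the negation
 * through an explicit MOV into a temporary and point the source at the
 * temporary instead.
 */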
3131 void
3132 vec4_visitor::resolve_ud_negate(src_reg *reg)
3133 {
3134 if (reg->type != BRW_REGISTER_TYPE_UD ||
3135 !reg->negate)
3136 return;
3137
3138 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3139 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3140 *reg = temp;
3141 }
3142
3143 vec4_visitor::vec4_visitor(struct brw_context *brw,
3144 struct brw_vec4_compile *c,
3145 struct gl_program *prog,
3146 const struct brw_vec4_prog_key *key,
3147 struct brw_vec4_prog_data *prog_data,
3148 struct gl_shader_program *shader_prog,
3149 struct brw_shader *shader,
3150 void *mem_ctx,
3151 bool debug_flag,
3152 bool no_spills)
3153 : debug_flag(debug_flag), no_spills(no_spills)
3154 {
3155 this->brw = brw;
3156 this->ctx = &brw->ctx;
3157 this->shader_prog = shader_prog;
3158 this->shader = shader;
3159
3160 this->mem_ctx = mem_ctx;
3161 this->failed = false;
3162
3163 this->base_ir = NULL;
3164 this->current_annotation = NULL;
3165 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3166
3167 this->c = c;
3168 this->prog = prog;
3169 this->key = key;
3170 this->prog_data = prog_data;
3171 this->stage_prog_data = &prog_data->base;
3172
3173 this->variable_ht = hash_table_ctor(0,
3174 hash_table_pointer_hash,
3175 hash_table_pointer_compare);
3176
3177 this->virtual_grf_start = NULL;
3178 this->virtual_grf_end = NULL;
3179 this->virtual_grf_sizes = NULL;
3180 this->virtual_grf_count = 0;
3181 this->virtual_grf_reg_map = NULL;
3182 this->virtual_grf_reg_count = 0;
3183 this->virtual_grf_array_size = 0;
3184 this->live_intervals_valid = false;
3185
3186 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3187
3188 this->uniforms = 0;
3189 }
3190
3191 vec4_visitor::~vec4_visitor()
3192 {
3193 hash_table_dtor(this->variable_ht);
3194 }
3195
3196
3197 void
3198 vec4_visitor::fail(const char *format, ...)
3199 {
3200 va_list va;
3201 char *msg;
3202
3203 if (failed)
3204 return;
3205
3206 failed = true;
3207
3208 va_start(va, format);
3209 msg = ralloc_vasprintf(mem_ctx, format, va);
3210 va_end(va);
3211 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3212
3213 this->fail_msg = msg;
3214
3215 if (debug_flag) {
3216 fprintf(stderr, "%s", msg);
3217 }
3218 }
3219
3220 } /* namespace brw */