i965/vec4: Add the ability to suppress register spilling.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU3(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
131 { \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6+ IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen >= 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
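 *
 * For example (a sketch of how callers later in this file use it, not new
 * behavior): a 0/1 boolean is usually produced by following the CMP with an
 * AND against 1, as in
 *
 *    emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
 *    emit(AND(result_dst, result_src, src_reg(0x1)));
 *
 * because only the low bit of each destination channel is defined.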
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
 209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
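 * A worked example of the packing (illustrative only; these are ordinary
 * IEEE half-float encodings, not values taken from the PRM):
 * packHalf2x16(vec2(1.0, -2.0)) stores half(1.0) = 0x3C00 in the low word
 * and half(-2.0) = 0xC000 in the high word of each channel, i.e.
 * 0xC0003C00. The F32TO16/SHL/OR sequence below assembles exactly that
 * layout.
 *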
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * That the upper word of each write-channel be 0 is required for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
 477 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
 484 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
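 *
 * Continuing the example from emit_pack_half_2x16 (illustrative only):
 * unpackHalf2x16(0xC0003C00) should give vec2(1.0, -2.0). The AND below
 * extracts the low word (0x3C00) into the X channel of tmp, the SHR puts
 * the high word (0xC000) into Y, and F16TO32 converts both to float.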
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 /* Regardless of size of vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
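 *
 * Illustrative slot counts implied by this function as a whole (not an
 * exhaustive table): float and vec3 each take one slot, mat4 takes 4,
 * float[10] takes 10, and struct { vec3 a; float b; } takes 2.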
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_VOID:
573 case GLSL_TYPE_ERROR:
574 case GLSL_TYPE_INTERFACE:
575 assert(0);
576 break;
577 }
578
579 return 0;
580 }
581
582 int
583 vec4_visitor::virtual_grf_alloc(int size)
584 {
585 if (virtual_grf_array_size <= virtual_grf_count) {
586 if (virtual_grf_array_size == 0)
587 virtual_grf_array_size = 16;
588 else
589 virtual_grf_array_size *= 2;
590 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
591 virtual_grf_array_size);
592 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
593 virtual_grf_array_size);
594 }
595 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
596 virtual_grf_reg_count += size;
597 virtual_grf_sizes[virtual_grf_count] = size;
598 return virtual_grf_count++;
599 }
600
601 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
602 {
603 init();
604
605 this->file = GRF;
606 this->reg = v->virtual_grf_alloc(type_size(type));
607
608 if (type->is_array() || type->is_record()) {
609 this->swizzle = BRW_SWIZZLE_NOOP;
610 } else {
611 this->swizzle = swizzle_for_size(type->vector_elements);
612 }
613
614 this->type = brw_type_for_base_type(type);
615 }
616
617 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
618 {
619 init();
620
621 this->file = GRF;
622 this->reg = v->virtual_grf_alloc(type_size(type));
623
624 if (type->is_array() || type->is_record()) {
625 this->writemask = WRITEMASK_XYZW;
626 } else {
627 this->writemask = (1 << type->vector_elements) - 1;
628 }
629
630 this->type = brw_type_for_base_type(type);
631 }
632
633 /* Our support for uniforms is piggy-backed on the struct
634 * gl_fragment_program, because that's where the values actually
635 * get stored, rather than in some global gl_shader_program uniform
636 * store.
637 */
638 void
639 vec4_visitor::setup_uniform_values(ir_variable *ir)
640 {
641 int namelen = strlen(ir->name);
642
643 /* The data for our (non-builtin) uniforms is stored in a series of
644 * gl_uniform_driver_storage structs for each subcomponent that
645 * glGetUniformLocation() could name. We know it's been set up in the same
646 * order we'd walk the type, so walk the list of storage and find anything
647 * with our name, or the prefix of a component that starts with our name.
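 *
 * For example (hypothetical uniform names, purely to illustrate the prefix
 * test below): for ir->name "lights", storage entries named "lights",
 * "lights[2]" or "lights.position" all match, because the character right
 * after the prefix is '\0', '[' or '.'; an unrelated "lightscale" does not.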
648 */
649 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
650 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
651
652 if (strncmp(ir->name, storage->name, namelen) != 0 ||
653 (storage->name[namelen] != 0 &&
654 storage->name[namelen] != '.' &&
655 storage->name[namelen] != '[')) {
656 continue;
657 }
658
659 gl_constant_value *components = storage->storage;
660 unsigned vector_count = (MAX2(storage->array_elements, 1) *
661 storage->type->matrix_columns);
662
663 for (unsigned s = 0; s < vector_count; s++) {
664 uniform_vector_size[uniforms] = storage->type->vector_elements;
665
666 int i;
667 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
668 prog_data->param[uniforms * 4 + i] = &components->f;
669 components++;
670 }
671 for (; i < 4; i++) {
672 static float zero = 0;
673 prog_data->param[uniforms * 4 + i] = &zero;
674 }
675
676 uniforms++;
677 }
678 }
679 }
680
681 void
682 vec4_visitor::setup_uniform_clipplane_values()
683 {
684 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
685
686 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
687 this->uniform_vector_size[this->uniforms] = 4;
688 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
689 this->userplane[i].type = BRW_REGISTER_TYPE_F;
690 for (int j = 0; j < 4; ++j) {
691 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
692 }
693 ++this->uniforms;
694 }
695 }
696
697 /* Our support for builtin uniforms is even scarier than non-builtin.
698 * It sits on top of the PROG_STATE_VAR parameters that are
699 * automatically updated from GL context state.
700 */
701 void
702 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
703 {
704 const ir_state_slot *const slots = ir->state_slots;
705 assert(ir->state_slots != NULL);
706
707 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 708 /* This state reference has already been set up by ir_to_mesa,
709 * but we'll get the same index back here. We can reference
710 * ParameterValues directly, since unlike brw_fs.cpp, we never
711 * add new state references during compile.
712 */
713 int index = _mesa_add_state_reference(this->prog->Parameters,
714 (gl_state_index *)slots[i].tokens);
715 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
716
717 this->uniform_vector_size[this->uniforms] = 0;
718 /* Add each of the unique swizzled channels of the element.
719 * This will end up matching the size of the glsl_type of this field.
720 */
721 int last_swiz = -1;
722 for (unsigned int j = 0; j < 4; j++) {
723 int swiz = GET_SWZ(slots[i].swizzle, j);
 724 if (swiz == last_swiz)
 725 break;
 726 last_swiz = swiz;
 727 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
 728 this->uniform_vector_size[this->uniforms]++;
729 }
730 this->uniforms++;
731 }
732 }
733
734 dst_reg *
735 vec4_visitor::variable_storage(ir_variable *var)
736 {
737 return (dst_reg *)hash_table_find(this->variable_ht, var);
738 }
739
740 void
741 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
742 {
743 ir_expression *expr = ir->as_expression();
744
745 *predicate = BRW_PREDICATE_NORMAL;
746
747 if (expr) {
748 src_reg op[2];
749 vec4_instruction *inst;
750
751 assert(expr->get_num_operands() <= 2);
752 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
753 expr->operands[i]->accept(this);
754 op[i] = this->result;
755
756 resolve_ud_negate(&op[i]);
757 }
758
759 switch (expr->operation) {
760 case ir_unop_logic_not:
761 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
762 inst->conditional_mod = BRW_CONDITIONAL_Z;
763 break;
764
765 case ir_binop_logic_xor:
766 inst = emit(XOR(dst_null_d(), op[0], op[1]));
767 inst->conditional_mod = BRW_CONDITIONAL_NZ;
768 break;
769
770 case ir_binop_logic_or:
771 inst = emit(OR(dst_null_d(), op[0], op[1]));
772 inst->conditional_mod = BRW_CONDITIONAL_NZ;
773 break;
774
775 case ir_binop_logic_and:
776 inst = emit(AND(dst_null_d(), op[0], op[1]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 break;
779
780 case ir_unop_f2b:
781 if (brw->gen >= 6) {
782 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
783 } else {
784 inst = emit(MOV(dst_null_f(), op[0]));
785 inst->conditional_mod = BRW_CONDITIONAL_NZ;
786 }
787 break;
788
789 case ir_unop_i2b:
790 if (brw->gen >= 6) {
791 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
792 } else {
793 inst = emit(MOV(dst_null_d(), op[0]));
794 inst->conditional_mod = BRW_CONDITIONAL_NZ;
795 }
796 break;
797
798 case ir_binop_all_equal:
799 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
800 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
801 break;
802
803 case ir_binop_any_nequal:
804 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
805 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
806 break;
807
808 case ir_unop_any:
809 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
810 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
811 break;
812
813 case ir_binop_greater:
814 case ir_binop_gequal:
815 case ir_binop_less:
816 case ir_binop_lequal:
817 case ir_binop_equal:
818 case ir_binop_nequal:
819 emit(CMP(dst_null_d(), op[0], op[1],
820 brw_conditional_for_comparison(expr->operation)));
821 break;
822
823 default:
824 assert(!"not reached");
825 break;
826 }
827 return;
828 }
829
830 ir->accept(this);
831
832 resolve_ud_negate(&this->result);
833
834 if (brw->gen >= 6) {
835 vec4_instruction *inst = emit(AND(dst_null_d(),
836 this->result, src_reg(1)));
837 inst->conditional_mod = BRW_CONDITIONAL_NZ;
838 } else {
839 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 }
842 }
843
844 /**
845 * Emit a gen6 IF statement with the comparison folded into the IF
846 * instruction.
847 */
848 void
849 vec4_visitor::emit_if_gen6(ir_if *ir)
850 {
851 ir_expression *expr = ir->condition->as_expression();
852
853 if (expr) {
854 src_reg op[2];
855 dst_reg temp;
856
857 assert(expr->get_num_operands() <= 2);
858 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
859 expr->operands[i]->accept(this);
860 op[i] = this->result;
861 }
862
863 switch (expr->operation) {
864 case ir_unop_logic_not:
865 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
866 return;
867
868 case ir_binop_logic_xor:
869 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
870 return;
871
872 case ir_binop_logic_or:
873 temp = dst_reg(this, glsl_type::bool_type);
874 emit(OR(temp, op[0], op[1]));
875 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
876 return;
877
878 case ir_binop_logic_and:
879 temp = dst_reg(this, glsl_type::bool_type);
880 emit(AND(temp, op[0], op[1]));
881 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
882 return;
883
884 case ir_unop_f2b:
885 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887
888 case ir_unop_i2b:
889 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
890 return;
891
892 case ir_binop_greater:
893 case ir_binop_gequal:
894 case ir_binop_less:
895 case ir_binop_lequal:
896 case ir_binop_equal:
897 case ir_binop_nequal:
898 emit(IF(op[0], op[1],
899 brw_conditional_for_comparison(expr->operation)));
900 return;
901
902 case ir_binop_all_equal:
903 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
904 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
905 return;
906
907 case ir_binop_any_nequal:
908 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
909 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
910 return;
911
912 case ir_unop_any:
913 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
914 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
915 return;
916
917 default:
918 assert(!"not reached");
919 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
920 return;
921 }
922 return;
923 }
924
925 ir->condition->accept(this);
926
927 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
928 }
929
930 dst_reg
931 with_writemask(dst_reg const & r, int mask)
932 {
933 dst_reg result = r;
934 result.writemask = mask;
935 return result;
936 }
937
938
939 void
940 vec4_visitor::visit(ir_variable *ir)
941 {
942 dst_reg *reg = NULL;
943
944 if (variable_storage(ir))
945 return;
946
947 switch (ir->mode) {
948 case ir_var_shader_in:
949 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
950 break;
951
952 case ir_var_shader_out:
953 reg = new(mem_ctx) dst_reg(this, ir->type);
954
955 for (int i = 0; i < type_size(ir->type); i++) {
956 output_reg[ir->location + i] = *reg;
957 output_reg[ir->location + i].reg_offset = i;
958 output_reg[ir->location + i].type =
959 brw_type_for_base_type(ir->type->get_scalar_type());
960 output_reg_annotation[ir->location + i] = ir->name;
961 }
962 break;
963
964 case ir_var_auto:
965 case ir_var_temporary:
966 reg = new(mem_ctx) dst_reg(this, ir->type);
967 break;
968
969 case ir_var_uniform:
970 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
971
972 /* Thanks to the lower_ubo_reference pass, we will see only
973 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
974 * variables, so no need for them to be in variable_ht.
975 */
976 if (ir->is_in_uniform_block())
977 return;
978
979 /* Track how big the whole uniform variable is, in case we need to put a
980 * copy of its data into pull constants for array access.
981 */
982 this->uniform_size[this->uniforms] = type_size(ir->type);
983
984 if (!strncmp(ir->name, "gl_", 3)) {
985 setup_builtin_uniform_values(ir);
986 } else {
987 setup_uniform_values(ir);
988 }
989 break;
990
991 case ir_var_system_value:
992 reg = make_reg_for_system_value(ir);
993 break;
994
995 default:
996 assert(!"not reached");
997 }
998
999 reg->type = brw_type_for_base_type(ir->type);
1000 hash_table_insert(this->variable_ht, reg, ir);
1001 }
1002
1003 void
1004 vec4_visitor::visit(ir_loop *ir)
1005 {
1006 dst_reg counter;
1007
1008 /* We don't want debugging output to print the whole body of the
1009 * loop as the annotation.
1010 */
1011 this->base_ir = NULL;
1012
1013 if (ir->counter != NULL) {
1014 this->base_ir = ir->counter;
1015 ir->counter->accept(this);
1016 counter = *(variable_storage(ir->counter));
1017
1018 if (ir->from != NULL) {
1019 this->base_ir = ir->from;
1020 ir->from->accept(this);
1021
1022 emit(MOV(counter, this->result));
1023 }
1024 }
1025
1026 emit(BRW_OPCODE_DO);
1027
1028 if (ir->to) {
1029 this->base_ir = ir->to;
1030 ir->to->accept(this);
1031
1032 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1033 brw_conditional_for_comparison(ir->cmp)));
1034
1035 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1036 inst->predicate = BRW_PREDICATE_NORMAL;
1037 }
1038
1039 visit_instructions(&ir->body_instructions);
1040
1041
1042 if (ir->increment) {
1043 this->base_ir = ir->increment;
1044 ir->increment->accept(this);
1045 emit(ADD(counter, src_reg(counter), this->result));
1046 }
1047
1048 emit(BRW_OPCODE_WHILE);
1049 }
1050
1051 void
1052 vec4_visitor::visit(ir_loop_jump *ir)
1053 {
1054 switch (ir->mode) {
1055 case ir_loop_jump::jump_break:
1056 emit(BRW_OPCODE_BREAK);
1057 break;
1058 case ir_loop_jump::jump_continue:
1059 emit(BRW_OPCODE_CONTINUE);
1060 break;
1061 }
1062 }
1063
1064
1065 void
1066 vec4_visitor::visit(ir_function_signature *ir)
1067 {
1068 assert(0);
1069 (void)ir;
1070 }
1071
1072 void
1073 vec4_visitor::visit(ir_function *ir)
1074 {
1075 /* Ignore function bodies other than main() -- we shouldn't see calls to
1076 * them since they should all be inlined.
1077 */
1078 if (strcmp(ir->name, "main") == 0) {
1079 const ir_function_signature *sig;
1080 exec_list empty;
1081
1082 sig = ir->matching_signature(NULL, &empty);
1083
1084 assert(sig);
1085
1086 visit_instructions(&sig->body);
1087 }
1088 }
1089
1090 bool
1091 vec4_visitor::try_emit_sat(ir_expression *ir)
1092 {
1093 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1094 if (!sat_src)
1095 return false;
1096
1097 sat_src->accept(this);
1098 src_reg src = this->result;
1099
1100 this->result = src_reg(this, ir->type);
1101 vec4_instruction *inst;
1102 inst = emit(MOV(dst_reg(this->result), src));
1103 inst->saturate = true;
1104
1105 return true;
1106 }
1107
1108 bool
1109 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1110 {
1111 /* 3-src instructions were introduced in gen6. */
1112 if (brw->gen < 6)
1113 return false;
1114
1115 /* MAD can only handle floating-point data. */
1116 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1117 return false;
1118
1119 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1120 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1121
1122 if (!mul || mul->operation != ir_binop_mul)
1123 return false;
1124
1125 nonmul->accept(this);
1126 src_reg src0 = fix_3src_operand(this->result);
1127
1128 mul->operands[0]->accept(this);
1129 src_reg src1 = fix_3src_operand(this->result);
1130
1131 mul->operands[1]->accept(this);
1132 src_reg src2 = fix_3src_operand(this->result);
1133
1134 this->result = src_reg(this, ir->type);
1135 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1136
1137 return true;
1138 }
1139
1140 void
1141 vec4_visitor::emit_bool_comparison(unsigned int op,
1142 dst_reg dst, src_reg src0, src_reg src1)
1143 {
1144 /* original gen4 does destination conversion before comparison. */
1145 if (brw->gen < 5)
1146 dst.type = src0.type;
1147
1148 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1149
1150 dst.type = BRW_REGISTER_TYPE_D;
1151 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1152 }
1153
1154 void
1155 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1156 src_reg src0, src_reg src1)
1157 {
1158 vec4_instruction *inst;
1159
1160 if (brw->gen >= 6) {
1161 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1162 inst->conditional_mod = conditionalmod;
1163 } else {
1164 emit(CMP(dst, src0, src1, conditionalmod));
1165
1166 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1167 inst->predicate = BRW_PREDICATE_NORMAL;
1168 }
1169 }
1170
1171 static bool
1172 is_16bit_constant(ir_rvalue *rvalue)
1173 {
1174 ir_constant *constant = rvalue->as_constant();
1175 if (!constant)
1176 return false;
1177
1178 if (constant->type != glsl_type::int_type &&
1179 constant->type != glsl_type::uint_type)
1180 return false;
1181
1182 return constant->value.u[0] < (1 << 16);
1183 }
1184
1185 void
1186 vec4_visitor::visit(ir_expression *ir)
1187 {
1188 unsigned int operand;
1189 src_reg op[Elements(ir->operands)];
1190 src_reg result_src;
1191 dst_reg result_dst;
1192 vec4_instruction *inst;
1193
1194 if (try_emit_sat(ir))
1195 return;
1196
1197 if (ir->operation == ir_binop_add) {
1198 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1199 return;
1200 }
1201
1202 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1203 this->result.file = BAD_FILE;
1204 ir->operands[operand]->accept(this);
1205 if (this->result.file == BAD_FILE) {
1206 printf("Failed to get tree for expression operand:\n");
1207 ir->operands[operand]->print();
1208 exit(1);
1209 }
1210 op[operand] = this->result;
1211
1212 /* Matrix expression operands should have been broken down to vector
1213 * operations already.
1214 */
1215 assert(!ir->operands[operand]->type->is_matrix());
1216 }
1217
1218 int vector_elements = ir->operands[0]->type->vector_elements;
1219 if (ir->operands[1]) {
1220 vector_elements = MAX2(vector_elements,
1221 ir->operands[1]->type->vector_elements);
1222 }
1223
1224 this->result.file = BAD_FILE;
1225
1226 /* Storage for our result. Ideally for an assignment we'd be using
1227 * the actual storage for the result here, instead.
1228 */
1229 result_src = src_reg(this, ir->type);
1230 /* convenience for the emit functions below. */
1231 result_dst = dst_reg(result_src);
1232 /* If nothing special happens, this is the result. */
1233 this->result = result_src;
1234 /* Limit writes to the channels that will be used by result_src later.
1235 * This does limit this temp's use as a temporary for multi-instruction
1236 * sequences.
1237 */
1238 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1239
1240 switch (ir->operation) {
1241 case ir_unop_logic_not:
1242 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1243 * ones complement of the whole register, not just bit 0.
1244 */
1245 emit(XOR(result_dst, op[0], src_reg(1)));
1246 break;
1247 case ir_unop_neg:
1248 op[0].negate = !op[0].negate;
1249 emit(MOV(result_dst, op[0]));
1250 break;
1251 case ir_unop_abs:
1252 op[0].abs = true;
1253 op[0].negate = false;
1254 emit(MOV(result_dst, op[0]));
1255 break;
1256
1257 case ir_unop_sign:
1258 emit(MOV(result_dst, src_reg(0.0f)));
1259
1260 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1261 inst = emit(MOV(result_dst, src_reg(1.0f)));
1262 inst->predicate = BRW_PREDICATE_NORMAL;
1263
1264 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1265 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1266 inst->predicate = BRW_PREDICATE_NORMAL;
1267
1268 break;
1269
1270 case ir_unop_rcp:
1271 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1272 break;
1273
1274 case ir_unop_exp2:
1275 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1276 break;
1277 case ir_unop_log2:
1278 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1279 break;
1280 case ir_unop_exp:
1281 case ir_unop_log:
1282 assert(!"not reached: should be handled by ir_explog_to_explog2");
1283 break;
1284 case ir_unop_sin:
1285 case ir_unop_sin_reduced:
1286 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1287 break;
1288 case ir_unop_cos:
1289 case ir_unop_cos_reduced:
1290 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1291 break;
1292
1293 case ir_unop_dFdx:
1294 case ir_unop_dFdy:
1295 assert(!"derivatives not valid in vertex shader");
1296 break;
1297
1298 case ir_unop_bitfield_reverse:
1299 emit(BFREV(result_dst, op[0]));
1300 break;
1301 case ir_unop_bit_count:
1302 emit(CBIT(result_dst, op[0]));
1303 break;
1304 case ir_unop_find_msb: {
1305 src_reg temp = src_reg(this, glsl_type::uint_type);
1306
1307 inst = emit(FBH(dst_reg(temp), op[0]));
1308 inst->dst.writemask = WRITEMASK_XYZW;
1309
1310 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1311 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1312 * subtract the result from 31 to convert the MSB count into an LSB count.
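 *
 * Worked example (illustrative): for an input of 1, FBH returns 31 and the
 * predicated ADD below computes 31 - 31 = 0, which is findMSB(1). For an
 * input of 0, FBH returns 0xFFFFFFFF, the CMP fails, the ADD is skipped,
 * and the result stays -1, as findMSB() requires.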
1313 */
1314
1315 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1316 temp.swizzle = BRW_SWIZZLE_NOOP;
1317 emit(MOV(result_dst, temp));
1318
1319 src_reg src_tmp = src_reg(result_dst);
1320 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1321
1322 src_tmp.negate = true;
1323 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1324 inst->predicate = BRW_PREDICATE_NORMAL;
1325 break;
1326 }
1327 case ir_unop_find_lsb:
1328 emit(FBL(result_dst, op[0]));
1329 break;
1330
1331 case ir_unop_noise:
1332 assert(!"not reached: should be handled by lower_noise");
1333 break;
1334
1335 case ir_binop_add:
1336 emit(ADD(result_dst, op[0], op[1]));
1337 break;
1338 case ir_binop_sub:
1339 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1340 break;
1341
1342 case ir_binop_mul:
1343 if (ir->type->is_integer()) {
1344 /* For integer multiplication, the MUL uses the low 16 bits of one of
1345 * the operands (src0 through SNB, src1 on IVB and later). The MACH
 1346 * accumulates the contribution of the upper 16 bits of that
1347 * operand. If we can determine that one of the args is in the low
1348 * 16 bits, though, we can just emit a single MUL.
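 *
 * For instance (hypothetical shader expressions, purely illustrative):
 * "x * 7" takes the single-MUL path because the constant fits in 16 bits,
 * while "x * y" for two arbitrary ints falls through to the
 * MUL + MACH + MOV-from-accumulator sequence to produce the low 32 bits of
 * the full product.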
1349 */
1350 if (is_16bit_constant(ir->operands[0])) {
1351 if (brw->gen < 7)
1352 emit(MUL(result_dst, op[0], op[1]));
1353 else
1354 emit(MUL(result_dst, op[1], op[0]));
1355 } else if (is_16bit_constant(ir->operands[1])) {
1356 if (brw->gen < 7)
1357 emit(MUL(result_dst, op[1], op[0]));
1358 else
1359 emit(MUL(result_dst, op[0], op[1]));
1360 } else {
1361 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1362
1363 emit(MUL(acc, op[0], op[1]));
1364 emit(MACH(dst_null_d(), op[0], op[1]));
1365 emit(MOV(result_dst, src_reg(acc)));
1366 }
1367 } else {
1368 emit(MUL(result_dst, op[0], op[1]));
1369 }
1370 break;
1371 case ir_binop_imul_high: {
1372 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1373
1374 emit(MUL(acc, op[0], op[1]));
1375 emit(MACH(result_dst, op[0], op[1]));
1376 break;
1377 }
1378 case ir_binop_div:
1379 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1380 assert(ir->type->is_integer());
1381 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1382 break;
1383 case ir_binop_carry: {
1384 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1385
1386 emit(ADDC(dst_null_ud(), op[0], op[1]));
1387 emit(MOV(result_dst, src_reg(acc)));
1388 break;
1389 }
1390 case ir_binop_borrow: {
1391 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1392
1393 emit(SUBB(dst_null_ud(), op[0], op[1]));
1394 emit(MOV(result_dst, src_reg(acc)));
1395 break;
1396 }
1397 case ir_binop_mod:
1398 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1399 assert(ir->type->is_integer());
1400 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1401 break;
1402
1403 case ir_binop_less:
1404 case ir_binop_greater:
1405 case ir_binop_lequal:
1406 case ir_binop_gequal:
1407 case ir_binop_equal:
1408 case ir_binop_nequal: {
1409 emit(CMP(result_dst, op[0], op[1],
1410 brw_conditional_for_comparison(ir->operation)));
1411 emit(AND(result_dst, result_src, src_reg(0x1)));
1412 break;
1413 }
1414
1415 case ir_binop_all_equal:
1416 /* "==" operator producing a scalar boolean. */
1417 if (ir->operands[0]->type->is_vector() ||
1418 ir->operands[1]->type->is_vector()) {
1419 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1420 emit(MOV(result_dst, src_reg(0)));
1421 inst = emit(MOV(result_dst, src_reg(1)));
1422 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1423 } else {
1424 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1425 emit(AND(result_dst, result_src, src_reg(0x1)));
1426 }
1427 break;
1428 case ir_binop_any_nequal:
1429 /* "!=" operator producing a scalar boolean. */
1430 if (ir->operands[0]->type->is_vector() ||
1431 ir->operands[1]->type->is_vector()) {
1432 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1433
1434 emit(MOV(result_dst, src_reg(0)));
1435 inst = emit(MOV(result_dst, src_reg(1)));
1436 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1437 } else {
1438 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1439 emit(AND(result_dst, result_src, src_reg(0x1)));
1440 }
1441 break;
1442
1443 case ir_unop_any:
1444 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1445 emit(MOV(result_dst, src_reg(0)));
1446
1447 inst = emit(MOV(result_dst, src_reg(1)));
1448 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1449 break;
1450
1451 case ir_binop_logic_xor:
1452 emit(XOR(result_dst, op[0], op[1]));
1453 break;
1454
1455 case ir_binop_logic_or:
1456 emit(OR(result_dst, op[0], op[1]));
1457 break;
1458
1459 case ir_binop_logic_and:
1460 emit(AND(result_dst, op[0], op[1]));
1461 break;
1462
1463 case ir_binop_dot:
1464 assert(ir->operands[0]->type->is_vector());
1465 assert(ir->operands[0]->type == ir->operands[1]->type);
1466 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1467 break;
1468
1469 case ir_unop_sqrt:
1470 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1471 break;
1472 case ir_unop_rsq:
1473 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1474 break;
1475
1476 case ir_unop_bitcast_i2f:
1477 case ir_unop_bitcast_u2f:
1478 this->result = op[0];
1479 this->result.type = BRW_REGISTER_TYPE_F;
1480 break;
1481
1482 case ir_unop_bitcast_f2i:
1483 this->result = op[0];
1484 this->result.type = BRW_REGISTER_TYPE_D;
1485 break;
1486
1487 case ir_unop_bitcast_f2u:
1488 this->result = op[0];
1489 this->result.type = BRW_REGISTER_TYPE_UD;
1490 break;
1491
1492 case ir_unop_i2f:
1493 case ir_unop_i2u:
1494 case ir_unop_u2i:
1495 case ir_unop_u2f:
1496 case ir_unop_b2f:
1497 case ir_unop_b2i:
1498 case ir_unop_f2i:
1499 case ir_unop_f2u:
1500 emit(MOV(result_dst, op[0]));
1501 break;
1502 case ir_unop_f2b:
1503 case ir_unop_i2b: {
1504 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1505 emit(AND(result_dst, result_src, src_reg(1)));
1506 break;
1507 }
1508
1509 case ir_unop_trunc:
1510 emit(RNDZ(result_dst, op[0]));
1511 break;
1512 case ir_unop_ceil:
1513 op[0].negate = !op[0].negate;
1514 inst = emit(RNDD(result_dst, op[0]));
1515 this->result.negate = true;
1516 break;
1517 case ir_unop_floor:
1518 inst = emit(RNDD(result_dst, op[0]));
1519 break;
1520 case ir_unop_fract:
1521 inst = emit(FRC(result_dst, op[0]));
1522 break;
1523 case ir_unop_round_even:
1524 emit(RNDE(result_dst, op[0]));
1525 break;
1526
1527 case ir_binop_min:
1528 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1529 break;
1530 case ir_binop_max:
1531 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1532 break;
1533
1534 case ir_binop_pow:
1535 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1536 break;
1537
1538 case ir_unop_bit_not:
1539 inst = emit(NOT(result_dst, op[0]));
1540 break;
1541 case ir_binop_bit_and:
1542 inst = emit(AND(result_dst, op[0], op[1]));
1543 break;
1544 case ir_binop_bit_xor:
1545 inst = emit(XOR(result_dst, op[0], op[1]));
1546 break;
1547 case ir_binop_bit_or:
1548 inst = emit(OR(result_dst, op[0], op[1]));
1549 break;
1550
1551 case ir_binop_lshift:
1552 inst = emit(SHL(result_dst, op[0], op[1]));
1553 break;
1554
1555 case ir_binop_rshift:
1556 if (ir->type->base_type == GLSL_TYPE_INT)
1557 inst = emit(ASR(result_dst, op[0], op[1]));
1558 else
1559 inst = emit(SHR(result_dst, op[0], op[1]));
1560 break;
1561
1562 case ir_binop_bfm:
1563 emit(BFI1(result_dst, op[0], op[1]));
1564 break;
1565
1566 case ir_binop_ubo_load: {
1567 ir_constant *uniform_block = ir->operands[0]->as_constant();
1568 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1569 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1570 src_reg offset = op[1];
1571
1572 /* Now, load the vector from that offset. */
1573 assert(ir->type->is_vector() || ir->type->is_scalar());
1574
1575 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1576 packed_consts.type = result.type;
1577 src_reg surf_index =
1578 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1579 if (const_offset_ir) {
1580 offset = src_reg(const_offset / 16);
1581 } else {
1582 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1583 }
1584
1585 vec4_instruction *pull =
1586 emit(new(mem_ctx) vec4_instruction(this,
1587 VS_OPCODE_PULL_CONSTANT_LOAD,
1588 dst_reg(packed_consts),
1589 surf_index,
1590 offset));
1591 pull->base_mrf = 14;
1592 pull->mlen = 1;
1593
1594 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1595 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1596 const_offset % 16 / 4,
1597 const_offset % 16 / 4,
1598 const_offset % 16 / 4);
1599
1600 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1601 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1602 emit(CMP(result_dst, packed_consts, src_reg(0u),
1603 BRW_CONDITIONAL_NZ));
1604 emit(AND(result_dst, result, src_reg(0x1)));
1605 } else {
1606 emit(MOV(result_dst, packed_consts));
1607 }
1608 break;
1609 }
1610
1611 case ir_binop_vector_extract:
1612 assert(!"should have been lowered by vec_index_to_cond_assign");
1613 break;
1614
1615 case ir_triop_fma:
1616 op[0] = fix_3src_operand(op[0]);
1617 op[1] = fix_3src_operand(op[1]);
1618 op[2] = fix_3src_operand(op[2]);
1619 /* Note that the instruction's argument order is reversed from GLSL
1620 * and the IR.
1621 */
1622 emit(MAD(result_dst, op[2], op[1], op[0]));
1623 break;
1624
1625 case ir_triop_lrp:
1626 op[0] = fix_3src_operand(op[0]);
1627 op[1] = fix_3src_operand(op[1]);
1628 op[2] = fix_3src_operand(op[2]);
1629 /* Note that the instruction's argument order is reversed from GLSL
1630 * and the IR.
1631 */
1632 emit(LRP(result_dst, op[2], op[1], op[0]));
1633 break;
1634
1635 case ir_triop_csel:
1636 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1637 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1638 inst->predicate = BRW_PREDICATE_NORMAL;
1639 break;
1640
1641 case ir_triop_bfi:
1642 op[0] = fix_3src_operand(op[0]);
1643 op[1] = fix_3src_operand(op[1]);
1644 op[2] = fix_3src_operand(op[2]);
1645 emit(BFI2(result_dst, op[0], op[1], op[2]));
1646 break;
1647
1648 case ir_triop_bitfield_extract:
1649 op[0] = fix_3src_operand(op[0]);
1650 op[1] = fix_3src_operand(op[1]);
1651 op[2] = fix_3src_operand(op[2]);
1652 /* Note that the instruction's argument order is reversed from GLSL
1653 * and the IR.
1654 */
1655 emit(BFE(result_dst, op[2], op[1], op[0]));
1656 break;
1657
1658 case ir_triop_vector_insert:
1659 assert(!"should have been lowered by lower_vector_insert");
1660 break;
1661
1662 case ir_quadop_bitfield_insert:
1663 assert(!"not reached: should be handled by "
1664 "bitfield_insert_to_bfm_bfi\n");
1665 break;
1666
1667 case ir_quadop_vector:
1668 assert(!"not reached: should be handled by lower_quadop_vector");
1669 break;
1670
1671 case ir_unop_pack_half_2x16:
1672 emit_pack_half_2x16(result_dst, op[0]);
1673 break;
1674 case ir_unop_unpack_half_2x16:
1675 emit_unpack_half_2x16(result_dst, op[0]);
1676 break;
1677 case ir_unop_pack_snorm_2x16:
1678 case ir_unop_pack_snorm_4x8:
1679 case ir_unop_pack_unorm_2x16:
1680 case ir_unop_pack_unorm_4x8:
1681 case ir_unop_unpack_snorm_2x16:
1682 case ir_unop_unpack_snorm_4x8:
1683 case ir_unop_unpack_unorm_2x16:
1684 case ir_unop_unpack_unorm_4x8:
1685 assert(!"not reached: should be handled by lower_packing_builtins");
1686 break;
1687 case ir_unop_unpack_half_2x16_split_x:
1688 case ir_unop_unpack_half_2x16_split_y:
1689 case ir_binop_pack_half_2x16_split:
1690 assert(!"not reached: should not occur in vertex shader");
1691 break;
1692 case ir_binop_ldexp:
1693 assert(!"not reached: should be handled by ldexp_to_arith()");
1694 break;
1695 }
1696 }
1697
1698
1699 void
1700 vec4_visitor::visit(ir_swizzle *ir)
1701 {
1702 src_reg src;
1703 int i = 0;
1704 int swizzle[4];
1705
1706 /* Note that this is only swizzles in expressions, not those on the left
1707 * hand side of an assignment, which do write masking. See ir_assignment
1708 * for that.
1709 */
1710
1711 ir->val->accept(this);
1712 src = this->result;
1713 assert(src.file != BAD_FILE);
1714
1715 for (i = 0; i < ir->type->vector_elements; i++) {
1716 switch (i) {
1717 case 0:
1718 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1719 break;
1720 case 1:
1721 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1722 break;
1723 case 2:
1724 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1725 break;
1726 case 3:
1727 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1728 break;
1729 }
1730 }
1731 for (; i < 4; i++) {
1732 /* Replicate the last channel out. */
1733 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1734 }
1735
1736 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1737
1738 this->result = src;
1739 }
1740
1741 void
1742 vec4_visitor::visit(ir_dereference_variable *ir)
1743 {
1744 const struct glsl_type *type = ir->type;
1745 dst_reg *reg = variable_storage(ir->var);
1746
1747 if (!reg) {
1748 fail("Failed to find variable storage for %s\n", ir->var->name);
1749 this->result = src_reg(brw_null_reg());
1750 return;
1751 }
1752
1753 this->result = src_reg(*reg);
1754
1755 /* System values get their swizzle from the dst_reg writemask */
1756 if (ir->var->mode == ir_var_system_value)
1757 return;
1758
1759 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1760 this->result.swizzle = swizzle_for_size(type->vector_elements);
1761 }
1762
1763
1764 int
1765 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1766 {
1767 /* Under normal circumstances array elements are stored consecutively, so
1768 * the stride is equal to the size of the array element.
1769 */
1770 return type_size(ir->type);
1771 }
1772
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_array *ir)
1776 {
1777 ir_constant *constant_index;
1778 src_reg src;
1779 int array_stride = compute_array_stride(ir);
1780
1781 constant_index = ir->array_index->constant_expression_value();
1782
1783 ir->array->accept(this);
1784 src = this->result;
1785
1786 if (constant_index) {
1787 src.reg_offset += constant_index->value.i[0] * array_stride;
1788 } else {
1789 /* Variable index array dereference. It eats the "vec4" of the
1790 * base of the array and an index that offsets the Mesa register
1791 * index.
1792 */
1793 ir->array_index->accept(this);
1794
1795 src_reg index_reg;
1796
1797 if (array_stride == 1) {
1798 index_reg = this->result;
1799 } else {
1800 index_reg = src_reg(this, glsl_type::int_type);
1801
1802 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1803 }
1804
1805 if (src.reladdr) {
1806 src_reg temp = src_reg(this, glsl_type::int_type);
1807
1808 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1809
1810 index_reg = temp;
1811 }
1812
1813 src.reladdr = ralloc(mem_ctx, src_reg);
1814 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1815 }
1816
1817 /* If the type is smaller than a vec4, replicate the last channel out. */
1818 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1819 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1820 else
1821 src.swizzle = BRW_SWIZZLE_NOOP;
1822 src.type = brw_type_for_base_type(ir->type);
1823
1824 this->result = src;
1825 }
1826
1827 void
1828 vec4_visitor::visit(ir_dereference_record *ir)
1829 {
1830 unsigned int i;
1831 const glsl_type *struct_type = ir->record->type;
1832 int offset = 0;
1833
1834 ir->record->accept(this);
1835
1836 for (i = 0; i < struct_type->length; i++) {
1837 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1838 break;
1839 offset += type_size(struct_type->fields.structure[i].type);
1840 }
1841
1842 /* If the type is smaller than a vec4, replicate the last channel out. */
1843 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1844 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1845 else
1846 this->result.swizzle = BRW_SWIZZLE_NOOP;
1847 this->result.type = brw_type_for_base_type(ir->type);
1848
1849 this->result.reg_offset += offset;
1850 }
1851
1852 /**
1853 * We want to be careful in assignment setup to hit the actual storage
1854 * instead of potentially using a temporary like we might with the
1855 * ir_dereference handler.
1856 */
1857 static dst_reg
1858 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1859 {
1860 /* The LHS must be a dereference. If the LHS is a variable indexed array
 1861 * access of a vector, it must be separated into a series of conditional moves
1862 * before reaching this point (see ir_vec_index_to_cond_assign).
1863 */
1864 assert(ir->as_dereference());
1865 ir_dereference_array *deref_array = ir->as_dereference_array();
1866 if (deref_array) {
1867 assert(!deref_array->array->type->is_vector());
1868 }
1869
1870 /* Use the rvalue deref handler for the most part. We'll ignore
1871 * swizzles in it and write swizzles using writemask, though.
1872 */
1873 ir->accept(v);
1874 return dst_reg(v->result);
1875 }
1876
1877 void
1878 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1879 const struct glsl_type *type, uint32_t predicate)
1880 {
1881 if (type->base_type == GLSL_TYPE_STRUCT) {
1882 for (unsigned int i = 0; i < type->length; i++) {
1883 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1884 }
1885 return;
1886 }
1887
1888 if (type->is_array()) {
1889 for (unsigned int i = 0; i < type->length; i++) {
1890 emit_block_move(dst, src, type->fields.array, predicate);
1891 }
1892 return;
1893 }
1894
1895 if (type->is_matrix()) {
1896 const struct glsl_type *vec_type;
1897
1898 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1899 type->vector_elements, 1);
1900
1901 for (int i = 0; i < type->matrix_columns; i++) {
1902 emit_block_move(dst, src, vec_type, predicate);
1903 }
1904 return;
1905 }
1906
1907 assert(type->is_scalar() || type->is_vector());
1908
1909 dst->type = brw_type_for_base_type(type);
1910 src->type = dst->type;
1911
1912 dst->writemask = (1 << type->vector_elements) - 1;
1913
1914 src->swizzle = swizzle_for_size(type->vector_elements);
1915
1916 vec4_instruction *inst = emit(MOV(*dst, *src));
1917 inst->predicate = predicate;
1918
1919 dst->reg_offset++;
1920 src->reg_offset++;
1921 }
1922
1923
1924 /* If the RHS processing resulted in an instruction generating a
1925 * temporary value, and it would be easy to rewrite the instruction to
1926 * generate its result right into the LHS instead, do so. This ends
1927 * up reliably removing instructions where it can be tricky to do so
1928 * later without real UD chain information.
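 *
 * A small example (hypothetical GLSL, for illustration): for "a = b + c"
 * the RHS visit emits "ADD tmp, b, c" and the assignment would normally
 * append "MOV a, tmp"; when the checks below pass, the ADD's destination
 * is rewritten to a and the MOV is never emitted.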
1929 */
1930 bool
1931 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1932 dst_reg dst,
1933 src_reg src,
1934 vec4_instruction *pre_rhs_inst,
1935 vec4_instruction *last_rhs_inst)
1936 {
1937 /* This could be supported, but it would take more smarts. */
1938 if (ir->condition)
1939 return false;
1940
1941 if (pre_rhs_inst == last_rhs_inst)
1942 return false; /* No instructions generated to work with. */
1943
1944 /* Make sure the last instruction generated our source reg. */
1945 if (src.file != GRF ||
1946 src.file != last_rhs_inst->dst.file ||
1947 src.reg != last_rhs_inst->dst.reg ||
1948 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1949 src.reladdr ||
1950 src.abs ||
1951 src.negate ||
1952 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1953 return false;
1954
 1955 /* Check that the last instruction fully initialized the channels
1956 * we want to use, in the order we want to use them. We could
1957 * potentially reswizzle the operands of many instructions so that
1958 * we could handle out of order channels, but don't yet.
1959 */
1960
1961 for (unsigned i = 0; i < 4; i++) {
1962 if (dst.writemask & (1 << i)) {
1963 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1964 return false;
1965
1966 if (BRW_GET_SWZ(src.swizzle, i) != i)
1967 return false;
1968 }
1969 }
1970
1971 /* Success! Rewrite the instruction. */
1972 last_rhs_inst->dst.file = dst.file;
1973 last_rhs_inst->dst.reg = dst.reg;
1974 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1975 last_rhs_inst->dst.reladdr = dst.reladdr;
1976 last_rhs_inst->dst.writemask &= dst.writemask;
1977
1978 return true;
1979 }
1980
1981 void
1982 vec4_visitor::visit(ir_assignment *ir)
1983 {
1984 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1985 uint32_t predicate = BRW_PREDICATE_NONE;
1986
1987 if (!ir->lhs->type->is_scalar() &&
1988 !ir->lhs->type->is_vector()) {
1989 ir->rhs->accept(this);
1990 src_reg src = this->result;
1991
1992 if (ir->condition) {
1993 emit_bool_to_cond_code(ir->condition, &predicate);
1994 }
1995
1996 /* emit_block_move doesn't account for swizzles in the source register.
1997 * This should be ok, since the source register is a structure or an
1998 * array, and those can't be swizzled. But double-check to be sure.
1999 */
2000 assert(src.swizzle ==
2001 (ir->rhs->type->is_matrix()
2002 ? swizzle_for_size(ir->rhs->type->vector_elements)
2003 : BRW_SWIZZLE_NOOP));
2004
2005 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2006 return;
2007 }
2008
2009 /* Now we're down to just a scalar/vector with writemasks. */
2010 int i;
2011
2012 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2013 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2014
2015 ir->rhs->accept(this);
2016
2017 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2018
2019 src_reg src = this->result;
2020
2021 int swizzles[4];
2022 int first_enabled_chan = 0;
2023 int src_chan = 0;
2024
2025 assert(ir->lhs->type->is_vector() ||
2026 ir->lhs->type->is_scalar());
2027 dst.writemask = ir->write_mask;
2028
2029 for (int i = 0; i < 4; i++) {
2030 if (dst.writemask & (1 << i)) {
2031 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2032 break;
2033 }
2034 }
2035
2036 /* Swizzle a small RHS vector into the channels being written.
2037 *
2038 * glsl ir treats write_mask as dictating how many channels are
2039 * present on the RHS while in our instructions we need to make
2040 * those channels appear in the slots of the vec4 they're written to.
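 * For example, a vec2 RHS (swizzle .xyyy) assigned to the .yz channels of
 * the LHS ends up with source swizzle .yxyy: the written channels read the
 * first two RHS components, and the unwritten channels repeat a component
 * known to be valid.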
2041 */
2042 for (int i = 0; i < 4; i++) {
2043 if (dst.writemask & (1 << i))
2044 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2045 else
2046 swizzles[i] = first_enabled_chan;
2047 }
2048 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2049 swizzles[2], swizzles[3]);
2050
2051 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2052 return;
2053 }
2054
2055 if (ir->condition) {
2056 emit_bool_to_cond_code(ir->condition, &predicate);
2057 }
2058
2059 for (i = 0; i < type_size(ir->lhs->type); i++) {
2060 vec4_instruction *inst = emit(MOV(dst, src));
2061 inst->predicate = predicate;
2062
2063 dst.reg_offset++;
2064 src.reg_offset++;
2065 }
2066 }
2067
2068 void
2069 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2070 {
2071 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2072 foreach_list(node, &ir->components) {
2073 ir_constant *field_value = (ir_constant *)node;
2074
2075 emit_constant_values(dst, field_value);
2076 }
2077 return;
2078 }
2079
2080 if (ir->type->is_array()) {
2081 for (unsigned int i = 0; i < ir->type->length; i++) {
2082 emit_constant_values(dst, ir->array_elements[i]);
2083 }
2084 return;
2085 }
2086
2087 if (ir->type->is_matrix()) {
2088 for (int i = 0; i < ir->type->matrix_columns; i++) {
2089 float *vec = &ir->value.f[i * ir->type->vector_elements];
2090
2091 for (int j = 0; j < ir->type->vector_elements; j++) {
2092 dst->writemask = 1 << j;
2093 dst->type = BRW_REGISTER_TYPE_F;
2094
2095 emit(MOV(*dst, src_reg(vec[j])));
2096 }
2097 dst->reg_offset++;
2098 }
2099 return;
2100 }
2101
2102 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2103
2104 for (int i = 0; i < ir->type->vector_elements; i++) {
2105 if (!(remaining_writemask & (1 << i)))
2106 continue;
2107
2108 dst->writemask = 1 << i;
2109 dst->type = brw_type_for_base_type(ir->type);
2110
2111 /* Find other components that match the one we're about to
2112 * write. Emits fewer instructions for things like vec4(0.5,
2113 * 1.5, 1.5, 1.5).
2114 */
2115 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2116 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2117 if (ir->value.b[i] == ir->value.b[j])
2118 dst->writemask |= (1 << j);
2119 } else {
2120 /* u, i, and f storage all line up, so no need for a
2121 * switch case for comparing each type.
2122 */
2123 if (ir->value.u[i] == ir->value.u[j])
2124 dst->writemask |= (1 << j);
2125 }
2126 }
2127
2128 switch (ir->type->base_type) {
2129 case GLSL_TYPE_FLOAT:
2130 emit(MOV(*dst, src_reg(ir->value.f[i])));
2131 break;
2132 case GLSL_TYPE_INT:
2133 emit(MOV(*dst, src_reg(ir->value.i[i])));
2134 break;
2135 case GLSL_TYPE_UINT:
2136 emit(MOV(*dst, src_reg(ir->value.u[i])));
2137 break;
2138 case GLSL_TYPE_BOOL:
2139 emit(MOV(*dst, src_reg(ir->value.b[i])));
2140 break;
2141 default:
2142 assert(!"Non-float/uint/int/bool constant");
2143 break;
2144 }
2145
2146 remaining_writemask &= ~dst->writemask;
2147 }
2148 dst->reg_offset++;
2149 }
2150
2151 void
2152 vec4_visitor::visit(ir_constant *ir)
2153 {
2154 dst_reg dst = dst_reg(this, ir->type);
2155 this->result = src_reg(dst);
2156
2157 emit_constant_values(&dst, ir);
2158 }
2159
2160 void
2161 vec4_visitor::visit(ir_call *ir)
2162 {
2163 assert(!"not reached");
2164 }
2165
2166 void
2167 vec4_visitor::visit(ir_texture *ir)
2168 {
2169 int sampler =
2170 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2171
2172 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2173 * emitting anything other than setting up the constant result.
2174 */
2175 if (ir->op == ir_tg4) {
2176 ir_constant *chan = ir->lod_info.component->as_constant();
2177 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2178 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2179 dst_reg result(this, ir->type);
2180 this->result = src_reg(result);
2181 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2182 return;
2183 }
2184 }
2185
2186 /* Should be lowered by do_lower_texture_projection */
2187 assert(!ir->projector);
2188
2189 /* Generate code to compute all the subexpression trees. This has to be
2190 * done before loading any values into MRFs for the sampler message since
2191 * generating these values may involve SEND messages that need the MRFs.
2192 */
2193 src_reg coordinate;
2194 if (ir->coordinate) {
2195 ir->coordinate->accept(this);
2196 coordinate = this->result;
2197 }
2198
2199 src_reg shadow_comparitor;
2200 if (ir->shadow_comparitor) {
2201 ir->shadow_comparitor->accept(this);
2202 shadow_comparitor = this->result;
2203 }
2204
2205 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2206 src_reg lod, dPdx, dPdy, sample_index;
2207 switch (ir->op) {
2208 case ir_tex:
2209 lod = src_reg(0.0f);
2210 lod_type = glsl_type::float_type;
2211 break;
2212 case ir_txf:
2213 case ir_txl:
2214 case ir_txs:
2215 ir->lod_info.lod->accept(this);
2216 lod = this->result;
2217 lod_type = ir->lod_info.lod->type;
2218 break;
2219 case ir_query_levels:
2220 lod = src_reg(0);
2221 lod_type = glsl_type::int_type;
2222 break;
2223 case ir_txf_ms:
2224 ir->lod_info.sample_index->accept(this);
2225 sample_index = this->result;
2226 sample_index_type = ir->lod_info.sample_index->type;
2227 break;
2228 case ir_txd:
2229 ir->lod_info.grad.dPdx->accept(this);
2230 dPdx = this->result;
2231
2232 ir->lod_info.grad.dPdy->accept(this);
2233 dPdy = this->result;
2234
2235 lod_type = ir->lod_info.grad.dPdx->type;
2236 break;
2237 case ir_txb:
2238 case ir_lod:
2239 case ir_tg4:
2240 break;
2241 }
2242
2243 vec4_instruction *inst = NULL;
2244 switch (ir->op) {
2245 case ir_tex:
2246 case ir_txl:
2247 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2248 break;
2249 case ir_txd:
2250 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2251 break;
2252 case ir_txf:
2253 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2254 break;
2255 case ir_txf_ms:
2256 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2257 break;
2258 case ir_txs:
2259 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2260 break;
2261 case ir_tg4:
2262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2263 break;
2264 case ir_query_levels:
2265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2266 break;
2267 case ir_txb:
2268 assert(!"TXB is not valid for vertex shaders.");
2269 break;
2270 case ir_lod:
2271 assert(!"LOD is not valid for vertex shaders.");
2272 break;
2273 default:
2274 assert(!"Unrecognized tex op");
2275 }
2276
2277 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2278
2279 /* Texel offsets and the tg4 channel select go in the message header; Gen4 always needs a header. */
2280 inst->header_present = use_texture_offset || brw->gen < 5 || ir->op == ir_tg4;
2281 inst->base_mrf = 2;
2282 inst->mlen = inst->header_present + 1; /* always at least one */
2283 inst->sampler = sampler;
2284 inst->dst = dst_reg(this, ir->type);
2285 inst->dst.writemask = WRITEMASK_XYZW;
2286 inst->shadow_compare = ir->shadow_comparitor != NULL;
2287
2288 if (use_texture_offset)
2289 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2290
2291 /* Stuff the channel select bits in the top of the texture offset */
2292 if (ir->op == ir_tg4)
2293 inst->texture_offset |= gather_channel(ir, sampler)<<16;
2294
2295 /* MRF for the first parameter */
2296 int param_base = inst->base_mrf + inst->header_present;
2297
2298 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2299 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2300 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2301 } else {
2302 /* Load the coordinate */
2303 /* FINISHME: gl_clamp_mask and saturate */
2304 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2305 int zero_mask = 0xf & ~coord_mask;
2306
2307 if (ir->offset && ir->op == ir_txf) {
2308 /* It appears that the ld instruction used for txf does its
2309 * address bounds check before adding in the offset. To work
2310 * around this, just add the integer offset to the integer
2311 * texel coordinate, and don't put the offset in the header.
2312 */
2313 ir_constant *offset = ir->offset->as_constant();
2314 assert(offset);
2315
2316 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2317 src_reg src = coordinate;
2318 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2319 BRW_GET_SWZ(src.swizzle, j),
2320 BRW_GET_SWZ(src.swizzle, j),
2321 BRW_GET_SWZ(src.swizzle, j));
2322 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2323 src, offset->value.i[j]));
2324 }
2325 } else {
2326 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2327 coordinate));
2328 }
2329 if (zero_mask != 0) {
2330 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2331 src_reg(0)));
2332 }
2333 /* Load the shadow comparitor */
2334 if (ir->shadow_comparitor && ir->op != ir_txd) {
2335 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2336 WRITEMASK_X),
2337 shadow_comparitor));
2338 inst->mlen++;
2339 }
2340
2341 /* Load the LOD info */
2342 if (ir->op == ir_tex || ir->op == ir_txl) {
2343 int mrf, writemask;
2344 if (brw->gen >= 5) {
2345 mrf = param_base + 1;
2346 if (ir->shadow_comparitor) {
2347 writemask = WRITEMASK_Y;
2348 /* mlen already incremented */
2349 } else {
2350 writemask = WRITEMASK_X;
2351 inst->mlen++;
2352 }
2353 } else /* brw->gen == 4 */ {
2354 mrf = param_base;
2355 writemask = WRITEMASK_W;
2356 }
2357 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2358 } else if (ir->op == ir_txf) {
2359 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2360 } else if (ir->op == ir_txf_ms) {
2361 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2362 sample_index));
2363 inst->mlen++;
2364
2365 /* on Gen7, there is an additional MCS parameter here after SI,
2366 * but we don't bother to emit it since it's always zero. If
2367 * we start supporting texturing from CMS surfaces, this will have
2368 * to change.
2369 */
2370 } else if (ir->op == ir_txd) {
2371 const glsl_type *type = lod_type;
2372
2373 if (brw->gen >= 5) {
2374 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2375 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2376 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2377 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2378 inst->mlen++;
2379
2380 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2381 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2382 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2383 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2384 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2385 inst->mlen++;
2386
2387 if (ir->shadow_comparitor) {
2388 emit(MOV(dst_reg(MRF, param_base + 2,
2389 ir->shadow_comparitor->type, WRITEMASK_Z),
2390 shadow_comparitor));
2391 }
2392 }
2393 } else /* brw->gen == 4 */ {
2394 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2395 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2396 inst->mlen += 2;
2397 }
2398 }
2399 }
2400
2401 emit(inst);
2402
2403 /* Fix up the layer count (.z) for cube arrays: hardware returns faces * layers;
2404 * spec requires layers.
2405 */
2406 if (ir->op == ir_txs) {
2407 glsl_type const *type = ir->sampler->type;
2408 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2409 type->sampler_array) {
2410 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2411 with_writemask(inst->dst, WRITEMASK_Z),
2412 src_reg(inst->dst), src_reg(6));
2413 }
2414 }
2415
2416 swizzle_result(ir, src_reg(inst->dst), sampler);
2417 }
2418
2419 /**
2420 * Set up the gather channel based on the swizzle, for gather4.
2421 */
2422 uint32_t
2423 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2424 {
2425 ir_constant *chan = ir->lod_info.component->as_constant();
2426 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2427 switch (swiz) {
2428 case SWIZZLE_X: return 0;
2429 case SWIZZLE_Y:
2430 /* gather4 sampler is broken for green channel on RG32F --
2431 * we must ask for blue instead.
2432 */
2433 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2434 return 2;
2435 return 1;
2436 case SWIZZLE_Z: return 2;
2437 case SWIZZLE_W: return 3;
2438 default:
2439 assert(!"Not reached"); /* zero, one swizzles handled already */
2440 return 0;
2441 }
2442 }
2443
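/* Applies the GL texture swizzle for this sampler (key->tex.swizzles) to the
 * raw sampler result: directly mapped channels are copied with a remapped
 * source swizzle, while SWIZZLE_ZERO and SWIZZLE_ONE channels are filled
 * with immediate 0.0f / 1.0f MOVs.  txs, tg4, scalar float results and
 * no-op swizzles are copied through without applying the swizzle, and
 * ir_query_levels just takes the level count from the .w channel.
 */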
2444 void
2445 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2446 {
2447 int s = key->tex.swizzles[sampler];
2448
2449 this->result = src_reg(this, ir->type);
2450 dst_reg swizzled_result(this->result);
2451
2452 if (ir->op == ir_query_levels) {
2453 /* # levels is in .w */
2454 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2455 emit(MOV(swizzled_result, orig_val));
2456 return;
2457 }
2458
2459 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2460 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2461 emit(MOV(swizzled_result, orig_val));
2462 return;
2463 }
2464
2465
2466 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2467 int swizzle[4] = {0};
2468
2469 for (int i = 0; i < 4; i++) {
2470 switch (GET_SWZ(s, i)) {
2471 case SWIZZLE_ZERO:
2472 zero_mask |= (1 << i);
2473 break;
2474 case SWIZZLE_ONE:
2475 one_mask |= (1 << i);
2476 break;
2477 default:
2478 copy_mask |= (1 << i);
2479 swizzle[i] = GET_SWZ(s, i);
2480 break;
2481 }
2482 }
2483
2484 if (copy_mask) {
2485 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2486 swizzled_result.writemask = copy_mask;
2487 emit(MOV(swizzled_result, orig_val));
2488 }
2489
2490 if (zero_mask) {
2491 swizzled_result.writemask = zero_mask;
2492 emit(MOV(swizzled_result, src_reg(0.0f)));
2493 }
2494
2495 if (one_mask) {
2496 swizzled_result.writemask = one_mask;
2497 emit(MOV(swizzled_result, src_reg(1.0f)));
2498 }
2499 }
2500
2501 void
2502 vec4_visitor::visit(ir_return *ir)
2503 {
2504 assert(!"not reached");
2505 }
2506
2507 void
2508 vec4_visitor::visit(ir_discard *ir)
2509 {
2510 assert(!"not reached");
2511 }
2512
2513 void
2514 vec4_visitor::visit(ir_if *ir)
2515 {
2516 /* Don't point the annotation at the if statement, because then the
2517 * printed annotation would cover the whole if plus its then and else blocks.
2518 */
2519 this->base_ir = ir->condition;
2520
2521 if (brw->gen == 6) {
2522 emit_if_gen6(ir);
2523 } else {
2524 uint32_t predicate;
2525 emit_bool_to_cond_code(ir->condition, &predicate);
2526 emit(IF(predicate));
2527 }
2528
2529 visit_instructions(&ir->then_instructions);
2530
2531 if (!ir->else_instructions.is_empty()) {
2532 this->base_ir = ir->condition;
2533 emit(BRW_OPCODE_ELSE);
2534
2535 visit_instructions(&ir->else_instructions);
2536 }
2537
2538 this->base_ir = ir->condition;
2539 emit(BRW_OPCODE_ENDIF);
2540 }
2541
2542 void
2543 vec4_visitor::visit(ir_emit_vertex *)
2544 {
2545 assert(!"not reached");
2546 }
2547
2548 void
2549 vec4_visitor::visit(ir_end_primitive *)
2550 {
2551 assert(!"not reached");
2552 }
2553
2554 void
2555 vec4_visitor::emit_ndc_computation()
2556 {
2557 /* Get the position */
2558 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2559
2560 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2561 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2562 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2563
2564 current_annotation = "NDC";
2565 dst_reg ndc_w = ndc;
2566 ndc_w.writemask = WRITEMASK_W;
2567 src_reg pos_w = pos;
2568 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2569 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2570
2571 dst_reg ndc_xyz = ndc;
2572 ndc_xyz.writemask = WRITEMASK_XYZ;
2573
2574 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2575 }
2576
2577 void
2578 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2579 {
2580 if (brw->gen < 6 &&
2581 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2582 key->userclip_active || brw->has_negative_rhw_bug)) {
2583 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2584 dst_reg header1_w = header1;
2585 header1_w.writemask = WRITEMASK_W;
2586
2587 emit(MOV(header1, 0u));
2588
2589 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2590 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2591
2592 current_annotation = "Point size";
2593 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2594 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2595 }
2596
2597 if (key->userclip_active) {
2598 current_annotation = "Clipping flags";
2599 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2600 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2601
2602 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2603 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2604 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2605
2606 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2607 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2608 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2609 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2610 }
2611
2612 /* i965 clipping workaround:
2613 * 1) Test for -ve rhw
2614 * 2) If set,
2615 * set ndc = (0,0,0,0)
2616 * set ucp[6] = 1
2617 *
2618 * Later, clipping will detect ucp[6] and ensure the primitive is
2619 * clipped against all fixed planes.
2620 */
2621 if (brw->has_negative_rhw_bug) {
2622 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2623 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2624 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2625 vec4_instruction *inst;
2626 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2627 inst->predicate = BRW_PREDICATE_NORMAL;
2628 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2629 inst->predicate = BRW_PREDICATE_NORMAL;
2630 }
2631
2632 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2633 } else if (brw->gen < 6) {
2634 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2635 } else {
2636 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2637 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2638 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2639 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2640 }
2641 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2642 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2643 src_reg(output_reg[VARYING_SLOT_LAYER])));
2644 }
2645 }
2646 }
2647
2648 void
2649 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2650 {
2651 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2652 *
2653 * "If a linked set of shaders forming the vertex stage contains no
2654 * static write to gl_ClipVertex or gl_ClipDistance, but the
2655 * application has requested clipping against user clip planes through
2656 * the API, then the coordinate written to gl_Position is used for
2657 * comparison against the user clip planes."
2658 *
2659 * This function is only called if the shader didn't write to
2660 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2661 * if the user wrote to it; otherwise we use gl_Position.
2662 */
2663 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2664 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2665 clip_vertex = VARYING_SLOT_POS;
2666 }
2667
2668 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2669 ++i) {
2670 reg.writemask = 1 << i;
2671 emit(DP4(reg,
2672 src_reg(output_reg[clip_vertex]),
2673 src_reg(this->userplane[i + offset])));
2674 }
2675 }
2676
2677 void
2678 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2679 {
2680 assert (varying < VARYING_SLOT_MAX);
2681 reg.type = output_reg[varying].type;
2682 current_annotation = output_reg_annotation[varying];
2683 /* Copy the register, saturating if necessary */
2684 vec4_instruction *inst = emit(MOV(reg,
2685 src_reg(output_reg[varying])));
2686 if ((varying == VARYING_SLOT_COL0 ||
2687 varying == VARYING_SLOT_COL1 ||
2688 varying == VARYING_SLOT_BFC0 ||
2689 varying == VARYING_SLOT_BFC1) &&
2690 key->clamp_vertex_color) {
2691 inst->saturate = true;
2692 }
2693 }
2694
2695 void
2696 vec4_visitor::emit_urb_slot(int mrf, int varying)
2697 {
2698 struct brw_reg hw_reg = brw_message_reg(mrf);
2699 dst_reg reg = dst_reg(MRF, mrf);
2700 reg.type = BRW_REGISTER_TYPE_F;
2701
2702 switch (varying) {
2703 case VARYING_SLOT_PSIZ:
2704 /* PSIZ is always in slot 0, and is coupled with other flags. */
2705 current_annotation = "indices, point width, clip flags";
2706 emit_psiz_and_flags(hw_reg);
2707 break;
2708 case BRW_VARYING_SLOT_NDC:
2709 current_annotation = "NDC";
2710 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2711 break;
2712 case VARYING_SLOT_POS:
2713 current_annotation = "gl_Position";
2714 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2715 break;
2716 case VARYING_SLOT_EDGE:
2717 /* This is present when doing unfilled polygons. We're supposed to copy
2718 * the edge flag from the user-provided vertex array
2719 * (glEdgeFlagPointer); otherwise we'll copy from the current value
2720 * of that attribute (starts as 1.0f). This is then used in clipping to
2721 * determine which edges should be drawn as wireframe.
2722 */
2723 current_annotation = "edge flag";
2724 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2725 glsl_type::float_type, WRITEMASK_XYZW))));
2726 break;
2727 case BRW_VARYING_SLOT_PAD:
2728 /* No need to write to this slot */
2729 break;
2730 default:
2731 emit_generic_urb_slot(reg, varying);
2732 break;
2733 }
2734 }
2735
2736 static int
2737 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2738 {
2739 if (brw->gen >= 6) {
2740 /* URB data written (does not include the message header reg) must
2741 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2742 * section 5.4.3.2.2: URB_INTERLEAVED.
2743 *
2744 * URB entries are allocated on a multiple of 1024 bits, so an
2745 * extra 128 bits written here to make the end align to 256 is
2746 * no problem.
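 * For example, an mlen of 4 (header plus three data registers) is padded
 * to 5 so that four data registers are written.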
2747 */
2748 if ((mlen % 2) != 1)
2749 mlen++;
2750 }
2751
2752 return mlen;
2753 }
2754
2755
2756 /**
2757 * Generates the VUE payload plus the necessary URB write instructions to
2758 * output it.
2759 *
2760 * The VUE layout is documented in Volume 2a.
2761 */
2762 void
2763 vec4_visitor::emit_vertex()
2764 {
2765 /* MRF 0 is reserved for the debugger, so start with message header
2766 * in MRF 1.
2767 */
2768 int base_mrf = 1;
2769 int mrf = base_mrf;
2770 /* In the process of generating our URB write message contents, we
2771 * may need to unspill a register or load from an array. Those
2772 * reads would use MRFs 14-15.
2773 */
2774 int max_usable_mrf = 13;
2775
2776 /* The following assertion verifies that max_usable_mrf causes an
2777 * even number of URB write data registers, which will meet gen6's
2778 * requirements for length alignment.
2779 */
2780 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2781
2782 /* First mrf is the g0-based message header containing URB handles and
2783 * such.
2784 */
2785 emit_urb_write_header(mrf++);
2786
2787 if (brw->gen < 6) {
2788 emit_ndc_computation();
2789 }
2790
2791 /* Lower legacy ff and ClipVertex clipping to clip distances */
2792 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2793 current_annotation = "user clip distances";
2794
2795 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2796 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2797
2798 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2799 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2800 }
2801
2802 /* We may need to split this up into several URB writes, so do them in a
2803 * loop.
2804 */
2805 int slot = 0;
2806 bool complete = false;
2807 do {
2808 /* URB offset is in URB row increments, and each of our MRFs is half of
2809 * one of those, since we're doing interleaved writes.
2810 */
2811 int offset = slot / 2;
2812
2813 mrf = base_mrf + 1;
2814 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2815 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2816
2817 /* If this was max_usable_mrf, we can't fit anything more into this
2818 * URB WRITE.
2819 */
2820 if (mrf > max_usable_mrf) {
2821 slot++;
2822 break;
2823 }
2824 }
2825
2826 complete = slot >= prog_data->vue_map.num_slots;
2827 current_annotation = "URB write";
2828 vec4_instruction *inst = emit_urb_write_opcode(complete);
2829 inst->base_mrf = base_mrf;
2830 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2831 inst->offset += offset;
2832 } while(!complete);
2833 }
2834
2835
2836 src_reg
2837 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2838 src_reg *reladdr, int reg_offset)
2839 {
2840 /* Because we store the values to scratch interleaved like our
2841 * vertex data, we need to scale the vec4 index by 2.
2842 */
2843 int message_header_scale = 2;
2844
2845 /* Pre-gen6, the message header uses byte offsets instead of vec4
2846 * (16-byte) offset units.
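 * E.g. a reg_offset of 3 becomes a message offset of 6 vec4 rows on Gen6+,
 * or 96 bytes (3 * 2 * 16) on a pre-gen6 part.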
2847 */
2848 if (brw->gen < 6)
2849 message_header_scale *= 16;
2850
2851 if (reladdr) {
2852 src_reg index = src_reg(this, glsl_type::int_type);
2853
2854 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2855 emit_before(inst, MUL(dst_reg(index),
2856 index, src_reg(message_header_scale)));
2857
2858 return index;
2859 } else {
2860 return src_reg(reg_offset * message_header_scale);
2861 }
2862 }
2863
2864 src_reg
2865 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2866 src_reg *reladdr, int reg_offset)
2867 {
2868 if (reladdr) {
2869 src_reg index = src_reg(this, glsl_type::int_type);
2870
2871 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2872
2873 /* Pre-gen6, the message header uses byte offsets instead of vec4
2874 * (16-byte) offset units.
2875 */
2876 if (brw->gen < 6) {
2877 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2878 }
2879
2880 return index;
2881 } else {
2882 int message_header_scale = brw->gen < 6 ? 16 : 1;
2883 return src_reg(reg_offset * message_header_scale);
2884 }
2885 }
2886
2887 /**
2888 * Emits an instruction before @inst to load the value named by @orig_src
2889 * from scratch space at @base_offset to @temp.
2890 *
2891 * @base_offset is measured in 32-byte units (the size of a register).
2892 */
2893 void
2894 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2895 dst_reg temp, src_reg orig_src,
2896 int base_offset)
2897 {
2898 int reg_offset = base_offset + orig_src.reg_offset;
2899 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2900
2901 emit_before(inst, SCRATCH_READ(temp, index));
2902 }
2903
2904 /**
2905 * Emits an instruction after @inst to store the value to be written
2906 * to @orig_dst to scratch space at @base_offset, from @temp.
2907 *
2908 * @base_offset is measured in 32-byte units (the size of a register).
2909 */
2910 void
2911 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2912 {
2913 int reg_offset = base_offset + inst->dst.reg_offset;
2914 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2915
2916 /* Create a temporary register to store *inst's result in.
2917 *
2918 * We have to be careful in MOVing from our temporary result register in
2919 * the scratch write. If we swizzle from channels of the temporary that
2920 * weren't initialized, it will confuse live interval analysis, which will
2921 * make spilling fail to make progress.
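 * E.g. for a .xz writemask the temporary is read with swizzle .xxzx, so
 * only channels the instruction actually wrote are ever sourced.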
2922 */
2923 src_reg temp = src_reg(this, glsl_type::vec4_type);
2924 temp.type = inst->dst.type;
2925 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2926 int swizzles[4];
2927 for (int i = 0; i < 4; i++)
2928 if (inst->dst.writemask & (1 << i))
2929 swizzles[i] = i;
2930 else
2931 swizzles[i] = first_writemask_chan;
2932 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2933 swizzles[2], swizzles[3]);
2934
2935 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2936 inst->dst.writemask));
2937 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2938 write->predicate = inst->predicate;
2939 write->ir = inst->ir;
2940 write->annotation = inst->annotation;
2941 inst->insert_after(write);
2942
2943 inst->dst.file = temp.file;
2944 inst->dst.reg = temp.reg;
2945 inst->dst.reg_offset = temp.reg_offset;
2946 inst->dst.reladdr = NULL;
2947 }
2948
2949 /**
2950 * We can't generally support array access in GRF space, because a
2951 * single instruction's destination can only span 2 contiguous
2952 * registers. So, we send all GRF arrays that get variable index
2953 * access to scratch space.
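 * For example, a local array indexed with a non-constant subscript gets a
 * scratch location assigned to its virtual GRF, and every access of that
 * GRF is then rewritten as a scratch read or write message.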
2954 */
2955 void
2956 vec4_visitor::move_grf_array_access_to_scratch()
2957 {
2958 int scratch_loc[this->virtual_grf_count];
2959
2960 for (int i = 0; i < this->virtual_grf_count; i++) {
2961 scratch_loc[i] = -1;
2962 }
2963
2964 /* First, calculate the set of virtual GRFs that need to be punted
2965 * to scratch due to having any array access on them, and where in
2966 * scratch.
2967 */
2968 foreach_list(node, &this->instructions) {
2969 vec4_instruction *inst = (vec4_instruction *)node;
2970
2971 if (inst->dst.file == GRF && inst->dst.reladdr &&
2972 scratch_loc[inst->dst.reg] == -1) {
2973 scratch_loc[inst->dst.reg] = c->last_scratch;
2974 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2975 }
2976
2977 for (int i = 0 ; i < 3; i++) {
2978 src_reg *src = &inst->src[i];
2979
2980 if (src->file == GRF && src->reladdr &&
2981 scratch_loc[src->reg] == -1) {
2982 scratch_loc[src->reg] = c->last_scratch;
2983 c->last_scratch += this->virtual_grf_sizes[src->reg];
2984 }
2985 }
2986 }
2987
2988 /* Now, for anything that will be accessed through scratch, rewrite
2989 * it to load/store. Note that this is a _safe list walk, because
2990 * we may generate a new scratch_write instruction after the one
2991 * we're processing.
2992 */
2993 foreach_list_safe(node, &this->instructions) {
2994 vec4_instruction *inst = (vec4_instruction *)node;
2995
2996 /* Set up the annotation tracking for new generated instructions. */
2997 base_ir = inst->ir;
2998 current_annotation = inst->annotation;
2999
3000 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3001 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3002 }
3003
3004 for (int i = 0 ; i < 3; i++) {
3005 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3006 continue;
3007
3008 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3009
3010 emit_scratch_read(inst, temp, inst->src[i],
3011 scratch_loc[inst->src[i].reg]);
3012
3013 inst->src[i].file = temp.file;
3014 inst->src[i].reg = temp.reg;
3015 inst->src[i].reg_offset = temp.reg_offset;
3016 inst->src[i].reladdr = NULL;
3017 }
3018 }
3019 }
3020
3021 /**
3022 * Emits an instruction before @inst to load the value named by @orig_src
3023 * from the pull constant buffer (surface) at @base_offset to @temp.
3024 */
3025 void
3026 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3027 dst_reg temp, src_reg orig_src,
3028 int base_offset)
3029 {
3030 int reg_offset = base_offset + orig_src.reg_offset;
3031 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3032 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3033 vec4_instruction *load;
3034
3035 if (brw->gen >= 7) {
3036 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3037 grf_offset.type = offset.type;
3038 emit_before(inst, MOV(grf_offset, offset));
3039
3040 load = new(mem_ctx) vec4_instruction(this,
3041 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3042 temp, index, src_reg(grf_offset));
3043 } else {
3044 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3045 temp, index, offset);
3046 load->base_mrf = 14;
3047 load->mlen = 1;
3048 }
3049 emit_before(inst, load);
3050 }
3051
3052 /**
3053 * Implements array access of uniforms by inserting a
3054 * PULL_CONSTANT_LOAD instruction.
3055 *
3056 * Unlike temporary GRF array access (which we don't support because of
3057 * the difficulty of doing relative addressing on instruction
3058 * destinations), we could potentially do array access of uniforms
3059 * that were loaded in GRF space as push constants. In real-world
3060 * usage we've seen, though, the arrays being used are always larger
3061 * than we could load as push constants, so just always move all
3062 * uniform array access out to a pull constant buffer.
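 * For example, a variably-indexed "uniform vec4 colors[64]" has its values
 * appended to pull_param, and each such access becomes a pull constant load
 * whose message offset incorporates the reladdr.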
3063 */
3064 void
3065 vec4_visitor::move_uniform_array_access_to_pull_constants()
3066 {
3067 int pull_constant_loc[this->uniforms];
3068
3069 for (int i = 0; i < this->uniforms; i++) {
3070 pull_constant_loc[i] = -1;
3071 }
3072
3073 /* Walk through and find array access of uniforms. Put a copy of that
3074 * uniform in the pull constant buffer.
3075 *
3076 * Note that we don't move constant-indexed accesses to arrays. No
3077 * testing has been done of the performance impact of this choice.
3078 */
3079 foreach_list_safe(node, &this->instructions) {
3080 vec4_instruction *inst = (vec4_instruction *)node;
3081
3082 for (int i = 0 ; i < 3; i++) {
3083 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3084 continue;
3085
3086 int uniform = inst->src[i].reg;
3087
3088 /* If this array isn't already present in the pull constant buffer,
3089 * add it.
3090 */
3091 if (pull_constant_loc[uniform] == -1) {
3092 const float **values = &prog_data->param[uniform * 4];
3093
3094 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3095
3096 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3097 prog_data->pull_param[prog_data->nr_pull_params++]
3098 = values[j];
3099 }
3100 }
3101
3102 /* Set up the annotation tracking for new generated instructions. */
3103 base_ir = inst->ir;
3104 current_annotation = inst->annotation;
3105
3106 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3107
3108 emit_pull_constant_load(inst, temp, inst->src[i],
3109 pull_constant_loc[uniform]);
3110
3111 inst->src[i].file = temp.file;
3112 inst->src[i].reg = temp.reg;
3113 inst->src[i].reg_offset = temp.reg_offset;
3114 inst->src[i].reladdr = NULL;
3115 }
3116 }
3117
3118 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3119 * no need to track them as larger-than-vec4 objects. This will be
3120 * relied on in cutting out unused uniform vectors from push
3121 * constants.
3122 */
3123 split_uniform_registers();
3124 }
3125
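/* If a UD-typed source has its negate flag set, resolve it into a fresh
 * unsigned temporary with an explicit MOV and use that temporary instead,
 * so later code never sees a negated unsigned operand.
 */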
3126 void
3127 vec4_visitor::resolve_ud_negate(src_reg *reg)
3128 {
3129 if (reg->type != BRW_REGISTER_TYPE_UD ||
3130 !reg->negate)
3131 return;
3132
3133 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3134 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3135 *reg = temp;
3136 }
3137
3138 vec4_visitor::vec4_visitor(struct brw_context *brw,
3139 struct brw_vec4_compile *c,
3140 struct gl_program *prog,
3141 const struct brw_vec4_prog_key *key,
3142 struct brw_vec4_prog_data *prog_data,
3143 struct gl_shader_program *shader_prog,
3144 struct brw_shader *shader,
3145 void *mem_ctx,
3146 bool debug_flag,
3147 bool no_spills)
3148 : debug_flag(debug_flag), no_spills(no_spills)
3149 {
3150 this->brw = brw;
3151 this->ctx = &brw->ctx;
3152 this->shader_prog = shader_prog;
3153 this->shader = shader;
3154
3155 this->mem_ctx = mem_ctx;
3156 this->failed = false;
3157
3158 this->base_ir = NULL;
3159 this->current_annotation = NULL;
3160 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3161
3162 this->c = c;
3163 this->prog = prog;
3164 this->key = key;
3165 this->prog_data = prog_data;
3166 this->stage_prog_data = &prog_data->base;
3167
3168 this->variable_ht = hash_table_ctor(0,
3169 hash_table_pointer_hash,
3170 hash_table_pointer_compare);
3171
3172 this->virtual_grf_start = NULL;
3173 this->virtual_grf_end = NULL;
3174 this->virtual_grf_sizes = NULL;
3175 this->virtual_grf_count = 0;
3176 this->virtual_grf_reg_map = NULL;
3177 this->virtual_grf_reg_count = 0;
3178 this->virtual_grf_array_size = 0;
3179 this->live_intervals_valid = false;
3180
3181 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3182
3183 this->uniforms = 0;
3184 }
3185
3186 vec4_visitor::~vec4_visitor()
3187 {
3188 hash_table_dtor(this->variable_ht);
3189 }
3190
3191
3192 void
3193 vec4_visitor::fail(const char *format, ...)
3194 {
3195 va_list va;
3196 char *msg;
3197
3198 if (failed)
3199 return;
3200
3201 failed = true;
3202
3203 va_start(va, format);
3204 msg = ralloc_vasprintf(mem_ctx, format, va);
3205 va_end(va);
3206 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3207
3208 this->fail_msg = msg;
3209
3210 if (debug_flag) {
3211 fprintf(stderr, "%s", msg);
3212 }
3213 }
3214
3215 } /* namespace brw */