i965: Move up duplicated fields from stage-specific prog_data to brw_stage_prog_data.
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
132 src0, src1, src2); \
133 }
134
135 ALU1(NOT)
136 ALU1(MOV)
137 ALU1(FRC)
138 ALU1(RNDD)
139 ALU1(RNDE)
140 ALU1(RNDZ)
141 ALU1(F32TO16)
142 ALU1(F16TO32)
143 ALU2(ADD)
144 ALU2(MUL)
145 ALU2(MACH)
146 ALU2(AND)
147 ALU2(OR)
148 ALU2(XOR)
149 ALU2(DP3)
150 ALU2(DP4)
151 ALU2(DPH)
152 ALU2(SHL)
153 ALU2(SHR)
154 ALU2(ASR)
155 ALU3(LRP)
156 ALU1(BFREV)
157 ALU3(BFE)
158 ALU2(BFI1)
159 ALU3(BFI2)
160 ALU1(FBH)
161 ALU1(FBL)
162 ALU1(CBIT)
163 ALU3(MAD)
164 ALU2(ADDC)
165 ALU2(SUBB)
166
167 /** Gen4 predicated IF. */
168 vec4_instruction *
169 vec4_visitor::IF(uint32_t predicate)
170 {
171 vec4_instruction *inst;
172
173 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
174 inst->predicate = predicate;
175
176 return inst;
177 }
178
179 /** Gen6 IF with embedded comparison. */
180 vec4_instruction *
181 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
182 {
183 assert(brw->gen == 6);
184
185 vec4_instruction *inst;
186
187 resolve_ud_negate(&src0);
188 resolve_ud_negate(&src1);
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
191 src0, src1);
192 inst->conditional_mod = condition;
193
194 return inst;
195 }
196
197 /**
198 * CMP: Sets the low bit of the destination channels with the result
199 * of the comparison, while the upper bits are undefined, and updates
200 * the flag register with the packed 16 bits of the result.
201 */
202 vec4_instruction *
203 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
204 {
205 vec4_instruction *inst;
206
207 /* original gen4 does type conversion to the destination type
208 * before comparison, producing garbage results for floating
209 * point comparisons.
210 */
211 if (brw->gen == 4) {
212 dst.type = src0.type;
213 if (dst.file == HW_REG)
214 dst.fixed_hw_reg.type = dst.type;
215 }
216
217 resolve_ud_negate(&src0);
218 resolve_ud_negate(&src1);
219
220 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
221 inst->conditional_mod = condition;
222
223 return inst;
224 }
225
226 vec4_instruction *
227 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
228 {
229 vec4_instruction *inst;
230
231 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
232 dst, index);
233 inst->base_mrf = 14;
234 inst->mlen = 2;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
245 dst, src, index);
246 inst->base_mrf = 13;
247 inst->mlen = 3;
248
249 return inst;
250 }
251
252 void
253 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
254 {
255 static enum opcode dot_opcodes[] = {
256 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
257 };
258
259 emit(dot_opcodes[elements - 2], dst, src0, src1);
260 }
261
262 src_reg
263 vec4_visitor::fix_3src_operand(src_reg src)
264 {
265 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
266 * able to use vertical stride of zero to replicate the vec4 uniform, like
267 *
268 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
269 *
270 * But you can't, since vertical stride is always four in three-source
271 * instructions. Instead, insert a MOV instruction to do the replication so
272 * that the three-source instruction can consume it.
273 */
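 /* For illustration (register names invented for the example), a MAD with a
  * vec4 uniform operand effectively becomes roughly
  *
  *     mov tmp.xyzw, u0.xyzw
  *     mad dst, src0, tmp, src2
  *
  * so the three-source instruction only ever reads GRF sources, which do
  * have the vertical stride the hardware requires.
  */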
274
275 /* The MOV is only needed if the source is a uniform or immediate. */
276 if (src.file != UNIFORM && src.file != IMM)
277 return src;
278
279 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
280 expanded.type = src.type;
281 emit(MOV(expanded, src));
282 return src_reg(expanded);
283 }
284
285 src_reg
286 vec4_visitor::fix_math_operand(src_reg src)
287 {
288 /* The gen6 math instruction ignores the source modifiers --
289 * swizzle, abs, negate, and at least some parts of the register
290 * region description.
291 *
292 * Rather than trying to enumerate all these cases, *always* expand the
293 * operand to a temp GRF for gen6.
294 *
295 * For gen7, keep the operand as-is, except if immediate, which gen7 still
296 * can't use.
297 */
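 /* For example, on gen6 something like "math rsq dst, -u0.wzyx" would
  * silently lose the negate and swizzle, so the operand is first copied
  * (register names invented for the example):
  *
  *     mov tmp, -u0.wzyx
  *     math rsq dst, tmp
  *
  * On gen7, per the code below, only immediates need this copy.
  */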
298
299 if (brw->gen == 7 && src.file != IMM)
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(MOV(expanded, src));
305 return src_reg(expanded);
306 }
307
308 void
309 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
310 {
311 src = fix_math_operand(src);
312
313 if (dst.writemask != WRITEMASK_XYZW) {
314 /* The gen6 math instruction must be align1, so we can't do
315 * writemasks.
316 */
317 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
318
319 emit(opcode, temp_dst, src);
320
321 emit(MOV(dst, src_reg(temp_dst)));
322 } else {
323 emit(opcode, dst, src);
324 }
325 }
326
327 void
328 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
329 {
330 vec4_instruction *inst = emit(opcode, dst, src);
331 inst->base_mrf = 1;
332 inst->mlen = 1;
333 }
334
335 void
336 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
337 {
338 switch (opcode) {
339 case SHADER_OPCODE_RCP:
340 case SHADER_OPCODE_RSQ:
341 case SHADER_OPCODE_SQRT:
342 case SHADER_OPCODE_EXP2:
343 case SHADER_OPCODE_LOG2:
344 case SHADER_OPCODE_SIN:
345 case SHADER_OPCODE_COS:
346 break;
347 default:
348 assert(!"not reached: bad math opcode");
349 return;
350 }
351
352 if (brw->gen >= 6) {
353 return emit_math1_gen6(opcode, dst, src);
354 } else {
355 return emit_math1_gen4(opcode, dst, src);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen6(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 src0 = fix_math_operand(src0);
364 src1 = fix_math_operand(src1);
365
366 if (dst.writemask != WRITEMASK_XYZW) {
367 /* The gen6 math instruction must be align1, so we can't do
368 * writemasks.
369 */
370 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
371 temp_dst.type = dst.type;
372
373 emit(opcode, temp_dst, src0, src1);
374
375 emit(MOV(dst, src_reg(temp_dst)));
376 } else {
377 emit(opcode, dst, src0, src1);
378 }
379 }
380
381 void
382 vec4_visitor::emit_math2_gen4(enum opcode opcode,
383 dst_reg dst, src_reg src0, src_reg src1)
384 {
385 vec4_instruction *inst = emit(opcode, dst, src0, src1);
386 inst->base_mrf = 1;
387 inst->mlen = 2;
388 }
389
390 void
391 vec4_visitor::emit_math(enum opcode opcode,
392 dst_reg dst, src_reg src0, src_reg src1)
393 {
394 switch (opcode) {
395 case SHADER_OPCODE_POW:
396 case SHADER_OPCODE_INT_QUOTIENT:
397 case SHADER_OPCODE_INT_REMAINDER:
398 break;
399 default:
400 assert(!"not reached: unsupported binary math opcode");
401 return;
402 }
403
404 if (brw->gen >= 6) {
405 return emit_math2_gen6(opcode, dst, src0, src1);
406 } else {
407 return emit_math2_gen4(opcode, dst, src0, src1);
408 }
409 }
410
411 void
412 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
413 {
414 if (brw->gen < 7)
415 assert(!"ir_unop_pack_half_2x16 should be lowered");
416
417 assert(dst.type == BRW_REGISTER_TYPE_UD);
418 assert(src0.type == BRW_REGISTER_TYPE_F);
419
420 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
421 *
422 * Because this instruction does not have a 16-bit floating-point type,
423 * the destination data type must be Word (W).
424 *
425 * The destination must be DWord-aligned and specify a horizontal stride
426 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
427 * each destination channel and the upper word is not modified.
428 *
429 * The above restriction implies that the f32to16 instruction must use
430 * align1 mode, because only in align1 mode is it possible to specify
431 * horizontal stride. We choose here to defy the hardware docs and emit
432 * align16 instructions.
433 *
434 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
435 * instructions. I was partially successful in that the code passed all
436 * tests. However, the code was dubiously correct and fragile, and the
437 * tests were not harsh enough to probe that frailty. Not trusting the
438 * code, I chose instead to remain in align16 mode in defiance of the hw
439 * docs).
440 *
441 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
442 * simulator, emitting a f32to16 in align16 mode with UD as destination
443 * data type is safe. The behavior differs from that specified in the PRM
444 * in that the upper word of each destination channel is cleared to 0.
445 */
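 /* Worked example of the sequence below (values chosen for illustration):
  * packing vec2(1.0, -2.0) gives half-float bit patterns 0x3c00 and 0xc000,
  * so after the F32TO16 tmp.xy = {0x00003c00, 0x0000c000}. Shifting the Y
  * channel left by 16 yields 0xc0000000, and ORing in the X channel gives
  * 0xc0003c00, which is packHalf2x16(vec2(1.0, -2.0)).
  */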
446
447 dst_reg tmp_dst(this, glsl_type::uvec2_type);
448 src_reg tmp_src(tmp_dst);
449
450 #if 0
451 /* Verify the undocumented behavior on which the following instructions
452 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
453 * then the result of the bit-or instruction below will be incorrect.
454 *
455 * You should inspect the disasm output in order to verify that the MOV is
456 * not optimized away.
457 */
458 emit(MOV(tmp_dst, src_reg(0x12345678u)));
459 #endif
460
461 /* Give tmp the form below, where "." means untouched.
462 *
463 * w z y x w z y x
464 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
465 *
466 * That the upper word of each write-channel be 0 is required for the
467 * following bit-shift and bit-or instructions to work. Note that this
468 * relies on the undocumented hardware behavior mentioned above.
469 */
470 tmp_dst.writemask = WRITEMASK_XY;
471 emit(F32TO16(tmp_dst, src0));
472
473 /* Give the write-channels of dst the form:
474 * 0xhhhh0000
475 */
476 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
477 emit(SHL(dst, tmp_src, src_reg(16u)));
478
479 /* Finally, give the write-channels of dst the form of packHalf2x16's
480 * output:
481 * 0xhhhhllll
482 */
483 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
484 emit(OR(dst, src_reg(dst), tmp_src));
485 }
486
487 void
488 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
489 {
490 if (brw->gen < 7)
491 assert(!"ir_unop_unpack_half_2x16 should be lowered");
492
493 assert(dst.type == BRW_REGISTER_TYPE_F);
494 assert(src0.type == BRW_REGISTER_TYPE_UD);
495
496 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
497 *
498 * Because this instruction does not have a 16-bit floating-point type,
499 * the source data type must be Word (W). The destination type must be
500 * F (Float).
501 *
502 * To use W as the source data type, we must adjust horizontal strides,
503 * which is only possible in align1 mode. All my [chadv] attempts at
504 * emitting align1 instructions for unpackHalf2x16 failed to pass the
505 * Piglit tests, so I gave up.
506 *
507 * I've verified that, on gen7 hardware and the simulator, it is safe to
508 * emit f16to32 in align16 mode with UD as source data type.
509 */
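 /* Worked example of the sequence below (value chosen for illustration):
  * for src0 = 0xc0003c00, the AND leaves 0x00003c00 in tmp.x, the SHR
  * leaves 0x0000c000 in tmp.y, and F16TO32 then produces dst.xy =
  * (1.0, -2.0), matching unpackHalf2x16(0xc0003c00).
  */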
510
511 dst_reg tmp_dst(this, glsl_type::uvec2_type);
512 src_reg tmp_src(tmp_dst);
513
514 tmp_dst.writemask = WRITEMASK_X;
515 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
516
517 tmp_dst.writemask = WRITEMASK_Y;
518 emit(SHR(tmp_dst, src0, src_reg(16u)));
519
520 dst.writemask = WRITEMASK_XY;
521 emit(F16TO32(dst, tmp_src));
522 }
523
524 void
525 vec4_visitor::visit_instructions(const exec_list *list)
526 {
527 foreach_list(node, list) {
528 ir_instruction *ir = (ir_instruction *)node;
529
530 base_ir = ir;
531 ir->accept(this);
532 }
533 }
534
535
536 static int
537 type_size(const struct glsl_type *type)
538 {
539 unsigned int i;
540 int size;
541
542 switch (type->base_type) {
543 case GLSL_TYPE_UINT:
544 case GLSL_TYPE_INT:
545 case GLSL_TYPE_FLOAT:
546 case GLSL_TYPE_BOOL:
547 if (type->is_matrix()) {
548 return type->matrix_columns;
549 } else {
550 /* Regardless of the size of the vector, it gets a vec4. This is bad
551 * packing for things like floats, but otherwise arrays become a
552 * mess. Hopefully a later pass over the code can pack scalars
553 * down if appropriate.
554 */
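 /* For instance, under this packing a float and a vec3 each take one vec4
  * slot, a mat3 takes 3 (one per column), float[4] takes 4, and
  * struct { vec3 a; float b; } takes 2.
  */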
555 return 1;
556 }
557 case GLSL_TYPE_ARRAY:
558 assert(type->length > 0);
559 return type_size(type->fields.array) * type->length;
560 case GLSL_TYPE_STRUCT:
561 size = 0;
562 for (i = 0; i < type->length; i++) {
563 size += type_size(type->fields.structure[i].type);
564 }
565 return size;
566 case GLSL_TYPE_SAMPLER:
567 /* Samplers take up one slot in UNIFORMS[], but they're baked in
568 * at link time.
569 */
570 return 1;
571 case GLSL_TYPE_ATOMIC_UINT:
572 return 0;
573 case GLSL_TYPE_IMAGE:
574 case GLSL_TYPE_VOID:
575 case GLSL_TYPE_ERROR:
576 case GLSL_TYPE_INTERFACE:
577 assert(0);
578 break;
579 }
580
581 return 0;
582 }
583
584 int
585 vec4_visitor::virtual_grf_alloc(int size)
586 {
587 if (virtual_grf_array_size <= virtual_grf_count) {
588 if (virtual_grf_array_size == 0)
589 virtual_grf_array_size = 16;
590 else
591 virtual_grf_array_size *= 2;
592 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
593 virtual_grf_array_size);
594 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
595 virtual_grf_array_size);
596 }
597 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
598 virtual_grf_reg_count += size;
599 virtual_grf_sizes[virtual_grf_count] = size;
600 return virtual_grf_count++;
601 }
602
603 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
604 {
605 init();
606
607 this->file = GRF;
608 this->reg = v->virtual_grf_alloc(type_size(type));
609
610 if (type->is_array() || type->is_record()) {
611 this->swizzle = BRW_SWIZZLE_NOOP;
612 } else {
613 this->swizzle = swizzle_for_size(type->vector_elements);
614 }
615
616 this->type = brw_type_for_base_type(type);
617 }
618
619 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->writemask = WRITEMASK_XYZW;
628 } else {
629 this->writemask = (1 << type->vector_elements) - 1;
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 /* Our support for uniforms is piggy-backed on the struct
636 * gl_program for this shader stage, because that's where the values actually
637 * get stored, rather than in some global gl_shader_program uniform
638 * store.
639 */
640 void
641 vec4_visitor::setup_uniform_values(ir_variable *ir)
642 {
643 int namelen = strlen(ir->name);
644
645 /* The data for our (non-builtin) uniforms is stored in a series of
646 * gl_uniform_driver_storage structs for each subcomponent that
647 * glGetUniformLocation() could name. We know it's been set up in the same
648 * order we'd walk the type, so walk the list of storage and find anything
649 * with our name, or the prefix of a component that starts with our name.
650 */
651 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
652 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
653
654 if (strncmp(ir->name, storage->name, namelen) != 0 ||
655 (storage->name[namelen] != 0 &&
656 storage->name[namelen] != '.' &&
657 storage->name[namelen] != '[')) {
658 continue;
659 }
660
661 gl_constant_value *components = storage->storage;
662 unsigned vector_count = (MAX2(storage->array_elements, 1) *
663 storage->type->matrix_columns);
664
665 for (unsigned s = 0; s < vector_count; s++) {
666 uniform_vector_size[uniforms] = storage->type->vector_elements;
667
668 int i;
669 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
670 stage_prog_data->param[uniforms * 4 + i] = &components->f;
671 components++;
672 }
673 for (; i < 4; i++) {
674 static float zero = 0;
675 stage_prog_data->param[uniforms * 4 + i] = &zero;
676 }
677
678 uniforms++;
679 }
680 }
681 }
682
683 void
684 vec4_visitor::setup_uniform_clipplane_values()
685 {
686 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
687
688 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
689 this->uniform_vector_size[this->uniforms] = 4;
690 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
691 this->userplane[i].type = BRW_REGISTER_TYPE_F;
692 for (int j = 0; j < 4; ++j) {
693 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
694 }
695 ++this->uniforms;
696 }
697 }
698
699 /* Our support for builtin uniforms is even scarier than non-builtin.
700 * It sits on top of the PROG_STATE_VAR parameters that are
701 * automatically updated from GL context state.
702 */
703 void
704 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
705 {
706 const ir_state_slot *const slots = ir->state_slots;
707 assert(ir->state_slots != NULL);
708
709 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
710 /* This state reference has already been setup by ir_to_mesa,
711 * but we'll get the same index back here. We can reference
712 * ParameterValues directly, since unlike brw_fs.cpp, we never
713 * add new state references during compile.
714 */
715 int index = _mesa_add_state_reference(this->prog->Parameters,
716 (gl_state_index *)slots[i].tokens);
717 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
718
719 this->uniform_vector_size[this->uniforms] = 0;
720 /* Add each of the unique swizzled channels of the element.
721 * This will end up matching the size of the glsl_type of this field.
722 */
723 int last_swiz = -1;
724 for (unsigned int j = 0; j < 4; j++) {
725 int swiz = GET_SWZ(slots[i].swizzle, j);
726 last_swiz = swiz;
727
728 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
729 if (swiz <= last_swiz)
730 this->uniform_vector_size[this->uniforms]++;
731 }
732 this->uniforms++;
733 }
734 }
735
736 dst_reg *
737 vec4_visitor::variable_storage(ir_variable *var)
738 {
739 return (dst_reg *)hash_table_find(this->variable_ht, var);
740 }
741
742 void
743 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
744 {
745 ir_expression *expr = ir->as_expression();
746
747 *predicate = BRW_PREDICATE_NORMAL;
748
749 if (expr) {
750 src_reg op[2];
751 vec4_instruction *inst;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757
758 resolve_ud_negate(&op[i]);
759 }
760
761 switch (expr->operation) {
762 case ir_unop_logic_not:
763 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
764 inst->conditional_mod = BRW_CONDITIONAL_Z;
765 break;
766
767 case ir_binop_logic_xor:
768 inst = emit(XOR(dst_null_d(), op[0], op[1]));
769 inst->conditional_mod = BRW_CONDITIONAL_NZ;
770 break;
771
772 case ir_binop_logic_or:
773 inst = emit(OR(dst_null_d(), op[0], op[1]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 break;
776
777 case ir_binop_logic_and:
778 inst = emit(AND(dst_null_d(), op[0], op[1]));
779 inst->conditional_mod = BRW_CONDITIONAL_NZ;
780 break;
781
782 case ir_unop_f2b:
783 if (brw->gen >= 6) {
784 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
785 } else {
786 inst = emit(MOV(dst_null_f(), op[0]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 }
789 break;
790
791 case ir_unop_i2b:
792 if (brw->gen >= 6) {
793 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
794 } else {
795 inst = emit(MOV(dst_null_d(), op[0]));
796 inst->conditional_mod = BRW_CONDITIONAL_NZ;
797 }
798 break;
799
800 case ir_binop_all_equal:
801 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
802 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
803 break;
804
805 case ir_binop_any_nequal:
806 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
807 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
808 break;
809
810 case ir_unop_any:
811 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
812 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
813 break;
814
815 case ir_binop_greater:
816 case ir_binop_gequal:
817 case ir_binop_less:
818 case ir_binop_lequal:
819 case ir_binop_equal:
820 case ir_binop_nequal:
821 emit(CMP(dst_null_d(), op[0], op[1],
822 brw_conditional_for_comparison(expr->operation)));
823 break;
824
825 default:
826 assert(!"not reached");
827 break;
828 }
829 return;
830 }
831
832 ir->accept(this);
833
834 resolve_ud_negate(&this->result);
835
836 if (brw->gen >= 6) {
837 vec4_instruction *inst = emit(AND(dst_null_d(),
838 this->result, src_reg(1)));
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 } else {
841 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 }
845
846 /**
847 * Emit a gen6 IF statement with the comparison folded into the IF
848 * instruction.
849 */
850 void
851 vec4_visitor::emit_if_gen6(ir_if *ir)
852 {
853 ir_expression *expr = ir->condition->as_expression();
854
855 if (expr) {
856 src_reg op[2];
857 dst_reg temp;
858
859 assert(expr->get_num_operands() <= 2);
860 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
861 expr->operands[i]->accept(this);
862 op[i] = this->result;
863 }
864
865 switch (expr->operation) {
866 case ir_unop_logic_not:
867 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
868 return;
869
870 case ir_binop_logic_xor:
871 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
872 return;
873
874 case ir_binop_logic_or:
875 temp = dst_reg(this, glsl_type::bool_type);
876 emit(OR(temp, op[0], op[1]));
877 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
878 return;
879
880 case ir_binop_logic_and:
881 temp = dst_reg(this, glsl_type::bool_type);
882 emit(AND(temp, op[0], op[1]));
883 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_f2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_unop_i2b:
891 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_greater:
895 case ir_binop_gequal:
896 case ir_binop_less:
897 case ir_binop_lequal:
898 case ir_binop_equal:
899 case ir_binop_nequal:
900 emit(IF(op[0], op[1],
901 brw_conditional_for_comparison(expr->operation)));
902 return;
903
904 case ir_binop_all_equal:
905 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
906 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
907 return;
908
909 case ir_binop_any_nequal:
910 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
911 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
912 return;
913
914 case ir_unop_any:
915 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
917 return;
918
919 default:
920 assert(!"not reached");
921 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
922 return;
923 }
924 return;
925 }
926
927 ir->condition->accept(this);
928
929 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
930 }
931
932 dst_reg
933 with_writemask(dst_reg const & r, int mask)
934 {
935 dst_reg result = r;
936 result.writemask = mask;
937 return result;
938 }
939
940
941 void
942 vec4_visitor::visit(ir_variable *ir)
943 {
944 dst_reg *reg = NULL;
945
946 if (variable_storage(ir))
947 return;
948
949 switch (ir->data.mode) {
950 case ir_var_shader_in:
951 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
952 break;
953
954 case ir_var_shader_out:
955 reg = new(mem_ctx) dst_reg(this, ir->type);
956
957 for (int i = 0; i < type_size(ir->type); i++) {
958 output_reg[ir->data.location + i] = *reg;
959 output_reg[ir->data.location + i].reg_offset = i;
960 output_reg[ir->data.location + i].type =
961 brw_type_for_base_type(ir->type->get_scalar_type());
962 output_reg_annotation[ir->data.location + i] = ir->name;
963 }
964 break;
965
966 case ir_var_auto:
967 case ir_var_temporary:
968 reg = new(mem_ctx) dst_reg(this, ir->type);
969 break;
970
971 case ir_var_uniform:
972 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
973
974 /* Thanks to the lower_ubo_reference pass, we will see only
975 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
976 * variables, so no need for them to be in variable_ht.
977 *
978 * Atomic counters take no uniform storage, no need to do
979 * anything here.
980 */
981 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
982 return;
983
984 /* Track how big the whole uniform variable is, in case we need to put a
985 * copy of its data into pull constants for array access.
986 */
987 this->uniform_size[this->uniforms] = type_size(ir->type);
988
989 if (!strncmp(ir->name, "gl_", 3)) {
990 setup_builtin_uniform_values(ir);
991 } else {
992 setup_uniform_values(ir);
993 }
994 break;
995
996 case ir_var_system_value:
997 reg = make_reg_for_system_value(ir);
998 break;
999
1000 default:
1001 assert(!"not reached");
1002 }
1003
1004 reg->type = brw_type_for_base_type(ir->type);
1005 hash_table_insert(this->variable_ht, reg, ir);
1006 }
1007
1008 void
1009 vec4_visitor::visit(ir_loop *ir)
1010 {
1011 /* We don't want debugging output to print the whole body of the
1012 * loop as the annotation.
1013 */
1014 this->base_ir = NULL;
1015
1016 emit(BRW_OPCODE_DO);
1017
1018 visit_instructions(&ir->body_instructions);
1019
1020 emit(BRW_OPCODE_WHILE);
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_loop_jump *ir)
1025 {
1026 switch (ir->mode) {
1027 case ir_loop_jump::jump_break:
1028 emit(BRW_OPCODE_BREAK);
1029 break;
1030 case ir_loop_jump::jump_continue:
1031 emit(BRW_OPCODE_CONTINUE);
1032 break;
1033 }
1034 }
1035
1036
1037 void
1038 vec4_visitor::visit(ir_function_signature *ir)
1039 {
1040 assert(0);
1041 (void)ir;
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_function *ir)
1046 {
1047 /* Ignore function bodies other than main() -- we shouldn't see calls to
1048 * them since they should all be inlined.
1049 */
1050 if (strcmp(ir->name, "main") == 0) {
1051 const ir_function_signature *sig;
1052 exec_list empty;
1053
1054 sig = ir->matching_signature(NULL, &empty);
1055
1056 assert(sig);
1057
1058 visit_instructions(&sig->body);
1059 }
1060 }
1061
1062 bool
1063 vec4_visitor::try_emit_sat(ir_expression *ir)
1064 {
1065 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1066 if (!sat_src)
1067 return false;
1068
1069 sat_src->accept(this);
1070 src_reg src = this->result;
1071
1072 this->result = src_reg(this, ir->type);
1073 vec4_instruction *inst;
1074 inst = emit(MOV(dst_reg(this->result), src));
1075 inst->saturate = true;
1076
1077 return true;
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1082 {
1083 /* 3-src instructions were introduced in gen6. */
1084 if (brw->gen < 6)
1085 return false;
1086
1087 /* MAD can only handle floating-point data. */
1088 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1089 return false;
1090
1091 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1092 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1093
1094 if (!mul || mul->operation != ir_binop_mul)
1095 return false;
1096
1097 nonmul->accept(this);
1098 src_reg src0 = fix_3src_operand(this->result);
1099
1100 mul->operands[0]->accept(this);
1101 src_reg src1 = fix_3src_operand(this->result);
1102
1103 mul->operands[1]->accept(this);
1104 src_reg src2 = fix_3src_operand(this->result);
1105
1106 this->result = src_reg(this, ir->type);
1107 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1108
1109 return true;
1110 }
1111
1112 void
1113 vec4_visitor::emit_bool_comparison(unsigned int op,
1114 dst_reg dst, src_reg src0, src_reg src1)
1115 {
1116 /* original gen4 does destination conversion before comparison. */
1117 if (brw->gen < 5)
1118 dst.type = src0.type;
1119
1120 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1121
1122 dst.type = BRW_REGISTER_TYPE_D;
1123 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1124 }
1125
1126 void
1127 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1128 src_reg src0, src_reg src1)
1129 {
1130 vec4_instruction *inst;
1131
1132 if (brw->gen >= 6) {
1133 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1134 inst->conditional_mod = conditionalmod;
1135 } else {
1136 emit(CMP(dst, src0, src1, conditionalmod));
1137
1138 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139 inst->predicate = BRW_PREDICATE_NORMAL;
1140 }
1141 }
1142
1143 static bool
1144 is_16bit_constant(ir_rvalue *rvalue)
1145 {
1146 ir_constant *constant = rvalue->as_constant();
1147 if (!constant)
1148 return false;
1149
1150 if (constant->type != glsl_type::int_type &&
1151 constant->type != glsl_type::uint_type)
1152 return false;
1153
1154 return constant->value.u[0] < (1 << 16);
1155 }
1156
1157 void
1158 vec4_visitor::visit(ir_expression *ir)
1159 {
1160 unsigned int operand;
1161 src_reg op[Elements(ir->operands)];
1162 src_reg result_src;
1163 dst_reg result_dst;
1164 vec4_instruction *inst;
1165
1166 if (try_emit_sat(ir))
1167 return;
1168
1169 if (ir->operation == ir_binop_add) {
1170 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1171 return;
1172 }
1173
1174 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1175 this->result.file = BAD_FILE;
1176 ir->operands[operand]->accept(this);
1177 if (this->result.file == BAD_FILE) {
1178 printf("Failed to get tree for expression operand:\n");
1179 ir->operands[operand]->print();
1180 exit(1);
1181 }
1182 op[operand] = this->result;
1183
1184 /* Matrix expression operands should have been broken down to vector
1185 * operations already.
1186 */
1187 assert(!ir->operands[operand]->type->is_matrix());
1188 }
1189
1190 int vector_elements = ir->operands[0]->type->vector_elements;
1191 if (ir->operands[1]) {
1192 vector_elements = MAX2(vector_elements,
1193 ir->operands[1]->type->vector_elements);
1194 }
1195
1196 this->result.file = BAD_FILE;
1197
1198 /* Storage for our result. Ideally for an assignment we'd be using
1199 * the actual storage for the result here, instead.
1200 */
1201 result_src = src_reg(this, ir->type);
1202 /* convenience for the emit functions below. */
1203 result_dst = dst_reg(result_src);
1204 /* If nothing special happens, this is the result. */
1205 this->result = result_src;
1206 /* Limit writes to the channels that will be used by result_src later.
1207 * This does limit this temp's use as a temporary for multi-instruction
1208 * sequences.
1209 */
1210 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1211
1212 switch (ir->operation) {
1213 case ir_unop_logic_not:
1214 /* Note that BRW_OPCODE_NOT is not appropriate here, since it produces
1215 * the one's complement of the whole register, not just bit 0.
1216 */
1217 emit(XOR(result_dst, op[0], src_reg(1)));
1218 break;
1219 case ir_unop_neg:
1220 op[0].negate = !op[0].negate;
1221 emit(MOV(result_dst, op[0]));
1222 break;
1223 case ir_unop_abs:
1224 op[0].abs = true;
1225 op[0].negate = false;
1226 emit(MOV(result_dst, op[0]));
1227 break;
1228
1229 case ir_unop_sign:
1230 if (ir->type->is_float()) {
1231 /* AND(val, 0x80000000) gives the sign bit.
1232 *
1233 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1234 * zero.
1235 */
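 /* Worked example (bit patterns shown for illustration): for sign(-3.5),
  * -3.5 is 0xc0600000, the AND with 0x80000000 leaves 0x80000000, and the
  * predicated OR with 0x3f800000 (1.0f) produces 0xbf800000 = -1.0f. For an
  * input of exactly 0.0 the CMP leaves the flag unset for that channel, so
  * the OR is skipped and the result stays +/-0.0.
  */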
1236 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1237
1238 op[0].type = BRW_REGISTER_TYPE_UD;
1239 result_dst.type = BRW_REGISTER_TYPE_UD;
1240 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1241
1242 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1243 inst->predicate = BRW_PREDICATE_NORMAL;
1244
1245 this->result.type = BRW_REGISTER_TYPE_F;
1246 } else {
1247 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1248 * -> non-negative val generates 0x00000000.
1249 * Predicated OR sets 1 if val is positive.
1250 */
1251 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1252
1253 emit(ASR(result_dst, op[0], src_reg(31)));
1254
1255 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1256 inst->predicate = BRW_PREDICATE_NORMAL;
1257 }
1258 break;
1259
1260 case ir_unop_rcp:
1261 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1262 break;
1263
1264 case ir_unop_exp2:
1265 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1266 break;
1267 case ir_unop_log2:
1268 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1269 break;
1270 case ir_unop_exp:
1271 case ir_unop_log:
1272 assert(!"not reached: should be handled by ir_explog_to_explog2");
1273 break;
1274 case ir_unop_sin:
1275 case ir_unop_sin_reduced:
1276 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1277 break;
1278 case ir_unop_cos:
1279 case ir_unop_cos_reduced:
1280 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1281 break;
1282
1283 case ir_unop_dFdx:
1284 case ir_unop_dFdy:
1285 assert(!"derivatives not valid in vertex shader");
1286 break;
1287
1288 case ir_unop_bitfield_reverse:
1289 emit(BFREV(result_dst, op[0]));
1290 break;
1291 case ir_unop_bit_count:
1292 emit(CBIT(result_dst, op[0]));
1293 break;
1294 case ir_unop_find_msb: {
1295 src_reg temp = src_reg(this, glsl_type::uint_type);
1296
1297 inst = emit(FBH(dst_reg(temp), op[0]));
1298 inst->dst.writemask = WRITEMASK_XYZW;
1299
1300 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1301 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1302 * subtract the result from 31 to convert the MSB count into an LSB count.
1303 */
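 /* For example, for an input of 0x00000010 FBH returns 27 (the leading-zero
  * count), and 31 - 27 = 4 is the findMSB() result. For an input of 0 FBH
  * returns 0xffffffff, the CMP below fails, and the predicated ADD is
  * skipped, so the -1 error value is passed through unchanged.
  */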
1304
1305 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1306 temp.swizzle = BRW_SWIZZLE_NOOP;
1307 emit(MOV(result_dst, temp));
1308
1309 src_reg src_tmp = src_reg(result_dst);
1310 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1311
1312 src_tmp.negate = true;
1313 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1314 inst->predicate = BRW_PREDICATE_NORMAL;
1315 break;
1316 }
1317 case ir_unop_find_lsb:
1318 emit(FBL(result_dst, op[0]));
1319 break;
1320
1321 case ir_unop_noise:
1322 assert(!"not reached: should be handled by lower_noise");
1323 break;
1324
1325 case ir_binop_add:
1326 emit(ADD(result_dst, op[0], op[1]));
1327 break;
1328 case ir_binop_sub:
1329 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1330 break;
1331
1332 case ir_binop_mul:
1333 if (brw->gen < 8 && ir->type->is_integer()) {
1334 /* For integer multiplication, the MUL uses the low 16 bits of one of
1335 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1336 * accumulates in the contribution of the upper 16 bits of that
1337 * operand. If we can determine that one of the args is in the low
1338 * 16 bits, though, we can just emit a single MUL.
1339 */
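 /* For example, a multiply by the constant 7 can be emitted as a single
  * "mul dst, a, 7" since 7 fits in 16 bits; otherwise the code below emits
  * the mul/mach/mov-from-accumulator sequence to obtain a correct 32-bit
  * product (mnemonics shown only to illustrate the emitted sequence).
  */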
1340 if (is_16bit_constant(ir->operands[0])) {
1341 if (brw->gen < 7)
1342 emit(MUL(result_dst, op[0], op[1]));
1343 else
1344 emit(MUL(result_dst, op[1], op[0]));
1345 } else if (is_16bit_constant(ir->operands[1])) {
1346 if (brw->gen < 7)
1347 emit(MUL(result_dst, op[1], op[0]));
1348 else
1349 emit(MUL(result_dst, op[0], op[1]));
1350 } else {
1351 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1352
1353 emit(MUL(acc, op[0], op[1]));
1354 emit(MACH(dst_null_d(), op[0], op[1]));
1355 emit(MOV(result_dst, src_reg(acc)));
1356 }
1357 } else {
1358 emit(MUL(result_dst, op[0], op[1]));
1359 }
1360 break;
1361 case ir_binop_imul_high: {
1362 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1363
1364 emit(MUL(acc, op[0], op[1]));
1365 emit(MACH(result_dst, op[0], op[1]));
1366 break;
1367 }
1368 case ir_binop_div:
1369 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1370 assert(ir->type->is_integer());
1371 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1372 break;
1373 case ir_binop_carry: {
1374 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1375
1376 emit(ADDC(dst_null_ud(), op[0], op[1]));
1377 emit(MOV(result_dst, src_reg(acc)));
1378 break;
1379 }
1380 case ir_binop_borrow: {
1381 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1382
1383 emit(SUBB(dst_null_ud(), op[0], op[1]));
1384 emit(MOV(result_dst, src_reg(acc)));
1385 break;
1386 }
1387 case ir_binop_mod:
1388 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1389 assert(ir->type->is_integer());
1390 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1391 break;
1392
1393 case ir_binop_less:
1394 case ir_binop_greater:
1395 case ir_binop_lequal:
1396 case ir_binop_gequal:
1397 case ir_binop_equal:
1398 case ir_binop_nequal: {
1399 emit(CMP(result_dst, op[0], op[1],
1400 brw_conditional_for_comparison(ir->operation)));
1401 emit(AND(result_dst, result_src, src_reg(0x1)));
1402 break;
1403 }
1404
1405 case ir_binop_all_equal:
1406 /* "==" operator producing a scalar boolean. */
1407 if (ir->operands[0]->type->is_vector() ||
1408 ir->operands[1]->type->is_vector()) {
1409 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1410 emit(MOV(result_dst, src_reg(0)));
1411 inst = emit(MOV(result_dst, src_reg(1)));
1412 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1413 } else {
1414 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1415 emit(AND(result_dst, result_src, src_reg(0x1)));
1416 }
1417 break;
1418 case ir_binop_any_nequal:
1419 /* "!=" operator producing a scalar boolean. */
1420 if (ir->operands[0]->type->is_vector() ||
1421 ir->operands[1]->type->is_vector()) {
1422 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1423
1424 emit(MOV(result_dst, src_reg(0)));
1425 inst = emit(MOV(result_dst, src_reg(1)));
1426 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1427 } else {
1428 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1429 emit(AND(result_dst, result_src, src_reg(0x1)));
1430 }
1431 break;
1432
1433 case ir_unop_any:
1434 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1435 emit(MOV(result_dst, src_reg(0)));
1436
1437 inst = emit(MOV(result_dst, src_reg(1)));
1438 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1439 break;
1440
1441 case ir_binop_logic_xor:
1442 emit(XOR(result_dst, op[0], op[1]));
1443 break;
1444
1445 case ir_binop_logic_or:
1446 emit(OR(result_dst, op[0], op[1]));
1447 break;
1448
1449 case ir_binop_logic_and:
1450 emit(AND(result_dst, op[0], op[1]));
1451 break;
1452
1453 case ir_binop_dot:
1454 assert(ir->operands[0]->type->is_vector());
1455 assert(ir->operands[0]->type == ir->operands[1]->type);
1456 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1457 break;
1458
1459 case ir_unop_sqrt:
1460 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1461 break;
1462 case ir_unop_rsq:
1463 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1464 break;
1465
1466 case ir_unop_bitcast_i2f:
1467 case ir_unop_bitcast_u2f:
1468 this->result = op[0];
1469 this->result.type = BRW_REGISTER_TYPE_F;
1470 break;
1471
1472 case ir_unop_bitcast_f2i:
1473 this->result = op[0];
1474 this->result.type = BRW_REGISTER_TYPE_D;
1475 break;
1476
1477 case ir_unop_bitcast_f2u:
1478 this->result = op[0];
1479 this->result.type = BRW_REGISTER_TYPE_UD;
1480 break;
1481
1482 case ir_unop_i2f:
1483 case ir_unop_i2u:
1484 case ir_unop_u2i:
1485 case ir_unop_u2f:
1486 case ir_unop_b2f:
1487 case ir_unop_b2i:
1488 case ir_unop_f2i:
1489 case ir_unop_f2u:
1490 emit(MOV(result_dst, op[0]));
1491 break;
1492 case ir_unop_f2b:
1493 case ir_unop_i2b: {
1494 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1495 emit(AND(result_dst, result_src, src_reg(1)));
1496 break;
1497 }
1498
1499 case ir_unop_trunc:
1500 emit(RNDZ(result_dst, op[0]));
1501 break;
1502 case ir_unop_ceil:
1503 op[0].negate = !op[0].negate;
1504 inst = emit(RNDD(result_dst, op[0]));
1505 this->result.negate = true;
1506 break;
1507 case ir_unop_floor:
1508 inst = emit(RNDD(result_dst, op[0]));
1509 break;
1510 case ir_unop_fract:
1511 inst = emit(FRC(result_dst, op[0]));
1512 break;
1513 case ir_unop_round_even:
1514 emit(RNDE(result_dst, op[0]));
1515 break;
1516
1517 case ir_binop_min:
1518 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1519 break;
1520 case ir_binop_max:
1521 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1522 break;
1523
1524 case ir_binop_pow:
1525 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1526 break;
1527
1528 case ir_unop_bit_not:
1529 inst = emit(NOT(result_dst, op[0]));
1530 break;
1531 case ir_binop_bit_and:
1532 inst = emit(AND(result_dst, op[0], op[1]));
1533 break;
1534 case ir_binop_bit_xor:
1535 inst = emit(XOR(result_dst, op[0], op[1]));
1536 break;
1537 case ir_binop_bit_or:
1538 inst = emit(OR(result_dst, op[0], op[1]));
1539 break;
1540
1541 case ir_binop_lshift:
1542 inst = emit(SHL(result_dst, op[0], op[1]));
1543 break;
1544
1545 case ir_binop_rshift:
1546 if (ir->type->base_type == GLSL_TYPE_INT)
1547 inst = emit(ASR(result_dst, op[0], op[1]));
1548 else
1549 inst = emit(SHR(result_dst, op[0], op[1]));
1550 break;
1551
1552 case ir_binop_bfm:
1553 emit(BFI1(result_dst, op[0], op[1]));
1554 break;
1555
1556 case ir_binop_ubo_load: {
1557 ir_constant *uniform_block = ir->operands[0]->as_constant();
1558 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1559 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1560 src_reg offset;
1561
1562 /* Now, load the vector from that offset. */
1563 assert(ir->type->is_vector() || ir->type->is_scalar());
1564
1565 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1566 packed_consts.type = result.type;
1567 src_reg surf_index =
1568 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1569 if (const_offset_ir) {
1570 if (brw->gen >= 8) {
1571 /* Store the offset in a GRF so we can send-from-GRF. */
1572 offset = src_reg(this, glsl_type::int_type);
1573 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1574 } else {
1575 /* Immediates are fine on older generations since they'll be moved
1576 * to a (potentially fake) MRF at the generator level.
1577 */
1578 offset = src_reg(const_offset / 16);
1579 }
1580 } else {
1581 offset = src_reg(this, glsl_type::uint_type);
1582 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1583 }
1584
1585 vec4_instruction *pull =
1586 emit(new(mem_ctx) vec4_instruction(this,
1587 VS_OPCODE_PULL_CONSTANT_LOAD,
1588 dst_reg(packed_consts),
1589 surf_index,
1590 offset));
1591 pull->base_mrf = 14;
1592 pull->mlen = 1;
1593
1594 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1595 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1596 const_offset % 16 / 4,
1597 const_offset % 16 / 4,
1598 const_offset % 16 / 4);
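 /* For illustration: a scalar at byte offset 20 within the block reads
  * vec4 slot 20/16 = 1, and 20%16/4 = 1 selects the Y component, so the
  * base swizzle is shifted to read .yyyy from the pulled vec4.
  */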
1599
1600 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1601 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1602 emit(CMP(result_dst, packed_consts, src_reg(0u),
1603 BRW_CONDITIONAL_NZ));
1604 emit(AND(result_dst, result, src_reg(0x1)));
1605 } else {
1606 emit(MOV(result_dst, packed_consts));
1607 }
1608 break;
1609 }
1610
1611 case ir_binop_vector_extract:
1612 assert(!"should have been lowered by vec_index_to_cond_assign");
1613 break;
1614
1615 case ir_triop_fma:
1616 op[0] = fix_3src_operand(op[0]);
1617 op[1] = fix_3src_operand(op[1]);
1618 op[2] = fix_3src_operand(op[2]);
1619 /* Note that the instruction's argument order is reversed from GLSL
1620 * and the IR.
1621 */
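 /* For example, fma(a, b, c) = a * b + c maps to MAD(dst, c, b, a), since
  * the hardware MAD computes src1 * src2 + src0.
  */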
1622 emit(MAD(result_dst, op[2], op[1], op[0]));
1623 break;
1624
1625 case ir_triop_lrp:
1626 op[0] = fix_3src_operand(op[0]);
1627 op[1] = fix_3src_operand(op[1]);
1628 op[2] = fix_3src_operand(op[2]);
1629 /* Note that the instruction's argument order is reversed from GLSL
1630 * and the IR.
1631 */
1632 emit(LRP(result_dst, op[2], op[1], op[0]));
1633 break;
1634
1635 case ir_triop_csel:
1636 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1637 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1638 inst->predicate = BRW_PREDICATE_NORMAL;
1639 break;
1640
1641 case ir_triop_bfi:
1642 op[0] = fix_3src_operand(op[0]);
1643 op[1] = fix_3src_operand(op[1]);
1644 op[2] = fix_3src_operand(op[2]);
1645 emit(BFI2(result_dst, op[0], op[1], op[2]));
1646 break;
1647
1648 case ir_triop_bitfield_extract:
1649 op[0] = fix_3src_operand(op[0]);
1650 op[1] = fix_3src_operand(op[1]);
1651 op[2] = fix_3src_operand(op[2]);
1652 /* Note that the instruction's argument order is reversed from GLSL
1653 * and the IR.
1654 */
1655 emit(BFE(result_dst, op[2], op[1], op[0]));
1656 break;
1657
1658 case ir_triop_vector_insert:
1659 assert(!"should have been lowered by lower_vector_insert");
1660 break;
1661
1662 case ir_quadop_bitfield_insert:
1663 assert(!"not reached: should be handled by "
1664 "bitfield_insert_to_bfm_bfi\n");
1665 break;
1666
1667 case ir_quadop_vector:
1668 assert(!"not reached: should be handled by lower_quadop_vector");
1669 break;
1670
1671 case ir_unop_pack_half_2x16:
1672 emit_pack_half_2x16(result_dst, op[0]);
1673 break;
1674 case ir_unop_unpack_half_2x16:
1675 emit_unpack_half_2x16(result_dst, op[0]);
1676 break;
1677 case ir_unop_pack_snorm_2x16:
1678 case ir_unop_pack_snorm_4x8:
1679 case ir_unop_pack_unorm_2x16:
1680 case ir_unop_pack_unorm_4x8:
1681 case ir_unop_unpack_snorm_2x16:
1682 case ir_unop_unpack_snorm_4x8:
1683 case ir_unop_unpack_unorm_2x16:
1684 case ir_unop_unpack_unorm_4x8:
1685 assert(!"not reached: should be handled by lower_packing_builtins");
1686 break;
1687 case ir_unop_unpack_half_2x16_split_x:
1688 case ir_unop_unpack_half_2x16_split_y:
1689 case ir_binop_pack_half_2x16_split:
1690 assert(!"not reached: should not occur in vertex shader");
1691 break;
1692 case ir_binop_ldexp:
1693 assert(!"not reached: should be handled by ldexp_to_arith()");
1694 break;
1695 }
1696 }
1697
1698
1699 void
1700 vec4_visitor::visit(ir_swizzle *ir)
1701 {
1702 src_reg src;
1703 int i = 0;
1704 int swizzle[4];
1705
1706 /* Note that this is only swizzles in expressions, not those on the left
1707 * hand side of an assignment, which do write masking. See ir_assignment
1708 * for that.
1709 */
1710
1711 ir->val->accept(this);
1712 src = this->result;
1713 assert(src.file != BAD_FILE);
1714
1715 for (i = 0; i < ir->type->vector_elements; i++) {
1716 switch (i) {
1717 case 0:
1718 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1719 break;
1720 case 1:
1721 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1722 break;
1723 case 2:
1724 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1725 break;
1726 case 3:
1727 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1728 break;
1729 }
1730 }
1731 for (; i < 4; i++) {
1732 /* Replicate the last channel out. */
1733 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1734 }
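 /* For example, if the source is a plain vec4 (swizzle xyzw) and the
  * expression is v.zy, the first loop produces {z, y} and the replication
  * above extends it to .zyyy.
  */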
1735
1736 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1737
1738 this->result = src;
1739 }
1740
1741 void
1742 vec4_visitor::visit(ir_dereference_variable *ir)
1743 {
1744 const struct glsl_type *type = ir->type;
1745 dst_reg *reg = variable_storage(ir->var);
1746
1747 if (!reg) {
1748 fail("Failed to find variable storage for %s\n", ir->var->name);
1749 this->result = src_reg(brw_null_reg());
1750 return;
1751 }
1752
1753 this->result = src_reg(*reg);
1754
1755 /* System values get their swizzle from the dst_reg writemask */
1756 if (ir->var->data.mode == ir_var_system_value)
1757 return;
1758
1759 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1760 this->result.swizzle = swizzle_for_size(type->vector_elements);
1761 }
1762
1763
1764 int
1765 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1766 {
1767 /* Under normal circumstances array elements are stored consecutively, so
1768 * the stride is equal to the size of the array element.
1769 */
1770 return type_size(ir->type);
1771 }
1772
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_array *ir)
1776 {
1777 ir_constant *constant_index;
1778 src_reg src;
1779 int array_stride = compute_array_stride(ir);
1780
1781 constant_index = ir->array_index->constant_expression_value();
1782
1783 ir->array->accept(this);
1784 src = this->result;
1785
1786 if (constant_index) {
1787 src.reg_offset += constant_index->value.i[0] * array_stride;
1788 } else {
1789 /* Variable index array dereference. It eats the "vec4" of the
1790 * base of the array and an index that offsets the Mesa register
1791 * index.
1792 */
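 /* For example, for an array of vec4s (stride 1) the index register is the
  * computed index itself, while for an array of mat4s (stride 4) a MUL by 4
  * is emitted first; either way the result ends up in src.reladdr below.
  */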
1793 ir->array_index->accept(this);
1794
1795 src_reg index_reg;
1796
1797 if (array_stride == 1) {
1798 index_reg = this->result;
1799 } else {
1800 index_reg = src_reg(this, glsl_type::int_type);
1801
1802 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1803 }
1804
1805 if (src.reladdr) {
1806 src_reg temp = src_reg(this, glsl_type::int_type);
1807
1808 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1809
1810 index_reg = temp;
1811 }
1812
1813 src.reladdr = ralloc(mem_ctx, src_reg);
1814 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1815 }
1816
1817 /* If the type is smaller than a vec4, replicate the last channel out. */
1818 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1819 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1820 else
1821 src.swizzle = BRW_SWIZZLE_NOOP;
1822 src.type = brw_type_for_base_type(ir->type);
1823
1824 this->result = src;
1825 }
1826
1827 void
1828 vec4_visitor::visit(ir_dereference_record *ir)
1829 {
1830 unsigned int i;
1831 const glsl_type *struct_type = ir->record->type;
1832 int offset = 0;
1833
1834 ir->record->accept(this);
1835
1836 for (i = 0; i < struct_type->length; i++) {
1837 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1838 break;
1839 offset += type_size(struct_type->fields.structure[i].type);
1840 }
1841
1842 /* If the type is smaller than a vec4, replicate the last channel out. */
1843 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1844 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1845 else
1846 this->result.swizzle = BRW_SWIZZLE_NOOP;
1847 this->result.type = brw_type_for_base_type(ir->type);
1848
1849 this->result.reg_offset += offset;
1850 }
1851
1852 /**
1853 * We want to be careful in assignment setup to hit the actual storage
1854 * instead of potentially using a temporary like we might with the
1855 * ir_dereference handler.
1856 */
1857 static dst_reg
1858 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1859 {
1860 /* The LHS must be a dereference. If the LHS is a variable indexed array
1861 * access of a vector, it must be separated into a series of conditional moves
1862 * before reaching this point (see ir_vec_index_to_cond_assign).
1863 */
1864 assert(ir->as_dereference());
1865 ir_dereference_array *deref_array = ir->as_dereference_array();
1866 if (deref_array) {
1867 assert(!deref_array->array->type->is_vector());
1868 }
1869
1870 /* Use the rvalue deref handler for the most part. We'll ignore
1871 * swizzles in it and write swizzles using writemask, though.
1872 */
1873 ir->accept(v);
1874 return dst_reg(v->result);
1875 }
1876
1877 void
1878 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1879 const struct glsl_type *type, uint32_t predicate)
1880 {
1881 if (type->base_type == GLSL_TYPE_STRUCT) {
1882 for (unsigned int i = 0; i < type->length; i++) {
1883 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1884 }
1885 return;
1886 }
1887
1888 if (type->is_array()) {
1889 for (unsigned int i = 0; i < type->length; i++) {
1890 emit_block_move(dst, src, type->fields.array, predicate);
1891 }
1892 return;
1893 }
1894
1895 if (type->is_matrix()) {
1896 const struct glsl_type *vec_type;
1897
1898 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1899 type->vector_elements, 1);
1900
1901 for (int i = 0; i < type->matrix_columns; i++) {
1902 emit_block_move(dst, src, vec_type, predicate);
1903 }
1904 return;
1905 }
1906
1907 assert(type->is_scalar() || type->is_vector());
1908
1909 dst->type = brw_type_for_base_type(type);
1910 src->type = dst->type;
1911
1912 dst->writemask = (1 << type->vector_elements) - 1;
1913
1914 src->swizzle = swizzle_for_size(type->vector_elements);
1915
1916 vec4_instruction *inst = emit(MOV(*dst, *src));
1917 inst->predicate = predicate;
1918
1919 dst->reg_offset++;
1920 src->reg_offset++;
1921 }
1922
1923
1924 /* If the RHS processing resulted in an instruction generating a
1925 * temporary value, and it would be easy to rewrite the instruction to
1926 * generate its result right into the LHS instead, do so. This ends
1927 * up reliably removing instructions where it can be tricky to do so
1928 * later without real UD chain information.
1929 */
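/* For example, for "x = a + b;" the RHS visit emits "add tmp, a, b" into a
 * temporary; when the checks below pass, that ADD's destination is simply
 * rewritten to x's register and no extra MOV is emitted.
 */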
1930 bool
1931 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1932 dst_reg dst,
1933 src_reg src,
1934 vec4_instruction *pre_rhs_inst,
1935 vec4_instruction *last_rhs_inst)
1936 {
1937 /* This could be supported, but it would take more smarts. */
1938 if (ir->condition)
1939 return false;
1940
1941 if (pre_rhs_inst == last_rhs_inst)
1942 return false; /* No instructions generated to work with. */
1943
1944 /* Make sure the last instruction generated our source reg. */
1945 if (src.file != GRF ||
1946 src.file != last_rhs_inst->dst.file ||
1947 src.reg != last_rhs_inst->dst.reg ||
1948 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1949 src.reladdr ||
1950 src.abs ||
1951 src.negate ||
1952 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1953 return false;
1954
1955    /* Check that the last instruction fully initialized the channels
1956 * we want to use, in the order we want to use them. We could
1957 * potentially reswizzle the operands of many instructions so that
1958 * we could handle out of order channels, but don't yet.
1959 */
1960
1961 for (unsigned i = 0; i < 4; i++) {
1962 if (dst.writemask & (1 << i)) {
1963 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1964 return false;
1965
1966 if (BRW_GET_SWZ(src.swizzle, i) != i)
1967 return false;
1968 }
1969 }
1970
1971 /* Success! Rewrite the instruction. */
1972 last_rhs_inst->dst.file = dst.file;
1973 last_rhs_inst->dst.reg = dst.reg;
1974 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1975 last_rhs_inst->dst.reladdr = dst.reladdr;
1976 last_rhs_inst->dst.writemask &= dst.writemask;
1977
1978 return true;
1979 }
1980
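/**
 * Handle an assignment.  Aggregate (struct/array/matrix) LHS types are
 * copied with emit_block_move(); scalar/vector assignments compact the RHS
 * swizzle into the enabled writemask channels and, when possible, fold the
 * RHS-producing instruction's destination straight into the LHS via
 * try_rewrite_rhs_to_dst() instead of emitting an extra MOV.
 */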
1981 void
1982 vec4_visitor::visit(ir_assignment *ir)
1983 {
1984 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1985 uint32_t predicate = BRW_PREDICATE_NONE;
1986
1987 if (!ir->lhs->type->is_scalar() &&
1988 !ir->lhs->type->is_vector()) {
1989 ir->rhs->accept(this);
1990 src_reg src = this->result;
1991
1992 if (ir->condition) {
1993 emit_bool_to_cond_code(ir->condition, &predicate);
1994 }
1995
1996 /* emit_block_move doesn't account for swizzles in the source register.
1997 * This should be ok, since the source register is a structure or an
1998 * array, and those can't be swizzled. But double-check to be sure.
1999 */
2000 assert(src.swizzle ==
2001 (ir->rhs->type->is_matrix()
2002 ? swizzle_for_size(ir->rhs->type->vector_elements)
2003 : BRW_SWIZZLE_NOOP));
2004
2005 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2006 return;
2007 }
2008
2009 /* Now we're down to just a scalar/vector with writemasks. */
2010 int i;
2011
2012 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2013 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2014
2015 ir->rhs->accept(this);
2016
2017 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2018
2019 src_reg src = this->result;
2020
2021 int swizzles[4];
2022 int first_enabled_chan = 0;
2023 int src_chan = 0;
2024
2025 assert(ir->lhs->type->is_vector() ||
2026 ir->lhs->type->is_scalar());
2027 dst.writemask = ir->write_mask;
2028
2029 for (int i = 0; i < 4; i++) {
2030 if (dst.writemask & (1 << i)) {
2031 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2032 break;
2033 }
2034 }
2035
2036 /* Swizzle a small RHS vector into the channels being written.
2037 *
2038     * GLSL IR treats write_mask as dictating how many channels are
2039 * present on the RHS while in our instructions we need to make
2040 * those channels appear in the slots of the vec4 they're written to.
2041 */
2042 for (int i = 0; i < 4; i++) {
2043 if (dst.writemask & (1 << i))
2044 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2045 else
2046 swizzles[i] = first_enabled_chan;
2047 }
2048 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2049 swizzles[2], swizzles[3]);
2050
2051 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2052 return;
2053 }
2054
2055 if (ir->condition) {
2056 emit_bool_to_cond_code(ir->condition, &predicate);
2057 }
2058
2059 for (i = 0; i < type_size(ir->lhs->type); i++) {
2060 vec4_instruction *inst = emit(MOV(dst, src));
2061 inst->predicate = predicate;
2062
2063 dst.reg_offset++;
2064 src.reg_offset++;
2065 }
2066 }
2067
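/**
 * Emit MOVs of immediate values for the given constant, recursing through
 * aggregate types.  For vectors, channels that hold the same value are
 * coalesced into a single writemasked MOV, and dst->reg_offset is advanced
 * past each vec4 written.
 */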
2068 void
2069 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2070 {
2071 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2072 foreach_list(node, &ir->components) {
2073 ir_constant *field_value = (ir_constant *)node;
2074
2075 emit_constant_values(dst, field_value);
2076 }
2077 return;
2078 }
2079
2080 if (ir->type->is_array()) {
2081 for (unsigned int i = 0; i < ir->type->length; i++) {
2082 emit_constant_values(dst, ir->array_elements[i]);
2083 }
2084 return;
2085 }
2086
2087 if (ir->type->is_matrix()) {
2088 for (int i = 0; i < ir->type->matrix_columns; i++) {
2089 float *vec = &ir->value.f[i * ir->type->vector_elements];
2090
2091 for (int j = 0; j < ir->type->vector_elements; j++) {
2092 dst->writemask = 1 << j;
2093 dst->type = BRW_REGISTER_TYPE_F;
2094
2095 emit(MOV(*dst, src_reg(vec[j])));
2096 }
2097 dst->reg_offset++;
2098 }
2099 return;
2100 }
2101
2102 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2103
2104 for (int i = 0; i < ir->type->vector_elements; i++) {
2105 if (!(remaining_writemask & (1 << i)))
2106 continue;
2107
2108 dst->writemask = 1 << i;
2109 dst->type = brw_type_for_base_type(ir->type);
2110
2111 /* Find other components that match the one we're about to
2112 * write. Emits fewer instructions for things like vec4(0.5,
2113 * 1.5, 1.5, 1.5).
2114 */
2115 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2116 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2117 if (ir->value.b[i] == ir->value.b[j])
2118 dst->writemask |= (1 << j);
2119 } else {
2120 /* u, i, and f storage all line up, so no need for a
2121 * switch case for comparing each type.
2122 */
2123 if (ir->value.u[i] == ir->value.u[j])
2124 dst->writemask |= (1 << j);
2125 }
2126 }
2127
2128 switch (ir->type->base_type) {
2129 case GLSL_TYPE_FLOAT:
2130 emit(MOV(*dst, src_reg(ir->value.f[i])));
2131 break;
2132 case GLSL_TYPE_INT:
2133 emit(MOV(*dst, src_reg(ir->value.i[i])));
2134 break;
2135 case GLSL_TYPE_UINT:
2136 emit(MOV(*dst, src_reg(ir->value.u[i])));
2137 break;
2138 case GLSL_TYPE_BOOL:
2139 emit(MOV(*dst, src_reg(ir->value.b[i])));
2140 break;
2141 default:
2142 assert(!"Non-float/uint/int/bool constant");
2143 break;
2144 }
2145
2146 remaining_writemask &= ~dst->writemask;
2147 }
2148 dst->reg_offset++;
2149 }
2150
2151 void
2152 vec4_visitor::visit(ir_constant *ir)
2153 {
2154 dst_reg dst = dst_reg(this, ir->type);
2155 this->result = src_reg(dst);
2156
2157 emit_constant_values(&dst, ir);
2158 }
2159
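/**
 * Lower an atomic counter intrinsic call: compute the ABO surface index and
 * the byte offset of the counter within it, then emit either an untyped
 * surface read or the matching untyped atomic (INC/PREDEC) into the call's
 * return value.
 */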
2160 void
2161 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2162 {
2163 ir_dereference *deref = static_cast<ir_dereference *>(
2164 ir->actual_parameters.get_head());
2165 ir_variable *location = deref->variable_referenced();
2166 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2167 location->data.atomic.buffer_index);
2168
2169 /* Calculate the surface offset */
2170 src_reg offset(this, glsl_type::uint_type);
2171 ir_dereference_array *deref_array = deref->as_dereference_array();
2172 if (deref_array) {
2173 deref_array->array_index->accept(this);
2174
2175 src_reg tmp(this, glsl_type::uint_type);
2176 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2177 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2178 } else {
2179 offset = location->data.atomic.offset;
2180 }
2181
2182 /* Emit the appropriate machine instruction */
2183 const char *callee = ir->callee->function_name();
2184 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2185
2186 if (!strcmp("__intrinsic_atomic_read", callee)) {
2187 emit_untyped_surface_read(surf_index, dst, offset);
2188
2189 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2190 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2191 src_reg(), src_reg());
2192
2193 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2194 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2195 src_reg(), src_reg());
2196 }
2197 }
2198
2199 void
2200 vec4_visitor::visit(ir_call *ir)
2201 {
2202 const char *callee = ir->callee->function_name();
2203
2204 if (!strcmp("__intrinsic_atomic_read", callee) ||
2205 !strcmp("__intrinsic_atomic_increment", callee) ||
2206 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2207 visit_atomic_counter_intrinsic(ir);
2208 } else {
2209 assert(!"Unsupported intrinsic.");
2210 }
2211 }
2212
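/**
 * Emit a SHADER_OPCODE_TXF_MCS message to fetch the MCS (multisample
 * control surface) data for the given coordinate, returning a uvec4
 * temporary that the following compressed-multisample texel fetch uses.
 */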
2213 src_reg
2214 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2215 {
2216 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2217 inst->base_mrf = 2;
2218 inst->mlen = 1;
2219 inst->sampler = sampler;
2220 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2221 inst->dst.writemask = WRITEMASK_XYZW;
2222
2223    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2224 int param_base = inst->base_mrf;
2225 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2226 int zero_mask = 0xf & ~coord_mask;
2227
2228 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2229 coordinate));
2230
2231 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2232 src_reg(0)));
2233
2234 emit(inst);
2235 return src_reg(inst->dst);
2236 }
2237
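/**
 * Translate an ir_texture into the corresponding sampler message: evaluate
 * all operands up front (they may need SEND messages of their own), pick
 * the opcode for the texturing op, then pack the coordinate, LOD or
 * gradients, shadow comparitor and offsets into the message MRFs.
 */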
2238 void
2239 vec4_visitor::visit(ir_texture *ir)
2240 {
2241 int sampler =
2242 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2243
2244 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2245 * emitting anything other than setting up the constant result.
2246 */
2247 if (ir->op == ir_tg4) {
2248 ir_constant *chan = ir->lod_info.component->as_constant();
2249 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2250 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2251 dst_reg result(this, ir->type);
2252 this->result = src_reg(result);
2253 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2254 return;
2255 }
2256 }
2257
2258 /* Should be lowered by do_lower_texture_projection */
2259 assert(!ir->projector);
2260
2261 /* Should be lowered */
2262 assert(!ir->offset || !ir->offset->type->is_array());
2263
2264 /* Generate code to compute all the subexpression trees. This has to be
2265 * done before loading any values into MRFs for the sampler message since
2266 * generating these values may involve SEND messages that need the MRFs.
2267 */
2268 src_reg coordinate;
2269 if (ir->coordinate) {
2270 ir->coordinate->accept(this);
2271 coordinate = this->result;
2272 }
2273
2274 src_reg shadow_comparitor;
2275 if (ir->shadow_comparitor) {
2276 ir->shadow_comparitor->accept(this);
2277 shadow_comparitor = this->result;
2278 }
2279
2280 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2281 src_reg offset_value;
2282 if (has_nonconstant_offset) {
2283 ir->offset->accept(this);
2284 offset_value = src_reg(this->result);
2285 }
2286
2287 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2288 src_reg lod, dPdx, dPdy, sample_index, mcs;
2289 switch (ir->op) {
2290 case ir_tex:
2291 lod = src_reg(0.0f);
2292 lod_type = glsl_type::float_type;
2293 break;
2294 case ir_txf:
2295 case ir_txl:
2296 case ir_txs:
2297 ir->lod_info.lod->accept(this);
2298 lod = this->result;
2299 lod_type = ir->lod_info.lod->type;
2300 break;
2301 case ir_query_levels:
2302 lod = src_reg(0);
2303 lod_type = glsl_type::int_type;
2304 break;
2305 case ir_txf_ms:
2306 ir->lod_info.sample_index->accept(this);
2307 sample_index = this->result;
2308 sample_index_type = ir->lod_info.sample_index->type;
2309
2310 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2311 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2312 else
2313 mcs = src_reg(0u);
2314 break;
2315 case ir_txd:
2316 ir->lod_info.grad.dPdx->accept(this);
2317 dPdx = this->result;
2318
2319 ir->lod_info.grad.dPdy->accept(this);
2320 dPdy = this->result;
2321
2322 lod_type = ir->lod_info.grad.dPdx->type;
2323 break;
2324 case ir_txb:
2325 case ir_lod:
2326 case ir_tg4:
2327 break;
2328 }
2329
2330 vec4_instruction *inst = NULL;
2331 switch (ir->op) {
2332 case ir_tex:
2333 case ir_txl:
2334 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2335 break;
2336 case ir_txd:
2337 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2338 break;
2339 case ir_txf:
2340 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2341 break;
2342 case ir_txf_ms:
2343 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2344 break;
2345 case ir_txs:
2346 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2347 break;
2348 case ir_tg4:
2349 if (has_nonconstant_offset)
2350 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2351 else
2352 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2353 break;
2354 case ir_query_levels:
2355 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2356 break;
2357 case ir_txb:
2358 assert(!"TXB is not valid for vertex shaders.");
2359 break;
2360 case ir_lod:
2361 assert(!"LOD is not valid for vertex shaders.");
2362 break;
2363 default:
2364 assert(!"Unrecognized tex op");
2365 }
2366
2367 if (ir->offset != NULL && ir->op != ir_txf)
2368 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2369
2370 /* Stuff the channel select bits in the top of the texture offset */
2371 if (ir->op == ir_tg4)
2372 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2373
2374 /* The message header is necessary for:
2375 * - Gen4 (always)
2376 * - Texel offsets
2377 * - Gather channel selection
2378 * - Sampler indices too large to fit in a 4-bit value.
2379 */
2380 inst->header_present =
2381 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2382 sampler >= 16;
2383 inst->base_mrf = 2;
2384 inst->mlen = inst->header_present + 1; /* always at least one */
2385 inst->sampler = sampler;
2386 inst->dst = dst_reg(this, ir->type);
2387 inst->dst.writemask = WRITEMASK_XYZW;
2388 inst->shadow_compare = ir->shadow_comparitor != NULL;
2389
2390 /* MRF for the first parameter */
2391 int param_base = inst->base_mrf + inst->header_present;
2392
2393 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2394 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2395 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2396 } else {
2397 /* Load the coordinate */
2398 /* FINISHME: gl_clamp_mask and saturate */
2399 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2400 int zero_mask = 0xf & ~coord_mask;
2401
2402 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2403 coordinate));
2404
2405 if (zero_mask != 0) {
2406 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2407 src_reg(0)));
2408 }
2409 /* Load the shadow comparitor */
2410 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2411 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2412 WRITEMASK_X),
2413 shadow_comparitor));
2414 inst->mlen++;
2415 }
2416
2417 /* Load the LOD info */
2418 if (ir->op == ir_tex || ir->op == ir_txl) {
2419 int mrf, writemask;
2420 if (brw->gen >= 5) {
2421 mrf = param_base + 1;
2422 if (ir->shadow_comparitor) {
2423 writemask = WRITEMASK_Y;
2424 /* mlen already incremented */
2425 } else {
2426 writemask = WRITEMASK_X;
2427 inst->mlen++;
2428 }
2429 } else /* brw->gen == 4 */ {
2430 mrf = param_base;
2431 writemask = WRITEMASK_W;
2432 }
2433 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2434 } else if (ir->op == ir_txf) {
2435 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2436 } else if (ir->op == ir_txf_ms) {
2437 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2438 sample_index));
2439 if (brw->gen >= 7)
2440 /* MCS data is in the first channel of `mcs`, but we need to get it into
2441 * the .y channel of the second vec4 of params, so replicate .x across
2442 * the whole vec4 and then mask off everything except .y
2443 */
2444 mcs.swizzle = BRW_SWIZZLE_XXXX;
2445 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2446 mcs));
2447 inst->mlen++;
2448 } else if (ir->op == ir_txd) {
2449 const glsl_type *type = lod_type;
2450
2451 if (brw->gen >= 5) {
2452 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2453 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2454 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2455 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2456 inst->mlen++;
2457
2458 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2459 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2460 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2461 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2462 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2463 inst->mlen++;
2464
2465 if (ir->shadow_comparitor) {
2466 emit(MOV(dst_reg(MRF, param_base + 2,
2467 ir->shadow_comparitor->type, WRITEMASK_Z),
2468 shadow_comparitor));
2469 }
2470 }
2471 } else /* brw->gen == 4 */ {
2472 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2473 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2474 inst->mlen += 2;
2475 }
2476 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2477 if (ir->shadow_comparitor) {
2478 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2479 shadow_comparitor));
2480 }
2481
2482 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2483 offset_value));
2484 inst->mlen++;
2485 }
2486 }
2487
2488 emit(inst);
2489
2490 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2491 * spec requires layers.
2492 */
2493 if (ir->op == ir_txs) {
2494 glsl_type const *type = ir->sampler->type;
2495 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2496 type->sampler_array) {
2497 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2498 with_writemask(inst->dst, WRITEMASK_Z),
2499 src_reg(inst->dst), src_reg(6));
2500 }
2501 }
2502
2503 if (brw->gen == 6 && ir->op == ir_tg4) {
2504 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2505 }
2506
2507 swizzle_result(ir, src_reg(inst->dst), sampler);
2508 }
2509
2510 /**
2511 * Apply workarounds for Gen6 gather with UINT/SINT
2512 */
2513 void
2514 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2515 {
2516 if (!wa)
2517 return;
2518
2519 int width = (wa & WA_8BIT) ? 8 : 16;
2520 dst_reg dst_f = dst;
2521 dst_f.type = BRW_REGISTER_TYPE_F;
2522
2523 /* Convert from UNORM to UINT */
2524 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2525 emit(MOV(dst, src_reg(dst_f)));
2526
2527 if (wa & WA_SIGN) {
2528 /* Reinterpret the UINT value as a signed INT value by
2529 * shifting the sign bit into place, then shifting back
2530 * preserving sign.
2531 */
2532 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2533 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2534 }
2535 }
2536
2537 /**
2538 * Set up the gather channel based on the swizzle, for gather4.
2539 */
2540 uint32_t
2541 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2542 {
2543 ir_constant *chan = ir->lod_info.component->as_constant();
2544 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2545 switch (swiz) {
2546 case SWIZZLE_X: return 0;
2547 case SWIZZLE_Y:
2548 /* gather4 sampler is broken for green channel on RG32F --
2549 * we must ask for blue instead.
2550 */
2551 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2552 return 2;
2553 return 1;
2554 case SWIZZLE_Z: return 2;
2555 case SWIZZLE_W: return 3;
2556 default:
2557 assert(!"Not reached"); /* zero, one swizzles handled already */
2558 return 0;
2559 }
2560 }
2561
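/**
 * Apply the texture swizzle from the program key to the raw sampler result,
 * writing the swizzled value into this->result.  SWIZZLE_ZERO/SWIZZLE_ONE
 * components become immediate MOVs of 0.0f/1.0f; txs, tg4 and float-typed
 * results skip swizzling entirely.
 */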
2562 void
2563 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2564 {
2565 int s = key->tex.swizzles[sampler];
2566
2567 this->result = src_reg(this, ir->type);
2568 dst_reg swizzled_result(this->result);
2569
2570 if (ir->op == ir_query_levels) {
2571 /* # levels is in .w */
2572 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2573 emit(MOV(swizzled_result, orig_val));
2574 return;
2575 }
2576
2577 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2578 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2579 emit(MOV(swizzled_result, orig_val));
2580 return;
2581 }
2582
2583
2584 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2585 int swizzle[4] = {0};
2586
2587 for (int i = 0; i < 4; i++) {
2588 switch (GET_SWZ(s, i)) {
2589 case SWIZZLE_ZERO:
2590 zero_mask |= (1 << i);
2591 break;
2592 case SWIZZLE_ONE:
2593 one_mask |= (1 << i);
2594 break;
2595 default:
2596 copy_mask |= (1 << i);
2597 swizzle[i] = GET_SWZ(s, i);
2598 break;
2599 }
2600 }
2601
2602 if (copy_mask) {
2603 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2604 swizzled_result.writemask = copy_mask;
2605 emit(MOV(swizzled_result, orig_val));
2606 }
2607
2608 if (zero_mask) {
2609 swizzled_result.writemask = zero_mask;
2610 emit(MOV(swizzled_result, src_reg(0.0f)));
2611 }
2612
2613 if (one_mask) {
2614 swizzled_result.writemask = one_mask;
2615 emit(MOV(swizzled_result, src_reg(1.0f)));
2616 }
2617 }
2618
2619 void
2620 vec4_visitor::visit(ir_return *ir)
2621 {
2622 assert(!"not reached");
2623 }
2624
2625 void
2626 vec4_visitor::visit(ir_discard *ir)
2627 {
2628 assert(!"not reached");
2629 }
2630
2631 void
2632 vec4_visitor::visit(ir_if *ir)
2633 {
2634 /* Don't point the annotation at the if statement, because then it plus
2635 * the then and else blocks get printed.
2636 */
2637 this->base_ir = ir->condition;
2638
2639 if (brw->gen == 6) {
2640 emit_if_gen6(ir);
2641 } else {
2642 uint32_t predicate;
2643 emit_bool_to_cond_code(ir->condition, &predicate);
2644 emit(IF(predicate));
2645 }
2646
2647 visit_instructions(&ir->then_instructions);
2648
2649 if (!ir->else_instructions.is_empty()) {
2650 this->base_ir = ir->condition;
2651 emit(BRW_OPCODE_ELSE);
2652
2653 visit_instructions(&ir->else_instructions);
2654 }
2655
2656 this->base_ir = ir->condition;
2657 emit(BRW_OPCODE_ENDIF);
2658 }
2659
2660 void
2661 vec4_visitor::visit(ir_emit_vertex *)
2662 {
2663 assert(!"not reached");
2664 }
2665
2666 void
2667 vec4_visitor::visit(ir_end_primitive *)
2668 {
2669 assert(!"not reached");
2670 }
2671
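/**
 * Build the MRF payload for an untyped atomic (the offset in .x of the
 * first register, then any operands) and emit the
 * SHADER_OPCODE_UNTYPED_ATOMIC message against the given surface.
 */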
2672 void
2673 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2674 dst_reg dst, src_reg offset,
2675 src_reg src0, src_reg src1)
2676 {
2677 unsigned mlen = 0;
2678
2679 /* Set the atomic operation offset. */
2680 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2681 mlen++;
2682
2683 /* Set the atomic operation arguments. */
2684 if (src0.file != BAD_FILE) {
2685 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2686 mlen++;
2687 }
2688
2689 if (src1.file != BAD_FILE) {
2690 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2691 mlen++;
2692 }
2693
2694 /* Emit the instruction. Note that this maps to the normal SIMD8
2695 * untyped atomic message on Ivy Bridge, but that's OK because
2696 * unused channels will be masked out.
2697 */
2698 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2699 src_reg(atomic_op), src_reg(surf_index));
2700 inst->base_mrf = 0;
2701 inst->mlen = mlen;
2702 }
2703
2704 void
2705 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2706 src_reg offset)
2707 {
2708 /* Set the surface read offset. */
2709 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2710
2711 /* Emit the instruction. Note that this maps to the normal SIMD8
2712 * untyped surface read message, but that's OK because unused
2713 * channels will be masked out.
2714 */
2715 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2716 dst, src_reg(surf_index));
2717 inst->base_mrf = 0;
2718 inst->mlen = 1;
2719 }
2720
2721 void
2722 vec4_visitor::emit_ndc_computation()
2723 {
2724 /* Get the position */
2725 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2726
2727 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2728 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2729 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2730
2731 current_annotation = "NDC";
2732 dst_reg ndc_w = ndc;
2733 ndc_w.writemask = WRITEMASK_W;
2734 src_reg pos_w = pos;
2735 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2736 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2737
2738 dst_reg ndc_xyz = ndc;
2739 ndc_xyz.writemask = WRITEMASK_XYZ;
2740
2741 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2742 }
2743
2744 void
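/**
 * Populate the PSIZ slot of the VUE header.  On gen4/5 this packs the point
 * size together with the user clip and negative-rhw workaround flag bits;
 * on gen6+ it writes point size (.w), layer (.y) and viewport index (.z)
 * directly into the header register.
 */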
2745 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2746 {
2747 if (brw->gen < 6 &&
2748 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2749 key->userclip_active || brw->has_negative_rhw_bug)) {
2750 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2751 dst_reg header1_w = header1;
2752 header1_w.writemask = WRITEMASK_W;
2753
2754 emit(MOV(header1, 0u));
2755
2756 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2757 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2758
2759 current_annotation = "Point size";
2760 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2761 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2762 }
2763
2764 if (key->userclip_active) {
2765 current_annotation = "Clipping flags";
2766 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2767 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2768
2769 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2770 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2771 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2772
2773 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2774 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2775 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2776 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2777 }
2778
2779 /* i965 clipping workaround:
2780 * 1) Test for -ve rhw
2781 * 2) If set,
2782 * set ndc = (0,0,0,0)
2783 * set ucp[6] = 1
2784 *
2785 * Later, clipping will detect ucp[6] and ensure the primitive is
2786 * clipped against all fixed planes.
2787 */
2788 if (brw->has_negative_rhw_bug) {
2789 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2790 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2791 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2792 vec4_instruction *inst;
2793 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2794 inst->predicate = BRW_PREDICATE_NORMAL;
2795 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2796 inst->predicate = BRW_PREDICATE_NORMAL;
2797 }
2798
2799 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2800 } else if (brw->gen < 6) {
2801 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2802 } else {
2803 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2804 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2805 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2806 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2807 }
2808 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2809 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2810 src_reg(output_reg[VARYING_SLOT_LAYER])));
2811 }
2812 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2813 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2814 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2815 }
2816 }
2817 }
2818
2819 void
2820 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2821 {
2822 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2823 *
2824 * "If a linked set of shaders forming the vertex stage contains no
2825 * static write to gl_ClipVertex or gl_ClipDistance, but the
2826 * application has requested clipping against user clip planes through
2827 * the API, then the coordinate written to gl_Position is used for
2828 * comparison against the user clip planes."
2829 *
2830 * This function is only called if the shader didn't write to
2831 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2832 * if the user wrote to it; otherwise we use gl_Position.
2833 */
2834 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2835 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2836 clip_vertex = VARYING_SLOT_POS;
2837 }
2838
2839 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2840 ++i) {
2841 reg.writemask = 1 << i;
2842 emit(DP4(reg,
2843 src_reg(output_reg[clip_vertex]),
2844 src_reg(this->userplane[i + offset])));
2845 }
2846 }
2847
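/**
 * Copy a generic varying from its output register into the URB message
 * register, saturating the fixed-function color varyings when the key
 * requests vertex color clamping.
 */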
2848 void
2849 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2850 {
2851 assert (varying < VARYING_SLOT_MAX);
2852 reg.type = output_reg[varying].type;
2853 current_annotation = output_reg_annotation[varying];
2854 /* Copy the register, saturating if necessary */
2855 vec4_instruction *inst = emit(MOV(reg,
2856 src_reg(output_reg[varying])));
2857 if ((varying == VARYING_SLOT_COL0 ||
2858 varying == VARYING_SLOT_COL1 ||
2859 varying == VARYING_SLOT_BFC0 ||
2860 varying == VARYING_SLOT_BFC1) &&
2861 key->clamp_vertex_color) {
2862 inst->saturate = true;
2863 }
2864 }
2865
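/**
 * Emit the MOV(s) that populate one VUE slot in message register @mrf,
 * handling the specially formatted slots (PSIZ header, NDC, position, edge
 * flag, padding) and falling back to a generic varying copy otherwise.
 */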
2866 void
2867 vec4_visitor::emit_urb_slot(int mrf, int varying)
2868 {
2869 struct brw_reg hw_reg = brw_message_reg(mrf);
2870 dst_reg reg = dst_reg(MRF, mrf);
2871 reg.type = BRW_REGISTER_TYPE_F;
2872
2873 switch (varying) {
2874 case VARYING_SLOT_PSIZ:
2875 /* PSIZ is always in slot 0, and is coupled with other flags. */
2876 current_annotation = "indices, point width, clip flags";
2877 emit_psiz_and_flags(hw_reg);
2878 break;
2879 case BRW_VARYING_SLOT_NDC:
2880 current_annotation = "NDC";
2881 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2882 break;
2883 case VARYING_SLOT_POS:
2884 current_annotation = "gl_Position";
2885 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2886 break;
2887 case VARYING_SLOT_EDGE:
2888 /* This is present when doing unfilled polygons. We're supposed to copy
2889 * the edge flag from the user-provided vertex array
2890       * (glEdgeFlagPointer); otherwise we copy from the current value
2891 * of that attribute (starts as 1.0f). This is then used in clipping to
2892 * determine which edges should be drawn as wireframe.
2893 */
2894 current_annotation = "edge flag";
2895 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2896 glsl_type::float_type, WRITEMASK_XYZW))));
2897 break;
2898 case BRW_VARYING_SLOT_PAD:
2899 /* No need to write to this slot */
2900 break;
2901 default:
2902 emit_generic_urb_slot(reg, varying);
2903 break;
2904 }
2905 }
2906
2907 static int
2908 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2909 {
2910 if (brw->gen >= 6) {
2911 /* URB data written (does not include the message header reg) must
2912 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2913 * section 5.4.3.2.2: URB_INTERLEAVED.
2914 *
2915 * URB entries are allocated on a multiple of 1024 bits, so an
2916 * extra 128 bits written here to make the end align to 256 is
2917 * no problem.
2918 */
2919 if ((mlen % 2) != 1)
2920 mlen++;
2921 }
2922
2923 return mlen;
2924 }
2925
2926
2927 /**
2928 * Generates the VUE payload plus the necessary URB write instructions to
2929 * output it.
2930 *
2931 * The VUE layout is documented in Volume 2a.
2932 */
2933 void
2934 vec4_visitor::emit_vertex()
2935 {
2936 /* MRF 0 is reserved for the debugger, so start with message header
2937 * in MRF 1.
2938 */
2939 int base_mrf = 1;
2940 int mrf = base_mrf;
2941 /* In the process of generating our URB write message contents, we
2942 * may need to unspill a register or load from an array. Those
2943 * reads would use MRFs 14-15.
2944 */
2945 int max_usable_mrf = 13;
2946
2947 /* The following assertion verifies that max_usable_mrf causes an
2948 * even-numbered amount of URB write data, which will meet gen6's
2949 * requirements for length alignment.
2950 */
2951 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2952
2953 /* First mrf is the g0-based message header containing URB handles and
2954 * such.
2955 */
2956 emit_urb_write_header(mrf++);
2957
2958 if (brw->gen < 6) {
2959 emit_ndc_computation();
2960 }
2961
2962 /* Lower legacy ff and ClipVertex clipping to clip distances */
2963 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2964 current_annotation = "user clip distances";
2965
2966 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2967 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2968
2969 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2970 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2971 }
2972
2973 /* We may need to split this up into several URB writes, so do them in a
2974 * loop.
2975 */
2976 int slot = 0;
2977 bool complete = false;
2978 do {
2979 /* URB offset is in URB row increments, and each of our MRFs is half of
2980 * one of those, since we're doing interleaved writes.
2981 */
2982 int offset = slot / 2;
2983
2984 mrf = base_mrf + 1;
2985 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2986 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2987
2988 /* If this was max_usable_mrf, we can't fit anything more into this
2989 * URB WRITE.
2990 */
2991 if (mrf > max_usable_mrf) {
2992 slot++;
2993 break;
2994 }
2995 }
2996
2997 complete = slot >= prog_data->vue_map.num_slots;
2998 current_annotation = "URB write";
2999 vec4_instruction *inst = emit_urb_write_opcode(complete);
3000 inst->base_mrf = base_mrf;
3001 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3002 inst->offset += offset;
3003 } while(!complete);
3004 }
3005
3006
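/**
 * Compute the message header offset for a scratch access at @reg_offset,
 * scaled by 2 for the interleaved vec4 layout (and to bytes on gen4/5).
 * With a relative address the scaling is done in a temporary emitted
 * before @inst.
 */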
3007 src_reg
3008 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3009 src_reg *reladdr, int reg_offset)
3010 {
3011 /* Because we store the values to scratch interleaved like our
3012 * vertex data, we need to scale the vec4 index by 2.
3013 */
3014 int message_header_scale = 2;
3015
3016 /* Pre-gen6, the message header uses byte offsets instead of vec4
3017 * (16-byte) offset units.
3018 */
3019 if (brw->gen < 6)
3020 message_header_scale *= 16;
3021
3022 if (reladdr) {
3023 src_reg index = src_reg(this, glsl_type::int_type);
3024
3025 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3026 emit_before(inst, MUL(dst_reg(index),
3027 index, src_reg(message_header_scale)));
3028
3029 return index;
3030 } else {
3031 return src_reg(reg_offset * message_header_scale);
3032 }
3033 }
3034
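/**
 * Compute the offset source for a pull constant load.  Relative addressing
 * gets an ADD (plus a byte scale on gen4/5) emitted before @inst; gen8+
 * puts even constant offsets in a GRF so the message can send from GRF.
 */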
3035 src_reg
3036 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3037 src_reg *reladdr, int reg_offset)
3038 {
3039 if (reladdr) {
3040 src_reg index = src_reg(this, glsl_type::int_type);
3041
3042 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3043
3044 /* Pre-gen6, the message header uses byte offsets instead of vec4
3045 * (16-byte) offset units.
3046 */
3047 if (brw->gen < 6) {
3048 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3049 }
3050
3051 return index;
3052 } else if (brw->gen >= 8) {
3053 /* Store the offset in a GRF so we can send-from-GRF. */
3054 src_reg offset = src_reg(this, glsl_type::int_type);
3055 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3056 return offset;
3057 } else {
3058 int message_header_scale = brw->gen < 6 ? 16 : 1;
3059 return src_reg(reg_offset * message_header_scale);
3060 }
3061 }
3062
3063 /**
3064 * Emits an instruction before @inst to load the value named by @orig_src
3065 * from scratch space at @base_offset to @temp.
3066 *
3067 * @base_offset is measured in 32-byte units (the size of a register).
3068 */
3069 void
3070 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3071 dst_reg temp, src_reg orig_src,
3072 int base_offset)
3073 {
3074 int reg_offset = base_offset + orig_src.reg_offset;
3075 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3076
3077 emit_before(inst, SCRATCH_READ(temp, index));
3078 }
3079
3080 /**
3081 * Emits an instruction after @inst to store the value to be written
3082 * to @orig_dst to scratch space at @base_offset, from @temp.
3083 *
3084 * @base_offset is measured in 32-byte units (the size of a register).
3085 */
3086 void
3087 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3088 {
3089 int reg_offset = base_offset + inst->dst.reg_offset;
3090 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3091
3092 /* Create a temporary register to store *inst's result in.
3093 *
3094 * We have to be careful in MOVing from our temporary result register in
3095 * the scratch write. If we swizzle from channels of the temporary that
3096 * weren't initialized, it will confuse live interval analysis, which will
3097 * make spilling fail to make progress.
3098 */
3099 src_reg temp = src_reg(this, glsl_type::vec4_type);
3100 temp.type = inst->dst.type;
3101 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3102 int swizzles[4];
3103 for (int i = 0; i < 4; i++)
3104 if (inst->dst.writemask & (1 << i))
3105 swizzles[i] = i;
3106 else
3107 swizzles[i] = first_writemask_chan;
3108 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3109 swizzles[2], swizzles[3]);
3110
3111 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3112 inst->dst.writemask));
3113 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3114 write->predicate = inst->predicate;
3115 write->ir = inst->ir;
3116 write->annotation = inst->annotation;
3117 inst->insert_after(write);
3118
3119 inst->dst.file = temp.file;
3120 inst->dst.reg = temp.reg;
3121 inst->dst.reg_offset = temp.reg_offset;
3122 inst->dst.reladdr = NULL;
3123 }
3124
3125 /**
3126 * We can't generally support array access in GRF space, because a
3127 * single instruction's destination can only span 2 contiguous
3128 * registers. So, we send all GRF arrays that get variable index
3129 * access to scratch space.
3130 */
3131 void
3132 vec4_visitor::move_grf_array_access_to_scratch()
3133 {
3134 int scratch_loc[this->virtual_grf_count];
3135
3136 for (int i = 0; i < this->virtual_grf_count; i++) {
3137 scratch_loc[i] = -1;
3138 }
3139
3140 /* First, calculate the set of virtual GRFs that need to be punted
3141 * to scratch due to having any array access on them, and where in
3142 * scratch.
3143 */
3144 foreach_list(node, &this->instructions) {
3145 vec4_instruction *inst = (vec4_instruction *)node;
3146
3147 if (inst->dst.file == GRF && inst->dst.reladdr &&
3148 scratch_loc[inst->dst.reg] == -1) {
3149 scratch_loc[inst->dst.reg] = c->last_scratch;
3150 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3151 }
3152
3153 for (int i = 0 ; i < 3; i++) {
3154 src_reg *src = &inst->src[i];
3155
3156 if (src->file == GRF && src->reladdr &&
3157 scratch_loc[src->reg] == -1) {
3158 scratch_loc[src->reg] = c->last_scratch;
3159 c->last_scratch += this->virtual_grf_sizes[src->reg];
3160 }
3161 }
3162 }
3163
3164 /* Now, for anything that will be accessed through scratch, rewrite
3165 * it to load/store. Note that this is a _safe list walk, because
3166 * we may generate a new scratch_write instruction after the one
3167 * we're processing.
3168 */
3169 foreach_list_safe(node, &this->instructions) {
3170 vec4_instruction *inst = (vec4_instruction *)node;
3171
3172       /* Set up the annotation tracking for newly generated instructions. */
3173 base_ir = inst->ir;
3174 current_annotation = inst->annotation;
3175
3176 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3177 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3178 }
3179
3180 for (int i = 0 ; i < 3; i++) {
3181 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3182 continue;
3183
3184 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3185
3186 emit_scratch_read(inst, temp, inst->src[i],
3187 scratch_loc[inst->src[i].reg]);
3188
3189 inst->src[i].file = temp.file;
3190 inst->src[i].reg = temp.reg;
3191 inst->src[i].reg_offset = temp.reg_offset;
3192 inst->src[i].reladdr = NULL;
3193 }
3194 }
3195 }
3196
3197 /**
3198 * Emits an instruction before @inst to load the value named by @orig_src
3199 * from the pull constant buffer (surface) at @base_offset to @temp.
3200 */
3201 void
3202 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3203 dst_reg temp, src_reg orig_src,
3204 int base_offset)
3205 {
3206 int reg_offset = base_offset + orig_src.reg_offset;
3207 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3208 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3209 vec4_instruction *load;
3210
3211 if (brw->gen >= 7) {
3212 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3213 grf_offset.type = offset.type;
3214 emit_before(inst, MOV(grf_offset, offset));
3215
3216 load = new(mem_ctx) vec4_instruction(this,
3217 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3218 temp, index, src_reg(grf_offset));
3219 } else {
3220 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3221 temp, index, offset);
3222 load->base_mrf = 14;
3223 load->mlen = 1;
3224 }
3225 emit_before(inst, load);
3226 }
3227
3228 /**
3229 * Implements array access of uniforms by inserting a
3230 * PULL_CONSTANT_LOAD instruction.
3231 *
3232 * Unlike temporary GRF array access (where we don't support it due to
3233 * the difficulty of doing relative addressing on instruction
3234 * destinations), we could potentially do array access of uniforms
3235 * that were loaded in GRF space as push constants. In real-world
3236 * usage we've seen, though, the arrays being used are always larger
3237 * than we could load as push constants, so just always move all
3238 * uniform array access out to a pull constant buffer.
3239 */
3240 void
3241 vec4_visitor::move_uniform_array_access_to_pull_constants()
3242 {
3243 int pull_constant_loc[this->uniforms];
3244
3245 for (int i = 0; i < this->uniforms; i++) {
3246 pull_constant_loc[i] = -1;
3247 }
3248
3249 /* Walk through and find array access of uniforms. Put a copy of that
3250 * uniform in the pull constant buffer.
3251 *
3252 * Note that we don't move constant-indexed accesses to arrays. No
3253 * testing has been done of the performance impact of this choice.
3254 */
3255 foreach_list_safe(node, &this->instructions) {
3256 vec4_instruction *inst = (vec4_instruction *)node;
3257
3258 for (int i = 0 ; i < 3; i++) {
3259 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3260 continue;
3261
3262 int uniform = inst->src[i].reg;
3263
3264 /* If this array isn't already present in the pull constant buffer,
3265 * add it.
3266 */
3267 if (pull_constant_loc[uniform] == -1) {
3268 const float **values = &stage_prog_data->param[uniform * 4];
3269
3270 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3271
3272 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3273 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3274 = values[j];
3275 }
3276 }
3277
3278          /* Set up the annotation tracking for newly generated instructions. */
3279 base_ir = inst->ir;
3280 current_annotation = inst->annotation;
3281
3282 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3283
3284 emit_pull_constant_load(inst, temp, inst->src[i],
3285 pull_constant_loc[uniform]);
3286
3287 inst->src[i].file = temp.file;
3288 inst->src[i].reg = temp.reg;
3289 inst->src[i].reg_offset = temp.reg_offset;
3290 inst->src[i].reladdr = NULL;
3291 }
3292 }
3293
3294 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3295 * no need to track them as larger-than-vec4 objects. This will be
3296 * relied on in cutting out unused uniform vectors from push
3297 * constants.
3298 */
3299 split_uniform_registers();
3300 }
3301
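/**
 * Resolve the negate modifier on a UD-typed source by applying it in an
 * explicit MOV into a uvec4 temporary, so later code never sees a negated
 * UD source directly.
 */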
3302 void
3303 vec4_visitor::resolve_ud_negate(src_reg *reg)
3304 {
3305 if (reg->type != BRW_REGISTER_TYPE_UD ||
3306 !reg->negate)
3307 return;
3308
3309 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3310 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3311 *reg = temp;
3312 }
3313
3314 vec4_visitor::vec4_visitor(struct brw_context *brw,
3315 struct brw_vec4_compile *c,
3316 struct gl_program *prog,
3317 const struct brw_vec4_prog_key *key,
3318 struct brw_vec4_prog_data *prog_data,
3319 struct gl_shader_program *shader_prog,
3320 struct brw_shader *shader,
3321 void *mem_ctx,
3322 bool debug_flag,
3323 bool no_spills,
3324 shader_time_shader_type st_base,
3325 shader_time_shader_type st_written,
3326 shader_time_shader_type st_reset)
3327 : sanity_param_count(0),
3328 fail_msg(NULL),
3329 first_non_payload_grf(0),
3330 need_all_constants_in_pull_buffer(false),
3331 debug_flag(debug_flag),
3332 no_spills(no_spills),
3333 st_base(st_base),
3334 st_written(st_written),
3335 st_reset(st_reset)
3336 {
3337 this->brw = brw;
3338 this->ctx = &brw->ctx;
3339 this->shader_prog = shader_prog;
3340 this->shader = shader;
3341
3342 this->mem_ctx = mem_ctx;
3343 this->failed = false;
3344
3345 this->base_ir = NULL;
3346 this->current_annotation = NULL;
3347 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3348
3349 this->c = c;
3350 this->prog = prog;
3351 this->key = key;
3352 this->prog_data = prog_data;
3353 this->stage_prog_data = &prog_data->base;
3354
3355 this->variable_ht = hash_table_ctor(0,
3356 hash_table_pointer_hash,
3357 hash_table_pointer_compare);
3358
3359 this->virtual_grf_start = NULL;
3360 this->virtual_grf_end = NULL;
3361 this->virtual_grf_sizes = NULL;
3362 this->virtual_grf_count = 0;
3363 this->virtual_grf_reg_map = NULL;
3364 this->virtual_grf_reg_count = 0;
3365 this->virtual_grf_array_size = 0;
3366 this->live_intervals_valid = false;
3367
3368 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3369
3370 this->uniforms = 0;
3371 }
3372
3373 vec4_visitor::~vec4_visitor()
3374 {
3375 hash_table_dtor(this->variable_ht);
3376 }
3377
3378
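/**
 * Record a compile failure: only the first failure is kept, the formatted
 * message is stashed in fail_msg for the caller to report, and it is
 * printed immediately when the debug flag is set.
 */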
3379 void
3380 vec4_visitor::fail(const char *format, ...)
3381 {
3382 va_list va;
3383 char *msg;
3384
3385 if (failed)
3386 return;
3387
3388 failed = true;
3389
3390 va_start(va, format);
3391 msg = ralloc_vasprintf(mem_ctx, format, va);
3392 va_end(va);
3393 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3394
3395 this->fail_msg = msg;
3396
3397 if (debug_flag) {
3398 fprintf(stderr, "%s", msg);
3399 }
3400 }
3401
3402 } /* namespace brw */