i965/vec4: Mark invariant members as constants in vec4_visitor
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
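/* Construct a vec4 IR instruction with conservative defaults: no saturate,
 * no conditional mod, and an empty message setup (mlen/base_mrf of 0).  The
 * IR pointer and annotation are captured from the visitor for debug output;
 * the emit() helpers below fill in whatever the specific opcode needs.
 */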
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
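/* Helpers that build a single ALU instruction of the given opcode.  Note
 * that they only allocate the instruction; they do not append it to the
 * instruction stream, so callers wrap them in emit(), e.g.
 *
 *    emit(MOV(dst, src));
 *
 * ALU3 additionally asserts gen >= 6, since three-source instructions do
 * not exist on earlier hardware.
 */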
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209    * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
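/* Emit a dot product of the first `elements` components of src0 and src1.
 * `elements` must be 2, 3 or 4, selecting DP2, DP3 or DP4 respectively;
 * e.g. emit_dp(dst, a, b, 3) emits DP3 dst, a, b.
 */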
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * That the upper word of each write-channel be 0 is required for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
477 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
484 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
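/* Returns the size of a type in vec4 slots: one per scalar/vector or per
 * matrix column, the summed sizes of the members for arrays and structs,
 * one slot for a sampler, and zero for atomic counters.
 */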
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 	 /* Regardless of the size of the vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_ATOMIC_UINT:
573 return 0;
574 case GLSL_TYPE_IMAGE:
575 case GLSL_TYPE_VOID:
576 case GLSL_TYPE_ERROR:
577 case GLSL_TYPE_INTERFACE:
578 assert(0);
579 break;
580 }
581
582 return 0;
583 }
584
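/* Allocate `size` consecutive vec4 slots as a new virtual GRF, growing the
 * size/offset tracking arrays geometrically as needed, and return the new
 * virtual register number.
 */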
585 int
586 vec4_visitor::virtual_grf_alloc(int size)
587 {
588 if (virtual_grf_array_size <= virtual_grf_count) {
589 if (virtual_grf_array_size == 0)
590 virtual_grf_array_size = 16;
591 else
592 virtual_grf_array_size *= 2;
593 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
594 virtual_grf_array_size);
595 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
596 virtual_grf_array_size);
597 }
598 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
599 virtual_grf_reg_count += size;
600 virtual_grf_sizes[virtual_grf_count] = size;
601 return virtual_grf_count++;
602 }
603
604 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
605 {
606 init();
607
608 this->file = GRF;
609 this->reg = v->virtual_grf_alloc(type_size(type));
610
611 if (type->is_array() || type->is_record()) {
612 this->swizzle = BRW_SWIZZLE_NOOP;
613 } else {
614 this->swizzle = swizzle_for_size(type->vector_elements);
615 }
616
617 this->type = brw_type_for_base_type(type);
618 }
619
620 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
621 {
622 init();
623
624 this->file = GRF;
625 this->reg = v->virtual_grf_alloc(type_size(type));
626
627 if (type->is_array() || type->is_record()) {
628 this->writemask = WRITEMASK_XYZW;
629 } else {
630 this->writemask = (1 << type->vector_elements) - 1;
631 }
632
633 this->type = brw_type_for_base_type(type);
634 }
635
636 /* Our support for uniforms is piggy-backed on the struct
637 * gl_fragment_program, because that's where the values actually
638 * get stored, rather than in some global gl_shader_program uniform
639 * store.
640 */
641 void
642 vec4_visitor::setup_uniform_values(ir_variable *ir)
643 {
644 int namelen = strlen(ir->name);
645
646 /* The data for our (non-builtin) uniforms is stored in a series of
647 * gl_uniform_driver_storage structs for each subcomponent that
648 * glGetUniformLocation() could name. We know it's been set up in the same
649 * order we'd walk the type, so walk the list of storage and find anything
650 * with our name, or the prefix of a component that starts with our name.
651 */
652 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
653 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
654
655 if (strncmp(ir->name, storage->name, namelen) != 0 ||
656 (storage->name[namelen] != 0 &&
657 storage->name[namelen] != '.' &&
658 storage->name[namelen] != '[')) {
659 continue;
660 }
661
662 gl_constant_value *components = storage->storage;
663 unsigned vector_count = (MAX2(storage->array_elements, 1) *
664 storage->type->matrix_columns);
665
666 for (unsigned s = 0; s < vector_count; s++) {
667 assert(uniforms < uniform_array_size);
668 uniform_vector_size[uniforms] = storage->type->vector_elements;
669
670 int i;
671 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
672 stage_prog_data->param[uniforms * 4 + i] = &components->f;
673 components++;
674 }
675 for (; i < 4; i++) {
676 static float zero = 0;
677 stage_prog_data->param[uniforms * 4 + i] = &zero;
678 }
679
680 uniforms++;
681 }
682 }
683 }
684
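/* Upload each user clip plane named in the compile key as its own vec4
 * uniform, recording the uniform register in this->userplane[] so later
 * code can reference the plane values.
 */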
685 void
686 vec4_visitor::setup_uniform_clipplane_values()
687 {
688 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
689
690 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
691 assert(this->uniforms < uniform_array_size);
692 this->uniform_vector_size[this->uniforms] = 4;
693 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
694 this->userplane[i].type = BRW_REGISTER_TYPE_F;
695 for (int j = 0; j < 4; ++j) {
696 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
697 }
698 ++this->uniforms;
699 }
700 }
701
702 /* Our support for builtin uniforms is even scarier than non-builtin.
703 * It sits on top of the PROG_STATE_VAR parameters that are
704 * automatically updated from GL context state.
705 */
706 void
707 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
708 {
709 const ir_state_slot *const slots = ir->state_slots;
710 assert(ir->state_slots != NULL);
711
712 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
713 /* This state reference has already been setup by ir_to_mesa,
714 * but we'll get the same index back here. We can reference
715 * ParameterValues directly, since unlike brw_fs.cpp, we never
716 * add new state references during compile.
717 */
718 int index = _mesa_add_state_reference(this->prog->Parameters,
719 (gl_state_index *)slots[i].tokens);
720 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
721
722 assert(this->uniforms < uniform_array_size);
723 this->uniform_vector_size[this->uniforms] = 0;
724 /* Add each of the unique swizzled channels of the element.
725 * This will end up matching the size of the glsl_type of this field.
726 */
727 int last_swiz = -1;
728 for (unsigned int j = 0; j < 4; j++) {
729 int swiz = GET_SWZ(slots[i].swizzle, j);
730 last_swiz = swiz;
731
732 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
733 assert(this->uniforms < uniform_array_size);
734 if (swiz <= last_swiz)
735 this->uniform_vector_size[this->uniforms]++;
736 }
737 this->uniforms++;
738 }
739 }
740
741 dst_reg *
742 vec4_visitor::variable_storage(ir_variable *var)
743 {
744 return (dst_reg *)hash_table_find(this->variable_ht, var);
745 }
746
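/* Evaluate a boolean rvalue into the flag register and report the predicate
 * the caller should use: ALIGN16_ALL4H / ALIGN16_ANY4H for the vector
 * comparisons (all_equal, any_nequal, any), BRW_PREDICATE_NORMAL otherwise.
 */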
747 void
748 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
749 {
750 ir_expression *expr = ir->as_expression();
751
752 *predicate = BRW_PREDICATE_NORMAL;
753
754 if (expr) {
755 src_reg op[2];
756 vec4_instruction *inst;
757
758 assert(expr->get_num_operands() <= 2);
759 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
760 expr->operands[i]->accept(this);
761 op[i] = this->result;
762
763 resolve_ud_negate(&op[i]);
764 }
765
766 switch (expr->operation) {
767 case ir_unop_logic_not:
768 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
769 inst->conditional_mod = BRW_CONDITIONAL_Z;
770 break;
771
772 case ir_binop_logic_xor:
773 inst = emit(XOR(dst_null_d(), op[0], op[1]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 break;
776
777 case ir_binop_logic_or:
778 inst = emit(OR(dst_null_d(), op[0], op[1]));
779 inst->conditional_mod = BRW_CONDITIONAL_NZ;
780 break;
781
782 case ir_binop_logic_and:
783 inst = emit(AND(dst_null_d(), op[0], op[1]));
784 inst->conditional_mod = BRW_CONDITIONAL_NZ;
785 break;
786
787 case ir_unop_f2b:
788 if (brw->gen >= 6) {
789 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
790 } else {
791 inst = emit(MOV(dst_null_f(), op[0]));
792 inst->conditional_mod = BRW_CONDITIONAL_NZ;
793 }
794 break;
795
796 case ir_unop_i2b:
797 if (brw->gen >= 6) {
798 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
799 } else {
800 inst = emit(MOV(dst_null_d(), op[0]));
801 inst->conditional_mod = BRW_CONDITIONAL_NZ;
802 }
803 break;
804
805 case ir_binop_all_equal:
806 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
807 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
808 break;
809
810 case ir_binop_any_nequal:
811 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
812 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
813 break;
814
815 case ir_unop_any:
816 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
817 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
818 break;
819
820 case ir_binop_greater:
821 case ir_binop_gequal:
822 case ir_binop_less:
823 case ir_binop_lequal:
824 case ir_binop_equal:
825 case ir_binop_nequal:
826 emit(CMP(dst_null_d(), op[0], op[1],
827 brw_conditional_for_comparison(expr->operation)));
828 break;
829
830 default:
831 assert(!"not reached");
832 break;
833 }
834 return;
835 }
836
837 ir->accept(this);
838
839 resolve_ud_negate(&this->result);
840
841 if (brw->gen >= 6) {
842 vec4_instruction *inst = emit(AND(dst_null_d(),
843 this->result, src_reg(1)));
844 inst->conditional_mod = BRW_CONDITIONAL_NZ;
845 } else {
846 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 }
849 }
850
851 /**
852 * Emit a gen6 IF statement with the comparison folded into the IF
853 * instruction.
854 */
855 void
856 vec4_visitor::emit_if_gen6(ir_if *ir)
857 {
858 ir_expression *expr = ir->condition->as_expression();
859
860 if (expr) {
861 src_reg op[2];
862 dst_reg temp;
863
864 assert(expr->get_num_operands() <= 2);
865 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
866 expr->operands[i]->accept(this);
867 op[i] = this->result;
868 }
869
870 switch (expr->operation) {
871 case ir_unop_logic_not:
872 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
873 return;
874
875 case ir_binop_logic_xor:
876 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
877 return;
878
879 case ir_binop_logic_or:
880 temp = dst_reg(this, glsl_type::bool_type);
881 emit(OR(temp, op[0], op[1]));
882 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
883 return;
884
885 case ir_binop_logic_and:
886 temp = dst_reg(this, glsl_type::bool_type);
887 emit(AND(temp, op[0], op[1]));
888 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
889 return;
890
891 case ir_unop_f2b:
892 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 return;
894
895 case ir_unop_i2b:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
897 return;
898
899 case ir_binop_greater:
900 case ir_binop_gequal:
901 case ir_binop_less:
902 case ir_binop_lequal:
903 case ir_binop_equal:
904 case ir_binop_nequal:
905 emit(IF(op[0], op[1],
906 brw_conditional_for_comparison(expr->operation)));
907 return;
908
909 case ir_binop_all_equal:
910 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
911 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
912 return;
913
914 case ir_binop_any_nequal:
915 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
916 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
917 return;
918
919 case ir_unop_any:
920 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
922 return;
923
924 default:
925 assert(!"not reached");
926 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
927 return;
928 }
929 return;
930 }
931
932 ir->condition->accept(this);
933
934 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
935 }
936
937 void
938 vec4_visitor::visit(ir_variable *ir)
939 {
940 dst_reg *reg = NULL;
941
942 if (variable_storage(ir))
943 return;
944
945 switch (ir->data.mode) {
946 case ir_var_shader_in:
947 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
948 break;
949
950 case ir_var_shader_out:
951 reg = new(mem_ctx) dst_reg(this, ir->type);
952
953 for (int i = 0; i < type_size(ir->type); i++) {
954 output_reg[ir->data.location + i] = *reg;
955 output_reg[ir->data.location + i].reg_offset = i;
956 output_reg[ir->data.location + i].type =
957 brw_type_for_base_type(ir->type->get_scalar_type());
958 output_reg_annotation[ir->data.location + i] = ir->name;
959 }
960 break;
961
962 case ir_var_auto:
963 case ir_var_temporary:
964 reg = new(mem_ctx) dst_reg(this, ir->type);
965 break;
966
967 case ir_var_uniform:
968 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
969
970 /* Thanks to the lower_ubo_reference pass, we will see only
971 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
972 * variables, so no need for them to be in variable_ht.
973 *
974 * Atomic counters take no uniform storage, no need to do
975 * anything here.
976 */
977 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
978 return;
979
980 /* Track how big the whole uniform variable is, in case we need to put a
981 * copy of its data into pull constants for array access.
982 */
983 assert(this->uniforms < uniform_array_size);
984 this->uniform_size[this->uniforms] = type_size(ir->type);
985
986 if (!strncmp(ir->name, "gl_", 3)) {
987 setup_builtin_uniform_values(ir);
988 } else {
989 setup_uniform_values(ir);
990 }
991 break;
992
993 case ir_var_system_value:
994 reg = make_reg_for_system_value(ir);
995 break;
996
997 default:
998 assert(!"not reached");
999 }
1000
1001 reg->type = brw_type_for_base_type(ir->type);
1002 hash_table_insert(this->variable_ht, reg, ir);
1003 }
1004
1005 void
1006 vec4_visitor::visit(ir_loop *ir)
1007 {
1008 /* We don't want debugging output to print the whole body of the
1009 * loop as the annotation.
1010 */
1011 this->base_ir = NULL;
1012
1013 emit(BRW_OPCODE_DO);
1014
1015 visit_instructions(&ir->body_instructions);
1016
1017 emit(BRW_OPCODE_WHILE);
1018 }
1019
1020 void
1021 vec4_visitor::visit(ir_loop_jump *ir)
1022 {
1023 switch (ir->mode) {
1024 case ir_loop_jump::jump_break:
1025 emit(BRW_OPCODE_BREAK);
1026 break;
1027 case ir_loop_jump::jump_continue:
1028 emit(BRW_OPCODE_CONTINUE);
1029 break;
1030 }
1031 }
1032
1033
1034 void
1035 vec4_visitor::visit(ir_function_signature *ir)
1036 {
1037 assert(0);
1038 (void)ir;
1039 }
1040
1041 void
1042 vec4_visitor::visit(ir_function *ir)
1043 {
1044 /* Ignore function bodies other than main() -- we shouldn't see calls to
1045 * them since they should all be inlined.
1046 */
1047 if (strcmp(ir->name, "main") == 0) {
1048 const ir_function_signature *sig;
1049 exec_list empty;
1050
1051 sig = ir->matching_signature(NULL, &empty);
1052
1053 assert(sig);
1054
1055 visit_instructions(&sig->body);
1056 }
1057 }
1058
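/* If this expression is one whose result is about to be saturated, emit the
 * value being saturated and fold the clamp into the saturate bit of the
 * final MOV.  Returns false if there is nothing to saturate here.
 */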
1059 bool
1060 vec4_visitor::try_emit_sat(ir_expression *ir)
1061 {
1062 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1063 if (!sat_src)
1064 return false;
1065
1066 sat_src->accept(this);
1067 src_reg src = this->result;
1068
1069 this->result = src_reg(this, ir->type);
1070 vec4_instruction *inst;
1071 inst = emit(MOV(dst_reg(this->result), src));
1072 inst->saturate = true;
1073
1074 return true;
1075 }
1076
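/* Try to fuse an add whose operand `mul_arg` is a multiply into a single
 * MAD.  Only possible on gen6+ and only for float types; returns false if
 * the pattern does not match.
 */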
1077 bool
1078 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1079 {
1080 /* 3-src instructions were introduced in gen6. */
1081 if (brw->gen < 6)
1082 return false;
1083
1084 /* MAD can only handle floating-point data. */
1085 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1086 return false;
1087
1088 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1089 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1090
1091 if (!mul || mul->operation != ir_binop_mul)
1092 return false;
1093
1094 nonmul->accept(this);
1095 src_reg src0 = fix_3src_operand(this->result);
1096
1097 mul->operands[0]->accept(this);
1098 src_reg src1 = fix_3src_operand(this->result);
1099
1100 mul->operands[1]->accept(this);
1101 src_reg src2 = fix_3src_operand(this->result);
1102
1103 this->result = src_reg(this, ir->type);
1104 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1105
1106 return true;
1107 }
1108
1109 void
1110 vec4_visitor::emit_bool_comparison(unsigned int op,
1111 dst_reg dst, src_reg src0, src_reg src1)
1112 {
1113 /* original gen4 does destination conversion before comparison. */
1114 if (brw->gen < 5)
1115 dst.type = src0.type;
1116
1117 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1118
1119 dst.type = BRW_REGISTER_TYPE_D;
1120 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1121 }
1122
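/* min/max: on gen6+ a single SEL with the given conditional mod suffices;
 * earlier generations need an explicit CMP followed by a predicated SEL.
 */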
1123 void
1124 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1125 src_reg src0, src_reg src1)
1126 {
1127 vec4_instruction *inst;
1128
1129 if (brw->gen >= 6) {
1130 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1131 inst->conditional_mod = conditionalmod;
1132 } else {
1133 emit(CMP(dst, src0, src1, conditionalmod));
1134
1135 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1136 inst->predicate = BRW_PREDICATE_NORMAL;
1137 }
1138 }
1139
1140 void
1141 vec4_visitor::emit_lrp(const dst_reg &dst,
1142 const src_reg &x, const src_reg &y, const src_reg &a)
1143 {
1144 if (brw->gen >= 6) {
1145 /* Note that the instruction's argument order is reversed from GLSL
1146 * and the IR.
1147 */
1148 emit(LRP(dst,
1149 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1150 } else {
1151 /* Earlier generations don't support three source operations, so we
1152 * need to emit x*(1-a) + y*a.
1153 *
1154 * A better way to do this would be:
1155 * ADD one_minus_a, negate(a), 1.0f
1156 * MUL null, y, a
1157 * MAC dst, x, one_minus_a
1158 * but we would need to support MAC and implicit accumulator.
1159 */
1160 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1161 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1162 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1163 y_times_a.writemask = dst.writemask;
1164 one_minus_a.writemask = dst.writemask;
1165 x_times_one_minus_a.writemask = dst.writemask;
1166
1167 emit(MUL(y_times_a, y, a));
1168 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1169 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1170 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1171 }
1172 }
1173
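/* Returns true if the rvalue is an int/uint constant small enough to fit in
 * 16 bits, which lets integer multiplies by it be emitted as a single MUL
 * instead of the MUL/MACH/MOV sequence (see ir_binop_mul below).
 */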
1174 static bool
1175 is_16bit_constant(ir_rvalue *rvalue)
1176 {
1177 ir_constant *constant = rvalue->as_constant();
1178 if (!constant)
1179 return false;
1180
1181 if (constant->type != glsl_type::int_type &&
1182 constant->type != glsl_type::uint_type)
1183 return false;
1184
1185 return constant->value.u[0] < (1 << 16);
1186 }
1187
1188 void
1189 vec4_visitor::visit(ir_expression *ir)
1190 {
1191 unsigned int operand;
1192 src_reg op[Elements(ir->operands)];
1193 src_reg result_src;
1194 dst_reg result_dst;
1195 vec4_instruction *inst;
1196
1197 if (try_emit_sat(ir))
1198 return;
1199
1200 if (ir->operation == ir_binop_add) {
1201 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1202 return;
1203 }
1204
1205 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1206 this->result.file = BAD_FILE;
1207 ir->operands[operand]->accept(this);
1208 if (this->result.file == BAD_FILE) {
1209 fprintf(stderr, "Failed to get tree for expression operand:\n");
1210 ir->operands[operand]->fprint(stderr);
1211 exit(1);
1212 }
1213 op[operand] = this->result;
1214
1215 /* Matrix expression operands should have been broken down to vector
1216 * operations already.
1217 */
1218 assert(!ir->operands[operand]->type->is_matrix());
1219 }
1220
1221 int vector_elements = ir->operands[0]->type->vector_elements;
1222 if (ir->operands[1]) {
1223 vector_elements = MAX2(vector_elements,
1224 ir->operands[1]->type->vector_elements);
1225 }
1226
1227 this->result.file = BAD_FILE;
1228
1229 /* Storage for our result. Ideally for an assignment we'd be using
1230 * the actual storage for the result here, instead.
1231 */
1232 result_src = src_reg(this, ir->type);
1233 /* convenience for the emit functions below. */
1234 result_dst = dst_reg(result_src);
1235 /* If nothing special happens, this is the result. */
1236 this->result = result_src;
1237 /* Limit writes to the channels that will be used by result_src later.
1238 * This does limit this temp's use as a temporary for multi-instruction
1239 * sequences.
1240 */
1241 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1242
1243 switch (ir->operation) {
1244 case ir_unop_logic_not:
1245 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1246 * ones complement of the whole register, not just bit 0.
1247 */
1248 emit(XOR(result_dst, op[0], src_reg(1)));
1249 break;
1250 case ir_unop_neg:
1251 op[0].negate = !op[0].negate;
1252 emit(MOV(result_dst, op[0]));
1253 break;
1254 case ir_unop_abs:
1255 op[0].abs = true;
1256 op[0].negate = false;
1257 emit(MOV(result_dst, op[0]));
1258 break;
1259
1260 case ir_unop_sign:
1261 if (ir->type->is_float()) {
1262 /* AND(val, 0x80000000) gives the sign bit.
1263 *
1264 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1265 * zero.
1266 */
1267 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1268
1269 op[0].type = BRW_REGISTER_TYPE_UD;
1270 result_dst.type = BRW_REGISTER_TYPE_UD;
1271 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1272
1273 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1274 inst->predicate = BRW_PREDICATE_NORMAL;
1275
1276 this->result.type = BRW_REGISTER_TYPE_F;
1277 } else {
1278 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1279 * -> non-negative val generates 0x00000000.
1280 * Predicated OR sets 1 if val is positive.
1281 */
1282 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1283
1284 emit(ASR(result_dst, op[0], src_reg(31)));
1285
1286 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1287 inst->predicate = BRW_PREDICATE_NORMAL;
1288 }
1289 break;
1290
1291 case ir_unop_rcp:
1292 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1293 break;
1294
1295 case ir_unop_exp2:
1296 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1297 break;
1298 case ir_unop_log2:
1299 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1300 break;
1301 case ir_unop_exp:
1302 case ir_unop_log:
1303 assert(!"not reached: should be handled by ir_explog_to_explog2");
1304 break;
1305 case ir_unop_sin:
1306 case ir_unop_sin_reduced:
1307 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1308 break;
1309 case ir_unop_cos:
1310 case ir_unop_cos_reduced:
1311 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1312 break;
1313
1314 case ir_unop_dFdx:
1315 case ir_unop_dFdy:
1316 assert(!"derivatives not valid in vertex shader");
1317 break;
1318
1319 case ir_unop_bitfield_reverse:
1320 emit(BFREV(result_dst, op[0]));
1321 break;
1322 case ir_unop_bit_count:
1323 emit(CBIT(result_dst, op[0]));
1324 break;
1325 case ir_unop_find_msb: {
1326 src_reg temp = src_reg(this, glsl_type::uint_type);
1327
1328 inst = emit(FBH(dst_reg(temp), op[0]));
1329 inst->dst.writemask = WRITEMASK_XYZW;
1330
1331 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1332 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1333 * subtract the result from 31 to convert the MSB count into an LSB count.
1334 */
1335
1336 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1337 temp.swizzle = BRW_SWIZZLE_NOOP;
1338 emit(MOV(result_dst, temp));
1339
1340 src_reg src_tmp = src_reg(result_dst);
1341 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1342
1343 src_tmp.negate = true;
1344 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1345 inst->predicate = BRW_PREDICATE_NORMAL;
1346 break;
1347 }
1348 case ir_unop_find_lsb:
1349 emit(FBL(result_dst, op[0]));
1350 break;
1351
1352 case ir_unop_noise:
1353 assert(!"not reached: should be handled by lower_noise");
1354 break;
1355
1356 case ir_binop_add:
1357 emit(ADD(result_dst, op[0], op[1]));
1358 break;
1359 case ir_binop_sub:
1360 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1361 break;
1362
1363 case ir_binop_mul:
1364 if (brw->gen < 8 && ir->type->is_integer()) {
1365 /* For integer multiplication, the MUL uses the low 16 bits of one of
1366 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1367 * accumulates in the contribution of the upper 16 bits of that
1368 * operand. If we can determine that one of the args is in the low
1369 * 16 bits, though, we can just emit a single MUL.
1370 */
1371 if (is_16bit_constant(ir->operands[0])) {
1372 if (brw->gen < 7)
1373 emit(MUL(result_dst, op[0], op[1]));
1374 else
1375 emit(MUL(result_dst, op[1], op[0]));
1376 } else if (is_16bit_constant(ir->operands[1])) {
1377 if (brw->gen < 7)
1378 emit(MUL(result_dst, op[1], op[0]));
1379 else
1380 emit(MUL(result_dst, op[0], op[1]));
1381 } else {
1382 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1383
1384 emit(MUL(acc, op[0], op[1]));
1385 emit(MACH(dst_null_d(), op[0], op[1]));
1386 emit(MOV(result_dst, src_reg(acc)));
1387 }
1388 } else {
1389 emit(MUL(result_dst, op[0], op[1]));
1390 }
1391 break;
1392 case ir_binop_imul_high: {
1393 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1394
1395 emit(MUL(acc, op[0], op[1]));
1396 emit(MACH(result_dst, op[0], op[1]));
1397 break;
1398 }
1399 case ir_binop_div:
1400 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1401 assert(ir->type->is_integer());
1402 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1403 break;
1404 case ir_binop_carry: {
1405 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1406
1407 emit(ADDC(dst_null_ud(), op[0], op[1]));
1408 emit(MOV(result_dst, src_reg(acc)));
1409 break;
1410 }
1411 case ir_binop_borrow: {
1412 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1413
1414 emit(SUBB(dst_null_ud(), op[0], op[1]));
1415 emit(MOV(result_dst, src_reg(acc)));
1416 break;
1417 }
1418 case ir_binop_mod:
1419 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1420 assert(ir->type->is_integer());
1421 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1422 break;
1423
1424 case ir_binop_less:
1425 case ir_binop_greater:
1426 case ir_binop_lequal:
1427 case ir_binop_gequal:
1428 case ir_binop_equal:
1429 case ir_binop_nequal: {
1430 emit(CMP(result_dst, op[0], op[1],
1431 brw_conditional_for_comparison(ir->operation)));
1432 emit(AND(result_dst, result_src, src_reg(0x1)));
1433 break;
1434 }
1435
1436 case ir_binop_all_equal:
1437 /* "==" operator producing a scalar boolean. */
1438 if (ir->operands[0]->type->is_vector() ||
1439 ir->operands[1]->type->is_vector()) {
1440 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1441 emit(MOV(result_dst, src_reg(0)));
1442 inst = emit(MOV(result_dst, src_reg(1)));
1443 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1444 } else {
1445 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1446 emit(AND(result_dst, result_src, src_reg(0x1)));
1447 }
1448 break;
1449 case ir_binop_any_nequal:
1450 /* "!=" operator producing a scalar boolean. */
1451 if (ir->operands[0]->type->is_vector() ||
1452 ir->operands[1]->type->is_vector()) {
1453 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1454
1455 emit(MOV(result_dst, src_reg(0)));
1456 inst = emit(MOV(result_dst, src_reg(1)));
1457 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1458 } else {
1459 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1460 emit(AND(result_dst, result_src, src_reg(0x1)));
1461 }
1462 break;
1463
1464 case ir_unop_any:
1465 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1466 emit(MOV(result_dst, src_reg(0)));
1467
1468 inst = emit(MOV(result_dst, src_reg(1)));
1469 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1470 break;
1471
1472 case ir_binop_logic_xor:
1473 emit(XOR(result_dst, op[0], op[1]));
1474 break;
1475
1476 case ir_binop_logic_or:
1477 emit(OR(result_dst, op[0], op[1]));
1478 break;
1479
1480 case ir_binop_logic_and:
1481 emit(AND(result_dst, op[0], op[1]));
1482 break;
1483
1484 case ir_binop_dot:
1485 assert(ir->operands[0]->type->is_vector());
1486 assert(ir->operands[0]->type == ir->operands[1]->type);
1487 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1488 break;
1489
1490 case ir_unop_sqrt:
1491 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1492 break;
1493 case ir_unop_rsq:
1494 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1495 break;
1496
1497 case ir_unop_bitcast_i2f:
1498 case ir_unop_bitcast_u2f:
1499 this->result = op[0];
1500 this->result.type = BRW_REGISTER_TYPE_F;
1501 break;
1502
1503 case ir_unop_bitcast_f2i:
1504 this->result = op[0];
1505 this->result.type = BRW_REGISTER_TYPE_D;
1506 break;
1507
1508 case ir_unop_bitcast_f2u:
1509 this->result = op[0];
1510 this->result.type = BRW_REGISTER_TYPE_UD;
1511 break;
1512
1513 case ir_unop_i2f:
1514 case ir_unop_i2u:
1515 case ir_unop_u2i:
1516 case ir_unop_u2f:
1517 case ir_unop_b2f:
1518 case ir_unop_b2i:
1519 case ir_unop_f2i:
1520 case ir_unop_f2u:
1521 emit(MOV(result_dst, op[0]));
1522 break;
1523 case ir_unop_f2b:
1524 case ir_unop_i2b: {
1525 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1526 emit(AND(result_dst, result_src, src_reg(1)));
1527 break;
1528 }
1529
1530 case ir_unop_trunc:
1531 emit(RNDZ(result_dst, op[0]));
1532 break;
1533 case ir_unop_ceil:
1534 op[0].negate = !op[0].negate;
1535 inst = emit(RNDD(result_dst, op[0]));
1536 this->result.negate = true;
1537 break;
1538 case ir_unop_floor:
1539 inst = emit(RNDD(result_dst, op[0]));
1540 break;
1541 case ir_unop_fract:
1542 inst = emit(FRC(result_dst, op[0]));
1543 break;
1544 case ir_unop_round_even:
1545 emit(RNDE(result_dst, op[0]));
1546 break;
1547
1548 case ir_binop_min:
1549 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1550 break;
1551 case ir_binop_max:
1552 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1553 break;
1554
1555 case ir_binop_pow:
1556 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1557 break;
1558
1559 case ir_unop_bit_not:
1560 inst = emit(NOT(result_dst, op[0]));
1561 break;
1562 case ir_binop_bit_and:
1563 inst = emit(AND(result_dst, op[0], op[1]));
1564 break;
1565 case ir_binop_bit_xor:
1566 inst = emit(XOR(result_dst, op[0], op[1]));
1567 break;
1568 case ir_binop_bit_or:
1569 inst = emit(OR(result_dst, op[0], op[1]));
1570 break;
1571
1572 case ir_binop_lshift:
1573 inst = emit(SHL(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_rshift:
1577 if (ir->type->base_type == GLSL_TYPE_INT)
1578 inst = emit(ASR(result_dst, op[0], op[1]));
1579 else
1580 inst = emit(SHR(result_dst, op[0], op[1]));
1581 break;
1582
1583 case ir_binop_bfm:
1584 emit(BFI1(result_dst, op[0], op[1]));
1585 break;
1586
1587 case ir_binop_ubo_load: {
1588 ir_constant *uniform_block = ir->operands[0]->as_constant();
1589 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1590 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1591 src_reg offset;
1592
1593 /* Now, load the vector from that offset. */
1594 assert(ir->type->is_vector() || ir->type->is_scalar());
1595
1596 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1597 packed_consts.type = result.type;
1598 src_reg surf_index =
1599 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1600 if (const_offset_ir) {
1601 if (brw->gen >= 8) {
1602 /* Store the offset in a GRF so we can send-from-GRF. */
1603 offset = src_reg(this, glsl_type::int_type);
1604 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1605 } else {
1606 /* Immediates are fine on older generations since they'll be moved
1607 * to a (potentially fake) MRF at the generator level.
1608 */
1609 offset = src_reg(const_offset / 16);
1610 }
1611 } else {
1612 offset = src_reg(this, glsl_type::uint_type);
1613 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1614 }
1615
1616 if (brw->gen >= 7) {
1617 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1618 grf_offset.type = offset.type;
1619
1620 emit(MOV(grf_offset, offset));
1621
1622 emit(new(mem_ctx) vec4_instruction(this,
1623 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1624 dst_reg(packed_consts),
1625 surf_index,
1626 src_reg(grf_offset)));
1627 } else {
1628 vec4_instruction *pull =
1629 emit(new(mem_ctx) vec4_instruction(this,
1630 VS_OPCODE_PULL_CONSTANT_LOAD,
1631 dst_reg(packed_consts),
1632 surf_index,
1633 offset));
1634 pull->base_mrf = 14;
1635 pull->mlen = 1;
1636 }
1637
1638 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1639 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1640 const_offset % 16 / 4,
1641 const_offset % 16 / 4,
1642 const_offset % 16 / 4);
1643
1644 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1645 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1646 emit(CMP(result_dst, packed_consts, src_reg(0u),
1647 BRW_CONDITIONAL_NZ));
1648 emit(AND(result_dst, result, src_reg(0x1)));
1649 } else {
1650 emit(MOV(result_dst, packed_consts));
1651 }
1652 break;
1653 }
1654
1655 case ir_binop_vector_extract:
1656 assert(!"should have been lowered by vec_index_to_cond_assign");
1657 break;
1658
1659 case ir_triop_fma:
1660 op[0] = fix_3src_operand(op[0]);
1661 op[1] = fix_3src_operand(op[1]);
1662 op[2] = fix_3src_operand(op[2]);
1663 /* Note that the instruction's argument order is reversed from GLSL
1664 * and the IR.
1665 */
1666 emit(MAD(result_dst, op[2], op[1], op[0]));
1667 break;
1668
1669 case ir_triop_lrp:
1670 emit_lrp(result_dst, op[0], op[1], op[2]);
1671 break;
1672
1673 case ir_triop_csel:
1674 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1675 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1676 inst->predicate = BRW_PREDICATE_NORMAL;
1677 break;
1678
1679 case ir_triop_bfi:
1680 op[0] = fix_3src_operand(op[0]);
1681 op[1] = fix_3src_operand(op[1]);
1682 op[2] = fix_3src_operand(op[2]);
1683 emit(BFI2(result_dst, op[0], op[1], op[2]));
1684 break;
1685
1686 case ir_triop_bitfield_extract:
1687 op[0] = fix_3src_operand(op[0]);
1688 op[1] = fix_3src_operand(op[1]);
1689 op[2] = fix_3src_operand(op[2]);
1690 /* Note that the instruction's argument order is reversed from GLSL
1691 * and the IR.
1692 */
1693 emit(BFE(result_dst, op[2], op[1], op[0]));
1694 break;
1695
1696 case ir_triop_vector_insert:
1697 assert(!"should have been lowered by lower_vector_insert");
1698 break;
1699
1700 case ir_quadop_bitfield_insert:
1701 assert(!"not reached: should be handled by "
1702 "bitfield_insert_to_bfm_bfi\n");
1703 break;
1704
1705 case ir_quadop_vector:
1706 assert(!"not reached: should be handled by lower_quadop_vector");
1707 break;
1708
1709 case ir_unop_pack_half_2x16:
1710 emit_pack_half_2x16(result_dst, op[0]);
1711 break;
1712 case ir_unop_unpack_half_2x16:
1713 emit_unpack_half_2x16(result_dst, op[0]);
1714 break;
1715 case ir_unop_pack_snorm_2x16:
1716 case ir_unop_pack_snorm_4x8:
1717 case ir_unop_pack_unorm_2x16:
1718 case ir_unop_pack_unorm_4x8:
1719 case ir_unop_unpack_snorm_2x16:
1720 case ir_unop_unpack_snorm_4x8:
1721 case ir_unop_unpack_unorm_2x16:
1722 case ir_unop_unpack_unorm_4x8:
1723 assert(!"not reached: should be handled by lower_packing_builtins");
1724 break;
1725 case ir_unop_unpack_half_2x16_split_x:
1726 case ir_unop_unpack_half_2x16_split_y:
1727 case ir_binop_pack_half_2x16_split:
1728 assert(!"not reached: should not occur in vertex shader");
1729 break;
1730 case ir_binop_ldexp:
1731 assert(!"not reached: should be handled by ldexp_to_arith()");
1732 break;
1733 }
1734 }
1735
1736
1737 void
1738 vec4_visitor::visit(ir_swizzle *ir)
1739 {
1740 src_reg src;
1741 int i = 0;
1742 int swizzle[4];
1743
1744 /* Note that this is only swizzles in expressions, not those on the left
1745 * hand side of an assignment, which do write masking. See ir_assignment
1746 * for that.
1747 */
1748
1749 ir->val->accept(this);
1750 src = this->result;
1751 assert(src.file != BAD_FILE);
1752
1753 for (i = 0; i < ir->type->vector_elements; i++) {
1754 switch (i) {
1755 case 0:
1756 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1757 break;
1758 case 1:
1759 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1760 break;
1761 case 2:
1762 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1763 break;
1764 case 3:
1765 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1766 break;
1767 }
1768 }
1769 for (; i < 4; i++) {
1770 /* Replicate the last channel out. */
1771 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1772 }
1773
1774 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1775
1776 this->result = src;
1777 }
1778
1779 void
1780 vec4_visitor::visit(ir_dereference_variable *ir)
1781 {
1782 const struct glsl_type *type = ir->type;
1783 dst_reg *reg = variable_storage(ir->var);
1784
1785 if (!reg) {
1786 fail("Failed to find variable storage for %s\n", ir->var->name);
1787 this->result = src_reg(brw_null_reg());
1788 return;
1789 }
1790
1791 this->result = src_reg(*reg);
1792
1793 /* System values get their swizzle from the dst_reg writemask */
1794 if (ir->var->data.mode == ir_var_system_value)
1795 return;
1796
1797 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1798 this->result.swizzle = swizzle_for_size(type->vector_elements);
1799 }
1800
1801
1802 int
1803 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1804 {
1805 /* Under normal circumstances array elements are stored consecutively, so
1806 * the stride is equal to the size of the array element.
1807 */
1808 return type_size(ir->type);
1809 }
1810
1811
1812 void
1813 vec4_visitor::visit(ir_dereference_array *ir)
1814 {
1815 ir_constant *constant_index;
1816 src_reg src;
1817 int array_stride = compute_array_stride(ir);
1818
1819 constant_index = ir->array_index->constant_expression_value();
1820
1821 ir->array->accept(this);
1822 src = this->result;
1823
1824 if (constant_index) {
1825 src.reg_offset += constant_index->value.i[0] * array_stride;
1826 } else {
1827 /* Variable index array dereference. It eats the "vec4" of the
1828 * base of the array and an index that offsets the Mesa register
1829 * index.
1830 */
1831 ir->array_index->accept(this);
1832
1833 src_reg index_reg;
1834
1835 if (array_stride == 1) {
1836 index_reg = this->result;
1837 } else {
1838 index_reg = src_reg(this, glsl_type::int_type);
1839
1840 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1841 }
1842
1843 if (src.reladdr) {
1844 src_reg temp = src_reg(this, glsl_type::int_type);
1845
1846 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1847
1848 index_reg = temp;
1849 }
1850
1851 src.reladdr = ralloc(mem_ctx, src_reg);
1852 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1853 }
1854
1855 /* If the type is smaller than a vec4, replicate the last channel out. */
1856 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1857 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1858 else
1859 src.swizzle = BRW_SWIZZLE_NOOP;
1860 src.type = brw_type_for_base_type(ir->type);
1861
1862 this->result = src;
1863 }
1864
1865 void
1866 vec4_visitor::visit(ir_dereference_record *ir)
1867 {
1868 unsigned int i;
1869 const glsl_type *struct_type = ir->record->type;
1870 int offset = 0;
1871
1872 ir->record->accept(this);
1873
1874 for (i = 0; i < struct_type->length; i++) {
1875 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1876 break;
1877 offset += type_size(struct_type->fields.structure[i].type);
1878 }
1879
1880 /* If the type is smaller than a vec4, replicate the last channel out. */
1881 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1882 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1883 else
1884 this->result.swizzle = BRW_SWIZZLE_NOOP;
1885 this->result.type = brw_type_for_base_type(ir->type);
1886
1887 this->result.reg_offset += offset;
1888 }
1889
1890 /**
1891 * We want to be careful in assignment setup to hit the actual storage
1892 * instead of potentially using a temporary like we might with the
1893 * ir_dereference handler.
1894 */
1895 static dst_reg
1896 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1897 {
1898 /* The LHS must be a dereference. If the LHS is a variable indexed array
1899     * access of a vector, it must be separated into a series of conditional moves
1900 * before reaching this point (see ir_vec_index_to_cond_assign).
1901 */
1902 assert(ir->as_dereference());
1903 ir_dereference_array *deref_array = ir->as_dereference_array();
1904 if (deref_array) {
1905 assert(!deref_array->array->type->is_vector());
1906 }
1907
1908 /* Use the rvalue deref handler for the most part. We'll ignore
1909 * swizzles in it and write swizzles using writemask, though.
1910 */
1911 ir->accept(v);
1912 return dst_reg(v->result);
1913 }
1914
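/* Copy an aggregate (struct, array or matrix) one vec4 at a time, recursing
 * through the member types and advancing reg_offset on both registers after
 * each MOV.  The emitted MOVs inherit the caller's predicate.
 */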
1915 void
1916 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1917 const struct glsl_type *type, uint32_t predicate)
1918 {
1919 if (type->base_type == GLSL_TYPE_STRUCT) {
1920 for (unsigned int i = 0; i < type->length; i++) {
1921 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1922 }
1923 return;
1924 }
1925
1926 if (type->is_array()) {
1927 for (unsigned int i = 0; i < type->length; i++) {
1928 emit_block_move(dst, src, type->fields.array, predicate);
1929 }
1930 return;
1931 }
1932
1933 if (type->is_matrix()) {
1934 const struct glsl_type *vec_type;
1935
1936 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1937 type->vector_elements, 1);
1938
1939 for (int i = 0; i < type->matrix_columns; i++) {
1940 emit_block_move(dst, src, vec_type, predicate);
1941 }
1942 return;
1943 }
1944
1945 assert(type->is_scalar() || type->is_vector());
1946
1947 dst->type = brw_type_for_base_type(type);
1948 src->type = dst->type;
1949
1950 dst->writemask = (1 << type->vector_elements) - 1;
1951
1952 src->swizzle = swizzle_for_size(type->vector_elements);
1953
1954 vec4_instruction *inst = emit(MOV(*dst, *src));
1955 inst->predicate = predicate;
1956
1957 dst->reg_offset++;
1958 src->reg_offset++;
1959 }
1960
1961
1962 /* If the RHS processing resulted in an instruction generating a
1963 * temporary value, and it would be easy to rewrite the instruction to
1964 * generate its result right into the LHS instead, do so. This ends
1965 * up reliably removing instructions where it can be tricky to do so
1966 * later without real UD chain information.
1967 */
1968 bool
1969 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1970 dst_reg dst,
1971 src_reg src,
1972 vec4_instruction *pre_rhs_inst,
1973 vec4_instruction *last_rhs_inst)
1974 {
1975 /* This could be supported, but it would take more smarts. */
1976 if (ir->condition)
1977 return false;
1978
1979 if (pre_rhs_inst == last_rhs_inst)
1980 return false; /* No instructions generated to work with. */
1981
1982 /* Make sure the last instruction generated our source reg. */
1983 if (src.file != GRF ||
1984 src.file != last_rhs_inst->dst.file ||
1985 src.reg != last_rhs_inst->dst.reg ||
1986 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1987 src.reladdr ||
1988 src.abs ||
1989 src.negate ||
1990 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1991 return false;
1992
1993 /* Check that the last instruction fully initialized the channels
1994 * we want to use, in the order we want to use them. We could
1995 * potentially reswizzle the operands of many instructions so that
1996 * we could handle out of order channels, but don't yet.
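*
* For example, an RHS result that reaches us with a .yxzw swizzle cannot be
* folded into the destination, because the written channels would need
* reswizzling first.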
1997 */
1998
1999 for (unsigned i = 0; i < 4; i++) {
2000 if (dst.writemask & (1 << i)) {
2001 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2002 return false;
2003
2004 if (BRW_GET_SWZ(src.swizzle, i) != i)
2005 return false;
2006 }
2007 }
2008
2009 /* Success! Rewrite the instruction. */
2010 last_rhs_inst->dst.file = dst.file;
2011 last_rhs_inst->dst.reg = dst.reg;
2012 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2013 last_rhs_inst->dst.reladdr = dst.reladdr;
2014 last_rhs_inst->dst.writemask &= dst.writemask;
2015
2016 return true;
2017 }
2018
2019 void
2020 vec4_visitor::visit(ir_assignment *ir)
2021 {
2022 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2023 uint32_t predicate = BRW_PREDICATE_NONE;
2024
2025 if (!ir->lhs->type->is_scalar() &&
2026 !ir->lhs->type->is_vector()) {
2027 ir->rhs->accept(this);
2028 src_reg src = this->result;
2029
2030 if (ir->condition) {
2031 emit_bool_to_cond_code(ir->condition, &predicate);
2032 }
2033
2034 /* emit_block_move doesn't account for swizzles in the source register.
2035 * This should be ok, since the source register is a structure or an
2036 * array, and those can't be swizzled. But double-check to be sure.
2037 */
2038 assert(src.swizzle ==
2039 (ir->rhs->type->is_matrix()
2040 ? swizzle_for_size(ir->rhs->type->vector_elements)
2041 : BRW_SWIZZLE_NOOP));
2042
2043 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2044 return;
2045 }
2046
2047 /* Now we're down to just a scalar/vector with writemasks. */
2048 int i;
2049
2050 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2051 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2052
2053 ir->rhs->accept(this);
2054
2055 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2056
2057 src_reg src = this->result;
2058
2059 int swizzles[4];
2060 int first_enabled_chan = 0;
2061 int src_chan = 0;
2062
2063 assert(ir->lhs->type->is_vector() ||
2064 ir->lhs->type->is_scalar());
2065 dst.writemask = ir->write_mask;
2066
2067 for (int i = 0; i < 4; i++) {
2068 if (dst.writemask & (1 << i)) {
2069 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2070 break;
2071 }
2072 }
2073
2074 /* Swizzle a small RHS vector into the channels being written.
2075 *
2076 * GLSL IR treats write_mask as dictating how many channels are
2077 * present on the RHS, while in our instructions we need to make
2078 * those channels appear in the slots of the vec4 they're written to.
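*
* For example, with write_mask = ZW the RHS arrives swizzled as .xyyy; the
* loops above and below build the swizzle .yyxy, so the MOV places
* rhs.x in .z and rhs.y in .w.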
2079 */
2080 for (int i = 0; i < 4; i++) {
2081 if (dst.writemask & (1 << i))
2082 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2083 else
2084 swizzles[i] = first_enabled_chan;
2085 }
2086 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2087 swizzles[2], swizzles[3]);
2088
2089 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2090 return;
2091 }
2092
2093 if (ir->condition) {
2094 emit_bool_to_cond_code(ir->condition, &predicate);
2095 }
2096
2097 for (i = 0; i < type_size(ir->lhs->type); i++) {
2098 vec4_instruction *inst = emit(MOV(dst, src));
2099 inst->predicate = predicate;
2100
2101 dst.reg_offset++;
2102 src.reg_offset++;
2103 }
2104 }
2105
2106 void
2107 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2108 {
2109 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2110 foreach_list(node, &ir->components) {
2111 ir_constant *field_value = (ir_constant *)node;
2112
2113 emit_constant_values(dst, field_value);
2114 }
2115 return;
2116 }
2117
2118 if (ir->type->is_array()) {
2119 for (unsigned int i = 0; i < ir->type->length; i++) {
2120 emit_constant_values(dst, ir->array_elements[i]);
2121 }
2122 return;
2123 }
2124
2125 if (ir->type->is_matrix()) {
2126 for (int i = 0; i < ir->type->matrix_columns; i++) {
2127 float *vec = &ir->value.f[i * ir->type->vector_elements];
2128
2129 for (int j = 0; j < ir->type->vector_elements; j++) {
2130 dst->writemask = 1 << j;
2131 dst->type = BRW_REGISTER_TYPE_F;
2132
2133 emit(MOV(*dst, src_reg(vec[j])));
2134 }
2135 dst->reg_offset++;
2136 }
2137 return;
2138 }
2139
2140 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2141
2142 for (int i = 0; i < ir->type->vector_elements; i++) {
2143 if (!(remaining_writemask & (1 << i)))
2144 continue;
2145
2146 dst->writemask = 1 << i;
2147 dst->type = brw_type_for_base_type(ir->type);
2148
2149 /* Find other components that match the one we're about to
2150 * write. Emits fewer instructions for things like vec4(0.5,
2151 * 1.5, 1.5, 1.5).
2152 */
2153 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2154 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2155 if (ir->value.b[i] == ir->value.b[j])
2156 dst->writemask |= (1 << j);
2157 } else {
2158 /* u, i, and f storage all line up, so no need for a
2159 * switch case for comparing each type.
2160 */
2161 if (ir->value.u[i] == ir->value.u[j])
2162 dst->writemask |= (1 << j);
2163 }
2164 }
2165
2166 switch (ir->type->base_type) {
2167 case GLSL_TYPE_FLOAT:
2168 emit(MOV(*dst, src_reg(ir->value.f[i])));
2169 break;
2170 case GLSL_TYPE_INT:
2171 emit(MOV(*dst, src_reg(ir->value.i[i])));
2172 break;
2173 case GLSL_TYPE_UINT:
2174 emit(MOV(*dst, src_reg(ir->value.u[i])));
2175 break;
2176 case GLSL_TYPE_BOOL:
2177 emit(MOV(*dst, src_reg(ir->value.b[i])));
2178 break;
2179 default:
2180 assert(!"Non-float/uint/int/bool constant");
2181 break;
2182 }
2183
2184 remaining_writemask &= ~dst->writemask;
2185 }
2186 dst->reg_offset++;
2187 }
2188
2189 void
2190 vec4_visitor::visit(ir_constant *ir)
2191 {
2192 dst_reg dst = dst_reg(this, ir->type);
2193 this->result = src_reg(dst);
2194
2195 emit_constant_values(&dst, ir);
2196 }
2197
2198 void
2199 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2200 {
2201 ir_dereference *deref = static_cast<ir_dereference *>(
2202 ir->actual_parameters.get_head());
2203 ir_variable *location = deref->variable_referenced();
2204 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2205 location->data.atomic.buffer_index);
2206
2207 /* Calculate the surface offset */
2208 src_reg offset(this, glsl_type::uint_type);
2209 ir_dereference_array *deref_array = deref->as_dereference_array();
2210 if (deref_array) {
2211 deref_array->array_index->accept(this);
2212
2213 src_reg tmp(this, glsl_type::uint_type);
2214 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2215 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2216 } else {
2217 offset = location->data.atomic.offset;
2218 }
2219
2220 /* Emit the appropriate machine instruction */
2221 const char *callee = ir->callee->function_name();
2222 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2223
2224 if (!strcmp("__intrinsic_atomic_read", callee)) {
2225 emit_untyped_surface_read(surf_index, dst, offset);
2226
2227 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2228 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2229 src_reg(), src_reg());
2230
2231 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2232 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2233 src_reg(), src_reg());
2234 }
2235 }
2236
2237 void
2238 vec4_visitor::visit(ir_call *ir)
2239 {
2240 const char *callee = ir->callee->function_name();
2241
2242 if (!strcmp("__intrinsic_atomic_read", callee) ||
2243 !strcmp("__intrinsic_atomic_increment", callee) ||
2244 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2245 visit_atomic_counter_intrinsic(ir);
2246 } else {
2247 assert(!"Unsupported intrinsic.");
2248 }
2249 }
2250
2251 src_reg
2252 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2253 {
2254 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2255 inst->base_mrf = 2;
2256 inst->mlen = 1;
2257 inst->sampler = sampler;
2258 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2259 inst->dst.writemask = WRITEMASK_XYZW;
2260
2261 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2262 int param_base = inst->base_mrf;
2263 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2264 int zero_mask = 0xf & ~coord_mask;
2265
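/* Write the coordinate into the low channels of the parameter vec4 and
 * zero out the remaining channels.
 */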
2266 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2267 coordinate));
2268
2269 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2270 src_reg(0)));
2271
2272 emit(inst);
2273 return src_reg(inst->dst);
2274 }
2275
2276 void
2277 vec4_visitor::visit(ir_texture *ir)
2278 {
2279 int sampler =
2280 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2281
2282 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2283 * emitting anything other than setting up the constant result.
2284 */
2285 if (ir->op == ir_tg4) {
2286 ir_constant *chan = ir->lod_info.component->as_constant();
2287 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2288 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2289 dst_reg result(this, ir->type);
2290 this->result = src_reg(result);
2291 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2292 return;
2293 }
2294 }
2295
2296 /* Should be lowered by do_lower_texture_projection */
2297 assert(!ir->projector);
2298
2299 /* Should be lowered */
2300 assert(!ir->offset || !ir->offset->type->is_array());
2301
2302 /* Generate code to compute all the subexpression trees. This has to be
2303 * done before loading any values into MRFs for the sampler message since
2304 * generating these values may involve SEND messages that need the MRFs.
2305 */
2306 src_reg coordinate;
2307 if (ir->coordinate) {
2308 ir->coordinate->accept(this);
2309 coordinate = this->result;
2310 }
2311
2312 src_reg shadow_comparitor;
2313 if (ir->shadow_comparitor) {
2314 ir->shadow_comparitor->accept(this);
2315 shadow_comparitor = this->result;
2316 }
2317
2318 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2319 src_reg offset_value;
2320 if (has_nonconstant_offset) {
2321 ir->offset->accept(this);
2322 offset_value = src_reg(this->result);
2323 }
2324
2325 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2326 src_reg lod, dPdx, dPdy, sample_index, mcs;
2327 switch (ir->op) {
2328 case ir_tex:
2329 lod = src_reg(0.0f);
2330 lod_type = glsl_type::float_type;
2331 break;
2332 case ir_txf:
2333 case ir_txl:
2334 case ir_txs:
2335 ir->lod_info.lod->accept(this);
2336 lod = this->result;
2337 lod_type = ir->lod_info.lod->type;
2338 break;
2339 case ir_query_levels:
2340 lod = src_reg(0);
2341 lod_type = glsl_type::int_type;
2342 break;
2343 case ir_txf_ms:
2344 ir->lod_info.sample_index->accept(this);
2345 sample_index = this->result;
2346 sample_index_type = ir->lod_info.sample_index->type;
2347
2348 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2349 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2350 else
2351 mcs = src_reg(0u);
2352 break;
2353 case ir_txd:
2354 ir->lod_info.grad.dPdx->accept(this);
2355 dPdx = this->result;
2356
2357 ir->lod_info.grad.dPdy->accept(this);
2358 dPdy = this->result;
2359
2360 lod_type = ir->lod_info.grad.dPdx->type;
2361 break;
2362 case ir_txb:
2363 case ir_lod:
2364 case ir_tg4:
2365 break;
2366 }
2367
2368 vec4_instruction *inst = NULL;
2369 switch (ir->op) {
2370 case ir_tex:
2371 case ir_txl:
2372 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2373 break;
2374 case ir_txd:
2375 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2376 break;
2377 case ir_txf:
2378 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2379 break;
2380 case ir_txf_ms:
2381 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2382 break;
2383 case ir_txs:
2384 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2385 break;
2386 case ir_tg4:
2387 if (has_nonconstant_offset)
2388 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2389 else
2390 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2391 break;
2392 case ir_query_levels:
2393 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2394 break;
2395 case ir_txb:
2396 assert(!"TXB is not valid for vertex shaders.");
2397 break;
2398 case ir_lod:
2399 assert(!"LOD is not valid for vertex shaders.");
2400 break;
2401 default:
2402 assert(!"Unrecognized tex op");
2403 }
2404
2405 if (ir->offset != NULL && ir->op != ir_txf)
2406 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2407
2408 /* Stuff the channel select bits into the top of the texture offset */
2409 if (ir->op == ir_tg4)
2410 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2411
2412 /* The message header is necessary for:
2413 * - Gen4 (always)
2414 * - Texel offsets
2415 * - Gather channel selection
2416 * - Sampler indices too large to fit in a 4-bit value.
2417 */
2418 inst->header_present =
2419 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2420 sampler >= 16;
2421 inst->base_mrf = 2;
2422 inst->mlen = inst->header_present + 1; /* always at least one */
2423 inst->sampler = sampler;
2424 inst->dst = dst_reg(this, ir->type);
2425 inst->dst.writemask = WRITEMASK_XYZW;
2426 inst->shadow_compare = ir->shadow_comparitor != NULL;
2427
2428 /* MRF for the first parameter */
2429 int param_base = inst->base_mrf + inst->header_present;
2430
2431 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2432 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2433 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2434 } else {
2435 /* Load the coordinate */
2436 /* FINISHME: gl_clamp_mask and saturate */
2437 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2438 int zero_mask = 0xf & ~coord_mask;
2439
2440 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2441 coordinate));
2442
2443 if (zero_mask != 0) {
2444 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2445 src_reg(0)));
2446 }
2447 /* Load the shadow comparitor */
2448 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2449 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2450 WRITEMASK_X),
2451 shadow_comparitor));
2452 inst->mlen++;
2453 }
2454
2455 /* Load the LOD info */
2456 if (ir->op == ir_tex || ir->op == ir_txl) {
2457 int mrf, writemask;
2458 if (brw->gen >= 5) {
2459 mrf = param_base + 1;
2460 if (ir->shadow_comparitor) {
2461 writemask = WRITEMASK_Y;
2462 /* mlen already incremented */
2463 } else {
2464 writemask = WRITEMASK_X;
2465 inst->mlen++;
2466 }
2467 } else /* brw->gen == 4 */ {
2468 mrf = param_base;
2469 writemask = WRITEMASK_W;
2470 }
2471 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2472 } else if (ir->op == ir_txf) {
2473 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2474 } else if (ir->op == ir_txf_ms) {
2475 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2476 sample_index));
2477 if (brw->gen >= 7)
2478 /* MCS data is in the first channel of `mcs`, but we need to get it into
2479 * the .y channel of the second vec4 of params, so replicate .x across
2480 * the whole vec4 and then mask off everything except .y
2481 */
2482 mcs.swizzle = BRW_SWIZZLE_XXXX;
2483 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2484 mcs));
2485 inst->mlen++;
2486 } else if (ir->op == ir_txd) {
2487 const glsl_type *type = lod_type;
2488
2489 if (brw->gen >= 5) {
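/* Interleave the gradients: this parameter register ends up holding
 * dPdx.x, dPdy.x, dPdx.y, dPdy.y, so replicate each gradient across
 * channel pairs and write dPdx into .xz and dPdy into .yw.
 */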
2490 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2491 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2492 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2493 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2494 inst->mlen++;
2495
2496 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2497 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2498 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2499 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2500 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2501 inst->mlen++;
2502
2503 if (ir->shadow_comparitor) {
2504 emit(MOV(dst_reg(MRF, param_base + 2,
2505 ir->shadow_comparitor->type, WRITEMASK_Z),
2506 shadow_comparitor));
2507 }
2508 }
2509 } else /* brw->gen == 4 */ {
2510 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2511 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2512 inst->mlen += 2;
2513 }
2514 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2515 if (ir->shadow_comparitor) {
2516 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2517 shadow_comparitor));
2518 }
2519
2520 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2521 offset_value));
2522 inst->mlen++;
2523 }
2524 }
2525
2526 emit(inst);
2527
2528 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2529 * faces * layers, but the spec requires just layers.
2530 */
2531 if (ir->op == ir_txs) {
2532 glsl_type const *type = ir->sampler->type;
2533 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2534 type->sampler_array) {
2535 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2536 writemask(inst->dst, WRITEMASK_Z),
2537 src_reg(inst->dst), src_reg(6));
2538 }
2539 }
2540
2541 if (brw->gen == 6 && ir->op == ir_tg4) {
2542 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2543 }
2544
2545 swizzle_result(ir, src_reg(inst->dst), sampler);
2546 }
2547
2548 /**
2549 * Apply workarounds for Gen6 gather with UINT/SINT
2550 */
2551 void
2552 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2553 {
2554 if (!wa)
2555 return;
2556
2557 int width = (wa & WA_8BIT) ? 8 : 16;
2558 dst_reg dst_f = dst;
2559 dst_f.type = BRW_REGISTER_TYPE_F;
2560
2561 /* Convert from UNORM to UINT */
2562 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2563 emit(MOV(dst, src_reg(dst_f)));
2564
2565 if (wa & WA_SIGN) {
2566 /* Reinterpret the UINT value as a signed INT value by
2567 * shifting the sign bit into place, then shifting back
2568 * preserving sign.
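*
* For an 8-bit surface this is a left shift by 24 followed by an
* arithmetic right shift by 24, which sign-extends the low byte.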
2569 */
2570 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2571 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2572 }
2573 }
2574
2575 /**
2576 * Set up the gather channel based on the swizzle, for gather4.
2577 */
2578 uint32_t
2579 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2580 {
2581 ir_constant *chan = ir->lod_info.component->as_constant();
2582 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2583 switch (swiz) {
2584 case SWIZZLE_X: return 0;
2585 case SWIZZLE_Y:
2586 /* gather4 sampler is broken for green channel on RG32F --
2587 * we must ask for blue instead.
2588 */
2589 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2590 return 2;
2591 return 1;
2592 case SWIZZLE_Z: return 2;
2593 case SWIZZLE_W: return 3;
2594 default:
2595 assert(!"Not reached"); /* zero, one swizzles handled already */
2596 return 0;
2597 }
2598 }
2599
2600 void
2601 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2602 {
2603 int s = key->tex.swizzles[sampler];
2604
2605 this->result = src_reg(this, ir->type);
2606 dst_reg swizzled_result(this->result);
2607
2608 if (ir->op == ir_query_levels) {
2609 /* # levels is in .w */
2610 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2611 emit(MOV(swizzled_result, orig_val));
2612 return;
2613 }
2614
2615 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2616 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2617 emit(MOV(swizzled_result, orig_val));
2618 return;
2619 }
2620
2621
2622 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2623 int swizzle[4] = {0};
2624
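/* Partition the API swizzle into channels copied from the texture result,
 * channels forced to zero, and channels forced to one; each group gets at
 * most one MOV below.
 */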
2625 for (int i = 0; i < 4; i++) {
2626 switch (GET_SWZ(s, i)) {
2627 case SWIZZLE_ZERO:
2628 zero_mask |= (1 << i);
2629 break;
2630 case SWIZZLE_ONE:
2631 one_mask |= (1 << i);
2632 break;
2633 default:
2634 copy_mask |= (1 << i);
2635 swizzle[i] = GET_SWZ(s, i);
2636 break;
2637 }
2638 }
2639
2640 if (copy_mask) {
2641 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2642 swizzled_result.writemask = copy_mask;
2643 emit(MOV(swizzled_result, orig_val));
2644 }
2645
2646 if (zero_mask) {
2647 swizzled_result.writemask = zero_mask;
2648 emit(MOV(swizzled_result, src_reg(0.0f)));
2649 }
2650
2651 if (one_mask) {
2652 swizzled_result.writemask = one_mask;
2653 emit(MOV(swizzled_result, src_reg(1.0f)));
2654 }
2655 }
2656
2657 void
2658 vec4_visitor::visit(ir_return *ir)
2659 {
2660 assert(!"not reached");
2661 }
2662
2663 void
2664 vec4_visitor::visit(ir_discard *ir)
2665 {
2666 assert(!"not reached");
2667 }
2668
2669 void
2670 vec4_visitor::visit(ir_if *ir)
2671 {
2672 /* Don't point the annotation at the if statement, because then it plus
2673 * the then and else blocks get printed.
2674 */
2675 this->base_ir = ir->condition;
2676
2677 if (brw->gen == 6) {
2678 emit_if_gen6(ir);
2679 } else {
2680 uint32_t predicate;
2681 emit_bool_to_cond_code(ir->condition, &predicate);
2682 emit(IF(predicate));
2683 }
2684
2685 visit_instructions(&ir->then_instructions);
2686
2687 if (!ir->else_instructions.is_empty()) {
2688 this->base_ir = ir->condition;
2689 emit(BRW_OPCODE_ELSE);
2690
2691 visit_instructions(&ir->else_instructions);
2692 }
2693
2694 this->base_ir = ir->condition;
2695 emit(BRW_OPCODE_ENDIF);
2696 }
2697
2698 void
2699 vec4_visitor::visit(ir_emit_vertex *)
2700 {
2701 assert(!"not reached");
2702 }
2703
2704 void
2705 vec4_visitor::visit(ir_end_primitive *)
2706 {
2707 assert(!"not reached");
2708 }
2709
2710 void
2711 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2712 dst_reg dst, src_reg offset,
2713 src_reg src0, src_reg src1)
2714 {
2715 unsigned mlen = 0;
2716
2717 /* Set the atomic operation offset. */
2718 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2719 mlen++;
2720
2721 /* Set the atomic operation arguments. */
2722 if (src0.file != BAD_FILE) {
2723 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2724 mlen++;
2725 }
2726
2727 if (src1.file != BAD_FILE) {
2728 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2729 mlen++;
2730 }
2731
2732 /* Emit the instruction. Note that this maps to the normal SIMD8
2733 * untyped atomic message on Ivy Bridge, but that's OK because
2734 * unused channels will be masked out.
2735 */
2736 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2737 src_reg(atomic_op), src_reg(surf_index));
2738 inst->base_mrf = 0;
2739 inst->mlen = mlen;
2740 }
2741
2742 void
2743 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2744 src_reg offset)
2745 {
2746 /* Set the surface read offset. */
2747 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2748
2749 /* Emit the instruction. Note that this maps to the normal SIMD8
2750 * untyped surface read message, but that's OK because unused
2751 * channels will be masked out.
2752 */
2753 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2754 dst, src_reg(surf_index));
2755 inst->base_mrf = 0;
2756 inst->mlen = 1;
2757 }
2758
2759 void
2760 vec4_visitor::emit_ndc_computation()
2761 {
2762 /* Get the position */
2763 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2764
2765 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2766 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2767 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2768
2769 current_annotation = "NDC";
2770 dst_reg ndc_w = ndc;
2771 ndc_w.writemask = WRITEMASK_W;
2772 src_reg pos_w = pos;
2773 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2774 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2775
2776 dst_reg ndc_xyz = ndc;
2777 ndc_xyz.writemask = WRITEMASK_XYZ;
2778
2779 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2780 }
2781
2782 void
2783 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2784 {
2785 if (brw->gen < 6 &&
2786 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2787 key->userclip_active || brw->has_negative_rhw_bug)) {
2788 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2789 dst_reg header1_w = header1;
2790 header1_w.writemask = WRITEMASK_W;
2791
2792 emit(MOV(header1, 0u));
2793
2794 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2795 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2796
2797 current_annotation = "Point size";
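/* Scale the point size by 2^11 and keep bits 8..18, packing it into the
 * header's 11-bit point-width field as a fixed-point value with three
 * fractional bits.
 */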
2798 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2799 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2800 }
2801
2802 if (key->userclip_active) {
2803 current_annotation = "Clipping flags";
2804 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2805 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2806
2807 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2808 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2809 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2810
2811 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2812 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2813 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2814 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2815 }
2816
2817 /* i965 clipping workaround:
2818 * 1) Test for -ve rhw
2819 * 2) If set,
2820 * set ndc = (0,0,0,0)
2821 * set ucp[6] = 1
2822 *
2823 * Later, clipping will detect ucp[6] and ensure the primitive is
2824 * clipped against all fixed planes.
2825 */
2826 if (brw->has_negative_rhw_bug) {
2827 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2828 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2829 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2830 vec4_instruction *inst;
2831 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2832 inst->predicate = BRW_PREDICATE_NORMAL;
2833 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2834 inst->predicate = BRW_PREDICATE_NORMAL;
2835 }
2836
2837 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2838 } else if (brw->gen < 6) {
2839 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2840 } else {
2841 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2842 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2843 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2844 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2845 }
2846 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2847 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2848 src_reg(output_reg[VARYING_SLOT_LAYER])));
2849 }
2850 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2851 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2852 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2853 }
2854 }
2855 }
2856
2857 void
2858 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2859 {
2860 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2861 *
2862 * "If a linked set of shaders forming the vertex stage contains no
2863 * static write to gl_ClipVertex or gl_ClipDistance, but the
2864 * application has requested clipping against user clip planes through
2865 * the API, then the coordinate written to gl_Position is used for
2866 * comparison against the user clip planes."
2867 *
2868 * This function is only called if the shader didn't write to
2869 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2870 * if the user wrote to it; otherwise we use gl_Position.
2871 */
2872 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2873 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2874 clip_vertex = VARYING_SLOT_POS;
2875 }
2876
2877 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2878 ++i) {
2879 reg.writemask = 1 << i;
2880 emit(DP4(reg,
2881 src_reg(output_reg[clip_vertex]),
2882 src_reg(this->userplane[i + offset])));
2883 }
2884 }
2885
2886 void
2887 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2888 {
2889 assert (varying < VARYING_SLOT_MAX);
2890 reg.type = output_reg[varying].type;
2891 current_annotation = output_reg_annotation[varying];
2892 /* Copy the register, saturating if necessary */
2893 vec4_instruction *inst = emit(MOV(reg,
2894 src_reg(output_reg[varying])));
2895 if ((varying == VARYING_SLOT_COL0 ||
2896 varying == VARYING_SLOT_COL1 ||
2897 varying == VARYING_SLOT_BFC0 ||
2898 varying == VARYING_SLOT_BFC1) &&
2899 key->clamp_vertex_color) {
2900 inst->saturate = true;
2901 }
2902 }
2903
2904 void
2905 vec4_visitor::emit_urb_slot(int mrf, int varying)
2906 {
2907 struct brw_reg hw_reg = brw_message_reg(mrf);
2908 dst_reg reg = dst_reg(MRF, mrf);
2909 reg.type = BRW_REGISTER_TYPE_F;
2910
2911 switch (varying) {
2912 case VARYING_SLOT_PSIZ:
2913 /* PSIZ is always in slot 0, and is coupled with other flags. */
2914 current_annotation = "indices, point width, clip flags";
2915 emit_psiz_and_flags(hw_reg);
2916 break;
2917 case BRW_VARYING_SLOT_NDC:
2918 current_annotation = "NDC";
2919 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2920 break;
2921 case VARYING_SLOT_POS:
2922 current_annotation = "gl_Position";
2923 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2924 break;
2925 case VARYING_SLOT_EDGE:
2926 /* This is present when doing unfilled polygons. We're supposed to copy
2927 * the edge flag from the user-provided vertex array
2928 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2929 * of that attribute (starts as 1.0f). This is then used in clipping to
2930 * determine which edges should be drawn as wireframe.
2931 */
2932 current_annotation = "edge flag";
2933 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2934 glsl_type::float_type, WRITEMASK_XYZW))));
2935 break;
2936 case BRW_VARYING_SLOT_PAD:
2937 /* No need to write to this slot */
2938 break;
2939 default:
2940 emit_generic_urb_slot(reg, varying);
2941 break;
2942 }
2943 }
2944
2945 static int
2946 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2947 {
2948 if (brw->gen >= 6) {
2949 /* URB data written (does not include the message header reg) must
2950 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2951 * section 5.4.3.2.2: URB_INTERLEAVED.
2952 *
2953 * URB entries are allocated on a multiple of 1024 bits, so an
2954 * extra 128 bits written here to make the end align to 256 is
2955 * no problem.
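*
* mlen counts the message header plus the data registers, so the data
* length is even exactly when mlen is odd; e.g. a header plus three data
* registers (mlen == 4) is padded out to mlen == 5.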
2956 */
2957 if ((mlen % 2) != 1)
2958 mlen++;
2959 }
2960
2961 return mlen;
2962 }
2963
2964
2965 /**
2966 * Generates the VUE payload plus the necessary URB write instructions to
2967 * output it.
2968 *
2969 * The VUE layout is documented in Volume 2a.
2970 */
2971 void
2972 vec4_visitor::emit_vertex()
2973 {
2974 /* MRF 0 is reserved for the debugger, so start with message header
2975 * in MRF 1.
2976 */
2977 int base_mrf = 1;
2978 int mrf = base_mrf;
2979 /* In the process of generating our URB write message contents, we
2980 * may need to unspill a register or load from an array. Those
2981 * reads would use MRFs 14-15.
2982 */
2983 int max_usable_mrf = 13;
2984
2985 /* The following assertion verifies that max_usable_mrf causes an
2986 * even-numbered amount of URB write data, which will meet gen6's
2987 * requirements for length alignment.
2988 */
2989 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2990
2991 /* First mrf is the g0-based message header containing URB handles and
2992 * such.
2993 */
2994 emit_urb_write_header(mrf++);
2995
2996 if (brw->gen < 6) {
2997 emit_ndc_computation();
2998 }
2999
3000 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3001 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3002 current_annotation = "user clip distances";
3003
3004 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3005 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3006
3007 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3008 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3009 }
3010
3011 /* We may need to split this up into several URB writes, so do them in a
3012 * loop.
3013 */
3014 int slot = 0;
3015 bool complete = false;
3016 do {
3017 /* URB offset is in URB row increments, and each of our MRFs is half of
3018 * one of those, since we're doing interleaved writes.
3019 */
3020 int offset = slot / 2;
3021
3022 mrf = base_mrf + 1;
3023 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3024 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3025
3026 /* If this was max_usable_mrf, we can't fit anything more into this
3027 * URB WRITE.
3028 */
3029 if (mrf > max_usable_mrf) {
3030 slot++;
3031 break;
3032 }
3033 }
3034
3035 complete = slot >= prog_data->vue_map.num_slots;
3036 current_annotation = "URB write";
3037 vec4_instruction *inst = emit_urb_write_opcode(complete);
3038 inst->base_mrf = base_mrf;
3039 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3040 inst->offset += offset;
3041 } while(!complete);
3042 }
3043
3044
3045 src_reg
3046 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3047 src_reg *reladdr, int reg_offset)
3048 {
3049 /* Because we store the values to scratch interleaved like our
3050 * vertex data, we need to scale the vec4 index by 2.
3051 */
3052 int message_header_scale = 2;
3053
3054 /* Pre-gen6, the message header uses byte offsets instead of vec4
3055 * (16-byte) offset units.
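*
* Combined with the interleaving factor of 2 above, each vec4 reg_offset
* becomes 2 rows on gen6+ and 32 bytes on earlier generations.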
3056 */
3057 if (brw->gen < 6)
3058 message_header_scale *= 16;
3059
3060 if (reladdr) {
3061 src_reg index = src_reg(this, glsl_type::int_type);
3062
3063 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3064 emit_before(inst, MUL(dst_reg(index),
3065 index, src_reg(message_header_scale)));
3066
3067 return index;
3068 } else {
3069 return src_reg(reg_offset * message_header_scale);
3070 }
3071 }
3072
3073 src_reg
3074 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3075 src_reg *reladdr, int reg_offset)
3076 {
3077 if (reladdr) {
3078 src_reg index = src_reg(this, glsl_type::int_type);
3079
3080 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3081
3082 /* Pre-gen6, the message header uses byte offsets instead of vec4
3083 * (16-byte) offset units.
3084 */
3085 if (brw->gen < 6) {
3086 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3087 }
3088
3089 return index;
3090 } else if (brw->gen >= 8) {
3091 /* Store the offset in a GRF so we can send-from-GRF. */
3092 src_reg offset = src_reg(this, glsl_type::int_type);
3093 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3094 return offset;
3095 } else {
3096 int message_header_scale = brw->gen < 6 ? 16 : 1;
3097 return src_reg(reg_offset * message_header_scale);
3098 }
3099 }
3100
3101 /**
3102 * Emits an instruction before @inst to load the value named by @orig_src
3103 * from scratch space at @base_offset to @temp.
3104 *
3105 * @base_offset is measured in 32-byte units (the size of a register).
3106 */
3107 void
3108 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3109 dst_reg temp, src_reg orig_src,
3110 int base_offset)
3111 {
3112 int reg_offset = base_offset + orig_src.reg_offset;
3113 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3114
3115 emit_before(inst, SCRATCH_READ(temp, index));
3116 }
3117
3118 /**
3119 * Emits an instruction after @inst to store the value to be written
3120 * to @orig_dst to scratch space at @base_offset, from @temp.
3121 *
3122 * @base_offset is measured in 32-byte units (the size of a register).
3123 */
3124 void
3125 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3126 {
3127 int reg_offset = base_offset + inst->dst.reg_offset;
3128 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3129
3130 /* Create a temporary register to store *inst's result in.
3131 *
3132 * We have to be careful in MOVing from our temporary result register in
3133 * the scratch write. If we swizzle from channels of the temporary that
3134 * weren't initialized, it will confuse live interval analysis, which will
3135 * make spilling fail to make progress.
3136 */
3137 src_reg temp = src_reg(this, glsl_type::vec4_type);
3138 temp.type = inst->dst.type;
3139 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3140 int swizzles[4];
3141 for (int i = 0; i < 4; i++)
3142 if (inst->dst.writemask & (1 << i))
3143 swizzles[i] = i;
3144 else
3145 swizzles[i] = first_writemask_chan;
3146 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3147 swizzles[2], swizzles[3]);
3148
3149 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3150 inst->dst.writemask));
3151 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3152 write->predicate = inst->predicate;
3153 write->ir = inst->ir;
3154 write->annotation = inst->annotation;
3155 inst->insert_after(write);
3156
3157 inst->dst.file = temp.file;
3158 inst->dst.reg = temp.reg;
3159 inst->dst.reg_offset = temp.reg_offset;
3160 inst->dst.reladdr = NULL;
3161 }
3162
3163 /**
3164 * We can't generally support array access in GRF space, because a
3165 * single instruction's destination can only span 2 contiguous
3166 * registers. So, we send all GRF arrays that get variable index
3167 * access to scratch space.
3168 */
3169 void
3170 vec4_visitor::move_grf_array_access_to_scratch()
3171 {
3172 int scratch_loc[this->virtual_grf_count];
3173
3174 for (int i = 0; i < this->virtual_grf_count; i++) {
3175 scratch_loc[i] = -1;
3176 }
3177
3178 /* First, calculate the set of virtual GRFs that need to be punted
3179 * to scratch due to having any array access on them, and where in
3180 * scratch.
3181 */
3182 foreach_list(node, &this->instructions) {
3183 vec4_instruction *inst = (vec4_instruction *)node;
3184
3185 if (inst->dst.file == GRF && inst->dst.reladdr &&
3186 scratch_loc[inst->dst.reg] == -1) {
3187 scratch_loc[inst->dst.reg] = c->last_scratch;
3188 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3189 }
3190
3191 for (int i = 0 ; i < 3; i++) {
3192 src_reg *src = &inst->src[i];
3193
3194 if (src->file == GRF && src->reladdr &&
3195 scratch_loc[src->reg] == -1) {
3196 scratch_loc[src->reg] = c->last_scratch;
3197 c->last_scratch += this->virtual_grf_sizes[src->reg];
3198 }
3199 }
3200 }
3201
3202 /* Now, for anything that will be accessed through scratch, rewrite
3203 * it to load/store. Note that this is a _safe list walk, because
3204 * we may generate a new scratch_write instruction after the one
3205 * we're processing.
3206 */
3207 foreach_list_safe(node, &this->instructions) {
3208 vec4_instruction *inst = (vec4_instruction *)node;
3209
3210 /* Set up the annotation tracking for new generated instructions. */
3211 base_ir = inst->ir;
3212 current_annotation = inst->annotation;
3213
3214 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3215 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3216 }
3217
3218 for (int i = 0 ; i < 3; i++) {
3219 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3220 continue;
3221
3222 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3223
3224 emit_scratch_read(inst, temp, inst->src[i],
3225 scratch_loc[inst->src[i].reg]);
3226
3227 inst->src[i].file = temp.file;
3228 inst->src[i].reg = temp.reg;
3229 inst->src[i].reg_offset = temp.reg_offset;
3230 inst->src[i].reladdr = NULL;
3231 }
3232 }
3233 }
3234
3235 /**
3236 * Emits an instruction before @inst to load the value named by @orig_src
3237 * from the pull constant buffer (surface) at @base_offset to @temp.
3238 */
3239 void
3240 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3241 dst_reg temp, src_reg orig_src,
3242 int base_offset)
3243 {
3244 int reg_offset = base_offset + orig_src.reg_offset;
3245 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3246 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3247 vec4_instruction *load;
3248
3249 if (brw->gen >= 7) {
3250 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3251 grf_offset.type = offset.type;
3252 emit_before(inst, MOV(grf_offset, offset));
3253
3254 load = new(mem_ctx) vec4_instruction(this,
3255 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3256 temp, index, src_reg(grf_offset));
3257 } else {
3258 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3259 temp, index, offset);
3260 load->base_mrf = 14;
3261 load->mlen = 1;
3262 }
3263 emit_before(inst, load);
3264 }
3265
3266 /**
3267 * Implements array access of uniforms by inserting a
3268 * PULL_CONSTANT_LOAD instruction.
3269 *
3270 * Unlike temporary GRF array access, which we don't support due to
3271 * the difficulty of doing relative addressing on instruction
3272 * destinations, we could potentially do array access of uniforms
3273 * that were loaded in GRF space as push constants. In real-world
3274 * usage we've seen, though, the arrays being used are always larger
3275 * than we could load as push constants, so just always move all
3276 * uniform array access out to a pull constant buffer.
3277 */
3278 void
3279 vec4_visitor::move_uniform_array_access_to_pull_constants()
3280 {
3281 int pull_constant_loc[this->uniforms];
3282
3283 for (int i = 0; i < this->uniforms; i++) {
3284 pull_constant_loc[i] = -1;
3285 }
3286
3287 /* Walk through and find array access of uniforms. Put a copy of that
3288 * uniform in the pull constant buffer.
3289 *
3290 * Note that we don't move constant-indexed accesses to arrays. No
3291 * testing has been done of the performance impact of this choice.
3292 */
3293 foreach_list_safe(node, &this->instructions) {
3294 vec4_instruction *inst = (vec4_instruction *)node;
3295
3296 for (int i = 0 ; i < 3; i++) {
3297 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3298 continue;
3299
3300 int uniform = inst->src[i].reg;
3301
3302 /* If this array isn't already present in the pull constant buffer,
3303 * add it.
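* The copy below appends the array's parameter pointers to pull_param;
* pull_constant_loc records where the array starts, in units of vec4s.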
3304 */
3305 if (pull_constant_loc[uniform] == -1) {
3306 const float **values = &stage_prog_data->param[uniform * 4];
3307
3308 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3309
3310 assert(uniform < uniform_array_size);
3311 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3312 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3313 = values[j];
3314 }
3315 }
3316
3317 /* Set up the annotation tracking for new generated instructions. */
3318 base_ir = inst->ir;
3319 current_annotation = inst->annotation;
3320
3321 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3322
3323 emit_pull_constant_load(inst, temp, inst->src[i],
3324 pull_constant_loc[uniform]);
3325
3326 inst->src[i].file = temp.file;
3327 inst->src[i].reg = temp.reg;
3328 inst->src[i].reg_offset = temp.reg_offset;
3329 inst->src[i].reladdr = NULL;
3330 }
3331 }
3332
3333 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3334 * no need to track them as larger-than-vec4 objects. This will be
3335 * relied on in cutting out unused uniform vectors from push
3336 * constants.
3337 */
3338 split_uniform_registers();
3339 }
3340
3341 void
3342 vec4_visitor::resolve_ud_negate(src_reg *reg)
3343 {
3344 if (reg->type != BRW_REGISTER_TYPE_UD ||
3345 !reg->negate)
3346 return;
3347
3348 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3349 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3350 *reg = temp;
3351 }
3352
3353 vec4_visitor::vec4_visitor(struct brw_context *brw,
3354 struct brw_vec4_compile *c,
3355 struct gl_program *prog,
3356 const struct brw_vec4_prog_key *key,
3357 struct brw_vec4_prog_data *prog_data,
3358 struct gl_shader_program *shader_prog,
3359 struct brw_shader *shader,
3360 void *mem_ctx,
3361 bool debug_flag,
3362 bool no_spills,
3363 shader_time_shader_type st_base,
3364 shader_time_shader_type st_written,
3365 shader_time_shader_type st_reset)
3366 : c(c),
3367 key(key),
3368 prog_data(prog_data),
3369 sanity_param_count(0),
3370 fail_msg(NULL),
3371 first_non_payload_grf(0),
3372 need_all_constants_in_pull_buffer(false),
3373 debug_flag(debug_flag),
3374 no_spills(no_spills),
3375 st_base(st_base),
3376 st_written(st_written),
3377 st_reset(st_reset)
3378 {
3379 this->brw = brw;
3380 this->ctx = &brw->ctx;
3381 this->shader_prog = shader_prog;
3382 this->shader = shader;
3383
3384 this->mem_ctx = mem_ctx;
3385 this->failed = false;
3386
3387 this->base_ir = NULL;
3388 this->current_annotation = NULL;
3389 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3390
3391 this->prog = prog;
3392 this->stage_prog_data = &prog_data->base;
3393
3394 this->variable_ht = hash_table_ctor(0,
3395 hash_table_pointer_hash,
3396 hash_table_pointer_compare);
3397
3398 this->virtual_grf_start = NULL;
3399 this->virtual_grf_end = NULL;
3400 this->virtual_grf_sizes = NULL;
3401 this->virtual_grf_count = 0;
3402 this->virtual_grf_reg_map = NULL;
3403 this->virtual_grf_reg_count = 0;
3404 this->virtual_grf_array_size = 0;
3405 this->live_intervals_valid = false;
3406
3407 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3408
3409 this->uniforms = 0;
3410
3411 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3412 * at least one. See setup_uniforms() in brw_vec4.cpp.
3413 */
3414 this->uniform_array_size = 1;
3415 if (prog_data) {
3416 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3417 }
3418
3419 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3420 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3421 }
3422
3423 vec4_visitor::~vec4_visitor()
3424 {
3425 hash_table_dtor(this->variable_ht);
3426 }
3427
3428
3429 void
3430 vec4_visitor::fail(const char *format, ...)
3431 {
3432 va_list va;
3433 char *msg;
3434
3435 if (failed)
3436 return;
3437
3438 failed = true;
3439
3440 va_start(va, format);
3441 msg = ralloc_vasprintf(mem_ctx, format, va);
3442 va_end(va);
3443 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3444
3445 this->fail_msg = msg;
3446
3447 if (debug_flag) {
3448 fprintf(stderr, "%s", msg);
3449 }
3450 }
3451
3452 } /* namespace brw */