glsl: Make is_16bit_constant from i965 an ir_constant method.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
281 return src;
282
283 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
284 expanded.type = src.type;
285 emit(MOV(expanded, src));
286 return src_reg(expanded);
287 }
288
289 src_reg
290 vec4_visitor::fix_math_operand(src_reg src)
291 {
292 /* The gen6 math instruction ignores the source modifiers --
293 * swizzle, abs, negate, and at least some parts of the register
294 * region description.
295 *
296 * Rather than trying to enumerate all these cases, *always* expand the
297 * operand to a temp GRF for gen6.
298 *
299 * For gen7, keep the operand as-is, except if immediate, which gen7 still
300 * can't use.
301 */
302
303 if (brw->gen == 7 && src.file != IMM)
304 return src;
305
306 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
307 expanded.type = src.type;
308 emit(MOV(expanded, src));
309 return src_reg(expanded);
310 }
311
312 void
313 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
314 {
315 src = fix_math_operand(src);
316
317 if (dst.writemask != WRITEMASK_XYZW) {
318 /* The gen6 math instruction must be align1, so we can't do
319 * writemasks.
320 */
321 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
322
323 emit(opcode, temp_dst, src);
324
325 emit(MOV(dst, src_reg(temp_dst)));
326 } else {
327 emit(opcode, dst, src);
328 }
329 }
330
331 void
332 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
333 {
334 vec4_instruction *inst = emit(opcode, dst, src);
335 inst->base_mrf = 1;
336 inst->mlen = 1;
337 }
338
339 void
340 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
341 {
342 switch (opcode) {
343 case SHADER_OPCODE_RCP:
344 case SHADER_OPCODE_RSQ:
345 case SHADER_OPCODE_SQRT:
346 case SHADER_OPCODE_EXP2:
347 case SHADER_OPCODE_LOG2:
348 case SHADER_OPCODE_SIN:
349 case SHADER_OPCODE_COS:
350 break;
351 default:
352 assert(!"not reached: bad math opcode");
353 return;
354 }
355
356 if (brw->gen >= 6) {
357 return emit_math1_gen6(opcode, dst, src);
358 } else {
359 return emit_math1_gen4(opcode, dst, src);
360 }
361 }
362
363 void
364 vec4_visitor::emit_math2_gen6(enum opcode opcode,
365 dst_reg dst, src_reg src0, src_reg src1)
366 {
367 src0 = fix_math_operand(src0);
368 src1 = fix_math_operand(src1);
369
370 if (dst.writemask != WRITEMASK_XYZW) {
371 /* The gen6 math instruction must be align1, so we can't do
372 * writemasks.
373 */
374 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
375 temp_dst.type = dst.type;
376
377 emit(opcode, temp_dst, src0, src1);
378
379 emit(MOV(dst, src_reg(temp_dst)));
380 } else {
381 emit(opcode, dst, src0, src1);
382 }
383 }
384
385 void
386 vec4_visitor::emit_math2_gen4(enum opcode opcode,
387 dst_reg dst, src_reg src0, src_reg src1)
388 {
389 vec4_instruction *inst = emit(opcode, dst, src0, src1);
390 inst->base_mrf = 1;
391 inst->mlen = 2;
392 }
393
394 void
395 vec4_visitor::emit_math(enum opcode opcode,
396 dst_reg dst, src_reg src0, src_reg src1)
397 {
398 switch (opcode) {
399 case SHADER_OPCODE_POW:
400 case SHADER_OPCODE_INT_QUOTIENT:
401 case SHADER_OPCODE_INT_REMAINDER:
402 break;
403 default:
404 assert(!"not reached: unsupported binary math opcode");
405 return;
406 }
407
408 if (brw->gen >= 6) {
409 return emit_math2_gen6(opcode, dst, src0, src1);
410 } else {
411 return emit_math2_gen4(opcode, dst, src0, src1);
412 }
413 }
414
415 void
416 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
417 {
418 if (brw->gen < 7)
419 assert(!"ir_unop_pack_half_2x16 should be lowered");
420
421 assert(dst.type == BRW_REGISTER_TYPE_UD);
422 assert(src0.type == BRW_REGISTER_TYPE_F);
423
424 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
425 *
426 * Because this instruction does not have a 16-bit floating-point type,
427 * the destination data type must be Word (W).
428 *
429 * The destination must be DWord-aligned and specify a horizontal stride
430 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
431 * each destination channel and the upper word is not modified.
432 *
433 * The above restriction implies that the f32to16 instruction must use
434 * align1 mode, because only in align1 mode is it possible to specify
435 * horizontal stride. We choose here to defy the hardware docs and emit
436 * align16 instructions.
437 *
438 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
439 * instructions. I was partially successful in that the code passed all
440 * tests. However, the code was dubiously correct and fragile, and the
441 * tests were not harsh enough to probe that frailty. Not trusting the
442 * code, I chose instead to remain in align16 mode in defiance of the hw
443 * docs).
444 *
445 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
446 * simulator, emitting a f32to16 in align16 mode with UD as destination
447 * data type is safe. The behavior differs from that specified in the PRM
448 * in that the upper word of each destination channel is cleared to 0.
449 */
450
451 dst_reg tmp_dst(this, glsl_type::uvec2_type);
452 src_reg tmp_src(tmp_dst);
453
454 #if 0
455 /* Verify the undocumented behavior on which the following instructions
456 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
457 * then the result of the bit-or instruction below will be incorrect.
458 *
459 * You should inspect the disasm output in order to verify that the MOV is
460 * not optimized away.
461 */
462 emit(MOV(tmp_dst, src_reg(0x12345678u)));
463 #endif
464
465 /* Give tmp the form below, where "." means untouched.
466 *
467 * w z y x w z y x
468 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
469 *
470 * That the upper word of each write-channel be 0 is required for the
471 * following bit-shift and bit-or instructions to work. Note that this
472 * relies on the undocumented hardware behavior mentioned above.
473 */
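/* Worked example: if src0.xy = (1.0f, -2.0f), f32to16 leaves
 * tmp.x = 0x00003c00 and tmp.y = 0x0000c000. The SHL of tmp.yyyy by 16
 * below then produces 0xc0000000 in dst, and the final OR with tmp.xxxx
 * yields 0xc0003c00, i.e. packHalf2x16(vec2(1.0, -2.0)).
 */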
474 tmp_dst.writemask = WRITEMASK_XY;
475 emit(F32TO16(tmp_dst, src0));
476
477 /* Give the write-channels of dst the form:
478 * 0xhhhh0000
479 */
480 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
481 emit(SHL(dst, tmp_src, src_reg(16u)));
482
483 /* Finally, give the write-channels of dst the form of packHalf2x16's
484 * output:
485 * 0xhhhhllll
486 */
487 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
488 emit(OR(dst, src_reg(dst), tmp_src));
489 }
490
491 void
492 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
493 {
494 if (brw->gen < 7)
495 assert(!"ir_unop_unpack_half_2x16 should be lowered");
496
497 assert(dst.type == BRW_REGISTER_TYPE_F);
498 assert(src0.type == BRW_REGISTER_TYPE_UD);
499
500 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
501 *
502 * Because this instruction does not have a 16-bit floating-point type,
503 * the source data type must be Word (W). The destination type must be
504 * F (Float).
505 *
506 * To use W as the source data type, we must adjust horizontal strides,
507 * which is only possible in align1 mode. All my [chadv] attempts at
508 * emitting align1 instructions for unpackHalf2x16 failed to pass the
509 * Piglit tests, so I gave up.
510 *
511 * I've verified that, on gen7 hardware and the simulator, it is safe to
512 * emit f16to32 in align16 mode with UD as source data type.
513 */
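/* Worked example: for src0 = 0xc0003c00, the AND and SHR below leave
 * tmp.x = 0x00003c00 and tmp.y = 0x0000c000, and f16to32 then writes
 * dst.xy = (1.0f, -2.0f), matching unpackHalf2x16(0xc0003c00).
 */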
514
515 dst_reg tmp_dst(this, glsl_type::uvec2_type);
516 src_reg tmp_src(tmp_dst);
517
518 tmp_dst.writemask = WRITEMASK_X;
519 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
520
521 tmp_dst.writemask = WRITEMASK_Y;
522 emit(SHR(tmp_dst, src0, src_reg(16u)));
523
524 dst.writemask = WRITEMASK_XY;
525 emit(F16TO32(dst, tmp_src));
526 }
527
528 void
529 vec4_visitor::visit_instructions(const exec_list *list)
530 {
531 foreach_list(node, list) {
532 ir_instruction *ir = (ir_instruction *)node;
533
534 base_ir = ir;
535 ir->accept(this);
536 }
537 }
538
539
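/* Returns the number of vec4-sized slots a variable of the given type
 * occupies in this backend. For example, float, vec2 and vec4 each take
 * one slot, mat3 takes three, vec4[2] takes two, and
 * struct { vec3 v; mat2 m; } takes 1 + 2 = 3 slots.
 */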
540 static int
541 type_size(const struct glsl_type *type)
542 {
543 unsigned int i;
544 int size;
545
546 switch (type->base_type) {
547 case GLSL_TYPE_UINT:
548 case GLSL_TYPE_INT:
549 case GLSL_TYPE_FLOAT:
550 case GLSL_TYPE_BOOL:
551 if (type->is_matrix()) {
552 return type->matrix_columns;
553 } else {
554 /* Regardless of the size of the vector, it gets a vec4. This is bad
555 * packing for things like floats, but otherwise arrays become a
556 * mess. Hopefully a later pass over the code can pack scalars
557 * down if appropriate.
558 */
559 return 1;
560 }
561 case GLSL_TYPE_ARRAY:
562 assert(type->length > 0);
563 return type_size(type->fields.array) * type->length;
564 case GLSL_TYPE_STRUCT:
565 size = 0;
566 for (i = 0; i < type->length; i++) {
567 size += type_size(type->fields.structure[i].type);
568 }
569 return size;
570 case GLSL_TYPE_SAMPLER:
571 /* Samplers take up one slot in UNIFORMS[], but they're baked in
572 * at link time.
573 */
574 return 1;
575 case GLSL_TYPE_ATOMIC_UINT:
576 return 0;
577 case GLSL_TYPE_IMAGE:
578 case GLSL_TYPE_VOID:
579 case GLSL_TYPE_ERROR:
580 case GLSL_TYPE_INTERFACE:
581 assert(0);
582 break;
583 }
584
585 return 0;
586 }
587
588 int
589 vec4_visitor::virtual_grf_alloc(int size)
590 {
591 if (virtual_grf_array_size <= virtual_grf_count) {
592 if (virtual_grf_array_size == 0)
593 virtual_grf_array_size = 16;
594 else
595 virtual_grf_array_size *= 2;
596 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
597 virtual_grf_array_size);
598 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
599 virtual_grf_array_size);
600 }
601 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
602 virtual_grf_reg_count += size;
603 virtual_grf_sizes[virtual_grf_count] = size;
604 return virtual_grf_count++;
605 }
606
607 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
608 {
609 init();
610
611 this->file = GRF;
612 this->reg = v->virtual_grf_alloc(type_size(type));
613
614 if (type->is_array() || type->is_record()) {
615 this->swizzle = BRW_SWIZZLE_NOOP;
616 } else {
617 this->swizzle = swizzle_for_size(type->vector_elements);
618 }
619
620 this->type = brw_type_for_base_type(type);
621 }
622
623 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->virtual_grf_alloc(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->writemask = WRITEMASK_XYZW;
632 } else {
633 this->writemask = (1 << type->vector_elements) - 1;
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 /* Our support for uniforms is piggy-backed on the struct
640 * gl_fragment_program, because that's where the values actually
641 * get stored, rather than in some global gl_shader_program uniform
642 * store.
643 */
644 void
645 vec4_visitor::setup_uniform_values(ir_variable *ir)
646 {
647 int namelen = strlen(ir->name);
648
649 /* The data for our (non-builtin) uniforms is stored in a series of
650 * gl_uniform_driver_storage structs for each subcomponent that
651 * glGetUniformLocation() could name. We know it's been set up in the same
652 * order we'd walk the type, so walk the list of storage and find anything
653 * with our name, or the prefix of a component that starts with our name.
654 */
655 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
656 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
657
658 if (strncmp(ir->name, storage->name, namelen) != 0 ||
659 (storage->name[namelen] != 0 &&
660 storage->name[namelen] != '.' &&
661 storage->name[namelen] != '[')) {
662 continue;
663 }
664
665 gl_constant_value *components = storage->storage;
666 unsigned vector_count = (MAX2(storage->array_elements, 1) *
667 storage->type->matrix_columns);
668
669 for (unsigned s = 0; s < vector_count; s++) {
670 assert(uniforms < uniform_array_size);
671 uniform_vector_size[uniforms] = storage->type->vector_elements;
672
673 int i;
674 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
675 stage_prog_data->param[uniforms * 4 + i] = &components->f;
676 components++;
677 }
678 for (; i < 4; i++) {
679 static float zero = 0;
680 stage_prog_data->param[uniforms * 4 + i] = &zero;
681 }
682
683 uniforms++;
684 }
685 }
686 }
687
688 void
689 vec4_visitor::setup_uniform_clipplane_values()
690 {
691 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
692
693 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
694 assert(this->uniforms < uniform_array_size);
695 this->uniform_vector_size[this->uniforms] = 4;
696 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
697 this->userplane[i].type = BRW_REGISTER_TYPE_F;
698 for (int j = 0; j < 4; ++j) {
699 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
700 }
701 ++this->uniforms;
702 }
703 }
704
705 /* Our support for builtin uniforms is even scarier than non-builtin.
706 * It sits on top of the PROG_STATE_VAR parameters that are
707 * automatically updated from GL context state.
708 */
709 void
710 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
711 {
712 const ir_state_slot *const slots = ir->state_slots;
713 assert(ir->state_slots != NULL);
714
715 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
716 /* This state reference has already been set up by ir_to_mesa,
717 * but we'll get the same index back here. We can reference
718 * ParameterValues directly, since unlike brw_fs.cpp, we never
719 * add new state references during compile.
720 */
721 int index = _mesa_add_state_reference(this->prog->Parameters,
722 (gl_state_index *)slots[i].tokens);
723 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
724
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 0;
727 /* Add each of the unique swizzled channels of the element.
728 * This will end up matching the size of the glsl_type of this field.
729 */
730 int last_swiz = -1;
731 for (unsigned int j = 0; j < 4; j++) {
732 int swiz = GET_SWZ(slots[i].swizzle, j);
733 last_swiz = swiz;
734
735 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
736 assert(this->uniforms < uniform_array_size);
737 if (swiz <= last_swiz)
738 this->uniform_vector_size[this->uniforms]++;
739 }
740 this->uniforms++;
741 }
742 }
743
744 dst_reg *
745 vec4_visitor::variable_storage(ir_variable *var)
746 {
747 return (dst_reg *)hash_table_find(this->variable_ht, var);
748 }
749
750 void
751 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
752 {
753 ir_expression *expr = ir->as_expression();
754
755 *predicate = BRW_PREDICATE_NORMAL;
756
757 if (expr) {
758 src_reg op[2];
759 vec4_instruction *inst;
760
761 assert(expr->get_num_operands() <= 2);
762 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
763 expr->operands[i]->accept(this);
764 op[i] = this->result;
765
766 resolve_ud_negate(&op[i]);
767 }
768
769 switch (expr->operation) {
770 case ir_unop_logic_not:
771 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
772 inst->conditional_mod = BRW_CONDITIONAL_Z;
773 break;
774
775 case ir_binop_logic_xor:
776 inst = emit(XOR(dst_null_d(), op[0], op[1]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 break;
779
780 case ir_binop_logic_or:
781 inst = emit(OR(dst_null_d(), op[0], op[1]));
782 inst->conditional_mod = BRW_CONDITIONAL_NZ;
783 break;
784
785 case ir_binop_logic_and:
786 inst = emit(AND(dst_null_d(), op[0], op[1]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 break;
789
790 case ir_unop_f2b:
791 if (brw->gen >= 6) {
792 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
793 } else {
794 inst = emit(MOV(dst_null_f(), op[0]));
795 inst->conditional_mod = BRW_CONDITIONAL_NZ;
796 }
797 break;
798
799 case ir_unop_i2b:
800 if (brw->gen >= 6) {
801 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
802 } else {
803 inst = emit(MOV(dst_null_d(), op[0]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 }
806 break;
807
808 case ir_binop_all_equal:
809 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
810 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
811 break;
812
813 case ir_binop_any_nequal:
814 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
815 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
816 break;
817
818 case ir_unop_any:
819 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
820 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
821 break;
822
823 case ir_binop_greater:
824 case ir_binop_gequal:
825 case ir_binop_less:
826 case ir_binop_lequal:
827 case ir_binop_equal:
828 case ir_binop_nequal:
829 emit(CMP(dst_null_d(), op[0], op[1],
830 brw_conditional_for_comparison(expr->operation)));
831 break;
832
833 default:
834 assert(!"not reached");
835 break;
836 }
837 return;
838 }
839
840 ir->accept(this);
841
842 resolve_ud_negate(&this->result);
843
844 if (brw->gen >= 6) {
845 vec4_instruction *inst = emit(AND(dst_null_d(),
846 this->result, src_reg(1)));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 } else {
849 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
850 inst->conditional_mod = BRW_CONDITIONAL_NZ;
851 }
852 }
853
854 /**
855 * Emit a gen6 IF statement with the comparison folded into the IF
856 * instruction.
857 */
858 void
859 vec4_visitor::emit_if_gen6(ir_if *ir)
860 {
861 ir_expression *expr = ir->condition->as_expression();
862
863 if (expr) {
864 src_reg op[2];
865 dst_reg temp;
866
867 assert(expr->get_num_operands() <= 2);
868 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
869 expr->operands[i]->accept(this);
870 op[i] = this->result;
871 }
872
873 switch (expr->operation) {
874 case ir_unop_logic_not:
875 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
876 return;
877
878 case ir_binop_logic_xor:
879 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
880 return;
881
882 case ir_binop_logic_or:
883 temp = dst_reg(this, glsl_type::bool_type);
884 emit(OR(temp, op[0], op[1]));
885 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887
888 case ir_binop_logic_and:
889 temp = dst_reg(this, glsl_type::bool_type);
890 emit(AND(temp, op[0], op[1]));
891 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_unop_f2b:
895 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
896 return;
897
898 case ir_unop_i2b:
899 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
900 return;
901
902 case ir_binop_greater:
903 case ir_binop_gequal:
904 case ir_binop_less:
905 case ir_binop_lequal:
906 case ir_binop_equal:
907 case ir_binop_nequal:
908 emit(IF(op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 return;
911
912 case ir_binop_all_equal:
913 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
914 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
915 return;
916
917 case ir_binop_any_nequal:
918 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
919 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
920 return;
921
922 case ir_unop_any:
923 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
924 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
925 return;
926
927 default:
928 assert(!"not reached");
929 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
930 return;
931 }
932 return;
933 }
934
935 ir->condition->accept(this);
936
937 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
938 }
939
940 void
941 vec4_visitor::visit(ir_variable *ir)
942 {
943 dst_reg *reg = NULL;
944
945 if (variable_storage(ir))
946 return;
947
948 switch (ir->data.mode) {
949 case ir_var_shader_in:
950 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
951 break;
952
953 case ir_var_shader_out:
954 reg = new(mem_ctx) dst_reg(this, ir->type);
955
956 for (int i = 0; i < type_size(ir->type); i++) {
957 output_reg[ir->data.location + i] = *reg;
958 output_reg[ir->data.location + i].reg_offset = i;
959 output_reg[ir->data.location + i].type =
960 brw_type_for_base_type(ir->type->get_scalar_type());
961 output_reg_annotation[ir->data.location + i] = ir->name;
962 }
963 break;
964
965 case ir_var_auto:
966 case ir_var_temporary:
967 reg = new(mem_ctx) dst_reg(this, ir->type);
968 break;
969
970 case ir_var_uniform:
971 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
972
973 /* Thanks to the lower_ubo_reference pass, we will see only
974 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
975 * variables, so no need for them to be in variable_ht.
976 *
977 * Atomic counters take no uniform storage, no need to do
978 * anything here.
979 */
980 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
981 return;
982
983 /* Track how big the whole uniform variable is, in case we need to put a
984 * copy of its data into pull constants for array access.
985 */
986 assert(this->uniforms < uniform_array_size);
987 this->uniform_size[this->uniforms] = type_size(ir->type);
988
989 if (!strncmp(ir->name, "gl_", 3)) {
990 setup_builtin_uniform_values(ir);
991 } else {
992 setup_uniform_values(ir);
993 }
994 break;
995
996 case ir_var_system_value:
997 reg = make_reg_for_system_value(ir);
998 break;
999
1000 default:
1001 assert(!"not reached");
1002 }
1003
1004 reg->type = brw_type_for_base_type(ir->type);
1005 hash_table_insert(this->variable_ht, reg, ir);
1006 }
1007
1008 void
1009 vec4_visitor::visit(ir_loop *ir)
1010 {
1011 /* We don't want debugging output to print the whole body of the
1012 * loop as the annotation.
1013 */
1014 this->base_ir = NULL;
1015
1016 emit(BRW_OPCODE_DO);
1017
1018 visit_instructions(&ir->body_instructions);
1019
1020 emit(BRW_OPCODE_WHILE);
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_loop_jump *ir)
1025 {
1026 switch (ir->mode) {
1027 case ir_loop_jump::jump_break:
1028 emit(BRW_OPCODE_BREAK);
1029 break;
1030 case ir_loop_jump::jump_continue:
1031 emit(BRW_OPCODE_CONTINUE);
1032 break;
1033 }
1034 }
1035
1036
1037 void
1038 vec4_visitor::visit(ir_function_signature *ir)
1039 {
1040 assert(0);
1041 (void)ir;
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_function *ir)
1046 {
1047 /* Ignore function bodies other than main() -- we shouldn't see calls to
1048 * them since they should all be inlined.
1049 */
1050 if (strcmp(ir->name, "main") == 0) {
1051 const ir_function_signature *sig;
1052 exec_list empty;
1053
1054 sig = ir->matching_signature(NULL, &empty);
1055
1056 assert(sig);
1057
1058 visit_instructions(&sig->body);
1059 }
1060 }
1061
1062 bool
1063 vec4_visitor::try_emit_sat(ir_expression *ir)
1064 {
1065 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1066 if (!sat_src)
1067 return false;
1068
1069 sat_src->accept(this);
1070 src_reg src = this->result;
1071
1072 this->result = src_reg(this, ir->type);
1073 vec4_instruction *inst;
1074 inst = emit(MOV(dst_reg(this->result), src));
1075 inst->saturate = true;
1076
1077 return true;
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1082 {
1083 /* 3-src instructions were introduced in gen6. */
1084 if (brw->gen < 6)
1085 return false;
1086
1087 /* MAD can only handle floating-point data. */
1088 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1089 return false;
1090
1091 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1092 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1093
1094 if (!mul || mul->operation != ir_binop_mul)
1095 return false;
1096
1097 nonmul->accept(this);
1098 src_reg src0 = fix_3src_operand(this->result);
1099
1100 mul->operands[0]->accept(this);
1101 src_reg src1 = fix_3src_operand(this->result);
1102
1103 mul->operands[1]->accept(this);
1104 src_reg src2 = fix_3src_operand(this->result);
1105
1106 this->result = src_reg(this, ir->type);
1107 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1108
1109 return true;
1110 }
1111
1112 void
1113 vec4_visitor::emit_bool_comparison(unsigned int op,
1114 dst_reg dst, src_reg src0, src_reg src1)
1115 {
1116 /* original gen4 does destination conversion before comparison. */
1117 if (brw->gen < 5)
1118 dst.type = src0.type;
1119
1120 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1121
1122 dst.type = BRW_REGISTER_TYPE_D;
1123 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1124 }
1125
1126 void
1127 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1128 src_reg src0, src_reg src1)
1129 {
1130 vec4_instruction *inst;
1131
1132 if (brw->gen >= 6) {
1133 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1134 inst->conditional_mod = conditionalmod;
1135 } else {
1136 emit(CMP(dst, src0, src1, conditionalmod));
1137
1138 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139 inst->predicate = BRW_PREDICATE_NORMAL;
1140 }
1141 }
1142
1143 void
1144 vec4_visitor::emit_lrp(const dst_reg &dst,
1145 const src_reg &x, const src_reg &y, const src_reg &a)
1146 {
1147 if (brw->gen >= 6) {
1148 /* Note that the instruction's argument order is reversed from GLSL
1149 * and the IR.
1150 */
1151 emit(LRP(dst,
1152 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1153 } else {
1154 /* Earlier generations don't support three source operations, so we
1155 * need to emit x*(1-a) + y*a.
1156 *
1157 * A better way to do this would be:
1158 * ADD one_minus_a, negate(a), 1.0f
1159 * MUL null, y, a
1160 * MAC dst, x, one_minus_a
1161 * but we would need to support MAC and implicit accumulator.
1162 */
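/* Worked example: for x = 2.0, y = 6.0, a = 0.25 the sequence below
 * computes y_times_a = 1.5, one_minus_a = 0.75, x_times_one_minus_a = 1.5,
 * and the final ADD yields 3.0, which equals mix(2.0, 6.0, 0.25).
 */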
1163 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1164 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1165 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1166 y_times_a.writemask = dst.writemask;
1167 one_minus_a.writemask = dst.writemask;
1168 x_times_one_minus_a.writemask = dst.writemask;
1169
1170 emit(MUL(y_times_a, y, a));
1171 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1172 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1173 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1174 }
1175 }
1176
1177 void
1178 vec4_visitor::visit(ir_expression *ir)
1179 {
1180 unsigned int operand;
1181 src_reg op[Elements(ir->operands)];
1182 src_reg result_src;
1183 dst_reg result_dst;
1184 vec4_instruction *inst;
1185
1186 if (try_emit_sat(ir))
1187 return;
1188
1189 if (ir->operation == ir_binop_add) {
1190 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1191 return;
1192 }
1193
1194 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1195 this->result.file = BAD_FILE;
1196 ir->operands[operand]->accept(this);
1197 if (this->result.file == BAD_FILE) {
1198 fprintf(stderr, "Failed to get tree for expression operand:\n");
1199 ir->operands[operand]->fprint(stderr);
1200 exit(1);
1201 }
1202 op[operand] = this->result;
1203
1204 /* Matrix expression operands should have been broken down to vector
1205 * operations already.
1206 */
1207 assert(!ir->operands[operand]->type->is_matrix());
1208 }
1209
1210 int vector_elements = ir->operands[0]->type->vector_elements;
1211 if (ir->operands[1]) {
1212 vector_elements = MAX2(vector_elements,
1213 ir->operands[1]->type->vector_elements);
1214 }
1215
1216 this->result.file = BAD_FILE;
1217
1218 /* Storage for our result. Ideally for an assignment we'd be using
1219 * the actual storage for the result here, instead.
1220 */
1221 result_src = src_reg(this, ir->type);
1222 /* convenience for the emit functions below. */
1223 result_dst = dst_reg(result_src);
1224 /* If nothing special happens, this is the result. */
1225 this->result = result_src;
1226 /* Limit writes to the channels that will be used by result_src later.
1227 * This does limit this temp's use as a temporary for multi-instruction
1228 * sequences.
1229 */
1230 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1231
1232 switch (ir->operation) {
1233 case ir_unop_logic_not:
1234 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes
1235 * the one's complement of the whole register, not just bit 0.
1236 */
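/* For example, NOT of 0x00000001 would give 0xfffffffe, which still
 * reads as "true"; XOR with 1 flips only bit 0, giving 0x00000000.
 */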
1237 emit(XOR(result_dst, op[0], src_reg(1)));
1238 break;
1239 case ir_unop_neg:
1240 op[0].negate = !op[0].negate;
1241 emit(MOV(result_dst, op[0]));
1242 break;
1243 case ir_unop_abs:
1244 op[0].abs = true;
1245 op[0].negate = false;
1246 emit(MOV(result_dst, op[0]));
1247 break;
1248
1249 case ir_unop_sign:
1250 if (ir->type->is_float()) {
1251 /* AND(val, 0x80000000) gives the sign bit.
1252 *
1253 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1254 * zero.
1255 */
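/* For example, op[0] = -2.5f (0xc0200000) keeps sign bit 0x80000000 after
 * the AND, and the predicated OR with 0x3f800000 yields 0xbf800000, i.e.
 * -1.0f. For op[0] = 0.0f the CMP leaves the predicate false, so the
 * result stays 0x00000000 = 0.0f.
 */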
1256 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1257
1258 op[0].type = BRW_REGISTER_TYPE_UD;
1259 result_dst.type = BRW_REGISTER_TYPE_UD;
1260 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1261
1262 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264
1265 this->result.type = BRW_REGISTER_TYPE_F;
1266 } else {
1267 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1268 * -> non-negative val generates 0x00000000.
1269 * Predicated OR sets 1 if val is positive.
1270 */
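/* For example, op[0] = -7 fails the "greater than 0" CMP, so only the
 * ASR runs and produces 0xffffffff (-1); op[0] = 5 passes it, the ASR
 * gives 0, and the predicated OR turns that into 1; op[0] = 0 yields 0.
 */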
1271 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1272
1273 emit(ASR(result_dst, op[0], src_reg(31)));
1274
1275 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1276 inst->predicate = BRW_PREDICATE_NORMAL;
1277 }
1278 break;
1279
1280 case ir_unop_rcp:
1281 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1282 break;
1283
1284 case ir_unop_exp2:
1285 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1286 break;
1287 case ir_unop_log2:
1288 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1289 break;
1290 case ir_unop_exp:
1291 case ir_unop_log:
1292 assert(!"not reached: should be handled by ir_explog_to_explog2");
1293 break;
1294 case ir_unop_sin:
1295 case ir_unop_sin_reduced:
1296 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1297 break;
1298 case ir_unop_cos:
1299 case ir_unop_cos_reduced:
1300 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1301 break;
1302
1303 case ir_unop_dFdx:
1304 case ir_unop_dFdy:
1305 assert(!"derivatives not valid in vertex shader");
1306 break;
1307
1308 case ir_unop_bitfield_reverse:
1309 emit(BFREV(result_dst, op[0]));
1310 break;
1311 case ir_unop_bit_count:
1312 emit(CBIT(result_dst, op[0]));
1313 break;
1314 case ir_unop_find_msb: {
1315 src_reg temp = src_reg(this, glsl_type::uint_type);
1316
1317 inst = emit(FBH(dst_reg(temp), op[0]));
1318 inst->dst.writemask = WRITEMASK_XYZW;
1319
1320 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1321 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1322 * subtract the result from 31 to convert the MSB count into an LSB count.
1323 */
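/* For example, following that conversion, an input of 0x00000010 should
 * make FBH report 27 (counting down from bit 31), and 31 - 27 = 4 is the
 * value findMSB() expects.
 */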
1324
1325 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1326 temp.swizzle = BRW_SWIZZLE_NOOP;
1327 emit(MOV(result_dst, temp));
1328
1329 src_reg src_tmp = src_reg(result_dst);
1330 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1331
1332 src_tmp.negate = true;
1333 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1334 inst->predicate = BRW_PREDICATE_NORMAL;
1335 break;
1336 }
1337 case ir_unop_find_lsb:
1338 emit(FBL(result_dst, op[0]));
1339 break;
1340
1341 case ir_unop_noise:
1342 assert(!"not reached: should be handled by lower_noise");
1343 break;
1344
1345 case ir_binop_add:
1346 emit(ADD(result_dst, op[0], op[1]));
1347 break;
1348 case ir_binop_sub:
1349 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1350 break;
1351
1352 case ir_binop_mul:
1353 if (brw->gen < 8 && ir->type->is_integer()) {
1354 /* For integer multiplication, the MUL uses the low 16 bits of one of
1355 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1356 * accumulates in the contribution of the upper 16 bits of that
1357 * operand. If we can determine that one of the args is in the low
1358 * 16 bits, though, we can just emit a single MUL.
1359 */
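/* For example, when multiplying by a literal 7u, its upper 16 bits are
 * zero, so the low-16-bit MUL already produces the full 32-bit product
 * and the MACH/accumulator sequence below can be skipped.
 */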
1360 if (ir->operands[0]->is_uint16_constant()) {
1361 if (brw->gen < 7)
1362 emit(MUL(result_dst, op[0], op[1]));
1363 else
1364 emit(MUL(result_dst, op[1], op[0]));
1365 } else if (ir->operands[1]->is_uint16_constant()) {
1366 if (brw->gen < 7)
1367 emit(MUL(result_dst, op[1], op[0]));
1368 else
1369 emit(MUL(result_dst, op[0], op[1]));
1370 } else {
1371 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1372
1373 emit(MUL(acc, op[0], op[1]));
1374 emit(MACH(dst_null_d(), op[0], op[1]));
1375 emit(MOV(result_dst, src_reg(acc)));
1376 }
1377 } else {
1378 emit(MUL(result_dst, op[0], op[1]));
1379 }
1380 break;
1381 case ir_binop_imul_high: {
1382 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1383
1384 emit(MUL(acc, op[0], op[1]));
1385 emit(MACH(result_dst, op[0], op[1]));
1386 break;
1387 }
1388 case ir_binop_div:
1389 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1390 assert(ir->type->is_integer());
1391 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1392 break;
1393 case ir_binop_carry: {
1394 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1395
1396 emit(ADDC(dst_null_ud(), op[0], op[1]));
1397 emit(MOV(result_dst, src_reg(acc)));
1398 break;
1399 }
1400 case ir_binop_borrow: {
1401 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1402
1403 emit(SUBB(dst_null_ud(), op[0], op[1]));
1404 emit(MOV(result_dst, src_reg(acc)));
1405 break;
1406 }
1407 case ir_binop_mod:
1408 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1409 assert(ir->type->is_integer());
1410 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1411 break;
1412
1413 case ir_binop_less:
1414 case ir_binop_greater:
1415 case ir_binop_lequal:
1416 case ir_binop_gequal:
1417 case ir_binop_equal:
1418 case ir_binop_nequal: {
1419 emit(CMP(result_dst, op[0], op[1],
1420 brw_conditional_for_comparison(ir->operation)));
1421 emit(AND(result_dst, result_src, src_reg(0x1)));
1422 break;
1423 }
1424
1425 case ir_binop_all_equal:
1426 /* "==" operator producing a scalar boolean. */
1427 if (ir->operands[0]->type->is_vector() ||
1428 ir->operands[1]->type->is_vector()) {
1429 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1430 emit(MOV(result_dst, src_reg(0)));
1431 inst = emit(MOV(result_dst, src_reg(1)));
1432 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1433 } else {
1434 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1435 emit(AND(result_dst, result_src, src_reg(0x1)));
1436 }
1437 break;
1438 case ir_binop_any_nequal:
1439 /* "!=" operator producing a scalar boolean. */
1440 if (ir->operands[0]->type->is_vector() ||
1441 ir->operands[1]->type->is_vector()) {
1442 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1443
1444 emit(MOV(result_dst, src_reg(0)));
1445 inst = emit(MOV(result_dst, src_reg(1)));
1446 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1447 } else {
1448 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1449 emit(AND(result_dst, result_src, src_reg(0x1)));
1450 }
1451 break;
1452
1453 case ir_unop_any:
1454 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1455 emit(MOV(result_dst, src_reg(0)));
1456
1457 inst = emit(MOV(result_dst, src_reg(1)));
1458 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1459 break;
1460
1461 case ir_binop_logic_xor:
1462 emit(XOR(result_dst, op[0], op[1]));
1463 break;
1464
1465 case ir_binop_logic_or:
1466 emit(OR(result_dst, op[0], op[1]));
1467 break;
1468
1469 case ir_binop_logic_and:
1470 emit(AND(result_dst, op[0], op[1]));
1471 break;
1472
1473 case ir_binop_dot:
1474 assert(ir->operands[0]->type->is_vector());
1475 assert(ir->operands[0]->type == ir->operands[1]->type);
1476 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1477 break;
1478
1479 case ir_unop_sqrt:
1480 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1481 break;
1482 case ir_unop_rsq:
1483 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1484 break;
1485
1486 case ir_unop_bitcast_i2f:
1487 case ir_unop_bitcast_u2f:
1488 this->result = op[0];
1489 this->result.type = BRW_REGISTER_TYPE_F;
1490 break;
1491
1492 case ir_unop_bitcast_f2i:
1493 this->result = op[0];
1494 this->result.type = BRW_REGISTER_TYPE_D;
1495 break;
1496
1497 case ir_unop_bitcast_f2u:
1498 this->result = op[0];
1499 this->result.type = BRW_REGISTER_TYPE_UD;
1500 break;
1501
1502 case ir_unop_i2f:
1503 case ir_unop_i2u:
1504 case ir_unop_u2i:
1505 case ir_unop_u2f:
1506 case ir_unop_b2f:
1507 case ir_unop_b2i:
1508 case ir_unop_f2i:
1509 case ir_unop_f2u:
1510 emit(MOV(result_dst, op[0]));
1511 break;
1512 case ir_unop_f2b:
1513 case ir_unop_i2b: {
1514 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1515 emit(AND(result_dst, result_src, src_reg(1)));
1516 break;
1517 }
1518
1519 case ir_unop_trunc:
1520 emit(RNDZ(result_dst, op[0]));
1521 break;
1522 case ir_unop_ceil:
1523 op[0].negate = !op[0].negate;
1524 inst = emit(RNDD(result_dst, op[0]));
1525 this->result.negate = true;
1526 break;
1527 case ir_unop_floor:
1528 inst = emit(RNDD(result_dst, op[0]));
1529 break;
1530 case ir_unop_fract:
1531 inst = emit(FRC(result_dst, op[0]));
1532 break;
1533 case ir_unop_round_even:
1534 emit(RNDE(result_dst, op[0]));
1535 break;
1536
1537 case ir_binop_min:
1538 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1539 break;
1540 case ir_binop_max:
1541 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1542 break;
1543
1544 case ir_binop_pow:
1545 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1546 break;
1547
1548 case ir_unop_bit_not:
1549 inst = emit(NOT(result_dst, op[0]));
1550 break;
1551 case ir_binop_bit_and:
1552 inst = emit(AND(result_dst, op[0], op[1]));
1553 break;
1554 case ir_binop_bit_xor:
1555 inst = emit(XOR(result_dst, op[0], op[1]));
1556 break;
1557 case ir_binop_bit_or:
1558 inst = emit(OR(result_dst, op[0], op[1]));
1559 break;
1560
1561 case ir_binop_lshift:
1562 inst = emit(SHL(result_dst, op[0], op[1]));
1563 break;
1564
1565 case ir_binop_rshift:
1566 if (ir->type->base_type == GLSL_TYPE_INT)
1567 inst = emit(ASR(result_dst, op[0], op[1]));
1568 else
1569 inst = emit(SHR(result_dst, op[0], op[1]));
1570 break;
1571
1572 case ir_binop_bfm:
1573 emit(BFI1(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_ubo_load: {
1577 ir_constant *uniform_block = ir->operands[0]->as_constant();
1578 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1579 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1580 src_reg offset;
1581
1582 /* Now, load the vector from that offset. */
1583 assert(ir->type->is_vector() || ir->type->is_scalar());
1584
1585 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1586 packed_consts.type = result.type;
1587 src_reg surf_index =
1588 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1589 if (const_offset_ir) {
1590 if (brw->gen >= 8) {
1591 /* Store the offset in a GRF so we can send-from-GRF. */
1592 offset = src_reg(this, glsl_type::int_type);
1593 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1594 } else {
1595 /* Immediates are fine on older generations since they'll be moved
1596 * to a (potentially fake) MRF at the generator level.
1597 */
1598 offset = src_reg(const_offset / 16);
1599 }
1600 } else {
1601 offset = src_reg(this, glsl_type::uint_type);
1602 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1603 }
1604
1605 if (brw->gen >= 7) {
1606 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1607 grf_offset.type = offset.type;
1608
1609 emit(MOV(grf_offset, offset));
1610
1611 emit(new(mem_ctx) vec4_instruction(this,
1612 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1613 dst_reg(packed_consts),
1614 surf_index,
1615 src_reg(grf_offset)));
1616 } else {
1617 vec4_instruction *pull =
1618 emit(new(mem_ctx) vec4_instruction(this,
1619 VS_OPCODE_PULL_CONSTANT_LOAD,
1620 dst_reg(packed_consts),
1621 surf_index,
1622 offset));
1623 pull->base_mrf = 14;
1624 pull->mlen = 1;
1625 }
1626
1627 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1628 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1629 const_offset % 16 / 4,
1630 const_offset % 16 / 4,
1631 const_offset % 16 / 4);
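/* For example, a constant byte offset of 20 fetches the vec4 at offset
 * 20 / 16 = 1, and (20 % 16) / 4 = 1 shifts the swizzle up by one
 * channel, so a scalar load ends up reading the Y component of that
 * vec4.
 */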
1632
1633 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1634 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1635 emit(CMP(result_dst, packed_consts, src_reg(0u),
1636 BRW_CONDITIONAL_NZ));
1637 emit(AND(result_dst, result, src_reg(0x1)));
1638 } else {
1639 emit(MOV(result_dst, packed_consts));
1640 }
1641 break;
1642 }
1643
1644 case ir_binop_vector_extract:
1645 assert(!"should have been lowered by vec_index_to_cond_assign");
1646 break;
1647
1648 case ir_triop_fma:
1649 op[0] = fix_3src_operand(op[0]);
1650 op[1] = fix_3src_operand(op[1]);
1651 op[2] = fix_3src_operand(op[2]);
1652 /* Note that the instruction's argument order is reversed from GLSL
1653 * and the IR.
1654 */
1655 emit(MAD(result_dst, op[2], op[1], op[0]));
1656 break;
1657
1658 case ir_triop_lrp:
1659 emit_lrp(result_dst, op[0], op[1], op[2]);
1660 break;
1661
1662 case ir_triop_csel:
1663 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1664 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1665 inst->predicate = BRW_PREDICATE_NORMAL;
1666 break;
1667
1668 case ir_triop_bfi:
1669 op[0] = fix_3src_operand(op[0]);
1670 op[1] = fix_3src_operand(op[1]);
1671 op[2] = fix_3src_operand(op[2]);
1672 emit(BFI2(result_dst, op[0], op[1], op[2]));
1673 break;
1674
1675 case ir_triop_bitfield_extract:
1676 op[0] = fix_3src_operand(op[0]);
1677 op[1] = fix_3src_operand(op[1]);
1678 op[2] = fix_3src_operand(op[2]);
1679 /* Note that the instruction's argument order is reversed from GLSL
1680 * and the IR.
1681 */
1682 emit(BFE(result_dst, op[2], op[1], op[0]));
1683 break;
1684
1685 case ir_triop_vector_insert:
1686 assert(!"should have been lowered by lower_vector_insert");
1687 break;
1688
1689 case ir_quadop_bitfield_insert:
1690 assert(!"not reached: should be handled by "
1691 "bitfield_insert_to_bfm_bfi\n");
1692 break;
1693
1694 case ir_quadop_vector:
1695 assert(!"not reached: should be handled by lower_quadop_vector");
1696 break;
1697
1698 case ir_unop_pack_half_2x16:
1699 emit_pack_half_2x16(result_dst, op[0]);
1700 break;
1701 case ir_unop_unpack_half_2x16:
1702 emit_unpack_half_2x16(result_dst, op[0]);
1703 break;
1704 case ir_unop_pack_snorm_2x16:
1705 case ir_unop_pack_snorm_4x8:
1706 case ir_unop_pack_unorm_2x16:
1707 case ir_unop_pack_unorm_4x8:
1708 case ir_unop_unpack_snorm_2x16:
1709 case ir_unop_unpack_snorm_4x8:
1710 case ir_unop_unpack_unorm_2x16:
1711 case ir_unop_unpack_unorm_4x8:
1712 assert(!"not reached: should be handled by lower_packing_builtins");
1713 break;
1714 case ir_unop_unpack_half_2x16_split_x:
1715 case ir_unop_unpack_half_2x16_split_y:
1716 case ir_binop_pack_half_2x16_split:
1717 assert(!"not reached: should not occur in vertex shader");
1718 break;
1719 case ir_binop_ldexp:
1720 assert(!"not reached: should be handled by ldexp_to_arith()");
1721 break;
1722 }
1723 }
1724
1725
1726 void
1727 vec4_visitor::visit(ir_swizzle *ir)
1728 {
1729 src_reg src;
1730 int i = 0;
1731 int swizzle[4];
1732
1733 /* Note that this is only swizzles in expressions, not those on the left
1734 * hand side of an assignment, which do write masking. See ir_assignment
1735 * for that.
1736 */
1737
1738 ir->val->accept(this);
1739 src = this->result;
1740 assert(src.file != BAD_FILE);
1741
1742 for (i = 0; i < ir->type->vector_elements; i++) {
1743 switch (i) {
1744 case 0:
1745 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1746 break;
1747 case 1:
1748 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1749 break;
1750 case 2:
1751 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1752 break;
1753 case 3:
1754 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1755 break;
1756 }
1757 }
1758 for (; i < 4; i++) {
1759 /* Replicate the last channel out. */
1760 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1761 }
1762
1763 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1764
1765 this->result = src;
1766 }
1767
1768 void
1769 vec4_visitor::visit(ir_dereference_variable *ir)
1770 {
1771 const struct glsl_type *type = ir->type;
1772 dst_reg *reg = variable_storage(ir->var);
1773
1774 if (!reg) {
1775 fail("Failed to find variable storage for %s\n", ir->var->name);
1776 this->result = src_reg(brw_null_reg());
1777 return;
1778 }
1779
1780 this->result = src_reg(*reg);
1781
1782 /* System values get their swizzle from the dst_reg writemask */
1783 if (ir->var->data.mode == ir_var_system_value)
1784 return;
1785
1786 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1787 this->result.swizzle = swizzle_for_size(type->vector_elements);
1788 }
1789
1790
1791 int
1792 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1793 {
1794 /* Under normal circumstances array elements are stored consecutively, so
1795 * the stride is equal to the size of the array element.
1796 */
1797 return type_size(ir->type);
1798 }
1799
1800
1801 void
1802 vec4_visitor::visit(ir_dereference_array *ir)
1803 {
1804 ir_constant *constant_index;
1805 src_reg src;
1806 int array_stride = compute_array_stride(ir);
1807
1808 constant_index = ir->array_index->constant_expression_value();
1809
1810 ir->array->accept(this);
1811 src = this->result;
1812
1813 if (constant_index) {
1814 src.reg_offset += constant_index->value.i[0] * array_stride;
1815 } else {
1816 /* Variable index array dereference. It eats the "vec4" of the
1817 * base of the array and an index that offsets the Mesa register
1818 * index.
1819 */
1820 ir->array_index->accept(this);
1821
1822 src_reg index_reg;
1823
1824 if (array_stride == 1) {
1825 index_reg = this->result;
1826 } else {
1827 index_reg = src_reg(this, glsl_type::int_type);
1828
1829 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1830 }
1831
1832 if (src.reladdr) {
1833 src_reg temp = src_reg(this, glsl_type::int_type);
1834
1835 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1836
1837 index_reg = temp;
1838 }
1839
1840 src.reladdr = ralloc(mem_ctx, src_reg);
1841 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1842 }
1843
1844 /* If the type is smaller than a vec4, replicate the last channel out. */
1845 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1846 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1847 else
1848 src.swizzle = BRW_SWIZZLE_NOOP;
1849 src.type = brw_type_for_base_type(ir->type);
1850
1851 this->result = src;
1852 }
1853
1854 void
1855 vec4_visitor::visit(ir_dereference_record *ir)
1856 {
1857 unsigned int i;
1858 const glsl_type *struct_type = ir->record->type;
1859 int offset = 0;
1860
1861 ir->record->accept(this);
1862
1863 for (i = 0; i < struct_type->length; i++) {
1864 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1865 break;
1866 offset += type_size(struct_type->fields.structure[i].type);
1867 }
1868
1869 /* If the type is smaller than a vec4, replicate the last channel out. */
1870 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1871 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1872 else
1873 this->result.swizzle = BRW_SWIZZLE_NOOP;
1874 this->result.type = brw_type_for_base_type(ir->type);
1875
1876 this->result.reg_offset += offset;
1877 }
1878
1879 /**
1880 * We want to be careful in assignment setup to hit the actual storage
1881 * instead of potentially using a temporary like we might with the
1882 * ir_dereference handler.
1883 */
1884 static dst_reg
1885 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1886 {
1887 /* The LHS must be a dereference. If the LHS is a variable indexed array
1888 * access of a vector, it must be separated into a series conditional moves
1889 * before reaching this point (see ir_vec_index_to_cond_assign).
1890 */
1891 assert(ir->as_dereference());
1892 ir_dereference_array *deref_array = ir->as_dereference_array();
1893 if (deref_array) {
1894 assert(!deref_array->array->type->is_vector());
1895 }
1896
1897 /* Use the rvalue deref handler for the most part. We'll ignore
1898 * swizzles in it and write swizzles using writemask, though.
1899 */
1900 ir->accept(v);
1901 return dst_reg(v->result);
1902 }
1903
1904 void
1905 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1906 const struct glsl_type *type, uint32_t predicate)
1907 {
1908 if (type->base_type == GLSL_TYPE_STRUCT) {
1909 for (unsigned int i = 0; i < type->length; i++) {
1910 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1911 }
1912 return;
1913 }
1914
1915 if (type->is_array()) {
1916 for (unsigned int i = 0; i < type->length; i++) {
1917 emit_block_move(dst, src, type->fields.array, predicate);
1918 }
1919 return;
1920 }
1921
1922 if (type->is_matrix()) {
1923 const struct glsl_type *vec_type;
1924
1925 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1926 type->vector_elements, 1);
1927
1928 for (int i = 0; i < type->matrix_columns; i++) {
1929 emit_block_move(dst, src, vec_type, predicate);
1930 }
1931 return;
1932 }
1933
1934 assert(type->is_scalar() || type->is_vector());
1935
1936 dst->type = brw_type_for_base_type(type);
1937 src->type = dst->type;
1938
1939 dst->writemask = (1 << type->vector_elements) - 1;
1940
1941 src->swizzle = swizzle_for_size(type->vector_elements);
1942
1943 vec4_instruction *inst = emit(MOV(*dst, *src));
1944 inst->predicate = predicate;
1945
1946 dst->reg_offset++;
1947 src->reg_offset++;
1948 }
1949
1950
1951 /* If the RHS processing resulted in an instruction generating a
1952 * temporary value, and it would be easy to rewrite the instruction to
1953 * generate its result right into the LHS instead, do so. This ends
1954 * up reliably removing instructions where it can be tricky to do so
1955 * later without real UD chain information.
1956 */
1957 bool
1958 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1959 dst_reg dst,
1960 src_reg src,
1961 vec4_instruction *pre_rhs_inst,
1962 vec4_instruction *last_rhs_inst)
1963 {
1964 /* This could be supported, but it would take more smarts. */
1965 if (ir->condition)
1966 return false;
1967
1968 if (pre_rhs_inst == last_rhs_inst)
1969 return false; /* No instructions generated to work with. */
1970
1971 /* Make sure the last instruction generated our source reg. */
1972 if (src.file != GRF ||
1973 src.file != last_rhs_inst->dst.file ||
1974 src.reg != last_rhs_inst->dst.reg ||
1975 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1976 src.reladdr ||
1977 src.abs ||
1978 src.negate ||
1979 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1980 return false;
1981
1982    /* Check that the last instruction fully initialized the channels
1983 * we want to use, in the order we want to use them. We could
1984 * potentially reswizzle the operands of many instructions so that
1985 * we could handle out of order channels, but don't yet.
1986 */
1987
1988 for (unsigned i = 0; i < 4; i++) {
1989 if (dst.writemask & (1 << i)) {
1990 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1991 return false;
1992
1993 if (BRW_GET_SWZ(src.swizzle, i) != i)
1994 return false;
1995 }
1996 }
1997
1998 /* Success! Rewrite the instruction. */
1999 last_rhs_inst->dst.file = dst.file;
2000 last_rhs_inst->dst.reg = dst.reg;
2001 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2002 last_rhs_inst->dst.reladdr = dst.reladdr;
2003 last_rhs_inst->dst.writemask &= dst.writemask;
2004
2005 return true;
2006 }
2007
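/**
 * Struct/array/matrix assignments are lowered to block moves; scalar and
 * vector assignments remap the RHS swizzle onto the written channels, try to
 * fold the RHS-producing instruction's destination straight into the LHS via
 * try_rewrite_rhs_to_dst(), and otherwise emit (possibly predicated) MOVs.
 */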
2008 void
2009 vec4_visitor::visit(ir_assignment *ir)
2010 {
2011 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2012 uint32_t predicate = BRW_PREDICATE_NONE;
2013
2014 if (!ir->lhs->type->is_scalar() &&
2015 !ir->lhs->type->is_vector()) {
2016 ir->rhs->accept(this);
2017 src_reg src = this->result;
2018
2019 if (ir->condition) {
2020 emit_bool_to_cond_code(ir->condition, &predicate);
2021 }
2022
2023       /* emit_block_move doesn't account for swizzles in the source register.
2024        * This should be ok, since the source is a structure, array or matrix,
2025        * whose only legal swizzle is the one checked by the assert below.
2026        */
2027 assert(src.swizzle ==
2028 (ir->rhs->type->is_matrix()
2029 ? swizzle_for_size(ir->rhs->type->vector_elements)
2030 : BRW_SWIZZLE_NOOP));
2031
2032 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2033 return;
2034 }
2035
2036 /* Now we're down to just a scalar/vector with writemasks. */
2037 int i;
2038
2039 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2040 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2041
2042 ir->rhs->accept(this);
2043
2044 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2045
2046 src_reg src = this->result;
2047
2048 int swizzles[4];
2049 int first_enabled_chan = 0;
2050 int src_chan = 0;
2051
2052 assert(ir->lhs->type->is_vector() ||
2053 ir->lhs->type->is_scalar());
2054 dst.writemask = ir->write_mask;
2055
2056 for (int i = 0; i < 4; i++) {
2057 if (dst.writemask & (1 << i)) {
2058 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2059 break;
2060 }
2061 }
2062
2063 /* Swizzle a small RHS vector into the channels being written.
2064 *
2065 * glsl ir treats write_mask as dictating how many channels are
2066 * present on the RHS while in our instructions we need to make
2067 * those channels appear in the slots of the vec4 they're written to.
2068 */
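   /* For example (illustrative): for an assignment like "v.yw = u.xy", the
    * vec2 RHS typically arrives with swizzle .xyyy and write_mask YW; the
    * loop below builds .yxyy so dst.y reads src.x and dst.w reads src.y,
    * with the unwritten channels just repeating the first written one.
    */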
2069 for (int i = 0; i < 4; i++) {
2070 if (dst.writemask & (1 << i))
2071 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2072 else
2073 swizzles[i] = first_enabled_chan;
2074 }
2075 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2076 swizzles[2], swizzles[3]);
2077
2078 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2079 return;
2080 }
2081
2082 if (ir->condition) {
2083 emit_bool_to_cond_code(ir->condition, &predicate);
2084 }
2085
2086 for (i = 0; i < type_size(ir->lhs->type); i++) {
2087 vec4_instruction *inst = emit(MOV(dst, src));
2088 inst->predicate = predicate;
2089
2090 dst.reg_offset++;
2091 src.reg_offset++;
2092 }
2093 }
2094
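/**
 * Emit immediate MOVs for an ir_constant, recursing through structs, arrays
 * and matrix columns.  For vectors, components holding the same value are
 * folded into a single writemasked MOV.
 */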
2095 void
2096 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2097 {
2098 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2099 foreach_list(node, &ir->components) {
2100 ir_constant *field_value = (ir_constant *)node;
2101
2102 emit_constant_values(dst, field_value);
2103 }
2104 return;
2105 }
2106
2107 if (ir->type->is_array()) {
2108 for (unsigned int i = 0; i < ir->type->length; i++) {
2109 emit_constant_values(dst, ir->array_elements[i]);
2110 }
2111 return;
2112 }
2113
2114 if (ir->type->is_matrix()) {
2115 for (int i = 0; i < ir->type->matrix_columns; i++) {
2116 float *vec = &ir->value.f[i * ir->type->vector_elements];
2117
2118 for (int j = 0; j < ir->type->vector_elements; j++) {
2119 dst->writemask = 1 << j;
2120 dst->type = BRW_REGISTER_TYPE_F;
2121
2122 emit(MOV(*dst, src_reg(vec[j])));
2123 }
2124 dst->reg_offset++;
2125 }
2126 return;
2127 }
2128
2129 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2130
2131 for (int i = 0; i < ir->type->vector_elements; i++) {
2132 if (!(remaining_writemask & (1 << i)))
2133 continue;
2134
2135 dst->writemask = 1 << i;
2136 dst->type = brw_type_for_base_type(ir->type);
2137
2138 /* Find other components that match the one we're about to
2139 * write. Emits fewer instructions for things like vec4(0.5,
2140 * 1.5, 1.5, 1.5).
2141 */
2142 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2143 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2144 if (ir->value.b[i] == ir->value.b[j])
2145 dst->writemask |= (1 << j);
2146 } else {
2147 /* u, i, and f storage all line up, so no need for a
2148 * switch case for comparing each type.
2149 */
2150 if (ir->value.u[i] == ir->value.u[j])
2151 dst->writemask |= (1 << j);
2152 }
2153 }
2154
2155 switch (ir->type->base_type) {
2156 case GLSL_TYPE_FLOAT:
2157 emit(MOV(*dst, src_reg(ir->value.f[i])));
2158 break;
2159 case GLSL_TYPE_INT:
2160 emit(MOV(*dst, src_reg(ir->value.i[i])));
2161 break;
2162 case GLSL_TYPE_UINT:
2163 emit(MOV(*dst, src_reg(ir->value.u[i])));
2164 break;
2165 case GLSL_TYPE_BOOL:
2166 emit(MOV(*dst, src_reg(ir->value.b[i])));
2167 break;
2168 default:
2169 assert(!"Non-float/uint/int/bool constant");
2170 break;
2171 }
2172
2173 remaining_writemask &= ~dst->writemask;
2174 }
2175 dst->reg_offset++;
2176 }
2177
2178 void
2179 vec4_visitor::visit(ir_constant *ir)
2180 {
2181 dst_reg dst = dst_reg(this, ir->type);
2182 this->result = src_reg(dst);
2183
2184 emit_constant_values(&dst, ir);
2185 }
2186
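/**
 * Lower the atomic counter intrinsics to untyped surface messages: the
 * counter's binding table entry is abo_start plus its buffer index, the
 * surface offset is the (optional) array index times ATOMIC_COUNTER_SIZE plus
 * the counter's offset, and the callee name selects either an untyped surface
 * read or an INC/PREDEC untyped atomic.
 */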
2187 void
2188 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2189 {
2190 ir_dereference *deref = static_cast<ir_dereference *>(
2191 ir->actual_parameters.get_head());
2192 ir_variable *location = deref->variable_referenced();
2193 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2194 location->data.atomic.buffer_index);
2195
2196 /* Calculate the surface offset */
2197 src_reg offset(this, glsl_type::uint_type);
2198 ir_dereference_array *deref_array = deref->as_dereference_array();
2199 if (deref_array) {
2200 deref_array->array_index->accept(this);
2201
2202 src_reg tmp(this, glsl_type::uint_type);
2203 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2204 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2205 } else {
2206 offset = location->data.atomic.offset;
2207 }
2208
2209 /* Emit the appropriate machine instruction */
2210 const char *callee = ir->callee->function_name();
2211 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2212
2213 if (!strcmp("__intrinsic_atomic_read", callee)) {
2214 emit_untyped_surface_read(surf_index, dst, offset);
2215
2216 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2217 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2218 src_reg(), src_reg());
2219
2220 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2221 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2222 src_reg(), src_reg());
2223 }
2224 }
2225
2226 void
2227 vec4_visitor::visit(ir_call *ir)
2228 {
2229 const char *callee = ir->callee->function_name();
2230
2231 if (!strcmp("__intrinsic_atomic_read", callee) ||
2232 !strcmp("__intrinsic_atomic_increment", callee) ||
2233 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2234 visit_atomic_counter_intrinsic(ir);
2235 } else {
2236 assert(!"Unsupported intrinsic.");
2237 }
2238 }
2239
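/**
 * Fetch the MCS (multisample control surface) word for a texel, used by the
 * SHADER_OPCODE_TXF_CMS path on Gen7+ compressed multisample surfaces.
 * Returns a uvec4 temporary holding the SHADER_OPCODE_TXF_MCS result.
 */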
2240 src_reg
2241 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2242 {
2243 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2244 inst->base_mrf = 2;
2245 inst->mlen = 1;
2246 inst->sampler = sampler;
2247 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2248 inst->dst.writemask = WRITEMASK_XYZW;
2249
2250 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2251 int param_base = inst->base_mrf;
2252 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2253 int zero_mask = 0xf & ~coord_mask;
2254
2255 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2256 coordinate));
2257
2258 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2259 src_reg(0)));
2260
2261 emit(inst);
2262 return src_reg(inst->dst);
2263 }
2264
2265 void
2266 vec4_visitor::visit(ir_texture *ir)
2267 {
2268 int sampler =
2269 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2270
2271 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2272 * emitting anything other than setting up the constant result.
2273 */
2274 if (ir->op == ir_tg4) {
2275 ir_constant *chan = ir->lod_info.component->as_constant();
2276 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2277 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2278 dst_reg result(this, ir->type);
2279 this->result = src_reg(result);
2280 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2281 return;
2282 }
2283 }
2284
2285 /* Should be lowered by do_lower_texture_projection */
2286 assert(!ir->projector);
2287
2288 /* Should be lowered */
2289 assert(!ir->offset || !ir->offset->type->is_array());
2290
2291 /* Generate code to compute all the subexpression trees. This has to be
2292 * done before loading any values into MRFs for the sampler message since
2293 * generating these values may involve SEND messages that need the MRFs.
2294 */
2295 src_reg coordinate;
2296 if (ir->coordinate) {
2297 ir->coordinate->accept(this);
2298 coordinate = this->result;
2299 }
2300
2301 src_reg shadow_comparitor;
2302 if (ir->shadow_comparitor) {
2303 ir->shadow_comparitor->accept(this);
2304 shadow_comparitor = this->result;
2305 }
2306
2307 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2308 src_reg offset_value;
2309 if (has_nonconstant_offset) {
2310 ir->offset->accept(this);
2311 offset_value = src_reg(this->result);
2312 }
2313
2314 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2315 src_reg lod, dPdx, dPdy, sample_index, mcs;
2316 switch (ir->op) {
2317 case ir_tex:
2318 lod = src_reg(0.0f);
2319 lod_type = glsl_type::float_type;
2320 break;
2321 case ir_txf:
2322 case ir_txl:
2323 case ir_txs:
2324 ir->lod_info.lod->accept(this);
2325 lod = this->result;
2326 lod_type = ir->lod_info.lod->type;
2327 break;
2328 case ir_query_levels:
2329 lod = src_reg(0);
2330 lod_type = glsl_type::int_type;
2331 break;
2332 case ir_txf_ms:
2333 ir->lod_info.sample_index->accept(this);
2334 sample_index = this->result;
2335 sample_index_type = ir->lod_info.sample_index->type;
2336
2337 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2338 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2339 else
2340 mcs = src_reg(0u);
2341 break;
2342 case ir_txd:
2343 ir->lod_info.grad.dPdx->accept(this);
2344 dPdx = this->result;
2345
2346 ir->lod_info.grad.dPdy->accept(this);
2347 dPdy = this->result;
2348
2349 lod_type = ir->lod_info.grad.dPdx->type;
2350 break;
2351 case ir_txb:
2352 case ir_lod:
2353 case ir_tg4:
2354 break;
2355 }
2356
2357 vec4_instruction *inst = NULL;
2358 switch (ir->op) {
2359 case ir_tex:
2360 case ir_txl:
2361 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2362 break;
2363 case ir_txd:
2364 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2365 break;
2366 case ir_txf:
2367 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2368 break;
2369 case ir_txf_ms:
2370 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2371 break;
2372 case ir_txs:
2373 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2374 break;
2375 case ir_tg4:
2376 if (has_nonconstant_offset)
2377 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2378 else
2379 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2380 break;
2381 case ir_query_levels:
2382 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2383 break;
2384 case ir_txb:
2385 assert(!"TXB is not valid for vertex shaders.");
2386 break;
2387 case ir_lod:
2388 assert(!"LOD is not valid for vertex shaders.");
2389 break;
2390 default:
2391 assert(!"Unrecognized tex op");
2392 }
2393
2394 if (ir->offset != NULL && ir->op != ir_txf)
2395 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2396
2397 /* Stuff the channel select bits in the top of the texture offset */
2398 if (ir->op == ir_tg4)
2399 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2400
2401 /* The message header is necessary for:
2402 * - Gen4 (always)
2403 * - Texel offsets
2404 * - Gather channel selection
2405 * - Sampler indices too large to fit in a 4-bit value.
2406 */
2407 inst->header_present =
2408 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2409 sampler >= 16;
2410 inst->base_mrf = 2;
2411 inst->mlen = inst->header_present + 1; /* always at least one */
2412 inst->sampler = sampler;
2413 inst->dst = dst_reg(this, ir->type);
2414 inst->dst.writemask = WRITEMASK_XYZW;
2415 inst->shadow_compare = ir->shadow_comparitor != NULL;
2416
2417 /* MRF for the first parameter */
2418 int param_base = inst->base_mrf + inst->header_present;
2419
2420 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2421 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2422 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2423 } else {
2424 /* Load the coordinate */
2425 /* FINISHME: gl_clamp_mask and saturate */
2426 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2427 int zero_mask = 0xf & ~coord_mask;
2428
2429 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2430 coordinate));
2431
2432 if (zero_mask != 0) {
2433 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2434 src_reg(0)));
2435 }
2436 /* Load the shadow comparitor */
2437 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2438 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2439 WRITEMASK_X),
2440 shadow_comparitor));
2441 inst->mlen++;
2442 }
2443
2444 /* Load the LOD info */
2445 if (ir->op == ir_tex || ir->op == ir_txl) {
2446 int mrf, writemask;
2447 if (brw->gen >= 5) {
2448 mrf = param_base + 1;
2449 if (ir->shadow_comparitor) {
2450 writemask = WRITEMASK_Y;
2451 /* mlen already incremented */
2452 } else {
2453 writemask = WRITEMASK_X;
2454 inst->mlen++;
2455 }
2456 } else /* brw->gen == 4 */ {
2457 mrf = param_base;
2458 writemask = WRITEMASK_W;
2459 }
2460 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2461 } else if (ir->op == ir_txf) {
2462 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2463 } else if (ir->op == ir_txf_ms) {
2464 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2465 sample_index));
2466             if (brw->gen >= 7) {
2467                /* MCS data is in the first channel of `mcs`, but we need it in the
2468                 * .y channel of the second vec4 of params, so replicate .x across
2469                 * the whole vec4 and then mask off everything except .y. */
2470                mcs.swizzle = BRW_SWIZZLE_XXXX;
2471                emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type,
2472                         WRITEMASK_Y), mcs));
2473             }
2474             inst->mlen++;
2475 } else if (ir->op == ir_txd) {
2476 const glsl_type *type = lod_type;
2477
2478 if (brw->gen >= 5) {
2479 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2480 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2481 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2482 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2483 inst->mlen++;
2484
2485 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2486 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2487 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2488 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2489 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2490 inst->mlen++;
2491
2492 if (ir->shadow_comparitor) {
2493 emit(MOV(dst_reg(MRF, param_base + 2,
2494 ir->shadow_comparitor->type, WRITEMASK_Z),
2495 shadow_comparitor));
2496 }
2497 }
2498 } else /* brw->gen == 4 */ {
2499 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2500 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2501 inst->mlen += 2;
2502 }
2503 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2504 if (ir->shadow_comparitor) {
2505 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2506 shadow_comparitor));
2507 }
2508
2509 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2510 offset_value));
2511 inst->mlen++;
2512 }
2513 }
2514
2515 emit(inst);
2516
2517 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2518 * spec requires layers.
2519 */
2520 if (ir->op == ir_txs) {
2521 glsl_type const *type = ir->sampler->type;
2522 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2523 type->sampler_array) {
2524 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2525 writemask(inst->dst, WRITEMASK_Z),
2526 src_reg(inst->dst), src_reg(6));
2527 }
2528 }
2529
2530 if (brw->gen == 6 && ir->op == ir_tg4) {
2531 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2532 }
2533
2534 swizzle_result(ir, src_reg(inst->dst), sampler);
2535 }
2536
2537 /**
2538 * Apply workarounds for Gen6 gather with UINT/SINT
2539 */
2540 void
2541 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2542 {
2543 if (!wa)
2544 return;
2545
2546 int width = (wa & WA_8BIT) ? 8 : 16;
2547 dst_reg dst_f = dst;
2548 dst_f.type = BRW_REGISTER_TYPE_F;
2549
2550 /* Convert from UNORM to UINT */
2551 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2552 emit(MOV(dst, src_reg(dst_f)));
2553
2554 if (wa & WA_SIGN) {
2555 /* Reinterpret the UINT value as a signed INT value by
2556 * shifting the sign bit into place, then shifting back
2557 * preserving sign.
2558 */
2559 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2560 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2561 }
2562 }
2563
2564 /**
2565 * Set up the gather channel based on the swizzle, for gather4.
2566 */
2567 uint32_t
2568 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2569 {
2570 ir_constant *chan = ir->lod_info.component->as_constant();
2571 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2572 switch (swiz) {
2573 case SWIZZLE_X: return 0;
2574 case SWIZZLE_Y:
2575 /* gather4 sampler is broken for green channel on RG32F --
2576 * we must ask for blue instead.
2577 */
2578 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2579 return 2;
2580 return 1;
2581 case SWIZZLE_Z: return 2;
2582 case SWIZZLE_W: return 3;
2583 default:
2584 assert(!"Not reached"); /* zero, one swizzles handled already */
2585 return 0;
2586 }
2587 }
2588
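/**
 * Apply the texture swizzle from the sampler key to the raw sampler result:
 * ir_query_levels broadcasts the level count from .w, txs/tg4/scalar-float/
 * SWIZZLE_NOOP results are copied as-is, and everything else gets separate
 * MOVs for the copied, zero and one components.
 */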
2589 void
2590 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2591 {
2592 int s = key->tex.swizzles[sampler];
2593
2594 this->result = src_reg(this, ir->type);
2595 dst_reg swizzled_result(this->result);
2596
2597 if (ir->op == ir_query_levels) {
2598 /* # levels is in .w */
2599 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2600 emit(MOV(swizzled_result, orig_val));
2601 return;
2602 }
2603
2604 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2605 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2606 emit(MOV(swizzled_result, orig_val));
2607 return;
2608 }
2609
2610
2611 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2612 int swizzle[4] = {0};
2613
2614 for (int i = 0; i < 4; i++) {
2615 switch (GET_SWZ(s, i)) {
2616 case SWIZZLE_ZERO:
2617 zero_mask |= (1 << i);
2618 break;
2619 case SWIZZLE_ONE:
2620 one_mask |= (1 << i);
2621 break;
2622 default:
2623 copy_mask |= (1 << i);
2624 swizzle[i] = GET_SWZ(s, i);
2625 break;
2626 }
2627 }
2628
2629 if (copy_mask) {
2630 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2631 swizzled_result.writemask = copy_mask;
2632 emit(MOV(swizzled_result, orig_val));
2633 }
2634
2635 if (zero_mask) {
2636 swizzled_result.writemask = zero_mask;
2637 emit(MOV(swizzled_result, src_reg(0.0f)));
2638 }
2639
2640 if (one_mask) {
2641 swizzled_result.writemask = one_mask;
2642 emit(MOV(swizzled_result, src_reg(1.0f)));
2643 }
2644 }
2645
2646 void
2647 vec4_visitor::visit(ir_return *ir)
2648 {
2649 assert(!"not reached");
2650 }
2651
2652 void
2653 vec4_visitor::visit(ir_discard *ir)
2654 {
2655 assert(!"not reached");
2656 }
2657
2658 void
2659 vec4_visitor::visit(ir_if *ir)
2660 {
2661 /* Don't point the annotation at the if statement, because then it plus
2662 * the then and else blocks get printed.
2663 */
2664 this->base_ir = ir->condition;
2665
2666 if (brw->gen == 6) {
2667 emit_if_gen6(ir);
2668 } else {
2669 uint32_t predicate;
2670 emit_bool_to_cond_code(ir->condition, &predicate);
2671 emit(IF(predicate));
2672 }
2673
2674 visit_instructions(&ir->then_instructions);
2675
2676 if (!ir->else_instructions.is_empty()) {
2677 this->base_ir = ir->condition;
2678 emit(BRW_OPCODE_ELSE);
2679
2680 visit_instructions(&ir->else_instructions);
2681 }
2682
2683 this->base_ir = ir->condition;
2684 emit(BRW_OPCODE_ENDIF);
2685 }
2686
2687 void
2688 vec4_visitor::visit(ir_emit_vertex *)
2689 {
2690 assert(!"not reached");
2691 }
2692
2693 void
2694 vec4_visitor::visit(ir_end_primitive *)
2695 {
2696 assert(!"not reached");
2697 }
2698
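/**
 * Emit an untyped atomic message.  The offset and up to two optional operands
 * are staged in the X channel of consecutive MRFs starting at MRF 0, and the
 * atomic opcode and surface index travel as sources of the
 * SHADER_OPCODE_UNTYPED_ATOMIC instruction.
 */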
2699 void
2700 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2701 dst_reg dst, src_reg offset,
2702 src_reg src0, src_reg src1)
2703 {
2704 unsigned mlen = 0;
2705
2706 /* Set the atomic operation offset. */
2707 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2708 mlen++;
2709
2710 /* Set the atomic operation arguments. */
2711 if (src0.file != BAD_FILE) {
2712 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2713 mlen++;
2714 }
2715
2716 if (src1.file != BAD_FILE) {
2717 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2718 mlen++;
2719 }
2720
2721 /* Emit the instruction. Note that this maps to the normal SIMD8
2722 * untyped atomic message on Ivy Bridge, but that's OK because
2723 * unused channels will be masked out.
2724 */
2725 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2726 src_reg(atomic_op), src_reg(surf_index));
2727 inst->base_mrf = 0;
2728 inst->mlen = mlen;
2729 }
2730
2731 void
2732 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2733 src_reg offset)
2734 {
2735 /* Set the surface read offset. */
2736 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2737
2738 /* Emit the instruction. Note that this maps to the normal SIMD8
2739 * untyped surface read message, but that's OK because unused
2740 * channels will be masked out.
2741 */
2742 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2743 dst, src_reg(surf_index));
2744 inst->base_mrf = 0;
2745 inst->mlen = 1;
2746 }
2747
2748 void
2749 vec4_visitor::emit_ndc_computation()
2750 {
2751 /* Get the position */
2752 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2753
2754 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2755 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2756 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2757
2758 current_annotation = "NDC";
2759 dst_reg ndc_w = ndc;
2760 ndc_w.writemask = WRITEMASK_W;
2761 src_reg pos_w = pos;
2762 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2763 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2764
2765 dst_reg ndc_xyz = ndc;
2766 ndc_xyz.writemask = WRITEMASK_XYZ;
2767
2768 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2769 }
2770
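/**
 * Fill the VUE header DWords in @reg.  On Gen4/5 with point size, user
 * clipping or the negative-rhw workaround active, this packs the point width
 * and clip flags into one DWord; otherwise it writes zeros, and on Gen6+ it
 * also moves gl_PointSize, layer and viewport into their dedicated channels.
 */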
2771 void
2772 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2773 {
2774 if (brw->gen < 6 &&
2775 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2776 key->userclip_active || brw->has_negative_rhw_bug)) {
2777 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2778 dst_reg header1_w = header1;
2779 header1_w.writemask = WRITEMASK_W;
2780
2781 emit(MOV(header1, 0u));
2782
2783 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2784 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2785
2786 current_annotation = "Point size";
2787 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2788 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2789 }
2790
2791 if (key->userclip_active) {
2792 current_annotation = "Clipping flags";
2793 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2794 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2795
2796 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2797 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2798 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2799
2800 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2801 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2802 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2803 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2804 }
2805
2806 /* i965 clipping workaround:
2807 * 1) Test for -ve rhw
2808 * 2) If set,
2809 * set ndc = (0,0,0,0)
2810 * set ucp[6] = 1
2811 *
2812 * Later, clipping will detect ucp[6] and ensure the primitive is
2813 * clipped against all fixed planes.
2814 */
2815 if (brw->has_negative_rhw_bug) {
2816 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2817 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2818 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2819 vec4_instruction *inst;
2820 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2821 inst->predicate = BRW_PREDICATE_NORMAL;
2822 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2823 inst->predicate = BRW_PREDICATE_NORMAL;
2824 }
2825
2826 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2827 } else if (brw->gen < 6) {
2828 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2829 } else {
2830 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2831 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2832 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2833 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2834 }
2835 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2836 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2837 src_reg(output_reg[VARYING_SLOT_LAYER])));
2838 }
2839 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2840 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2841 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2842 }
2843 }
2844 }
2845
2846 void
2847 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2848 {
2849 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2850 *
2851 * "If a linked set of shaders forming the vertex stage contains no
2852 * static write to gl_ClipVertex or gl_ClipDistance, but the
2853 * application has requested clipping against user clip planes through
2854 * the API, then the coordinate written to gl_Position is used for
2855 * comparison against the user clip planes."
2856 *
2857 * This function is only called if the shader didn't write to
2858 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2859 * if the user wrote to it; otherwise we use gl_Position.
2860 */
2861 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2862 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2863 clip_vertex = VARYING_SLOT_POS;
2864 }
2865
2866 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2867 ++i) {
2868 reg.writemask = 1 << i;
2869 emit(DP4(reg,
2870 src_reg(output_reg[clip_vertex]),
2871 src_reg(this->userplane[i + offset])));
2872 }
2873 }
2874
2875 void
2876 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2877 {
2878 assert (varying < VARYING_SLOT_MAX);
2879 reg.type = output_reg[varying].type;
2880 current_annotation = output_reg_annotation[varying];
2881 /* Copy the register, saturating if necessary */
2882 vec4_instruction *inst = emit(MOV(reg,
2883 src_reg(output_reg[varying])));
2884 if ((varying == VARYING_SLOT_COL0 ||
2885 varying == VARYING_SLOT_COL1 ||
2886 varying == VARYING_SLOT_BFC0 ||
2887 varying == VARYING_SLOT_BFC1) &&
2888 key->clamp_vertex_color) {
2889 inst->saturate = true;
2890 }
2891 }
2892
2893 void
2894 vec4_visitor::emit_urb_slot(int mrf, int varying)
2895 {
2896 struct brw_reg hw_reg = brw_message_reg(mrf);
2897 dst_reg reg = dst_reg(MRF, mrf);
2898 reg.type = BRW_REGISTER_TYPE_F;
2899
2900 switch (varying) {
2901 case VARYING_SLOT_PSIZ:
2902 /* PSIZ is always in slot 0, and is coupled with other flags. */
2903 current_annotation = "indices, point width, clip flags";
2904 emit_psiz_and_flags(hw_reg);
2905 break;
2906 case BRW_VARYING_SLOT_NDC:
2907 current_annotation = "NDC";
2908 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2909 break;
2910 case VARYING_SLOT_POS:
2911 current_annotation = "gl_Position";
2912 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2913 break;
2914 case VARYING_SLOT_EDGE:
2915 /* This is present when doing unfilled polygons. We're supposed to copy
2916 * the edge flag from the user-provided vertex array
2917 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2918 * of that attribute (starts as 1.0f). This is then used in clipping to
2919 * determine which edges should be drawn as wireframe.
2920 */
2921 current_annotation = "edge flag";
2922 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2923 glsl_type::float_type, WRITEMASK_XYZW))));
2924 break;
2925 case BRW_VARYING_SLOT_PAD:
2926 /* No need to write to this slot */
2927 break;
2928 default:
2929 emit_generic_urb_slot(reg, varying);
2930 break;
2931 }
2932 }
2933
2934 static int
2935 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2936 {
2937 if (brw->gen >= 6) {
2938 /* URB data written (does not include the message header reg) must
2939 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2940 * section 5.4.3.2.2: URB_INTERLEAVED.
2941 *
2942 * URB entries are allocated on a multiple of 1024 bits, so an
2943 * extra 128 bits written here to make the end align to 256 is
2944 * no problem.
2945 */
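      /* mlen counts the message header register too, so the data written is
       * mlen - 1 regs; bumping an even mlen up to odd keeps that count even.
       */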
2946 if ((mlen % 2) != 1)
2947 mlen++;
2948 }
2949
2950 return mlen;
2951 }
2952
2953
2954 /**
2955 * Generates the VUE payload plus the necessary URB write instructions to
2956 * output it.
2957 *
2958 * The VUE layout is documented in Volume 2a.
2959 */
2960 void
2961 vec4_visitor::emit_vertex()
2962 {
2963 /* MRF 0 is reserved for the debugger, so start with message header
2964 * in MRF 1.
2965 */
2966 int base_mrf = 1;
2967 int mrf = base_mrf;
2968 /* In the process of generating our URB write message contents, we
2969 * may need to unspill a register or load from an array. Those
2970 * reads would use MRFs 14-15.
2971 */
2972 int max_usable_mrf = 13;
2973
2974 /* The following assertion verifies that max_usable_mrf causes an
2975 * even-numbered amount of URB write data, which will meet gen6's
2976 * requirements for length alignment.
2977 */
2978 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2979
2980 /* First mrf is the g0-based message header containing URB handles and
2981 * such.
2982 */
2983 emit_urb_write_header(mrf++);
2984
2985 if (brw->gen < 6) {
2986 emit_ndc_computation();
2987 }
2988
2989 /* Lower legacy ff and ClipVertex clipping to clip distances */
2990 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2991 current_annotation = "user clip distances";
2992
2993 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2994 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2995
2996 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2997 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2998 }
2999
3000 /* We may need to split this up into several URB writes, so do them in a
3001 * loop.
3002 */
3003 int slot = 0;
3004 bool complete = false;
3005 do {
3006 /* URB offset is in URB row increments, and each of our MRFs is half of
3007 * one of those, since we're doing interleaved writes.
3008 */
3009 int offset = slot / 2;
3010
3011 mrf = base_mrf + 1;
3012 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3013 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3014
3015 /* If this was max_usable_mrf, we can't fit anything more into this
3016 * URB WRITE.
3017 */
3018 if (mrf > max_usable_mrf) {
3019 slot++;
3020 break;
3021 }
3022 }
3023
3024 complete = slot >= prog_data->vue_map.num_slots;
3025 current_annotation = "URB write";
3026 vec4_instruction *inst = emit_urb_write_opcode(complete);
3027 inst->base_mrf = base_mrf;
3028 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3029 inst->offset += offset;
3030 } while(!complete);
3031 }
3032
3033
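/**
 * Build the offset operand for a scratch read/write of vec4 slot @reg_offset.
 * With a reladdr the offset is computed at run time by instructions emitted
 * before @inst; otherwise an immediate is returned.  Scratch rows hold two
 * interleaved vec4s, and pre-Gen6 the message header wants byte offsets,
 * hence the scaling.
 */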
3034 src_reg
3035 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3036 src_reg *reladdr, int reg_offset)
3037 {
3038 /* Because we store the values to scratch interleaved like our
3039 * vertex data, we need to scale the vec4 index by 2.
3040 */
3041 int message_header_scale = 2;
3042
3043 /* Pre-gen6, the message header uses byte offsets instead of vec4
3044 * (16-byte) offset units.
3045 */
3046 if (brw->gen < 6)
3047 message_header_scale *= 16;
3048
3049 if (reladdr) {
3050 src_reg index = src_reg(this, glsl_type::int_type);
3051
3052 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3053 emit_before(inst, MUL(dst_reg(index),
3054 index, src_reg(message_header_scale)));
3055
3056 return index;
3057 } else {
3058 return src_reg(reg_offset * message_header_scale);
3059 }
3060 }
3061
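/**
 * Like get_scratch_offset(), but for pull constant loads: reladdr offsets are
 * scaled to bytes on pre-Gen6, Gen8+ wants the offset staged in a GRF so the
 * message can be sent from GRF, and everything else uses an immediate.
 */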
3062 src_reg
3063 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3064 src_reg *reladdr, int reg_offset)
3065 {
3066 if (reladdr) {
3067 src_reg index = src_reg(this, glsl_type::int_type);
3068
3069 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3070
3071 /* Pre-gen6, the message header uses byte offsets instead of vec4
3072 * (16-byte) offset units.
3073 */
3074 if (brw->gen < 6) {
3075 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3076 }
3077
3078 return index;
3079 } else if (brw->gen >= 8) {
3080 /* Store the offset in a GRF so we can send-from-GRF. */
3081 src_reg offset = src_reg(this, glsl_type::int_type);
3082 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3083 return offset;
3084 } else {
3085 int message_header_scale = brw->gen < 6 ? 16 : 1;
3086 return src_reg(reg_offset * message_header_scale);
3087 }
3088 }
3089
3090 /**
3091 * Emits an instruction before @inst to load the value named by @orig_src
3092 * from scratch space at @base_offset to @temp.
3093 *
3094 * @base_offset is measured in 32-byte units (the size of a register).
3095 */
3096 void
3097 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3098 dst_reg temp, src_reg orig_src,
3099 int base_offset)
3100 {
3101 int reg_offset = base_offset + orig_src.reg_offset;
3102 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3103
3104 emit_before(inst, SCRATCH_READ(temp, index));
3105 }
3106
3107 /**
3108 * Emits an instruction after @inst to store the value to be written
3109 * to @orig_dst to scratch space at @base_offset, from @temp.
3110 *
3111 * @base_offset is measured in 32-byte units (the size of a register).
3112 */
3113 void
3114 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3115 {
3116 int reg_offset = base_offset + inst->dst.reg_offset;
3117 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3118
3119 /* Create a temporary register to store *inst's result in.
3120 *
3121 * We have to be careful in MOVing from our temporary result register in
3122 * the scratch write. If we swizzle from channels of the temporary that
3123 * weren't initialized, it will confuse live interval analysis, which will
3124 * make spilling fail to make progress.
3125 */
3126 src_reg temp = src_reg(this, glsl_type::vec4_type);
3127 temp.type = inst->dst.type;
3128 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3129 int swizzles[4];
3130 for (int i = 0; i < 4; i++)
3131 if (inst->dst.writemask & (1 << i))
3132 swizzles[i] = i;
3133 else
3134 swizzles[i] = first_writemask_chan;
3135 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3136 swizzles[2], swizzles[3]);
3137
3138 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3139 inst->dst.writemask));
3140 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3141 write->predicate = inst->predicate;
3142 write->ir = inst->ir;
3143 write->annotation = inst->annotation;
3144 inst->insert_after(write);
3145
3146 inst->dst.file = temp.file;
3147 inst->dst.reg = temp.reg;
3148 inst->dst.reg_offset = temp.reg_offset;
3149 inst->dst.reladdr = NULL;
3150 }
3151
3152 /**
3153 * We can't generally support array access in GRF space, because a
3154 * single instruction's destination can only span 2 contiguous
3155 * registers. So, we send all GRF arrays that get variable index
3156 * access to scratch space.
3157 */
3158 void
3159 vec4_visitor::move_grf_array_access_to_scratch()
3160 {
3161 int scratch_loc[this->virtual_grf_count];
3162
3163 for (int i = 0; i < this->virtual_grf_count; i++) {
3164 scratch_loc[i] = -1;
3165 }
3166
3167 /* First, calculate the set of virtual GRFs that need to be punted
3168 * to scratch due to having any array access on them, and where in
3169 * scratch.
3170 */
3171 foreach_list(node, &this->instructions) {
3172 vec4_instruction *inst = (vec4_instruction *)node;
3173
3174 if (inst->dst.file == GRF && inst->dst.reladdr &&
3175 scratch_loc[inst->dst.reg] == -1) {
3176 scratch_loc[inst->dst.reg] = c->last_scratch;
3177 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3178 }
3179
3180 for (int i = 0 ; i < 3; i++) {
3181 src_reg *src = &inst->src[i];
3182
3183 if (src->file == GRF && src->reladdr &&
3184 scratch_loc[src->reg] == -1) {
3185 scratch_loc[src->reg] = c->last_scratch;
3186 c->last_scratch += this->virtual_grf_sizes[src->reg];
3187 }
3188 }
3189 }
3190
3191 /* Now, for anything that will be accessed through scratch, rewrite
3192 * it to load/store. Note that this is a _safe list walk, because
3193 * we may generate a new scratch_write instruction after the one
3194 * we're processing.
3195 */
3196 foreach_list_safe(node, &this->instructions) {
3197 vec4_instruction *inst = (vec4_instruction *)node;
3198
3199 /* Set up the annotation tracking for new generated instructions. */
3200 base_ir = inst->ir;
3201 current_annotation = inst->annotation;
3202
3203 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3204 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3205 }
3206
3207 for (int i = 0 ; i < 3; i++) {
3208 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3209 continue;
3210
3211 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3212
3213 emit_scratch_read(inst, temp, inst->src[i],
3214 scratch_loc[inst->src[i].reg]);
3215
3216 inst->src[i].file = temp.file;
3217 inst->src[i].reg = temp.reg;
3218 inst->src[i].reg_offset = temp.reg_offset;
3219 inst->src[i].reladdr = NULL;
3220 }
3221 }
3222 }
3223
3224 /**
3225 * Emits an instruction before @inst to load the value named by @orig_src
3226 * from the pull constant buffer (surface) at @base_offset to @temp.
3227 */
3228 void
3229 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3230 dst_reg temp, src_reg orig_src,
3231 int base_offset)
3232 {
3233 int reg_offset = base_offset + orig_src.reg_offset;
3234 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3235 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3236 vec4_instruction *load;
3237
3238 if (brw->gen >= 7) {
3239 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3240 grf_offset.type = offset.type;
3241 emit_before(inst, MOV(grf_offset, offset));
3242
3243 load = new(mem_ctx) vec4_instruction(this,
3244 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3245 temp, index, src_reg(grf_offset));
3246 } else {
3247 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3248 temp, index, offset);
3249 load->base_mrf = 14;
3250 load->mlen = 1;
3251 }
3252 emit_before(inst, load);
3253 }
3254
3255 /**
3256 * Implements array access of uniforms by inserting a
3257 * PULL_CONSTANT_LOAD instruction.
3258 *
3259 * Unlike temporary GRF array access (where we don't support it due to
3260 * the difficulty of doing relative addressing on instruction
3261 * destinations), we could potentially do array access of uniforms
3262 * that were loaded in GRF space as push constants. In real-world
3263 * usage we've seen, though, the arrays being used are always larger
3264 * than we could load as push constants, so just always move all
3265 * uniform array access out to a pull constant buffer.
3266 */
3267 void
3268 vec4_visitor::move_uniform_array_access_to_pull_constants()
3269 {
3270 int pull_constant_loc[this->uniforms];
3271
3272 for (int i = 0; i < this->uniforms; i++) {
3273 pull_constant_loc[i] = -1;
3274 }
3275
3276 /* Walk through and find array access of uniforms. Put a copy of that
3277 * uniform in the pull constant buffer.
3278 *
3279 * Note that we don't move constant-indexed accesses to arrays. No
3280 * testing has been done of the performance impact of this choice.
3281 */
3282 foreach_list_safe(node, &this->instructions) {
3283 vec4_instruction *inst = (vec4_instruction *)node;
3284
3285 for (int i = 0 ; i < 3; i++) {
3286 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3287 continue;
3288
3289 int uniform = inst->src[i].reg;
3290
3291 /* If this array isn't already present in the pull constant buffer,
3292 * add it.
3293 */
3294 if (pull_constant_loc[uniform] == -1) {
3295 const float **values = &stage_prog_data->param[uniform * 4];
3296
3297 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3298
3299 assert(uniform < uniform_array_size);
3300 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3301 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3302 = values[j];
3303 }
3304 }
3305
3306 /* Set up the annotation tracking for new generated instructions. */
3307 base_ir = inst->ir;
3308 current_annotation = inst->annotation;
3309
3310 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3311
3312 emit_pull_constant_load(inst, temp, inst->src[i],
3313 pull_constant_loc[uniform]);
3314
3315 inst->src[i].file = temp.file;
3316 inst->src[i].reg = temp.reg;
3317 inst->src[i].reg_offset = temp.reg_offset;
3318 inst->src[i].reladdr = NULL;
3319 }
3320 }
3321
3322 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3323 * no need to track them as larger-than-vec4 objects. This will be
3324 * relied on in cutting out unused uniform vectors from push
3325 * constants.
3326 */
3327 split_uniform_registers();
3328 }
3329
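/**
 * If @reg is a UD-typed source with the negate modifier set, copy it into a
 * fresh uvec4 temporary with a plain MOV and rewrite @reg to use that, so no
 * negated UD operand survives.
 */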
3330 void
3331 vec4_visitor::resolve_ud_negate(src_reg *reg)
3332 {
3333 if (reg->type != BRW_REGISTER_TYPE_UD ||
3334 !reg->negate)
3335 return;
3336
3337 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3338 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3339 *reg = temp;
3340 }
3341
3342 vec4_visitor::vec4_visitor(struct brw_context *brw,
3343 struct brw_vec4_compile *c,
3344 struct gl_program *prog,
3345 const struct brw_vec4_prog_key *key,
3346 struct brw_vec4_prog_data *prog_data,
3347 struct gl_shader_program *shader_prog,
3348 gl_shader_stage stage,
3349 void *mem_ctx,
3350 bool debug_flag,
3351 bool no_spills,
3352 shader_time_shader_type st_base,
3353 shader_time_shader_type st_written,
3354 shader_time_shader_type st_reset)
3355 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3356 c(c),
3357 key(key),
3358 prog_data(prog_data),
3359 sanity_param_count(0),
3360 fail_msg(NULL),
3361 first_non_payload_grf(0),
3362 need_all_constants_in_pull_buffer(false),
3363 debug_flag(debug_flag),
3364 no_spills(no_spills),
3365 st_base(st_base),
3366 st_written(st_written),
3367 st_reset(st_reset)
3368 {
3369 this->mem_ctx = mem_ctx;
3370 this->failed = false;
3371
3372 this->base_ir = NULL;
3373 this->current_annotation = NULL;
3374 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3375
3376 this->variable_ht = hash_table_ctor(0,
3377 hash_table_pointer_hash,
3378 hash_table_pointer_compare);
3379
3380 this->virtual_grf_start = NULL;
3381 this->virtual_grf_end = NULL;
3382 this->virtual_grf_sizes = NULL;
3383 this->virtual_grf_count = 0;
3384 this->virtual_grf_reg_map = NULL;
3385 this->virtual_grf_reg_count = 0;
3386 this->virtual_grf_array_size = 0;
3387 this->live_intervals_valid = false;
3388
3389 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3390
3391 this->uniforms = 0;
3392
3393 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3394 * at least one. See setup_uniforms() in brw_vec4.cpp.
3395 */
3396 this->uniform_array_size = 1;
3397 if (prog_data) {
3398 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3399 }
3400
3401 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3402 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3403 }
3404
3405 vec4_visitor::~vec4_visitor()
3406 {
3407 hash_table_dtor(this->variable_ht);
3408 }
3409
3410
3411 void
3412 vec4_visitor::fail(const char *format, ...)
3413 {
3414 va_list va;
3415 char *msg;
3416
3417 if (failed)
3418 return;
3419
3420 failed = true;
3421
3422 va_start(va, format);
3423 msg = ralloc_vasprintf(mem_ctx, format, va);
3424 va_end(va);
3425 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3426
3427 this->fail_msg = msg;
3428
3429 if (debug_flag) {
3430 fprintf(stderr, "%s", msg);
3431 }
3432 }
3433
3434 } /* namespace brw */