i965/vec4: Generate URB writes using a loop.
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp (mesa.git)
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->ir = v->base_ir;
42 this->annotation = v->current_annotation;
43 }
44
45 vec4_instruction *
46 vec4_visitor::emit(vec4_instruction *inst)
47 {
48 this->instructions.push_tail(inst);
49
50 return inst;
51 }
52
53 vec4_instruction *
54 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
55 {
56 new_inst->ir = inst->ir;
57 new_inst->annotation = inst->annotation;
58
59 inst->insert_before(new_inst);
60
61 return inst;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
66 src_reg src0, src_reg src1, src_reg src2)
67 {
68 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
69 src0, src1, src2));
70 }
71
72
73 vec4_instruction *
74 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
75 {
76 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
83 }
84
85 vec4_instruction *
86 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
87 {
88 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
89 }
90
91 vec4_instruction *
92 vec4_visitor::emit(enum opcode opcode)
93 {
94 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
95 }
96
97 #define ALU1(op) \
98 vec4_instruction * \
99 vec4_visitor::op(dst_reg dst, src_reg src0) \
100 { \
101 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
102 src0); \
103 }
104
105 #define ALU2(op) \
106 vec4_instruction * \
107 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
108 { \
109 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
110 src0, src1); \
111 }
112
113 #define ALU3(op) \
114 vec4_instruction * \
115 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0, src1, src2); \
119 }
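/* For illustration, ALU2(ADD) expands to a small forwarding helper roughly
 * equivalent to:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(dst_reg dst, src_reg src0, src_reg src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * Note that these helpers only construct the instruction; callers still
 * wrap them in emit(), e.g. emit(ADD(dst, src0, src1)).
 */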
120
121 ALU1(NOT)
122 ALU1(MOV)
123 ALU1(FRC)
124 ALU1(RNDD)
125 ALU1(RNDE)
126 ALU1(RNDZ)
127 ALU1(F32TO16)
128 ALU1(F16TO32)
129 ALU2(ADD)
130 ALU2(MUL)
131 ALU2(MACH)
132 ALU2(AND)
133 ALU2(OR)
134 ALU2(XOR)
135 ALU2(DP3)
136 ALU2(DP4)
137 ALU2(DPH)
138 ALU2(SHL)
139 ALU2(SHR)
140 ALU2(ASR)
141 ALU3(LRP)
142 ALU1(BFREV)
143 ALU3(BFE)
144 ALU2(BFI1)
145 ALU3(BFI2)
146 ALU1(FBH)
147 ALU1(FBL)
148 ALU1(CBIT)
149 ALU3(MAD)
150
151 /** Gen4 predicated IF. */
152 vec4_instruction *
153 vec4_visitor::IF(uint32_t predicate)
154 {
155 vec4_instruction *inst;
156
157 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
158 inst->predicate = predicate;
159
160 return inst;
161 }
162
163 /** Gen6+ IF with embedded comparison. */
164 vec4_instruction *
165 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
166 {
167 assert(brw->gen >= 6);
168
169 vec4_instruction *inst;
170
171 resolve_ud_negate(&src0);
172 resolve_ud_negate(&src1);
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
175 src0, src1);
176 inst->conditional_mod = condition;
177
178 return inst;
179 }
180
181 /**
182 * CMP: Sets the low bit of the destination channels with the result
183 * of the comparison, while the upper bits are undefined, and updates
184 * the flag register with the packed 16 bits of the result.
185 */
186 vec4_instruction *
187 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
188 {
189 vec4_instruction *inst;
190
191 /* original gen4 does type conversion to the destination type
192 * before the comparison, producing garbage results for floating
193 * point comparisons.
194 */
195 if (brw->gen == 4) {
196 dst.type = src0.type;
197 if (dst.file == HW_REG)
198 dst.fixed_hw_reg.type = dst.type;
199 }
200
201 resolve_ud_negate(&src0);
202 resolve_ud_negate(&src1);
203
204 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
205 inst->conditional_mod = condition;
206
207 return inst;
208 }
209
210 vec4_instruction *
211 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
212 {
213 vec4_instruction *inst;
214
215 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
216 dst, index);
217 inst->base_mrf = 14;
218 inst->mlen = 2;
219
220 return inst;
221 }
222
223 vec4_instruction *
224 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
225 {
226 vec4_instruction *inst;
227
228 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
229 dst, src, index);
230 inst->base_mrf = 13;
231 inst->mlen = 3;
232
233 return inst;
234 }
235
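/* Emit a dot product of 2, 3, or 4 elements by selecting DP2, DP3, or DP4. */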
236 void
237 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
238 {
239 static enum opcode dot_opcodes[] = {
240 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
241 };
242
243 emit(dot_opcodes[elements - 2], dst, src0, src1);
244 }
245
246 src_reg
247 vec4_visitor::fix_3src_operand(src_reg src)
248 {
249 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
250 * able to use a vertical stride of zero to replicate the vec4 uniform, like
251 *
252 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
253 *
254 * But you can't, since vertical stride is always four in three-source
255 * instructions. Instead, insert a MOV instruction to do the replication so
256 * that the three-source instruction can consume it.
257 */
258
259 /* The MOV is only needed if the source is a uniform or immediate. */
260 if (src.file != UNIFORM && src.file != IMM)
261 return src;
262
263 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
264 expanded.type = src.type;
265 emit(MOV(expanded, src));
266 return src_reg(expanded);
267 }
268
269 src_reg
270 vec4_visitor::fix_math_operand(src_reg src)
271 {
272 /* The gen6 math instruction ignores the source modifiers --
273 * swizzle, abs, negate, and at least some parts of the register
274 * region description.
275 *
276 * Rather than trying to enumerate all these cases, *always* expand the
277 * operand to a temp GRF for gen6.
278 *
279 * For gen7, keep the operand as-is, except if immediate, which gen7 still
280 * can't use.
281 */
282
283 if (brw->gen == 7 && src.file != IMM)
284 return src;
285
286 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
287 expanded.type = src.type;
288 emit(MOV(expanded, src));
289 return src_reg(expanded);
290 }
291
292 void
293 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
294 {
295 src = fix_math_operand(src);
296
297 if (dst.writemask != WRITEMASK_XYZW) {
298 /* The gen6 math instruction must be align1, so we can't do
299 * writemasks.
300 */
301 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
302
303 emit(opcode, temp_dst, src);
304
305 emit(MOV(dst, src_reg(temp_dst)));
306 } else {
307 emit(opcode, dst, src);
308 }
309 }
310
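/* On gen4/5 the math instruction is a send to the shared math unit, so
 * record the message register setup for the generator.
 */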
311 void
312 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
313 {
314 vec4_instruction *inst = emit(opcode, dst, src);
315 inst->base_mrf = 1;
316 inst->mlen = 1;
317 }
318
319 void
320 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
321 {
322 switch (opcode) {
323 case SHADER_OPCODE_RCP:
324 case SHADER_OPCODE_RSQ:
325 case SHADER_OPCODE_SQRT:
326 case SHADER_OPCODE_EXP2:
327 case SHADER_OPCODE_LOG2:
328 case SHADER_OPCODE_SIN:
329 case SHADER_OPCODE_COS:
330 break;
331 default:
332 assert(!"not reached: bad math opcode");
333 return;
334 }
335
336 if (brw->gen >= 6) {
337 return emit_math1_gen6(opcode, dst, src);
338 } else {
339 return emit_math1_gen4(opcode, dst, src);
340 }
341 }
342
343 void
344 vec4_visitor::emit_math2_gen6(enum opcode opcode,
345 dst_reg dst, src_reg src0, src_reg src1)
346 {
347 src0 = fix_math_operand(src0);
348 src1 = fix_math_operand(src1);
349
350 if (dst.writemask != WRITEMASK_XYZW) {
351 /* The gen6 math instruction must be align1, so we can't do
352 * writemasks.
353 */
354 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
355 temp_dst.type = dst.type;
356
357 emit(opcode, temp_dst, src0, src1);
358
359 emit(MOV(dst, src_reg(temp_dst)));
360 } else {
361 emit(opcode, dst, src0, src1);
362 }
363 }
364
365 void
366 vec4_visitor::emit_math2_gen4(enum opcode opcode,
367 dst_reg dst, src_reg src0, src_reg src1)
368 {
369 vec4_instruction *inst = emit(opcode, dst, src0, src1);
370 inst->base_mrf = 1;
371 inst->mlen = 2;
372 }
373
374 void
375 vec4_visitor::emit_math(enum opcode opcode,
376 dst_reg dst, src_reg src0, src_reg src1)
377 {
378 switch (opcode) {
379 case SHADER_OPCODE_POW:
380 case SHADER_OPCODE_INT_QUOTIENT:
381 case SHADER_OPCODE_INT_REMAINDER:
382 break;
383 default:
384 assert(!"not reached: unsupported binary math opcode");
385 return;
386 }
387
388 if (brw->gen >= 6) {
389 return emit_math2_gen6(opcode, dst, src0, src1);
390 } else {
391 return emit_math2_gen4(opcode, dst, src0, src1);
392 }
393 }
394
395 void
396 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
397 {
398 if (brw->gen < 7)
399 assert(!"ir_unop_pack_half_2x16 should be lowered");
400
401 assert(dst.type == BRW_REGISTER_TYPE_UD);
402 assert(src0.type == BRW_REGISTER_TYPE_F);
403
404 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
405 *
406 * Because this instruction does not have a 16-bit floating-point type,
407 * the destination data type must be Word (W).
408 *
409 * The destination must be DWord-aligned and specify a horizontal stride
410 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
411 * each destination channel and the upper word is not modified.
412 *
413 * The above restriction implies that the f32to16 instruction must use
414 * align1 mode, because only in align1 mode is it possible to specify
415 * horizontal stride. We choose here to defy the hardware docs and emit
416 * align16 instructions.
417 *
418 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
419 * instructions. I was partially successful in that the code passed all
420 * tests. However, the code was dubiously correct and fragile, and the
421 * tests were not harsh enough to probe that frailty. Not trusting the
422 * code, I chose instead to remain in align16 mode in defiance of the hw
423 * docs).
424 *
425 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
426 * simulator, emitting a f32to16 in align16 mode with UD as destination
427 * data type is safe. The behavior differs from that specified in the PRM
428 * in that the upper word of each destination channel is cleared to 0.
429 */
430
431 dst_reg tmp_dst(this, glsl_type::uvec2_type);
432 src_reg tmp_src(tmp_dst);
433
434 #if 0
435 /* Verify the undocumented behavior on which the following instructions
436 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
437 * then the result of the bit-or instruction below will be incorrect.
438 *
439 * You should inspect the disasm output in order to verify that the MOV is
440 * not optimized away.
441 */
442 emit(MOV(tmp_dst, src_reg(0x12345678u)));
443 #endif
444
445 /* Give tmp the form below, where "." means untouched.
446 *
447 * w z y x w z y x
448 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
449 *
450 * That the upper word of each write-channel be 0 is required for the
451 * following bit-shift and bit-or instructions to work. Note that this
452 * relies on the undocumented hardware behavior mentioned above.
453 */
454 tmp_dst.writemask = WRITEMASK_XY;
455 emit(F32TO16(tmp_dst, src0));
456
457 /* Give the write-channels of dst the form:
458 * 0xhhhh0000
459 */
460 tmp_src.swizzle = SWIZZLE_Y;
461 emit(SHL(dst, tmp_src, src_reg(16u)));
462
463 /* Finally, give the write-channels of dst the form of packHalf2x16's
464 * output:
465 * 0xhhhhllll
466 */
467 tmp_src.swizzle = SWIZZLE_X;
468 emit(OR(dst, src_reg(dst), tmp_src));
469 }
470
471 void
472 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
473 {
474 if (brw->gen < 7)
475 assert(!"ir_unop_unpack_half_2x16 should be lowered");
476
477 assert(dst.type == BRW_REGISTER_TYPE_F);
478 assert(src0.type == BRW_REGISTER_TYPE_UD);
479
480 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
481 *
482 * Because this instruction does not have a 16-bit floating-point type,
483 * the source data type must be Word (W). The destination type must be
484 * F (Float).
485 *
486 * To use W as the source data type, we must adjust horizontal strides,
487 * which is only possible in align1 mode. All my [chadv] attempts at
488 * emitting align1 instructions for unpackHalf2x16 failed to pass the
489 * Piglit tests, so I gave up.
490 *
491 * I've verified that, on gen7 hardware and the simulator, it is safe to
492 * emit f16to32 in align16 mode with UD as source data type.
493 */
494
495 dst_reg tmp_dst(this, glsl_type::uvec2_type);
496 src_reg tmp_src(tmp_dst);
497
498 tmp_dst.writemask = WRITEMASK_X;
499 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
500
501 tmp_dst.writemask = WRITEMASK_Y;
502 emit(SHR(tmp_dst, src0, src_reg(16u)));
503
504 dst.writemask = WRITEMASK_XY;
505 emit(F16TO32(dst, tmp_src));
506 }
507
508 void
509 vec4_visitor::visit_instructions(const exec_list *list)
510 {
511 foreach_list(node, list) {
512 ir_instruction *ir = (ir_instruction *)node;
513
514 base_ir = ir;
515 ir->accept(this);
516 }
517 }
518
519
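/* Return how many vec4 slots the given GLSL type occupies in this backend. */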
520 static int
521 type_size(const struct glsl_type *type)
522 {
523 unsigned int i;
524 int size;
525
526 switch (type->base_type) {
527 case GLSL_TYPE_UINT:
528 case GLSL_TYPE_INT:
529 case GLSL_TYPE_FLOAT:
530 case GLSL_TYPE_BOOL:
531 if (type->is_matrix()) {
532 return type->matrix_columns;
533 } else {
534 /* Regardless of the size of the vector, it gets a vec4. This is bad
535 * packing for things like floats, but otherwise arrays become a
536 * mess. Hopefully a later pass over the code can pack scalars
537 * down if appropriate.
538 */
539 return 1;
540 }
541 case GLSL_TYPE_ARRAY:
542 assert(type->length > 0);
543 return type_size(type->fields.array) * type->length;
544 case GLSL_TYPE_STRUCT:
545 size = 0;
546 for (i = 0; i < type->length; i++) {
547 size += type_size(type->fields.structure[i].type);
548 }
549 return size;
550 case GLSL_TYPE_SAMPLER:
551 /* Samplers take up one slot in UNIFORMS[], but they're baked in
552 * at link time.
553 */
554 return 1;
555 case GLSL_TYPE_VOID:
556 case GLSL_TYPE_ERROR:
557 case GLSL_TYPE_INTERFACE:
558 assert(0);
559 break;
560 }
561
562 return 0;
563 }
564
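/* Allocate a new virtual GRF of the given size (in vec4 slots), growing the
 * bookkeeping arrays if necessary, and return its index.
 */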
565 int
566 vec4_visitor::virtual_grf_alloc(int size)
567 {
568 if (virtual_grf_array_size <= virtual_grf_count) {
569 if (virtual_grf_array_size == 0)
570 virtual_grf_array_size = 16;
571 else
572 virtual_grf_array_size *= 2;
573 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
574 virtual_grf_array_size);
575 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
576 virtual_grf_array_size);
577 }
578 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
579 virtual_grf_reg_count += size;
580 virtual_grf_sizes[virtual_grf_count] = size;
581 return virtual_grf_count++;
582 }
583
584 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
585 {
586 init();
587
588 this->file = GRF;
589 this->reg = v->virtual_grf_alloc(type_size(type));
590
591 if (type->is_array() || type->is_record()) {
592 this->swizzle = BRW_SWIZZLE_NOOP;
593 } else {
594 this->swizzle = swizzle_for_size(type->vector_elements);
595 }
596
597 this->type = brw_type_for_base_type(type);
598 }
599
600 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
601 {
602 init();
603
604 this->file = GRF;
605 this->reg = v->virtual_grf_alloc(type_size(type));
606
607 if (type->is_array() || type->is_record()) {
608 this->writemask = WRITEMASK_XYZW;
609 } else {
610 this->writemask = (1 << type->vector_elements) - 1;
611 }
612
613 this->type = brw_type_for_base_type(type);
614 }
615
616 /* Our support for uniforms is piggy-backed on the struct
617 * gl_fragment_program, because that's where the values actually
618 * get stored, rather than in some global gl_shader_program uniform
619 * store.
620 */
621 void
622 vec4_visitor::setup_uniform_values(ir_variable *ir)
623 {
624 int namelen = strlen(ir->name);
625
626 /* The data for our (non-builtin) uniforms is stored in a series of
627 * gl_uniform_driver_storage structs for each subcomponent that
628 * glGetUniformLocation() could name. We know it's been set up in the same
629 * order we'd walk the type, so walk the list of storage and find anything
630 * with our name, or the prefix of a component that starts with our name.
631 */
632 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
633 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
634
635 if (strncmp(ir->name, storage->name, namelen) != 0 ||
636 (storage->name[namelen] != 0 &&
637 storage->name[namelen] != '.' &&
638 storage->name[namelen] != '[')) {
639 continue;
640 }
641
642 gl_constant_value *components = storage->storage;
643 unsigned vector_count = (MAX2(storage->array_elements, 1) *
644 storage->type->matrix_columns);
645
646 for (unsigned s = 0; s < vector_count; s++) {
647 uniform_vector_size[uniforms] = storage->type->vector_elements;
648
649 int i;
650 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
651 prog_data->param[uniforms * 4 + i] = &components->f;
652 components++;
653 }
654 for (; i < 4; i++) {
655 static float zero = 0;
656 prog_data->param[uniforms * 4 + i] = &zero;
657 }
658
659 uniforms++;
660 }
661 }
662 }
663
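/* Set up one vec4 uniform per user clip plane enabled in the compile key,
 * pointing the push constant params at the current clip plane values.
 */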
664 void
665 vec4_visitor::setup_uniform_clipplane_values()
666 {
667 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
668
669 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
670 this->uniform_vector_size[this->uniforms] = 4;
671 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
672 this->userplane[i].type = BRW_REGISTER_TYPE_F;
673 for (int j = 0; j < 4; ++j) {
674 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
675 }
676 ++this->uniforms;
677 }
678 }
679
680 /* Our support for builtin uniforms is even scarier than non-builtin.
681 * It sits on top of the PROG_STATE_VAR parameters that are
682 * automatically updated from GL context state.
683 */
684 void
685 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
686 {
687 const ir_state_slot *const slots = ir->state_slots;
688 assert(ir->state_slots != NULL);
689
690 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
691 /* This state reference has already been setup by ir_to_mesa,
692 * but we'll get the same index back here. We can reference
693 * ParameterValues directly, since unlike brw_fs.cpp, we never
694 * add new state references during compile.
695 */
696 int index = _mesa_add_state_reference(this->prog->Parameters,
697 (gl_state_index *)slots[i].tokens);
698 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
699
700 this->uniform_vector_size[this->uniforms] = 0;
701 /* Add each of the unique swizzled channels of the element.
702 * This will end up matching the size of the glsl_type of this field.
703 */
704 int last_swiz = -1;
705 for (unsigned int j = 0; j < 4; j++) {
706 int swiz = GET_SWZ(slots[i].swizzle, j);
707 last_swiz = swiz;
708
709 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
710 if (swiz <= last_swiz)
711 this->uniform_vector_size[this->uniforms]++;
712 }
713 this->uniforms++;
714 }
715 }
716
717 dst_reg *
718 vec4_visitor::variable_storage(ir_variable *var)
719 {
720 return (dst_reg *)hash_table_find(this->variable_ht, var);
721 }
722
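/* Evaluate a boolean rvalue and update the flag register so that following
 * instructions can be predicated on it. *predicate is set to the predicate
 * mode the caller should use (BRW_PREDICATE_NORMAL, or one of the ALIGN16
 * ALL4H/ANY4H modes for vector comparisons).
 */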
723 void
724 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
725 {
726 ir_expression *expr = ir->as_expression();
727
728 *predicate = BRW_PREDICATE_NORMAL;
729
730 if (expr) {
731 src_reg op[2];
732 vec4_instruction *inst;
733
734 assert(expr->get_num_operands() <= 2);
735 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
736 expr->operands[i]->accept(this);
737 op[i] = this->result;
738
739 resolve_ud_negate(&op[i]);
740 }
741
742 switch (expr->operation) {
743 case ir_unop_logic_not:
744 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
745 inst->conditional_mod = BRW_CONDITIONAL_Z;
746 break;
747
748 case ir_binop_logic_xor:
749 inst = emit(XOR(dst_null_d(), op[0], op[1]));
750 inst->conditional_mod = BRW_CONDITIONAL_NZ;
751 break;
752
753 case ir_binop_logic_or:
754 inst = emit(OR(dst_null_d(), op[0], op[1]));
755 inst->conditional_mod = BRW_CONDITIONAL_NZ;
756 break;
757
758 case ir_binop_logic_and:
759 inst = emit(AND(dst_null_d(), op[0], op[1]));
760 inst->conditional_mod = BRW_CONDITIONAL_NZ;
761 break;
762
763 case ir_unop_f2b:
764 if (brw->gen >= 6) {
765 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
766 } else {
767 inst = emit(MOV(dst_null_f(), op[0]));
768 inst->conditional_mod = BRW_CONDITIONAL_NZ;
769 }
770 break;
771
772 case ir_unop_i2b:
773 if (brw->gen >= 6) {
774 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
775 } else {
776 inst = emit(MOV(dst_null_d(), op[0]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 }
779 break;
780
781 case ir_binop_all_equal:
782 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
783 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
784 break;
785
786 case ir_binop_any_nequal:
787 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
788 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
789 break;
790
791 case ir_unop_any:
792 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
793 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
794 break;
795
796 case ir_binop_greater:
797 case ir_binop_gequal:
798 case ir_binop_less:
799 case ir_binop_lequal:
800 case ir_binop_equal:
801 case ir_binop_nequal:
802 emit(CMP(dst_null_d(), op[0], op[1],
803 brw_conditional_for_comparison(expr->operation)));
804 break;
805
806 default:
807 assert(!"not reached");
808 break;
809 }
810 return;
811 }
812
813 ir->accept(this);
814
815 resolve_ud_negate(&this->result);
816
817 if (brw->gen >= 6) {
818 vec4_instruction *inst = emit(AND(dst_null_d(),
819 this->result, src_reg(1)));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 } else {
822 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 }
825 }
826
827 /**
828 * Emit a gen6 IF statement with the comparison folded into the IF
829 * instruction.
830 */
831 void
832 vec4_visitor::emit_if_gen6(ir_if *ir)
833 {
834 ir_expression *expr = ir->condition->as_expression();
835
836 if (expr) {
837 src_reg op[2];
838 dst_reg temp;
839
840 assert(expr->get_num_operands() <= 2);
841 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
842 expr->operands[i]->accept(this);
843 op[i] = this->result;
844 }
845
846 switch (expr->operation) {
847 case ir_unop_logic_not:
848 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
849 return;
850
851 case ir_binop_logic_xor:
852 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
853 return;
854
855 case ir_binop_logic_or:
856 temp = dst_reg(this, glsl_type::bool_type);
857 emit(OR(temp, op[0], op[1]));
858 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
859 return;
860
861 case ir_binop_logic_and:
862 temp = dst_reg(this, glsl_type::bool_type);
863 emit(AND(temp, op[0], op[1]));
864 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
865 return;
866
867 case ir_unop_f2b:
868 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
869 return;
870
871 case ir_unop_i2b:
872 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
873 return;
874
875 case ir_binop_greater:
876 case ir_binop_gequal:
877 case ir_binop_less:
878 case ir_binop_lequal:
879 case ir_binop_equal:
880 case ir_binop_nequal:
881 emit(IF(op[0], op[1],
882 brw_conditional_for_comparison(expr->operation)));
883 return;
884
885 case ir_binop_all_equal:
886 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
887 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
888 return;
889
890 case ir_binop_any_nequal:
891 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
892 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
893 return;
894
895 case ir_unop_any:
896 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
897 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
898 return;
899
900 default:
901 assert(!"not reached");
902 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
903 return;
904 }
905 return;
906 }
907
908 ir->condition->accept(this);
909
910 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
911 }
912
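/* Return a copy of the given register with its writemask replaced. */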
913 dst_reg
914 with_writemask(dst_reg const & r, int mask)
915 {
916 dst_reg result = r;
917 result.writemask = mask;
918 return result;
919 }
920
921
922 void
923 vec4_visitor::visit(ir_variable *ir)
924 {
925 dst_reg *reg = NULL;
926
927 if (variable_storage(ir))
928 return;
929
930 switch (ir->mode) {
931 case ir_var_shader_in:
932 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
933 break;
934
935 case ir_var_shader_out:
936 reg = new(mem_ctx) dst_reg(this, ir->type);
937
938 for (int i = 0; i < type_size(ir->type); i++) {
939 output_reg[ir->location + i] = *reg;
940 output_reg[ir->location + i].reg_offset = i;
941 output_reg[ir->location + i].type =
942 brw_type_for_base_type(ir->type->get_scalar_type());
943 output_reg_annotation[ir->location + i] = ir->name;
944 }
945 break;
946
947 case ir_var_auto:
948 case ir_var_temporary:
949 reg = new(mem_ctx) dst_reg(this, ir->type);
950 break;
951
952 case ir_var_uniform:
953 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
954
955 /* Thanks to the lower_ubo_reference pass, we will see only
956 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
957 * variables, so no need for them to be in variable_ht.
958 */
959 if (ir->is_in_uniform_block())
960 return;
961
962 /* Track how big the whole uniform variable is, in case we need to put a
963 * copy of its data into pull constants for array access.
964 */
965 this->uniform_size[this->uniforms] = type_size(ir->type);
966
967 if (!strncmp(ir->name, "gl_", 3)) {
968 setup_builtin_uniform_values(ir);
969 } else {
970 setup_uniform_values(ir);
971 }
972 break;
973
974 case ir_var_system_value:
975 reg = make_reg_for_system_value(ir);
976 break;
977
978 default:
979 assert(!"not reached");
980 }
981
982 reg->type = brw_type_for_base_type(ir->type);
983 hash_table_insert(this->variable_ht, reg, ir);
984 }
985
986 void
987 vec4_visitor::visit(ir_loop *ir)
988 {
989 dst_reg counter;
990
991 /* We don't want debugging output to print the whole body of the
992 * loop as the annotation.
993 */
994 this->base_ir = NULL;
995
996 if (ir->counter != NULL) {
997 this->base_ir = ir->counter;
998 ir->counter->accept(this);
999 counter = *(variable_storage(ir->counter));
1000
1001 if (ir->from != NULL) {
1002 this->base_ir = ir->from;
1003 ir->from->accept(this);
1004
1005 emit(MOV(counter, this->result));
1006 }
1007 }
1008
1009 emit(BRW_OPCODE_DO);
1010
1011 if (ir->to) {
1012 this->base_ir = ir->to;
1013 ir->to->accept(this);
1014
1015 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1016 brw_conditional_for_comparison(ir->cmp)));
1017
1018 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1019 inst->predicate = BRW_PREDICATE_NORMAL;
1020 }
1021
1022 visit_instructions(&ir->body_instructions);
1023
1024
1025 if (ir->increment) {
1026 this->base_ir = ir->increment;
1027 ir->increment->accept(this);
1028 emit(ADD(counter, src_reg(counter), this->result));
1029 }
1030
1031 emit(BRW_OPCODE_WHILE);
1032 }
1033
1034 void
1035 vec4_visitor::visit(ir_loop_jump *ir)
1036 {
1037 switch (ir->mode) {
1038 case ir_loop_jump::jump_break:
1039 emit(BRW_OPCODE_BREAK);
1040 break;
1041 case ir_loop_jump::jump_continue:
1042 emit(BRW_OPCODE_CONTINUE);
1043 break;
1044 }
1045 }
1046
1047
1048 void
1049 vec4_visitor::visit(ir_function_signature *ir)
1050 {
1051 assert(0);
1052 (void)ir;
1053 }
1054
1055 void
1056 vec4_visitor::visit(ir_function *ir)
1057 {
1058 /* Ignore function bodies other than main() -- we shouldn't see calls to
1059 * them since they should all be inlined.
1060 */
1061 if (strcmp(ir->name, "main") == 0) {
1062 const ir_function_signature *sig;
1063 exec_list empty;
1064
1065 sig = ir->matching_signature(NULL, &empty);
1066
1067 assert(sig);
1068
1069 visit_instructions(&sig->body);
1070 }
1071 }
1072
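/* If this expression is just a saturate of some rvalue, emit a saturating
 * MOV of that value into a fresh temporary and return true; otherwise
 * return false so the normal expression path handles it.
 */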
1073 bool
1074 vec4_visitor::try_emit_sat(ir_expression *ir)
1075 {
1076 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1077 if (!sat_src)
1078 return false;
1079
1080 sat_src->accept(this);
1081 src_reg src = this->result;
1082
1083 this->result = src_reg(this, ir->type);
1084 vec4_instruction *inst;
1085 inst = emit(MOV(dst_reg(this->result), src));
1086 inst->saturate = true;
1087
1088 return true;
1089 }
1090
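/* Try to emit an add-of-a-multiply as a single MAD. mul_arg indicates which
 * operand of the add is expected to be the multiply. Returns false when the
 * hardware or types rule it out (pre-gen6, or non-float data).
 */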
1091 bool
1092 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1093 {
1094 /* 3-src instructions were introduced in gen6. */
1095 if (brw->gen < 6)
1096 return false;
1097
1098 /* MAD can only handle floating-point data. */
1099 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1100 return false;
1101
1102 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1103 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1104
1105 if (!mul || mul->operation != ir_binop_mul)
1106 return false;
1107
1108 nonmul->accept(this);
1109 src_reg src0 = fix_3src_operand(this->result);
1110
1111 mul->operands[0]->accept(this);
1112 src_reg src1 = fix_3src_operand(this->result);
1113
1114 mul->operands[1]->accept(this);
1115 src_reg src2 = fix_3src_operand(this->result);
1116
1117 this->result = src_reg(this, ir->type);
1118 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1119
1120 return true;
1121 }
1122
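/* Emit a comparison that leaves a 0/1 boolean in dst: a CMP followed by an
 * AND with 1 to squash the all-ones comparison result down to bit 0.
 */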
1123 void
1124 vec4_visitor::emit_bool_comparison(unsigned int op,
1125 dst_reg dst, src_reg src0, src_reg src1)
1126 {
1127 /* original gen4 does destination conversion before comparison. */
1128 if (brw->gen < 5)
1129 dst.type = src0.type;
1130
1131 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1132
1133 dst.type = BRW_REGISTER_TYPE_D;
1134 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1135 }
1136
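/* Emit a min or max: a single SEL with a conditional modifier on gen6+, or
 * a CMP followed by a predicated SEL on older hardware.
 */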
1137 void
1138 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1139 src_reg src0, src_reg src1)
1140 {
1141 vec4_instruction *inst;
1142
1143 if (brw->gen >= 6) {
1144 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1145 inst->conditional_mod = conditionalmod;
1146 } else {
1147 emit(CMP(dst, src0, src1, conditionalmod));
1148
1149 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1150 inst->predicate = BRW_PREDICATE_NORMAL;
1151 }
1152 }
1153
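/* Return true if the rvalue is an integer constant that fits in 16 bits,
 * which lets an integer multiply skip the MUL/MACH sequence.
 */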
1154 static bool
1155 is_16bit_constant(ir_rvalue *rvalue)
1156 {
1157 ir_constant *constant = rvalue->as_constant();
1158 if (!constant)
1159 return false;
1160
1161 if (constant->type != glsl_type::int_type &&
1162 constant->type != glsl_type::uint_type)
1163 return false;
1164
1165 return constant->value.u[0] < (1 << 16);
1166 }
1167
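/* Visit an expression: evaluate the operands into registers, then emit the
 * vec4 instruction sequence for the operation into a new temporary that
 * becomes this->result.
 */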
1168 void
1169 vec4_visitor::visit(ir_expression *ir)
1170 {
1171 unsigned int operand;
1172 src_reg op[Elements(ir->operands)];
1173 src_reg result_src;
1174 dst_reg result_dst;
1175 vec4_instruction *inst;
1176
1177 if (try_emit_sat(ir))
1178 return;
1179
1180 if (ir->operation == ir_binop_add) {
1181 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1182 return;
1183 }
1184
1185 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1186 this->result.file = BAD_FILE;
1187 ir->operands[operand]->accept(this);
1188 if (this->result.file == BAD_FILE) {
1189 printf("Failed to get tree for expression operand:\n");
1190 ir->operands[operand]->print();
1191 exit(1);
1192 }
1193 op[operand] = this->result;
1194
1195 /* Matrix expression operands should have been broken down to vector
1196 * operations already.
1197 */
1198 assert(!ir->operands[operand]->type->is_matrix());
1199 }
1200
1201 int vector_elements = ir->operands[0]->type->vector_elements;
1202 if (ir->operands[1]) {
1203 vector_elements = MAX2(vector_elements,
1204 ir->operands[1]->type->vector_elements);
1205 }
1206
1207 this->result.file = BAD_FILE;
1208
1209 /* Storage for our result. Ideally for an assignment we'd be using
1210 * the actual storage for the result here, instead.
1211 */
1212 result_src = src_reg(this, ir->type);
1213 /* convenience for the emit functions below. */
1214 result_dst = dst_reg(result_src);
1215 /* If nothing special happens, this is the result. */
1216 this->result = result_src;
1217 /* Limit writes to the channels that will be used by result_src later.
1218 * This does limit this temp's use as a temporary for multi-instruction
1219 * sequences.
1220 */
1221 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1222
1223 switch (ir->operation) {
1224 case ir_unop_logic_not:
1225 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1226 * ones complement of the whole register, not just bit 0.
1227 */
1228 emit(XOR(result_dst, op[0], src_reg(1)));
1229 break;
1230 case ir_unop_neg:
1231 op[0].negate = !op[0].negate;
1232 emit(MOV(result_dst, op[0]));
1233 break;
1234 case ir_unop_abs:
1235 op[0].abs = true;
1236 op[0].negate = false;
1237 emit(MOV(result_dst, op[0]));
1238 break;
1239
1240 case ir_unop_sign:
1241 emit(MOV(result_dst, src_reg(0.0f)));
1242
1243 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1244 inst = emit(MOV(result_dst, src_reg(1.0f)));
1245 inst->predicate = BRW_PREDICATE_NORMAL;
1246
1247 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1248 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1249 inst->predicate = BRW_PREDICATE_NORMAL;
1250
1251 break;
1252
1253 case ir_unop_rcp:
1254 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1255 break;
1256
1257 case ir_unop_exp2:
1258 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1259 break;
1260 case ir_unop_log2:
1261 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1262 break;
1263 case ir_unop_exp:
1264 case ir_unop_log:
1265 assert(!"not reached: should be handled by ir_explog_to_explog2");
1266 break;
1267 case ir_unop_sin:
1268 case ir_unop_sin_reduced:
1269 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1270 break;
1271 case ir_unop_cos:
1272 case ir_unop_cos_reduced:
1273 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1274 break;
1275
1276 case ir_unop_dFdx:
1277 case ir_unop_dFdy:
1278 assert(!"derivatives not valid in vertex shader");
1279 break;
1280
1281 case ir_unop_bitfield_reverse:
1282 emit(BFREV(result_dst, op[0]));
1283 break;
1284 case ir_unop_bit_count:
1285 emit(CBIT(result_dst, op[0]));
1286 break;
1287 case ir_unop_find_msb: {
1288 src_reg temp = src_reg(this, glsl_type::uint_type);
1289
1290 inst = emit(FBH(dst_reg(temp), op[0]));
1291 inst->dst.writemask = WRITEMASK_XYZW;
1292
1293 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1294 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1295 * subtract the result from 31 to convert the MSB count into an LSB count.
1296 */
1297
1298 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1299 temp.swizzle = BRW_SWIZZLE_NOOP;
1300 emit(MOV(result_dst, temp));
1301
1302 src_reg src_tmp = src_reg(result_dst);
1303 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1304
1305 src_tmp.negate = true;
1306 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1307 inst->predicate = BRW_PREDICATE_NORMAL;
1308 break;
1309 }
1310 case ir_unop_find_lsb:
1311 emit(FBL(result_dst, op[0]));
1312 break;
1313
1314 case ir_unop_noise:
1315 assert(!"not reached: should be handled by lower_noise");
1316 break;
1317
1318 case ir_binop_add:
1319 emit(ADD(result_dst, op[0], op[1]));
1320 break;
1321 case ir_binop_sub:
1322 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1323 break;
1324
1325 case ir_binop_mul:
1326 if (ir->type->is_integer()) {
1327 /* For integer multiplication, the MUL uses the low 16 bits of one of
1328 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1329 * adds in the contribution of the upper 16 bits of that
1330 * operand. If we can determine that one of the args is in the low
1331 * 16 bits, though, we can just emit a single MUL.
1332 */
1333 if (is_16bit_constant(ir->operands[0])) {
1334 if (brw->gen < 7)
1335 emit(MUL(result_dst, op[0], op[1]));
1336 else
1337 emit(MUL(result_dst, op[1], op[0]));
1338 } else if (is_16bit_constant(ir->operands[1])) {
1339 if (brw->gen < 7)
1340 emit(MUL(result_dst, op[1], op[0]));
1341 else
1342 emit(MUL(result_dst, op[0], op[1]));
1343 } else {
1344 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1345
1346 emit(MUL(acc, op[0], op[1]));
1347 emit(MACH(dst_null_d(), op[0], op[1]));
1348 emit(MOV(result_dst, src_reg(acc)));
1349 }
1350 } else {
1351 emit(MUL(result_dst, op[0], op[1]));
1352 }
1353 break;
1354 case ir_binop_div:
1355 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1356 assert(ir->type->is_integer());
1357 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1358 break;
1359 case ir_binop_mod:
1360 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1361 assert(ir->type->is_integer());
1362 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1363 break;
1364
1365 case ir_binop_less:
1366 case ir_binop_greater:
1367 case ir_binop_lequal:
1368 case ir_binop_gequal:
1369 case ir_binop_equal:
1370 case ir_binop_nequal: {
1371 emit(CMP(result_dst, op[0], op[1],
1372 brw_conditional_for_comparison(ir->operation)));
1373 emit(AND(result_dst, result_src, src_reg(0x1)));
1374 break;
1375 }
1376
1377 case ir_binop_all_equal:
1378 /* "==" operator producing a scalar boolean. */
1379 if (ir->operands[0]->type->is_vector() ||
1380 ir->operands[1]->type->is_vector()) {
1381 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1382 emit(MOV(result_dst, src_reg(0)));
1383 inst = emit(MOV(result_dst, src_reg(1)));
1384 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1385 } else {
1386 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1387 emit(AND(result_dst, result_src, src_reg(0x1)));
1388 }
1389 break;
1390 case ir_binop_any_nequal:
1391 /* "!=" operator producing a scalar boolean. */
1392 if (ir->operands[0]->type->is_vector() ||
1393 ir->operands[1]->type->is_vector()) {
1394 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1395
1396 emit(MOV(result_dst, src_reg(0)));
1397 inst = emit(MOV(result_dst, src_reg(1)));
1398 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1399 } else {
1400 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1401 emit(AND(result_dst, result_src, src_reg(0x1)));
1402 }
1403 break;
1404
1405 case ir_unop_any:
1406 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1407 emit(MOV(result_dst, src_reg(0)));
1408
1409 inst = emit(MOV(result_dst, src_reg(1)));
1410 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1411 break;
1412
1413 case ir_binop_logic_xor:
1414 emit(XOR(result_dst, op[0], op[1]));
1415 break;
1416
1417 case ir_binop_logic_or:
1418 emit(OR(result_dst, op[0], op[1]));
1419 break;
1420
1421 case ir_binop_logic_and:
1422 emit(AND(result_dst, op[0], op[1]));
1423 break;
1424
1425 case ir_binop_dot:
1426 assert(ir->operands[0]->type->is_vector());
1427 assert(ir->operands[0]->type == ir->operands[1]->type);
1428 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1429 break;
1430
1431 case ir_unop_sqrt:
1432 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1433 break;
1434 case ir_unop_rsq:
1435 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1436 break;
1437
1438 case ir_unop_bitcast_i2f:
1439 case ir_unop_bitcast_u2f:
1440 this->result = op[0];
1441 this->result.type = BRW_REGISTER_TYPE_F;
1442 break;
1443
1444 case ir_unop_bitcast_f2i:
1445 this->result = op[0];
1446 this->result.type = BRW_REGISTER_TYPE_D;
1447 break;
1448
1449 case ir_unop_bitcast_f2u:
1450 this->result = op[0];
1451 this->result.type = BRW_REGISTER_TYPE_UD;
1452 break;
1453
1454 case ir_unop_i2f:
1455 case ir_unop_i2u:
1456 case ir_unop_u2i:
1457 case ir_unop_u2f:
1458 case ir_unop_b2f:
1459 case ir_unop_b2i:
1460 case ir_unop_f2i:
1461 case ir_unop_f2u:
1462 emit(MOV(result_dst, op[0]));
1463 break;
1464 case ir_unop_f2b:
1465 case ir_unop_i2b: {
1466 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1467 emit(AND(result_dst, result_src, src_reg(1)));
1468 break;
1469 }
1470
1471 case ir_unop_trunc:
1472 emit(RNDZ(result_dst, op[0]));
1473 break;
1474 case ir_unop_ceil:
1475 op[0].negate = !op[0].negate;
1476 inst = emit(RNDD(result_dst, op[0]));
1477 this->result.negate = true;
1478 break;
1479 case ir_unop_floor:
1480 inst = emit(RNDD(result_dst, op[0]));
1481 break;
1482 case ir_unop_fract:
1483 inst = emit(FRC(result_dst, op[0]));
1484 break;
1485 case ir_unop_round_even:
1486 emit(RNDE(result_dst, op[0]));
1487 break;
1488
1489 case ir_binop_min:
1490 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1491 break;
1492 case ir_binop_max:
1493 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1494 break;
1495
1496 case ir_binop_pow:
1497 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1498 break;
1499
1500 case ir_unop_bit_not:
1501 inst = emit(NOT(result_dst, op[0]));
1502 break;
1503 case ir_binop_bit_and:
1504 inst = emit(AND(result_dst, op[0], op[1]));
1505 break;
1506 case ir_binop_bit_xor:
1507 inst = emit(XOR(result_dst, op[0], op[1]));
1508 break;
1509 case ir_binop_bit_or:
1510 inst = emit(OR(result_dst, op[0], op[1]));
1511 break;
1512
1513 case ir_binop_lshift:
1514 inst = emit(SHL(result_dst, op[0], op[1]));
1515 break;
1516
1517 case ir_binop_rshift:
1518 if (ir->type->base_type == GLSL_TYPE_INT)
1519 inst = emit(ASR(result_dst, op[0], op[1]));
1520 else
1521 inst = emit(SHR(result_dst, op[0], op[1]));
1522 break;
1523
1524 case ir_binop_bfm:
1525 emit(BFI1(result_dst, op[0], op[1]));
1526 break;
1527
1528 case ir_binop_ubo_load: {
1529 ir_constant *uniform_block = ir->operands[0]->as_constant();
1530 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1531 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1532 src_reg offset = op[1];
1533
1534 /* Now, load the vector from that offset. */
1535 assert(ir->type->is_vector() || ir->type->is_scalar());
1536
1537 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1538 packed_consts.type = result.type;
1539 src_reg surf_index =
1540 src_reg(SURF_INDEX_VEC4_UBO(uniform_block->value.u[0]));
1541 if (const_offset_ir) {
1542 offset = src_reg(const_offset / 16);
1543 } else {
1544 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1545 }
1546
1547 vec4_instruction *pull =
1548 emit(new(mem_ctx) vec4_instruction(this,
1549 VS_OPCODE_PULL_CONSTANT_LOAD,
1550 dst_reg(packed_consts),
1551 surf_index,
1552 offset));
1553 pull->base_mrf = 14;
1554 pull->mlen = 1;
1555
1556 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1557 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1558 const_offset % 16 / 4,
1559 const_offset % 16 / 4,
1560 const_offset % 16 / 4);
1561
1562 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1563 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1564 emit(CMP(result_dst, packed_consts, src_reg(0u),
1565 BRW_CONDITIONAL_NZ));
1566 emit(AND(result_dst, result, src_reg(0x1)));
1567 } else {
1568 emit(MOV(result_dst, packed_consts));
1569 }
1570 break;
1571 }
1572
1573 case ir_binop_vector_extract:
1574 assert(!"should have been lowered by vec_index_to_cond_assign");
1575 break;
1576
1577 case ir_triop_fma:
1578 op[0] = fix_3src_operand(op[0]);
1579 op[1] = fix_3src_operand(op[1]);
1580 op[2] = fix_3src_operand(op[2]);
1581 /* Note that the instruction's argument order is reversed from GLSL
1582 * and the IR.
1583 */
1584 emit(MAD(result_dst, op[2], op[1], op[0]));
1585 break;
1586
1587 case ir_triop_lrp:
1588 op[0] = fix_3src_operand(op[0]);
1589 op[1] = fix_3src_operand(op[1]);
1590 op[2] = fix_3src_operand(op[2]);
1591 /* Note that the instruction's argument order is reversed from GLSL
1592 * and the IR.
1593 */
1594 emit(LRP(result_dst, op[2], op[1], op[0]));
1595 break;
1596
1597 case ir_triop_csel:
1598 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1599 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1600 inst->predicate = BRW_PREDICATE_NORMAL;
1601 break;
1602
1603 case ir_triop_bfi:
1604 op[0] = fix_3src_operand(op[0]);
1605 op[1] = fix_3src_operand(op[1]);
1606 op[2] = fix_3src_operand(op[2]);
1607 emit(BFI2(result_dst, op[0], op[1], op[2]));
1608 break;
1609
1610 case ir_triop_bitfield_extract:
1611 op[0] = fix_3src_operand(op[0]);
1612 op[1] = fix_3src_operand(op[1]);
1613 op[2] = fix_3src_operand(op[2]);
1614 /* Note that the instruction's argument order is reversed from GLSL
1615 * and the IR.
1616 */
1617 emit(BFE(result_dst, op[2], op[1], op[0]));
1618 break;
1619
1620 case ir_triop_vector_insert:
1621 assert(!"should have been lowered by lower_vector_insert");
1622 break;
1623
1624 case ir_quadop_bitfield_insert:
1625 assert(!"not reached: should be handled by "
1626 "bitfield_insert_to_bfm_bfi\n");
1627 break;
1628
1629 case ir_quadop_vector:
1630 assert(!"not reached: should be handled by lower_quadop_vector");
1631 break;
1632
1633 case ir_unop_pack_half_2x16:
1634 emit_pack_half_2x16(result_dst, op[0]);
1635 break;
1636 case ir_unop_unpack_half_2x16:
1637 emit_unpack_half_2x16(result_dst, op[0]);
1638 break;
1639 case ir_unop_pack_snorm_2x16:
1640 case ir_unop_pack_snorm_4x8:
1641 case ir_unop_pack_unorm_2x16:
1642 case ir_unop_pack_unorm_4x8:
1643 case ir_unop_unpack_snorm_2x16:
1644 case ir_unop_unpack_snorm_4x8:
1645 case ir_unop_unpack_unorm_2x16:
1646 case ir_unop_unpack_unorm_4x8:
1647 assert(!"not reached: should be handled by lower_packing_builtins");
1648 break;
1649 case ir_unop_unpack_half_2x16_split_x:
1650 case ir_unop_unpack_half_2x16_split_y:
1651 case ir_binop_pack_half_2x16_split:
1652 assert(!"not reached: should not occur in vertex shader");
1653 break;
1654 }
1655 }
1656
1657
1658 void
1659 vec4_visitor::visit(ir_swizzle *ir)
1660 {
1661 src_reg src;
1662 int i = 0;
1663 int swizzle[4];
1664
1665 /* Note that this is only swizzles in expressions, not those on the left
1666 * hand side of an assignment, which do write masking. See ir_assignment
1667 * for that.
1668 */
1669
1670 ir->val->accept(this);
1671 src = this->result;
1672 assert(src.file != BAD_FILE);
1673
1674 for (i = 0; i < ir->type->vector_elements; i++) {
1675 switch (i) {
1676 case 0:
1677 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1678 break;
1679 case 1:
1680 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1681 break;
1682 case 2:
1683 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1684 break;
1685 case 3:
1686 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1687 break;
1688 }
1689 }
1690 for (; i < 4; i++) {
1691 /* Replicate the last channel out. */
1692 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1693 }
1694
1695 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1696
1697 this->result = src;
1698 }
1699
1700 void
1701 vec4_visitor::visit(ir_dereference_variable *ir)
1702 {
1703 const struct glsl_type *type = ir->type;
1704 dst_reg *reg = variable_storage(ir->var);
1705
1706 if (!reg) {
1707 fail("Failed to find variable storage for %s\n", ir->var->name);
1708 this->result = src_reg(brw_null_reg());
1709 return;
1710 }
1711
1712 this->result = src_reg(*reg);
1713
1714 /* System values get their swizzle from the dst_reg writemask */
1715 if (ir->var->mode == ir_var_system_value)
1716 return;
1717
1718 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1719 this->result.swizzle = swizzle_for_size(type->vector_elements);
1720 }
1721
1722
1723 int
1724 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1725 {
1726 /* Under normal circumstances array elements are stored consecutively, so
1727 * the stride is equal to the size of the array element.
1728 */
1729 return type_size(ir->type);
1730 }
1731
1732
1733 void
1734 vec4_visitor::visit(ir_dereference_array *ir)
1735 {
1736 ir_constant *constant_index;
1737 src_reg src;
1738 int array_stride = compute_array_stride(ir);
1739
1740 constant_index = ir->array_index->constant_expression_value();
1741
1742 ir->array->accept(this);
1743 src = this->result;
1744
1745 if (constant_index) {
1746 src.reg_offset += constant_index->value.i[0] * array_stride;
1747 } else {
1748 /* Variable index array dereference. It eats the "vec4" of the
1749 * base of the array and an index that offsets the Mesa register
1750 * index.
1751 */
1752 ir->array_index->accept(this);
1753
1754 src_reg index_reg;
1755
1756 if (array_stride == 1) {
1757 index_reg = this->result;
1758 } else {
1759 index_reg = src_reg(this, glsl_type::int_type);
1760
1761 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1762 }
1763
1764 if (src.reladdr) {
1765 src_reg temp = src_reg(this, glsl_type::int_type);
1766
1767 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1768
1769 index_reg = temp;
1770 }
1771
1772 src.reladdr = ralloc(mem_ctx, src_reg);
1773 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1774 }
1775
1776 /* If the type is smaller than a vec4, replicate the last channel out. */
1777 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1778 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1779 else
1780 src.swizzle = BRW_SWIZZLE_NOOP;
1781 src.type = brw_type_for_base_type(ir->type);
1782
1783 this->result = src;
1784 }
1785
1786 void
1787 vec4_visitor::visit(ir_dereference_record *ir)
1788 {
1789 unsigned int i;
1790 const glsl_type *struct_type = ir->record->type;
1791 int offset = 0;
1792
1793 ir->record->accept(this);
1794
1795 for (i = 0; i < struct_type->length; i++) {
1796 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1797 break;
1798 offset += type_size(struct_type->fields.structure[i].type);
1799 }
1800
1801 /* If the type is smaller than a vec4, replicate the last channel out. */
1802 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1803 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1804 else
1805 this->result.swizzle = BRW_SWIZZLE_NOOP;
1806 this->result.type = brw_type_for_base_type(ir->type);
1807
1808 this->result.reg_offset += offset;
1809 }
1810
1811 /**
1812 * We want to be careful in assignment setup to hit the actual storage
1813 * instead of potentially using a temporary like we might with the
1814 * ir_dereference handler.
1815 */
1816 static dst_reg
1817 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1818 {
1819 /* The LHS must be a dereference. If the LHS is a variable indexed array
1820 * access of a vector, it must be separated into a series conditional moves
1821 * before reaching this point (see ir_vec_index_to_cond_assign).
1822 */
1823 assert(ir->as_dereference());
1824 ir_dereference_array *deref_array = ir->as_dereference_array();
1825 if (deref_array) {
1826 assert(!deref_array->array->type->is_vector());
1827 }
1828
1829 /* Use the rvalue deref handler for the most part. We'll ignore
1830 * swizzles in it and write swizzles using writemask, though.
1831 */
1832 ir->accept(v);
1833 return dst_reg(v->result);
1834 }
1835
1836 void
1837 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1838 const struct glsl_type *type, uint32_t predicate)
1839 {
1840 if (type->base_type == GLSL_TYPE_STRUCT) {
1841 for (unsigned int i = 0; i < type->length; i++) {
1842 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1843 }
1844 return;
1845 }
1846
1847 if (type->is_array()) {
1848 for (unsigned int i = 0; i < type->length; i++) {
1849 emit_block_move(dst, src, type->fields.array, predicate);
1850 }
1851 return;
1852 }
1853
1854 if (type->is_matrix()) {
1855 const struct glsl_type *vec_type;
1856
1857 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1858 type->vector_elements, 1);
1859
1860 for (int i = 0; i < type->matrix_columns; i++) {
1861 emit_block_move(dst, src, vec_type, predicate);
1862 }
1863 return;
1864 }
1865
1866 assert(type->is_scalar() || type->is_vector());
1867
1868 dst->type = brw_type_for_base_type(type);
1869 src->type = dst->type;
1870
1871 dst->writemask = (1 << type->vector_elements) - 1;
1872
1873 src->swizzle = swizzle_for_size(type->vector_elements);
1874
1875 vec4_instruction *inst = emit(MOV(*dst, *src));
1876 inst->predicate = predicate;
1877
1878 dst->reg_offset++;
1879 src->reg_offset++;
1880 }
1881
1882
1883 /* If the RHS processing resulted in an instruction generating a
1884 * temporary value, and it would be easy to rewrite the instruction to
1885 * generate its result right into the LHS instead, do so. This ends
1886 * up reliably removing instructions where it can be tricky to do so
1887 * later without real UD chain information.
1888 */
1889 bool
1890 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1891 dst_reg dst,
1892 src_reg src,
1893 vec4_instruction *pre_rhs_inst,
1894 vec4_instruction *last_rhs_inst)
1895 {
1896 /* This could be supported, but it would take more smarts. */
1897 if (ir->condition)
1898 return false;
1899
1900 if (pre_rhs_inst == last_rhs_inst)
1901 return false; /* No instructions generated to work with. */
1902
1903 /* Make sure the last instruction generated our source reg. */
1904 if (src.file != GRF ||
1905 src.file != last_rhs_inst->dst.file ||
1906 src.reg != last_rhs_inst->dst.reg ||
1907 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1908 src.reladdr ||
1909 src.abs ||
1910 src.negate ||
1911 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1912 return false;
1913
1914 /* Check that the last instruction fully initialized the channels
1915 * we want to use, in the order we want to use them. We could
1916 * potentially reswizzle the operands of many instructions so that
1917 * we could handle out of order channels, but don't yet.
1918 */
1919
1920 for (unsigned i = 0; i < 4; i++) {
1921 if (dst.writemask & (1 << i)) {
1922 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1923 return false;
1924
1925 if (BRW_GET_SWZ(src.swizzle, i) != i)
1926 return false;
1927 }
1928 }
1929
1930 /* Success! Rewrite the instruction. */
1931 last_rhs_inst->dst.file = dst.file;
1932 last_rhs_inst->dst.reg = dst.reg;
1933 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1934 last_rhs_inst->dst.reladdr = dst.reladdr;
1935 last_rhs_inst->dst.writemask &= dst.writemask;
1936
1937 return true;
1938 }
1939
1940 void
1941 vec4_visitor::visit(ir_assignment *ir)
1942 {
1943 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1944 uint32_t predicate = BRW_PREDICATE_NONE;
1945
1946 if (!ir->lhs->type->is_scalar() &&
1947 !ir->lhs->type->is_vector()) {
1948 ir->rhs->accept(this);
1949 src_reg src = this->result;
1950
1951 if (ir->condition) {
1952 emit_bool_to_cond_code(ir->condition, &predicate);
1953 }
1954
1955 /* emit_block_move doesn't account for swizzles in the source register.
1956 * This should be ok, since the source register is a structure or an
1957 * array, and those can't be swizzled. But double-check to be sure.
1958 */
1959 assert(src.swizzle ==
1960 (ir->rhs->type->is_matrix()
1961 ? swizzle_for_size(ir->rhs->type->vector_elements)
1962 : BRW_SWIZZLE_NOOP));
1963
1964 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1965 return;
1966 }
1967
1968 /* Now we're down to just a scalar/vector with writemasks. */
1969 int i;
1970
1971 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1972 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1973
1974 ir->rhs->accept(this);
1975
1976 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1977
1978 src_reg src = this->result;
1979
1980 int swizzles[4];
1981 int first_enabled_chan = 0;
1982 int src_chan = 0;
1983
1984 assert(ir->lhs->type->is_vector() ||
1985 ir->lhs->type->is_scalar());
1986 dst.writemask = ir->write_mask;
1987
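/* Find the RHS swizzle component feeding the first enabled destination
 * channel; it is used below to pad the swizzle slots of channels that
 * aren't being written.
 */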
1988 for (int i = 0; i < 4; i++) {
1989 if (dst.writemask & (1 << i)) {
1990 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1991 break;
1992 }
1993 }
1994
1995 /* Swizzle a small RHS vector into the channels being written.
1996 *
1997 * GLSL IR treats write_mask as dictating how many channels are
1998 * present on the RHS, while in our instructions we need those
1999 * channels to appear in the slots of the vec4 they're written to.
2000 */
2001 for (int i = 0; i < 4; i++) {
2002 if (dst.writemask & (1 << i))
2003 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2004 else
2005 swizzles[i] = first_enabled_chan;
2006 }
2007 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2008 swizzles[2], swizzles[3]);
2009
2010 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2011 return;
2012 }
2013
2014 if (ir->condition) {
2015 emit_bool_to_cond_code(ir->condition, &predicate);
2016 }
2017
2018 for (i = 0; i < type_size(ir->lhs->type); i++) {
2019 vec4_instruction *inst = emit(MOV(dst, src));
2020 inst->predicate = predicate;
2021
2022 dst.reg_offset++;
2023 src.reg_offset++;
2024 }
2025 }
2026
2027 void
2028 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2029 {
2030 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2031 foreach_list(node, &ir->components) {
2032 ir_constant *field_value = (ir_constant *)node;
2033
2034 emit_constant_values(dst, field_value);
2035 }
2036 return;
2037 }
2038
2039 if (ir->type->is_array()) {
2040 for (unsigned int i = 0; i < ir->type->length; i++) {
2041 emit_constant_values(dst, ir->array_elements[i]);
2042 }
2043 return;
2044 }
2045
2046 if (ir->type->is_matrix()) {
2047 for (int i = 0; i < ir->type->matrix_columns; i++) {
2048 float *vec = &ir->value.f[i * ir->type->vector_elements];
2049
2050 for (int j = 0; j < ir->type->vector_elements; j++) {
2051 dst->writemask = 1 << j;
2052 dst->type = BRW_REGISTER_TYPE_F;
2053
2054 emit(MOV(*dst, src_reg(vec[j])));
2055 }
2056 dst->reg_offset++;
2057 }
2058 return;
2059 }
2060
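/* Emit at most one MOV per distinct scalar value: channels whose
 * constants compare equal are folded into a shared writemask below.
 */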
2061 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2062
2063 for (int i = 0; i < ir->type->vector_elements; i++) {
2064 if (!(remaining_writemask & (1 << i)))
2065 continue;
2066
2067 dst->writemask = 1 << i;
2068 dst->type = brw_type_for_base_type(ir->type);
2069
2070 /* Find other components that match the one we're about to
2071 * write. Emits fewer instructions for things like vec4(0.5,
2072 * 1.5, 1.5, 1.5).
2073 */
2074 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2075 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2076 if (ir->value.b[i] == ir->value.b[j])
2077 dst->writemask |= (1 << j);
2078 } else {
2079 /* u, i, and f storage all line up, so no need for a
2080 * switch case for comparing each type.
2081 */
2082 if (ir->value.u[i] == ir->value.u[j])
2083 dst->writemask |= (1 << j);
2084 }
2085 }
2086
2087 switch (ir->type->base_type) {
2088 case GLSL_TYPE_FLOAT:
2089 emit(MOV(*dst, src_reg(ir->value.f[i])));
2090 break;
2091 case GLSL_TYPE_INT:
2092 emit(MOV(*dst, src_reg(ir->value.i[i])));
2093 break;
2094 case GLSL_TYPE_UINT:
2095 emit(MOV(*dst, src_reg(ir->value.u[i])));
2096 break;
2097 case GLSL_TYPE_BOOL:
2098 emit(MOV(*dst, src_reg(ir->value.b[i])));
2099 break;
2100 default:
2101 assert(!"Non-float/uint/int/bool constant");
2102 break;
2103 }
2104
2105 remaining_writemask &= ~dst->writemask;
2106 }
2107 dst->reg_offset++;
2108 }
2109
2110 void
2111 vec4_visitor::visit(ir_constant *ir)
2112 {
2113 dst_reg dst = dst_reg(this, ir->type);
2114 this->result = src_reg(dst);
2115
2116 emit_constant_values(&dst, ir);
2117 }
2118
2119 void
2120 vec4_visitor::visit(ir_call *ir)
2121 {
2122 assert(!"not reached");
2123 }
2124
2125 void
2126 vec4_visitor::visit(ir_texture *ir)
2127 {
2128 int sampler =
2129 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2130
2131 /* Should be lowered by do_lower_texture_projection */
2132 assert(!ir->projector);
2133
2134 /* Generate code to compute all the subexpression trees. This has to be
2135 * done before loading any values into MRFs for the sampler message since
2136 * generating these values may involve SEND messages that need the MRFs.
2137 */
2138 src_reg coordinate;
2139 if (ir->coordinate) {
2140 ir->coordinate->accept(this);
2141 coordinate = this->result;
2142 }
2143
2144 src_reg shadow_comparitor;
2145 if (ir->shadow_comparitor) {
2146 ir->shadow_comparitor->accept(this);
2147 shadow_comparitor = this->result;
2148 }
2149
2150 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2151 src_reg lod, dPdx, dPdy, sample_index;
2152 switch (ir->op) {
2153 case ir_tex:
2154 lod = src_reg(0.0f);
2155 lod_type = glsl_type::float_type;
2156 break;
2157 case ir_txf:
2158 case ir_txl:
2159 case ir_txs:
2160 ir->lod_info.lod->accept(this);
2161 lod = this->result;
2162 lod_type = ir->lod_info.lod->type;
2163 break;
2164 case ir_txf_ms:
2165 ir->lod_info.sample_index->accept(this);
2166 sample_index = this->result;
2167 sample_index_type = ir->lod_info.sample_index->type;
2168 break;
2169 case ir_txd:
2170 ir->lod_info.grad.dPdx->accept(this);
2171 dPdx = this->result;
2172
2173 ir->lod_info.grad.dPdy->accept(this);
2174 dPdy = this->result;
2175
2176 lod_type = ir->lod_info.grad.dPdx->type;
2177 break;
2178 case ir_txb:
2179 case ir_lod:
2180 break;
2181 }
2182
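/* Select the sampler message opcode. ir_tex maps to TXL with the
 * explicit LOD of 0 set up above, since vertex shaders have no
 * implicit derivatives to drive an ordinary sample message.
 */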
2183 vec4_instruction *inst = NULL;
2184 switch (ir->op) {
2185 case ir_tex:
2186 case ir_txl:
2187 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2188 break;
2189 case ir_txd:
2190 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2191 break;
2192 case ir_txf:
2193 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2194 break;
2195 case ir_txf_ms:
2196 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2197 break;
2198 case ir_txs:
2199 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2200 break;
2201 case ir_txb:
2202 assert(!"TXB is not valid for vertex shaders.");
2203 break;
2204 case ir_lod:
2205 assert(!"LOD is not valid for vertex shaders.");
2206 break;
2207 }
2208
2209 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2210
2211 /* Texel offsets go in the message header; Gen4 also requires headers. */
2212 inst->header_present = use_texture_offset || brw->gen < 5;
2213 inst->base_mrf = 2;
2214 inst->mlen = inst->header_present + 1; /* always at least one */
2215 inst->sampler = sampler;
2216 inst->dst = dst_reg(this, ir->type);
2217 inst->dst.writemask = WRITEMASK_XYZW;
2218 inst->shadow_compare = ir->shadow_comparitor != NULL;
2219
2220 if (use_texture_offset)
2221 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2222
2223 /* MRF for the first parameter */
2224 int param_base = inst->base_mrf + inst->header_present;
2225
2226 if (ir->op == ir_txs) {
2227 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2228 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2229 } else {
2230 /* Load the coordinate */
2231 /* FINISHME: gl_clamp_mask and saturate */
2232 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2233 int zero_mask = 0xf & ~coord_mask;
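/* Coordinate channels the type doesn't supply are zero-filled below. */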
2234
2235 if (ir->offset && ir->op == ir_txf) {
2236 /* It appears that the ld instruction used for txf does its
2237 * address bounds check before adding in the offset. To work
2238 * around this, just add the integer offset to the integer
2239 * texel coordinate, and don't put the offset in the header.
2240 */
2241 ir_constant *offset = ir->offset->as_constant();
2242 assert(offset);
2243
2244 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2245 src_reg src = coordinate;
2246 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2247 BRW_GET_SWZ(src.swizzle, j),
2248 BRW_GET_SWZ(src.swizzle, j),
2249 BRW_GET_SWZ(src.swizzle, j));
2250 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2251 src, offset->value.i[j]));
2252 }
2253 } else {
2254 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2255 coordinate));
2256 }
2257 if (zero_mask != 0) {
2258 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2259 src_reg(0)));
2260 }
2261 /* Load the shadow comparitor */
2262 if (ir->shadow_comparitor && ir->op != ir_txd) {
2263 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2264 WRITEMASK_X),
2265 shadow_comparitor));
2266 inst->mlen++;
2267 }
2268
2269 /* Load the LOD info */
2270 if (ir->op == ir_tex || ir->op == ir_txl) {
2271 int mrf, writemask;
2272 if (brw->gen >= 5) {
2273 mrf = param_base + 1;
2274 if (ir->shadow_comparitor) {
2275 writemask = WRITEMASK_Y;
2276 /* mlen already incremented */
2277 } else {
2278 writemask = WRITEMASK_X;
2279 inst->mlen++;
2280 }
2281 } else /* brw->gen == 4 */ {
2282 mrf = param_base;
2283 writemask = WRITEMASK_W;
2284 }
2285 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2286 } else if (ir->op == ir_txf) {
2287 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2288 } else if (ir->op == ir_txf_ms) {
2289 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2290 sample_index));
2291 inst->mlen++;
2292
2293 /* On Gen7, there is an additional MCS parameter here after the sample
2294 * index, but we don't bother to emit it since it's always zero. If we
2295 * start supporting texturing from CMS surfaces, this will have to
2296 * change.
2297 */
2298 } else if (ir->op == ir_txd) {
2299 const glsl_type *type = lod_type;
2300
2301 if (brw->gen >= 5) {
2302 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2303 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2304 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2305 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2306 inst->mlen++;
2307
2308 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2309 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2310 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2311 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2312 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2313 inst->mlen++;
2314
2315 if (ir->shadow_comparitor) {
2316 emit(MOV(dst_reg(MRF, param_base + 2,
2317 ir->shadow_comparitor->type, WRITEMASK_Z),
2318 shadow_comparitor));
2319 }
2320 }
2321 } else /* brw->gen == 4 */ {
2322 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2323 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2324 inst->mlen += 2;
2325 }
2326 }
2327 }
2328
2329 emit(inst);
2330
2331 /* Fix up the number of layers (Z component) for cube arrays: the
2332 * hardware returns faces * layers, but the spec requires just layers.
2333 */
2334 if (ir->op == ir_txs) {
2335 glsl_type const *type = ir->sampler->type;
2336 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2337 type->sampler_array) {
2338 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2339 with_writemask(inst->dst, WRITEMASK_Z),
2340 src_reg(inst->dst), src_reg(6));
2341 }
2342 }
2343
2344 swizzle_result(ir, src_reg(inst->dst), sampler);
2345 }
2346
2347 void
2348 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2349 {
2350 int s = key->tex.swizzles[sampler];
2351
2352 this->result = src_reg(this, ir->type);
2353 dst_reg swizzled_result(this->result);
2354
2355 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2356 || s == SWIZZLE_NOOP) {
2357 emit(MOV(swizzled_result, orig_val));
2358 return;
2359 }
2360
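/* Split the GL texture swizzle into channels copied from the sampler
 * result (copy_mask), forced to 0.0 (zero_mask), and forced to 1.0
 * (one_mask); each group gets its own MOV below.
 */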
2361 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2362 int swizzle[4] = {0};
2363
2364 for (int i = 0; i < 4; i++) {
2365 switch (GET_SWZ(s, i)) {
2366 case SWIZZLE_ZERO:
2367 zero_mask |= (1 << i);
2368 break;
2369 case SWIZZLE_ONE:
2370 one_mask |= (1 << i);
2371 break;
2372 default:
2373 copy_mask |= (1 << i);
2374 swizzle[i] = GET_SWZ(s, i);
2375 break;
2376 }
2377 }
2378
2379 if (copy_mask) {
2380 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2381 swizzled_result.writemask = copy_mask;
2382 emit(MOV(swizzled_result, orig_val));
2383 }
2384
2385 if (zero_mask) {
2386 swizzled_result.writemask = zero_mask;
2387 emit(MOV(swizzled_result, src_reg(0.0f)));
2388 }
2389
2390 if (one_mask) {
2391 swizzled_result.writemask = one_mask;
2392 emit(MOV(swizzled_result, src_reg(1.0f)));
2393 }
2394 }
2395
2396 void
2397 vec4_visitor::visit(ir_return *ir)
2398 {
2399 assert(!"not reached");
2400 }
2401
2402 void
2403 vec4_visitor::visit(ir_discard *ir)
2404 {
2405 assert(!"not reached");
2406 }
2407
2408 void
2409 vec4_visitor::visit(ir_if *ir)
2410 {
2411 /* Don't point the annotation at the if statement, because then the
2412 * whole statement, including the then and else blocks, gets printed.
2413 */
2414 this->base_ir = ir->condition;
2415
2416 if (brw->gen == 6) {
2417 emit_if_gen6(ir);
2418 } else {
2419 uint32_t predicate;
2420 emit_bool_to_cond_code(ir->condition, &predicate);
2421 emit(IF(predicate));
2422 }
2423
2424 visit_instructions(&ir->then_instructions);
2425
2426 if (!ir->else_instructions.is_empty()) {
2427 this->base_ir = ir->condition;
2428 emit(BRW_OPCODE_ELSE);
2429
2430 visit_instructions(&ir->else_instructions);
2431 }
2432
2433 this->base_ir = ir->condition;
2434 emit(BRW_OPCODE_ENDIF);
2435 }
2436
2437 void
2438 vec4_visitor::visit(ir_emit_vertex *)
2439 {
2440 assert(!"not reached");
2441 }
2442
2443 void
2444 vec4_visitor::visit(ir_end_primitive *)
2445 {
2446 assert(!"not reached");
2447 }
2448
2449 void
2450 vec4_visitor::emit_ndc_computation()
2451 {
2452 /* Get the position */
2453 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2454
2455 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2456 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2457 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2458
2459 current_annotation = "NDC";
2460 dst_reg ndc_w = ndc;
2461 ndc_w.writemask = WRITEMASK_W;
2462 src_reg pos_w = pos;
2463 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2464 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2465
2466 dst_reg ndc_xyz = ndc;
2467 ndc_xyz.writemask = WRITEMASK_XYZ;
2468
2469 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2470 }
2471
2472 void
2473 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2474 {
2475 if (brw->gen < 6 &&
2476 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2477 key->userclip_active || brw->has_negative_rhw_bug)) {
2478 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2479 dst_reg header1_w = header1;
2480 header1_w.writemask = WRITEMASK_W;
2481
2482 emit(MOV(header1, 0u));
2483
2484 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2485 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2486
2487 current_annotation = "Point size";
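/* Scale the point size by 2^11 and mask it to the 11-bit field at
 * bits 8..18, where the fixed-point point width lives in this header
 * word.
 */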
2488 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2489 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2490 }
2491
2492 if (key->userclip_active) {
2493 current_annotation = "Clipping flags";
2494 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2495 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2496
2497 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2498 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2499 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2500
2501 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2502 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2503 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2504 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2505 }
2506
2507 /* i965 clipping workaround:
2508 * 1) Test for a negative RHW.
2509 * 2) If it is negative:
2510 * set ndc = (0,0,0,0)
2511 * set ucp[6] = 1
2512 *
2513 * Later, clipping will detect ucp[6] and ensure the primitive is
2514 * clipped against all fixed planes.
2515 */
2516 if (brw->has_negative_rhw_bug) {
2517 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2518 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2519 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2520 vec4_instruction *inst;
2521 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2522 inst->predicate = BRW_PREDICATE_NORMAL;
2523 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2524 inst->predicate = BRW_PREDICATE_NORMAL;
2525 }
2526
2527 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2528 } else if (brw->gen < 6) {
2529 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2530 } else {
2531 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2532 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2533 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2534 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2535 }
2536 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2537 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2538 src_reg(output_reg[VARYING_SLOT_LAYER])));
2539 }
2540 }
2541 }
2542
2543 void
2544 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2545 {
2546 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2547 *
2548 * "If a linked set of shaders forming the vertex stage contains no
2549 * static write to gl_ClipVertex or gl_ClipDistance, but the
2550 * application has requested clipping against user clip planes through
2551 * the API, then the coordinate written to gl_Position is used for
2552 * comparison against the user clip planes."
2553 *
2554 * This function is only called if the shader didn't write to
2555 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2556 * if the user wrote to it; otherwise we use gl_Position.
2557 */
2558 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2559 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2560 clip_vertex = VARYING_SLOT_POS;
2561 }
2562
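/* Each enabled user clip plane gets one DP4 of the clip vertex against
 * the plane equation, written to a single channel of the output slot.
 */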
2563 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2564 ++i) {
2565 reg.writemask = 1 << i;
2566 emit(DP4(reg,
2567 src_reg(output_reg[clip_vertex]),
2568 src_reg(this->userplane[i + offset])));
2569 }
2570 }
2571
2572 void
2573 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2574 {
2575 assert (varying < VARYING_SLOT_MAX);
2576 reg.type = output_reg[varying].type;
2577 current_annotation = output_reg_annotation[varying];
2578 /* Copy the register, saturating if necessary */
2579 vec4_instruction *inst = emit(MOV(reg,
2580 src_reg(output_reg[varying])));
2581 if ((varying == VARYING_SLOT_COL0 ||
2582 varying == VARYING_SLOT_COL1 ||
2583 varying == VARYING_SLOT_BFC0 ||
2584 varying == VARYING_SLOT_BFC1) &&
2585 key->clamp_vertex_color) {
2586 inst->saturate = true;
2587 }
2588 }
2589
2590 void
2591 vec4_visitor::emit_urb_slot(int mrf, int varying)
2592 {
2593 struct brw_reg hw_reg = brw_message_reg(mrf);
2594 dst_reg reg = dst_reg(MRF, mrf);
2595 reg.type = BRW_REGISTER_TYPE_F;
2596
2597 switch (varying) {
2598 case VARYING_SLOT_PSIZ:
2599 /* PSIZ is always in slot 0, and is coupled with other flags. */
2600 current_annotation = "indices, point width, clip flags";
2601 emit_psiz_and_flags(hw_reg);
2602 break;
2603 case BRW_VARYING_SLOT_NDC:
2604 current_annotation = "NDC";
2605 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2606 break;
2607 case VARYING_SLOT_POS:
2608 current_annotation = "gl_Position";
2609 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2610 break;
2611 case VARYING_SLOT_EDGE:
2612 /* This is present when doing unfilled polygons. We're supposed to copy
2613 * the edge flag from the user-provided vertex array
2614 * (glEdgeFlagPointer); otherwise we copy the current value of that
2615 * attribute (which starts as 1.0f). This is then used in clipping to
2616 * determine which edges should be drawn as wireframe.
2617 */
2618 current_annotation = "edge flag";
2619 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2620 glsl_type::float_type, WRITEMASK_XYZW))));
2621 break;
2622 case BRW_VARYING_SLOT_PAD:
2623 /* No need to write to this slot */
2624 break;
2625 default:
2626 emit_generic_urb_slot(reg, varying);
2627 break;
2628 }
2629 }
2630
2631 static int
2632 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2633 {
2634 if (brw->gen >= 6) {
2635 /* URB data written (does not include the message header reg) must
2636 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2637 * section 5.4.3.2.2: URB_INTERLEAVED.
2638 *
2639 * URB entries are allocated on a multiple of 1024 bits, so an
2640 * extra 128 bits written here to make the end align to 256 is
2641 * no problem.
2642 */
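/* mlen counts the message header register as well, so making mlen odd
 * keeps the data portion an even number of registers.
 */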
2643 if ((mlen % 2) != 1)
2644 mlen++;
2645 }
2646
2647 return mlen;
2648 }
2649
2650
2651 /**
2652 * Generates the VUE payload plus the necessary URB write instructions to
2653 * output it.
2654 *
2655 * The VUE layout is documented in Volume 2a.
2656 */
2657 void
2658 vec4_visitor::emit_vertex()
2659 {
2660 /* MRF 0 is reserved for the debugger, so start with message header
2661 * in MRF 1.
2662 */
2663 int base_mrf = 1;
2664 int mrf = base_mrf;
2665 /* In the process of generating our URB write message contents, we
2666 * may need to unspill a register or load from an array. Those
2667 * reads would use MRFs 14-15.
2668 */
2669 int max_usable_mrf = 13;
2670
2671 /* The following assertion verifies that max_usable_mrf yields an
2672 * even number of URB write data registers, which meets gen6's
2673 * requirement for length alignment.
2674 */
2675 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2676
2677 /* First mrf is the g0-based message header containing URB handles and
2678 * such.
2679 */
2680 emit_urb_write_header(mrf++);
2681
2682 if (brw->gen < 6) {
2683 emit_ndc_computation();
2684 }
2685
2686 /* Lower legacy ff and ClipVertex clipping to clip distances */
2687 if (key->userclip_active && !key->uses_clip_distance) {
2688 current_annotation = "user clip distances";
2689
2690 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2691 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2692
2693 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2694 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2695 }
2696
2697 /* We may need to split this up into several URB writes, so do them in a
2698 * loop.
2699 */
2700 int slot = 0;
2701 bool complete = false;
2702 do {
2703 /* URB offset is in URB row increments, and each of our MRFs is half of
2704 * one of those, since we're doing interleaved writes.
2705 */
2706 int offset = slot / 2;
2707
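/* Payload data for this write starts in the MRF just after the
 * message header.
 */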
2708 mrf = base_mrf + 1;
2709 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2710 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2711
2712 /* If this was max_usable_mrf, we can't fit anything more into this
2713 * URB WRITE.
2714 */
2715 if (mrf > max_usable_mrf) {
2716 slot++;
2717 break;
2718 }
2719 }
2720
2721 complete = slot >= prog_data->vue_map.num_slots;
2722 current_annotation = "URB write";
2723 vec4_instruction *inst = emit_urb_write_opcode(complete);
2724 inst->base_mrf = base_mrf;
2725 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2726 inst->offset += offset;
2727 } while(!complete);
2728 }
2729
2730
2731 src_reg
2732 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2733 src_reg *reladdr, int reg_offset)
2734 {
2735 /* Because we store the values to scratch interleaved like our
2736 * vertex data, we need to scale the vec4 index by 2.
2737 */
2738 int message_header_scale = 2;
2739
2740 /* Pre-gen6, the message header uses byte offsets instead of vec4
2741 * (16-byte) offset units.
2742 */
2743 if (brw->gen < 6)
2744 message_header_scale *= 16;
2745
2746 if (reladdr) {
2747 src_reg index = src_reg(this, glsl_type::int_type);
2748
2749 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2750 emit_before(inst, MUL(dst_reg(index),
2751 index, src_reg(message_header_scale)));
2752
2753 return index;
2754 } else {
2755 return src_reg(reg_offset * message_header_scale);
2756 }
2757 }
2758
2759 src_reg
2760 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2761 src_reg *reladdr, int reg_offset)
2762 {
2763 if (reladdr) {
2764 src_reg index = src_reg(this, glsl_type::int_type);
2765
2766 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2767
2768 /* Pre-gen6, the message header uses byte offsets instead of vec4
2769 * (16-byte) offset units.
2770 */
2771 if (brw->gen < 6) {
2772 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2773 }
2774
2775 return index;
2776 } else {
2777 int message_header_scale = brw->gen < 6 ? 16 : 1;
2778 return src_reg(reg_offset * message_header_scale);
2779 }
2780 }
2781
2782 /**
2783 * Emits an instruction before @inst to load the value named by @orig_src
2784 * from scratch space at @base_offset to @temp.
2785 *
2786 * @base_offset is measured in 32-byte units (the size of a register).
2787 */
2788 void
2789 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2790 dst_reg temp, src_reg orig_src,
2791 int base_offset)
2792 {
2793 int reg_offset = base_offset + orig_src.reg_offset;
2794 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2795
2796 emit_before(inst, SCRATCH_READ(temp, index));
2797 }
2798
2799 /**
2800 * Emits an instruction after @inst to store the value to be written
2801 * to @orig_dst to scratch space at @base_offset, from @temp.
2802 *
2803 * @base_offset is measured in 32-byte units (the size of a register).
2804 */
2805 void
2806 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2807 {
2808 int reg_offset = base_offset + inst->dst.reg_offset;
2809 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2810
2811 /* Create a temporary register to store *inst's result in.
2812 *
2813 * We have to be careful in MOVing from our temporary result register in
2814 * the scratch write. If we swizzle from channels of the temporary that
2815 * weren't initialized, it will confuse live interval analysis, which will
2816 * make spilling fail to make progress.
2817 */
2818 src_reg temp = src_reg(this, glsl_type::vec4_type);
2819 temp.type = inst->dst.type;
2820 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2821 int swizzles[4];
2822 for (int i = 0; i < 4; i++)
2823 if (inst->dst.writemask & (1 << i))
2824 swizzles[i] = i;
2825 else
2826 swizzles[i] = first_writemask_chan;
2827 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2828 swizzles[2], swizzles[3]);
2829
2830 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2831 inst->dst.writemask));
2832 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2833 write->predicate = inst->predicate;
2834 write->ir = inst->ir;
2835 write->annotation = inst->annotation;
2836 inst->insert_after(write);
2837
2838 inst->dst.file = temp.file;
2839 inst->dst.reg = temp.reg;
2840 inst->dst.reg_offset = temp.reg_offset;
2841 inst->dst.reladdr = NULL;
2842 }
2843
2844 /**
2845 * We can't generally support array access in GRF space, because a
2846 * single instruction's destination can only span 2 contiguous
2847 * registers. So, we send all GRF arrays that are accessed with a
2848 * variable index to scratch space.
2849 */
2850 void
2851 vec4_visitor::move_grf_array_access_to_scratch()
2852 {
2853 int scratch_loc[this->virtual_grf_count];
2854
2855 for (int i = 0; i < this->virtual_grf_count; i++) {
2856 scratch_loc[i] = -1;
2857 }
2858
2859 /* First, calculate the set of virtual GRFs that need to be punted
2860 * to scratch due to having any array access on them, and where in
2861 * scratch.
2862 */
2863 foreach_list(node, &this->instructions) {
2864 vec4_instruction *inst = (vec4_instruction *)node;
2865
2866 if (inst->dst.file == GRF && inst->dst.reladdr &&
2867 scratch_loc[inst->dst.reg] == -1) {
2868 scratch_loc[inst->dst.reg] = c->last_scratch;
2869 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2870 }
2871
2872 for (int i = 0 ; i < 3; i++) {
2873 src_reg *src = &inst->src[i];
2874
2875 if (src->file == GRF && src->reladdr &&
2876 scratch_loc[src->reg] == -1) {
2877 scratch_loc[src->reg] = c->last_scratch;
2878 c->last_scratch += this->virtual_grf_sizes[src->reg];
2879 }
2880 }
2881 }
2882
2883 /* Now, for anything that will be accessed through scratch, rewrite
2884 * it to load/store. Note that this is a _safe list walk, because
2885 * we may generate a new scratch_write instruction after the one
2886 * we're processing.
2887 */
2888 foreach_list_safe(node, &this->instructions) {
2889 vec4_instruction *inst = (vec4_instruction *)node;
2890
2891 /* Set up the annotation tracking for new generated instructions. */
2892 base_ir = inst->ir;
2893 current_annotation = inst->annotation;
2894
2895 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2896 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2897 }
2898
2899 for (int i = 0 ; i < 3; i++) {
2900 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2901 continue;
2902
2903 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2904
2905 emit_scratch_read(inst, temp, inst->src[i],
2906 scratch_loc[inst->src[i].reg]);
2907
2908 inst->src[i].file = temp.file;
2909 inst->src[i].reg = temp.reg;
2910 inst->src[i].reg_offset = temp.reg_offset;
2911 inst->src[i].reladdr = NULL;
2912 }
2913 }
2914 }
2915
2916 /**
2917 * Emits an instruction before @inst to load the value named by @orig_src
2918 * from the pull constant buffer (surface) at @base_offset to @temp.
2919 */
2920 void
2921 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2922 dst_reg temp, src_reg orig_src,
2923 int base_offset)
2924 {
2925 int reg_offset = base_offset + orig_src.reg_offset;
2926 src_reg index = src_reg((unsigned)SURF_INDEX_VEC4_CONST_BUFFER);
2927 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2928 vec4_instruction *load;
2929
2930 if (brw->gen >= 7) {
2931 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2932 grf_offset.type = offset.type;
2933 emit_before(inst, MOV(grf_offset, offset));
2934
2935 load = new(mem_ctx) vec4_instruction(this,
2936 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2937 temp, index, src_reg(grf_offset));
2938 } else {
2939 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2940 temp, index, offset);
2941 load->base_mrf = 14;
2942 load->mlen = 1;
2943 }
2944 emit_before(inst, load);
2945 }
2946
2947 /**
2948 * Implements array access of uniforms by inserting a
2949 * PULL_CONSTANT_LOAD instruction.
2950 *
2951 * Unlike temporary GRF array access, which we don't support because
2952 * relative addressing of instruction destinations is difficult, we
2953 * could potentially do array access of uniforms that were loaded in
2954 * GRF space as push constants. In the real-world usage we've seen,
2955 * though, the arrays involved are always larger than we could load as
2956 * push constants, so we just always move all uniform array access out
2957 * to a pull constant buffer.
2958 */
2959 void
2960 vec4_visitor::move_uniform_array_access_to_pull_constants()
2961 {
2962 int pull_constant_loc[this->uniforms];
2963
2964 for (int i = 0; i < this->uniforms; i++) {
2965 pull_constant_loc[i] = -1;
2966 }
2967
2968 /* Walk through and find array access of uniforms. Put a copy of that
2969 * uniform in the pull constant buffer.
2970 *
2971 * Note that we don't move constant-indexed accesses to arrays. No
2972 * testing has been done of the performance impact of this choice.
2973 */
2974 foreach_list_safe(node, &this->instructions) {
2975 vec4_instruction *inst = (vec4_instruction *)node;
2976
2977 for (int i = 0 ; i < 3; i++) {
2978 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2979 continue;
2980
2981 int uniform = inst->src[i].reg;
2982
2983 /* If this array isn't already present in the pull constant buffer,
2984 * add it.
2985 */
2986 if (pull_constant_loc[uniform] == -1) {
2987 const float **values = &prog_data->param[uniform * 4];
2988
2989 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2990
2991 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2992 prog_data->pull_param[prog_data->nr_pull_params++]
2993 = values[j];
2994 }
2995 }
2996
2997 /* Set up the annotation tracking for new generated instructions. */
2998 base_ir = inst->ir;
2999 current_annotation = inst->annotation;
3000
3001 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3002
3003 emit_pull_constant_load(inst, temp, inst->src[i],
3004 pull_constant_loc[uniform]);
3005
3006 inst->src[i].file = temp.file;
3007 inst->src[i].reg = temp.reg;
3008 inst->src[i].reg_offset = temp.reg_offset;
3009 inst->src[i].reladdr = NULL;
3010 }
3011 }
3012
3013 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3014 * no need to track them as larger-than-vec4 objects. This will be
3015 * relied on in cutting out unused uniform vectors from push
3016 * constants.
3017 */
3018 split_uniform_registers();
3019 }
3020
3021 void
3022 vec4_visitor::resolve_ud_negate(src_reg *reg)
3023 {
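/* Apply the negate through an explicit MOV into a temporary and point
 * the caller's source at that temporary instead.
 */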
3024 if (reg->type != BRW_REGISTER_TYPE_UD ||
3025 !reg->negate)
3026 return;
3027
3028 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3029 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3030 *reg = temp;
3031 }
3032
3033 vec4_visitor::vec4_visitor(struct brw_context *brw,
3034 struct brw_vec4_compile *c,
3035 struct gl_program *prog,
3036 const struct brw_vec4_prog_key *key,
3037 struct brw_vec4_prog_data *prog_data,
3038 struct gl_shader_program *shader_prog,
3039 struct brw_shader *shader,
3040 void *mem_ctx,
3041 bool debug_flag)
3042 : debug_flag(debug_flag)
3043 {
3044 this->brw = brw;
3045 this->ctx = &brw->ctx;
3046 this->shader_prog = shader_prog;
3047 this->shader = shader;
3048
3049 this->mem_ctx = mem_ctx;
3050 this->failed = false;
3051
3052 this->base_ir = NULL;
3053 this->current_annotation = NULL;
3054 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3055
3056 this->c = c;
3057 this->prog = prog;
3058 this->key = key;
3059 this->prog_data = prog_data;
3060
3061 this->variable_ht = hash_table_ctor(0,
3062 hash_table_pointer_hash,
3063 hash_table_pointer_compare);
3064
3065 this->virtual_grf_start = NULL;
3066 this->virtual_grf_end = NULL;
3067 this->virtual_grf_sizes = NULL;
3068 this->virtual_grf_count = 0;
3069 this->virtual_grf_reg_map = NULL;
3070 this->virtual_grf_reg_count = 0;
3071 this->virtual_grf_array_size = 0;
3072 this->live_intervals_valid = false;
3073
3074 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3075
3076 this->uniforms = 0;
3077 }
3078
3079 vec4_visitor::~vec4_visitor()
3080 {
3081 hash_table_dtor(this->variable_ht);
3082 }
3083
3084
3085 void
3086 vec4_visitor::fail(const char *format, ...)
3087 {
3088 va_list va;
3089 char *msg;
3090
3091 if (failed)
3092 return;
3093
3094 failed = true;
3095
3096 va_start(va, format);
3097 msg = ralloc_vasprintf(mem_ctx, format, va);
3098 va_end(va);
3099 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3100
3101 this->fail_msg = msg;
3102
3103 if (debug_flag) {
3104 fprintf(stderr, "%s", msg);
3105 }
3106 }
3107
3108 } /* namespace brw */