i965: Remove never used RSR and RSL opcodes.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->ir = v->base_ir;
42 this->annotation = v->current_annotation;
43 }
44
45 vec4_instruction *
46 vec4_visitor::emit(vec4_instruction *inst)
47 {
48 this->instructions.push_tail(inst);
49
50 return inst;
51 }
52
53 vec4_instruction *
54 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
55 {
56 new_inst->ir = inst->ir;
57 new_inst->annotation = inst->annotation;
58
59 inst->insert_before(new_inst);
60
61 return inst;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
66 src_reg src0, src_reg src1, src_reg src2)
67 {
68 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
69 src0, src1, src2));
70 }
71
72
73 vec4_instruction *
74 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
75 {
76 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
83 }
84
85 vec4_instruction *
86 vec4_visitor::emit(enum opcode opcode)
87 {
88 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
89 }
90
91 #define ALU1(op) \
92 vec4_instruction * \
93 vec4_visitor::op(dst_reg dst, src_reg src0) \
94 { \
95 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
96 src0); \
97 }
98
99 #define ALU2(op) \
100 vec4_instruction * \
101 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
102 { \
103 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
104 src0, src1); \
105 }
106
107 #define ALU3(op) \
108 vec4_instruction * \
109 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
110 { \
111 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
112 src0, src1, src2); \
113 }
114
115 ALU1(NOT)
116 ALU1(MOV)
117 ALU1(FRC)
118 ALU1(RNDD)
119 ALU1(RNDE)
120 ALU1(RNDZ)
121 ALU1(F32TO16)
122 ALU1(F16TO32)
123 ALU2(ADD)
124 ALU2(MUL)
125 ALU2(MACH)
126 ALU2(AND)
127 ALU2(OR)
128 ALU2(XOR)
129 ALU2(DP3)
130 ALU2(DP4)
131 ALU2(DPH)
132 ALU2(SHL)
133 ALU2(SHR)
134 ALU2(ASR)
135 ALU3(LRP)
136 ALU1(BFREV)
137 ALU3(BFE)
138 ALU2(BFI1)
139 ALU3(BFI2)
140 ALU1(FBH)
141 ALU1(FBL)
142 ALU1(CBIT)
143 ALU3(MAD)
144
145 /** Gen4 predicated IF. */
146 vec4_instruction *
147 vec4_visitor::IF(uint32_t predicate)
148 {
149 vec4_instruction *inst;
150
151 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
152 inst->predicate = predicate;
153
154 return inst;
155 }
156
157 /** Gen6+ IF with embedded comparison. */
158 vec4_instruction *
159 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
160 {
161 assert(brw->gen >= 6);
162
163 vec4_instruction *inst;
164
165 resolve_ud_negate(&src0);
166 resolve_ud_negate(&src1);
167
168 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
169 src0, src1);
170 inst->conditional_mod = condition;
171
172 return inst;
173 }
174
175 /**
176 * CMP: Sets the low bit of the destination channels with the result
177 * of the comparison, while the upper bits are undefined, and updates
178 * the flag register with the packed 16 bits of the result.
179 */
180 vec4_instruction *
181 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
182 {
183 vec4_instruction *inst;
184
185 /* original gen4 does type conversion to the destination type
186 * before comparison, producing garbage results for floating
187 * point comparisons.
188 */
189 if (brw->gen == 4) {
190 dst.type = src0.type;
191 if (dst.file == HW_REG)
192 dst.fixed_hw_reg.type = dst.type;
193 }
194
195 resolve_ud_negate(&src0);
196 resolve_ud_negate(&src1);
197
198 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
199 inst->conditional_mod = condition;
200
201 return inst;
202 }
203
204 vec4_instruction *
205 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
206 {
207 vec4_instruction *inst;
208
209 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
210 dst, index);
211 inst->base_mrf = 14;
212 inst->mlen = 2;
213
214 return inst;
215 }
216
217 vec4_instruction *
218 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
219 {
220 vec4_instruction *inst;
221
222 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
223 dst, src, index);
224 inst->base_mrf = 13;
225 inst->mlen = 3;
226
227 return inst;
228 }
229
230 void
231 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
232 {
233 static enum opcode dot_opcodes[] = {
234 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
235 };
236
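/* 'elements' is expected to be 2, 3 or 4, selecting DP2, DP3 or DP4 from the
 * table above.
 */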
237 emit(dot_opcodes[elements - 2], dst, src0, src1);
238 }
239
240 src_reg
241 vec4_visitor::fix_3src_operand(src_reg src)
242 {
243 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
244 * able to use vertical stride of zero to replicate the vec4 uniform, like
245 *
246 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
247 *
248 * But you can't, since vertical stride is always four in three-source
249 * instructions. Instead, insert a MOV instruction to do the replication so
250 * that the three-source instruction can consume it.
251 */
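/* Illustrative sketch (names are placeholders, not emitted verbatim): a
 * uniform source such as u1.xyzw:f is first copied with
 *    MOV tmpN, u1.xyzw:f
 * and the three-source instruction then reads the GRF tmpN instead.
 */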
252
253 /* The MOV is only needed if the source is a uniform or immediate. */
254 if (src.file != UNIFORM && src.file != IMM)
255 return src;
256
257 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
258 expanded.type = src.type;
259 emit(MOV(expanded, src));
260 return src_reg(expanded);
261 }
262
263 src_reg
264 vec4_visitor::fix_math_operand(src_reg src)
265 {
266 /* The gen6 math instruction ignores the source modifiers --
267 * swizzle, abs, negate, and at least some parts of the register
268 * region description.
269 *
270 * Rather than trying to enumerate all these cases, *always* expand the
271 * operand to a temp GRF for gen6.
272 *
273 * For gen7, keep the operand as-is, except if immediate, which gen7 still
274 * can't use.
275 */
276
277 if (brw->gen == 7 && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 void
287 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
288 {
289 src = fix_math_operand(src);
290
291 if (dst.writemask != WRITEMASK_XYZW) {
292 /* The gen6 math instruction must be align1, so we can't do
293 * writemasks.
294 */
295 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
296
297 emit(opcode, temp_dst, src);
298
299 emit(MOV(dst, src_reg(temp_dst)));
300 } else {
301 emit(opcode, dst, src);
302 }
303 }
304
305 void
306 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
307 {
308 vec4_instruction *inst = emit(opcode, dst, src);
309 inst->base_mrf = 1;
310 inst->mlen = 1;
311 }
312
313 void
314 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
315 {
316 switch (opcode) {
317 case SHADER_OPCODE_RCP:
318 case SHADER_OPCODE_RSQ:
319 case SHADER_OPCODE_SQRT:
320 case SHADER_OPCODE_EXP2:
321 case SHADER_OPCODE_LOG2:
322 case SHADER_OPCODE_SIN:
323 case SHADER_OPCODE_COS:
324 break;
325 default:
326 assert(!"not reached: bad math opcode");
327 return;
328 }
329
330 if (brw->gen >= 6) {
331 return emit_math1_gen6(opcode, dst, src);
332 } else {
333 return emit_math1_gen4(opcode, dst, src);
334 }
335 }
336
337 void
338 vec4_visitor::emit_math2_gen6(enum opcode opcode,
339 dst_reg dst, src_reg src0, src_reg src1)
340 {
341 src0 = fix_math_operand(src0);
342 src1 = fix_math_operand(src1);
343
344 if (dst.writemask != WRITEMASK_XYZW) {
345 /* The gen6 math instruction must be align1, so we can't do
346 * writemasks.
347 */
348 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
349 temp_dst.type = dst.type;
350
351 emit(opcode, temp_dst, src0, src1);
352
353 emit(MOV(dst, src_reg(temp_dst)));
354 } else {
355 emit(opcode, dst, src0, src1);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen4(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 vec4_instruction *inst = emit(opcode, dst, src0, src1);
364 inst->base_mrf = 1;
365 inst->mlen = 2;
366 }
367
368 void
369 vec4_visitor::emit_math(enum opcode opcode,
370 dst_reg dst, src_reg src0, src_reg src1)
371 {
372 switch (opcode) {
373 case SHADER_OPCODE_POW:
374 case SHADER_OPCODE_INT_QUOTIENT:
375 case SHADER_OPCODE_INT_REMAINDER:
376 break;
377 default:
378 assert(!"not reached: unsupported binary math opcode");
379 return;
380 }
381
382 if (brw->gen >= 6) {
383 return emit_math2_gen6(opcode, dst, src0, src1);
384 } else {
385 return emit_math2_gen4(opcode, dst, src0, src1);
386 }
387 }
388
389 void
390 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
391 {
392 if (brw->gen < 7)
393 assert(!"ir_unop_pack_half_2x16 should be lowered");
394
395 assert(dst.type == BRW_REGISTER_TYPE_UD);
396 assert(src0.type == BRW_REGISTER_TYPE_F);
397
398 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
399 *
400 * Because this instruction does not have a 16-bit floating-point type,
401 * the destination data type must be Word (W).
402 *
403 * The destination must be DWord-aligned and specify a horizontal stride
404 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
405 * each destination channel and the upper word is not modified.
406 *
407 * The above restriction implies that the f32to16 instruction must use
408 * align1 mode, because only in align1 mode is it possible to specify
409 * horizontal stride. We choose here to defy the hardware docs and emit
410 * align16 instructions.
411 *
412 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
413 * instructions. I was partially successful in that the code passed all
414 * tests. However, the code was dubiously correct and fragile, and the
415 * tests were not harsh enough to probe that frailty. Not trusting the
416 * code, I chose instead to remain in align16 mode in defiance of the hw
417 * docs).
418 *
419 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
420 * simulator, emitting a f32to16 in align16 mode with UD as destination
421 * data type is safe. The behavior differs from that specified in the PRM
422 * in that the upper word of each destination channel is cleared to 0.
423 */
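/* Illustrative example: for src0.xy = (1.0, 2.0), f32to16 produces the half
 * encodings 0x3C00 and 0x4000, and the SHL/OR below combine them into
 * dst = 0x40003C00, matching GLSL's packHalf2x16().
 */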
424
425 dst_reg tmp_dst(this, glsl_type::uvec2_type);
426 src_reg tmp_src(tmp_dst);
427
428 #if 0
429 /* Verify the undocumented behavior on which the following instructions
430 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
431 * then the result of the bit-or instruction below will be incorrect.
432 *
433 * You should inspect the disasm output in order to verify that the MOV is
434 * not optimized away.
435 */
436 emit(MOV(tmp_dst, src_reg(0x12345678u)));
437 #endif
438
439 /* Give tmp the form below, where "." means untouched.
440 *
441 * w z y x w z y x
442 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
443 *
444 * The upper word of each write-channel must be 0 for the
445 * following bit-shift and bit-or instructions to work. Note that this
446 * relies on the undocumented hardware behavior mentioned above.
447 */
448 tmp_dst.writemask = WRITEMASK_XY;
449 emit(F32TO16(tmp_dst, src0));
450
451 /* Give the write-channels of dst the form:
452 * 0xhhhh0000
453 */
454 tmp_src.swizzle = SWIZZLE_Y;
455 emit(SHL(dst, tmp_src, src_reg(16u)));
456
457 /* Finally, give the write-channels of dst the form of packHalf2x16's
458 * output:
459 * 0xhhhhllll
460 */
461 tmp_src.swizzle = SWIZZLE_X;
462 emit(OR(dst, src_reg(dst), tmp_src));
463 }
464
465 void
466 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
467 {
468 if (brw->gen < 7)
469 assert(!"ir_unop_unpack_half_2x16 should be lowered");
470
471 assert(dst.type == BRW_REGISTER_TYPE_F);
472 assert(src0.type == BRW_REGISTER_TYPE_UD);
473
474 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
475 *
476 * Because this instruction does not have a 16-bit floating-point type,
477 * the source data type must be Word (W). The destination type must be
478 * F (Float).
479 *
480 * To use W as the source data type, we must adjust horizontal strides,
481 * which is only possible in align1 mode. All my [chadv] attempts at
482 * emitting align1 instructions for unpackHalf2x16 failed to pass the
483 * Piglit tests, so I gave up.
484 *
485 * I've verified that, on gen7 hardware and the simulator, it is safe to
486 * emit f16to32 in align16 mode with UD as source data type.
487 */
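/* Illustrative example: src0.x = 0x40003C00 is split into 0x3C00 and 0x4000
 * below, which f16to32 converts to dst.xy = (1.0, 2.0), matching GLSL's
 * unpackHalf2x16().
 */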
488
489 dst_reg tmp_dst(this, glsl_type::uvec2_type);
490 src_reg tmp_src(tmp_dst);
491
492 tmp_dst.writemask = WRITEMASK_X;
493 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
494
495 tmp_dst.writemask = WRITEMASK_Y;
496 emit(SHR(tmp_dst, src0, src_reg(16u)));
497
498 dst.writemask = WRITEMASK_XY;
499 emit(F16TO32(dst, tmp_src));
500 }
501
502 void
503 vec4_visitor::visit_instructions(const exec_list *list)
504 {
505 foreach_list(node, list) {
506 ir_instruction *ir = (ir_instruction *)node;
507
508 base_ir = ir;
509 ir->accept(this);
510 }
511 }
512
513
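/* Returns the number of vec4 slots a value of the given type occupies in
 * this backend: float, vec3 and vec4 each take one slot, mat4 takes four,
 * float[10] takes ten, and structs take the sum of their fields.
 */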
514 static int
515 type_size(const struct glsl_type *type)
516 {
517 unsigned int i;
518 int size;
519
520 switch (type->base_type) {
521 case GLSL_TYPE_UINT:
522 case GLSL_TYPE_INT:
523 case GLSL_TYPE_FLOAT:
524 case GLSL_TYPE_BOOL:
525 if (type->is_matrix()) {
526 return type->matrix_columns;
527 } else {
528 /* Regardless of the size of the vector, it gets a vec4. This is bad
529 * packing for things like floats, but otherwise arrays become a
530 * mess. Hopefully a later pass over the code can pack scalars
531 * down if appropriate.
532 */
533 return 1;
534 }
535 case GLSL_TYPE_ARRAY:
536 assert(type->length > 0);
537 return type_size(type->fields.array) * type->length;
538 case GLSL_TYPE_STRUCT:
539 size = 0;
540 for (i = 0; i < type->length; i++) {
541 size += type_size(type->fields.structure[i].type);
542 }
543 return size;
544 case GLSL_TYPE_SAMPLER:
545 /* Samplers take up one slot in UNIFORMS[], but they're baked in
546 * at link time.
547 */
548 return 1;
549 case GLSL_TYPE_VOID:
550 case GLSL_TYPE_ERROR:
551 case GLSL_TYPE_INTERFACE:
552 assert(0);
553 break;
554 }
555
556 return 0;
557 }
558
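/* Allocates a new virtual GRF spanning 'size' vec4 slots, growing the
 * bookkeeping arrays as needed, and returns its index.
 */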
559 int
560 vec4_visitor::virtual_grf_alloc(int size)
561 {
562 if (virtual_grf_array_size <= virtual_grf_count) {
563 if (virtual_grf_array_size == 0)
564 virtual_grf_array_size = 16;
565 else
566 virtual_grf_array_size *= 2;
567 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
568 virtual_grf_array_size);
569 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
570 virtual_grf_array_size);
571 }
572 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
573 virtual_grf_reg_count += size;
574 virtual_grf_sizes[virtual_grf_count] = size;
575 return virtual_grf_count++;
576 }
577
578 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
579 {
580 init();
581
582 this->file = GRF;
583 this->reg = v->virtual_grf_alloc(type_size(type));
584
585 if (type->is_array() || type->is_record()) {
586 this->swizzle = BRW_SWIZZLE_NOOP;
587 } else {
588 this->swizzle = swizzle_for_size(type->vector_elements);
589 }
590
591 this->type = brw_type_for_base_type(type);
592 }
593
594 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
595 {
596 init();
597
598 this->file = GRF;
599 this->reg = v->virtual_grf_alloc(type_size(type));
600
601 if (type->is_array() || type->is_record()) {
602 this->writemask = WRITEMASK_XYZW;
603 } else {
604 this->writemask = (1 << type->vector_elements) - 1;
605 }
606
607 this->type = brw_type_for_base_type(type);
608 }
609
610 /* Our support for uniforms is piggy-backed on the struct
611 * gl_fragment_program, because that's where the values actually
612 * get stored, rather than in some global gl_shader_program uniform
613 * store.
614 */
615 void
616 vec4_visitor::setup_uniform_values(ir_variable *ir)
617 {
618 int namelen = strlen(ir->name);
619
620 /* The data for our (non-builtin) uniforms is stored in a series of
621 * gl_uniform_driver_storage structs for each subcomponent that
622 * glGetUniformLocation() could name. We know it's been set up in the same
623 * order we'd walk the type, so walk the list of storage and find anything
624 * with our name, or the prefix of a component that starts with our name.
625 */
626 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
627 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
628
629 if (strncmp(ir->name, storage->name, namelen) != 0 ||
630 (storage->name[namelen] != 0 &&
631 storage->name[namelen] != '.' &&
632 storage->name[namelen] != '[')) {
633 continue;
634 }
635
636 gl_constant_value *components = storage->storage;
637 unsigned vector_count = (MAX2(storage->array_elements, 1) *
638 storage->type->matrix_columns);
639
640 for (unsigned s = 0; s < vector_count; s++) {
641 uniform_vector_size[uniforms] = storage->type->vector_elements;
642
643 int i;
644 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
645 prog_data->param[uniforms * 4 + i] = &components->f;
646 components++;
647 }
648 for (; i < 4; i++) {
649 static float zero = 0;
650 prog_data->param[uniforms * 4 + i] = &zero;
651 }
652
653 uniforms++;
654 }
655 }
656 }
657
658 void
659 vec4_visitor::setup_uniform_clipplane_values()
660 {
661 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
662
663 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
664 this->uniform_vector_size[this->uniforms] = 4;
665 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
666 this->userplane[i].type = BRW_REGISTER_TYPE_F;
667 for (int j = 0; j < 4; ++j) {
668 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
669 }
670 ++this->uniforms;
671 }
672 }
673
674 /* Our support for builtin uniforms is even scarier than non-builtin.
675 * It sits on top of the PROG_STATE_VAR parameters that are
676 * automatically updated from GL context state.
677 */
678 void
679 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
680 {
681 const ir_state_slot *const slots = ir->state_slots;
682 assert(ir->state_slots != NULL);
683
684 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
685 /* This state reference has already been setup by ir_to_mesa,
686 * but we'll get the same index back here. We can reference
687 * ParameterValues directly, since unlike brw_fs.cpp, we never
688 * add new state references during compile.
689 */
690 int index = _mesa_add_state_reference(this->prog->Parameters,
691 (gl_state_index *)slots[i].tokens);
692 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
693
694 this->uniform_vector_size[this->uniforms] = 0;
695 /* Add each of the unique swizzled channels of the element.
696 * This will end up matching the size of the glsl_type of this field.
697 */
698 int last_swiz = -1;
699 for (unsigned int j = 0; j < 4; j++) {
700 int swiz = GET_SWZ(slots[i].swizzle, j);
701 last_swiz = swiz;
702
703 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
704 if (swiz <= last_swiz)
705 this->uniform_vector_size[this->uniforms]++;
706 }
707 this->uniforms++;
708 }
709 }
710
711 dst_reg *
712 vec4_visitor::variable_storage(ir_variable *var)
713 {
714 return (dst_reg *)hash_table_find(this->variable_ht, var);
715 }
716
717 void
718 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
719 {
720 ir_expression *expr = ir->as_expression();
721
722 *predicate = BRW_PREDICATE_NORMAL;
723
724 if (expr) {
725 src_reg op[2];
726 vec4_instruction *inst;
727
728 assert(expr->get_num_operands() <= 2);
729 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
730 expr->operands[i]->accept(this);
731 op[i] = this->result;
732
733 resolve_ud_negate(&op[i]);
734 }
735
736 switch (expr->operation) {
737 case ir_unop_logic_not:
738 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
739 inst->conditional_mod = BRW_CONDITIONAL_Z;
740 break;
741
742 case ir_binop_logic_xor:
743 inst = emit(XOR(dst_null_d(), op[0], op[1]));
744 inst->conditional_mod = BRW_CONDITIONAL_NZ;
745 break;
746
747 case ir_binop_logic_or:
748 inst = emit(OR(dst_null_d(), op[0], op[1]));
749 inst->conditional_mod = BRW_CONDITIONAL_NZ;
750 break;
751
752 case ir_binop_logic_and:
753 inst = emit(AND(dst_null_d(), op[0], op[1]));
754 inst->conditional_mod = BRW_CONDITIONAL_NZ;
755 break;
756
757 case ir_unop_f2b:
758 if (brw->gen >= 6) {
759 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
760 } else {
761 inst = emit(MOV(dst_null_f(), op[0]));
762 inst->conditional_mod = BRW_CONDITIONAL_NZ;
763 }
764 break;
765
766 case ir_unop_i2b:
767 if (brw->gen >= 6) {
768 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
769 } else {
770 inst = emit(MOV(dst_null_d(), op[0]));
771 inst->conditional_mod = BRW_CONDITIONAL_NZ;
772 }
773 break;
774
775 case ir_binop_all_equal:
776 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
777 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
778 break;
779
780 case ir_binop_any_nequal:
781 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
782 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
783 break;
784
785 case ir_unop_any:
786 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
787 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
788 break;
789
790 case ir_binop_greater:
791 case ir_binop_gequal:
792 case ir_binop_less:
793 case ir_binop_lequal:
794 case ir_binop_equal:
795 case ir_binop_nequal:
796 emit(CMP(dst_null_d(), op[0], op[1],
797 brw_conditional_for_comparison(expr->operation)));
798 break;
799
800 default:
801 assert(!"not reached");
802 break;
803 }
804 return;
805 }
806
807 ir->accept(this);
808
809 resolve_ud_negate(&this->result);
810
811 if (brw->gen >= 6) {
812 vec4_instruction *inst = emit(AND(dst_null_d(),
813 this->result, src_reg(1)));
814 inst->conditional_mod = BRW_CONDITIONAL_NZ;
815 } else {
816 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 }
820
821 /**
822 * Emit a gen6 IF statement with the comparison folded into the IF
823 * instruction.
824 */
825 void
826 vec4_visitor::emit_if_gen6(ir_if *ir)
827 {
828 ir_expression *expr = ir->condition->as_expression();
829
830 if (expr) {
831 src_reg op[2];
832 dst_reg temp;
833
834 assert(expr->get_num_operands() <= 2);
835 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
836 expr->operands[i]->accept(this);
837 op[i] = this->result;
838 }
839
840 switch (expr->operation) {
841 case ir_unop_logic_not:
842 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
843 return;
844
845 case ir_binop_logic_xor:
846 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
847 return;
848
849 case ir_binop_logic_or:
850 temp = dst_reg(this, glsl_type::bool_type);
851 emit(OR(temp, op[0], op[1]));
852 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
853 return;
854
855 case ir_binop_logic_and:
856 temp = dst_reg(this, glsl_type::bool_type);
857 emit(AND(temp, op[0], op[1]));
858 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
859 return;
860
861 case ir_unop_f2b:
862 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
863 return;
864
865 case ir_unop_i2b:
866 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
867 return;
868
869 case ir_binop_greater:
870 case ir_binop_gequal:
871 case ir_binop_less:
872 case ir_binop_lequal:
873 case ir_binop_equal:
874 case ir_binop_nequal:
875 emit(IF(op[0], op[1],
876 brw_conditional_for_comparison(expr->operation)));
877 return;
878
879 case ir_binop_all_equal:
880 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
881 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
882 return;
883
884 case ir_binop_any_nequal:
885 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
886 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
887 return;
888
889 case ir_unop_any:
890 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
891 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
892 return;
893
894 default:
895 assert(!"not reached");
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
897 return;
898 }
899 return;
900 }
901
902 ir->condition->accept(this);
903
904 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
905 }
906
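/* Returns a copy of 'r' with only the channels in 'mask' enabled, e.g.
 * with_writemask(dst, WRITEMASK_XY).
 */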
907 dst_reg
908 with_writemask(dst_reg const & r, int mask)
909 {
910 dst_reg result = r;
911 result.writemask = mask;
912 return result;
913 }
914
915
916 void
917 vec4_visitor::visit(ir_variable *ir)
918 {
919 dst_reg *reg = NULL;
920
921 if (variable_storage(ir))
922 return;
923
924 switch (ir->mode) {
925 case ir_var_shader_in:
926 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
927 break;
928
929 case ir_var_shader_out:
930 reg = new(mem_ctx) dst_reg(this, ir->type);
931
932 for (int i = 0; i < type_size(ir->type); i++) {
933 output_reg[ir->location + i] = *reg;
934 output_reg[ir->location + i].reg_offset = i;
935 output_reg[ir->location + i].type =
936 brw_type_for_base_type(ir->type->get_scalar_type());
937 output_reg_annotation[ir->location + i] = ir->name;
938 }
939 break;
940
941 case ir_var_auto:
942 case ir_var_temporary:
943 reg = new(mem_ctx) dst_reg(this, ir->type);
944 break;
945
946 case ir_var_uniform:
947 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
948
949 /* Thanks to the lower_ubo_reference pass, we will see only
950 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
951 * variables, so no need for them to be in variable_ht.
952 */
953 if (ir->is_in_uniform_block())
954 return;
955
956 /* Track how big the whole uniform variable is, in case we need to put a
957 * copy of its data into pull constants for array access.
958 */
959 this->uniform_size[this->uniforms] = type_size(ir->type);
960
961 if (!strncmp(ir->name, "gl_", 3)) {
962 setup_builtin_uniform_values(ir);
963 } else {
964 setup_uniform_values(ir);
965 }
966 break;
967
968 case ir_var_system_value:
969 reg = make_reg_for_system_value(ir);
970 break;
971
972 default:
973 assert(!"not reached");
974 }
975
976 reg->type = brw_type_for_base_type(ir->type);
977 hash_table_insert(this->variable_ht, reg, ir);
978 }
979
980 void
981 vec4_visitor::visit(ir_loop *ir)
982 {
983 dst_reg counter;
984
985 /* We don't want debugging output to print the whole body of the
986 * loop as the annotation.
987 */
988 this->base_ir = NULL;
989
990 if (ir->counter != NULL) {
991 this->base_ir = ir->counter;
992 ir->counter->accept(this);
993 counter = *(variable_storage(ir->counter));
994
995 if (ir->from != NULL) {
996 this->base_ir = ir->from;
997 ir->from->accept(this);
998
999 emit(MOV(counter, this->result));
1000 }
1001 }
1002
1003 emit(BRW_OPCODE_DO);
1004
1005 if (ir->to) {
1006 this->base_ir = ir->to;
1007 ir->to->accept(this);
1008
1009 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1010 brw_conditional_for_comparison(ir->cmp)));
1011
1012 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1013 inst->predicate = BRW_PREDICATE_NORMAL;
1014 }
1015
1016 visit_instructions(&ir->body_instructions);
1017
1018
1019 if (ir->increment) {
1020 this->base_ir = ir->increment;
1021 ir->increment->accept(this);
1022 emit(ADD(counter, src_reg(counter), this->result));
1023 }
1024
1025 emit(BRW_OPCODE_WHILE);
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_loop_jump *ir)
1030 {
1031 switch (ir->mode) {
1032 case ir_loop_jump::jump_break:
1033 emit(BRW_OPCODE_BREAK);
1034 break;
1035 case ir_loop_jump::jump_continue:
1036 emit(BRW_OPCODE_CONTINUE);
1037 break;
1038 }
1039 }
1040
1041
1042 void
1043 vec4_visitor::visit(ir_function_signature *ir)
1044 {
1045 assert(0);
1046 (void)ir;
1047 }
1048
1049 void
1050 vec4_visitor::visit(ir_function *ir)
1051 {
1052 /* Ignore function bodies other than main() -- we shouldn't see calls to
1053 * them since they should all be inlined.
1054 */
1055 if (strcmp(ir->name, "main") == 0) {
1056 const ir_function_signature *sig;
1057 exec_list empty;
1058
1059 sig = ir->matching_signature(&empty);
1060
1061 assert(sig);
1062
1063 visit_instructions(&sig->body);
1064 }
1065 }
1066
1067 bool
1068 vec4_visitor::try_emit_sat(ir_expression *ir)
1069 {
1070 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1071 if (!sat_src)
1072 return false;
1073
1074 sat_src->accept(this);
1075 src_reg src = this->result;
1076
1077 this->result = src_reg(this, ir->type);
1078 vec4_instruction *inst;
1079 inst = emit(MOV(dst_reg(this->result), src));
1080 inst->saturate = true;
1081
1082 return true;
1083 }
1084
1085 bool
1086 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1087 {
1088 /* 3-src instructions were introduced in gen6. */
1089 if (brw->gen < 6)
1090 return false;
1091
1092 /* MAD can only handle floating-point data. */
1093 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1094 return false;
1095
1096 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1097 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1098
1099 if (!mul || mul->operation != ir_binop_mul)
1100 return false;
1101
1102 nonmul->accept(this);
1103 src_reg src0 = fix_3src_operand(this->result);
1104
1105 mul->operands[0]->accept(this);
1106 src_reg src1 = fix_3src_operand(this->result);
1107
1108 mul->operands[1]->accept(this);
1109 src_reg src2 = fix_3src_operand(this->result);
1110
1111 this->result = src_reg(this, ir->type);
1112 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1113
1114 return true;
1115 }
1116
1117 void
1118 vec4_visitor::emit_bool_comparison(unsigned int op,
1119 dst_reg dst, src_reg src0, src_reg src1)
1120 {
1121 /* original gen4 does destination conversion before comparison. */
1122 if (brw->gen < 5)
1123 dst.type = src0.type;
1124
1125 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1126
1127 dst.type = BRW_REGISTER_TYPE_D;
1128 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1129 }
1130
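/* Emits a min/max: a SEL with the given conditional mod on gen6+, or a CMP
 * followed by a predicated SEL on older hardware. Callers pass
 * BRW_CONDITIONAL_L for min and BRW_CONDITIONAL_G for max.
 */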
1131 void
1132 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1133 src_reg src0, src_reg src1)
1134 {
1135 vec4_instruction *inst;
1136
1137 if (brw->gen >= 6) {
1138 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139 inst->conditional_mod = conditionalmod;
1140 } else {
1141 emit(CMP(dst, src0, src1, conditionalmod));
1142
1143 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1144 inst->predicate = BRW_PREDICATE_NORMAL;
1145 }
1146 }
1147
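/* Returns true if the rvalue is an int/uint constant whose value fits in
 * 16 bits, so the single-MUL path can be used for integer multiplication.
 */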
1148 static bool
1149 is_16bit_constant(ir_rvalue *rvalue)
1150 {
1151 ir_constant *constant = rvalue->as_constant();
1152 if (!constant)
1153 return false;
1154
1155 if (constant->type != glsl_type::int_type &&
1156 constant->type != glsl_type::uint_type)
1157 return false;
1158
1159 return constant->value.u[0] < (1 << 16);
1160 }
1161
1162 void
1163 vec4_visitor::visit(ir_expression *ir)
1164 {
1165 unsigned int operand;
1166 src_reg op[Elements(ir->operands)];
1167 src_reg result_src;
1168 dst_reg result_dst;
1169 vec4_instruction *inst;
1170
1171 if (try_emit_sat(ir))
1172 return;
1173
1174 if (ir->operation == ir_binop_add) {
1175 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1176 return;
1177 }
1178
1179 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1180 this->result.file = BAD_FILE;
1181 ir->operands[operand]->accept(this);
1182 if (this->result.file == BAD_FILE) {
1183 printf("Failed to get tree for expression operand:\n");
1184 ir->operands[operand]->print();
1185 exit(1);
1186 }
1187 op[operand] = this->result;
1188
1189 /* Matrix expression operands should have been broken down to vector
1190 * operations already.
1191 */
1192 assert(!ir->operands[operand]->type->is_matrix());
1193 }
1194
1195 int vector_elements = ir->operands[0]->type->vector_elements;
1196 if (ir->operands[1]) {
1197 vector_elements = MAX2(vector_elements,
1198 ir->operands[1]->type->vector_elements);
1199 }
1200
1201 this->result.file = BAD_FILE;
1202
1203 /* Storage for our result. Ideally for an assignment we'd be using
1204 * the actual storage for the result here, instead.
1205 */
1206 result_src = src_reg(this, ir->type);
1207 /* convenience for the emit functions below. */
1208 result_dst = dst_reg(result_src);
1209 /* If nothing special happens, this is the result. */
1210 this->result = result_src;
1211 /* Limit writes to the channels that will be used by result_src later.
1212 * This does limit this temp's use as a temporary for multi-instruction
1213 * sequences.
1214 */
1215 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1216
1217 switch (ir->operation) {
1218 case ir_unop_logic_not:
1219 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1220 * one's complement of the whole register, not just bit 0.
1221 */
1222 emit(XOR(result_dst, op[0], src_reg(1)));
1223 break;
1224 case ir_unop_neg:
1225 op[0].negate = !op[0].negate;
1226 emit(MOV(result_dst, op[0]));
1227 break;
1228 case ir_unop_abs:
1229 op[0].abs = true;
1230 op[0].negate = false;
1231 emit(MOV(result_dst, op[0]));
1232 break;
1233
1234 case ir_unop_sign:
1235 emit(MOV(result_dst, src_reg(0.0f)));
1236
1237 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1238 inst = emit(MOV(result_dst, src_reg(1.0f)));
1239 inst->predicate = BRW_PREDICATE_NORMAL;
1240
1241 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1242 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1243 inst->predicate = BRW_PREDICATE_NORMAL;
1244
1245 break;
1246
1247 case ir_unop_rcp:
1248 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1249 break;
1250
1251 case ir_unop_exp2:
1252 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1253 break;
1254 case ir_unop_log2:
1255 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1256 break;
1257 case ir_unop_exp:
1258 case ir_unop_log:
1259 assert(!"not reached: should be handled by ir_explog_to_explog2");
1260 break;
1261 case ir_unop_sin:
1262 case ir_unop_sin_reduced:
1263 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1264 break;
1265 case ir_unop_cos:
1266 case ir_unop_cos_reduced:
1267 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1268 break;
1269
1270 case ir_unop_dFdx:
1271 case ir_unop_dFdy:
1272 assert(!"derivatives not valid in vertex shader");
1273 break;
1274
1275 case ir_unop_bitfield_reverse:
1276 emit(BFREV(result_dst, op[0]));
1277 break;
1278 case ir_unop_bit_count:
1279 emit(CBIT(result_dst, op[0]));
1280 break;
1281 case ir_unop_find_msb: {
1282 src_reg temp = src_reg(this, glsl_type::uint_type);
1283
1284 inst = emit(FBH(dst_reg(temp), op[0]));
1285 inst->dst.writemask = WRITEMASK_XYZW;
1286
1287 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1288 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1289 * subtract the result from 31 to convert the MSB count into an LSB count.
1290 */
1291
1292 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1293 temp.swizzle = BRW_SWIZZLE_NOOP;
1294 emit(MOV(result_dst, temp));
1295
1296 src_reg src_tmp = src_reg(result_dst);
1297 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1298
1299 src_tmp.negate = true;
1300 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1301 inst->predicate = BRW_PREDICATE_NORMAL;
1302 break;
1303 }
1304 case ir_unop_find_lsb:
1305 emit(FBL(result_dst, op[0]));
1306 break;
1307
1308 case ir_unop_noise:
1309 assert(!"not reached: should be handled by lower_noise");
1310 break;
1311
1312 case ir_binop_add:
1313 emit(ADD(result_dst, op[0], op[1]));
1314 break;
1315 case ir_binop_sub:
1316 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1317 break;
1318
1319 case ir_binop_mul:
1320 if (ir->type->is_integer()) {
1321 /* For integer multiplication, the MUL uses the low 16 bits of one of
1322 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1323 * accumulates the contribution of the upper 16 bits of that
1324 * operand. If we can determine that one of the args fits in the low
1325 * 16 bits, though, we can just emit a single MUL.
1326 */
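/* The single-MUL cases below put the 16-bit constant in src0 for gen < 7
 * and in src1 for gen7+, matching which operand the hardware reads as
 * 16 bits.
 */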
1327 if (is_16bit_constant(ir->operands[0])) {
1328 if (brw->gen < 7)
1329 emit(MUL(result_dst, op[0], op[1]));
1330 else
1331 emit(MUL(result_dst, op[1], op[0]));
1332 } else if (is_16bit_constant(ir->operands[1])) {
1333 if (brw->gen < 7)
1334 emit(MUL(result_dst, op[1], op[0]));
1335 else
1336 emit(MUL(result_dst, op[0], op[1]));
1337 } else {
1338 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1339
1340 emit(MUL(acc, op[0], op[1]));
1341 emit(MACH(dst_null_d(), op[0], op[1]));
1342 emit(MOV(result_dst, src_reg(acc)));
1343 }
1344 } else {
1345 emit(MUL(result_dst, op[0], op[1]));
1346 }
1347 break;
1348 case ir_binop_div:
1349 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1350 assert(ir->type->is_integer());
1351 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1352 break;
1353 case ir_binop_mod:
1354 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1355 assert(ir->type->is_integer());
1356 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1357 break;
1358
1359 case ir_binop_less:
1360 case ir_binop_greater:
1361 case ir_binop_lequal:
1362 case ir_binop_gequal:
1363 case ir_binop_equal:
1364 case ir_binop_nequal: {
1365 emit(CMP(result_dst, op[0], op[1],
1366 brw_conditional_for_comparison(ir->operation)));
1367 emit(AND(result_dst, result_src, src_reg(0x1)));
1368 break;
1369 }
1370
1371 case ir_binop_all_equal:
1372 /* "==" operator producing a scalar boolean. */
1373 if (ir->operands[0]->type->is_vector() ||
1374 ir->operands[1]->type->is_vector()) {
1375 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1376 emit(MOV(result_dst, src_reg(0)));
1377 inst = emit(MOV(result_dst, src_reg(1)));
1378 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1379 } else {
1380 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1381 emit(AND(result_dst, result_src, src_reg(0x1)));
1382 }
1383 break;
1384 case ir_binop_any_nequal:
1385 /* "!=" operator producing a scalar boolean. */
1386 if (ir->operands[0]->type->is_vector() ||
1387 ir->operands[1]->type->is_vector()) {
1388 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1389
1390 emit(MOV(result_dst, src_reg(0)));
1391 inst = emit(MOV(result_dst, src_reg(1)));
1392 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1393 } else {
1394 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1395 emit(AND(result_dst, result_src, src_reg(0x1)));
1396 }
1397 break;
1398
1399 case ir_unop_any:
1400 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1401 emit(MOV(result_dst, src_reg(0)));
1402
1403 inst = emit(MOV(result_dst, src_reg(1)));
1404 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1405 break;
1406
1407 case ir_binop_logic_xor:
1408 emit(XOR(result_dst, op[0], op[1]));
1409 break;
1410
1411 case ir_binop_logic_or:
1412 emit(OR(result_dst, op[0], op[1]));
1413 break;
1414
1415 case ir_binop_logic_and:
1416 emit(AND(result_dst, op[0], op[1]));
1417 break;
1418
1419 case ir_binop_dot:
1420 assert(ir->operands[0]->type->is_vector());
1421 assert(ir->operands[0]->type == ir->operands[1]->type);
1422 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1423 break;
1424
1425 case ir_unop_sqrt:
1426 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1427 break;
1428 case ir_unop_rsq:
1429 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1430 break;
1431
1432 case ir_unop_bitcast_i2f:
1433 case ir_unop_bitcast_u2f:
1434 this->result = op[0];
1435 this->result.type = BRW_REGISTER_TYPE_F;
1436 break;
1437
1438 case ir_unop_bitcast_f2i:
1439 this->result = op[0];
1440 this->result.type = BRW_REGISTER_TYPE_D;
1441 break;
1442
1443 case ir_unop_bitcast_f2u:
1444 this->result = op[0];
1445 this->result.type = BRW_REGISTER_TYPE_UD;
1446 break;
1447
1448 case ir_unop_i2f:
1449 case ir_unop_i2u:
1450 case ir_unop_u2i:
1451 case ir_unop_u2f:
1452 case ir_unop_b2f:
1453 case ir_unop_b2i:
1454 case ir_unop_f2i:
1455 case ir_unop_f2u:
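/* These conversions fall out of a plain MOV, since the source and
 * destination registers already carry different types.
 */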
1456 emit(MOV(result_dst, op[0]));
1457 break;
1458 case ir_unop_f2b:
1459 case ir_unop_i2b: {
1460 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1461 emit(AND(result_dst, result_src, src_reg(1)));
1462 break;
1463 }
1464
1465 case ir_unop_trunc:
1466 emit(RNDZ(result_dst, op[0]));
1467 break;
1468 case ir_unop_ceil:
1469 op[0].negate = !op[0].negate;
1470 inst = emit(RNDD(result_dst, op[0]));
1471 this->result.negate = true;
1472 break;
1473 case ir_unop_floor:
1474 inst = emit(RNDD(result_dst, op[0]));
1475 break;
1476 case ir_unop_fract:
1477 inst = emit(FRC(result_dst, op[0]));
1478 break;
1479 case ir_unop_round_even:
1480 emit(RNDE(result_dst, op[0]));
1481 break;
1482
1483 case ir_binop_min:
1484 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1485 break;
1486 case ir_binop_max:
1487 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1488 break;
1489
1490 case ir_binop_pow:
1491 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1492 break;
1493
1494 case ir_unop_bit_not:
1495 inst = emit(NOT(result_dst, op[0]));
1496 break;
1497 case ir_binop_bit_and:
1498 inst = emit(AND(result_dst, op[0], op[1]));
1499 break;
1500 case ir_binop_bit_xor:
1501 inst = emit(XOR(result_dst, op[0], op[1]));
1502 break;
1503 case ir_binop_bit_or:
1504 inst = emit(OR(result_dst, op[0], op[1]));
1505 break;
1506
1507 case ir_binop_lshift:
1508 inst = emit(SHL(result_dst, op[0], op[1]));
1509 break;
1510
1511 case ir_binop_rshift:
1512 if (ir->type->base_type == GLSL_TYPE_INT)
1513 inst = emit(ASR(result_dst, op[0], op[1]));
1514 else
1515 inst = emit(SHR(result_dst, op[0], op[1]));
1516 break;
1517
1518 case ir_binop_bfm:
1519 emit(BFI1(result_dst, op[0], op[1]));
1520 break;
1521
1522 case ir_binop_ubo_load: {
1523 ir_constant *uniform_block = ir->operands[0]->as_constant();
1524 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1525 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1526 src_reg offset = op[1];
1527
1528 /* Now, load the vector from that offset. */
1529 assert(ir->type->is_vector() || ir->type->is_scalar());
1530
1531 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1532 packed_consts.type = result.type;
1533 src_reg surf_index =
1534 src_reg(SURF_INDEX_VEC4_UBO(uniform_block->value.u[0]));
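/* The pull constant load fetches whole 16-byte (vec4) blocks, so the byte
 * offset is converted to a block index: divided by 16 when constant,
 * shifted right by 4 when dynamic.
 */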
1535 if (const_offset_ir) {
1536 offset = src_reg(const_offset / 16);
1537 } else {
1538 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1539 }
1540
1541 vec4_instruction *pull =
1542 emit(new(mem_ctx) vec4_instruction(this,
1543 VS_OPCODE_PULL_CONSTANT_LOAD,
1544 dst_reg(packed_consts),
1545 surf_index,
1546 offset));
1547 pull->base_mrf = 14;
1548 pull->mlen = 1;
1549
1550 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1551 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1552 const_offset % 16 / 4,
1553 const_offset % 16 / 4,
1554 const_offset % 16 / 4);
1555
1556 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1557 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1558 emit(CMP(result_dst, packed_consts, src_reg(0u),
1559 BRW_CONDITIONAL_NZ));
1560 emit(AND(result_dst, result, src_reg(0x1)));
1561 } else {
1562 emit(MOV(result_dst, packed_consts));
1563 }
1564 break;
1565 }
1566
1567 case ir_binop_vector_extract:
1568 assert(!"should have been lowered by vec_index_to_cond_assign");
1569 break;
1570
1571 case ir_triop_fma:
1572 op[0] = fix_3src_operand(op[0]);
1573 op[1] = fix_3src_operand(op[1]);
1574 op[2] = fix_3src_operand(op[2]);
1575 /* Note that the instruction's argument order is reversed from GLSL
1576 * and the IR.
1577 */
1578 emit(MAD(result_dst, op[2], op[1], op[0]));
1579 break;
1580
1581 case ir_triop_lrp:
1582 op[0] = fix_3src_operand(op[0]);
1583 op[1] = fix_3src_operand(op[1]);
1584 op[2] = fix_3src_operand(op[2]);
1585 /* Note that the instruction's argument order is reversed from GLSL
1586 * and the IR.
1587 */
1588 emit(LRP(result_dst, op[2], op[1], op[0]));
1589 break;
1590
1591 case ir_triop_bfi:
1592 op[0] = fix_3src_operand(op[0]);
1593 op[1] = fix_3src_operand(op[1]);
1594 op[2] = fix_3src_operand(op[2]);
1595 emit(BFI2(result_dst, op[0], op[1], op[2]));
1596 break;
1597
1598 case ir_triop_bitfield_extract:
1599 op[0] = fix_3src_operand(op[0]);
1600 op[1] = fix_3src_operand(op[1]);
1601 op[2] = fix_3src_operand(op[2]);
1602 /* Note that the instruction's argument order is reversed from GLSL
1603 * and the IR.
1604 */
1605 emit(BFE(result_dst, op[2], op[1], op[0]));
1606 break;
1607
1608 case ir_triop_vector_insert:
1609 assert(!"should have been lowered by lower_vector_insert");
1610 break;
1611
1612 case ir_quadop_bitfield_insert:
1613 assert(!"not reached: should be handled by "
1614 "bitfield_insert_to_bfm_bfi\n");
1615 break;
1616
1617 case ir_quadop_vector:
1618 assert(!"not reached: should be handled by lower_quadop_vector");
1619 break;
1620
1621 case ir_unop_pack_half_2x16:
1622 emit_pack_half_2x16(result_dst, op[0]);
1623 break;
1624 case ir_unop_unpack_half_2x16:
1625 emit_unpack_half_2x16(result_dst, op[0]);
1626 break;
1627 case ir_unop_pack_snorm_2x16:
1628 case ir_unop_pack_snorm_4x8:
1629 case ir_unop_pack_unorm_2x16:
1630 case ir_unop_pack_unorm_4x8:
1631 case ir_unop_unpack_snorm_2x16:
1632 case ir_unop_unpack_snorm_4x8:
1633 case ir_unop_unpack_unorm_2x16:
1634 case ir_unop_unpack_unorm_4x8:
1635 assert(!"not reached: should be handled by lower_packing_builtins");
1636 break;
1637 case ir_unop_unpack_half_2x16_split_x:
1638 case ir_unop_unpack_half_2x16_split_y:
1639 case ir_binop_pack_half_2x16_split:
1640 assert(!"not reached: should not occur in vertex shader");
1641 break;
1642 }
1643 }
1644
1645
1646 void
1647 vec4_visitor::visit(ir_swizzle *ir)
1648 {
1649 src_reg src;
1650 int i = 0;
1651 int swizzle[4];
1652
1653 /* Note that this is only swizzles in expressions, not those on the left
1654 * hand side of an assignment, which do write masking. See ir_assignment
1655 * for that.
1656 */
1657
1658 ir->val->accept(this);
1659 src = this->result;
1660 assert(src.file != BAD_FILE);
1661
1662 for (i = 0; i < ir->type->vector_elements; i++) {
1663 switch (i) {
1664 case 0:
1665 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1666 break;
1667 case 1:
1668 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1669 break;
1670 case 2:
1671 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1672 break;
1673 case 3:
1674 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1675 break;
1676 }
1677 }
1678 for (; i < 4; i++) {
1679 /* Replicate the last channel out. */
1680 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1681 }
1682
1683 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1684
1685 this->result = src;
1686 }
1687
1688 void
1689 vec4_visitor::visit(ir_dereference_variable *ir)
1690 {
1691 const struct glsl_type *type = ir->type;
1692 dst_reg *reg = variable_storage(ir->var);
1693
1694 if (!reg) {
1695 fail("Failed to find variable storage for %s\n", ir->var->name);
1696 this->result = src_reg(brw_null_reg());
1697 return;
1698 }
1699
1700 this->result = src_reg(*reg);
1701
1702 /* System values get their swizzle from the dst_reg writemask */
1703 if (ir->var->mode == ir_var_system_value)
1704 return;
1705
1706 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1707 this->result.swizzle = swizzle_for_size(type->vector_elements);
1708 }
1709
1710
1711 int
1712 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1713 {
1714 /* Under normal circumstances array elements are stored consecutively, so
1715 * the stride is equal to the size of the array element.
1716 */
1717 return type_size(ir->type);
1718 }
1719
1720
1721 void
1722 vec4_visitor::visit(ir_dereference_array *ir)
1723 {
1724 ir_constant *constant_index;
1725 src_reg src;
1726 int array_stride = compute_array_stride(ir);
1727
1728 constant_index = ir->array_index->constant_expression_value();
1729
1730 ir->array->accept(this);
1731 src = this->result;
1732
1733 if (constant_index) {
1734 src.reg_offset += constant_index->value.i[0] * array_stride;
1735 } else {
1736 /* Variable index array dereference. It eats the "vec4" of the
1737 * base of the array and an index that offsets the Mesa register
1738 * index.
1739 */
1740 ir->array_index->accept(this);
1741
1742 src_reg index_reg;
1743
1744 if (array_stride == 1) {
1745 index_reg = this->result;
1746 } else {
1747 index_reg = src_reg(this, glsl_type::int_type);
1748
1749 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1750 }
1751
1752 if (src.reladdr) {
1753 src_reg temp = src_reg(this, glsl_type::int_type);
1754
1755 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1756
1757 index_reg = temp;
1758 }
1759
1760 src.reladdr = ralloc(mem_ctx, src_reg);
1761 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1762 }
1763
1764 /* If the type is smaller than a vec4, replicate the last channel out. */
1765 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1766 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1767 else
1768 src.swizzle = BRW_SWIZZLE_NOOP;
1769 src.type = brw_type_for_base_type(ir->type);
1770
1771 this->result = src;
1772 }
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_record *ir)
1776 {
1777 unsigned int i;
1778 const glsl_type *struct_type = ir->record->type;
1779 int offset = 0;
1780
1781 ir->record->accept(this);
1782
1783 for (i = 0; i < struct_type->length; i++) {
1784 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1785 break;
1786 offset += type_size(struct_type->fields.structure[i].type);
1787 }
1788
1789 /* If the type is smaller than a vec4, replicate the last channel out. */
1790 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1791 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1792 else
1793 this->result.swizzle = BRW_SWIZZLE_NOOP;
1794 this->result.type = brw_type_for_base_type(ir->type);
1795
1796 this->result.reg_offset += offset;
1797 }
1798
1799 /**
1800 * We want to be careful in assignment setup to hit the actual storage
1801 * instead of potentially using a temporary like we might with the
1802 * ir_dereference handler.
1803 */
1804 static dst_reg
1805 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1806 {
1807 /* The LHS must be a dereference. If the LHS is a variable indexed array
1808 * access of a vector, it must be separated into a series of conditional moves
1809 * before reaching this point (see ir_vec_index_to_cond_assign).
1810 */
1811 assert(ir->as_dereference());
1812 ir_dereference_array *deref_array = ir->as_dereference_array();
1813 if (deref_array) {
1814 assert(!deref_array->array->type->is_vector());
1815 }
1816
1817 /* Use the rvalue deref handler for the most part. We'll ignore
1818 * swizzles in it and write swizzles using writemask, though.
1819 */
1820 ir->accept(v);
1821 return dst_reg(v->result);
1822 }
1823
1824 void
1825 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1826 const struct glsl_type *type, uint32_t predicate)
1827 {
1828 if (type->base_type == GLSL_TYPE_STRUCT) {
1829 for (unsigned int i = 0; i < type->length; i++) {
1830 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1831 }
1832 return;
1833 }
1834
1835 if (type->is_array()) {
1836 for (unsigned int i = 0; i < type->length; i++) {
1837 emit_block_move(dst, src, type->fields.array, predicate);
1838 }
1839 return;
1840 }
1841
1842 if (type->is_matrix()) {
1843 const struct glsl_type *vec_type;
1844
1845 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1846 type->vector_elements, 1);
1847
1848 for (int i = 0; i < type->matrix_columns; i++) {
1849 emit_block_move(dst, src, vec_type, predicate);
1850 }
1851 return;
1852 }
1853
1854 assert(type->is_scalar() || type->is_vector());
1855
1856 dst->type = brw_type_for_base_type(type);
1857 src->type = dst->type;
1858
1859 dst->writemask = (1 << type->vector_elements) - 1;
1860
1861 src->swizzle = swizzle_for_size(type->vector_elements);
1862
1863 vec4_instruction *inst = emit(MOV(*dst, *src));
1864 inst->predicate = predicate;
1865
1866 dst->reg_offset++;
1867 src->reg_offset++;
1868 }
1869
1870
1871 /* If the RHS processing resulted in an instruction generating a
1872 * temporary value, and it would be easy to rewrite the instruction to
1873 * generate its result right into the LHS instead, do so. This ends
1874 * up reliably removing instructions where it can be tricky to do so
1875 * later without real UD chain information.
1876 */
1877 bool
1878 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1879 dst_reg dst,
1880 src_reg src,
1881 vec4_instruction *pre_rhs_inst,
1882 vec4_instruction *last_rhs_inst)
1883 {
1884 /* This could be supported, but it would take more smarts. */
1885 if (ir->condition)
1886 return false;
1887
1888 if (pre_rhs_inst == last_rhs_inst)
1889 return false; /* No instructions generated to work with. */
1890
1891 /* Make sure the last instruction generated our source reg. */
1892 if (src.file != GRF ||
1893 src.file != last_rhs_inst->dst.file ||
1894 src.reg != last_rhs_inst->dst.reg ||
1895 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1896 src.reladdr ||
1897 src.abs ||
1898 src.negate ||
1899 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1900 return false;
1901
1902 /* Check that the last instruction fully initialized the channels
1903 * we want to use, in the order we want to use them. We could
1904 * potentially reswizzle the operands of many instructions so that
1905 * we could handle out of order channels, but don't yet.
1906 */
1907
1908 for (unsigned i = 0; i < 4; i++) {
1909 if (dst.writemask & (1 << i)) {
1910 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1911 return false;
1912
1913 if (BRW_GET_SWZ(src.swizzle, i) != i)
1914 return false;
1915 }
1916 }
1917
1918 /* Success! Rewrite the instruction. */
1919 last_rhs_inst->dst.file = dst.file;
1920 last_rhs_inst->dst.reg = dst.reg;
1921 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1922 last_rhs_inst->dst.reladdr = dst.reladdr;
1923 last_rhs_inst->dst.writemask &= dst.writemask;
1924
1925 return true;
1926 }
1927
1928 void
1929 vec4_visitor::visit(ir_assignment *ir)
1930 {
1931 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1932 uint32_t predicate = BRW_PREDICATE_NONE;
1933
1934 if (!ir->lhs->type->is_scalar() &&
1935 !ir->lhs->type->is_vector()) {
1936 ir->rhs->accept(this);
1937 src_reg src = this->result;
1938
1939 if (ir->condition) {
1940 emit_bool_to_cond_code(ir->condition, &predicate);
1941 }
1942
1943 /* emit_block_move doesn't account for swizzles in the source register.
1944 * This should be ok, since the source register is a structure or an
1945 * array, and those can't be swizzled. But double-check to be sure.
1946 */
1947 assert(src.swizzle ==
1948 (ir->rhs->type->is_matrix()
1949 ? swizzle_for_size(ir->rhs->type->vector_elements)
1950 : BRW_SWIZZLE_NOOP));
1951
1952 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1953 return;
1954 }
1955
1956 /* Now we're down to just a scalar/vector with writemasks. */
1957 int i;
1958
1959 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1960 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1961
1962 ir->rhs->accept(this);
1963
1964 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1965
1966 src_reg src = this->result;
1967
1968 int swizzles[4];
1969 int first_enabled_chan = 0;
1970 int src_chan = 0;
1971
1972 assert(ir->lhs->type->is_vector() ||
1973 ir->lhs->type->is_scalar());
1974 dst.writemask = ir->write_mask;
1975
1976 for (int i = 0; i < 4; i++) {
1977 if (dst.writemask & (1 << i)) {
1978 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1979 break;
1980 }
1981 }
1982
1983 /* Swizzle a small RHS vector into the channels being written.
1984 *
1985 * glsl ir treats write_mask as dictating how many channels are
1986 * present on the RHS while in our instructions we need to make
1987 * those channels appear in the slots of the vec4 they're written to.
1988 */
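/* Worked example (a sketch, not emitted code): writing a two-component RHS
 * to lhs.yz, with the RHS carrying the compact .xyyy swizzle that
 * swizzle_for_size() gives a vec2, yields swizzles[] = { y, x, y, y }, i.e.
 * a final .yxyy swizzle: RHS channel x feeds dst.y, channel y feeds dst.z,
 * and the unwritten slots simply repeat the first enabled channel.
 */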
1989 for (int i = 0; i < 4; i++) {
1990 if (dst.writemask & (1 << i))
1991 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1992 else
1993 swizzles[i] = first_enabled_chan;
1994 }
1995 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1996 swizzles[2], swizzles[3]);
1997
1998 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1999 return;
2000 }
2001
2002 if (ir->condition) {
2003 emit_bool_to_cond_code(ir->condition, &predicate);
2004 }
2005
2006 for (i = 0; i < type_size(ir->lhs->type); i++) {
2007 vec4_instruction *inst = emit(MOV(dst, src));
2008 inst->predicate = predicate;
2009
2010 dst.reg_offset++;
2011 src.reg_offset++;
2012 }
2013 }
2014
2015 void
2016 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2017 {
2018 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2019 foreach_list(node, &ir->components) {
2020 ir_constant *field_value = (ir_constant *)node;
2021
2022 emit_constant_values(dst, field_value);
2023 }
2024 return;
2025 }
2026
2027 if (ir->type->is_array()) {
2028 for (unsigned int i = 0; i < ir->type->length; i++) {
2029 emit_constant_values(dst, ir->array_elements[i]);
2030 }
2031 return;
2032 }
2033
2034 if (ir->type->is_matrix()) {
2035 for (int i = 0; i < ir->type->matrix_columns; i++) {
2036 float *vec = &ir->value.f[i * ir->type->vector_elements];
2037
2038 for (int j = 0; j < ir->type->vector_elements; j++) {
2039 dst->writemask = 1 << j;
2040 dst->type = BRW_REGISTER_TYPE_F;
2041
2042 emit(MOV(*dst, src_reg(vec[j])));
2043 }
2044 dst->reg_offset++;
2045 }
2046 return;
2047 }
2048
2049 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2050
2051 for (int i = 0; i < ir->type->vector_elements; i++) {
2052 if (!(remaining_writemask & (1 << i)))
2053 continue;
2054
2055 dst->writemask = 1 << i;
2056 dst->type = brw_type_for_base_type(ir->type);
2057
2058 /* Find other components that match the one we're about to
2059 * write. Emits fewer instructions for things like vec4(0.5,
2060 * 1.5, 1.5, 1.5).
2061 */
2062 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2063 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2064 if (ir->value.b[i] == ir->value.b[j])
2065 dst->writemask |= (1 << j);
2066 } else {
2067 /* u, i, and f storage all line up, so no need for a
2068 * switch case for comparing each type.
2069 */
2070 if (ir->value.u[i] == ir->value.u[j])
2071 dst->writemask |= (1 << j);
2072 }
2073 }
2074
2075 switch (ir->type->base_type) {
2076 case GLSL_TYPE_FLOAT:
2077 emit(MOV(*dst, src_reg(ir->value.f[i])));
2078 break;
2079 case GLSL_TYPE_INT:
2080 emit(MOV(*dst, src_reg(ir->value.i[i])));
2081 break;
2082 case GLSL_TYPE_UINT:
2083 emit(MOV(*dst, src_reg(ir->value.u[i])));
2084 break;
2085 case GLSL_TYPE_BOOL:
2086 emit(MOV(*dst, src_reg(ir->value.b[i])));
2087 break;
2088 default:
2089 assert(!"Non-float/uint/int/bool constant");
2090 break;
2091 }
2092
2093 remaining_writemask &= ~dst->writemask;
2094 }
2095 dst->reg_offset++;
2096 }
2097
2098 void
2099 vec4_visitor::visit(ir_constant *ir)
2100 {
2101 dst_reg dst = dst_reg(this, ir->type);
2102 this->result = src_reg(dst);
2103
2104 emit_constant_values(&dst, ir);
2105 }
2106
2107 void
2108 vec4_visitor::visit(ir_call *ir)
2109 {
2110 assert(!"not reached");
2111 }
2112
2113 void
2114 vec4_visitor::visit(ir_texture *ir)
2115 {
2116 int sampler =
2117 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2118
2119 /* Should be lowered by do_lower_texture_projection */
2120 assert(!ir->projector);
2121
2122 /* Generate code to compute all the subexpression trees. This has to be
2123 * done before loading any values into MRFs for the sampler message since
2124 * generating these values may involve SEND messages that need the MRFs.
2125 */
2126 src_reg coordinate;
2127 if (ir->coordinate) {
2128 ir->coordinate->accept(this);
2129 coordinate = this->result;
2130 }
2131
2132 src_reg shadow_comparitor;
2133 if (ir->shadow_comparitor) {
2134 ir->shadow_comparitor->accept(this);
2135 shadow_comparitor = this->result;
2136 }
2137
2138 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2139 src_reg lod, dPdx, dPdy, sample_index;
2140 switch (ir->op) {
2141 case ir_tex:
2142 lod = src_reg(0.0f);
2143 lod_type = glsl_type::float_type;
2144 break;
2145 case ir_txf:
2146 case ir_txl:
2147 case ir_txs:
2148 ir->lod_info.lod->accept(this);
2149 lod = this->result;
2150 lod_type = ir->lod_info.lod->type;
2151 break;
2152 case ir_txf_ms:
2153 ir->lod_info.sample_index->accept(this);
2154 sample_index = this->result;
2155 sample_index_type = ir->lod_info.sample_index->type;
2156 break;
2157 case ir_txd:
2158 ir->lod_info.grad.dPdx->accept(this);
2159 dPdx = this->result;
2160
2161 ir->lod_info.grad.dPdy->accept(this);
2162 dPdy = this->result;
2163
2164 lod_type = ir->lod_info.grad.dPdx->type;
2165 break;
2166 case ir_txb:
2167 case ir_lod:
2168 break;
2169 }
2170
2171 vec4_instruction *inst = NULL;
2172 switch (ir->op) {
2173 case ir_tex:
2174 case ir_txl:
2175 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2176 break;
2177 case ir_txd:
2178 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2179 break;
2180 case ir_txf:
2181 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2182 break;
2183 case ir_txf_ms:
2184 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2185 break;
2186 case ir_txs:
2187 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2188 break;
2189 case ir_txb:
2190 assert(!"TXB is not valid for vertex shaders.");
2191 break;
2192 case ir_lod:
2193 assert(!"LOD is not valid for vertex shaders.");
2194 break;
2195 }
2196
2197 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2198
2199 /* Texel offsets go in the message header; Gen4 also requires headers. */
2200 inst->header_present = use_texture_offset || brw->gen < 5;
2201 inst->base_mrf = 2;
2202 inst->mlen = inst->header_present + 1; /* always at least one */
2203 inst->sampler = sampler;
2204 inst->dst = dst_reg(this, ir->type);
2205 inst->dst.writemask = WRITEMASK_XYZW;
2206 inst->shadow_compare = ir->shadow_comparitor != NULL;
2207
2208 if (use_texture_offset)
2209 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2210
2211 /* MRF for the first parameter */
2212 int param_base = inst->base_mrf + inst->header_present;
2213
2214 if (ir->op == ir_txs) {
2215 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2216 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2217 } else {
2218 int i, coord_mask = 0, zero_mask = 0;
2219 /* Load the coordinate */
2220 /* FINISHME: gl_clamp_mask and saturate */
2221 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2222 coord_mask |= (1 << i);
2223 for (; i < 4; i++)
2224 zero_mask |= (1 << i);
2225
2226 if (ir->offset && ir->op == ir_txf) {
2227 /* It appears that the ld instruction used for txf does its
2228 * address bounds check before adding in the offset. To work
2229 * around this, just add the integer offset to the integer
2230 * texel coordinate, and don't put the offset in the header.
2231 */
2232 ir_constant *offset = ir->offset->as_constant();
2233 assert(offset);
2234
2235 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2236 src_reg src = coordinate;
2237 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2238 BRW_GET_SWZ(src.swizzle, j),
2239 BRW_GET_SWZ(src.swizzle, j),
2240 BRW_GET_SWZ(src.swizzle, j));
2241 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2242 src, offset->value.i[j]));
2243 }
2244 } else {
2245 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2246 coordinate));
2247 }
2248 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2249 src_reg(0)));
2250 /* Load the shadow comparitor */
2251 if (ir->shadow_comparitor && ir->op != ir_txd) {
2252 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2253 WRITEMASK_X),
2254 shadow_comparitor));
2255 inst->mlen++;
2256 }
2257
2258 /* Load the LOD info */
2259 if (ir->op == ir_tex || ir->op == ir_txl) {
2260 int mrf, writemask;
2261 if (brw->gen >= 5) {
2262 mrf = param_base + 1;
2263 if (ir->shadow_comparitor) {
2264 writemask = WRITEMASK_Y;
2265 /* mlen already incremented */
2266 } else {
2267 writemask = WRITEMASK_X;
2268 inst->mlen++;
2269 }
2270 } else /* brw->gen == 4 */ {
2271 mrf = param_base;
2272 writemask = WRITEMASK_W;
2273 }
2274 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2275 } else if (ir->op == ir_txf) {
2276 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2277 } else if (ir->op == ir_txf_ms) {
2278 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2279 sample_index));
2280 inst->mlen++;
2281
2282 /* On Gen7, there is an additional MCS parameter here after SI,
2283 * but we don't bother to emit it since it's always zero. If
2284 * we start supporting texturing from CMS surfaces, this will have
2285 * to change.
2286 */
2287 } else if (ir->op == ir_txd) {
2288 const glsl_type *type = lod_type;
2289
2290 if (brw->gen >= 5) {
2291 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2292 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2293 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2294 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2295 inst->mlen++;
2296
2297 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2298 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2299 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2300 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2301 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2302 inst->mlen++;
2303
2304 if (ir->shadow_comparitor) {
2305 emit(MOV(dst_reg(MRF, param_base + 2,
2306 ir->shadow_comparitor->type, WRITEMASK_Z),
2307 shadow_comparitor));
2308 }
2309 }
2310 } else /* brw->gen == 4 */ {
2311 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2312 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2313 inst->mlen += 2;
2314 }
2315 }
2316 }
2317
2318 emit(inst);
2319
2320 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2321 * faces * layers, but the spec requires just layers.
2322 */
2323 if (ir->op == ir_txs) {
2324 glsl_type const *type = ir->sampler->type;
2325 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2326 type->sampler_array) {
2327 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2328 with_writemask(inst->dst, WRITEMASK_Z),
2329 src_reg(inst->dst), src_reg(6));
2330 }
2331 }
2332
2333 swizzle_result(ir, src_reg(inst->dst), sampler);
2334 }
2335
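/**
 * Apply the texture swizzle baked into the program key (e.g. GL_TEXTURE_SWIZZLE
 * state) to the raw sampler result: channels sourced from the texture are
 * MOVed with the appropriate swizzle, while channels forced to SWIZZLE_ZERO or
 * SWIZZLE_ONE are written as immediate 0.0f or 1.0f.
 */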
2336 void
2337 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2338 {
2339 int s = key->tex.swizzles[sampler];
2340
2341 this->result = src_reg(this, ir->type);
2342 dst_reg swizzled_result(this->result);
2343
2344 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2345 || s == SWIZZLE_NOOP) {
2346 emit(MOV(swizzled_result, orig_val));
2347 return;
2348 }
2349
2350 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2351 int swizzle[4] = {0};
2352
2353 for (int i = 0; i < 4; i++) {
2354 switch (GET_SWZ(s, i)) {
2355 case SWIZZLE_ZERO:
2356 zero_mask |= (1 << i);
2357 break;
2358 case SWIZZLE_ONE:
2359 one_mask |= (1 << i);
2360 break;
2361 default:
2362 copy_mask |= (1 << i);
2363 swizzle[i] = GET_SWZ(s, i);
2364 break;
2365 }
2366 }
2367
2368 if (copy_mask) {
2369 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2370 swizzled_result.writemask = copy_mask;
2371 emit(MOV(swizzled_result, orig_val));
2372 }
2373
2374 if (zero_mask) {
2375 swizzled_result.writemask = zero_mask;
2376 emit(MOV(swizzled_result, src_reg(0.0f)));
2377 }
2378
2379 if (one_mask) {
2380 swizzled_result.writemask = one_mask;
2381 emit(MOV(swizzled_result, src_reg(1.0f)));
2382 }
2383 }
2384
2385 void
2386 vec4_visitor::visit(ir_return *ir)
2387 {
2388 assert(!"not reached");
2389 }
2390
2391 void
2392 vec4_visitor::visit(ir_discard *ir)
2393 {
2394 assert(!"not reached");
2395 }
2396
2397 void
2398 vec4_visitor::visit(ir_if *ir)
2399 {
2400 /* Don't point the annotation at the if statement, because then it plus
2401 * the then and else blocks get printed.
2402 */
2403 this->base_ir = ir->condition;
2404
2405 if (brw->gen == 6) {
2406 emit_if_gen6(ir);
2407 } else {
2408 uint32_t predicate;
2409 emit_bool_to_cond_code(ir->condition, &predicate);
2410 emit(IF(predicate));
2411 }
2412
2413 visit_instructions(&ir->then_instructions);
2414
2415 if (!ir->else_instructions.is_empty()) {
2416 this->base_ir = ir->condition;
2417 emit(BRW_OPCODE_ELSE);
2418
2419 visit_instructions(&ir->else_instructions);
2420 }
2421
2422 this->base_ir = ir->condition;
2423 emit(BRW_OPCODE_ENDIF);
2424 }
2425
2426 void
2427 vec4_visitor::visit(ir_emit_vertex *)
2428 {
2429 assert(!"not reached");
2430 }
2431
2432 void
2433 vec4_visitor::visit(ir_end_primitive *)
2434 {
2435 assert(!"not reached");
2436 }
2437
2438 void
2439 vec4_visitor::emit_ndc_computation()
2440 {
2441 /* Get the position */
2442 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2443
2444 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2445 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2446 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2447
2448 current_annotation = "NDC";
2449 dst_reg ndc_w = ndc;
2450 ndc_w.writemask = WRITEMASK_W;
2451 src_reg pos_w = pos;
2452 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2453 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2454
2455 dst_reg ndc_xyz = ndc;
2456 ndc_xyz.writemask = WRITEMASK_XYZ;
2457
2458 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2459 }
2460
2461 void
2462 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2463 {
2464 if (brw->gen < 6 &&
2465 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2466 key->userclip_active || brw->has_negative_rhw_bug)) {
2467 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2468 dst_reg header1_w = header1;
2469 header1_w.writemask = WRITEMASK_W;
2470
2471 emit(MOV(header1, 0u));
2472
2473 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2474 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2475
2476 current_annotation = "Point size";
2477 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2478 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2479 }
2480
2481 if (key->userclip_active) {
2482 current_annotation = "Clipping flags";
2483 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2484 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2485
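/* Each CMP below sets a per-channel flag bit for the clip distances that
 * are negative; VS_OPCODE_UNPACK_FLAGS_SIMD4X2 moves those flag bits into
 * a GRF so they can be ORed into the header, with the second group of four
 * shifted up by 4 (presumably matching the layout of the clip-flag field).
 */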
2486 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2487 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2488 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2489
2490 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2491 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2492 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2493 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2494 }
2495
2496 /* i965 clipping workaround:
2497 * 1) Test for negative RHW (1/w)
2498 * 2) If set,
2499 * set ndc = (0,0,0,0)
2500 * set ucp[6] = 1
2501 *
2502 * Later, clipping will detect ucp[6] and ensure the primitive is
2503 * clipped against all fixed planes.
2504 */
2505 if (brw->has_negative_rhw_bug) {
2506 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2507 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2508 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2509 vec4_instruction *inst;
2510 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2511 inst->predicate = BRW_PREDICATE_NORMAL;
2512 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2513 inst->predicate = BRW_PREDICATE_NORMAL;
2514 }
2515
2516 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2517 } else if (brw->gen < 6) {
2518 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2519 } else {
2520 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2521 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2522 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2523 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2524 }
2525 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2526 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2527 src_reg(output_reg[VARYING_SLOT_LAYER])));
2528 }
2529 }
2530 }
2531
2532 void
2533 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2534 {
2535 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2536 *
2537 * "If a linked set of shaders forming the vertex stage contains no
2538 * static write to gl_ClipVertex or gl_ClipDistance, but the
2539 * application has requested clipping against user clip planes through
2540 * the API, then the coordinate written to gl_Position is used for
2541 * comparison against the user clip planes."
2542 *
2543 * This function is only called if the shader didn't write to
2544 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2545 * if the user wrote to it; otherwise we use gl_Position.
2546 */
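/* Each iteration below computes one clip distance as a DP4 of the chosen
 * position against one user clip plane and writes it to a single channel of
 * 'reg'; 'offset' selects whether planes 0-3 or 4-7 are handled.
 */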
2547 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2548 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2549 clip_vertex = VARYING_SLOT_POS;
2550 }
2551
2552 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2553 ++i) {
2554 reg.writemask = 1 << i;
2555 emit(DP4(reg,
2556 src_reg(output_reg[clip_vertex]),
2557 src_reg(this->userplane[i + offset])));
2558 }
2559 }
2560
2561 void
2562 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2563 {
2564 assert (varying < VARYING_SLOT_MAX);
2565 reg.type = output_reg[varying].type;
2566 current_annotation = output_reg_annotation[varying];
2567 /* Copy the register, saturating if necessary */
2568 vec4_instruction *inst = emit(MOV(reg,
2569 src_reg(output_reg[varying])));
2570 if ((varying == VARYING_SLOT_COL0 ||
2571 varying == VARYING_SLOT_COL1 ||
2572 varying == VARYING_SLOT_BFC0 ||
2573 varying == VARYING_SLOT_BFC1) &&
2574 key->clamp_vertex_color) {
2575 inst->saturate = true;
2576 }
2577 }
2578
2579 void
2580 vec4_visitor::emit_urb_slot(int mrf, int varying)
2581 {
2582 struct brw_reg hw_reg = brw_message_reg(mrf);
2583 dst_reg reg = dst_reg(MRF, mrf);
2584 reg.type = BRW_REGISTER_TYPE_F;
2585
2586 switch (varying) {
2587 case VARYING_SLOT_PSIZ:
2588 /* PSIZ is always in slot 0, and is coupled with other flags. */
2589 current_annotation = "indices, point width, clip flags";
2590 emit_psiz_and_flags(hw_reg);
2591 break;
2592 case BRW_VARYING_SLOT_NDC:
2593 current_annotation = "NDC";
2594 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2595 break;
2596 case VARYING_SLOT_POS:
2597 current_annotation = "gl_Position";
2598 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2599 break;
2600 case VARYING_SLOT_EDGE:
2601 /* This is present when doing unfilled polygons. We're supposed to copy
2602 * the edge flag from the user-provided vertex array
2603 * (glEdgeFlagPointer); otherwise we copy from the current value
2604 * of that attribute (which starts as 1.0f). This is then used in clipping to
2605 * determine which edges should be drawn as wireframe.
2606 */
2607 current_annotation = "edge flag";
2608 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2609 glsl_type::float_type, WRITEMASK_XYZW))));
2610 break;
2611 case BRW_VARYING_SLOT_PAD:
2612 /* No need to write to this slot */
2613 break;
2614 default:
2615 emit_generic_urb_slot(reg, varying);
2616 break;
2617 }
2618 }
2619
2620 static int
2621 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2622 {
2623 if (brw->gen >= 6) {
2624 /* URB data written (does not include the message header reg) must
2625 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2626 * section 5.4.3.2.2: URB_INTERLEAVED.
2627 *
2628 * URB entries are allocated on a multiple of 1024 bits, so an
2629 * extra 128 bits written here to make the end align to 256 is
2630 * no problem.
2631 */
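/* Note that mlen as passed in counts the message header register too, so
 * the data portion is mlen - 1; it is a multiple of two exactly when mlen
 * is odd, hence the check below.
 */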
2632 if ((mlen % 2) != 1)
2633 mlen++;
2634 }
2635
2636 return mlen;
2637 }
2638
2639
2640 /**
2641 * Generates the VUE payload plus the necessary URB write instructions to
2642 * output it.
2643 *
2644 * The VUE layout is documented in Volume 2a.
2645 */
2646 void
2647 vec4_visitor::emit_vertex()
2648 {
2649 /* MRF 0 is reserved for the debugger, so start with message header
2650 * in MRF 1.
2651 */
2652 int base_mrf = 1;
2653 int mrf = base_mrf;
2654 /* In the process of generating our URB write message contents, we
2655 * may need to unspill a register or load from an array. Those
2656 * reads would use MRFs 14-15.
2657 */
2658 int max_usable_mrf = 13;
2659
2660 /* The following assertion verifies that max_usable_mrf results in an
2661 * even number of URB write data registers, which meets gen6's
2662 * requirements for length alignment.
2663 */
2664 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2665
2666 /* First mrf is the g0-based message header containing URB handles and
2667 * such.
2668 */
2669 emit_urb_write_header(mrf++);
2670
2671 if (brw->gen < 6) {
2672 emit_ndc_computation();
2673 }
2674
2675 /* Lower legacy ff and ClipVertex clipping to clip distances */
2676 if (key->userclip_active && !key->uses_clip_distance) {
2677 current_annotation = "user clip distances";
2678
2679 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2680 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2681
2682 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2683 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2684 }
2685
2686 /* Set up the VUE data for the first URB write */
2687 int slot;
2688 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2689 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2690
2691 /* If this was max_usable_mrf, we can't fit anything more into this URB
2692 * WRITE.
2693 */
2694 if (mrf > max_usable_mrf) {
2695 slot++;
2696 break;
2697 }
2698 }
2699
2700 bool complete = slot >= prog_data->vue_map.num_slots;
2701 current_annotation = "URB write";
2702 vec4_instruction *inst = emit_urb_write_opcode(complete);
2703 inst->base_mrf = base_mrf;
2704 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2705
2706 /* Optional second URB write */
2707 if (!complete) {
2708 mrf = base_mrf + 1;
2709
2710 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2711 assert(mrf < max_usable_mrf);
2712
2713 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2714 }
2715
2716 current_annotation = "URB write";
2717 inst = emit_urb_write_opcode(true /* complete */);
2718 inst->base_mrf = base_mrf;
2719 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2720 /* URB destination offset. The previous write used MRFs 1-13; excluding
2721 * the one header MRF, that leaves 12 data regs. URB offset is in
2722 * URB row increments, and each of our MRFs is half of one of
2723 * those, since we're doing interleaved writes.
2724 */
2725 inst->offset = (max_usable_mrf - base_mrf) / 2;
2726 }
2727 }
2728
2729
2730 src_reg
2731 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2732 src_reg *reladdr, int reg_offset)
2733 {
2734 /* Because we store the values to scratch interleaved like our
2735 * vertex data, we need to scale the vec4 index by 2.
2736 */
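/* As an illustration of the scaling only: reg_offset 3 becomes an offset of
 * 6 on gen6+, and 3 * 32 = 96 on older parts, where the header wants byte
 * offsets (see below).
 */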
2737 int message_header_scale = 2;
2738
2739 /* Pre-gen6, the message header uses byte offsets instead of vec4
2740 * (16-byte) offset units.
2741 */
2742 if (brw->gen < 6)
2743 message_header_scale *= 16;
2744
2745 if (reladdr) {
2746 src_reg index = src_reg(this, glsl_type::int_type);
2747
2748 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2749 emit_before(inst, MUL(dst_reg(index),
2750 index, src_reg(message_header_scale)));
2751
2752 return index;
2753 } else {
2754 return src_reg(reg_offset * message_header_scale);
2755 }
2756 }
2757
2758 src_reg
2759 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2760 src_reg *reladdr, int reg_offset)
2761 {
2762 if (reladdr) {
2763 src_reg index = src_reg(this, glsl_type::int_type);
2764
2765 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2766
2767 /* Pre-gen6, the message header uses byte offsets instead of vec4
2768 * (16-byte) offset units.
2769 */
2770 if (brw->gen < 6) {
2771 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2772 }
2773
2774 return index;
2775 } else {
2776 int message_header_scale = brw->gen < 6 ? 16 : 1;
2777 return src_reg(reg_offset * message_header_scale);
2778 }
2779 }
2780
2781 /**
2782 * Emits an instruction before @inst to load the value named by @orig_src
2783 * from scratch space at @base_offset to @temp.
2784 *
2785 * @base_offset is measured in 32-byte units (the size of a register).
2786 */
2787 void
2788 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2789 dst_reg temp, src_reg orig_src,
2790 int base_offset)
2791 {
2792 int reg_offset = base_offset + orig_src.reg_offset;
2793 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2794
2795 emit_before(inst, SCRATCH_READ(temp, index));
2796 }
2797
2798 /**
2799 * Emits an instruction after @inst to store the value to be written
2800 * to @orig_dst to scratch space at @base_offset, from @temp.
2801 *
2802 * @base_offset is measured in 32-byte units (the size of a register).
2803 */
2804 void
2805 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2806 {
2807 int reg_offset = base_offset + inst->dst.reg_offset;
2808 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2809
2810 /* Create a temporary register to store *inst's result in.
2811 *
2812 * We have to be careful in MOVing from our temporary result register in
2813 * the scratch write. If we swizzle from channels of the temporary that
2814 * weren't initialized, it will confuse live interval analysis, which will
2815 * make spilling fail to make progress.
2816 */
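/* For example, a write with writemask .xz gets the temporary the swizzle
 * .xxzx, so the unwritten y and w channels re-read channel x instead of an
 * uninitialized channel.
 */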
2817 src_reg temp = src_reg(this, glsl_type::vec4_type);
2818 temp.type = inst->dst.type;
2819 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2820 int swizzles[4];
2821 for (int i = 0; i < 4; i++)
2822 if (inst->dst.writemask & (1 << i))
2823 swizzles[i] = i;
2824 else
2825 swizzles[i] = first_writemask_chan;
2826 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2827 swizzles[2], swizzles[3]);
2828
2829 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2830 inst->dst.writemask));
2831 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2832 write->predicate = inst->predicate;
2833 write->ir = inst->ir;
2834 write->annotation = inst->annotation;
2835 inst->insert_after(write);
2836
2837 inst->dst.file = temp.file;
2838 inst->dst.reg = temp.reg;
2839 inst->dst.reg_offset = temp.reg_offset;
2840 inst->dst.reladdr = NULL;
2841 }
2842
2843 /**
2844 * We can't generally support array access in GRF space, because a
2845 * single instruction's destination can only span 2 contiguous
2846 * registers. So, we send all GRF arrays that get variable index
2847 * access to scratch space.
2848 */
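/* A typical trigger (hypothetical GLSL, for illustration only):
 *
 *    float a[8];
 *    ...
 *    a[i] = x;      // 'i' not known at compile time
 *
 * The variable index shows up as a reladdr on the GRF access, which is what
 * sends the whole array to scratch here.
 */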
2849 void
2850 vec4_visitor::move_grf_array_access_to_scratch()
2851 {
2852 int scratch_loc[this->virtual_grf_count];
2853
2854 for (int i = 0; i < this->virtual_grf_count; i++) {
2855 scratch_loc[i] = -1;
2856 }
2857
2858 /* First, calculate the set of virtual GRFs that need to be punted
2859 * to scratch due to having any array access on them, and where in
2860 * scratch they will live.
2861 */
2862 foreach_list(node, &this->instructions) {
2863 vec4_instruction *inst = (vec4_instruction *)node;
2864
2865 if (inst->dst.file == GRF && inst->dst.reladdr &&
2866 scratch_loc[inst->dst.reg] == -1) {
2867 scratch_loc[inst->dst.reg] = c->last_scratch;
2868 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2869 }
2870
2871 for (int i = 0 ; i < 3; i++) {
2872 src_reg *src = &inst->src[i];
2873
2874 if (src->file == GRF && src->reladdr &&
2875 scratch_loc[src->reg] == -1) {
2876 scratch_loc[src->reg] = c->last_scratch;
2877 c->last_scratch += this->virtual_grf_sizes[src->reg];
2878 }
2879 }
2880 }
2881
2882 /* Now, for anything that will be accessed through scratch, rewrite
2883 * it to load/store. Note that this is a _safe list walk, because
2884 * we may generate a new scratch_write instruction after the one
2885 * we're processing.
2886 */
2887 foreach_list_safe(node, &this->instructions) {
2888 vec4_instruction *inst = (vec4_instruction *)node;
2889
2890 /* Set up the annotation tracking for new generated instructions. */
2891 base_ir = inst->ir;
2892 current_annotation = inst->annotation;
2893
2894 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2895 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2896 }
2897
2898 for (int i = 0 ; i < 3; i++) {
2899 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2900 continue;
2901
2902 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2903
2904 emit_scratch_read(inst, temp, inst->src[i],
2905 scratch_loc[inst->src[i].reg]);
2906
2907 inst->src[i].file = temp.file;
2908 inst->src[i].reg = temp.reg;
2909 inst->src[i].reg_offset = temp.reg_offset;
2910 inst->src[i].reladdr = NULL;
2911 }
2912 }
2913 }
2914
2915 /**
2916 * Emits an instruction before @inst to load the value named by @orig_src
2917 * from the pull constant buffer (surface) at @base_offset to @temp.
2918 */
2919 void
2920 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2921 dst_reg temp, src_reg orig_src,
2922 int base_offset)
2923 {
2924 int reg_offset = base_offset + orig_src.reg_offset;
2925 src_reg index = src_reg((unsigned)SURF_INDEX_VEC4_CONST_BUFFER);
2926 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2927 vec4_instruction *load;
2928
2929 if (brw->gen >= 7) {
2930 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2931 grf_offset.type = offset.type;
2932 emit_before(inst, MOV(grf_offset, offset));
2933
2934 load = new(mem_ctx) vec4_instruction(this,
2935 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2936 temp, index, src_reg(grf_offset));
2937 } else {
2938 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2939 temp, index, offset);
2940 load->base_mrf = 14;
2941 load->mlen = 1;
2942 }
2943 emit_before(inst, load);
2944 }
2945
2946 /**
2947 * Implements array access of uniforms by inserting a
2948 * PULL_CONSTANT_LOAD instruction.
2949 *
2950 * Unlike temporary GRF array access (which we don't support, due to
2951 * the difficulty of doing relative addressing on instruction
2952 * destinations), we could potentially do array access of uniforms
2953 * that were loaded in GRF space as push constants. In real-world
2954 * usage we've seen, though, the arrays being used are always larger
2955 * than we could load as push constants, so just always move all
2956 * uniform array access out to a pull constant buffer.
2957 */
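/* Hypothetical GLSL that ends up here (illustration only):
 *
 *    uniform vec4 colors[64];
 *    ...
 *    gl_FrontColor = colors[index];   // non-constant 'index'
 *
 * The variably-indexed uniform array is copied into pull_param[] and each
 * such access is replaced by a pull constant load into a temporary.
 */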
2958 void
2959 vec4_visitor::move_uniform_array_access_to_pull_constants()
2960 {
2961 int pull_constant_loc[this->uniforms];
2962
2963 for (int i = 0; i < this->uniforms; i++) {
2964 pull_constant_loc[i] = -1;
2965 }
2966
2967 /* Walk through and find array access of uniforms. Put a copy of that
2968 * uniform in the pull constant buffer.
2969 *
2970 * Note that we don't move constant-indexed accesses to arrays. No
2971 * testing has been done of the performance impact of this choice.
2972 */
2973 foreach_list_safe(node, &this->instructions) {
2974 vec4_instruction *inst = (vec4_instruction *)node;
2975
2976 for (int i = 0 ; i < 3; i++) {
2977 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2978 continue;
2979
2980 int uniform = inst->src[i].reg;
2981
2982 /* If this array isn't already present in the pull constant buffer,
2983 * add it.
2984 */
2985 if (pull_constant_loc[uniform] == -1) {
2986 const float **values = &prog_data->param[uniform * 4];
2987
2988 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2989
2990 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2991 prog_data->pull_param[prog_data->nr_pull_params++]
2992 = values[j];
2993 }
2994 }
2995
2996 /* Set up the annotation tracking for new generated instructions. */
2997 base_ir = inst->ir;
2998 current_annotation = inst->annotation;
2999
3000 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3001
3002 emit_pull_constant_load(inst, temp, inst->src[i],
3003 pull_constant_loc[uniform]);
3004
3005 inst->src[i].file = temp.file;
3006 inst->src[i].reg = temp.reg;
3007 inst->src[i].reg_offset = temp.reg_offset;
3008 inst->src[i].reladdr = NULL;
3009 }
3010 }
3011
3012 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3013 * no need to track them as larger-than-vec4 objects. This will be
3014 * relied on in cutting out unused uniform vectors from push
3015 * constants.
3016 */
3017 split_uniform_registers();
3018 }
3019
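/**
 * If an unsigned (UD) source carries a negate modifier, materialize the
 * negation with an explicit MOV into a fresh uvec4 temporary and use that
 * instead, so no UD source reaches its consumer with the negate bit set.
 */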
3020 void
3021 vec4_visitor::resolve_ud_negate(src_reg *reg)
3022 {
3023 if (reg->type != BRW_REGISTER_TYPE_UD ||
3024 !reg->negate)
3025 return;
3026
3027 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3028 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3029 *reg = temp;
3030 }
3031
3032 vec4_visitor::vec4_visitor(struct brw_context *brw,
3033 struct brw_vec4_compile *c,
3034 struct gl_program *prog,
3035 const struct brw_vec4_prog_key *key,
3036 struct brw_vec4_prog_data *prog_data,
3037 struct gl_shader_program *shader_prog,
3038 struct brw_shader *shader,
3039 void *mem_ctx,
3040 bool debug_flag)
3041 : debug_flag(debug_flag)
3042 {
3043 this->brw = brw;
3044 this->ctx = &brw->ctx;
3045 this->shader_prog = shader_prog;
3046 this->shader = shader;
3047
3048 this->mem_ctx = mem_ctx;
3049 this->failed = false;
3050
3051 this->base_ir = NULL;
3052 this->current_annotation = NULL;
3053 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3054
3055 this->c = c;
3056 this->prog = prog;
3057 this->key = key;
3058 this->prog_data = prog_data;
3059
3060 this->variable_ht = hash_table_ctor(0,
3061 hash_table_pointer_hash,
3062 hash_table_pointer_compare);
3063
3064 this->virtual_grf_start = NULL;
3065 this->virtual_grf_end = NULL;
3066 this->virtual_grf_sizes = NULL;
3067 this->virtual_grf_count = 0;
3068 this->virtual_grf_reg_map = NULL;
3069 this->virtual_grf_reg_count = 0;
3070 this->virtual_grf_array_size = 0;
3071 this->live_intervals_valid = false;
3072
3073 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3074
3075 this->uniforms = 0;
3076 }
3077
3078 vec4_visitor::~vec4_visitor()
3079 {
3080 hash_table_dtor(this->variable_ht);
3081 }
3082
3083
3084 void
3085 vec4_visitor::fail(const char *format, ...)
3086 {
3087 va_list va;
3088 char *msg;
3089
3090 if (failed)
3091 return;
3092
3093 failed = true;
3094
3095 va_start(va, format);
3096 msg = ralloc_vasprintf(mem_ctx, format, va);
3097 va_end(va);
3098 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3099
3100 this->fail_msg = msg;
3101
3102 if (debug_flag) {
3103 fprintf(stderr, "%s", msg);
3104 }
3105 }
3106
3107 } /* namespace brw */