i965: Allocate vec4_visitor's uniform_size and uniform_vector_size arrays dynamically.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
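/**
 * Convenience builders for one-, two-, and three-source ALU instructions.
 * Note that these only construct the vec4_instruction; they do not add it
 * to the instruction stream, so callers pass the result to emit().
 */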
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
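/**
 * Builders for scratch-buffer (spill space) reads and writes. These only
 * construct the instruction; base_mrf and mlen describe the MRF payload of
 * the gen4-style scratch messages.
 */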
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
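/**
 * Emit a dot product of the requested width: DP2, DP3 or DP4 for
 * two to four elements.
 */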
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * That the upper word of each write-channel be 0 is required for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
477 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
484 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
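/**
 * Returns the number of vec4 slots a value of the given GLSL type occupies
 * in this backend (every scalar or vector is padded out to a full vec4).
 */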
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 /* Regardless of the size of the vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_ATOMIC_UINT:
573 return 0;
574 case GLSL_TYPE_IMAGE:
575 case GLSL_TYPE_VOID:
576 case GLSL_TYPE_ERROR:
577 case GLSL_TYPE_INTERFACE:
578 assert(0);
579 break;
580 }
581
582 return 0;
583 }
584
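/**
 * Allocate a new virtual GRF of the given size (in vec4 registers), growing
 * the size/reg_map bookkeeping arrays as needed, and return its index.
 */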
585 int
586 vec4_visitor::virtual_grf_alloc(int size)
587 {
588 if (virtual_grf_array_size <= virtual_grf_count) {
589 if (virtual_grf_array_size == 0)
590 virtual_grf_array_size = 16;
591 else
592 virtual_grf_array_size *= 2;
593 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
594 virtual_grf_array_size);
595 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
596 virtual_grf_array_size);
597 }
598 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
599 virtual_grf_reg_count += size;
600 virtual_grf_sizes[virtual_grf_count] = size;
601 return virtual_grf_count++;
602 }
603
604 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
605 {
606 init();
607
608 this->file = GRF;
609 this->reg = v->virtual_grf_alloc(type_size(type));
610
611 if (type->is_array() || type->is_record()) {
612 this->swizzle = BRW_SWIZZLE_NOOP;
613 } else {
614 this->swizzle = swizzle_for_size(type->vector_elements);
615 }
616
617 this->type = brw_type_for_base_type(type);
618 }
619
620 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
621 {
622 init();
623
624 this->file = GRF;
625 this->reg = v->virtual_grf_alloc(type_size(type));
626
627 if (type->is_array() || type->is_record()) {
628 this->writemask = WRITEMASK_XYZW;
629 } else {
630 this->writemask = (1 << type->vector_elements) - 1;
631 }
632
633 this->type = brw_type_for_base_type(type);
634 }
635
636 /* Our support for uniforms is piggy-backed on the struct
637 * gl_program, because that's where the values actually
638 * get stored, rather than in some global gl_shader_program uniform
639 * store.
640 */
641 void
642 vec4_visitor::setup_uniform_values(ir_variable *ir)
643 {
644 int namelen = strlen(ir->name);
645
646 /* The data for our (non-builtin) uniforms is stored in a series of
647 * gl_uniform_driver_storage structs for each subcomponent that
648 * glGetUniformLocation() could name. We know it's been set up in the same
649 * order we'd walk the type, so walk the list of storage and find anything
650 * with our name, or the prefix of a component that starts with our name.
651 */
652 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
653 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
654
655 if (strncmp(ir->name, storage->name, namelen) != 0 ||
656 (storage->name[namelen] != 0 &&
657 storage->name[namelen] != '.' &&
658 storage->name[namelen] != '[')) {
659 continue;
660 }
661
662 gl_constant_value *components = storage->storage;
663 unsigned vector_count = (MAX2(storage->array_elements, 1) *
664 storage->type->matrix_columns);
665
666 for (unsigned s = 0; s < vector_count; s++) {
667 uniform_vector_size[uniforms] = storage->type->vector_elements;
668
669 int i;
670 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
671 stage_prog_data->param[uniforms * 4 + i] = &components->f;
672 components++;
673 }
674 for (; i < 4; i++) {
675 static float zero = 0;
676 stage_prog_data->param[uniforms * 4 + i] = &zero;
677 }
678
679 uniforms++;
680 }
681 }
682 }
683
684 void
685 vec4_visitor::setup_uniform_clipplane_values()
686 {
687 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
688
689 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
690 this->uniform_vector_size[this->uniforms] = 4;
691 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
692 this->userplane[i].type = BRW_REGISTER_TYPE_F;
693 for (int j = 0; j < 4; ++j) {
694 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
695 }
696 ++this->uniforms;
697 }
698 }
699
700 /* Our support for builtin uniforms is even scarier than non-builtin.
701 * It sits on top of the PROG_STATE_VAR parameters that are
702 * automatically updated from GL context state.
703 */
704 void
705 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
706 {
707 const ir_state_slot *const slots = ir->state_slots;
708 assert(ir->state_slots != NULL);
709
710 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
711 /* This state reference has already been set up by ir_to_mesa,
712 * but we'll get the same index back here. We can reference
713 * ParameterValues directly, since unlike brw_fs.cpp, we never
714 * add new state references during compile.
715 */
716 int index = _mesa_add_state_reference(this->prog->Parameters,
717 (gl_state_index *)slots[i].tokens);
718 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
719
720 this->uniform_vector_size[this->uniforms] = 0;
721 /* Add each of the unique swizzled channels of the element.
722 * This will end up matching the size of the glsl_type of this field.
723 */
724 int last_swiz = -1;
725 for (unsigned int j = 0; j < 4; j++) {
726 int swiz = GET_SWZ(slots[i].swizzle, j);
727 last_swiz = swiz;
728
729 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
730 if (swiz <= last_swiz)
731 this->uniform_vector_size[this->uniforms]++;
732 }
733 this->uniforms++;
734 }
735 }
736
737 dst_reg *
738 vec4_visitor::variable_storage(ir_variable *var)
739 {
740 return (dst_reg *)hash_table_find(this->variable_ht, var);
741 }
742
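/**
 * Emit instructions whose only purpose is to set the flag register from the
 * boolean rvalue ir, and return in *predicate the predication mode (normal
 * or ALIGN16 ANY4H/ALL4H) that a following predicated instruction should use.
 */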
743 void
744 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
745 {
746 ir_expression *expr = ir->as_expression();
747
748 *predicate = BRW_PREDICATE_NORMAL;
749
750 if (expr) {
751 src_reg op[2];
752 vec4_instruction *inst;
753
754 assert(expr->get_num_operands() <= 2);
755 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
756 expr->operands[i]->accept(this);
757 op[i] = this->result;
758
759 resolve_ud_negate(&op[i]);
760 }
761
762 switch (expr->operation) {
763 case ir_unop_logic_not:
764 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
765 inst->conditional_mod = BRW_CONDITIONAL_Z;
766 break;
767
768 case ir_binop_logic_xor:
769 inst = emit(XOR(dst_null_d(), op[0], op[1]));
770 inst->conditional_mod = BRW_CONDITIONAL_NZ;
771 break;
772
773 case ir_binop_logic_or:
774 inst = emit(OR(dst_null_d(), op[0], op[1]));
775 inst->conditional_mod = BRW_CONDITIONAL_NZ;
776 break;
777
778 case ir_binop_logic_and:
779 inst = emit(AND(dst_null_d(), op[0], op[1]));
780 inst->conditional_mod = BRW_CONDITIONAL_NZ;
781 break;
782
783 case ir_unop_f2b:
784 if (brw->gen >= 6) {
785 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
786 } else {
787 inst = emit(MOV(dst_null_f(), op[0]));
788 inst->conditional_mod = BRW_CONDITIONAL_NZ;
789 }
790 break;
791
792 case ir_unop_i2b:
793 if (brw->gen >= 6) {
794 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
795 } else {
796 inst = emit(MOV(dst_null_d(), op[0]));
797 inst->conditional_mod = BRW_CONDITIONAL_NZ;
798 }
799 break;
800
801 case ir_binop_all_equal:
802 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
803 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
804 break;
805
806 case ir_binop_any_nequal:
807 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
808 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
809 break;
810
811 case ir_unop_any:
812 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
813 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
814 break;
815
816 case ir_binop_greater:
817 case ir_binop_gequal:
818 case ir_binop_less:
819 case ir_binop_lequal:
820 case ir_binop_equal:
821 case ir_binop_nequal:
822 emit(CMP(dst_null_d(), op[0], op[1],
823 brw_conditional_for_comparison(expr->operation)));
824 break;
825
826 default:
827 assert(!"not reached");
828 break;
829 }
830 return;
831 }
832
833 ir->accept(this);
834
835 resolve_ud_negate(&this->result);
836
837 if (brw->gen >= 6) {
838 vec4_instruction *inst = emit(AND(dst_null_d(),
839 this->result, src_reg(1)));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 } else {
842 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 }
846
847 /**
848 * Emit a gen6 IF statement with the comparison folded into the IF
849 * instruction.
850 */
851 void
852 vec4_visitor::emit_if_gen6(ir_if *ir)
853 {
854 ir_expression *expr = ir->condition->as_expression();
855
856 if (expr) {
857 src_reg op[2];
858 dst_reg temp;
859
860 assert(expr->get_num_operands() <= 2);
861 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
862 expr->operands[i]->accept(this);
863 op[i] = this->result;
864 }
865
866 switch (expr->operation) {
867 case ir_unop_logic_not:
868 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
869 return;
870
871 case ir_binop_logic_xor:
872 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
873 return;
874
875 case ir_binop_logic_or:
876 temp = dst_reg(this, glsl_type::bool_type);
877 emit(OR(temp, op[0], op[1]));
878 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
879 return;
880
881 case ir_binop_logic_and:
882 temp = dst_reg(this, glsl_type::bool_type);
883 emit(AND(temp, op[0], op[1]));
884 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
885 return;
886
887 case ir_unop_f2b:
888 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
889 return;
890
891 case ir_unop_i2b:
892 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 return;
894
895 case ir_binop_greater:
896 case ir_binop_gequal:
897 case ir_binop_less:
898 case ir_binop_lequal:
899 case ir_binop_equal:
900 case ir_binop_nequal:
901 emit(IF(op[0], op[1],
902 brw_conditional_for_comparison(expr->operation)));
903 return;
904
905 case ir_binop_all_equal:
906 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
907 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
908 return;
909
910 case ir_binop_any_nequal:
911 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
912 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
913 return;
914
915 case ir_unop_any:
916 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
918 return;
919
920 default:
921 assert(!"not reached");
922 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
923 return;
924 }
925 return;
926 }
927
928 ir->condition->accept(this);
929
930 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
931 }
932
933 void
934 vec4_visitor::visit(ir_variable *ir)
935 {
936 dst_reg *reg = NULL;
937
938 if (variable_storage(ir))
939 return;
940
941 switch (ir->data.mode) {
942 case ir_var_shader_in:
943 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
944 break;
945
946 case ir_var_shader_out:
947 reg = new(mem_ctx) dst_reg(this, ir->type);
948
949 for (int i = 0; i < type_size(ir->type); i++) {
950 output_reg[ir->data.location + i] = *reg;
951 output_reg[ir->data.location + i].reg_offset = i;
952 output_reg[ir->data.location + i].type =
953 brw_type_for_base_type(ir->type->get_scalar_type());
954 output_reg_annotation[ir->data.location + i] = ir->name;
955 }
956 break;
957
958 case ir_var_auto:
959 case ir_var_temporary:
960 reg = new(mem_ctx) dst_reg(this, ir->type);
961 break;
962
963 case ir_var_uniform:
964 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
965
966 /* Thanks to the lower_ubo_reference pass, we will see only
967 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
968 * variables, so no need for them to be in variable_ht.
969 *
970 * Atomic counters take no uniform storage, no need to do
971 * anything here.
972 */
973 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
974 return;
975
976 /* Track how big the whole uniform variable is, in case we need to put a
977 * copy of its data into pull constants for array access.
978 */
979 this->uniform_size[this->uniforms] = type_size(ir->type);
980
981 if (!strncmp(ir->name, "gl_", 3)) {
982 setup_builtin_uniform_values(ir);
983 } else {
984 setup_uniform_values(ir);
985 }
986 break;
987
988 case ir_var_system_value:
989 reg = make_reg_for_system_value(ir);
990 break;
991
992 default:
993 assert(!"not reached");
994 }
995
996 reg->type = brw_type_for_base_type(ir->type);
997 hash_table_insert(this->variable_ht, reg, ir);
998 }
999
1000 void
1001 vec4_visitor::visit(ir_loop *ir)
1002 {
1003 /* We don't want debugging output to print the whole body of the
1004 * loop as the annotation.
1005 */
1006 this->base_ir = NULL;
1007
1008 emit(BRW_OPCODE_DO);
1009
1010 visit_instructions(&ir->body_instructions);
1011
1012 emit(BRW_OPCODE_WHILE);
1013 }
1014
1015 void
1016 vec4_visitor::visit(ir_loop_jump *ir)
1017 {
1018 switch (ir->mode) {
1019 case ir_loop_jump::jump_break:
1020 emit(BRW_OPCODE_BREAK);
1021 break;
1022 case ir_loop_jump::jump_continue:
1023 emit(BRW_OPCODE_CONTINUE);
1024 break;
1025 }
1026 }
1027
1028
1029 void
1030 vec4_visitor::visit(ir_function_signature *ir)
1031 {
1032 assert(0);
1033 (void)ir;
1034 }
1035
1036 void
1037 vec4_visitor::visit(ir_function *ir)
1038 {
1039 /* Ignore function bodies other than main() -- we shouldn't see calls to
1040 * them since they should all be inlined.
1041 */
1042 if (strcmp(ir->name, "main") == 0) {
1043 const ir_function_signature *sig;
1044 exec_list empty;
1045
1046 sig = ir->matching_signature(NULL, &empty);
1047
1048 assert(sig);
1049
1050 visit_instructions(&sig->body);
1051 }
1052 }
1053
1054 bool
1055 vec4_visitor::try_emit_sat(ir_expression *ir)
1056 {
1057 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1058 if (!sat_src)
1059 return false;
1060
1061 sat_src->accept(this);
1062 src_reg src = this->result;
1063
1064 this->result = src_reg(this, ir->type);
1065 vec4_instruction *inst;
1066 inst = emit(MOV(dst_reg(this->result), src));
1067 inst->saturate = true;
1068
1069 return true;
1070 }
1071
1072 bool
1073 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1074 {
1075 /* 3-src instructions were introduced in gen6. */
1076 if (brw->gen < 6)
1077 return false;
1078
1079 /* MAD can only handle floating-point data. */
1080 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1081 return false;
1082
1083 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1084 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1085
1086 if (!mul || mul->operation != ir_binop_mul)
1087 return false;
1088
1089 nonmul->accept(this);
1090 src_reg src0 = fix_3src_operand(this->result);
1091
1092 mul->operands[0]->accept(this);
1093 src_reg src1 = fix_3src_operand(this->result);
1094
1095 mul->operands[1]->accept(this);
1096 src_reg src2 = fix_3src_operand(this->result);
1097
1098 this->result = src_reg(this, ir->type);
1099 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1100
1101 return true;
1102 }
1103
1104 void
1105 vec4_visitor::emit_bool_comparison(unsigned int op,
1106 dst_reg dst, src_reg src0, src_reg src1)
1107 {
1108 /* original gen4 does destination conversion before comparison. */
1109 if (brw->gen < 5)
1110 dst.type = src0.type;
1111
1112 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1113
1114 dst.type = BRW_REGISTER_TYPE_D;
1115 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1116 }
1117
1118 void
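/**
 * Emit a MIN or MAX (selected by conditionalmod): a single conditional SEL
 * on gen6+, or a CMP followed by a predicated SEL on earlier generations.
 */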
1119 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1120 src_reg src0, src_reg src1)
1121 {
1122 vec4_instruction *inst;
1123
1124 if (brw->gen >= 6) {
1125 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1126 inst->conditional_mod = conditionalmod;
1127 } else {
1128 emit(CMP(dst, src0, src1, conditionalmod));
1129
1130 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1131 inst->predicate = BRW_PREDICATE_NORMAL;
1132 }
1133 }
1134
1135 void
1136 vec4_visitor::emit_lrp(const dst_reg &dst,
1137 const src_reg &x, const src_reg &y, const src_reg &a)
1138 {
1139 if (brw->gen >= 6) {
1140 /* Note that the instruction's argument order is reversed from GLSL
1141 * and the IR.
1142 */
1143 emit(LRP(dst,
1144 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1145 } else {
1146 /* Earlier generations don't support three source operations, so we
1147 * need to emit x*(1-a) + y*a.
1148 *
1149 * A better way to do this would be:
1150 * ADD one_minus_a, negate(a), 1.0f
1151 * MUL null, y, a
1152 * MAC dst, x, one_minus_a
1153 * but we would need to support MAC and implicit accumulator.
1154 */
1155 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1156 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1157 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1158 y_times_a.writemask = dst.writemask;
1159 one_minus_a.writemask = dst.writemask;
1160 x_times_one_minus_a.writemask = dst.writemask;
1161
1162 emit(MUL(y_times_a, y, a));
1163 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1164 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1165 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1166 }
1167 }
1168
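/**
 * Returns true if the rvalue is a signed or unsigned integer constant whose
 * value fits in 16 bits. Used below to decide whether an integer multiply
 * can be done with a single MUL instead of the MUL/MACH/MOV sequence.
 */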
1169 static bool
1170 is_16bit_constant(ir_rvalue *rvalue)
1171 {
1172 ir_constant *constant = rvalue->as_constant();
1173 if (!constant)
1174 return false;
1175
1176 if (constant->type != glsl_type::int_type &&
1177 constant->type != glsl_type::uint_type)
1178 return false;
1179
1180 return constant->value.u[0] < (1 << 16);
1181 }
1182
1183 void
1184 vec4_visitor::visit(ir_expression *ir)
1185 {
1186 unsigned int operand;
1187 src_reg op[Elements(ir->operands)];
1188 src_reg result_src;
1189 dst_reg result_dst;
1190 vec4_instruction *inst;
1191
1192 if (try_emit_sat(ir))
1193 return;
1194
1195 if (ir->operation == ir_binop_add) {
1196 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1197 return;
1198 }
1199
1200 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1201 this->result.file = BAD_FILE;
1202 ir->operands[operand]->accept(this);
1203 if (this->result.file == BAD_FILE) {
1204 fprintf(stderr, "Failed to get tree for expression operand:\n");
1205 ir->operands[operand]->fprint(stderr);
1206 exit(1);
1207 }
1208 op[operand] = this->result;
1209
1210 /* Matrix expression operands should have been broken down to vector
1211 * operations already.
1212 */
1213 assert(!ir->operands[operand]->type->is_matrix());
1214 }
1215
1216 int vector_elements = ir->operands[0]->type->vector_elements;
1217 if (ir->operands[1]) {
1218 vector_elements = MAX2(vector_elements,
1219 ir->operands[1]->type->vector_elements);
1220 }
1221
1222 this->result.file = BAD_FILE;
1223
1224 /* Storage for our result. Ideally for an assignment we'd be using
1225 * the actual storage for the result here, instead.
1226 */
1227 result_src = src_reg(this, ir->type);
1228 /* convenience for the emit functions below. */
1229 result_dst = dst_reg(result_src);
1230 /* If nothing special happens, this is the result. */
1231 this->result = result_src;
1232 /* Limit writes to the channels that will be used by result_src later.
1233 * This does limit this temp's use as a temporary for multi-instruction
1234 * sequences.
1235 */
1236 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1237
1238 switch (ir->operation) {
1239 case ir_unop_logic_not:
1240 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1241 * the one's complement of the whole register, not just bit 0.
1242 */
1243 emit(XOR(result_dst, op[0], src_reg(1)));
1244 break;
1245 case ir_unop_neg:
1246 op[0].negate = !op[0].negate;
1247 emit(MOV(result_dst, op[0]));
1248 break;
1249 case ir_unop_abs:
1250 op[0].abs = true;
1251 op[0].negate = false;
1252 emit(MOV(result_dst, op[0]));
1253 break;
1254
1255 case ir_unop_sign:
1256 if (ir->type->is_float()) {
1257 /* AND(val, 0x80000000) gives the sign bit.
1258 *
1259 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1260 * zero.
1261 */
1262 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1263
1264 op[0].type = BRW_REGISTER_TYPE_UD;
1265 result_dst.type = BRW_REGISTER_TYPE_UD;
1266 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1267
1268 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1269 inst->predicate = BRW_PREDICATE_NORMAL;
1270
1271 this->result.type = BRW_REGISTER_TYPE_F;
1272 } else {
1273 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1274 * -> non-negative val generates 0x00000000.
1275 * Predicated OR sets 1 if val is positive.
1276 */
1277 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1278
1279 emit(ASR(result_dst, op[0], src_reg(31)));
1280
1281 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1282 inst->predicate = BRW_PREDICATE_NORMAL;
1283 }
1284 break;
1285
1286 case ir_unop_rcp:
1287 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1288 break;
1289
1290 case ir_unop_exp2:
1291 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1292 break;
1293 case ir_unop_log2:
1294 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1295 break;
1296 case ir_unop_exp:
1297 case ir_unop_log:
1298 assert(!"not reached: should be handled by ir_explog_to_explog2");
1299 break;
1300 case ir_unop_sin:
1301 case ir_unop_sin_reduced:
1302 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1303 break;
1304 case ir_unop_cos:
1305 case ir_unop_cos_reduced:
1306 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1307 break;
1308
1309 case ir_unop_dFdx:
1310 case ir_unop_dFdy:
1311 assert(!"derivatives not valid in vertex shader");
1312 break;
1313
1314 case ir_unop_bitfield_reverse:
1315 emit(BFREV(result_dst, op[0]));
1316 break;
1317 case ir_unop_bit_count:
1318 emit(CBIT(result_dst, op[0]));
1319 break;
1320 case ir_unop_find_msb: {
1321 src_reg temp = src_reg(this, glsl_type::uint_type);
1322
1323 inst = emit(FBH(dst_reg(temp), op[0]));
1324 inst->dst.writemask = WRITEMASK_XYZW;
1325
1326 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1327 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1328 * subtract the result from 31 to convert the MSB count into an LSB count.
1329 */
1330
1331 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1332 temp.swizzle = BRW_SWIZZLE_NOOP;
1333 emit(MOV(result_dst, temp));
1334
1335 src_reg src_tmp = src_reg(result_dst);
1336 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1337
1338 src_tmp.negate = true;
1339 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1340 inst->predicate = BRW_PREDICATE_NORMAL;
1341 break;
1342 }
1343 case ir_unop_find_lsb:
1344 emit(FBL(result_dst, op[0]));
1345 break;
1346
1347 case ir_unop_noise:
1348 assert(!"not reached: should be handled by lower_noise");
1349 break;
1350
1351 case ir_binop_add:
1352 emit(ADD(result_dst, op[0], op[1]));
1353 break;
1354 case ir_binop_sub:
1355 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1356 break;
1357
1358 case ir_binop_mul:
1359 if (brw->gen < 8 && ir->type->is_integer()) {
1360 /* For integer multiplication, the MUL uses the low 16 bits of one of
1361 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1362 * accumulates the contribution of the upper 16 bits of that
1363 * operand. If we can determine that one of the args is in the low
1364 * 16 bits, though, we can just emit a single MUL.
1365 */
1366 if (is_16bit_constant(ir->operands[0])) {
1367 if (brw->gen < 7)
1368 emit(MUL(result_dst, op[0], op[1]));
1369 else
1370 emit(MUL(result_dst, op[1], op[0]));
1371 } else if (is_16bit_constant(ir->operands[1])) {
1372 if (brw->gen < 7)
1373 emit(MUL(result_dst, op[1], op[0]));
1374 else
1375 emit(MUL(result_dst, op[0], op[1]));
1376 } else {
1377 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1378
1379 emit(MUL(acc, op[0], op[1]));
1380 emit(MACH(dst_null_d(), op[0], op[1]));
1381 emit(MOV(result_dst, src_reg(acc)));
1382 }
1383 } else {
1384 emit(MUL(result_dst, op[0], op[1]));
1385 }
1386 break;
1387 case ir_binop_imul_high: {
1388 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1389
1390 emit(MUL(acc, op[0], op[1]));
1391 emit(MACH(result_dst, op[0], op[1]));
1392 break;
1393 }
1394 case ir_binop_div:
1395 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1396 assert(ir->type->is_integer());
1397 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1398 break;
1399 case ir_binop_carry: {
1400 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1401
1402 emit(ADDC(dst_null_ud(), op[0], op[1]));
1403 emit(MOV(result_dst, src_reg(acc)));
1404 break;
1405 }
1406 case ir_binop_borrow: {
1407 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1408
1409 emit(SUBB(dst_null_ud(), op[0], op[1]));
1410 emit(MOV(result_dst, src_reg(acc)));
1411 break;
1412 }
1413 case ir_binop_mod:
1414 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1415 assert(ir->type->is_integer());
1416 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1417 break;
1418
1419 case ir_binop_less:
1420 case ir_binop_greater:
1421 case ir_binop_lequal:
1422 case ir_binop_gequal:
1423 case ir_binop_equal:
1424 case ir_binop_nequal: {
1425 emit(CMP(result_dst, op[0], op[1],
1426 brw_conditional_for_comparison(ir->operation)));
1427 emit(AND(result_dst, result_src, src_reg(0x1)));
1428 break;
1429 }
1430
1431 case ir_binop_all_equal:
1432 /* "==" operator producing a scalar boolean. */
1433 if (ir->operands[0]->type->is_vector() ||
1434 ir->operands[1]->type->is_vector()) {
1435 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1436 emit(MOV(result_dst, src_reg(0)));
1437 inst = emit(MOV(result_dst, src_reg(1)));
1438 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1439 } else {
1440 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1441 emit(AND(result_dst, result_src, src_reg(0x1)));
1442 }
1443 break;
1444 case ir_binop_any_nequal:
1445 /* "!=" operator producing a scalar boolean. */
1446 if (ir->operands[0]->type->is_vector() ||
1447 ir->operands[1]->type->is_vector()) {
1448 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1449
1450 emit(MOV(result_dst, src_reg(0)));
1451 inst = emit(MOV(result_dst, src_reg(1)));
1452 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1453 } else {
1454 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1455 emit(AND(result_dst, result_src, src_reg(0x1)));
1456 }
1457 break;
1458
1459 case ir_unop_any:
1460 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1461 emit(MOV(result_dst, src_reg(0)));
1462
1463 inst = emit(MOV(result_dst, src_reg(1)));
1464 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1465 break;
1466
1467 case ir_binop_logic_xor:
1468 emit(XOR(result_dst, op[0], op[1]));
1469 break;
1470
1471 case ir_binop_logic_or:
1472 emit(OR(result_dst, op[0], op[1]));
1473 break;
1474
1475 case ir_binop_logic_and:
1476 emit(AND(result_dst, op[0], op[1]));
1477 break;
1478
1479 case ir_binop_dot:
1480 assert(ir->operands[0]->type->is_vector());
1481 assert(ir->operands[0]->type == ir->operands[1]->type);
1482 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1483 break;
1484
1485 case ir_unop_sqrt:
1486 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1487 break;
1488 case ir_unop_rsq:
1489 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1490 break;
1491
1492 case ir_unop_bitcast_i2f:
1493 case ir_unop_bitcast_u2f:
1494 this->result = op[0];
1495 this->result.type = BRW_REGISTER_TYPE_F;
1496 break;
1497
1498 case ir_unop_bitcast_f2i:
1499 this->result = op[0];
1500 this->result.type = BRW_REGISTER_TYPE_D;
1501 break;
1502
1503 case ir_unop_bitcast_f2u:
1504 this->result = op[0];
1505 this->result.type = BRW_REGISTER_TYPE_UD;
1506 break;
1507
1508 case ir_unop_i2f:
1509 case ir_unop_i2u:
1510 case ir_unop_u2i:
1511 case ir_unop_u2f:
1512 case ir_unop_b2f:
1513 case ir_unop_b2i:
1514 case ir_unop_f2i:
1515 case ir_unop_f2u:
1516 emit(MOV(result_dst, op[0]));
1517 break;
1518 case ir_unop_f2b:
1519 case ir_unop_i2b: {
1520 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1521 emit(AND(result_dst, result_src, src_reg(1)));
1522 break;
1523 }
1524
1525 case ir_unop_trunc:
1526 emit(RNDZ(result_dst, op[0]));
1527 break;
1528 case ir_unop_ceil:
1529 op[0].negate = !op[0].negate;
1530 inst = emit(RNDD(result_dst, op[0]));
1531 this->result.negate = true;
1532 break;
1533 case ir_unop_floor:
1534 inst = emit(RNDD(result_dst, op[0]));
1535 break;
1536 case ir_unop_fract:
1537 inst = emit(FRC(result_dst, op[0]));
1538 break;
1539 case ir_unop_round_even:
1540 emit(RNDE(result_dst, op[0]));
1541 break;
1542
1543 case ir_binop_min:
1544 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1545 break;
1546 case ir_binop_max:
1547 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1548 break;
1549
1550 case ir_binop_pow:
1551 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1552 break;
1553
1554 case ir_unop_bit_not:
1555 inst = emit(NOT(result_dst, op[0]));
1556 break;
1557 case ir_binop_bit_and:
1558 inst = emit(AND(result_dst, op[0], op[1]));
1559 break;
1560 case ir_binop_bit_xor:
1561 inst = emit(XOR(result_dst, op[0], op[1]));
1562 break;
1563 case ir_binop_bit_or:
1564 inst = emit(OR(result_dst, op[0], op[1]));
1565 break;
1566
1567 case ir_binop_lshift:
1568 inst = emit(SHL(result_dst, op[0], op[1]));
1569 break;
1570
1571 case ir_binop_rshift:
1572 if (ir->type->base_type == GLSL_TYPE_INT)
1573 inst = emit(ASR(result_dst, op[0], op[1]));
1574 else
1575 inst = emit(SHR(result_dst, op[0], op[1]));
1576 break;
1577
1578 case ir_binop_bfm:
1579 emit(BFI1(result_dst, op[0], op[1]));
1580 break;
1581
1582 case ir_binop_ubo_load: {
1583 ir_constant *uniform_block = ir->operands[0]->as_constant();
1584 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1585 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1586 src_reg offset;
1587
1588 /* Now, load the vector from that offset. */
1589 assert(ir->type->is_vector() || ir->type->is_scalar());
1590
1591 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1592 packed_consts.type = result.type;
1593 src_reg surf_index =
1594 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1595 if (const_offset_ir) {
1596 if (brw->gen >= 8) {
1597 /* Store the offset in a GRF so we can send-from-GRF. */
1598 offset = src_reg(this, glsl_type::int_type);
1599 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1600 } else {
1601 /* Immediates are fine on older generations since they'll be moved
1602 * to a (potentially fake) MRF at the generator level.
1603 */
1604 offset = src_reg(const_offset / 16);
1605 }
1606 } else {
1607 offset = src_reg(this, glsl_type::uint_type);
1608 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1609 }
1610
1611 if (brw->gen >= 7) {
1612 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1613 grf_offset.type = offset.type;
1614
1615 emit(MOV(grf_offset, offset));
1616
1617 emit(new(mem_ctx) vec4_instruction(this,
1618 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1619 dst_reg(packed_consts),
1620 surf_index,
1621 src_reg(grf_offset)));
1622 } else {
1623 vec4_instruction *pull =
1624 emit(new(mem_ctx) vec4_instruction(this,
1625 VS_OPCODE_PULL_CONSTANT_LOAD,
1626 dst_reg(packed_consts),
1627 surf_index,
1628 offset));
1629 pull->base_mrf = 14;
1630 pull->mlen = 1;
1631 }
1632
1633 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1634 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1635 const_offset % 16 / 4,
1636 const_offset % 16 / 4,
1637 const_offset % 16 / 4);
1638
1639 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1640 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1641 emit(CMP(result_dst, packed_consts, src_reg(0u),
1642 BRW_CONDITIONAL_NZ));
1643 emit(AND(result_dst, result, src_reg(0x1)));
1644 } else {
1645 emit(MOV(result_dst, packed_consts));
1646 }
1647 break;
1648 }
1649
1650 case ir_binop_vector_extract:
1651 assert(!"should have been lowered by vec_index_to_cond_assign");
1652 break;
1653
1654 case ir_triop_fma:
1655 op[0] = fix_3src_operand(op[0]);
1656 op[1] = fix_3src_operand(op[1]);
1657 op[2] = fix_3src_operand(op[2]);
1658 /* Note that the instruction's argument order is reversed from GLSL
1659 * and the IR.
1660 */
1661 emit(MAD(result_dst, op[2], op[1], op[0]));
1662 break;
1663
1664 case ir_triop_lrp:
1665 emit_lrp(result_dst, op[0], op[1], op[2]);
1666 break;
1667
1668 case ir_triop_csel:
1669 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1670 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1671 inst->predicate = BRW_PREDICATE_NORMAL;
1672 break;
1673
1674 case ir_triop_bfi:
1675 op[0] = fix_3src_operand(op[0]);
1676 op[1] = fix_3src_operand(op[1]);
1677 op[2] = fix_3src_operand(op[2]);
1678 emit(BFI2(result_dst, op[0], op[1], op[2]));
1679 break;
1680
1681 case ir_triop_bitfield_extract:
1682 op[0] = fix_3src_operand(op[0]);
1683 op[1] = fix_3src_operand(op[1]);
1684 op[2] = fix_3src_operand(op[2]);
1685 /* Note that the instruction's argument order is reversed from GLSL
1686 * and the IR.
1687 */
1688 emit(BFE(result_dst, op[2], op[1], op[0]));
1689 break;
1690
1691 case ir_triop_vector_insert:
1692 assert(!"should have been lowered by lower_vector_insert");
1693 break;
1694
1695 case ir_quadop_bitfield_insert:
1696 assert(!"not reached: should be handled by "
1697 "bitfield_insert_to_bfm_bfi\n");
1698 break;
1699
1700 case ir_quadop_vector:
1701 assert(!"not reached: should be handled by lower_quadop_vector");
1702 break;
1703
1704 case ir_unop_pack_half_2x16:
1705 emit_pack_half_2x16(result_dst, op[0]);
1706 break;
1707 case ir_unop_unpack_half_2x16:
1708 emit_unpack_half_2x16(result_dst, op[0]);
1709 break;
1710 case ir_unop_pack_snorm_2x16:
1711 case ir_unop_pack_snorm_4x8:
1712 case ir_unop_pack_unorm_2x16:
1713 case ir_unop_pack_unorm_4x8:
1714 case ir_unop_unpack_snorm_2x16:
1715 case ir_unop_unpack_snorm_4x8:
1716 case ir_unop_unpack_unorm_2x16:
1717 case ir_unop_unpack_unorm_4x8:
1718 assert(!"not reached: should be handled by lower_packing_builtins");
1719 break;
1720 case ir_unop_unpack_half_2x16_split_x:
1721 case ir_unop_unpack_half_2x16_split_y:
1722 case ir_binop_pack_half_2x16_split:
1723 assert(!"not reached: should not occur in vertex shader");
1724 break;
1725 case ir_binop_ldexp:
1726 assert(!"not reached: should be handled by ldexp_to_arith()");
1727 break;
1728 }
1729 }
1730
1731
1732 void
1733 vec4_visitor::visit(ir_swizzle *ir)
1734 {
1735 src_reg src;
1736 int i = 0;
1737 int swizzle[4];
1738
1739 /* Note that this is only swizzles in expressions, not those on the left
1740 * hand side of an assignment, which do write masking. See ir_assignment
1741 * for that.
1742 */
1743
1744 ir->val->accept(this);
1745 src = this->result;
1746 assert(src.file != BAD_FILE);
1747
1748 for (i = 0; i < ir->type->vector_elements; i++) {
1749 switch (i) {
1750 case 0:
1751 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1752 break;
1753 case 1:
1754 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1755 break;
1756 case 2:
1757 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1758 break;
1759 case 3:
1760 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1761 break;
1762 }
1763 }
1764 for (; i < 4; i++) {
1765 /* Replicate the last channel out. */
1766 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1767 }
1768
1769 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1770
1771 this->result = src;
1772 }
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_variable *ir)
1776 {
1777 const struct glsl_type *type = ir->type;
1778 dst_reg *reg = variable_storage(ir->var);
1779
1780 if (!reg) {
1781 fail("Failed to find variable storage for %s\n", ir->var->name);
1782 this->result = src_reg(brw_null_reg());
1783 return;
1784 }
1785
1786 this->result = src_reg(*reg);
1787
1788 /* System values get their swizzle from the dst_reg writemask */
1789 if (ir->var->data.mode == ir_var_system_value)
1790 return;
1791
1792 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1793 this->result.swizzle = swizzle_for_size(type->vector_elements);
1794 }
1795
1796
1797 int
1798 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1799 {
1800 /* Under normal circumstances array elements are stored consecutively, so
1801 * the stride is equal to the size of the array element.
1802 */
1803 return type_size(ir->type);
1804 }
1805
1806
1807 void
1808 vec4_visitor::visit(ir_dereference_array *ir)
1809 {
1810 ir_constant *constant_index;
1811 src_reg src;
1812 int array_stride = compute_array_stride(ir);
1813
1814 constant_index = ir->array_index->constant_expression_value();
1815
1816 ir->array->accept(this);
1817 src = this->result;
1818
1819 if (constant_index) {
1820 src.reg_offset += constant_index->value.i[0] * array_stride;
1821 } else {
1822 /* Variable index array dereference. It eats the "vec4" of the
1823 * base of the array and an index that offsets the Mesa register
1824 * index.
1825 */
1826 ir->array_index->accept(this);
1827
1828 src_reg index_reg;
1829
1830 if (array_stride == 1) {
1831 index_reg = this->result;
1832 } else {
1833 index_reg = src_reg(this, glsl_type::int_type);
1834
1835 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1836 }
1837
1838 if (src.reladdr) {
1839 src_reg temp = src_reg(this, glsl_type::int_type);
1840
1841 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1842
1843 index_reg = temp;
1844 }
1845
1846 src.reladdr = ralloc(mem_ctx, src_reg);
1847 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1848 }
1849
1850 /* If the type is smaller than a vec4, replicate the last channel out. */
1851 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1852 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1853 else
1854 src.swizzle = BRW_SWIZZLE_NOOP;
1855 src.type = brw_type_for_base_type(ir->type);
1856
1857 this->result = src;
1858 }
1859
1860 void
1861 vec4_visitor::visit(ir_dereference_record *ir)
1862 {
1863 unsigned int i;
1864 const glsl_type *struct_type = ir->record->type;
1865 int offset = 0;
1866
1867 ir->record->accept(this);
1868
1869 for (i = 0; i < struct_type->length; i++) {
1870 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1871 break;
1872 offset += type_size(struct_type->fields.structure[i].type);
1873 }
1874
1875 /* If the type is smaller than a vec4, replicate the last channel out. */
1876 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1877 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1878 else
1879 this->result.swizzle = BRW_SWIZZLE_NOOP;
1880 this->result.type = brw_type_for_base_type(ir->type);
1881
1882 this->result.reg_offset += offset;
1883 }
1884
1885 /**
1886 * We want to be careful in assignment setup to hit the actual storage
1887 * instead of potentially using a temporary like we might with the
1888 * ir_dereference handler.
1889 */
1890 static dst_reg
1891 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1892 {
1893 /* The LHS must be a dereference. If the LHS is a variable indexed array
1894 * access of a vector, it must be separated into a series of conditional moves
1895 * before reaching this point (see ir_vec_index_to_cond_assign).
1896 */
1897 assert(ir->as_dereference());
1898 ir_dereference_array *deref_array = ir->as_dereference_array();
1899 if (deref_array) {
1900 assert(!deref_array->array->type->is_vector());
1901 }
1902
1903 /* Use the rvalue deref handler for the most part. We'll ignore
1904 * swizzles in it and write swizzles using writemask, though.
1905 */
1906 ir->accept(v);
1907 return dst_reg(v->result);
1908 }
1909
1910 void
1911 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1912 const struct glsl_type *type, uint32_t predicate)
1913 {
1914 if (type->base_type == GLSL_TYPE_STRUCT) {
1915 for (unsigned int i = 0; i < type->length; i++) {
1916 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1917 }
1918 return;
1919 }
1920
1921 if (type->is_array()) {
1922 for (unsigned int i = 0; i < type->length; i++) {
1923 emit_block_move(dst, src, type->fields.array, predicate);
1924 }
1925 return;
1926 }
1927
1928 if (type->is_matrix()) {
1929 const struct glsl_type *vec_type;
1930
1931 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1932 type->vector_elements, 1);
1933
1934 for (int i = 0; i < type->matrix_columns; i++) {
1935 emit_block_move(dst, src, vec_type, predicate);
1936 }
1937 return;
1938 }
1939
1940 assert(type->is_scalar() || type->is_vector());
1941
1942 dst->type = brw_type_for_base_type(type);
1943 src->type = dst->type;
1944
1945 dst->writemask = (1 << type->vector_elements) - 1;
1946
1947 src->swizzle = swizzle_for_size(type->vector_elements);
1948
1949 vec4_instruction *inst = emit(MOV(*dst, *src));
1950 inst->predicate = predicate;
1951
1952 dst->reg_offset++;
1953 src->reg_offset++;
1954 }
1955
1956
1957 /* If the RHS processing resulted in an instruction generating a
1958 * temporary value, and it would be easy to rewrite the instruction to
1959 * generate its result right into the LHS instead, do so. This ends
1960 * up reliably removing instructions where it can be tricky to do so
1961 * later without real UD chain information.
1962 */
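/* As a rough illustration (hypothetical registers, not from the original
 * source): if the RHS produced "ADD tmp, a, b" and the assignment would
 * otherwise emit "MOV dst, tmp", this rewrites the ADD to write dst
 * directly and the copy MOV is never emitted.
 */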
1963 bool
1964 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1965 dst_reg dst,
1966 src_reg src,
1967 vec4_instruction *pre_rhs_inst,
1968 vec4_instruction *last_rhs_inst)
1969 {
1970 /* This could be supported, but it would take more smarts. */
1971 if (ir->condition)
1972 return false;
1973
1974 if (pre_rhs_inst == last_rhs_inst)
1975 return false; /* No instructions generated to work with. */
1976
1977 /* Make sure the last instruction generated our source reg. */
1978 if (src.file != GRF ||
1979 src.file != last_rhs_inst->dst.file ||
1980 src.reg != last_rhs_inst->dst.reg ||
1981 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1982 src.reladdr ||
1983 src.abs ||
1984 src.negate ||
1985 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1986 return false;
1987
1988 /* Check that the last instruction fully initialized the channels
1989 * we want to use, in the order we want to use them. We could
1990 * potentially reswizzle the operands of many instructions so that
1991 * we could handle out of order channels, but don't yet.
1992 */
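/* For example (hypothetical values): with dst.writemask = .xy, a last
 * instruction that wrote .xyzw through an identity swizzle passes, while
 * a source swizzle of .yxzw would fail the per-channel check below.
 */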
1993
1994 for (unsigned i = 0; i < 4; i++) {
1995 if (dst.writemask & (1 << i)) {
1996 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1997 return false;
1998
1999 if (BRW_GET_SWZ(src.swizzle, i) != i)
2000 return false;
2001 }
2002 }
2003
2004 /* Success! Rewrite the instruction. */
2005 last_rhs_inst->dst.file = dst.file;
2006 last_rhs_inst->dst.reg = dst.reg;
2007 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2008 last_rhs_inst->dst.reladdr = dst.reladdr;
2009 last_rhs_inst->dst.writemask &= dst.writemask;
2010
2011 return true;
2012 }
2013
2014 void
2015 vec4_visitor::visit(ir_assignment *ir)
2016 {
2017 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2018 uint32_t predicate = BRW_PREDICATE_NONE;
2019
2020 if (!ir->lhs->type->is_scalar() &&
2021 !ir->lhs->type->is_vector()) {
2022 ir->rhs->accept(this);
2023 src_reg src = this->result;
2024
2025 if (ir->condition) {
2026 emit_bool_to_cond_code(ir->condition, &predicate);
2027 }
2028
2029 /* emit_block_move doesn't account for swizzles in the source register.
2030 * This should be ok, since the source register is a structure or an
2031 * array, and those can't be swizzled. But double-check to be sure.
2032 */
2033 assert(src.swizzle ==
2034 (ir->rhs->type->is_matrix()
2035 ? swizzle_for_size(ir->rhs->type->vector_elements)
2036 : BRW_SWIZZLE_NOOP));
2037
2038 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2039 return;
2040 }
2041
2042 /* Now we're down to just a scalar/vector with writemasks. */
2043 int i;
2044
2045 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2046 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2047
2048 ir->rhs->accept(this);
2049
2050 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2051
2052 src_reg src = this->result;
2053
2054 int swizzles[4];
2055 int first_enabled_chan = 0;
2056 int src_chan = 0;
2057
2058 assert(ir->lhs->type->is_vector() ||
2059 ir->lhs->type->is_scalar());
2060 dst.writemask = ir->write_mask;
2061
2062 for (int i = 0; i < 4; i++) {
2063 if (dst.writemask & (1 << i)) {
2064 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2065 break;
2066 }
2067 }
2068
2069 /* Swizzle a small RHS vector into the channels being written.
2070 *
2071 * GLSL IR treats write_mask as dictating how many channels are
2072 * present on the RHS, while in our instructions we need to make
2073 * those channels appear in the slots of the vec4 they're written to.
2074 */
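/* For example (a hypothetical case): a vec2 RHS arrives with swizzle
 * .xyyy from swizzle_for_size(2); with write_mask .yz this loop builds
 * swizzles[] = {y, x, y, y}, so dst.y reads src.x, dst.z reads src.y,
 * and the unwritten channels just repeat the first enabled one.
 */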
2075 for (int i = 0; i < 4; i++) {
2076 if (dst.writemask & (1 << i))
2077 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2078 else
2079 swizzles[i] = first_enabled_chan;
2080 }
2081 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2082 swizzles[2], swizzles[3]);
2083
2084 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2085 return;
2086 }
2087
2088 if (ir->condition) {
2089 emit_bool_to_cond_code(ir->condition, &predicate);
2090 }
2091
2092 for (i = 0; i < type_size(ir->lhs->type); i++) {
2093 vec4_instruction *inst = emit(MOV(dst, src));
2094 inst->predicate = predicate;
2095
2096 dst.reg_offset++;
2097 src.reg_offset++;
2098 }
2099 }
2100
2101 void
2102 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2103 {
2104 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2105 foreach_list(node, &ir->components) {
2106 ir_constant *field_value = (ir_constant *)node;
2107
2108 emit_constant_values(dst, field_value);
2109 }
2110 return;
2111 }
2112
2113 if (ir->type->is_array()) {
2114 for (unsigned int i = 0; i < ir->type->length; i++) {
2115 emit_constant_values(dst, ir->array_elements[i]);
2116 }
2117 return;
2118 }
2119
2120 if (ir->type->is_matrix()) {
2121 for (int i = 0; i < ir->type->matrix_columns; i++) {
2122 float *vec = &ir->value.f[i * ir->type->vector_elements];
2123
2124 for (int j = 0; j < ir->type->vector_elements; j++) {
2125 dst->writemask = 1 << j;
2126 dst->type = BRW_REGISTER_TYPE_F;
2127
2128 emit(MOV(*dst, src_reg(vec[j])));
2129 }
2130 dst->reg_offset++;
2131 }
2132 return;
2133 }
2134
2135 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2136
2137 for (int i = 0; i < ir->type->vector_elements; i++) {
2138 if (!(remaining_writemask & (1 << i)))
2139 continue;
2140
2141 dst->writemask = 1 << i;
2142 dst->type = brw_type_for_base_type(ir->type);
2143
2144 /* Find other components that match the one we're about to
2145 * write. Emits fewer instructions for things like vec4(0.5,
2146 * 1.5, 1.5, 1.5).
2147 */
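/* For the vec4(0.5, 1.5, 1.5, 1.5) example above this ends up emitting
 * two MOVs: one writing .x with 0.5 and one writing .yzw with 1.5,
 * rather than four single-channel writes.
 */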
2148 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2149 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2150 if (ir->value.b[i] == ir->value.b[j])
2151 dst->writemask |= (1 << j);
2152 } else {
2153 /* u, i, and f storage all line up, so no need for a
2154 * switch case for comparing each type.
2155 */
2156 if (ir->value.u[i] == ir->value.u[j])
2157 dst->writemask |= (1 << j);
2158 }
2159 }
2160
2161 switch (ir->type->base_type) {
2162 case GLSL_TYPE_FLOAT:
2163 emit(MOV(*dst, src_reg(ir->value.f[i])));
2164 break;
2165 case GLSL_TYPE_INT:
2166 emit(MOV(*dst, src_reg(ir->value.i[i])));
2167 break;
2168 case GLSL_TYPE_UINT:
2169 emit(MOV(*dst, src_reg(ir->value.u[i])));
2170 break;
2171 case GLSL_TYPE_BOOL:
2172 emit(MOV(*dst, src_reg(ir->value.b[i])));
2173 break;
2174 default:
2175 assert(!"Non-float/uint/int/bool constant");
2176 break;
2177 }
2178
2179 remaining_writemask &= ~dst->writemask;
2180 }
2181 dst->reg_offset++;
2182 }
2183
2184 void
2185 vec4_visitor::visit(ir_constant *ir)
2186 {
2187 dst_reg dst = dst_reg(this, ir->type);
2188 this->result = src_reg(dst);
2189
2190 emit_constant_values(&dst, ir);
2191 }
2192
2193 void
2194 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2195 {
2196 ir_dereference *deref = static_cast<ir_dereference *>(
2197 ir->actual_parameters.get_head());
2198 ir_variable *location = deref->variable_referenced();
2199 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2200 location->data.atomic.buffer_index);
2201
2202 /* Calculate the surface offset */
2203 src_reg offset(this, glsl_type::uint_type);
2204 ir_dereference_array *deref_array = deref->as_dereference_array();
2205 if (deref_array) {
2206 deref_array->array_index->accept(this);
2207
2208 src_reg tmp(this, glsl_type::uint_type);
2209 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2210 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2211 } else {
2212 offset = location->data.atomic.offset;
2213 }
2214
2215 /* Emit the appropriate machine instruction */
2216 const char *callee = ir->callee->function_name();
2217 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2218
2219 if (!strcmp("__intrinsic_atomic_read", callee)) {
2220 emit_untyped_surface_read(surf_index, dst, offset);
2221
2222 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2223 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2224 src_reg(), src_reg());
2225
2226 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2227 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2228 src_reg(), src_reg());
2229 }
2230 }
2231
2232 void
2233 vec4_visitor::visit(ir_call *ir)
2234 {
2235 const char *callee = ir->callee->function_name();
2236
2237 if (!strcmp("__intrinsic_atomic_read", callee) ||
2238 !strcmp("__intrinsic_atomic_increment", callee) ||
2239 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2240 visit_atomic_counter_intrinsic(ir);
2241 } else {
2242 assert(!"Unsupported intrinsic.");
2243 }
2244 }
2245
2246 src_reg
2247 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2248 {
2249 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2250 inst->base_mrf = 2;
2251 inst->mlen = 1;
2252 inst->sampler = sampler;
2253 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2254 inst->dst.writemask = WRITEMASK_XYZW;
2255
2256 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2257 int param_base = inst->base_mrf;
2258 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2259 int zero_mask = 0xf & ~coord_mask;
2260
2261 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2262 coordinate));
2263
2264 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2265 src_reg(0)));
2266
2267 emit(inst);
2268 return src_reg(inst->dst);
2269 }
2270
2271 void
2272 vec4_visitor::visit(ir_texture *ir)
2273 {
2274 int sampler =
2275 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2276
2277 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2278 * emitting anything other than setting up the constant result.
2279 */
2280 if (ir->op == ir_tg4) {
2281 ir_constant *chan = ir->lod_info.component->as_constant();
2282 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2283 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2284 dst_reg result(this, ir->type);
2285 this->result = src_reg(result);
2286 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2287 return;
2288 }
2289 }
2290
2291 /* Should be lowered by do_lower_texture_projection */
2292 assert(!ir->projector);
2293
2294 /* Should be lowered */
2295 assert(!ir->offset || !ir->offset->type->is_array());
2296
2297 /* Generate code to compute all the subexpression trees. This has to be
2298 * done before loading any values into MRFs for the sampler message since
2299 * generating these values may involve SEND messages that need the MRFs.
2300 */
2301 src_reg coordinate;
2302 if (ir->coordinate) {
2303 ir->coordinate->accept(this);
2304 coordinate = this->result;
2305 }
2306
2307 src_reg shadow_comparitor;
2308 if (ir->shadow_comparitor) {
2309 ir->shadow_comparitor->accept(this);
2310 shadow_comparitor = this->result;
2311 }
2312
2313 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2314 src_reg offset_value;
2315 if (has_nonconstant_offset) {
2316 ir->offset->accept(this);
2317 offset_value = src_reg(this->result);
2318 }
2319
2320 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2321 src_reg lod, dPdx, dPdy, sample_index, mcs;
2322 switch (ir->op) {
2323 case ir_tex:
2324 lod = src_reg(0.0f);
2325 lod_type = glsl_type::float_type;
2326 break;
2327 case ir_txf:
2328 case ir_txl:
2329 case ir_txs:
2330 ir->lod_info.lod->accept(this);
2331 lod = this->result;
2332 lod_type = ir->lod_info.lod->type;
2333 break;
2334 case ir_query_levels:
2335 lod = src_reg(0);
2336 lod_type = glsl_type::int_type;
2337 break;
2338 case ir_txf_ms:
2339 ir->lod_info.sample_index->accept(this);
2340 sample_index = this->result;
2341 sample_index_type = ir->lod_info.sample_index->type;
2342
2343 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2344 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2345 else
2346 mcs = src_reg(0u);
2347 break;
2348 case ir_txd:
2349 ir->lod_info.grad.dPdx->accept(this);
2350 dPdx = this->result;
2351
2352 ir->lod_info.grad.dPdy->accept(this);
2353 dPdy = this->result;
2354
2355 lod_type = ir->lod_info.grad.dPdx->type;
2356 break;
2357 case ir_txb:
2358 case ir_lod:
2359 case ir_tg4:
2360 break;
2361 }
2362
2363 vec4_instruction *inst = NULL;
2364 switch (ir->op) {
2365 case ir_tex:
2366 case ir_txl:
2367 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2368 break;
2369 case ir_txd:
2370 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2371 break;
2372 case ir_txf:
2373 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2374 break;
2375 case ir_txf_ms:
2376 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2377 break;
2378 case ir_txs:
2379 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2380 break;
2381 case ir_tg4:
2382 if (has_nonconstant_offset)
2383 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2384 else
2385 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2386 break;
2387 case ir_query_levels:
2388 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2389 break;
2390 case ir_txb:
2391 assert(!"TXB is not valid for vertex shaders.");
2392 break;
2393 case ir_lod:
2394 assert(!"LOD is not valid for vertex shaders.");
2395 break;
2396 default:
2397 assert(!"Unrecognized tex op");
2398 }
2399
2400 if (ir->offset != NULL && ir->op != ir_txf)
2401 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2402
2403 /* Stuff the channel select bits in the top of the texture offset */
2404 if (ir->op == ir_tg4)
2405 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2406
2407 /* The message header is necessary for:
2408 * - Gen4 (always)
2409 * - Texel offsets
2410 * - Gather channel selection
2411 * - Sampler indices too large to fit in a 4-bit value.
2412 */
2413 inst->header_present =
2414 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2415 sampler >= 16;
2416 inst->base_mrf = 2;
2417 inst->mlen = inst->header_present + 1; /* always at least one */
2418 inst->sampler = sampler;
2419 inst->dst = dst_reg(this, ir->type);
2420 inst->dst.writemask = WRITEMASK_XYZW;
2421 inst->shadow_compare = ir->shadow_comparitor != NULL;
2422
2423 /* MRF for the first parameter */
2424 int param_base = inst->base_mrf + inst->header_present;
2425
2426 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2427 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2428 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2429 } else {
2430 /* Load the coordinate */
2431 /* FINISHME: gl_clamp_mask and saturate */
2432 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2433 int zero_mask = 0xf & ~coord_mask;
2434
2435 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2436 coordinate));
2437
2438 if (zero_mask != 0) {
2439 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2440 src_reg(0)));
2441 }
2442 /* Load the shadow comparitor */
2443 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2444 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2445 WRITEMASK_X),
2446 shadow_comparitor));
2447 inst->mlen++;
2448 }
2449
2450 /* Load the LOD info */
2451 if (ir->op == ir_tex || ir->op == ir_txl) {
2452 int mrf, writemask;
2453 if (brw->gen >= 5) {
2454 mrf = param_base + 1;
2455 if (ir->shadow_comparitor) {
2456 writemask = WRITEMASK_Y;
2457 /* mlen already incremented */
2458 } else {
2459 writemask = WRITEMASK_X;
2460 inst->mlen++;
2461 }
2462 } else /* brw->gen == 4 */ {
2463 mrf = param_base;
2464 writemask = WRITEMASK_W;
2465 }
2466 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2467 } else if (ir->op == ir_txf) {
2468 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2469 } else if (ir->op == ir_txf_ms) {
2470 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2471 sample_index));
2472 if (brw->gen >= 7)
2473 /* MCS data is in the first channel of `mcs`, but we need to get it into
2474 * the .y channel of the second vec4 of params, so replicate .x across
2475 * the whole vec4 and then mask off everything except .y
2476 */
2477 mcs.swizzle = BRW_SWIZZLE_XXXX;
2478 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2479 mcs));
2480 inst->mlen++;
2481 } else if (ir->op == ir_txd) {
2482 const glsl_type *type = lod_type;
2483
2484 if (brw->gen >= 5) {
2485 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2486 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2487 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2488 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2489 inst->mlen++;
2490
2491 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2492 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2493 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2494 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2495 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2496 inst->mlen++;
2497
2498 if (ir->shadow_comparitor) {
2499 emit(MOV(dst_reg(MRF, param_base + 2,
2500 ir->shadow_comparitor->type, WRITEMASK_Z),
2501 shadow_comparitor));
2502 }
2503 }
2504 } else /* brw->gen == 4 */ {
2505 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2506 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2507 inst->mlen += 2;
2508 }
2509 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2510 if (ir->shadow_comparitor) {
2511 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2512 shadow_comparitor));
2513 }
2514
2515 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2516 offset_value));
2517 inst->mlen++;
2518 }
2519 }
2520
2521 emit(inst);
2522
2523 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2524 * faces * layers, but the spec requires just layers.
2525 */
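/* Hypothetical example: a 6-layer cube map array reports 36 in .z
 * (faces * layers); the INT_QUOTIENT by 6 below brings that back to
 * the 6 layers the spec requires.
 */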
2526 if (ir->op == ir_txs) {
2527 glsl_type const *type = ir->sampler->type;
2528 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2529 type->sampler_array) {
2530 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2531 writemask(inst->dst, WRITEMASK_Z),
2532 src_reg(inst->dst), src_reg(6));
2533 }
2534 }
2535
2536 if (brw->gen == 6 && ir->op == ir_tg4) {
2537 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2538 }
2539
2540 swizzle_result(ir, src_reg(inst->dst), sampler);
2541 }
2542
2543 /**
2544 * Apply workarounds for Gen6 gather with UINT/SINT
2545 */
2546 void
2547 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2548 {
2549 if (!wa)
2550 return;
2551
2552 int width = (wa & WA_8BIT) ? 8 : 16;
2553 dst_reg dst_f = dst;
2554 dst_f.type = BRW_REGISTER_TYPE_F;
2555
2556 /* Convert from UNORM to UINT */
2557 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2558 emit(MOV(dst, src_reg(dst_f)));
2559
2560 if (wa & WA_SIGN) {
2561 /* Reinterpret the UINT value as a signed INT value by
2562 * shifting the sign bit into place, then shifting back
2563 * preserving sign.
2564 */
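/* E.g. for 8-bit gather data (width == 8) this is a SHL by 24 followed
 * by an ASR by 24, i.e. a sign extension from 8 bits to 32.
 */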
2565 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2566 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2567 }
2568 }
2569
2570 /**
2571 * Set up the gather channel based on the swizzle, for gather4.
2572 */
2573 uint32_t
2574 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2575 {
2576 ir_constant *chan = ir->lod_info.component->as_constant();
2577 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2578 switch (swiz) {
2579 case SWIZZLE_X: return 0;
2580 case SWIZZLE_Y:
2581 /* gather4 sampler is broken for green channel on RG32F --
2582 * we must ask for blue instead.
2583 */
2584 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2585 return 2;
2586 return 1;
2587 case SWIZZLE_Z: return 2;
2588 case SWIZZLE_W: return 3;
2589 default:
2590 assert(!"Not reached"); /* zero, one swizzles handled already */
2591 return 0;
2592 }
2593 }
2594
2595 void
2596 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2597 {
2598 int s = key->tex.swizzles[sampler];
2599
2600 this->result = src_reg(this, ir->type);
2601 dst_reg swizzled_result(this->result);
2602
2603 if (ir->op == ir_query_levels) {
2604 /* # levels is in .w */
2605 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2606 emit(MOV(swizzled_result, orig_val));
2607 return;
2608 }
2609
2610 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2611 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2612 emit(MOV(swizzled_result, orig_val));
2613 return;
2614 }
2615
2616
2617 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2618 int swizzle[4] = {0};
2619
2620 for (int i = 0; i < 4; i++) {
2621 switch (GET_SWZ(s, i)) {
2622 case SWIZZLE_ZERO:
2623 zero_mask |= (1 << i);
2624 break;
2625 case SWIZZLE_ONE:
2626 one_mask |= (1 << i);
2627 break;
2628 default:
2629 copy_mask |= (1 << i);
2630 swizzle[i] = GET_SWZ(s, i);
2631 break;
2632 }
2633 }
2634
2635 if (copy_mask) {
2636 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2637 swizzled_result.writemask = copy_mask;
2638 emit(MOV(swizzled_result, orig_val));
2639 }
2640
2641 if (zero_mask) {
2642 swizzled_result.writemask = zero_mask;
2643 emit(MOV(swizzled_result, src_reg(0.0f)));
2644 }
2645
2646 if (one_mask) {
2647 swizzled_result.writemask = one_mask;
2648 emit(MOV(swizzled_result, src_reg(1.0f)));
2649 }
2650 }
2651
2652 void
2653 vec4_visitor::visit(ir_return *ir)
2654 {
2655 assert(!"not reached");
2656 }
2657
2658 void
2659 vec4_visitor::visit(ir_discard *ir)
2660 {
2661 assert(!"not reached");
2662 }
2663
2664 void
2665 vec4_visitor::visit(ir_if *ir)
2666 {
2667 /* Don't point the annotation at the if statement, because then it plus
2668 * the then and else blocks get printed.
2669 */
2670 this->base_ir = ir->condition;
2671
2672 if (brw->gen == 6) {
2673 emit_if_gen6(ir);
2674 } else {
2675 uint32_t predicate;
2676 emit_bool_to_cond_code(ir->condition, &predicate);
2677 emit(IF(predicate));
2678 }
2679
2680 visit_instructions(&ir->then_instructions);
2681
2682 if (!ir->else_instructions.is_empty()) {
2683 this->base_ir = ir->condition;
2684 emit(BRW_OPCODE_ELSE);
2685
2686 visit_instructions(&ir->else_instructions);
2687 }
2688
2689 this->base_ir = ir->condition;
2690 emit(BRW_OPCODE_ENDIF);
2691 }
2692
2693 void
2694 vec4_visitor::visit(ir_emit_vertex *)
2695 {
2696 assert(!"not reached");
2697 }
2698
2699 void
2700 vec4_visitor::visit(ir_end_primitive *)
2701 {
2702 assert(!"not reached");
2703 }
2704
2705 void
2706 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2707 dst_reg dst, src_reg offset,
2708 src_reg src0, src_reg src1)
2709 {
2710 unsigned mlen = 0;
2711
2712 /* Set the atomic operation offset. */
2713 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2714 mlen++;
2715
2716 /* Set the atomic operation arguments. */
2717 if (src0.file != BAD_FILE) {
2718 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2719 mlen++;
2720 }
2721
2722 if (src1.file != BAD_FILE) {
2723 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2724 mlen++;
2725 }
2726
2727 /* Emit the instruction. Note that this maps to the normal SIMD8
2728 * untyped atomic message on Ivy Bridge, but that's OK because
2729 * unused channels will be masked out.
2730 */
2731 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2732 src_reg(atomic_op), src_reg(surf_index));
2733 inst->base_mrf = 0;
2734 inst->mlen = mlen;
2735 }
2736
2737 void
2738 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2739 src_reg offset)
2740 {
2741 /* Set the surface read offset. */
2742 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2743
2744 /* Emit the instruction. Note that this maps to the normal SIMD8
2745 * untyped surface read message, but that's OK because unused
2746 * channels will be masked out.
2747 */
2748 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2749 dst, src_reg(surf_index));
2750 inst->base_mrf = 0;
2751 inst->mlen = 1;
2752 }
2753
2754 void
2755 vec4_visitor::emit_ndc_computation()
2756 {
2757 /* Get the position */
2758 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2759
2760 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2761 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2762 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2763
2764 current_annotation = "NDC";
2765 dst_reg ndc_w = ndc;
2766 ndc_w.writemask = WRITEMASK_W;
2767 src_reg pos_w = pos;
2768 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2769 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2770
2771 dst_reg ndc_xyz = ndc;
2772 ndc_xyz.writemask = WRITEMASK_XYZ;
2773
2774 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2775 }
2776
2777 void
2778 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2779 {
2780 if (brw->gen < 6 &&
2781 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2782 key->userclip_active || brw->has_negative_rhw_bug)) {
2783 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2784 dst_reg header1_w = header1;
2785 header1_w.writemask = WRITEMASK_W;
2786
2787 emit(MOV(header1, 0u));
2788
2789 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2790 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2791
2792 current_annotation = "Point size";
2793 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2794 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2795 }
2796
2797 if (key->userclip_active) {
2798 current_annotation = "Clipping flags";
2799 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2800 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2801
2802 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2803 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2804 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2805
2806 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2807 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2808 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2809 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2810 }
2811
2812 /* i965 clipping workaround:
2813 * 1) Test for negative RHW
2814 * 2) If set,
2815 * set ndc = (0,0,0,0)
2816 * set ucp[6] = 1
2817 *
2818 * Later, clipping will detect ucp[6] and ensure the primitive is
2819 * clipped against all fixed planes.
2820 */
2821 if (brw->has_negative_rhw_bug) {
2822 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2823 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2824 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2825 vec4_instruction *inst;
2826 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2827 inst->predicate = BRW_PREDICATE_NORMAL;
2828 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2829 inst->predicate = BRW_PREDICATE_NORMAL;
2830 }
2831
2832 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2833 } else if (brw->gen < 6) {
2834 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2835 } else {
2836 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2837 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2838 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2839 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2840 }
2841 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2842 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2843 src_reg(output_reg[VARYING_SLOT_LAYER])));
2844 }
2845 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2846 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2847 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2848 }
2849 }
2850 }
2851
2852 void
2853 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2854 {
2855 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2856 *
2857 * "If a linked set of shaders forming the vertex stage contains no
2858 * static write to gl_ClipVertex or gl_ClipDistance, but the
2859 * application has requested clipping against user clip planes through
2860 * the API, then the coordinate written to gl_Position is used for
2861 * comparison against the user clip planes."
2862 *
2863 * This function is only called if the shader didn't write to
2864 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2865 * if the user wrote to it; otherwise we use gl_Position.
2866 */
2867 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2868 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2869 clip_vertex = VARYING_SLOT_POS;
2870 }
2871
2872 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2873 ++i) {
2874 reg.writemask = 1 << i;
2875 emit(DP4(reg,
2876 src_reg(output_reg[clip_vertex]),
2877 src_reg(this->userplane[i + offset])));
2878 }
2879 }
2880
2881 void
2882 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2883 {
2884 assert (varying < VARYING_SLOT_MAX);
2885 reg.type = output_reg[varying].type;
2886 current_annotation = output_reg_annotation[varying];
2887 /* Copy the register, saturating if necessary */
2888 vec4_instruction *inst = emit(MOV(reg,
2889 src_reg(output_reg[varying])));
2890 if ((varying == VARYING_SLOT_COL0 ||
2891 varying == VARYING_SLOT_COL1 ||
2892 varying == VARYING_SLOT_BFC0 ||
2893 varying == VARYING_SLOT_BFC1) &&
2894 key->clamp_vertex_color) {
2895 inst->saturate = true;
2896 }
2897 }
2898
2899 void
2900 vec4_visitor::emit_urb_slot(int mrf, int varying)
2901 {
2902 struct brw_reg hw_reg = brw_message_reg(mrf);
2903 dst_reg reg = dst_reg(MRF, mrf);
2904 reg.type = BRW_REGISTER_TYPE_F;
2905
2906 switch (varying) {
2907 case VARYING_SLOT_PSIZ:
2908 /* PSIZ is always in slot 0, and is coupled with other flags. */
2909 current_annotation = "indices, point width, clip flags";
2910 emit_psiz_and_flags(hw_reg);
2911 break;
2912 case BRW_VARYING_SLOT_NDC:
2913 current_annotation = "NDC";
2914 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2915 break;
2916 case VARYING_SLOT_POS:
2917 current_annotation = "gl_Position";
2918 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2919 break;
2920 case VARYING_SLOT_EDGE:
2921 /* This is present when doing unfilled polygons. We're supposed to copy
2922 * the edge flag from the user-provided vertex array
2923 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2924 * of that attribute (starts as 1.0f). This is then used in clipping to
2925 * determine which edges should be drawn as wireframe.
2926 */
2927 current_annotation = "edge flag";
2928 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2929 glsl_type::float_type, WRITEMASK_XYZW))));
2930 break;
2931 case BRW_VARYING_SLOT_PAD:
2932 /* No need to write to this slot */
2933 break;
2934 default:
2935 emit_generic_urb_slot(reg, varying);
2936 break;
2937 }
2938 }
2939
2940 static int
2941 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2942 {
2943 if (brw->gen >= 6) {
2944 /* URB data written (does not include the message header reg) must
2945 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2946 * section 5.4.3.2.2: URB_INTERLEAVED.
2947 *
2948 * URB entries are allocated on a multiple of 1024 bits, so an
2949 * extra 128 bits written here to make the end align to 256 is
2950 * no problem.
2951 */
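/* The mlen passed in counts the message header register too, so forcing
 * it to be odd leaves an even number of data registers after the header
 * (e.g. a hypothetical mlen of 4 becomes 5: header plus 4 data regs).
 */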
2952 if ((mlen % 2) != 1)
2953 mlen++;
2954 }
2955
2956 return mlen;
2957 }
2958
2959
2960 /**
2961 * Generates the VUE payload plus the necessary URB write instructions to
2962 * output it.
2963 *
2964 * The VUE layout is documented in Volume 2a.
2965 */
2966 void
2967 vec4_visitor::emit_vertex()
2968 {
2969 /* MRF 0 is reserved for the debugger, so start with message header
2970 * in MRF 1.
2971 */
2972 int base_mrf = 1;
2973 int mrf = base_mrf;
2974 /* In the process of generating our URB write message contents, we
2975 * may need to unspill a register or load from an array. Those
2976 * reads would use MRFs 14-15.
2977 */
2978 int max_usable_mrf = 13;
2979
2980 /* The following assertion verifies that max_usable_mrf causes an
2981 * even-numbered amount of URB write data, which will meet gen6's
2982 * requirements for length alignment.
2983 */
2984 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2985
2986 /* First mrf is the g0-based message header containing URB handles and
2987 * such.
2988 */
2989 emit_urb_write_header(mrf++);
2990
2991 if (brw->gen < 6) {
2992 emit_ndc_computation();
2993 }
2994
2995 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
2996 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2997 current_annotation = "user clip distances";
2998
2999 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3000 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3001
3002 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3003 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3004 }
3005
3006 /* We may need to split this up into several URB writes, so do them in a
3007 * loop.
3008 */
3009 int slot = 0;
3010 bool complete = false;
3011 do {
3012 /* URB offset is in URB row increments, and each of our MRFs is half of
3013 * one of those, since we're doing interleaved writes.
3014 */
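/* E.g. a hypothetical second URB write resuming at slot 6 would start
 * at URB row offset 3.
 */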
3015 int offset = slot / 2;
3016
3017 mrf = base_mrf + 1;
3018 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3019 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3020
3021 /* If this was max_usable_mrf, we can't fit anything more into this
3022 * URB WRITE.
3023 */
3024 if (mrf > max_usable_mrf) {
3025 slot++;
3026 break;
3027 }
3028 }
3029
3030 complete = slot >= prog_data->vue_map.num_slots;
3031 current_annotation = "URB write";
3032 vec4_instruction *inst = emit_urb_write_opcode(complete);
3033 inst->base_mrf = base_mrf;
3034 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3035 inst->offset += offset;
3036 } while(!complete);
3037 }
3038
3039
3040 src_reg
3041 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3042 src_reg *reladdr, int reg_offset)
3043 {
3044 /* Because we store the values to scratch interleaved like our
3045 * vertex data, we need to scale the vec4 index by 2.
3046 */
3047 int message_header_scale = 2;
3048
3049 /* Pre-gen6, the message header uses byte offsets instead of vec4
3050 * (16-byte) offset units.
3051 */
3052 if (brw->gen < 6)
3053 message_header_scale *= 16;
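/* For example, a hypothetical reg_offset of 3 becomes scratch offset 6
 * (interleaved vec4 rows) on Gen6+, or 96 bytes on older generations.
 */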
3054
3055 if (reladdr) {
3056 src_reg index = src_reg(this, glsl_type::int_type);
3057
3058 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3059 emit_before(inst, MUL(dst_reg(index),
3060 index, src_reg(message_header_scale)));
3061
3062 return index;
3063 } else {
3064 return src_reg(reg_offset * message_header_scale);
3065 }
3066 }
3067
3068 src_reg
3069 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3070 src_reg *reladdr, int reg_offset)
3071 {
3072 if (reladdr) {
3073 src_reg index = src_reg(this, glsl_type::int_type);
3074
3075 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3076
3077 /* Pre-gen6, the message header uses byte offsets instead of vec4
3078 * (16-byte) offset units.
3079 */
3080 if (brw->gen < 6) {
3081 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3082 }
3083
3084 return index;
3085 } else if (brw->gen >= 8) {
3086 /* Store the offset in a GRF so we can send-from-GRF. */
3087 src_reg offset = src_reg(this, glsl_type::int_type);
3088 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3089 return offset;
3090 } else {
3091 int message_header_scale = brw->gen < 6 ? 16 : 1;
3092 return src_reg(reg_offset * message_header_scale);
3093 }
3094 }
3095
3096 /**
3097 * Emits an instruction before @inst to load the value named by @orig_src
3098 * from scratch space at @base_offset to @temp.
3099 *
3100 * @base_offset is measured in 32-byte units (the size of a register).
3101 */
3102 void
3103 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3104 dst_reg temp, src_reg orig_src,
3105 int base_offset)
3106 {
3107 int reg_offset = base_offset + orig_src.reg_offset;
3108 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3109
3110 emit_before(inst, SCRATCH_READ(temp, index));
3111 }
3112
3113 /**
3114 * Emits an instruction after @inst to store the value to be written
3115 * to @orig_dst to scratch space at @base_offset, from @temp.
3116 *
3117 * @base_offset is measured in 32-byte units (the size of a register).
3118 */
3119 void
3120 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3121 {
3122 int reg_offset = base_offset + inst->dst.reg_offset;
3123 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3124
3125 /* Create a temporary register to store *inst's result in.
3126 *
3127 * We have to be careful in MOVing from our temporary result register in
3128 * the scratch write. If we swizzle from channels of the temporary that
3129 * weren't initialized, it will confuse live interval analysis, which will
3130 * make spilling fail to make progress.
3131 */
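/* E.g. if the instruction only writes .y, the temporary is read back
 * with a .yyyy swizzle below, so no never-written channel is sourced.
 */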
3132 src_reg temp = src_reg(this, glsl_type::vec4_type);
3133 temp.type = inst->dst.type;
3134 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3135 int swizzles[4];
3136 for (int i = 0; i < 4; i++)
3137 if (inst->dst.writemask & (1 << i))
3138 swizzles[i] = i;
3139 else
3140 swizzles[i] = first_writemask_chan;
3141 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3142 swizzles[2], swizzles[3]);
3143
3144 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3145 inst->dst.writemask));
3146 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3147 write->predicate = inst->predicate;
3148 write->ir = inst->ir;
3149 write->annotation = inst->annotation;
3150 inst->insert_after(write);
3151
3152 inst->dst.file = temp.file;
3153 inst->dst.reg = temp.reg;
3154 inst->dst.reg_offset = temp.reg_offset;
3155 inst->dst.reladdr = NULL;
3156 }
3157
3158 /**
3159 * We can't generally support array access in GRF space, because a
3160 * single instruction's destination can only span 2 contiguous
3161 * registers. So, we send all GRF arrays that get variable index
3162 * access to scratch space.
3163 */
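/* A hypothetical shader temporary such as "vec4 a[4]; ... a[i] = v;" is
 * the kind of variably-indexed GRF array this pass relocates: every
 * access through a reladdr becomes a scratch read or write below.
 */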
3164 void
3165 vec4_visitor::move_grf_array_access_to_scratch()
3166 {
3167 int scratch_loc[this->virtual_grf_count];
3168
3169 for (int i = 0; i < this->virtual_grf_count; i++) {
3170 scratch_loc[i] = -1;
3171 }
3172
3173 /* First, calculate the set of virtual GRFs that need to be punted
3174 * to scratch due to having any array access on them, and where in
3175 * scratch.
3176 */
3177 foreach_list(node, &this->instructions) {
3178 vec4_instruction *inst = (vec4_instruction *)node;
3179
3180 if (inst->dst.file == GRF && inst->dst.reladdr &&
3181 scratch_loc[inst->dst.reg] == -1) {
3182 scratch_loc[inst->dst.reg] = c->last_scratch;
3183 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3184 }
3185
3186 for (int i = 0 ; i < 3; i++) {
3187 src_reg *src = &inst->src[i];
3188
3189 if (src->file == GRF && src->reladdr &&
3190 scratch_loc[src->reg] == -1) {
3191 scratch_loc[src->reg] = c->last_scratch;
3192 c->last_scratch += this->virtual_grf_sizes[src->reg];
3193 }
3194 }
3195 }
3196
3197 /* Now, for anything that will be accessed through scratch, rewrite
3198 * it to load/store. Note that this is a _safe list walk, because
3199 * we may generate a new scratch_write instruction after the one
3200 * we're processing.
3201 */
3202 foreach_list_safe(node, &this->instructions) {
3203 vec4_instruction *inst = (vec4_instruction *)node;
3204
3205 /* Set up the annotation tracking for newly generated instructions. */
3206 base_ir = inst->ir;
3207 current_annotation = inst->annotation;
3208
3209 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3210 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3211 }
3212
3213 for (int i = 0 ; i < 3; i++) {
3214 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3215 continue;
3216
3217 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3218
3219 emit_scratch_read(inst, temp, inst->src[i],
3220 scratch_loc[inst->src[i].reg]);
3221
3222 inst->src[i].file = temp.file;
3223 inst->src[i].reg = temp.reg;
3224 inst->src[i].reg_offset = temp.reg_offset;
3225 inst->src[i].reladdr = NULL;
3226 }
3227 }
3228 }
3229
3230 /**
3231 * Emits an instruction before @inst to load the value named by @orig_src
3232 * from the pull constant buffer (surface) at @base_offset to @temp.
3233 */
3234 void
3235 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3236 dst_reg temp, src_reg orig_src,
3237 int base_offset)
3238 {
3239 int reg_offset = base_offset + orig_src.reg_offset;
3240 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3241 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3242 vec4_instruction *load;
3243
3244 if (brw->gen >= 7) {
3245 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3246 grf_offset.type = offset.type;
3247 emit_before(inst, MOV(grf_offset, offset));
3248
3249 load = new(mem_ctx) vec4_instruction(this,
3250 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3251 temp, index, src_reg(grf_offset));
3252 } else {
3253 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3254 temp, index, offset);
3255 load->base_mrf = 14;
3256 load->mlen = 1;
3257 }
3258 emit_before(inst, load);
3259 }
3260
3261 /**
3262 * Implements array access of uniforms by inserting a
3263 * PULL_CONSTANT_LOAD instruction.
3264 *
3265 * Unlike temporary GRF array access (where we don't support it due to
3266 * the difficulty of doing relative addressing on instruction
3267 * destinations), we could potentially do array access of uniforms
3268 * that were loaded in GRF space as push constants. In real-world
3269 * usage we've seen, though, the arrays being used are always larger
3270 * than we could load as push constants, so just always move all
3271 * uniform array access out to a pull constant buffer.
3272 */
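/* E.g. a hypothetical "uniform vec4 big[128]; ... big[i]" access gets its
 * values copied into pull_param[] here and is reloaded at run time via
 * emit_pull_constant_load() instead of occupying push constant space.
 */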
3273 void
3274 vec4_visitor::move_uniform_array_access_to_pull_constants()
3275 {
3276 int pull_constant_loc[this->uniforms];
3277
3278 for (int i = 0; i < this->uniforms; i++) {
3279 pull_constant_loc[i] = -1;
3280 }
3281
3282 /* Walk through and find array access of uniforms. Put a copy of that
3283 * uniform in the pull constant buffer.
3284 *
3285 * Note that we don't move constant-indexed accesses to arrays. No
3286 * testing has been done of the performance impact of this choice.
3287 */
3288 foreach_list_safe(node, &this->instructions) {
3289 vec4_instruction *inst = (vec4_instruction *)node;
3290
3291 for (int i = 0 ; i < 3; i++) {
3292 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3293 continue;
3294
3295 int uniform = inst->src[i].reg;
3296
3297 /* If this array isn't already present in the pull constant buffer,
3298 * add it.
3299 */
3300 if (pull_constant_loc[uniform] == -1) {
3301 const float **values = &stage_prog_data->param[uniform * 4];
3302
3303 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3304
3305 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3306 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3307 = values[j];
3308 }
3309 }
3310
3311 /* Set up the annotation tracking for newly generated instructions. */
3312 base_ir = inst->ir;
3313 current_annotation = inst->annotation;
3314
3315 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3316
3317 emit_pull_constant_load(inst, temp, inst->src[i],
3318 pull_constant_loc[uniform]);
3319
3320 inst->src[i].file = temp.file;
3321 inst->src[i].reg = temp.reg;
3322 inst->src[i].reg_offset = temp.reg_offset;
3323 inst->src[i].reladdr = NULL;
3324 }
3325 }
3326
3327 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3328 * no need to track them as larger-than-vec4 objects. This will be
3329 * relied on in cutting out unused uniform vectors from push
3330 * constants.
3331 */
3332 split_uniform_registers();
3333 }
3334
3335 void
3336 vec4_visitor::resolve_ud_negate(src_reg *reg)
3337 {
3338 if (reg->type != BRW_REGISTER_TYPE_UD ||
3339 !reg->negate)
3340 return;
3341
3342 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3343 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3344 *reg = temp;
3345 }
3346
3347 vec4_visitor::vec4_visitor(struct brw_context *brw,
3348 struct brw_vec4_compile *c,
3349 struct gl_program *prog,
3350 const struct brw_vec4_prog_key *key,
3351 struct brw_vec4_prog_data *prog_data,
3352 struct gl_shader_program *shader_prog,
3353 struct brw_shader *shader,
3354 void *mem_ctx,
3355 bool debug_flag,
3356 bool no_spills,
3357 shader_time_shader_type st_base,
3358 shader_time_shader_type st_written,
3359 shader_time_shader_type st_reset)
3360 : sanity_param_count(0),
3361 fail_msg(NULL),
3362 first_non_payload_grf(0),
3363 need_all_constants_in_pull_buffer(false),
3364 debug_flag(debug_flag),
3365 no_spills(no_spills),
3366 st_base(st_base),
3367 st_written(st_written),
3368 st_reset(st_reset)
3369 {
3370 this->brw = brw;
3371 this->ctx = &brw->ctx;
3372 this->shader_prog = shader_prog;
3373 this->shader = shader;
3374
3375 this->mem_ctx = mem_ctx;
3376 this->failed = false;
3377
3378 this->base_ir = NULL;
3379 this->current_annotation = NULL;
3380 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3381
3382 this->c = c;
3383 this->prog = prog;
3384 this->key = key;
3385 this->prog_data = prog_data;
3386 this->stage_prog_data = &prog_data->base;
3387
3388 this->variable_ht = hash_table_ctor(0,
3389 hash_table_pointer_hash,
3390 hash_table_pointer_compare);
3391
3392 this->virtual_grf_start = NULL;
3393 this->virtual_grf_end = NULL;
3394 this->virtual_grf_sizes = NULL;
3395 this->virtual_grf_count = 0;
3396 this->virtual_grf_reg_map = NULL;
3397 this->virtual_grf_reg_count = 0;
3398 this->virtual_grf_array_size = 0;
3399 this->live_intervals_valid = false;
3400
3401 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3402
3403 this->uniforms = 0;
3404
3405 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3406 * at least one. See setup_uniforms() in brw_vec4.cpp.
3407 */
3408 this->uniform_array_size = 1;
3409 if (prog_data) {
3410 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3411 }
3412
3413 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3414 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3415 }
3416
3417 vec4_visitor::~vec4_visitor()
3418 {
3419 hash_table_dtor(this->variable_ht);
3420 }
3421
3422
3423 void
3424 vec4_visitor::fail(const char *format, ...)
3425 {
3426 va_list va;
3427 char *msg;
3428
3429 if (failed)
3430 return;
3431
3432 failed = true;
3433
3434 va_start(va, format);
3435 msg = ralloc_vasprintf(mem_ctx, format, va);
3436 va_end(va);
3437 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3438
3439 this->fail_msg = msg;
3440
3441 if (debug_flag) {
3442 fprintf(stderr, "%s", msg);
3443 }
3444 }
3445
3446 } /* namespace brw */