i965/vec4: Optimize unpackSnorm4x8().
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
70 vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(block, new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
82 const src_reg &src1, const src_reg &src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* Original Gen4 hardware does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
313 return src;
314
315 /* The gen6 math instruction ignores the source modifiers --
316 * swizzle, abs, negate, and at least some parts of the register
317 * region description.
318 *
319 * Rather than trying to enumerate all these cases, *always* expand the
320 * operand to a temp GRF for gen6.
321 *
322 * For gen7, keep the operand as-is, except if immediate, which gen7 still
323 * can't use.
324 */
325
326 if (brw->gen == 7 && src.file != IMM)
327 return src;
328
329 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
330 expanded.type = src.type;
331 emit(MOV(expanded, src));
332 return src_reg(expanded);
333 }
334
335 void
336 vec4_visitor::emit_math(enum opcode opcode,
337 const dst_reg &dst,
338 const src_reg &src0, const src_reg &src1)
339 {
340 vec4_instruction *math =
341 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342
343 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
344 /* MATH on Gen6 must be align1, so we can't do writemasks. */
345 math->dst = dst_reg(this, glsl_type::vec4_type);
346 math->dst.type = dst.type;
347 emit(MOV(dst, src_reg(math->dst)));
348 } else if (brw->gen < 6) {
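/* Pre-Gen6, MATH is a message to the shared math unit, so record the
* MRF base and message length (one register per source present) for the
* generator to build the send from.
*/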
349 math->base_mrf = 1;
350 math->mlen = src1.file == BAD_FILE ? 1 : 2;
351 }
352 }
353
354 void
355 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
356 {
357 if (brw->gen < 7) {
358 unreachable("ir_unop_pack_half_2x16 should be lowered");
359 }
360
361 assert(dst.type == BRW_REGISTER_TYPE_UD);
362 assert(src0.type == BRW_REGISTER_TYPE_F);
363
364 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
365 *
366 * Because this instruction does not have a 16-bit floating-point type,
367 * the destination data type must be Word (W).
368 *
369 * The destination must be DWord-aligned and specify a horizontal stride
370 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
371 * each destination channel and the upper word is not modified.
372 *
373 * The above restriction implies that the f32to16 instruction must use
374 * align1 mode, because only in align1 mode is it possible to specify
375 * horizontal stride. We choose here to defy the hardware docs and emit
376 * align16 instructions.
377 *
378 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
379 * instructions. I was partially successful in that the code passed all
380 * tests. However, the code was dubiously correct and fragile, and the
381 * tests were not harsh enough to probe that frailty. Not trusting the
382 * code, I chose instead to remain in align16 mode in defiance of the hw
383 * docs).
384 *
385 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
386 * simulator, emitting a f32to16 in align16 mode with UD as destination
387 * data type is safe. The behavior differs from that specified in the PRM
388 * in that the upper word of each destination channel is cleared to 0.
389 */
390
391 dst_reg tmp_dst(this, glsl_type::uvec2_type);
392 src_reg tmp_src(tmp_dst);
393
394 #if 0
395 /* Verify the undocumented behavior on which the following instructions
396 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
397 * then the result of the bit-or instruction below will be incorrect.
398 *
399 * You should inspect the disasm output in order to verify that the MOV is
400 * not optimized away.
401 */
402 emit(MOV(tmp_dst, src_reg(0x12345678u)));
403 #endif
404
405 /* Give tmp the form below, where "." means untouched.
406 *
407 * w z y x w z y x
408 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
409 *
410 * That the upper word of each write-channel be 0 is required for the
411 * following bit-shift and bit-or instructions to work. Note that this
412 * relies on the undocumented hardware behavior mentioned above.
413 */
414 tmp_dst.writemask = WRITEMASK_XY;
415 emit(F32TO16(tmp_dst, src0));
416
417 /* Give the write-channels of dst the form:
418 * 0xhhhh0000
419 */
420 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
421 emit(SHL(dst, tmp_src, src_reg(16u)));
422
423 /* Finally, give the write-channels of dst the form of packHalf2x16's
424 * output:
425 * 0xhhhhllll
426 */
427 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
428 emit(OR(dst, src_reg(dst), tmp_src));
429 }
430
431 void
432 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7) {
435 unreachable("ir_unop_unpack_half_2x16 should be lowered");
436 }
437
438 assert(dst.type == BRW_REGISTER_TYPE_F);
439 assert(src0.type == BRW_REGISTER_TYPE_UD);
440
441 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
442 *
443 * Because this instruction does not have a 16-bit floating-point type,
444 * the source data type must be Word (W). The destination type must be
445 * F (Float).
446 *
447 * To use W as the source data type, we must adjust horizontal strides,
448 * which is only possible in align1 mode. All my [chadv] attempts at
449 * emitting align1 instructions for unpackHalf2x16 failed to pass the
450 * Piglit tests, so I gave up.
451 *
452 * I've verified that, on gen7 hardware and the simulator, it is safe to
453 * emit f16to32 in align16 mode with UD as source data type.
454 */
455
456 dst_reg tmp_dst(this, glsl_type::uvec2_type);
457 src_reg tmp_src(tmp_dst);
458
459 tmp_dst.writemask = WRITEMASK_X;
460 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
461
462 tmp_dst.writemask = WRITEMASK_Y;
463 emit(SHR(tmp_dst, src0, src_reg(16u)));
464
465 dst.writemask = WRITEMASK_XY;
466 emit(F16TO32(dst, tmp_src));
467 }
468
469 void
470 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
471 {
472 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
473 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
474 * is not suitable to generate the shift values, but we can use the packed
475 * vector float and a type-converting MOV.
476 */
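/* These four bytes form a packed "vector float" (VF) immediate: in that
* 8-bit restricted-float encoding 0x00, 0x60, 0x70 and 0x78 are 0.0, 8.0,
* 16.0 and 24.0, so the type-converting MOV materializes the shift counts
* <0, 8, 16, 24> as unsigned integers.
*/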
477 dst_reg shift(this, glsl_type::uvec4_type);
478 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
479
480 dst_reg shifted(this, glsl_type::uvec4_type);
481 src0.swizzle = BRW_SWIZZLE_XXXX;
482 emit(SHR(shifted, src0, src_reg(shift)));
483
484 shifted.type = BRW_REGISTER_TYPE_UB;
485 dst_reg f(this, glsl_type::vec4_type);
486 emit(MOV(f, src_reg(shifted)));
487
488 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
489 }
490
491 void
492 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
493 {
494 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
495 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
496 * is not suitable to generate the shift values, but we can use the packed
497 * vector float and a type-converting MOV.
498 */
499 dst_reg shift(this, glsl_type::uvec4_type);
500 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
501
502 dst_reg shifted(this, glsl_type::uvec4_type);
503 src0.swizzle = BRW_SWIZZLE_XXXX;
504 emit(SHR(shifted, src0, src_reg(shift)));
505
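/* Unlike the unorm variant, read the bytes back as signed (type B) so the
* converting MOV produces values in [-128, 127] rather than [0, 255].
*/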
506 shifted.type = BRW_REGISTER_TYPE_B;
507 dst_reg f(this, glsl_type::vec4_type);
508 emit(MOV(f, src_reg(shifted)));
509
510 dst_reg scaled(this, glsl_type::vec4_type);
511 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
512
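/* unpackSnorm4x8() is defined as clamp(b / 127.0, -1.0, +1.0): the byte
* 0x80 scales to -128/127, slightly below -1.0, so clamp the result to
* [-1, 1] with a max followed by a min.
*/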
513 dst_reg max(this, glsl_type::vec4_type);
514 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
515 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
516 }
517
518 void
519 vec4_visitor::visit_instructions(const exec_list *list)
520 {
521 foreach_in_list(ir_instruction, ir, list) {
522 base_ir = ir;
523 ir->accept(this);
524 }
525 }
526
527
528 static int
529 type_size(const struct glsl_type *type)
530 {
531 unsigned int i;
532 int size;
533
534 switch (type->base_type) {
535 case GLSL_TYPE_UINT:
536 case GLSL_TYPE_INT:
537 case GLSL_TYPE_FLOAT:
538 case GLSL_TYPE_BOOL:
539 if (type->is_matrix()) {
540 return type->matrix_columns;
541 } else {
542 /* Regardless of the size of the vector, it gets a vec4. This is bad
543 * packing for things like floats, but otherwise arrays become a
544 * mess. Hopefully a later pass over the code can pack scalars
545 * down if appropriate.
546 */
547 return 1;
548 }
549 case GLSL_TYPE_ARRAY:
550 assert(type->length > 0);
551 return type_size(type->fields.array) * type->length;
552 case GLSL_TYPE_STRUCT:
553 size = 0;
554 for (i = 0; i < type->length; i++) {
555 size += type_size(type->fields.structure[i].type);
556 }
557 return size;
558 case GLSL_TYPE_SAMPLER:
559 /* Samplers take up no register space, since they're baked in at
560 * link time.
561 */
562 return 0;
563 case GLSL_TYPE_ATOMIC_UINT:
564 return 0;
565 case GLSL_TYPE_IMAGE:
566 case GLSL_TYPE_VOID:
567 case GLSL_TYPE_ERROR:
568 case GLSL_TYPE_INTERFACE:
569 unreachable("not reached");
570 }
571
572 return 0;
573 }
574
575 int
576 vec4_visitor::virtual_grf_alloc(int size)
577 {
578 if (virtual_grf_array_size <= virtual_grf_count) {
579 if (virtual_grf_array_size == 0)
580 virtual_grf_array_size = 16;
581 else
582 virtual_grf_array_size *= 2;
583 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
584 virtual_grf_array_size);
585 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
586 virtual_grf_array_size);
587 }
588 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
589 virtual_grf_reg_count += size;
590 virtual_grf_sizes[virtual_grf_count] = size;
591 return virtual_grf_count++;
592 }
593
594 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
595 {
596 init();
597
598 this->file = GRF;
599 this->reg = v->virtual_grf_alloc(type_size(type));
600
601 if (type->is_array() || type->is_record()) {
602 this->swizzle = BRW_SWIZZLE_NOOP;
603 } else {
604 this->swizzle = swizzle_for_size(type->vector_elements);
605 }
606
607 this->type = brw_type_for_base_type(type);
608 }
609
610 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
611 {
612 assert(size > 0);
613
614 init();
615
616 this->file = GRF;
617 this->reg = v->virtual_grf_alloc(type_size(type) * size);
618
619 this->swizzle = BRW_SWIZZLE_NOOP;
620
621 this->type = brw_type_for_base_type(type);
622 }
623
624 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->virtual_grf_alloc(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->writemask = WRITEMASK_XYZW;
633 } else {
634 this->writemask = (1 << type->vector_elements) - 1;
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 /* Our support for uniforms is piggy-backed on the struct
641 * gl_program, because that's where the values actually
642 * get stored, rather than in some global gl_shader_program uniform
643 * store.
644 */
645 void
646 vec4_visitor::setup_uniform_values(ir_variable *ir)
647 {
648 int namelen = strlen(ir->name);
649
650 /* The data for our (non-builtin) uniforms is stored in a series of
651 * gl_uniform_driver_storage structs for each subcomponent that
652 * glGetUniformLocation() could name. We know it's been set up in the same
653 * order we'd walk the type, so walk the list of storage and find anything
654 * with our name, or the prefix of a component that starts with our name.
655 */
656 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
657 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
658
659 if (strncmp(ir->name, storage->name, namelen) != 0 ||
660 (storage->name[namelen] != 0 &&
661 storage->name[namelen] != '.' &&
662 storage->name[namelen] != '[')) {
663 continue;
664 }
665
666 gl_constant_value *components = storage->storage;
667 unsigned vector_count = (MAX2(storage->array_elements, 1) *
668 storage->type->matrix_columns);
669
670 for (unsigned s = 0; s < vector_count; s++) {
671 assert(uniforms < uniform_array_size);
672 uniform_vector_size[uniforms] = storage->type->vector_elements;
673
674 int i;
675 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
676 stage_prog_data->param[uniforms * 4 + i] = components;
677 components++;
678 }
679 for (; i < 4; i++) {
680 static gl_constant_value zero = { 0.0 };
681 stage_prog_data->param[uniforms * 4 + i] = &zero;
682 }
683
684 uniforms++;
685 }
686 }
687 }
688
689 void
690 vec4_visitor::setup_uniform_clipplane_values()
691 {
692 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
693
694 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
695 assert(this->uniforms < uniform_array_size);
696 this->uniform_vector_size[this->uniforms] = 4;
697 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
698 this->userplane[i].type = BRW_REGISTER_TYPE_F;
699 for (int j = 0; j < 4; ++j) {
700 stage_prog_data->param[this->uniforms * 4 + j] =
701 (gl_constant_value *) &clip_planes[i][j];
702 }
703 ++this->uniforms;
704 }
705 }
706
707 /* Our support for builtin uniforms is even scarier than non-builtin.
708 * It sits on top of the PROG_STATE_VAR parameters that are
709 * automatically updated from GL context state.
710 */
711 void
712 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
713 {
714 const ir_state_slot *const slots = ir->get_state_slots();
715 assert(slots != NULL);
716
717 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
718 /* This state reference has already been set up by ir_to_mesa,
719 * but we'll get the same index back here. We can reference
720 * ParameterValues directly, since unlike brw_fs.cpp, we never
721 * add new state references during compile.
722 */
723 int index = _mesa_add_state_reference(this->prog->Parameters,
724 (gl_state_index *)slots[i].tokens);
725 gl_constant_value *values =
726 &this->prog->Parameters->ParameterValues[index][0];
727
728 assert(this->uniforms < uniform_array_size);
729 this->uniform_vector_size[this->uniforms] = 0;
730 /* Add each of the unique swizzled channels of the element.
731 * This will end up matching the size of the glsl_type of this field.
732 */
733 int last_swiz = -1;
734 for (unsigned int j = 0; j < 4; j++) {
735 int swiz = GET_SWZ(slots[i].swizzle, j);
736 last_swiz = swiz;
737
738 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
739 assert(this->uniforms < uniform_array_size);
740 if (swiz <= last_swiz)
741 this->uniform_vector_size[this->uniforms]++;
742 }
743 this->uniforms++;
744 }
745 }
746
747 dst_reg *
748 vec4_visitor::variable_storage(ir_variable *var)
749 {
750 return (dst_reg *)hash_table_find(this->variable_ht, var);
751 }
752
753 void
754 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
755 enum brw_predicate *predicate)
756 {
757 ir_expression *expr = ir->as_expression();
758
759 *predicate = BRW_PREDICATE_NORMAL;
760
761 if (expr && expr->operation != ir_binop_ubo_load) {
762 src_reg op[3];
763 vec4_instruction *inst;
764
765 assert(expr->get_num_operands() <= 3);
766 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
767 expr->operands[i]->accept(this);
768 op[i] = this->result;
769
770 resolve_ud_negate(&op[i]);
771 }
772
773 switch (expr->operation) {
774 case ir_unop_logic_not:
775 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
776 inst->conditional_mod = BRW_CONDITIONAL_Z;
777 break;
778
779 case ir_binop_logic_xor:
780 inst = emit(XOR(dst_null_d(), op[0], op[1]));
781 inst->conditional_mod = BRW_CONDITIONAL_NZ;
782 break;
783
784 case ir_binop_logic_or:
785 inst = emit(OR(dst_null_d(), op[0], op[1]));
786 inst->conditional_mod = BRW_CONDITIONAL_NZ;
787 break;
788
789 case ir_binop_logic_and:
790 inst = emit(AND(dst_null_d(), op[0], op[1]));
791 inst->conditional_mod = BRW_CONDITIONAL_NZ;
792 break;
793
794 case ir_unop_f2b:
795 if (brw->gen >= 6) {
796 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
797 } else {
798 inst = emit(MOV(dst_null_f(), op[0]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 }
801 break;
802
803 case ir_unop_i2b:
804 if (brw->gen >= 6) {
805 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
806 } else {
807 inst = emit(MOV(dst_null_d(), op[0]));
808 inst->conditional_mod = BRW_CONDITIONAL_NZ;
809 }
810 break;
811
812 case ir_binop_all_equal:
813 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
814 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
815 break;
816
817 case ir_binop_any_nequal:
818 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
819 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
820 break;
821
822 case ir_unop_any:
823 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
825 break;
826
827 case ir_binop_greater:
828 case ir_binop_gequal:
829 case ir_binop_less:
830 case ir_binop_lequal:
831 case ir_binop_equal:
832 case ir_binop_nequal:
833 emit(CMP(dst_null_d(), op[0], op[1],
834 brw_conditional_for_comparison(expr->operation)));
835 break;
836
837 case ir_triop_csel: {
838 /* Expand the boolean condition into the flag register. */
839 inst = emit(MOV(dst_null_d(), op[0]));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841
842 /* Select which boolean to return. */
843 dst_reg temp(this, expr->operands[1]->type);
844 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
845 inst->predicate = BRW_PREDICATE_NORMAL;
846
847 /* Expand the result to a condition code. */
848 inst = emit(MOV(dst_null_d(), src_reg(temp)));
849 inst->conditional_mod = BRW_CONDITIONAL_NZ;
850 break;
851 }
852
853 default:
854 unreachable("not reached");
855 }
856 return;
857 }
858
859 ir->accept(this);
860
861 resolve_ud_negate(&this->result);
862
863 if (brw->gen >= 6) {
864 vec4_instruction *inst = emit(AND(dst_null_d(),
865 this->result, src_reg(1)));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 } else {
868 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
869 inst->conditional_mod = BRW_CONDITIONAL_NZ;
870 }
871 }
872
873 /**
874 * Emit a gen6 IF statement with the comparison folded into the IF
875 * instruction.
876 */
877 void
878 vec4_visitor::emit_if_gen6(ir_if *ir)
879 {
880 ir_expression *expr = ir->condition->as_expression();
881
882 if (expr && expr->operation != ir_binop_ubo_load) {
883 src_reg op[3];
884 dst_reg temp;
885
886 assert(expr->get_num_operands() <= 3);
887 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
888 expr->operands[i]->accept(this);
889 op[i] = this->result;
890 }
891
892 switch (expr->operation) {
893 case ir_unop_logic_not:
894 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
895 return;
896
897 case ir_binop_logic_xor:
898 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
899 return;
900
901 case ir_binop_logic_or:
902 temp = dst_reg(this, glsl_type::bool_type);
903 emit(OR(temp, op[0], op[1]));
904 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
905 return;
906
907 case ir_binop_logic_and:
908 temp = dst_reg(this, glsl_type::bool_type);
909 emit(AND(temp, op[0], op[1]));
910 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
911 return;
912
913 case ir_unop_f2b:
914 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
915 return;
916
917 case ir_unop_i2b:
918 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
919 return;
920
921 case ir_binop_greater:
922 case ir_binop_gequal:
923 case ir_binop_less:
924 case ir_binop_lequal:
925 case ir_binop_equal:
926 case ir_binop_nequal:
927 emit(IF(op[0], op[1],
928 brw_conditional_for_comparison(expr->operation)));
929 return;
930
931 case ir_binop_all_equal:
932 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
933 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
934 return;
935
936 case ir_binop_any_nequal:
937 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
938 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
939 return;
940
941 case ir_unop_any:
942 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
943 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
944 return;
945
946 case ir_triop_csel: {
947 /* Expand the boolean condition into the flag register. */
948 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
949 inst->conditional_mod = BRW_CONDITIONAL_NZ;
950
951 /* Select which boolean to return. */
952 dst_reg temp(this, expr->operands[1]->type);
953 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
954 inst->predicate = BRW_PREDICATE_NORMAL;
955
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958 }
959
960 default:
961 unreachable("not reached");
962 }
963 return;
964 }
965
966 ir->condition->accept(this);
967
968 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
969 }
970
971 void
972 vec4_visitor::visit(ir_variable *ir)
973 {
974 dst_reg *reg = NULL;
975
976 if (variable_storage(ir))
977 return;
978
979 switch (ir->data.mode) {
980 case ir_var_shader_in:
981 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
982 break;
983
984 case ir_var_shader_out:
985 reg = new(mem_ctx) dst_reg(this, ir->type);
986
987 for (int i = 0; i < type_size(ir->type); i++) {
988 output_reg[ir->data.location + i] = *reg;
989 output_reg[ir->data.location + i].reg_offset = i;
990 output_reg[ir->data.location + i].type =
991 brw_type_for_base_type(ir->type->get_scalar_type());
992 output_reg_annotation[ir->data.location + i] = ir->name;
993 }
994 break;
995
996 case ir_var_auto:
997 case ir_var_temporary:
998 reg = new(mem_ctx) dst_reg(this, ir->type);
999 break;
1000
1001 case ir_var_uniform:
1002 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1003
1004 /* Thanks to the lower_ubo_reference pass, we will see only
1005 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1006 * variables, so no need for them to be in variable_ht.
1007 *
1008 * Some uniforms, such as samplers and atomic counters, have no actual
1009 * storage, so we should ignore them.
1010 */
1011 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1012 return;
1013
1014 /* Track how big the whole uniform variable is, in case we need to put a
1015 * copy of its data into pull constants for array access.
1016 */
1017 assert(this->uniforms < uniform_array_size);
1018 this->uniform_size[this->uniforms] = type_size(ir->type);
1019
1020 if (!strncmp(ir->name, "gl_", 3)) {
1021 setup_builtin_uniform_values(ir);
1022 } else {
1023 setup_uniform_values(ir);
1024 }
1025 break;
1026
1027 case ir_var_system_value:
1028 reg = make_reg_for_system_value(ir);
1029 break;
1030
1031 default:
1032 unreachable("not reached");
1033 }
1034
1035 reg->type = brw_type_for_base_type(ir->type);
1036 hash_table_insert(this->variable_ht, reg, ir);
1037 }
1038
1039 void
1040 vec4_visitor::visit(ir_loop *ir)
1041 {
1042 /* We don't want debugging output to print the whole body of the
1043 * loop as the annotation.
1044 */
1045 this->base_ir = NULL;
1046
1047 emit(BRW_OPCODE_DO);
1048
1049 visit_instructions(&ir->body_instructions);
1050
1051 emit(BRW_OPCODE_WHILE);
1052 }
1053
1054 void
1055 vec4_visitor::visit(ir_loop_jump *ir)
1056 {
1057 switch (ir->mode) {
1058 case ir_loop_jump::jump_break:
1059 emit(BRW_OPCODE_BREAK);
1060 break;
1061 case ir_loop_jump::jump_continue:
1062 emit(BRW_OPCODE_CONTINUE);
1063 break;
1064 }
1065 }
1066
1067
1068 void
1069 vec4_visitor::visit(ir_function_signature *)
1070 {
1071 unreachable("not reached");
1072 }
1073
1074 void
1075 vec4_visitor::visit(ir_function *ir)
1076 {
1077 /* Ignore function bodies other than main() -- we shouldn't see calls to
1078 * them since they should all be inlined.
1079 */
1080 if (strcmp(ir->name, "main") == 0) {
1081 const ir_function_signature *sig;
1082 exec_list empty;
1083
1084 sig = ir->matching_signature(NULL, &empty, false);
1085
1086 assert(sig);
1087
1088 visit_instructions(&sig->body);
1089 }
1090 }
1091
1092 bool
1093 vec4_visitor::try_emit_mad(ir_expression *ir)
1094 {
1095 /* 3-src instructions were introduced in gen6. */
1096 if (brw->gen < 6)
1097 return false;
1098
1099 /* MAD can only handle floating-point data. */
1100 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1101 return false;
1102
1103 ir_rvalue *nonmul = ir->operands[1];
1104 ir_expression *mul = ir->operands[0]->as_expression();
1105
1106 if (!mul || mul->operation != ir_binop_mul) {
1107 nonmul = ir->operands[0];
1108 mul = ir->operands[1]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul)
1111 return false;
1112 }
1113
1114 nonmul->accept(this);
1115 src_reg src0 = fix_3src_operand(this->result);
1116
1117 mul->operands[0]->accept(this);
1118 src_reg src1 = fix_3src_operand(this->result);
1119
1120 mul->operands[1]->accept(this);
1121 src_reg src2 = fix_3src_operand(this->result);
1122
1123 this->result = src_reg(this, ir->type);
1124 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1125
1126 return true;
1127 }
1128
1129 bool
1130 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1131 {
1132 /* This optimization relies on CMP setting the destination to 0 when
1133 * false. Early hardware only sets the least significant bit, and
1134 * leaves the other bits undefined. So we can't use it.
1135 */
1136 if (brw->gen < 6)
1137 return false;
1138
1139 ir_expression *const cmp = ir->operands[0]->as_expression();
1140
1141 if (cmp == NULL)
1142 return false;
1143
1144 switch (cmp->operation) {
1145 case ir_binop_less:
1146 case ir_binop_greater:
1147 case ir_binop_lequal:
1148 case ir_binop_gequal:
1149 case ir_binop_equal:
1150 case ir_binop_nequal:
1151 break;
1152
1153 default:
1154 return false;
1155 }
1156
1157 cmp->operands[0]->accept(this);
1158 const src_reg cmp_src0 = this->result;
1159
1160 cmp->operands[1]->accept(this);
1161 const src_reg cmp_src1 = this->result;
1162
1163 this->result = src_reg(this, ir->type);
1164
1165 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1166 brw_conditional_for_comparison(cmp->operation)));
1167
1168 /* If the comparison is false, this->result will just happen to be zero.
1169 */
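/* With the inverted predicate, the SEL writes 1.0f on channels where the
* comparison passed and keeps the CMP result (all-zero bits, i.e. 0.0f)
* where it failed.
*/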
1170 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1171 this->result, src_reg(1.0f));
1172 inst->predicate = BRW_PREDICATE_NORMAL;
1173 inst->predicate_inverse = true;
1174
1175 return true;
1176 }
1177
1178 void
1179 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1180 src_reg src0, src_reg src1)
1181 {
1182 vec4_instruction *inst;
1183
1184 if (brw->gen >= 6) {
1185 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1186 inst->conditional_mod = conditionalmod;
1187 } else {
1188 emit(CMP(dst, src0, src1, conditionalmod));
1189
1190 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1191 inst->predicate = BRW_PREDICATE_NORMAL;
1192 }
1193 }
1194
1195 void
1196 vec4_visitor::emit_lrp(const dst_reg &dst,
1197 const src_reg &x, const src_reg &y, const src_reg &a)
1198 {
1199 if (brw->gen >= 6) {
1200 /* Note that the instruction's argument order is reversed from GLSL
1201 * and the IR.
1202 */
1203 emit(LRP(dst,
1204 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1205 } else {
1206 /* Earlier generations don't support three source operations, so we
1207 * need to emit x*(1-a) + y*a.
1208 */
1209 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1210 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1211 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1212 y_times_a.writemask = dst.writemask;
1213 one_minus_a.writemask = dst.writemask;
1214 x_times_one_minus_a.writemask = dst.writemask;
1215
1216 emit(MUL(y_times_a, y, a));
1217 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1218 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1219 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1220 }
1221 }
1222
1223 void
1224 vec4_visitor::visit(ir_expression *ir)
1225 {
1226 unsigned int operand;
1227 src_reg op[Elements(ir->operands)];
1228 vec4_instruction *inst;
1229
1230 if (ir->operation == ir_binop_add) {
1231 if (try_emit_mad(ir))
1232 return;
1233 }
1234
1235 if (ir->operation == ir_unop_b2f) {
1236 if (try_emit_b2f_of_compare(ir))
1237 return;
1238 }
1239
1240 /* Storage for our result. Ideally for an assignment we'd be using
1241 * the actual storage for the result here, instead.
1242 */
1243 dst_reg result_dst(this, ir->type);
1244 src_reg result_src(result_dst);
1245
1246 if (ir->operation == ir_triop_csel) {
1247 ir->operands[1]->accept(this);
1248 op[1] = this->result;
1249 ir->operands[2]->accept(this);
1250 op[2] = this->result;
1251
1252 enum brw_predicate predicate;
1253 emit_bool_to_cond_code(ir->operands[0], &predicate);
1254 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1255 inst->predicate = predicate;
1256 this->result = result_src;
1257 return;
1258 }
1259
1260 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1261 this->result.file = BAD_FILE;
1262 ir->operands[operand]->accept(this);
1263 if (this->result.file == BAD_FILE) {
1264 fprintf(stderr, "Failed to get tree for expression operand:\n");
1265 ir->operands[operand]->fprint(stderr);
1266 exit(1);
1267 }
1268 op[operand] = this->result;
1269
1270 /* Matrix expression operands should have been broken down to vector
1271 * operations already.
1272 */
1273 assert(!ir->operands[operand]->type->is_matrix());
1274 }
1275
1276 /* If nothing special happens, this is the result. */
1277 this->result = result_src;
1278
1279 switch (ir->operation) {
1280 case ir_unop_logic_not:
1281 if (ctx->Const.UniformBooleanTrue != 1) {
1282 emit(NOT(result_dst, op[0]));
1283 } else {
1284 emit(XOR(result_dst, op[0], src_reg(1u)));
1285 }
1286 break;
1287 case ir_unop_neg:
1288 op[0].negate = !op[0].negate;
1289 emit(MOV(result_dst, op[0]));
1290 break;
1291 case ir_unop_abs:
1292 op[0].abs = true;
1293 op[0].negate = false;
1294 emit(MOV(result_dst, op[0]));
1295 break;
1296
1297 case ir_unop_sign:
1298 if (ir->type->is_float()) {
1299 /* AND(val, 0x80000000) gives the sign bit.
1300 *
1301 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1302 * zero.
1303 */
1304 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1305
1306 op[0].type = BRW_REGISTER_TYPE_UD;
1307 result_dst.type = BRW_REGISTER_TYPE_UD;
1308 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1309
1310 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312
1313 this->result.type = BRW_REGISTER_TYPE_F;
1314 } else {
1315 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1316 * -> non-negative val generates 0x00000000.
1317 * Predicated OR sets 1 if val is positive.
1318 */
1319 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1320
1321 emit(ASR(result_dst, op[0], src_reg(31)));
1322
1323 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1324 inst->predicate = BRW_PREDICATE_NORMAL;
1325 }
1326 break;
1327
1328 case ir_unop_rcp:
1329 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1330 break;
1331
1332 case ir_unop_exp2:
1333 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1334 break;
1335 case ir_unop_log2:
1336 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1337 break;
1338 case ir_unop_exp:
1339 case ir_unop_log:
1340 unreachable("not reached: should be handled by ir_explog_to_explog2");
1341 case ir_unop_sin:
1342 case ir_unop_sin_reduced:
1343 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1344 break;
1345 case ir_unop_cos:
1346 case ir_unop_cos_reduced:
1347 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1348 break;
1349
1350 case ir_unop_dFdx:
1351 case ir_unop_dFdx_coarse:
1352 case ir_unop_dFdx_fine:
1353 case ir_unop_dFdy:
1354 case ir_unop_dFdy_coarse:
1355 case ir_unop_dFdy_fine:
1356 unreachable("derivatives not valid in vertex shader");
1357
1358 case ir_unop_bitfield_reverse:
1359 emit(BFREV(result_dst, op[0]));
1360 break;
1361 case ir_unop_bit_count:
1362 emit(CBIT(result_dst, op[0]));
1363 break;
1364 case ir_unop_find_msb: {
1365 src_reg temp = src_reg(this, glsl_type::uint_type);
1366
1367 inst = emit(FBH(dst_reg(temp), op[0]));
1368 inst->dst.writemask = WRITEMASK_XYZW;
1369
1370 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1371 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1372 * subtract the result from 31 to convert the MSB count into an LSB count.
1373 */
1374
1375 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1376 temp.swizzle = BRW_SWIZZLE_NOOP;
1377 emit(MOV(result_dst, temp));
1378
1379 src_reg src_tmp = src_reg(result_dst);
1380 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1381
1382 src_tmp.negate = true;
1383 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1384 inst->predicate = BRW_PREDICATE_NORMAL;
1385 break;
1386 }
1387 case ir_unop_find_lsb:
1388 emit(FBL(result_dst, op[0]));
1389 break;
1390 case ir_unop_saturate:
1391 inst = emit(MOV(result_dst, op[0]));
1392 inst->saturate = true;
1393 break;
1394
1395 case ir_unop_noise:
1396 unreachable("not reached: should be handled by lower_noise");
1397
1398 case ir_binop_add:
1399 emit(ADD(result_dst, op[0], op[1]));
1400 break;
1401 case ir_binop_sub:
1402 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1403
1404 case ir_binop_mul:
1405 if (brw->gen < 8 && ir->type->is_integer()) {
1406 /* For integer multiplication, the MUL uses the low 16 bits of one of
1407 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1408 * accumulates the contribution of the upper 16 bits of that
1409 * operand. If we can determine that one of the args is in the low
1410 * 16 bits, though, we can just emit a single MUL.
1411 */
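/* The checks below place a known 16-bit constant in the width-limited
* source slot (src0 before Gen7, src1 on Gen7 and later) so a single MUL
* suffices.
*/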
1412 if (ir->operands[0]->is_uint16_constant()) {
1413 if (brw->gen < 7)
1414 emit(MUL(result_dst, op[0], op[1]));
1415 else
1416 emit(MUL(result_dst, op[1], op[0]));
1417 } else if (ir->operands[1]->is_uint16_constant()) {
1418 if (brw->gen < 7)
1419 emit(MUL(result_dst, op[1], op[0]));
1420 else
1421 emit(MUL(result_dst, op[0], op[1]));
1422 } else {
1423 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1424
1425 emit(MUL(acc, op[0], op[1]));
1426 emit(MACH(dst_null_d(), op[0], op[1]));
1427 emit(MOV(result_dst, src_reg(acc)));
1428 }
1429 } else {
1430 emit(MUL(result_dst, op[0], op[1]));
1431 }
1432 break;
1433 case ir_binop_imul_high: {
1434 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1435
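/* MUL and MACH together form the full 64-bit product through the
* accumulator; MACH writes the high 32 bits, which is the imul_high
* result.
*/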
1436 emit(MUL(acc, op[0], op[1]));
1437 emit(MACH(result_dst, op[0], op[1]));
1438 break;
1439 }
1440 case ir_binop_div:
1441 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1442 assert(ir->type->is_integer());
1443 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1444 break;
1445 case ir_binop_carry: {
1446 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1447
1448 emit(ADDC(dst_null_ud(), op[0], op[1]));
1449 emit(MOV(result_dst, src_reg(acc)));
1450 break;
1451 }
1452 case ir_binop_borrow: {
1453 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1454
1455 emit(SUBB(dst_null_ud(), op[0], op[1]));
1456 emit(MOV(result_dst, src_reg(acc)));
1457 break;
1458 }
1459 case ir_binop_mod:
1460 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1461 assert(ir->type->is_integer());
1462 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1463 break;
1464
1465 case ir_binop_less:
1466 case ir_binop_greater:
1467 case ir_binop_lequal:
1468 case ir_binop_gequal:
1469 case ir_binop_equal:
1470 case ir_binop_nequal: {
1471 emit(CMP(result_dst, op[0], op[1],
1472 brw_conditional_for_comparison(ir->operation)));
1473 if (ctx->Const.UniformBooleanTrue == 1) {
1474 emit(AND(result_dst, result_src, src_reg(1u)));
1475 }
1476 break;
1477 }
1478
1479 case ir_binop_all_equal:
1480 /* "==" operator producing a scalar boolean. */
1481 if (ir->operands[0]->type->is_vector() ||
1482 ir->operands[1]->type->is_vector()) {
1483 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1484 emit(MOV(result_dst, src_reg(0)));
1485 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1486 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1487 } else {
1488 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1489 if (ctx->Const.UniformBooleanTrue == 1) {
1490 emit(AND(result_dst, result_src, src_reg(1u)));
1491 }
1492 }
1493 break;
1494 case ir_binop_any_nequal:
1495 /* "!=" operator producing a scalar boolean. */
1496 if (ir->operands[0]->type->is_vector() ||
1497 ir->operands[1]->type->is_vector()) {
1498 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1499
1500 emit(MOV(result_dst, src_reg(0)));
1501 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1502 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1503 } else {
1504 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1505 if (ctx->Const.UniformBooleanTrue == 1) {
1506 emit(AND(result_dst, result_src, src_reg(1u)));
1507 }
1508 }
1509 break;
1510
1511 case ir_unop_any:
1512 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1513 emit(MOV(result_dst, src_reg(0)));
1514
1515 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1516 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1517 break;
1518
1519 case ir_binop_logic_xor:
1520 emit(XOR(result_dst, op[0], op[1]));
1521 break;
1522
1523 case ir_binop_logic_or:
1524 emit(OR(result_dst, op[0], op[1]));
1525 break;
1526
1527 case ir_binop_logic_and:
1528 emit(AND(result_dst, op[0], op[1]));
1529 break;
1530
1531 case ir_binop_dot:
1532 assert(ir->operands[0]->type->is_vector());
1533 assert(ir->operands[0]->type == ir->operands[1]->type);
1534 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1535 break;
1536
1537 case ir_unop_sqrt:
1538 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1539 break;
1540 case ir_unop_rsq:
1541 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1542 break;
1543
1544 case ir_unop_bitcast_i2f:
1545 case ir_unop_bitcast_u2f:
1546 this->result = op[0];
1547 this->result.type = BRW_REGISTER_TYPE_F;
1548 break;
1549
1550 case ir_unop_bitcast_f2i:
1551 this->result = op[0];
1552 this->result.type = BRW_REGISTER_TYPE_D;
1553 break;
1554
1555 case ir_unop_bitcast_f2u:
1556 this->result = op[0];
1557 this->result.type = BRW_REGISTER_TYPE_UD;
1558 break;
1559
1560 case ir_unop_i2f:
1561 case ir_unop_i2u:
1562 case ir_unop_u2i:
1563 case ir_unop_u2f:
1564 case ir_unop_f2i:
1565 case ir_unop_f2u:
1566 emit(MOV(result_dst, op[0]));
1567 break;
1568 case ir_unop_b2i:
1569 if (ctx->Const.UniformBooleanTrue != 1) {
1570 emit(AND(result_dst, op[0], src_reg(1u)));
1571 } else {
1572 emit(MOV(result_dst, op[0]));
1573 }
1574 break;
1575 case ir_unop_b2f:
1576 if (ctx->Const.UniformBooleanTrue != 1) {
1577 op[0].type = BRW_REGISTER_TYPE_UD;
1578 result_dst.type = BRW_REGISTER_TYPE_UD;
1579 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1580 result_dst.type = BRW_REGISTER_TYPE_F;
1581 } else {
1582 emit(MOV(result_dst, op[0]));
1583 }
1584 break;
1585 case ir_unop_f2b:
1586 case ir_unop_i2b:
1587 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1588 if (ctx->Const.UniformBooleanTrue == 1) {
1589 emit(AND(result_dst, result_src, src_reg(1u)));
1590 }
1591 break;
1592
1593 case ir_unop_trunc:
1594 emit(RNDZ(result_dst, op[0]));
1595 break;
1596 case ir_unop_ceil:
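/* ceil(x) is computed as -floor(-x): negate the source, round down, and
* negate the result.
*/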
1597 op[0].negate = !op[0].negate;
1598 inst = emit(RNDD(result_dst, op[0]));
1599 this->result.negate = true;
1600 break;
1601 case ir_unop_floor:
1602 inst = emit(RNDD(result_dst, op[0]));
1603 break;
1604 case ir_unop_fract:
1605 inst = emit(FRC(result_dst, op[0]));
1606 break;
1607 case ir_unop_round_even:
1608 emit(RNDE(result_dst, op[0]));
1609 break;
1610
1611 case ir_binop_min:
1612 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1613 break;
1614 case ir_binop_max:
1615 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1616 break;
1617
1618 case ir_binop_pow:
1619 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1620 break;
1621
1622 case ir_unop_bit_not:
1623 inst = emit(NOT(result_dst, op[0]));
1624 break;
1625 case ir_binop_bit_and:
1626 inst = emit(AND(result_dst, op[0], op[1]));
1627 break;
1628 case ir_binop_bit_xor:
1629 inst = emit(XOR(result_dst, op[0], op[1]));
1630 break;
1631 case ir_binop_bit_or:
1632 inst = emit(OR(result_dst, op[0], op[1]));
1633 break;
1634
1635 case ir_binop_lshift:
1636 inst = emit(SHL(result_dst, op[0], op[1]));
1637 break;
1638
1639 case ir_binop_rshift:
1640 if (ir->type->base_type == GLSL_TYPE_INT)
1641 inst = emit(ASR(result_dst, op[0], op[1]));
1642 else
1643 inst = emit(SHR(result_dst, op[0], op[1]));
1644 break;
1645
1646 case ir_binop_bfm:
1647 emit(BFI1(result_dst, op[0], op[1]));
1648 break;
1649
1650 case ir_binop_ubo_load: {
1651 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1652 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1653 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1654 src_reg offset;
1655
1656 /* Now, load the vector from that offset. */
1657 assert(ir->type->is_vector() || ir->type->is_scalar());
1658
1659 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1660 packed_consts.type = result.type;
1661 src_reg surf_index;
1662
1663 if (const_uniform_block) {
1664 /* The block index is a constant, so just emit the binding table entry
1665 * as an immediate.
1666 */
1667 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1668 const_uniform_block->value.u[0]);
1669 } else {
1670 /* The block index is not a constant. Evaluate the index expression
1671 * per-channel and add the base UBO index; the generator will select
1672 * a value from any live channel.
1673 */
1674 surf_index = src_reg(this, glsl_type::uint_type);
1675 emit(ADD(dst_reg(surf_index), op[0],
1676 src_reg(prog_data->base.binding_table.ubo_start)));
1677
1678 /* Assume this may touch any UBO. It would be nice to provide
1679 * a tighter bound, but the array information is already lowered away.
1680 */
1681 brw_mark_surface_used(&prog_data->base,
1682 prog_data->base.binding_table.ubo_start +
1683 shader_prog->NumUniformBlocks - 1);
1684 }
1685
1686 if (const_offset_ir) {
1687 if (brw->gen >= 8) {
1688 /* Store the offset in a GRF so we can send-from-GRF. */
1689 offset = src_reg(this, glsl_type::int_type);
1690 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1691 } else {
1692 /* Immediates are fine on older generations since they'll be moved
1693 * to a (potentially fake) MRF at the generator level.
1694 */
1695 offset = src_reg(const_offset / 16);
1696 }
1697 } else {
1698 offset = src_reg(this, glsl_type::uint_type);
1699 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1700 }
1701
1702 if (brw->gen >= 7) {
1703 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1704 grf_offset.type = offset.type;
1705
1706 emit(MOV(grf_offset, offset));
1707
1708 emit(new(mem_ctx) vec4_instruction(this,
1709 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1710 dst_reg(packed_consts),
1711 surf_index,
1712 src_reg(grf_offset)));
1713 } else {
1714 vec4_instruction *pull =
1715 emit(new(mem_ctx) vec4_instruction(this,
1716 VS_OPCODE_PULL_CONSTANT_LOAD,
1717 dst_reg(packed_consts),
1718 surf_index,
1719 offset));
1720 pull->base_mrf = 14;
1721 pull->mlen = 1;
1722 }
1723
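/* The pull constant load fetches a whole 16-byte-aligned vec4, so offset
* each swizzle channel by the dword position of const_offset within that
* vec4 to pick out the requested components.
*/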
1724 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1725 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1726 const_offset % 16 / 4,
1727 const_offset % 16 / 4,
1728 const_offset % 16 / 4);
1729
1730 /* UBO bools are any nonzero int. We need to convert them to use the
1731 * value of true stored in ctx->Const.UniformBooleanTrue.
1732 */
1733 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1734 emit(CMP(result_dst, packed_consts, src_reg(0u),
1735 BRW_CONDITIONAL_NZ));
1736 if (ctx->Const.UniformBooleanTrue == 1) {
1737 emit(AND(result_dst, result, src_reg(1u)));
1738 }
1739 } else {
1740 emit(MOV(result_dst, packed_consts));
1741 }
1742 break;
1743 }
1744
1745 case ir_binop_vector_extract:
1746 unreachable("should have been lowered by vec_index_to_cond_assign");
1747
1748 case ir_triop_fma:
1749 op[0] = fix_3src_operand(op[0]);
1750 op[1] = fix_3src_operand(op[1]);
1751 op[2] = fix_3src_operand(op[2]);
1752 /* Note that the instruction's argument order is reversed from GLSL
1753 * and the IR.
1754 */
1755 emit(MAD(result_dst, op[2], op[1], op[0]));
1756 break;
1757
1758 case ir_triop_lrp:
1759 emit_lrp(result_dst, op[0], op[1], op[2]);
1760 break;
1761
1762 case ir_triop_csel:
1763 unreachable("already handled above");
1764 break;
1765
1766 case ir_triop_bfi:
1767 op[0] = fix_3src_operand(op[0]);
1768 op[1] = fix_3src_operand(op[1]);
1769 op[2] = fix_3src_operand(op[2]);
1770 emit(BFI2(result_dst, op[0], op[1], op[2]));
1771 break;
1772
1773 case ir_triop_bitfield_extract:
1774 op[0] = fix_3src_operand(op[0]);
1775 op[1] = fix_3src_operand(op[1]);
1776 op[2] = fix_3src_operand(op[2]);
1777 /* Note that the instruction's argument order is reversed from GLSL
1778 * and the IR.
1779 */
1780 emit(BFE(result_dst, op[2], op[1], op[0]));
1781 break;
1782
1783 case ir_triop_vector_insert:
1784 unreachable("should have been lowered by lower_vector_insert");
1785
1786 case ir_quadop_bitfield_insert:
1787 unreachable("not reached: should be handled by "
1788 "bitfield_insert_to_bfm_bfi\n");
1789
1790 case ir_quadop_vector:
1791 unreachable("not reached: should be handled by lower_quadop_vector");
1792
1793 case ir_unop_pack_half_2x16:
1794 emit_pack_half_2x16(result_dst, op[0]);
1795 break;
1796 case ir_unop_unpack_half_2x16:
1797 emit_unpack_half_2x16(result_dst, op[0]);
1798 break;
1799 case ir_unop_unpack_unorm_4x8:
1800 emit_unpack_unorm_4x8(result_dst, op[0]);
1801 break;
1802 case ir_unop_unpack_snorm_4x8:
1803 emit_unpack_snorm_4x8(result_dst, op[0]);
1804 break;
1805 case ir_unop_pack_snorm_2x16:
1806 case ir_unop_pack_snorm_4x8:
1807 case ir_unop_pack_unorm_2x16:
1808 case ir_unop_pack_unorm_4x8:
1809 case ir_unop_unpack_snorm_2x16:
1810 case ir_unop_unpack_unorm_2x16:
1811 unreachable("not reached: should be handled by lower_packing_builtins");
1812 case ir_unop_unpack_half_2x16_split_x:
1813 case ir_unop_unpack_half_2x16_split_y:
1814 case ir_binop_pack_half_2x16_split:
1815 case ir_unop_interpolate_at_centroid:
1816 case ir_binop_interpolate_at_sample:
1817 case ir_binop_interpolate_at_offset:
1818 unreachable("not reached: should not occur in vertex shader");
1819 case ir_binop_ldexp:
1820 unreachable("not reached: should be handled by ldexp_to_arith()");
1821 }
1822 }
1823
1824
1825 void
1826 vec4_visitor::visit(ir_swizzle *ir)
1827 {
1828 src_reg src;
1829 int i = 0;
1830 int swizzle[4];
1831
1832 /* Note that this handles only swizzles in expressions, not those on the left
1833 * hand side of an assignment, which do write masking. See ir_assignment
1834 * for that.
1835 */
1836
1837 ir->val->accept(this);
1838 src = this->result;
1839 assert(src.file != BAD_FILE);
1840
1841 for (i = 0; i < ir->type->vector_elements; i++) {
1842 switch (i) {
1843 case 0:
1844 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1845 break;
1846 case 1:
1847 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1848 break;
1849 case 2:
1850 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1851 break;
1852 case 3:
1853 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1854 break;
1855 }
1856 }
1857 for (; i < 4; i++) {
1858 /* Replicate the last channel out. */
1859 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1860 }
1861
1862 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1863
1864 this->result = src;
1865 }
1866
1867 void
1868 vec4_visitor::visit(ir_dereference_variable *ir)
1869 {
1870 const struct glsl_type *type = ir->type;
1871 dst_reg *reg = variable_storage(ir->var);
1872
1873 if (!reg) {
1874 fail("Failed to find variable storage for %s\n", ir->var->name);
1875 this->result = src_reg(brw_null_reg());
1876 return;
1877 }
1878
1879 this->result = src_reg(*reg);
1880
1881 /* System values get their swizzle from the dst_reg writemask */
1882 if (ir->var->data.mode == ir_var_system_value)
1883 return;
1884
1885 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1886 this->result.swizzle = swizzle_for_size(type->vector_elements);
1887 }
1888
1889
1890 int
1891 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1892 {
1893 /* Under normal circumstances array elements are stored consecutively, so
1894 * the stride is equal to the size of the array element.
1895 */
1896 return type_size(ir->type);
1897 }
1898
1899
1900 void
1901 vec4_visitor::visit(ir_dereference_array *ir)
1902 {
1903 ir_constant *constant_index;
1904 src_reg src;
1905 int array_stride = compute_array_stride(ir);
1906
1907 constant_index = ir->array_index->constant_expression_value();
1908
1909 ir->array->accept(this);
1910 src = this->result;
1911
1912 if (constant_index) {
1913 src.reg_offset += constant_index->value.i[0] * array_stride;
1914 } else {
1915 /* Variable index array dereference. It eats the "vec4" of the
1916 * base of the array and an index that offsets the Mesa register
1917 * index.
1918 */
1919 ir->array_index->accept(this);
1920
1921 src_reg index_reg;
1922
1923 if (array_stride == 1) {
1924 index_reg = this->result;
1925 } else {
1926 index_reg = src_reg(this, glsl_type::int_type);
1927
1928 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1929 }
1930
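/* If the array base was itself reached through a variable index, fold the
 * two indices into a single relative-addressing register.
 */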
1931 if (src.reladdr) {
1932 src_reg temp = src_reg(this, glsl_type::int_type);
1933
1934 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1935
1936 index_reg = temp;
1937 }
1938
1939 src.reladdr = ralloc(mem_ctx, src_reg);
1940 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1941 }
1942
1943 /* If the type is smaller than a vec4, replicate the last channel out. */
1944 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1945 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1946 else
1947 src.swizzle = BRW_SWIZZLE_NOOP;
1948 src.type = brw_type_for_base_type(ir->type);
1949
1950 this->result = src;
1951 }
1952
1953 void
1954 vec4_visitor::visit(ir_dereference_record *ir)
1955 {
1956 unsigned int i;
1957 const glsl_type *struct_type = ir->record->type;
1958 int offset = 0;
1959
1960 ir->record->accept(this);
1961
1962 for (i = 0; i < struct_type->length; i++) {
1963 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1964 break;
1965 offset += type_size(struct_type->fields.structure[i].type);
1966 }
1967
1968 /* If the type is smaller than a vec4, replicate the last channel out. */
1969 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1970 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1971 else
1972 this->result.swizzle = BRW_SWIZZLE_NOOP;
1973 this->result.type = brw_type_for_base_type(ir->type);
1974
1975 this->result.reg_offset += offset;
1976 }
1977
1978 /**
1979 * We want to be careful in assignment setup to hit the actual storage
1980 * instead of potentially using a temporary like we might with the
1981 * ir_dereference handler.
1982 */
1983 static dst_reg
1984 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1985 {
1986 /* The LHS must be a dereference. If the LHS is a variable indexed array
1987 * access of a vector, it must be separated into a series of conditional moves
1988 * before reaching this point (see ir_vec_index_to_cond_assign).
1989 */
1990 assert(ir->as_dereference());
1991 ir_dereference_array *deref_array = ir->as_dereference_array();
1992 if (deref_array) {
1993 assert(!deref_array->array->type->is_vector());
1994 }
1995
1996 /* Use the rvalue deref handler for the most part. We'll ignore
1997 * swizzles in it and write swizzles using writemask, though.
1998 */
1999 ir->accept(v);
2000 return dst_reg(v->result);
2001 }
2002
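/**
 * Copy an aggregate (struct, array, or matrix) value one vec4-sized piece at
 * a time, recursing through the type and advancing reg_offset as it goes.
 */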
2003 void
2004 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2005 const struct glsl_type *type,
2006 enum brw_predicate predicate)
2007 {
2008 if (type->base_type == GLSL_TYPE_STRUCT) {
2009 for (unsigned int i = 0; i < type->length; i++) {
2010 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2011 }
2012 return;
2013 }
2014
2015 if (type->is_array()) {
2016 for (unsigned int i = 0; i < type->length; i++) {
2017 emit_block_move(dst, src, type->fields.array, predicate);
2018 }
2019 return;
2020 }
2021
2022 if (type->is_matrix()) {
2023 const struct glsl_type *vec_type;
2024
2025 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2026 type->vector_elements, 1);
2027
2028 for (int i = 0; i < type->matrix_columns; i++) {
2029 emit_block_move(dst, src, vec_type, predicate);
2030 }
2031 return;
2032 }
2033
2034 assert(type->is_scalar() || type->is_vector());
2035
2036 dst->type = brw_type_for_base_type(type);
2037 src->type = dst->type;
2038
2039 dst->writemask = (1 << type->vector_elements) - 1;
2040
2041 src->swizzle = swizzle_for_size(type->vector_elements);
2042
2043 vec4_instruction *inst = emit(MOV(*dst, *src));
2044 inst->predicate = predicate;
2045
2046 dst->reg_offset++;
2047 src->reg_offset++;
2048 }
2049
2050
2051 /* If the RHS processing resulted in an instruction generating a
2052 * temporary value, and it would be easy to rewrite the instruction to
2053 * generate its result right into the LHS instead, do so. This ends
2054 * up reliably removing instructions where it can be tricky to do so
2055 * later without real UD chain information.
2056 */
2057 bool
2058 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2059 dst_reg dst,
2060 src_reg src,
2061 vec4_instruction *pre_rhs_inst,
2062 vec4_instruction *last_rhs_inst)
2063 {
2064 /* This could be supported, but it would take more smarts. */
2065 if (ir->condition)
2066 return false;
2067
2068 if (pre_rhs_inst == last_rhs_inst)
2069 return false; /* No instructions generated to work with. */
2070
2071 /* Make sure the last instruction generated our source reg. */
2072 if (src.file != GRF ||
2073 src.file != last_rhs_inst->dst.file ||
2074 src.reg != last_rhs_inst->dst.reg ||
2075 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2076 src.reladdr ||
2077 src.abs ||
2078 src.negate ||
2079 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2080 return false;
2081
2082 /* Check that the last instruction fully initialized the channels
2083 * we want to use, in the order we want to use them. We could
2084 * potentially reswizzle the operands of many instructions so that
2085 * we could handle out of order channels, but don't yet.
2086 */
2087
2088 for (unsigned i = 0; i < 4; i++) {
2089 if (dst.writemask & (1 << i)) {
2090 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2091 return false;
2092
2093 if (BRW_GET_SWZ(src.swizzle, i) != i)
2094 return false;
2095 }
2096 }
2097
2098 /* Success! Rewrite the instruction. */
2099 last_rhs_inst->dst.file = dst.file;
2100 last_rhs_inst->dst.reg = dst.reg;
2101 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2102 last_rhs_inst->dst.reladdr = dst.reladdr;
2103 last_rhs_inst->dst.writemask &= dst.writemask;
2104
2105 return true;
2106 }
2107
2108 void
2109 vec4_visitor::visit(ir_assignment *ir)
2110 {
2111 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2112 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2113
2114 if (!ir->lhs->type->is_scalar() &&
2115 !ir->lhs->type->is_vector()) {
2116 ir->rhs->accept(this);
2117 src_reg src = this->result;
2118
2119 if (ir->condition) {
2120 emit_bool_to_cond_code(ir->condition, &predicate);
2121 }
2122
2123 /* emit_block_move doesn't account for swizzles in the source register.
2124 * This should be ok, since the source register is a structure or an
2125 * array, and those can't be swizzled. But double-check to be sure.
2126 */
2127 assert(src.swizzle ==
2128 (ir->rhs->type->is_matrix()
2129 ? swizzle_for_size(ir->rhs->type->vector_elements)
2130 : BRW_SWIZZLE_NOOP));
2131
2132 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2133 return;
2134 }
2135
2136 /* Now we're down to just a scalar/vector with writemasks. */
2137 int i;
2138
2139 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2140 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2141
2142 ir->rhs->accept(this);
2143
2144 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2145
2146 src_reg src = this->result;
2147
2148 int swizzles[4];
2149 int first_enabled_chan = 0;
2150 int src_chan = 0;
2151
2152 assert(ir->lhs->type->is_vector() ||
2153 ir->lhs->type->is_scalar());
2154 dst.writemask = ir->write_mask;
2155
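/* Find the source swizzle of the first channel being written; channels
 * outside the writemask reuse it in the swizzle built below.
 */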
2156 for (int i = 0; i < 4; i++) {
2157 if (dst.writemask & (1 << i)) {
2158 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2159 break;
2160 }
2161 }
2162
2163 /* Swizzle a small RHS vector into the channels being written.
2164 *
2165 * glsl ir treats write_mask as dictating how many channels are
2166 * present on the RHS while in our instructions we need to make
2167 * those channels appear in the slots of the vec4 they're written to.
2168 */
2169 for (int i = 0; i < 4; i++) {
2170 if (dst.writemask & (1 << i))
2171 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2172 else
2173 swizzles[i] = first_enabled_chan;
2174 }
2175 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2176 swizzles[2], swizzles[3]);
2177
2178 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2179 return;
2180 }
2181
2182 if (ir->condition) {
2183 emit_bool_to_cond_code(ir->condition, &predicate);
2184 }
2185
2186 for (i = 0; i < type_size(ir->lhs->type); i++) {
2187 vec4_instruction *inst = emit(MOV(dst, src));
2188 inst->predicate = predicate;
2189
2190 dst.reg_offset++;
2191 src.reg_offset++;
2192 }
2193 }
2194
2195 void
2196 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2197 {
2198 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2199 foreach_in_list(ir_constant, field_value, &ir->components) {
2200 emit_constant_values(dst, field_value);
2201 }
2202 return;
2203 }
2204
2205 if (ir->type->is_array()) {
2206 for (unsigned int i = 0; i < ir->type->length; i++) {
2207 emit_constant_values(dst, ir->array_elements[i]);
2208 }
2209 return;
2210 }
2211
2212 if (ir->type->is_matrix()) {
2213 for (int i = 0; i < ir->type->matrix_columns; i++) {
2214 float *vec = &ir->value.f[i * ir->type->vector_elements];
2215
2216 for (int j = 0; j < ir->type->vector_elements; j++) {
2217 dst->writemask = 1 << j;
2218 dst->type = BRW_REGISTER_TYPE_F;
2219
2220 emit(MOV(*dst, src_reg(vec[j])));
2221 }
2222 dst->reg_offset++;
2223 }
2224 return;
2225 }
2226
2227 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2228
2229 for (int i = 0; i < ir->type->vector_elements; i++) {
2230 if (!(remaining_writemask & (1 << i)))
2231 continue;
2232
2233 dst->writemask = 1 << i;
2234 dst->type = brw_type_for_base_type(ir->type);
2235
2236 /* Find other components that match the one we're about to
2237 * write. Emits fewer instructions for things like vec4(0.5,
2238 * 1.5, 1.5, 1.5).
2239 */
2240 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2241 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2242 if (ir->value.b[i] == ir->value.b[j])
2243 dst->writemask |= (1 << j);
2244 } else {
2245 /* u, i, and f storage all line up, so no need for a
2246 * switch case for comparing each type.
2247 */
2248 if (ir->value.u[i] == ir->value.u[j])
2249 dst->writemask |= (1 << j);
2250 }
2251 }
2252
2253 switch (ir->type->base_type) {
2254 case GLSL_TYPE_FLOAT:
2255 emit(MOV(*dst, src_reg(ir->value.f[i])));
2256 break;
2257 case GLSL_TYPE_INT:
2258 emit(MOV(*dst, src_reg(ir->value.i[i])));
2259 break;
2260 case GLSL_TYPE_UINT:
2261 emit(MOV(*dst, src_reg(ir->value.u[i])));
2262 break;
2263 case GLSL_TYPE_BOOL:
2264 emit(MOV(*dst,
2265 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2266 : 0u)));
2267 break;
2268 default:
2269 unreachable("Non-float/uint/int/bool constant");
2270 }
2271
2272 remaining_writemask &= ~dst->writemask;
2273 }
2274 dst->reg_offset++;
2275 }
2276
2277 void
2278 vec4_visitor::visit(ir_constant *ir)
2279 {
2280 dst_reg dst = dst_reg(this, ir->type);
2281 this->result = src_reg(dst);
2282
2283 emit_constant_values(&dst, ir);
2284 }
2285
2286 void
2287 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2288 {
2289 ir_dereference *deref = static_cast<ir_dereference *>(
2290 ir->actual_parameters.get_head());
2291 ir_variable *location = deref->variable_referenced();
2292 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2293 location->data.binding);
2294
2295 /* Calculate the surface offset */
2296 src_reg offset(this, glsl_type::uint_type);
2297 ir_dereference_array *deref_array = deref->as_dereference_array();
2298 if (deref_array) {
2299 deref_array->array_index->accept(this);
2300
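/* Scale the array index by the size of one counter and add this counter's
 * offset within the buffer.
 */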
2301 src_reg tmp(this, glsl_type::uint_type);
2302 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2303 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2304 } else {
2305 offset = location->data.atomic.offset;
2306 }
2307
2308 /* Emit the appropriate machine instruction */
2309 const char *callee = ir->callee->function_name();
2310 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2311
2312 if (!strcmp("__intrinsic_atomic_read", callee)) {
2313 emit_untyped_surface_read(surf_index, dst, offset);
2314
2315 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2316 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2317 src_reg(), src_reg());
2318
2319 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2320 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2321 src_reg(), src_reg());
2322 }
2323 }
2324
2325 void
2326 vec4_visitor::visit(ir_call *ir)
2327 {
2328 const char *callee = ir->callee->function_name();
2329
2330 if (!strcmp("__intrinsic_atomic_read", callee) ||
2331 !strcmp("__intrinsic_atomic_increment", callee) ||
2332 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2333 visit_atomic_counter_intrinsic(ir);
2334 } else {
2335 unreachable("Unsupported intrinsic.");
2336 }
2337 }
2338
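/**
 * Fetch the MCS (multisample control surface) value for a texel so it can be
 * passed along with the subsequent TXF_CMS message.
 */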
2339 src_reg
2340 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2341 {
2342 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2343 inst->base_mrf = 2;
2344 inst->mlen = 1;
2345 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2346 inst->dst.writemask = WRITEMASK_XYZW;
2347
2348 inst->src[1] = sampler;
2349
2350 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2351 int param_base = inst->base_mrf;
2352 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2353 int zero_mask = 0xf & ~coord_mask;
2354
2355 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2356 coordinate));
2357
2358 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2359 src_reg(0)));
2360
2361 emit(inst);
2362 return src_reg(inst->dst);
2363 }
2364
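/* On Haswell and later, sampler indices that don't fit in the message
 * descriptor's 4-bit field (16 and up, or any non-immediate index) have to
 * be supplied through the message header.
 */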
2365 static bool
2366 is_high_sampler(struct brw_context *brw, src_reg sampler)
2367 {
2368 if (brw->gen < 8 && !brw->is_haswell)
2369 return false;
2370
2371 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2372 }
2373
2374 void
2375 vec4_visitor::visit(ir_texture *ir)
2376 {
2377 uint32_t sampler =
2378 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2379
2380 ir_rvalue *nonconst_sampler_index =
2381 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2382
2383 /* Handle non-constant sampler array indexing */
2384 src_reg sampler_reg;
2385 if (nonconst_sampler_index) {
2386 /* The highest sampler which may be used by this operation is
2387 * the last element of the array. Mark it here, because the generator
2388 * doesn't have enough information to determine the bound.
2389 */
2390 uint32_t array_size = ir->sampler->as_dereference_array()
2391 ->array->type->array_size();
2392
2393 uint32_t max_used = sampler + array_size - 1;
2394 if (ir->op == ir_tg4 && brw->gen < 8) {
2395 max_used += prog_data->base.binding_table.gather_texture_start;
2396 } else {
2397 max_used += prog_data->base.binding_table.texture_start;
2398 }
2399
2400 brw_mark_surface_used(&prog_data->base, max_used);
2401
2402 /* Emit code to evaluate the actual indexing expression */
2403 nonconst_sampler_index->accept(this);
2404 dst_reg temp(this, glsl_type::uint_type);
2405 emit(ADD(temp, this->result, src_reg(sampler)))
2406 ->force_writemask_all = true;
2407 sampler_reg = src_reg(temp);
2408 } else {
2409 /* Single sampler, or constant array index; the indexing expression
2410 * is just an immediate.
2411 */
2412 sampler_reg = src_reg(sampler);
2413 }
2414
2415 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2416 * emitting anything other than setting up the constant result.
2417 */
2418 if (ir->op == ir_tg4) {
2419 ir_constant *chan = ir->lod_info.component->as_constant();
2420 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2421 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2422 dst_reg result(this, ir->type);
2423 this->result = src_reg(result);
2424 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2425 return;
2426 }
2427 }
2428
2429 /* Should be lowered by do_lower_texture_projection */
2430 assert(!ir->projector);
2431
2432 /* Should be lowered */
2433 assert(!ir->offset || !ir->offset->type->is_array());
2434
2435 /* Generate code to compute all the subexpression trees. This has to be
2436 * done before loading any values into MRFs for the sampler message since
2437 * generating these values may involve SEND messages that need the MRFs.
2438 */
2439 src_reg coordinate;
2440 if (ir->coordinate) {
2441 ir->coordinate->accept(this);
2442 coordinate = this->result;
2443 }
2444
2445 src_reg shadow_comparitor;
2446 if (ir->shadow_comparitor) {
2447 ir->shadow_comparitor->accept(this);
2448 shadow_comparitor = this->result;
2449 }
2450
2451 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2452 src_reg offset_value;
2453 if (has_nonconstant_offset) {
2454 ir->offset->accept(this);
2455 offset_value = src_reg(this->result);
2456 }
2457
2458 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2459 src_reg lod, dPdx, dPdy, sample_index, mcs;
2460 switch (ir->op) {
2461 case ir_tex:
2462 lod = src_reg(0.0f);
2463 lod_type = glsl_type::float_type;
2464 break;
2465 case ir_txf:
2466 case ir_txl:
2467 case ir_txs:
2468 ir->lod_info.lod->accept(this);
2469 lod = this->result;
2470 lod_type = ir->lod_info.lod->type;
2471 break;
2472 case ir_query_levels:
2473 lod = src_reg(0);
2474 lod_type = glsl_type::int_type;
2475 break;
2476 case ir_txf_ms:
2477 ir->lod_info.sample_index->accept(this);
2478 sample_index = this->result;
2479 sample_index_type = ir->lod_info.sample_index->type;
2480
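/* Surfaces using the compressed multisample layout need the MCS value
 * fetched up front so the TXF_CMS message can resolve the requested sample;
 * otherwise a zero MCS is sent.
 */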
2481 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2482 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2483 else
2484 mcs = src_reg(0u);
2485 break;
2486 case ir_txd:
2487 ir->lod_info.grad.dPdx->accept(this);
2488 dPdx = this->result;
2489
2490 ir->lod_info.grad.dPdy->accept(this);
2491 dPdy = this->result;
2492
2493 lod_type = ir->lod_info.grad.dPdx->type;
2494 break;
2495 case ir_txb:
2496 case ir_lod:
2497 case ir_tg4:
2498 break;
2499 }
2500
2501 enum opcode opcode;
2502 switch (ir->op) {
2503 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2504 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2505 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2506 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2507 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2508 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2509 case ir_tg4: opcode = has_nonconstant_offset
2510 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2511 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2512 case ir_txb:
2513 unreachable("TXB is not valid for vertex shaders.");
2514 case ir_lod:
2515 unreachable("LOD is not valid for vertex shaders.");
2516 default:
2517 unreachable("Unrecognized tex op");
2518 }
2519
2520 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2521
2522 if (ir->offset != NULL && !has_nonconstant_offset) {
2523 inst->offset =
2524 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2525 ir->offset->type->vector_elements);
2526 }
2527
2528 /* Stuff the channel select bits in the top of the texture offset */
2529 if (ir->op == ir_tg4)
2530 inst->offset |= gather_channel(ir, sampler) << 16;
2531
2532 /* The message header is necessary for:
2533 * - Gen4 (always)
2534 * - Texel offsets
2535 * - Gather channel selection
2536 * - Sampler indices too large to fit in a 4-bit value.
2537 */
2538 inst->header_present =
2539 brw->gen < 5 || inst->offset != 0 || ir->op == ir_tg4 ||
2540 is_high_sampler(brw, sampler_reg);
2541 inst->base_mrf = 2;
2542 inst->mlen = inst->header_present + 1; /* always at least one */
2543 inst->dst = dst_reg(this, ir->type);
2544 inst->dst.writemask = WRITEMASK_XYZW;
2545 inst->shadow_compare = ir->shadow_comparitor != NULL;
2546
2547 inst->src[1] = sampler_reg;
2548
2549 /* MRF for the first parameter */
2550 int param_base = inst->base_mrf + inst->header_present;
2551
2552 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2553 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2554 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2555 } else {
2556 /* Load the coordinate */
2557 /* FINISHME: gl_clamp_mask and saturate */
2558 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2559 int zero_mask = 0xf & ~coord_mask;
2560
2561 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2562 coordinate));
2563
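/* Zero out the coordinate channels that the source type doesn't provide. */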
2564 if (zero_mask != 0) {
2565 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2566 src_reg(0)));
2567 }
2568 /* Load the shadow comparitor */
2569 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2570 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2571 WRITEMASK_X),
2572 shadow_comparitor));
2573 inst->mlen++;
2574 }
2575
2576 /* Load the LOD info */
2577 if (ir->op == ir_tex || ir->op == ir_txl) {
2578 int mrf, writemask;
2579 if (brw->gen >= 5) {
2580 mrf = param_base + 1;
2581 if (ir->shadow_comparitor) {
2582 writemask = WRITEMASK_Y;
2583 /* mlen already incremented */
2584 } else {
2585 writemask = WRITEMASK_X;
2586 inst->mlen++;
2587 }
2588 } else /* brw->gen == 4 */ {
2589 mrf = param_base;
2590 writemask = WRITEMASK_W;
2591 }
2592 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2593 } else if (ir->op == ir_txf) {
2594 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2595 } else if (ir->op == ir_txf_ms) {
2596 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2597 sample_index));
2598 if (brw->gen >= 7) {
2599 /* MCS data is in the first channel of `mcs`, but we need to get it into
2600 * the .y channel of the second vec4 of params, so replicate .x across
2601 * the whole vec4 and then mask off everything except .y
2602 */
2603 mcs.swizzle = BRW_SWIZZLE_XXXX;
2604 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2605 mcs));
2606 }
2607 inst->mlen++;
2608 } else if (ir->op == ir_txd) {
2609 const glsl_type *type = lod_type;
2610
2611 if (brw->gen >= 5) {
2612 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2613 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2614 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2615 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2616 inst->mlen++;
2617
2618 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2619 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2620 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2621 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2622 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2623 inst->mlen++;
2624
2625 if (ir->shadow_comparitor) {
2626 emit(MOV(dst_reg(MRF, param_base + 2,
2627 ir->shadow_comparitor->type, WRITEMASK_Z),
2628 shadow_comparitor));
2629 }
2630 }
2631 } else /* brw->gen == 4 */ {
2632 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2633 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2634 inst->mlen += 2;
2635 }
2636 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2637 if (ir->shadow_comparitor) {
2638 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2639 shadow_comparitor));
2640 }
2641
2642 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2643 offset_value));
2644 inst->mlen++;
2645 }
2646 }
2647
2648 emit(inst);
2649
2650 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2651 * spec requires layers.
2652 */
2653 if (ir->op == ir_txs) {
2654 glsl_type const *type = ir->sampler->type;
2655 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2656 type->sampler_array) {
2657 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2658 writemask(inst->dst, WRITEMASK_Z),
2659 src_reg(inst->dst), src_reg(6));
2660 }
2661 }
2662
2663 if (brw->gen == 6 && ir->op == ir_tg4) {
2664 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2665 }
2666
2667 swizzle_result(ir, src_reg(inst->dst), sampler);
2668 }
2669
2670 /**
2671 * Apply workarounds for Gen6 gather with UINT/SINT
2672 */
2673 void
2674 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2675 {
2676 if (!wa)
2677 return;
2678
2679 int width = (wa & WA_8BIT) ? 8 : 16;
2680 dst_reg dst_f = dst;
2681 dst_f.type = BRW_REGISTER_TYPE_F;
2682
2683 /* Convert from UNORM to UINT */
2684 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2685 emit(MOV(dst, src_reg(dst_f)));
2686
2687 if (wa & WA_SIGN) {
2688 /* Reinterpret the UINT value as a signed INT value by
2689 * shifting the sign bit into place, then shifting back
2690 * preserving sign.
2691 */
2692 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2693 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2694 }
2695 }
2696
2697 /**
2698 * Set up the gather channel based on the swizzle, for gather4.
2699 */
2700 uint32_t
2701 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2702 {
2703 ir_constant *chan = ir->lod_info.component->as_constant();
2704 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2705 switch (swiz) {
2706 case SWIZZLE_X: return 0;
2707 case SWIZZLE_Y:
2708 /* gather4 sampler is broken for green channel on RG32F --
2709 * we must ask for blue instead.
2710 */
2711 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2712 return 2;
2713 return 1;
2714 case SWIZZLE_Z: return 2;
2715 case SWIZZLE_W: return 3;
2716 default:
2717 unreachable("Not reached"); /* zero, one swizzles handled already */
2718 }
2719 }
2720
2721 void
2722 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2723 {
2724 int s = key->tex.swizzles[sampler];
2725
2726 this->result = src_reg(this, ir->type);
2727 dst_reg swizzled_result(this->result);
2728
2729 if (ir->op == ir_query_levels) {
2730 /* # levels is in .w */
2731 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2732 emit(MOV(swizzled_result, orig_val));
2733 return;
2734 }
2735
2736 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2737 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2738 emit(MOV(swizzled_result, orig_val));
2739 return;
2740 }
2741
2742
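/* Split the GL texture swizzle into channels copied from the sampler result,
 * channels forced to zero, and channels forced to one, so that each group
 * can be written with a single MOV.
 */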
2743 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2744 int swizzle[4] = {0};
2745
2746 for (int i = 0; i < 4; i++) {
2747 switch (GET_SWZ(s, i)) {
2748 case SWIZZLE_ZERO:
2749 zero_mask |= (1 << i);
2750 break;
2751 case SWIZZLE_ONE:
2752 one_mask |= (1 << i);
2753 break;
2754 default:
2755 copy_mask |= (1 << i);
2756 swizzle[i] = GET_SWZ(s, i);
2757 break;
2758 }
2759 }
2760
2761 if (copy_mask) {
2762 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2763 swizzled_result.writemask = copy_mask;
2764 emit(MOV(swizzled_result, orig_val));
2765 }
2766
2767 if (zero_mask) {
2768 swizzled_result.writemask = zero_mask;
2769 emit(MOV(swizzled_result, src_reg(0.0f)));
2770 }
2771
2772 if (one_mask) {
2773 swizzled_result.writemask = one_mask;
2774 emit(MOV(swizzled_result, src_reg(1.0f)));
2775 }
2776 }
2777
2778 void
2779 vec4_visitor::visit(ir_return *)
2780 {
2781 unreachable("not reached");
2782 }
2783
2784 void
2785 vec4_visitor::visit(ir_discard *)
2786 {
2787 unreachable("not reached");
2788 }
2789
2790 void
2791 vec4_visitor::visit(ir_if *ir)
2792 {
2793 /* Don't point the annotation at the if statement, because then it plus
2794 * the then and else blocks get printed.
2795 */
2796 this->base_ir = ir->condition;
2797
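/* Gen6 gets a dedicated path (its IF instruction can carry the comparison
 * directly); other generations lower the condition to a flag register first
 * and predicate the IF on it.
 */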
2798 if (brw->gen == 6) {
2799 emit_if_gen6(ir);
2800 } else {
2801 enum brw_predicate predicate;
2802 emit_bool_to_cond_code(ir->condition, &predicate);
2803 emit(IF(predicate));
2804 }
2805
2806 visit_instructions(&ir->then_instructions);
2807
2808 if (!ir->else_instructions.is_empty()) {
2809 this->base_ir = ir->condition;
2810 emit(BRW_OPCODE_ELSE);
2811
2812 visit_instructions(&ir->else_instructions);
2813 }
2814
2815 this->base_ir = ir->condition;
2816 emit(BRW_OPCODE_ENDIF);
2817 }
2818
2819 void
2820 vec4_visitor::visit(ir_emit_vertex *)
2821 {
2822 unreachable("not reached");
2823 }
2824
2825 void
2826 vec4_visitor::visit(ir_end_primitive *)
2827 {
2828 unreachable("not reached");
2829 }
2830
2831 void
2832 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2833 dst_reg dst, src_reg offset,
2834 src_reg src0, src_reg src1)
2835 {
2836 unsigned mlen = 0;
2837
2838 /* Set the atomic operation offset. */
2839 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2840 mlen++;
2841
2842 /* Set the atomic operation arguments. */
2843 if (src0.file != BAD_FILE) {
2844 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2845 mlen++;
2846 }
2847
2848 if (src1.file != BAD_FILE) {
2849 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2850 mlen++;
2851 }
2852
2853 /* Emit the instruction. Note that this maps to the normal SIMD8
2854 * untyped atomic message on Ivy Bridge, but that's OK because
2855 * unused channels will be masked out.
2856 */
2857 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2858 src_reg(atomic_op), src_reg(surf_index));
2859 inst->base_mrf = 0;
2860 inst->mlen = mlen;
2861 }
2862
2863 void
2864 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2865 src_reg offset)
2866 {
2867 /* Set the surface read offset. */
2868 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2869
2870 /* Emit the instruction. Note that this maps to the normal SIMD8
2871 * untyped surface read message, but that's OK because unused
2872 * channels will be masked out.
2873 */
2874 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2875 dst, src_reg(surf_index));
2876 inst->base_mrf = 0;
2877 inst->mlen = 1;
2878 }
2879
2880 void
2881 vec4_visitor::emit_ndc_computation()
2882 {
2883 /* Get the position */
2884 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2885
2886 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2887 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2888 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2889
2890 current_annotation = "NDC";
2891 dst_reg ndc_w = ndc;
2892 ndc_w.writemask = WRITEMASK_W;
2893 src_reg pos_w = pos;
2894 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2895 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2896
2897 dst_reg ndc_xyz = ndc;
2898 ndc_xyz.writemask = WRITEMASK_XYZ;
2899
2900 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2901 }
2902
2903 void
2904 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2905 {
2906 if (brw->gen < 6 &&
2907 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2908 key->userclip_active || brw->has_negative_rhw_bug)) {
2909 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2910 dst_reg header1_w = header1;
2911 header1_w.writemask = WRITEMASK_W;
2912
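/* Build the point size and clip/negative-RHW flag bits in the .w channel of
 * a temporary header register before copying them into the URB slot.
 */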
2913 emit(MOV(header1, 0u));
2914
2915 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2916 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2917
2918 current_annotation = "Point size";
2919 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2920 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2921 }
2922
2923 if (key->userclip_active) {
2924 current_annotation = "Clipping flags";
2925 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2926 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2927
2928 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2929 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2930 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2931
2932 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2933 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2934 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2935 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2936 }
2937
2938 /* i965 clipping workaround:
2939 * 1) Test for negative RHW
2940 * 2) If set,
2941 * set ndc = (0,0,0,0)
2942 * set ucp[6] = 1
2943 *
2944 * Later, clipping will detect ucp[6] and ensure the primitive is
2945 * clipped against all fixed planes.
2946 */
2947 if (brw->has_negative_rhw_bug) {
2948 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2949 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2950 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2951 vec4_instruction *inst;
2952 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2953 inst->predicate = BRW_PREDICATE_NORMAL;
2954 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2955 inst->predicate = BRW_PREDICATE_NORMAL;
2956 }
2957
2958 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2959 } else if (brw->gen < 6) {
2960 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2961 } else {
2962 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2963 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2964 dst_reg reg_w = reg;
2965 reg_w.writemask = WRITEMASK_W;
2966 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
2967 }
2968 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2969 dst_reg reg_y = reg;
2970 reg_y.writemask = WRITEMASK_Y;
2971 reg_y.type = BRW_REGISTER_TYPE_D;
2972 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
2973 }
2974 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2975 dst_reg reg_z = reg;
2976 reg_z.writemask = WRITEMASK_Z;
2977 reg_z.type = BRW_REGISTER_TYPE_D;
2978 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2979 }
2980 }
2981 }
2982
2983 void
2984 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2985 {
2986 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2987 *
2988 * "If a linked set of shaders forming the vertex stage contains no
2989 * static write to gl_ClipVertex or gl_ClipDistance, but the
2990 * application has requested clipping against user clip planes through
2991 * the API, then the coordinate written to gl_Position is used for
2992 * comparison against the user clip planes."
2993 *
2994 * This function is only called if the shader didn't write to
2995 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2996 * if the user wrote to it; otherwise we use gl_Position.
2997 */
2998 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2999 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3000 clip_vertex = VARYING_SLOT_POS;
3001 }
3002
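/* Each clip distance is the dot product of the chosen clip vertex with the
 * corresponding user clip plane, written to one channel of the output per
 * plane.
 */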
3003 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3004 ++i) {
3005 reg.writemask = 1 << i;
3006 emit(DP4(reg,
3007 src_reg(output_reg[clip_vertex]),
3008 src_reg(this->userplane[i + offset])));
3009 }
3010 }
3011
3012 void
3013 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3014 {
3015 assert (varying < VARYING_SLOT_MAX);
3016 reg.type = output_reg[varying].type;
3017 current_annotation = output_reg_annotation[varying];
3018 /* Copy the register, saturating if necessary */
3019 vec4_instruction *inst = emit(MOV(reg,
3020 src_reg(output_reg[varying])));
3021 if ((varying == VARYING_SLOT_COL0 ||
3022 varying == VARYING_SLOT_COL1 ||
3023 varying == VARYING_SLOT_BFC0 ||
3024 varying == VARYING_SLOT_BFC1) &&
3025 key->clamp_vertex_color) {
3026 inst->saturate = true;
3027 }
3028 }
3029
3030 void
3031 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3032 {
3033 reg.type = BRW_REGISTER_TYPE_F;
3034
3035 switch (varying) {
3036 case VARYING_SLOT_PSIZ:
3037 {
3038 /* PSIZ is always in slot 0, and is coupled with other flags. */
3039 current_annotation = "indices, point width, clip flags";
3040 emit_psiz_and_flags(reg);
3041 break;
3042 }
3043 case BRW_VARYING_SLOT_NDC:
3044 current_annotation = "NDC";
3045 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3046 break;
3047 case VARYING_SLOT_POS:
3048 current_annotation = "gl_Position";
3049 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3050 break;
3051 case VARYING_SLOT_EDGE:
3052 /* This is present when doing unfilled polygons. We're supposed to copy
3053 * the edge flag from the user-provided vertex array
3054 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3055 * of that attribute (starts as 1.0f). This is then used in clipping to
3056 * determine which edges should be drawn as wireframe.
3057 */
3058 current_annotation = "edge flag";
3059 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3060 glsl_type::float_type, WRITEMASK_XYZW))));
3061 break;
3062 case BRW_VARYING_SLOT_PAD:
3063 /* No need to write to this slot */
3064 break;
3065 default:
3066 emit_generic_urb_slot(reg, varying);
3067 break;
3068 }
3069 }
3070
3071 static int
3072 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3073 {
3074 if (brw->gen >= 6) {
3075 /* URB data written (does not include the message header reg) must
3076 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3077 * section 5.4.3.2.2: URB_INTERLEAVED.
3078 *
3079 * URB entries are allocated on a multiple of 1024 bits, so an
3080 * extra 128 bits written here to make the end align to 256 is
3081 * no problem.
3082 */
3083 if ((mlen % 2) != 1)
3084 mlen++;
3085 }
3086
3087 return mlen;
3088 }
3089
3090
3091 /**
3092 * Generates the VUE payload plus the necessary URB write instructions to
3093 * output it.
3094 *
3095 * The VUE layout is documented in Volume 2a.
3096 */
3097 void
3098 vec4_visitor::emit_vertex()
3099 {
3100 /* MRF 0 is reserved for the debugger, so start with message header
3101 * in MRF 1.
3102 */
3103 int base_mrf = 1;
3104 int mrf = base_mrf;
3105 /* In the process of generating our URB write message contents, we
3106 * may need to unspill a register or load from an array. Those
3107 * reads would use MRFs 14-15.
3108 */
3109 int max_usable_mrf = 13;
3110
3111 /* The following assertion verifies that max_usable_mrf causes an
3112 * even-numbered amount of URB write data, which will meet gen6's
3113 * requirements for length alignment.
3114 */
3115 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3116
3117 /* First mrf is the g0-based message header containing URB handles and
3118 * such.
3119 */
3120 emit_urb_write_header(mrf++);
3121
3122 if (brw->gen < 6) {
3123 emit_ndc_computation();
3124 }
3125
3126 /* Lower legacy ff and ClipVertex clipping to clip distances */
3127 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3128 current_annotation = "user clip distances";
3129
3130 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3131 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3132
3133 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3134 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3135 }
3136
3137 /* We may need to split this up into several URB writes, so do them in a
3138 * loop.
3139 */
3140 int slot = 0;
3141 bool complete = false;
3142 do {
3143 /* URB offset is in URB row increments, and each of our MRFs is half of
3144 * one of those, since we're doing interleaved writes.
3145 */
3146 int offset = slot / 2;
3147
3148 mrf = base_mrf + 1;
3149 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3150 emit_urb_slot(dst_reg(MRF, mrf++),
3151 prog_data->vue_map.slot_to_varying[slot]);
3152
3153 /* If this was max_usable_mrf, we can't fit anything more into this
3154 * URB WRITE.
3155 */
3156 if (mrf > max_usable_mrf) {
3157 slot++;
3158 break;
3159 }
3160 }
3161
3162 complete = slot >= prog_data->vue_map.num_slots;
3163 current_annotation = "URB write";
3164 vec4_instruction *inst = emit_urb_write_opcode(complete);
3165 inst->base_mrf = base_mrf;
3166 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3167 inst->offset += offset;
3168 } while (!complete);
3169 }
3170
3171
3172 src_reg
3173 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3174 src_reg *reladdr, int reg_offset)
3175 {
3176 /* Because we store the values to scratch interleaved like our
3177 * vertex data, we need to scale the vec4 index by 2.
3178 */
3179 int message_header_scale = 2;
3180
3181 /* Pre-gen6, the message header uses byte offsets instead of vec4
3182 * (16-byte) offset units.
3183 */
3184 if (brw->gen < 6)
3185 message_header_scale *= 16;
3186
3187 if (reladdr) {
3188 src_reg index = src_reg(this, glsl_type::int_type);
3189
3190 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3191 src_reg(reg_offset)));
3192 emit_before(block, inst, MUL(dst_reg(index), index,
3193 src_reg(message_header_scale)));
3194
3195 return index;
3196 } else {
3197 return src_reg(reg_offset * message_header_scale);
3198 }
3199 }
3200
3201 src_reg
3202 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3203 src_reg *reladdr, int reg_offset)
3204 {
3205 if (reladdr) {
3206 src_reg index = src_reg(this, glsl_type::int_type);
3207
3208 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3209 src_reg(reg_offset)));
3210
3211 /* Pre-gen6, the message header uses byte offsets instead of vec4
3212 * (16-byte) offset units.
3213 */
3214 if (brw->gen < 6) {
3215 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3216 }
3217
3218 return index;
3219 } else if (brw->gen >= 8) {
3220 /* Store the offset in a GRF so we can send-from-GRF. */
3221 src_reg offset = src_reg(this, glsl_type::int_type);
3222 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3223 return offset;
3224 } else {
3225 int message_header_scale = brw->gen < 6 ? 16 : 1;
3226 return src_reg(reg_offset * message_header_scale);
3227 }
3228 }
3229
3230 /**
3231 * Emits an instruction before @inst to load the value named by @orig_src
3232 * from scratch space at @base_offset to @temp.
3233 *
3234 * @base_offset is measured in 32-byte units (the size of a register).
3235 */
3236 void
3237 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3238 dst_reg temp, src_reg orig_src,
3239 int base_offset)
3240 {
3241 int reg_offset = base_offset + orig_src.reg_offset;
3242 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3243 reg_offset);
3244
3245 emit_before(block, inst, SCRATCH_READ(temp, index));
3246 }
3247
3248 /**
3249 * Emits an instruction after @inst to store the value to be written
3250 * to @orig_dst to scratch space at @base_offset, from @temp.
3251 *
3252 * @base_offset is measured in 32-byte units (the size of a register).
3253 */
3254 void
3255 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3256 int base_offset)
3257 {
3258 int reg_offset = base_offset + inst->dst.reg_offset;
3259 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3260 reg_offset);
3261
3262 /* Create a temporary register to store *inst's result in.
3263 *
3264 * We have to be careful in MOVing from our temporary result register in
3265 * the scratch write. If we swizzle from channels of the temporary that
3266 * weren't initialized, it will confuse live interval analysis, which will
3267 * make spilling fail to make progress.
3268 */
3269 src_reg temp = src_reg(this, glsl_type::vec4_type);
3270 temp.type = inst->dst.type;
3271 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3272 int swizzles[4];
3273 for (int i = 0; i < 4; i++)
3274 if (inst->dst.writemask & (1 << i))
3275 swizzles[i] = i;
3276 else
3277 swizzles[i] = first_writemask_chan;
3278 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3279 swizzles[2], swizzles[3]);
3280
3281 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3282 inst->dst.writemask));
3283 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3284 write->predicate = inst->predicate;
3285 write->ir = inst->ir;
3286 write->annotation = inst->annotation;
3287 inst->insert_after(block, write);
3288
3289 inst->dst.file = temp.file;
3290 inst->dst.reg = temp.reg;
3291 inst->dst.reg_offset = temp.reg_offset;
3292 inst->dst.reladdr = NULL;
3293 }
3294
3295 /**
3296 * We can't generally support array access in GRF space, because a
3297 * single instruction's destination can only span 2 contiguous
3298 * registers. So, we send all GRF arrays that get variable index
3299 * access to scratch space.
3300 */
3301 void
3302 vec4_visitor::move_grf_array_access_to_scratch()
3303 {
3304 int scratch_loc[this->virtual_grf_count];
3305 memset(scratch_loc, -1, sizeof(scratch_loc));
3306
3307 /* First, calculate the set of virtual GRFs that need to be punted
3308 * to scratch due to having any array access on them, and where in
3309 * scratch.
3310 */
3311 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3312 if (inst->dst.file == GRF && inst->dst.reladdr &&
3313 scratch_loc[inst->dst.reg] == -1) {
3314 scratch_loc[inst->dst.reg] = c->last_scratch;
3315 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3316 }
3317
3318 for (int i = 0 ; i < 3; i++) {
3319 src_reg *src = &inst->src[i];
3320
3321 if (src->file == GRF && src->reladdr &&
3322 scratch_loc[src->reg] == -1) {
3323 scratch_loc[src->reg] = c->last_scratch;
3324 c->last_scratch += this->virtual_grf_sizes[src->reg];
3325 }
3326 }
3327 }
3328
3329 /* Now, for anything that will be accessed through scratch, rewrite
3330 * it to load/store. Note that this is a _safe list walk, because
3331 * we may generate a new scratch_write instruction after the one
3332 * we're processing.
3333 */
3334 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3335 /* Set up the annotation tracking for new generated instructions. */
3336 base_ir = inst->ir;
3337 current_annotation = inst->annotation;
3338
3339 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3340 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3341 }
3342
3343 for (int i = 0 ; i < 3; i++) {
3344 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3345 continue;
3346
3347 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3348
3349 emit_scratch_read(block, inst, temp, inst->src[i],
3350 scratch_loc[inst->src[i].reg]);
3351
3352 inst->src[i].file = temp.file;
3353 inst->src[i].reg = temp.reg;
3354 inst->src[i].reg_offset = temp.reg_offset;
3355 inst->src[i].reladdr = NULL;
3356 }
3357 }
3358 }
3359
3360 /**
3361 * Emits an instruction before @inst to load the value named by @orig_src
3362 * from the pull constant buffer (surface) at @base_offset to @temp.
3363 */
3364 void
3365 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3366 dst_reg temp, src_reg orig_src,
3367 int base_offset)
3368 {
3369 int reg_offset = base_offset + orig_src.reg_offset;
3370 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3371 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3372 reg_offset);
3373 vec4_instruction *load;
3374
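/* Gen7+ loads pull constants with a send-from-GRF message, so the offset is
 * staged in a GRF first; older generations send it through an MRF payload
 * instead.
 */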
3375 if (brw->gen >= 7) {
3376 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3377 grf_offset.type = offset.type;
3378 emit_before(block, inst, MOV(grf_offset, offset));
3379
3380 load = new(mem_ctx) vec4_instruction(this,
3381 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3382 temp, index, src_reg(grf_offset));
3383 } else {
3384 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3385 temp, index, offset);
3386 load->base_mrf = 14;
3387 load->mlen = 1;
3388 }
3389 emit_before(block, inst, load);
3390 }
3391
3392 /**
3393 * Implements array access of uniforms by inserting a
3394 * PULL_CONSTANT_LOAD instruction.
3395 *
3396 * Unlike temporary GRF array access (where we don't support it due to
3397 * the difficulty of doing relative addressing on instruction
3398 * destinations), we could potentially do array access of uniforms
3399 * that were loaded in GRF space as push constants. In real-world
3400 * usage we've seen, though, the arrays being used are always larger
3401 * than we could load as push constants, so just always move all
3402 * uniform array access out to a pull constant buffer.
3403 */
3404 void
3405 vec4_visitor::move_uniform_array_access_to_pull_constants()
3406 {
3407 int pull_constant_loc[this->uniforms];
3408 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3409 bool nested_reladdr;
3410
3411 /* Walk through and find array access of uniforms. Put a copy of that
3412 * uniform in the pull constant buffer.
3413 *
3414 * Note that we don't move constant-indexed accesses to arrays. No
3415 * testing has been done of the performance impact of this choice.
3416 */
3417 do {
3418 nested_reladdr = false;
3419
3420 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3421 for (int i = 0 ; i < 3; i++) {
3422 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3423 continue;
3424
3425 int uniform = inst->src[i].reg;
3426
3427 if (inst->src[i].reladdr->reladdr)
3428 nested_reladdr = true; /* will need another pass */
3429
3430 /* If this array isn't already present in the pull constant buffer,
3431 * add it.
3432 */
3433 if (pull_constant_loc[uniform] == -1) {
3434 const gl_constant_value **values =
3435 &stage_prog_data->param[uniform * 4];
3436
3437 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3438
3439 assert(uniform < uniform_array_size);
3440 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3441 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3442 = values[j];
3443 }
3444 }
3445
3446 /* Set up the annotation tracking for new generated instructions. */
3447 base_ir = inst->ir;
3448 current_annotation = inst->annotation;
3449
3450 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3451
3452 emit_pull_constant_load(block, inst, temp, inst->src[i],
3453 pull_constant_loc[uniform]);
3454
3455 inst->src[i].file = temp.file;
3456 inst->src[i].reg = temp.reg;
3457 inst->src[i].reg_offset = temp.reg_offset;
3458 inst->src[i].reladdr = NULL;
3459 }
3460 }
3461 } while (nested_reladdr);
3462
3463 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3464 * no need to track them as larger-than-vec4 objects. This will be
3465 * relied on in cutting out unused uniform vectors from push
3466 * constants.
3467 */
3468 split_uniform_registers();
3469 }
3470
3471 void
3472 vec4_visitor::resolve_ud_negate(src_reg *reg)
3473 {
3474 if (reg->type != BRW_REGISTER_TYPE_UD ||
3475 !reg->negate)
3476 return;
3477
3478 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3479 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3480 *reg = temp;
3481 }
3482
3483 vec4_visitor::vec4_visitor(struct brw_context *brw,
3484 struct brw_vec4_compile *c,
3485 struct gl_program *prog,
3486 const struct brw_vec4_prog_key *key,
3487 struct brw_vec4_prog_data *prog_data,
3488 struct gl_shader_program *shader_prog,
3489 gl_shader_stage stage,
3490 void *mem_ctx,
3491 bool debug_flag,
3492 bool no_spills,
3493 shader_time_shader_type st_base,
3494 shader_time_shader_type st_written,
3495 shader_time_shader_type st_reset)
3496 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3497 c(c),
3498 key(key),
3499 prog_data(prog_data),
3500 sanity_param_count(0),
3501 fail_msg(NULL),
3502 first_non_payload_grf(0),
3503 need_all_constants_in_pull_buffer(false),
3504 debug_flag(debug_flag),
3505 no_spills(no_spills),
3506 st_base(st_base),
3507 st_written(st_written),
3508 st_reset(st_reset)
3509 {
3510 this->mem_ctx = mem_ctx;
3511 this->failed = false;
3512
3513 this->base_ir = NULL;
3514 this->current_annotation = NULL;
3515 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3516
3517 this->variable_ht = hash_table_ctor(0,
3518 hash_table_pointer_hash,
3519 hash_table_pointer_compare);
3520
3521 this->virtual_grf_start = NULL;
3522 this->virtual_grf_end = NULL;
3523 this->virtual_grf_sizes = NULL;
3524 this->virtual_grf_count = 0;
3525 this->virtual_grf_reg_map = NULL;
3526 this->virtual_grf_reg_count = 0;
3527 this->virtual_grf_array_size = 0;
3528 this->live_intervals_valid = false;
3529
3530 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3531
3532 this->uniforms = 0;
3533
3534 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3535 * at least one. See setup_uniforms() in brw_vec4.cpp.
3536 */
3537 this->uniform_array_size = 1;
3538 if (prog_data) {
3539 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3540 }
3541
3542 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3543 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3544 }
3545
3546 vec4_visitor::~vec4_visitor()
3547 {
3548 hash_table_dtor(this->variable_ht);
3549 }
3550
3551
3552 void
3553 vec4_visitor::fail(const char *format, ...)
3554 {
3555 va_list va;
3556 char *msg;
3557
3558 if (failed)
3559 return;
3560
3561 failed = true;
3562
3563 va_start(va, format);
3564 msg = ralloc_vasprintf(mem_ctx, format, va);
3565 va_end(va);
3566 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3567
3568 this->fail_msg = msg;
3569
3570 if (debug_flag) {
3571 fprintf(stderr, "%s", msg);
3572 }
3573 }
3574
3575 } /* namespace brw */