i965/fs: Save push constant location information.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_visitor.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from the GLSL IR.  The LIR
 * makes it easier to do backend-specific optimizations than doing so
 * in the GLSL IR or in the native code.
 */
extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "main/uniforms.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (ir->data.mode == ir_var_shader_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   } else if (ir->data.mode == ir_var_shader_out) {
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

      if (ir->data.index > 0) {
         assert(ir->data.location == FRAG_RESULT_DATA0);
         assert(ir->data.index == 1);
         this->dual_src_output = *reg;
      } else if (ir->data.location == FRAG_RESULT_COLOR) {
         /* Writing gl_FragColor outputs to all color regions. */
         for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
            this->outputs[i] = *reg;
            this->output_components[i] = 4;
         }
      } else if (ir->data.location == FRAG_RESULT_DEPTH) {
         this->frag_depth = *reg;
      } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
         this->sample_mask = *reg;
      } else {
         /* gl_FragData or a user-defined FS output */
         assert(ir->data.location >= FRAG_RESULT_DATA0 &&
                ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);

         int vector_elements =
            ir->type->is_array() ? ir->type->fields.array->vector_elements
                                 : ir->type->vector_elements;

         /* General color output. */
         for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
            int output = ir->data.location - FRAG_RESULT_DATA0 + i;
            this->outputs[output] = *reg;
            this->outputs[output].reg_offset += vector_elements * i;
            this->output_components[output] = vector_elements;
         }
      }
   } else if (ir->data.mode == ir_var_uniform) {
      int param_index = uniforms;

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       *
       * Atomic counters take no uniform storage, no need to do
       * anything here.
       */
      if (ir->is_in_uniform_block() || ir->type->contains_atomic())
         return;

      if (dispatch_width == 16) {
         if (!variable_storage(ir)) {
            fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
         }
         return;
      }

      param_size[param_index] = type_size(ir->type);
      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);

   } else if (ir->data.mode == ir_var_system_value) {
      if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) {
         reg = emit_samplepos_setup(ir);
      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) {
         reg = emit_sampleid_setup(ir);
      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN) {
         reg = emit_samplemaskin_setup(ir);
      }
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   fs_reg src;
   int element_size = type_size(ir->type);

   constant_index = ir->array_index->as_constant();

   ir->array->accept(this);
   src = this->result;
   src.type = brw_type_for_base_type(ir->type);

   if (constant_index) {
      assert(src.file == UNIFORM || src.file == GRF);
      src.reg_offset += constant_index->value.i[0] * element_size;
   } else {
      /* Variable index array dereference.  We attach the variable index
       * component to the reg as a pointer to a register containing the
       * offset.  Currently only uniform arrays are supported in this patch,
       * and that reladdr pointer is resolved by
       * move_uniform_array_access_to_pull_constants().  All other array
       * types are lowered by lower_variable_index_to_cond_assign().
       */
      ir->array_index->accept(this);

      fs_reg index_reg;
      index_reg = fs_reg(this, glsl_type::int_type);
      emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));

      if (src.reladdr) {
         emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
      }

      src.reladdr = ralloc(mem_ctx, fs_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }
   this->result = src;
}

void
fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
                     const fs_reg &a)
{
   if (brw->gen < 6 ||
       !x.is_valid_3src() ||
       !y.is_valid_3src() ||
       !a.is_valid_3src()) {
      /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
      fs_reg y_times_a = fs_reg(this, glsl_type::float_type);
      fs_reg one_minus_a = fs_reg(this, glsl_type::float_type);
      fs_reg x_times_one_minus_a = fs_reg(this, glsl_type::float_type);

      emit(MUL(y_times_a, y, a));

      fs_reg negative_a = a;
      negative_a.negate = !a.negate;
      emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
      emit(MUL(x_times_one_minus_a, x, one_minus_a));

      emit(ADD(dst, x_times_one_minus_a, y_times_a));
   } else {
      /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
       * we need to reorder the operands.
       */
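      /* Illustrative check of the operand order (not from the original
       * source): GLSL's mix(x, y, a) is x*(1-a) + y*a.  With LRP computing
       * src1*src0 + src2*(1 - src0), choosing src0 = a, src1 = y, src2 = x
       * gives y*a + x*(1-a), which matches.  Hence LRP(dst, a, y, x).
       */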
      emit(LRP(dst, a, y, x));
   }
}

void
fs_visitor::emit_minmax(uint32_t conditionalmod, const fs_reg &dst,
                        const fs_reg &src0, const fs_reg &src1)
{
   fs_inst *inst;

   if (brw->gen >= 6) {
      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->conditional_mod = conditionalmod;
   } else {
      emit(CMP(reg_null_d, src0, src1, conditionalmod));

      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->predicate = BRW_PREDICATE_NORMAL;
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();

   sat_val->accept(this);
   fs_reg src = this->result;

   fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();

   /* If the last instruction from our accept() didn't generate our
    * src, generate a saturated MOV
    */
   fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
   if (!modify || modify->regs_written != 1) {
      this->result = fs_reg(this, ir->type);
      fs_inst *inst = emit(MOV(this->result, src));
      inst->saturate = true;
   } else {
      modify->saturate = true;
      this->result = src;
   }

   return true;
}

bool
fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
{
   /* 3-src instructions were introduced in gen6. */
   if (brw->gen < 6)
      return false;

   /* MAD can only handle floating-point data. */
   if (ir->type != glsl_type::float_type)
      return false;

   ir_rvalue *nonmul = ir->operands[1 - mul_arg];
   ir_expression *mul = ir->operands[mul_arg]->as_expression();

   if (!mul || mul->operation != ir_binop_mul)
      return false;

   if (nonmul->as_constant() ||
       mul->operands[0]->as_constant() ||
       mul->operands[1]->as_constant())
      return false;

   nonmul->accept(this);
   fs_reg src0 = this->result;

   mul->operands[0]->accept(this);
   fs_reg src1 = this->result;

   mul->operands[1]->accept(this);
   fs_reg src2 = this->result;

   this->result = fs_reg(this, ir->type);
   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);

   return true;
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[3], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 3);

   if (try_emit_saturate(ir))
      return;
   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
         return;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         fail("Failed to get tree for expression operand:\n");
         ir->operands[operand]->fprint(stderr);
         fprintf(stderr, "\n");
      }
      assert(this->result.is_valid_3src());
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to
       * scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
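      /* Illustrative, not from the original source: with booleans stored as
       * 0/1, NOT(0x00000001) = 0xFFFFFFFE rather than 0, while
       * XOR(0x00000001, 1) = 0 and XOR(0x00000000, 1) = 1 give the logical
       * negation we want.
       */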
      emit(XOR(this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      emit(MOV(this->result, op[0]));
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      emit(MOV(this->result, op[0]));
      break;
   case ir_unop_sign:
      if (ir->type->is_float()) {
         /* AND(val, 0x80000000) gives the sign bit.
          *
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is
          * not zero.
          */
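         /* Worked example (illustrative, not from the original source):
          * val = -2.5f = 0xc0200000.  The CMP sets the flag since val != 0;
          * AND gives 0x80000000, and the predicated OR with 0x3f800000
          * yields 0xbf800000 = -1.0f.  For val = 0.0f the OR is skipped and
          * the result stays 0x00000000 = 0.0f.
          */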
         emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));

         op[0].type = BRW_REGISTER_TYPE_UD;
         this->result.type = BRW_REGISTER_TYPE_UD;
         emit(AND(this->result, op[0], fs_reg(0x80000000u)));

         inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;

         this->result.type = BRW_REGISTER_TYPE_F;
      } else {
         /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
          *              -> non-negative val generates 0x00000000.
          * Predicated OR sets 1 if val is positive.
          */
         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));

         emit(ASR(this->result, op[0], fs_reg(31)));

         inst = emit(OR(this->result, this->result, fs_reg(1)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }
      break;
   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(ADD(this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      if (brw->gen < 8 && ir->type->is_integer()) {
         /* For integer multiplication, the MUL uses the low 16 bits of one
          * of the operands (src0 on gen6, src1 on gen7).  The MACH then
          * accumulates the contribution of the upper 16 bits of that
          * operand.
          *
          * FINISHME: Emit just the MUL if we know an operand is small
          * enough.
          */
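         /* Illustrative sketch of the decomposition (not from the original
          * source): writing b = hi16(b)*2^16 + lo16(b), the full product is
          * a*b = a*lo16(b) + ((a*hi16(b)) << 16).  The MUL below computes
          * the first partial product into the accumulator; MACH folds in
          * the second, leaving the low 32 bits of a*b in the accumulator
          * (its own destination gets the high 32 bits, discarded here) so
          * the final MOV can copy the result out.
          */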
         if (brw->gen >= 7 && dispatch_width == 16)
            fail("SIMD16 explicit accumulator operands unsupported\n");

         struct brw_reg acc = retype(brw_acc_reg(), this->result.type);

         emit(MUL(acc, op[0], op[1]));
         emit(MACH(reg_null_d, op[0], op[1]));
         emit(MOV(this->result, fs_reg(acc)));
      } else {
         emit(MUL(this->result, op[0], op[1]));
      }
      break;
   case ir_binop_imul_high: {
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(), this->result.type);

      emit(MUL(acc, op[0], op[1]));
      emit(MACH(this->result, op[0], op[1]));
      break;
   }
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
      break;
   case ir_binop_carry: {
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(ADDC(reg_null_ud, op[0], op[1]));
      emit(MOV(this->result, fs_reg(acc)));
      break;
   }
   case ir_binop_borrow: {
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(SUBB(reg_null_ud, op[0], op[1]));
      emit(MOV(this->result, fs_reg(acc)));
      break;
   }
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      resolve_bool_comparison(ir->operands[0], &op[0]);
      resolve_bool_comparison(ir->operands[1], &op[1]);

      emit(CMP(this->result, op[0], op[1],
               brw_conditional_for_comparison(ir->operation)));
      break;

   case ir_binop_logic_xor:
      emit(XOR(this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_binop_vector_extract:
      assert(!"not reached: should be handled by lower_vec_index_to_cond_assign()");
      break;

   case ir_triop_vector_insert:
      assert(!"not reached: should be handled by lower_vector_insert()");
      break;

   case ir_binop_ldexp:
      assert(!"not reached: should be handled by ldexp_to_arith()");
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      op[0].type = BRW_REGISTER_TYPE_F;
      this->result = op[0];
      break;
   case ir_unop_i2u:
   case ir_unop_bitcast_f2u:
      op[0].type = BRW_REGISTER_TYPE_UD;
      this->result = op[0];
      break;
   case ir_unop_u2i:
   case ir_unop_bitcast_f2i:
      op[0].type = BRW_REGISTER_TYPE_D;
      this->result = op[0];
      break;
   case ir_unop_i2f:
   case ir_unop_u2f:
   case ir_unop_f2i:
   case ir_unop_f2u:
      emit(MOV(this->result, op[0]));
      break;

   case ir_unop_b2i:
      emit(AND(this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_b2f:
      temp = fs_reg(this, glsl_type::int_type);
      emit(AND(temp, op[0], fs_reg(1)));
      emit(MOV(this->result, temp));
      break;

   case ir_unop_f2b:
      emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
      break;
   case ir_unop_i2b:
      emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
      break;

   case ir_unop_trunc:
      emit(RNDZ(this->result, op[0]));
      break;
   case ir_unop_ceil:
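      /* RNDD rounds toward negative infinity (floor); the identity
       * ceil(x) = -floor(-x) lets us reuse it here.  Illustrative, not from
       * the original source: for x = 1.25, floor(-1.25) = -2 and
       * -(-2) = 2 = ceil(1.25).
       */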
      op[0].negate = !op[0].negate;
      emit(RNDD(this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      emit(RNDD(this->result, op[0]));
      break;
   case ir_unop_fract:
      emit(FRC(this->result, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(this->result, op[0]));
      break;

   case ir_binop_min:
   case ir_binop_max:
      resolve_ud_negate(&op[0]);
      resolve_ud_negate(&op[1]);
      emit_minmax(ir->operation == ir_binop_min ?
                  BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
                  this->result, op[0], op[1]);
      break;
   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
   case ir_unop_unpack_half_2x16:
   case ir_unop_pack_half_2x16:
      assert(!"not reached: should be handled by lower_packing_builtins");
      break;
   case ir_unop_unpack_half_2x16_split_x:
      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
      break;
   case ir_unop_unpack_half_2x16_split_y:
      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
      break;
   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bitfield_reverse:
      emit(BFREV(this->result, op[0]));
      break;
   case ir_unop_bit_count:
      emit(CBIT(this->result, op[0]));
      break;
   case ir_unop_find_msb:
      temp = fs_reg(this, glsl_type::uint_type);
      emit(FBH(temp, op[0]));

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the
       * count from the LSB side.  If FBH didn't return an error
       * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
       * count into an LSB count.
       */
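      /* Worked example (illustrative, not from the original source): for
       * op[0] = 0x00010000, FBH scans from bit 31 and finds the first set
       * bit after 15 positions, so temp = 15; 31 - 15 = 16, which is what
       * findMSB() must return.  For op[0] = 0, FBH yields 0xFFFFFFFF, the
       * CMP fails, and the -1 is kept as-is.
       */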

      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
      emit(MOV(this->result, temp));
      emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));

      temp.negate = true;
      inst = emit(ADD(this->result, temp, fs_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   case ir_unop_find_lsb:
      emit(FBL(this->result, op[0]));
      break;
   case ir_triop_bitfield_extract:
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(BFE(this->result, op[2], op[1], op[0]));
      break;
   case ir_binop_bfm:
      emit(BFI1(this->result, op[0], op[1]));
      break;
   case ir_triop_bfi:
      emit(BFI2(this->result, op[0], op[1], op[2]));
      break;
   case ir_quadop_bitfield_insert:
      assert(!"not reached: should be handled by "
              "lower_instructions::bitfield_insert_to_bfm_bfi");
      break;

   case ir_unop_bit_not:
      emit(NOT(this->result, op[0]));
      break;
   case ir_binop_bit_and:
      emit(AND(this->result, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      emit(XOR(this->result, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      emit(OR(this->result, op[0], op[1]));
      break;

   case ir_binop_lshift:
      emit(SHL(this->result, op[0], op[1]));
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
         emit(ASR(this->result, op[0], op[1]));
      else
         emit(SHR(this->result, op[0], op[1]));
      break;
   case ir_binop_pack_half_2x16_split:
      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
      break;
   case ir_binop_ubo_load: {
      /* This IR node takes a constant uniform block and a constant or
       * variable byte offset within the block and loads a vector from that.
       */
      ir_constant *uniform_block = ir->operands[0]->as_constant();
      ir_constant *const_offset = ir->operands[1]->as_constant();
      fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.ubo_start +
                                 uniform_block->value.u[0]);
      if (const_offset) {
         fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
         packed_consts.type = result.type;

         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
         emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                   packed_consts, surf_index,
                                   const_offset_reg));

         for (int i = 0; i < ir->type->vector_elements; i++) {
            packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);

            /* The std140 packing rules don't allow vectors to cross 16-byte
             * boundaries, and a reg is 32 bytes.
             */
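            /* Worked example of the smear arithmetic (illustrative, not
             * from the original source): for a vec2 at byte offset 20, the
             * pull load above fetched the 16-byte-aligned block at offset
             * 16 (20 & ~15), so the components live at dwords
             * 20 % 16 / 4 + i = 1 and 2 of the fetched data.
             */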
            assert(packed_consts.subreg_offset < 32);

            /* UBO bools are any nonzero value.  We consider bools to be
             * values with the low bit set to 1.  Convert them using CMP.
             */
            if (ir->type->base_type == GLSL_TYPE_BOOL) {
               emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
            } else {
               emit(MOV(result, packed_consts));
            }

            result.reg_offset++;
         }
      } else {
         /* Turn the byte offset into a dword offset. */
         fs_reg base_offset = fs_reg(this, glsl_type::int_type);
         emit(SHR(base_offset, op[1], fs_reg(2)));

         for (int i = 0; i < ir->type->vector_elements; i++) {
            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
                                            base_offset, i));

            if (ir->type->base_type == GLSL_TYPE_BOOL)
               emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));

            result.reg_offset++;
         }
      }

      result.reg_offset = 0;
      break;
   }

   case ir_triop_fma:
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(MAD(this->result, op[2], op[1], op[0]));
      break;

   case ir_triop_lrp:
      emit_lrp(this->result, op[0], op[1], op[2]);
      break;

   case ir_triop_csel:
      emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
      inst = emit(BRW_OPCODE_SEL, this->result, op[1], op[2]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         if (predicated || !l.equals(r)) {
            fs_inst *inst = emit(MOV(l, r));
            inst->predicate = predicated ? BRW_PREDICATE_NORMAL
                                         : BRW_PREDICATE_NONE;
         }

         l.reg_offset++;
         r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_ATOMIC_UINT:
      break;

   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }
}

/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
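/* Illustrative example of the rewrite (not from the original source): for
 * "x = a + b;" the RHS visit emits "ADD tmp, a, b" and the assignment would
 * add "MOV x, tmp".  When the conditions below hold, we instead retarget the
 * ADD to "ADD x, a, b" and emit no MOV at all.
 */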
bool
fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
                                   fs_reg dst,
                                   fs_reg src,
                                   fs_inst *pre_rhs_inst,
                                   fs_inst *last_rhs_inst)
{
   /* Only attempt if we're doing a direct assignment. */
   if (ir->condition ||
       !(ir->lhs->type->is_scalar() ||
         (ir->lhs->type->is_vector() &&
          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
      return false;

   /* Make sure the last instruction generated our source reg. */
   fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
                                                    last_rhs_inst,
                                                    src);
   if (!modify)
      return false;

   /* If last_rhs_inst wrote a different number of components than our LHS,
    * we can't safely rewrite it.
    */
   if (virtual_grf_sizes[dst.reg] != modify->regs_written)
      return false;

   /* Success!  Rewrite the instruction. */
   modify->dst = dst;

   return true;
}

void
fs_visitor::visit(ir_assignment *ir)
{
   fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();

   ir->rhs->accept(this);
   r = this->result;

   fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
      return;

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(MOV(l, r));
            if (ir->condition)
               inst->predicate = BRW_PREDICATE_NORMAL;
            r.reg_offset++;
         }
         l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg dPdy)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
         coordinate.reg_offset++;
      }

      /* gen4's SIMD8 sampler always has the slots for u,v,r present.
       * The unused slots must be zeroed.
       */
      for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
      }
      mlen += 3;

      if (ir->op == ir_tex) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
         mlen++;
      } else if (ir->op == ir_txb || ir->op == ir_txl) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
         mlen++;
      } else {
         assert(!"Should not get here.");
      }

      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
         coordinate.reg_offset++;
      }
      /* zero the others. */
      for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      fs_reg &dPdx = lod;

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
         coordinate.reg_offset++;
      }
      /* the slots for u and v are always present, but r is optional */
      mlen += MAX2(ir->coordinate->type->vector_elements, 2);

      /* P    = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
         dPdx.reg_offset++;
      }
      mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);

      for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
         dPdy.reg_offset++;
      }
      mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
   } else if (ir->op == ir_txs) {
      /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
      simd16 = true;
      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
      mlen += 2;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      simd16 = true;
      assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
                  coordinate));
         coordinate.reg_offset++;
      }

      /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
       * be necessary for TXF (ld), but seems wise to do for all messages.
       */
      for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
      mlen++;

      /* The unused upper half. */
      mlen++;
   }

   if (simd16) {
      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk.  We'll need to move
       * this weirdness around to the expected layout.
       */
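      /* Illustrative picture of the SIMD16 response layout (not from the
       * original source): the sampler returns 8 registers, R,R,G,G,B,B,A,A,
       * where only the first register of each pair holds our SIMD8 data.
       * The copy loop below walks dst in steps of 2 and orig_dst in steps
       * of 1 to compact the four live registers back together.
       */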
      orig_dst = dst;
      dst = fs_reg(GRF, virtual_grf_alloc(8),
                   (brw->is_g4x ?
                    brw_type_for_base_type(ir->type) :
                    BRW_REGISTER_TYPE_F));
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(SHADER_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(SHADER_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(SHADER_OPCODE_TXD, dst);
      break;
   case ir_txs:
      inst = emit(SHADER_OPCODE_TXS, dst);
      break;
   case ir_txf:
      inst = emit(SHADER_OPCODE_TXF, dst);
      break;
   default:
      fail("unrecognized texture opcode");
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;
   inst->header_present = true;
   inst->regs_written = simd16 ? 8 : 4;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
         emit(MOV(orig_dst, dst));
         orig_dst.reg_offset++;
         dst.reg_offset += 2;
      }
   }

   return inst;
}

/* gen5's sampler has slots for u, v, r, array index, then optional
 * parameters like shadow comparator or LOD bias.  If optional
 * parameters aren't present, those base slots are optional and don't
 * need to be included in the message.
 *
 * We don't fill in the unnecessary slots regardless, which may look
 * surprising in the disassembly.
 */
fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg lod2,
                              fs_reg sample_index)
{
   int mlen = 0;
   int base_mrf = 2;
   int reg_width = dispatch_width / 8;
   bool header_present = false;
   const int vector_elements =
      ir->coordinate ? ir->coordinate->type->vector_elements : 0;

   if (ir->offset) {
      /* The offsets set up by the ir_texture visitor are in the
       * m1 header, so we can't go headerless.
       */
      header_present = true;
      mlen++;
      base_mrf--;
   }

   for (int i = 0; i < vector_elements; i++) {
      emit(MOV(fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
               coordinate));
      coordinate.reg_offset++;
   }
   mlen += vector_elements * reg_width;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, header_present + 4 * reg_width);

      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
      mlen += reg_width;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(SHADER_OPCODE_TEX, dst);
      break;
   case ir_txb:
      mlen = MAX2(mlen, header_present + 4 * reg_width);
      emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
      mlen += reg_width;

      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      mlen = MAX2(mlen, header_present + 4 * reg_width);
      emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
      mlen += reg_width;

      inst = emit(SHADER_OPCODE_TXL, dst);
      break;
   case ir_txd: {
      mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */

      /**
       * P    = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
         lod.reg_offset++;
         mlen += reg_width;

         emit(MOV(fs_reg(MRF, base_mrf + mlen), lod2));
         lod2.reg_offset++;
         mlen += reg_width;
      }

      inst = emit(SHADER_OPCODE_TXD, dst);
      break;
   }
   case ir_txs:
      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
      mlen += reg_width;
      inst = emit(SHADER_OPCODE_TXS, dst);
      break;
   case ir_query_levels:
      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
      mlen += reg_width;
      inst = emit(SHADER_OPCODE_TXS, dst);
      break;
   case ir_txf:
      mlen = header_present + 4 * reg_width;
      emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD),
               lod));
      inst = emit(SHADER_OPCODE_TXF, dst);
      break;
   case ir_txf_ms:
      mlen = header_present + 4 * reg_width;

      /* lod */
      emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD),
               fs_reg(0)));
      /* sample index */
      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD),
               sample_index));
      mlen += reg_width;
      inst = emit(SHADER_OPCODE_TXF_CMS, dst);
      break;
   case ir_lod:
      inst = emit(SHADER_OPCODE_LOD, dst);
      break;
   case ir_tg4:
      inst = emit(SHADER_OPCODE_TG4, dst);
      break;
   default:
      fail("unrecognized texture opcode");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;
   inst->header_present = header_present;
   inst->regs_written = 4;

   if (mlen > MAX_SAMPLER_MESSAGE_SIZE) {
      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
           " disallowed by hardware\n");
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg lod2,
                              fs_reg sample_index, fs_reg mcs, int sampler)
{
   int reg_width = dispatch_width / 8;
   bool header_present = false;

   fs_reg payload = fs_reg(this, glsl_type::float_type);
   fs_reg next = payload;

   if (ir->op == ir_tg4 || (ir->offset && ir->op != ir_txf) || sampler >= 16) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.  Note that for SIMD16 we're making space for two actual
       * hardware registers here, so the emit will have to fix up for this.
       *
       * * ir_tg4 needs to place its channel select in the header,
       *   for interaction with ARB_texture_swizzle
       *
       * The sampler index is only 4-bits, so for larger sampler numbers we
       * need to offset the Sampler State Pointer in the header.
       */
      header_present = true;
      next.reg_offset++;
   }

   if (ir->shadow_comparitor) {
      emit(MOV(next, shadow_c));
      next.reg_offset++;
   }

   bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (ir->op) {
   case ir_tex:
   case ir_lod:
      break;
   case ir_txb:
      emit(MOV(next, lod));
      next.reg_offset++;
      break;
   case ir_txl:
      emit(MOV(next, lod));
      next.reg_offset++;
      break;
   case ir_txd: {
      if (dispatch_width == 16)
         fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(next, coordinate));
         coordinate.reg_offset++;
         next.reg_offset++;

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
            emit(MOV(next, lod));
            lod.reg_offset++;
            next.reg_offset++;

            emit(MOV(next, lod2));
            lod2.reg_offset++;
            next.reg_offset++;
         }
      }

      coordinate_done = true;
      break;
   }
   case ir_txs:
      emit(MOV(retype(next, BRW_REGISTER_TYPE_UD), lod));
      next.reg_offset++;
      break;
   case ir_query_levels:
      emit(MOV(retype(next, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
      next.reg_offset++;
      break;
   case ir_txf:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
      emit(MOV(retype(next, BRW_REGISTER_TYPE_D), coordinate));
      coordinate.reg_offset++;
      next.reg_offset++;

      emit(MOV(retype(next, BRW_REGISTER_TYPE_D), lod));
      next.reg_offset++;

      for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(retype(next, BRW_REGISTER_TYPE_D), coordinate));
         coordinate.reg_offset++;
         next.reg_offset++;
      }

      coordinate_done = true;
      break;
   case ir_txf_ms:
      emit(MOV(retype(next, BRW_REGISTER_TYPE_UD), sample_index));
      next.reg_offset++;

      /* data from the multisample control surface */
      emit(MOV(retype(next, BRW_REGISTER_TYPE_UD), mcs));
      next.reg_offset++;

      /* there is no offsetting for this message; just copy in the integer
       * texture coordinates
       */
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(retype(next, BRW_REGISTER_TYPE_D), coordinate));
         coordinate.reg_offset++;
         next.reg_offset++;
      }

      coordinate_done = true;
      break;
   case ir_tg4:
      if (has_nonconstant_offset) {
         if (ir->shadow_comparitor && dispatch_width == 16)
            fail("Gen7 does not support gather4_po_c in SIMD16 mode.");

         /* More crazy intermixing */
         ir->offset->accept(this);
         fs_reg offset_value = this->result;

         for (int i = 0; i < 2; i++) { /* u, v */
            emit(MOV(next, coordinate));
            coordinate.reg_offset++;
            next.reg_offset++;
         }

         for (int i = 0; i < 2; i++) { /* offu, offv */
            emit(MOV(retype(next, BRW_REGISTER_TYPE_D), offset_value));
            offset_value.reg_offset++;
            next.reg_offset++;
         }

         if (ir->coordinate->type->vector_elements == 3) { /* r if present */
            emit(MOV(next, coordinate));
            coordinate.reg_offset++;
            next.reg_offset++;
         }

         coordinate_done = true;
      }
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (ir->coordinate && !coordinate_done) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(next, coordinate));
         coordinate.reg_offset++;
         next.reg_offset++;
      }
   }

   /* Generate the SEND */
   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst, payload); break;
   case ir_txb: inst = emit(FS_OPCODE_TXB, dst, payload); break;
   case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst, payload); break;
   case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst, payload); break;
   case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst, payload); break;
   case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_CMS, dst, payload); break;
   case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
   case ir_query_levels: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
   case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst, payload); break;
   case ir_tg4:
      if (has_nonconstant_offset)
         inst = emit(SHADER_OPCODE_TG4_OFFSET, dst, payload);
      else
         inst = emit(SHADER_OPCODE_TG4, dst, payload);
      break;
   }
   inst->base_mrf = -1;
   if (reg_width == 2)
      inst->mlen = next.reg_offset * reg_width - header_present;
   else
      inst->mlen = next.reg_offset * reg_width;
   inst->header_present = header_present;
   inst->regs_written = 4;

   virtual_grf_sizes[payload.reg] = next.reg_offset;
   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
           " disallowed by hardware\n");
   }

   return inst;
}

fs_reg
fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
                             bool is_rect, int sampler, int texunit)
{
   fs_inst *inst = NULL;
   bool needs_gl_clamp = true;
   fs_reg scale_x, scale_y;

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (is_rect &&
       (brw->gen < 6 ||
        (brw->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
                           c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
      struct gl_program_parameter_list *params = prog->Parameters;
      int tokens[STATE_LENGTH] = {
         STATE_INTERNAL,
         STATE_TEXRECT_SCALE,
         texunit,
         0,
         0
      };

      if (dispatch_width == 16) {
         fail("rectangle scale uniform setup not supported on SIMD16\n");
         return coordinate;
      }

      scale_x = fs_reg(UNIFORM, uniforms);
      scale_y = fs_reg(UNIFORM, uniforms + 1);

      GLuint index = _mesa_add_state_reference(params,
                                               (gl_state_index *)tokens);
      stage_prog_data->param[uniforms++] =
         &prog->Parameters->ParameterValues[index][0].f;
      stage_prog_data->param[uniforms++] =
         &prog->Parameters->ParameterValues[index][1].f;
   }

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (brw->gen < 6 && is_rect) {
      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(MUL(dst, src, scale_x));
      dst.reg_offset++;
      src.reg_offset++;
      emit(MUL(dst, src, scale_y));
   } else if (is_rect) {
      /* On gen6+, the sampler handles the rectangle coordinates
       * natively, without needing rescaling.  But that means we have
       * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
       * not [0, 1] like the default case below.
       */
      needs_gl_clamp = false;

      for (int i = 0; i < 2; i++) {
         if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
            fs_reg chan = coordinate;
            chan.reg_offset += i;

            inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
            inst->conditional_mod = BRW_CONDITIONAL_G;

            /* Our parameter comes in as 1.0/width or 1.0/height,
             * because that's what people normally want for doing
             * texture rectangle handling.  We need width or height
             * for clamping, but we don't care enough to make a new
             * parameter type, so just invert back.
             */
            fs_reg limit = fs_reg(this, glsl_type::float_type);
            emit(MOV(limit, i == 0 ? scale_x : scale_y));
            emit(SHADER_OPCODE_RCP, limit, limit);

            inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
            inst->conditional_mod = BRW_CONDITIONAL_L;
         }
      }
   }

   if (ir->coordinate && needs_gl_clamp) {
      for (unsigned int i = 0;
           i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
         if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
            fs_reg chan = coordinate;
            chan.reg_offset += i;

            fs_inst *inst = emit(MOV(chan, chan));
            inst->saturate = true;
         }
      }
   }
   return coordinate;
}

/* Sample from the MCS surface attached to this multisample texture. */
fs_reg
fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, int sampler)
{
   int reg_width = dispatch_width / 8;
   fs_reg payload = fs_reg(this, glsl_type::float_type);
   fs_reg dest = fs_reg(this, glsl_type::uvec4_type);
   fs_reg next = payload;

   /* parameters are: u, v, r, lod; missing parameters are treated as zero */
   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(MOV(retype(next, BRW_REGISTER_TYPE_D), coordinate));
      coordinate.reg_offset++;
      next.reg_offset++;
   }

   fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload);
   virtual_grf_sizes[payload.reg] = next.reg_offset;
   inst->base_mrf = -1;
   inst->mlen = next.reg_offset * reg_width;
   inst->header_present = false;
   inst->regs_written = 4 * reg_width; /* we only care about one reg of
                                        * response, but the sampler always
                                        * writes 4/8
                                        */
   inst->sampler = sampler;

   return dest;
}

void
fs_visitor::visit(ir_texture *ir)
{
   fs_inst *inst = NULL;

   int sampler =
      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
   /* FINISHME: We're failing to recompile our programs when the sampler is
    * updated.  This only matters for the texture rectangle scale parameters
    * (pre-gen6, or gen6+ with GL_CLAMP).
    */
   int texunit = prog->SamplerUnits[sampler];

   if (ir->op == ir_tg4) {
      /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
       * emitting anything other than setting up the constant result.
       */
      ir_constant *chan = ir->lod_info.component->as_constant();
      int swiz = GET_SWZ(c->key.tex.swizzles[sampler], chan->value.i[0]);
      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {

         fs_reg res = fs_reg(this, glsl_type::vec4_type);
         this->result = res;

         for (int i = 0; i < 4; i++) {
            emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
            res.reg_offset++;
         }
         return;
      }
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Should be lowered */
   assert(!ir->offset || !ir->offset->type->is_array());

   /* Generate code to compute all the subexpression trees.  This has to be
    * done before loading any values into MRFs for the sampler message since
    * generating these values may involve SEND messages that need the MRFs.
    */
   fs_reg coordinate;
   if (ir->coordinate) {
      ir->coordinate->accept(this);

      coordinate = rescale_texcoord(ir, this->result,
                                    ir->sampler->type->sampler_dimensionality ==
                                    GLSL_SAMPLER_DIM_RECT,
                                    sampler, texunit);
   }

   fs_reg shadow_comparitor;
   if (ir->shadow_comparitor) {
      ir->shadow_comparitor->accept(this);
      shadow_comparitor = this->result;
   }

   fs_reg lod, lod2, sample_index, mcs;
   switch (ir->op) {
   case ir_tex:
   case ir_lod:
   case ir_tg4:
   case ir_query_levels:
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      lod = this->result;
      break;
   case ir_txd:
      ir->lod_info.grad.dPdx->accept(this);
      lod = this->result;

      ir->lod_info.grad.dPdy->accept(this);
      lod2 = this->result;
      break;
   case ir_txf:
   case ir_txl:
   case ir_txs:
      ir->lod_info.lod->accept(this);
      lod = this->result;
      break;
   case ir_txf_ms:
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;

      if (brw->gen >= 7 &&
          c->key.tex.compressed_multisample_layout_mask & (1 << sampler))
         mcs = emit_mcs_fetch(ir, coordinate, sampler);
      else
         mcs = fs_reg(0u);
      break;
   default:
      assert(!"Unrecognized texture opcode");
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));

   if (brw->gen >= 7) {
      inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
                               lod, lod2, sample_index, mcs, sampler);
   } else if (brw->gen >= 5) {
      inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
                               lod, lod2, sample_index);
   } else {
      inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
                               lod, lod2);
   }

   if (ir->offset != NULL && ir->op != ir_txf)
      inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());

   if (ir->op == ir_tg4)
      inst->texture_offset |= gather_channel(ir, sampler) << 16; // M0.2:16-17

   inst->sampler = sampler;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   /* fixup #layers for cube map arrays */
   if (ir->op == ir_txs) {
      glsl_type const *type = ir->sampler->type;
      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
          type->sampler_array) {
         fs_reg depth = dst;
         depth.reg_offset = 2;
         emit_math(SHADER_OPCODE_INT_QUOTIENT, depth, depth, fs_reg(6));
      }
   }

   if (brw->gen == 6 && ir->op == ir_tg4) {
      emit_gen6_gather_wa(c->key.tex.gen6_gather_wa[sampler], dst);
   }

   swizzle_result(ir, dst, sampler);
}

/**
 * Apply workarounds for Gen6 gather with UINT/SINT
 */
void
fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;

   for (int i = 0; i < 4; i++) {
      fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
      /* Convert from UNORM to UINT */
      emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
      emit(MOV(dst, dst_f));

      if (wa & WA_SIGN) {
         /* Reinterpret the UINT value as a signed INT value by
          * shifting the sign bit into place, then shifting back
          * preserving sign.
          */
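         /* Illustrative, not from the original source: an 8-bit SINT texel
          * of -1 comes back from the UNORM gather as 1.0f; the MUL above
          * turns it into 255, and (255 << 24) >> 24 with an arithmetic
          * shift sign-extends it to 0xFFFFFFFF = -1, as required.
          */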
         emit(SHL(dst, dst, fs_reg(32 - width)));
         emit(ASR(dst, dst, fs_reg(32 - width)));
      }

      dst.reg_offset++;
   }
}

/**
 * Set up the gather channel based on the swizzle, for gather4.
 */
uint32_t
fs_visitor::gather_channel(ir_texture *ir, int sampler)
{
   ir_constant *chan = ir->lod_info.component->as_constant();
   int swiz = GET_SWZ(c->key.tex.swizzles[sampler], chan->value.i[0]);
   switch (swiz) {
   case SWIZZLE_X: return 0;
   case SWIZZLE_Y:
      /* gather4 sampler is broken for green channel on RG32F --
       * we must ask for blue instead.
       */
      if (c->key.tex.gather_channel_quirk_mask & (1 << sampler))
         return 2;
      return 1;
   case SWIZZLE_Z: return 2;
   case SWIZZLE_W: return 3;
   default:
      assert(!"Not reached"); /* zero, one swizzles handled already */
      return 0;
   }
}

/**
 * Swizzle the result of a texture result.  This is necessary for
 * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
 */
void
fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
{
   if (ir->op == ir_query_levels) {
      /* # levels is in .w */
      orig_val.reg_offset += 3;
      this->result = orig_val;
      return;
   }

   this->result = orig_val;

   /* txs,lod don't actually sample the texture, so swizzling the result
    * makes no sense.
    */
   if (ir->op == ir_txs || ir->op == ir_lod || ir->op == ir_tg4)
      return;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
      fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
         fs_reg l = swizzled_result;
         l.reg_offset += i;

         if (swiz == SWIZZLE_ZERO) {
            emit(MOV(l, fs_reg(0.0f)));
         } else if (swiz == SWIZZLE_ONE) {
            emit(MOV(l, fs_reg(1.0f)));
         } else {
            fs_reg r = orig_val;
            r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
            emit(MOV(l, r));
         }
      }
      this->result = swizzled_result;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
         swiz = ir->mask.x;
         break;
      case 1:
         swiz = ir->mask.y;
         break;
      case 2:
         swiz = ir->mask.z;
         break;
      case 3:
         swiz = ir->mask.w;
         break;
      }

      channel.reg_offset += swiz;
      emit(MOV(result, channel));
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   assert(ir->condition == NULL); /* FINISHME */

   /* We track our discarded pixels in f0.1.  By predicating on it, we can
    * update just the flag bits that aren't yet discarded.  By emitting a
    * CMP of g0 != g0, all our currently executing channels will get turned
    * off.
    */
   fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                   BRW_REGISTER_TYPE_UW));
   fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
                           BRW_CONDITIONAL_NZ));
   cmp->predicate = BRW_PREDICATE_NORMAL;
   cmp->flag_subreg = 1;

   if (brw->gen >= 6) {
      /* For performance, after a discard, jump to the end of the shader.
       * However, many people will do foliage by discarding based on a
       * texture's alpha mask, and then continue on to texture with the
       * remaining pixels.  To avoid trashing the derivatives for those
       * texture samples, we'll only jump if all of the pixels in the subspan
       * have been discarded.
       */
      fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
      discard_jump->flag_subreg = 1;
      discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
      discard_jump->predicate_inverse = true;
   }
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
         ir->array_elements[i]->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(MOV(dst_reg, src_reg));
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
         ir_constant *const field = (ir_constant *) node;
         const unsigned size = type_size(field->type);

         field->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(MOV(dst_reg, src_reg));
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
         switch (ir->type->base_type) {
         case GLSL_TYPE_FLOAT:
            emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
            break;
         case GLSL_TYPE_UINT:
            emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
            break;
         case GLSL_TYPE_INT:
            emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
            break;
         case GLSL_TYPE_BOOL:
            emit(MOV(dst_reg, fs_reg((int)ir->value.b[i])));
            break;
         default:
            assert(!"Non-float/uint/int/bool constant");
         }
         dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr &&
       expr->operation != ir_binop_logic_and &&
       expr->operation != ir_binop_logic_or &&
       expr->operation != ir_binop_logic_xor) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;

         resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_unop_f2b:
         if (brw->gen >= 6) {
            emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(reg_null_f, op[0]));
1987 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1988 }
1989 break;
1990
1991 case ir_unop_i2b:
1992 if (brw->gen >= 6) {
1993 emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1994 } else {
1995 inst = emit(MOV(reg_null_d, op[0]));
1996 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1997 }
1998 break;
1999
2000 case ir_binop_greater:
2001 case ir_binop_gequal:
2002 case ir_binop_less:
2003 case ir_binop_lequal:
2004 case ir_binop_equal:
2005 case ir_binop_all_equal:
2006 case ir_binop_nequal:
2007 case ir_binop_any_nequal:
2008 resolve_bool_comparison(expr->operands[0], &op[0]);
2009 resolve_bool_comparison(expr->operands[1], &op[1]);
2010
2011 emit(CMP(reg_null_d, op[0], op[1],
2012 brw_conditional_for_comparison(expr->operation)));
2013 break;
2014
2015 default:
2016 assert(!"not reached");
2017 fail("bad cond code\n");
2018 break;
2019 }
2020 return;
2021 }
2022
2023 ir->accept(this);
2024
2025 fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
2026 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2027 }
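
/* Editor's example: a scalar comparison such as the condition of
 * "if (x < y)" arrives here as an ir_binop_less expression and takes the
 * fast path above, producing a single
 *
 *    CMP.L.f0 null, x, y
 *
 * whereas an opaque boolean value (e.g. one read from a uniform) falls
 * through to the generic path and becomes
 *
 *    AND.NZ.f0 null, b, 1
 *
 * since only the low bit of a bool is defined.
 */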
2028
2029 /**
2030 * Emit a gen6 IF statement with the comparison folded into the IF
2031 * instruction.
2032 */
2033 void
2034 fs_visitor::emit_if_gen6(ir_if *ir)
2035 {
2036 ir_expression *expr = ir->condition->as_expression();
2037
2038 if (expr) {
2039 fs_reg op[2];
2040 fs_inst *inst;
2042
2043 assert(expr->get_num_operands() <= 2);
2044 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2045 assert(expr->operands[i]->type->is_scalar());
2046
2047 expr->operands[i]->accept(this);
2048 op[i] = this->result;
2049 }
2050
2051 switch (expr->operation) {
2052 case ir_unop_logic_not:
2053 case ir_binop_logic_xor:
2054 case ir_binop_logic_or:
2055 case ir_binop_logic_and:
2056 /* For operations on bool arguments, only the low bit of the bool is
2057 * valid, and the others are undefined. Fall back to the condition
2058 * code path.
2059 */
2060 break;
2061
2062 case ir_unop_f2b:
2063 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
2064 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2065 return;
2066
2067 case ir_unop_i2b:
2068 emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2069 return;
2070
2071 case ir_binop_greater:
2072 case ir_binop_gequal:
2073 case ir_binop_less:
2074 case ir_binop_lequal:
2075 case ir_binop_equal:
2076 case ir_binop_all_equal:
2077 case ir_binop_nequal:
2078 case ir_binop_any_nequal:
2079 resolve_bool_comparison(expr->operands[0], &op[0]);
2080 resolve_bool_comparison(expr->operands[1], &op[1]);
2081
2082 emit(IF(op[0], op[1],
2083 brw_conditional_for_comparison(expr->operation)));
2084 return;
2085 default:
2086 assert(!"not reached");
2087 emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2088 fail("bad condition\n");
2089 return;
2090 }
2091 }
2092
2093 emit_bool_to_cond_code(ir->condition);
2094 fs_inst *inst = emit(BRW_OPCODE_IF);
2095 inst->predicate = BRW_PREDICATE_NORMAL;
2096 }
2097
2098 /**
2099 * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
2100 *
2101 * Many GLSL shaders contain the following pattern:
2102 *
2103 * x = condition ? foo : bar
2104 *
2105 * The compiler emits an ir_if tree for this, since each subexpression might be
2106 * a complex tree that could have side-effects or short-circuit logic.
2107 *
2108 * However, the common case is to simply select one of two constants or
2109 * variable values -- which is exactly what SEL is for. In this case, the
2110 * assembly looks like:
2111 *
2112 * (+f0) IF
2113 * MOV dst src0
2114 * ELSE
2115 * MOV dst src1
2116 * ENDIF
2117 *
2118 * which can be easily translated into:
2119 *
2120 * (+f0) SEL dst src0 src1
2121 *
2122 * If src0 is an immediate value, we promote it to a temporary GRF.
2123 */
2124 void
2125 fs_visitor::try_replace_with_sel()
2126 {
2127 fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
2128 assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
2129
2130 /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
2131 int opcodes[] = {
2132 BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
2133 };
2134
2135 fs_inst *match = (fs_inst *) endif_inst->prev;
2136 for (int i = 0; i < 4; i++) {
2137 if (match->is_head_sentinel() || match->opcode != opcodes[4 - i - 1])
2138 return;
2139 match = (fs_inst *) match->prev;
2140 }
2141
2142 /* The opcodes match; it looks like the right sequence of instructions. */
2143 fs_inst *else_mov = (fs_inst *) endif_inst->prev;
2144 fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
2145 fs_inst *if_inst = (fs_inst *) then_mov->prev;
2146
2147 /* Check that the MOVs are the right form. */
2148 if (then_mov->dst.equals(else_mov->dst) &&
2149 !then_mov->is_partial_write() &&
2150 !else_mov->is_partial_write()) {
2151
2152 /* Remove the matched instructions; we'll emit a SEL to replace them. */
2153 while (!if_inst->next->is_tail_sentinel())
2154 if_inst->next->remove();
2155 if_inst->remove();
2156
2157 /* Only the last source register can be a constant, so if the MOV in
2158 * the "then" clause uses a constant, we need to put it in a temporary.
2159 */
2160 fs_reg src0(then_mov->src[0]);
2161 if (src0.file == IMM) {
2162 src0 = fs_reg(this, glsl_type::float_type);
2163 src0.type = then_mov->src[0].type;
2164 emit(MOV(src0, then_mov->src[0]));
2165 }
2166
2167 fs_inst *sel;
2168 if (if_inst->conditional_mod) {
2169 /* Sandybridge-specific IF with embedded comparison */
2170 emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
2171 if_inst->conditional_mod));
2172 sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2173 sel->predicate = BRW_PREDICATE_NORMAL;
2174 } else {
2175 /* Separate CMP and IF instructions */
2176 sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2177 sel->predicate = if_inst->predicate;
2178 sel->predicate_inverse = if_inst->predicate_inverse;
2179 }
2180 }
2181 }
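
/* Editor's example: on Sandybridge, where the comparison is embedded in the
 * IF itself, the sequence
 *
 *    IF.G src0 src1
 *    MOV  dst, a
 *    ELSE
 *    MOV  dst, b
 *    ENDIF
 *
 * is rewritten by try_replace_with_sel() into
 *
 *          CMP.G.f0 null, src0, src1
 *    (+f0) SEL      dst, a, b
 *
 * while on other generations the existing predicate is copied onto the SEL
 * directly.
 */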
2182
2183 void
2184 fs_visitor::visit(ir_if *ir)
2185 {
2186 if (brw->gen < 6 && dispatch_width == 16) {
2187 fail("Can't support (non-uniform) control flow on SIMD16\n");
2188 }
2189
2190 /* Don't point the annotation at the if statement, because then it plus
2191 * the then and else blocks get printed.
2192 */
2193 this->base_ir = ir->condition;
2194
2195 if (brw->gen == 6) {
2196 emit_if_gen6(ir);
2197 } else {
2198 emit_bool_to_cond_code(ir->condition);
2199
2200 emit(IF(BRW_PREDICATE_NORMAL));
2201 }
2202
2203 foreach_list(node, &ir->then_instructions) {
2204 ir_instruction *ir = (ir_instruction *)node;
2205 this->base_ir = ir;
2206
2207 ir->accept(this);
2208 }
2209
2210 if (!ir->else_instructions.is_empty()) {
2211 emit(BRW_OPCODE_ELSE);
2212
2213 foreach_list(node, &ir->else_instructions) {
2214 ir_instruction *ir = (ir_instruction *)node;
2215 this->base_ir = ir;
2216
2217 ir->accept(this);
2218 }
2219 }
2220
2221 emit(BRW_OPCODE_ENDIF);
2222
2223 try_replace_with_sel();
2224 }
2225
2226 void
2227 fs_visitor::visit(ir_loop *ir)
2228 {
2229 if (brw->gen < 6 && dispatch_width == 16) {
2230 fail("Can't support (non-uniform) control flow on SIMD16\n");
2231 }
2232
2233 this->base_ir = NULL;
2234 emit(BRW_OPCODE_DO);
2235
2236 foreach_list(node, &ir->body_instructions) {
2237 ir_instruction *ir = (ir_instruction *)node;
2238
2239 this->base_ir = ir;
2240 ir->accept(this);
2241 }
2242
2243 this->base_ir = NULL;
2244 emit(BRW_OPCODE_WHILE);
2245 }
2246
2247 void
2248 fs_visitor::visit(ir_loop_jump *ir)
2249 {
2250 switch (ir->mode) {
2251 case ir_loop_jump::jump_break:
2252 emit(BRW_OPCODE_BREAK);
2253 break;
2254 case ir_loop_jump::jump_continue:
2255 emit(BRW_OPCODE_CONTINUE);
2256 break;
2257 }
2258 }
2259
2260 void
2261 fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2262 {
2263 ir_dereference *deref = static_cast<ir_dereference *>(
2264 ir->actual_parameters.get_head());
2265 ir_variable *location = deref->variable_referenced();
2266 unsigned surf_index = (c->prog_data.base.binding_table.abo_start +
2267 location->data.atomic.buffer_index);
2268
2269 /* Calculate the surface offset */
2270 fs_reg offset(this, glsl_type::uint_type);
2271 ir_dereference_array *deref_array = deref->as_dereference_array();
2272
2273 if (deref_array) {
2274 deref_array->array_index->accept(this);
2275
2276 fs_reg tmp(this, glsl_type::uint_type);
2277 emit(MUL(tmp, this->result, ATOMIC_COUNTER_SIZE));
2278 emit(ADD(offset, tmp, location->data.atomic.offset));
2279 } else {
2280 offset = location->data.atomic.offset;
2281 }
2282
2283 /* Emit the appropriate machine instruction */
2284 const char *callee = ir->callee->function_name();
2285 ir->return_deref->accept(this);
2286 fs_reg dst = this->result;
2287
2288 if (!strcmp("__intrinsic_atomic_read", callee)) {
2289 emit_untyped_surface_read(surf_index, dst, offset);
2290
2291 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2292 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2293 fs_reg(), fs_reg());
2294
2295 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2296 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2297 fs_reg(), fs_reg());
2298 }
2299 }
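
/* Editor's worked example: for a hypothetical declaration
 *
 *    layout(binding = 0) uniform atomic_uint counters[4];
 *
 * atomicCounterIncrement(counters[i]) reaches us as an array dereference,
 * so the MUL/ADD pair above computes
 *
 *    offset = i * ATOMIC_COUNTER_SIZE + base offset of counters
 *
 * while a non-array counter passes its constant buffer offset through
 * unchanged.
 */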
2300
2301 void
2302 fs_visitor::visit(ir_call *ir)
2303 {
2304 const char *callee = ir->callee->function_name();
2305
2306 if (!strcmp("__intrinsic_atomic_read", callee) ||
2307 !strcmp("__intrinsic_atomic_increment", callee) ||
2308 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2309 visit_atomic_counter_intrinsic(ir);
2310 } else {
2311 assert(!"Unsupported intrinsic.");
2312 }
2313 }
2314
2315 void
2316 fs_visitor::visit(ir_return *ir)
2317 {
2318 assert(!"FINISHME");
2319 }
2320
2321 void
2322 fs_visitor::visit(ir_function *ir)
2323 {
2324 /* Ignore function bodies other than main() -- we shouldn't see calls to
2325 * them, since they should all be inlined by the time we get here.
2326 */
2327 if (strcmp(ir->name, "main") == 0) {
2328 const ir_function_signature *sig;
2329 exec_list empty;
2330
2331 sig = ir->matching_signature(NULL, &empty);
2332
2333 assert(sig);
2334
2335 foreach_list(node, &sig->body) {
2336 ir_instruction *ir = (ir_instruction *)node;
2337 this->base_ir = ir;
2338
2339 ir->accept(this);
2340 }
2341 }
2342 }
2343
2344 void
2345 fs_visitor::visit(ir_function_signature *ir)
2346 {
2347 assert(!"not reached");
2348 (void)ir;
2349 }
2350
2351 void
2352 fs_visitor::visit(ir_emit_vertex *)
2353 {
2354 assert(!"not reached");
2355 }
2356
2357 void
2358 fs_visitor::visit(ir_end_primitive *)
2359 {
2360 assert(!"not reached");
2361 }
2362
2363 void
2364 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2365 fs_reg dst, fs_reg offset, fs_reg src0,
2366 fs_reg src1)
2367 {
2368 const unsigned operand_len = dispatch_width / 8;
2369 unsigned mlen = 0;
2370
2371 /* Initialize the sample mask in the message header. */
2372 emit(MOV(brw_uvec_mrf(8, mlen, 0), brw_imm_ud(0)))
2373 ->force_writemask_all = true;
2374
2375 if (fp->UsesKill) {
2376 emit(MOV(brw_uvec_mrf(1, mlen, 7), brw_flag_reg(0, 1)))
2377 ->force_writemask_all = true;
2378 } else {
2379 emit(MOV(brw_uvec_mrf(1, mlen, 7),
2380 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
2381 ->force_writemask_all = true;
2382 }
2383
2384 mlen++;
2385
2386 /* Set the atomic operation offset. */
2387 emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), offset));
2388 mlen += operand_len;
2389
2390 /* Set the atomic operation arguments. */
2391 if (src0.file != BAD_FILE) {
2392 emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), src0));
2393 mlen += operand_len;
2394 }
2395
2396 if (src1.file != BAD_FILE) {
2397 emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), src1));
2398 mlen += operand_len;
2399 }
2400
2401 /* Emit the instruction. */
2402 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2403 atomic_op, surf_index);
2404 inst->base_mrf = 0;
2405 inst->mlen = mlen;
2406 emit(inst);
2407 }
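
/* Editor's example of the resulting message layout: a SIMD16 atomic
 * increment (no src0/src1) occupies
 *
 *    m0:       header (sample mask)        1 register
 *    m1..m2:   per-channel offsets         dispatch_width / 8 = 2 registers
 *
 * for an mlen of 3; a SIMD8 atomic with one data argument is likewise
 * header + offset + src0 = 3 registers.
 */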
2408
2409 void
2410 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
2411 fs_reg offset)
2412 {
2413 const unsigned operand_len = dispatch_width / 8;
2414 unsigned mlen = 0;
2415
2416 /* Initialize the sample mask in the message header. */
2417 emit(MOV(brw_uvec_mrf(8, mlen, 0), brw_imm_ud(0)))
2418 ->force_writemask_all = true;
2419
2420 if (fp->UsesKill) {
2421 emit(MOV(brw_uvec_mrf(1, mlen, 7), brw_flag_reg(0, 1)))
2422 ->force_writemask_all = true;
2423 } else {
2424 emit(MOV(brw_uvec_mrf(1, mlen, 7),
2425 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
2426 ->force_writemask_all = true;
2427 }
2428
2429 mlen++;
2430
2431 /* Set the surface read offset. */
2432 emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), offset));
2433 mlen += operand_len;
2434
2435 /* Emit the instruction. */
2436 fs_inst *inst = new(mem_ctx)
2437 fs_inst(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, surf_index);
2438 inst->base_mrf = 0;
2439 inst->mlen = mlen;
2440 emit(inst);
2441 }
2442
2443 fs_inst *
2444 fs_visitor::emit(fs_inst *inst)
2445 {
2446 if (force_uncompressed_stack > 0)
2447 inst->force_uncompressed = true;
2448
2449 inst->annotation = this->current_annotation;
2450 inst->ir = this->base_ir;
2451
2452 this->instructions.push_tail(inst);
2453
2454 return inst;
2455 }
2456
2457 void
2458 fs_visitor::emit(exec_list list)
2459 {
2460 foreach_list_safe(node, &list) {
2461 fs_inst *inst = (fs_inst *)node;
2462 inst->remove();
2463 emit(inst);
2464 }
2465 }
2466
2467 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
2468 void
2469 fs_visitor::emit_dummy_fs()
2470 {
2471 int reg_width = dispatch_width / 8;
2472
2473 /* Everyone's favorite color. */
2474 emit(MOV(fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f)));
2475 emit(MOV(fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f)));
2476 emit(MOV(fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f)));
2477 emit(MOV(fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f)));
2478
2479 fs_inst *write;
2480 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
2481 write->base_mrf = 2;
2482 write->mlen = 4 * reg_width;
2483 write->eot = true;
2484 }
2485
2486 /* The register location here is relative to the start of the URB
2487 * data. It will get adjusted to be a real location before
2488 * generate_code() time.
2489 */
2490 struct brw_reg
2491 fs_visitor::interp_reg(int location, int channel)
2492 {
2493 int regnr = c->prog_data.urb_setup[location] * 2 + channel / 2;
2494 int stride = (channel & 1) * 4;
2495
2496 assert(c->prog_data.urb_setup[location] != -1);
2497
2498 return brw_vec1_grf(regnr, stride);
2499 }
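
/* Editor's worked example: for a varying whose slot landed at
 * urb_setup[location] == 2, channel 3 (w) gives
 *
 *    regnr  = 2 * 2 + 3 / 2 = 5
 *    stride = (3 & 1) * 4   = 4
 *
 * i.e. the upper half of the slot's second register, since the setup data
 * packs two channels per register at sub-register offsets 0 and 4.
 */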
2500
2501 /** Emits the interpolation for the varying inputs. */
2502 void
2503 fs_visitor::emit_interpolation_setup_gen4()
2504 {
2505 this->current_annotation = "compute pixel centers";
2506 this->pixel_x = fs_reg(this, glsl_type::uint_type);
2507 this->pixel_y = fs_reg(this, glsl_type::uint_type);
2508 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
2509 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
2510
2511 emit(FS_OPCODE_PIXEL_X, this->pixel_x);
2512 emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
2513
2514 this->current_annotation = "compute pixel deltas from v0";
2515 if (brw->has_pln) {
2516 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2517 fs_reg(this, glsl_type::vec2_type);
2518 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2519 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
2520 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
2521 } else {
2522 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2523 fs_reg(this, glsl_type::float_type);
2524 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2525 fs_reg(this, glsl_type::float_type);
2526 }
2527 emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2528 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
2529 emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2530 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
2531
2532 this->current_annotation = "compute pos.w and 1/pos.w";
2533 /* Compute wpos.w. It's always in our setup, since it's needed to
2534 * interpolate the other attributes.
2535 */
2536 this->wpos_w = fs_reg(this, glsl_type::float_type);
2537 emit(FS_OPCODE_LINTERP, wpos_w,
2538 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2539 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2540 interp_reg(VARYING_SLOT_POS, 3));
2541 /* Compute the pixel 1/W value from wpos.w. */
2542 this->pixel_w = fs_reg(this, glsl_type::float_type);
2543 emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
2544 this->current_annotation = NULL;
2545 }
2546
2547 /** Emits the interpolation for the varying inputs. */
2548 void
2549 fs_visitor::emit_interpolation_setup_gen6()
2550 {
2551 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2552
2553 /* If the pixel centers end up used, the setup is the same as for gen4. */
2554 this->current_annotation = "compute pixel centers";
2555 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2556 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2557 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2558 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2559 emit(ADD(int_pixel_x,
2560 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2561 fs_reg(brw_imm_v(0x10101010))));
2562 emit(ADD(int_pixel_y,
2563 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2564 fs_reg(brw_imm_v(0x11001100))));
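
/* Editor's note: brw_imm_v packs eight 4-bit immediates, low nibble first,
 * so 0x10101010 supplies the per-channel X offsets <0,1,0,1,0,1,0,1> and
 * 0x11001100 the Y offsets <0,0,1,1,0,0,1,1> -- i.e. the 2x2 subspan
 * layout added to the upper-left pixel coordinates read from g1.
 */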
2565
2566 /* As of gen6, we can no longer mix float and int sources. We have
2567 * to turn the integer pixel centers into floats for their actual
2568 * use.
2569 */
2570 this->pixel_x = fs_reg(this, glsl_type::float_type);
2571 this->pixel_y = fs_reg(this, glsl_type::float_type);
2572 emit(MOV(this->pixel_x, int_pixel_x));
2573 emit(MOV(this->pixel_y, int_pixel_y));
2574
2575 this->current_annotation = "compute pos.w";
2576 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2577 this->wpos_w = fs_reg(this, glsl_type::float_type);
2578 emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
2579
2580 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2581 uint8_t reg = c->barycentric_coord_reg[i];
2582 this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
2583 this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
2584 }
2585
2586 this->current_annotation = NULL;
2587 }
2588
2589 void
2590 fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
2591 {
2592 int reg_width = dispatch_width / 8;
2593 fs_inst *inst;
2594 fs_reg color = outputs[target];
2595 fs_reg mrf;
2596
2597 /* If there's no color data to be written, skip it. */
2598 if (color.file == BAD_FILE)
2599 return;
2600
2601 color.reg_offset += index;
2602
2603 if (dispatch_width == 8 || brw->gen >= 6) {
2604 /* SIMD8 write looks like:
2605 * m + 0: r0
2606 * m + 1: r1
2607 * m + 2: g0
2608 * m + 3: g1
2609 *
2610 * gen6 SIMD16 DP write looks like:
2611 * m + 0: r0
2612 * m + 1: r1
2613 * m + 2: g0
2614 * m + 3: g1
2615 * m + 4: b0
2616 * m + 5: b1
2617 * m + 6: a0
2618 * m + 7: a1
2619 */
2620 inst = emit(MOV(fs_reg(MRF, first_color_mrf + index * reg_width,
2621 color.type),
2622 color));
2623 inst->saturate = c->key.clamp_fragment_color;
2624 } else {
2625 /* pre-gen6 SIMD16 single source DP write looks like:
2626 * m + 0: r0
2627 * m + 1: g0
2628 * m + 2: b0
2629 * m + 3: a0
2630 * m + 4: r1
2631 * m + 5: g1
2632 * m + 6: b1
2633 * m + 7: a1
2634 */
2635 if (brw->has_compr4) {
2636 /* By setting the high bit of the MRF register number, we
2637 * indicate that we want COMPR4 mode - instead of doing the
2638 * usual destination + 1 for the second half we get
2639 * destination + 4.
2640 */
2641 inst = emit(MOV(fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
2642 color.type),
2643 color));
2644 inst->saturate = c->key.clamp_fragment_color;
2645 } else {
2646 push_force_uncompressed();
2647 inst = emit(MOV(fs_reg(MRF, first_color_mrf + index, color.type),
2648 color));
2649 inst->saturate = c->key.clamp_fragment_color;
2650 pop_force_uncompressed();
2651
2652 inst = emit(MOV(fs_reg(MRF, first_color_mrf + index + 4, color.type),
2653 half(color, 1)));
2654 inst->force_sechalf = true;
2655 inst->saturate = c->key.clamp_fragment_color;
2656 }
2657 }
2658 }
2659
2660 static int
2661 cond_for_alpha_func(GLenum func)
2662 {
2663 switch (func) {
2664 case GL_GREATER:
2665 return BRW_CONDITIONAL_G;
2666 case GL_GEQUAL:
2667 return BRW_CONDITIONAL_GE;
2668 case GL_LESS:
2669 return BRW_CONDITIONAL_L;
2670 case GL_LEQUAL:
2671 return BRW_CONDITIONAL_LE;
2672 case GL_EQUAL:
2673 return BRW_CONDITIONAL_EQ;
2674 case GL_NOTEQUAL:
2675 return BRW_CONDITIONAL_NEQ;
2676 default:
2677 assert(!"Not reached");
2678 return 0;
2679 }
2680 }
2681
2682 /**
2683 * Alpha test support for when we compile it into the shader instead
2684 * of using the normal fixed-function alpha test.
2685 */
2686 void
2687 fs_visitor::emit_alpha_test()
2688 {
2689 this->current_annotation = "Alpha test";
2690
2691 fs_inst *cmp;
2692 if (c->key.alpha_test_func == GL_ALWAYS)
2693 return;
2694
2695 if (c->key.alpha_test_func == GL_NEVER) {
2696 /* f0.1 = 0 */
2697 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
2698 BRW_REGISTER_TYPE_UW));
2699 cmp = emit(CMP(reg_null_f, some_reg, some_reg,
2700 BRW_CONDITIONAL_NEQ));
2701 } else {
2702 /* RT0 alpha */
2703 fs_reg color = outputs[0];
2704 color.reg_offset += 3;
2705
2706 /* f0.1 &= func(color, ref) */
2707 cmp = emit(CMP(reg_null_f, color, fs_reg(c->key.alpha_test_ref),
2708 cond_for_alpha_func(c->key.alpha_test_func)));
2709 }
2710 cmp->predicate = BRW_PREDICATE_NORMAL;
2711 cmp->flag_subreg = 1;
2712 }
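
/* Editor's example: with alpha_test_func == GL_LESS and alpha_test_ref ==
 * 0.5, the shader-baked test above becomes
 *
 *    (+f0.1) CMP.L.f0.1 null, color.a, 0.5f
 *
 * predicated so that only channels still live after any discards update
 * the discard flag in f0.1.
 */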
2713
2714 void
2715 fs_visitor::emit_fb_writes()
2716 {
2717 this->current_annotation = "FB write header";
2718 bool header_present = true;
2719 /* We can potentially have a message length of up to 15, so we have to set
2720 * base_mrf to either 0 or 1 in order to fit in m0..m15.
2721 */
2722 int base_mrf = 1;
2723 int nr = base_mrf;
2724 int reg_width = dispatch_width / 8;
2725 bool do_dual_src = this->dual_src_output.file != BAD_FILE;
2726 bool src0_alpha_to_render_target = false;
2727
2728 if (dispatch_width == 16 && do_dual_src) {
2729 fail("GL_ARB_blend_func_extended not yet supported in SIMD16.");
2730 do_dual_src = false;
2731 }
2732
2733 /* From the Sandy Bridge PRM, volume 4, page 198:
2734 *
2735 * "Dispatched Pixel Enables. One bit per pixel indicating
2736 * which pixels were originally enabled when the thread was
2737 * dispatched. This field is only required for the end-of-
2738 * thread message and on all dual-source messages."
2739 */
2740 if (brw->gen >= 6 &&
2741 (brw->is_haswell || brw->gen >= 8 || !this->fp->UsesKill) &&
2742 !do_dual_src &&
2743 c->key.nr_color_regions == 1) {
2744 header_present = false;
2745 }
2746
2747 if (header_present) {
2748 src0_alpha_to_render_target = brw->gen >= 6 &&
2749 !do_dual_src &&
2750 c->key.replicate_alpha;
2751 /* m2, m3 header */
2752 nr += 2;
2753 }
2754
2755 if (c->aa_dest_stencil_reg) {
2756 push_force_uncompressed();
2757 emit(MOV(fs_reg(MRF, nr++),
2758 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
2759 pop_force_uncompressed();
2760 }
2761
2762 c->prog_data.uses_omask =
2763 fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
2764 if (c->prog_data.uses_omask) {
2765 this->current_annotation = "FB write oMask";
2766 assert(this->sample_mask.file != BAD_FILE);
2767 /* Hand over gl_SampleMask. Only lower 16 bits are relevant. */
2768 emit(FS_OPCODE_SET_OMASK, fs_reg(MRF, nr, BRW_REGISTER_TYPE_UW), this->sample_mask);
2769 nr += 1;
2770 }
2771
2772 /* Reserve space for color. It'll be filled in per MRT below. */
2773 int color_mrf = nr;
2774 nr += 4 * reg_width;
2775 if (do_dual_src)
2776 nr += 4;
2777 if (src0_alpha_to_render_target)
2778 nr += reg_width;
2779
2780 if (c->source_depth_to_render_target) {
2781 if (brw->gen == 6 && dispatch_width == 16) {
2782 /* For outputting oDepth on gen6, SIMD8 writes have to be
2783 * used. This would require SIMD8 moves of each half to
2784 * message regs, kind of like pre-gen5 SIMD16 FB writes.
2785 * Just bail on doing so for now.
2786 */
2787 fail("Missing support for simd16 depth writes on gen6\n");
2788 }
2789
2790 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2791 /* Hand over gl_FragDepth. */
2792 assert(this->frag_depth.file != BAD_FILE);
2793 emit(MOV(fs_reg(MRF, nr), this->frag_depth));
2794 } else {
2795 /* Pass through the payload depth. */
2796 emit(MOV(fs_reg(MRF, nr),
2797 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
2798 }
2799 nr += reg_width;
2800 }
2801
2802 if (c->dest_depth_reg) {
2803 emit(MOV(fs_reg(MRF, nr),
2804 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
2805 nr += reg_width;
2806 }
2807
2808 if (do_dual_src) {
2809 fs_reg src0 = this->outputs[0];
2810 fs_reg src1 = this->dual_src_output;
2811
2812 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2813 "FB write src0");
2814 for (int i = 0; i < 4; i++) {
2815 fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + i, src0.type), src0));
2816 src0.reg_offset++;
2817 inst->saturate = c->key.clamp_fragment_color;
2818 }
2819
2820 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2821 "FB write src1");
2822 for (int i = 0; i < 4; i++) {
2823 fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + 4 + i, src1.type),
2824 src1));
2825 src1.reg_offset++;
2826 inst->saturate = c->key.clamp_fragment_color;
2827 }
2828
2829 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2830 emit_shader_time_end();
2831
2832 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2833 inst->target = 0;
2834 inst->base_mrf = base_mrf;
2835 inst->mlen = nr - base_mrf;
2836 inst->eot = true;
2837 inst->header_present = header_present;
2838 if ((brw->gen >= 8 || brw->is_haswell) && fp->UsesKill) {
2839 inst->predicate = BRW_PREDICATE_NORMAL;
2840 inst->flag_subreg = 1;
2841 }
2842
2843 c->prog_data.dual_src_blend = true;
2844 this->current_annotation = NULL;
2845 return;
2846 }
2847
2848 for (int target = 0; target < c->key.nr_color_regions; target++) {
2849 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2850 "FB write target %d",
2851 target);
2852 /* If src0_alpha_to_render_target is true, include source zero alpha
2853 * data in RenderTargetWrite message for targets > 0.
2854 */
2855 int write_color_mrf = color_mrf;
2856 if (src0_alpha_to_render_target && target != 0) {
2857 fs_inst *inst;
2858 fs_reg color = outputs[0];
2859 color.reg_offset += 3;
2860
2861 inst = emit(MOV(fs_reg(MRF, write_color_mrf, color.type),
2862 color));
2863 inst->saturate = c->key.clamp_fragment_color;
2864 write_color_mrf = color_mrf + reg_width;
2865 }
2866
2867 for (unsigned i = 0; i < this->output_components[target]; i++)
2868 emit_color_write(target, i, write_color_mrf);
2869
2870 bool eot = false;
2871 if (target == c->key.nr_color_regions - 1) {
2872 eot = true;
2873
2874 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2875 emit_shader_time_end();
2876 }
2877
2878 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2879 inst->target = target;
2880 inst->base_mrf = base_mrf;
2881 if (src0_alpha_to_render_target && target == 0)
2882 inst->mlen = nr - base_mrf - reg_width;
2883 else
2884 inst->mlen = nr - base_mrf;
2885 inst->eot = eot;
2886 inst->header_present = header_present;
2887 if ((brw->gen >= 8 || brw->is_haswell) && fp->UsesKill) {
2888 inst->predicate = BRW_PREDICATE_NORMAL;
2889 inst->flag_subreg = 1;
2890 }
2891 }
2892
2893 if (c->key.nr_color_regions == 0) {
2894 /* Even if there's no color buffers enabled, we still need to send
2895 * alpha out the pipeline to our null renderbuffer to support
2896 * alpha-testing, alpha-to-coverage, and so on.
2897 */
2898 emit_color_write(0, 3, color_mrf);
2899
2900 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2901 emit_shader_time_end();
2902
2903 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2904 inst->base_mrf = base_mrf;
2905 inst->mlen = nr - base_mrf;
2906 inst->eot = true;
2907 inst->header_present = header_present;
2908 if ((brw->gen >= 8 || brw->is_haswell) && fp->UsesKill) {
2909 inst->predicate = BRW_PREDICATE_NORMAL;
2910 inst->flag_subreg = 1;
2911 }
2912 }
2913
2914 this->current_annotation = NULL;
2915 }
2916
2917 void
2918 fs_visitor::resolve_ud_negate(fs_reg *reg)
2919 {
2920 if (reg->type != BRW_REGISTER_TYPE_UD ||
2921 !reg->negate)
2922 return;
2923
2924 fs_reg temp = fs_reg(this, glsl_type::uint_type);
2925 emit(MOV(temp, *reg));
2926 *reg = temp;
2927 }
2928
2929 void
2930 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
2931 {
2932 if (rvalue->type != glsl_type::bool_type)
2933 return;
2934
2935 fs_reg temp = fs_reg(this, glsl_type::bool_type);
2936 emit(AND(temp, *reg, fs_reg(1)));
2937 *reg = temp;
2938 }
2939
2940 fs_visitor::fs_visitor(struct brw_context *brw,
2941 struct brw_wm_compile *c,
2942 struct gl_shader_program *shader_prog,
2943 struct gl_fragment_program *fp,
2944 unsigned dispatch_width)
2945 : backend_visitor(brw, shader_prog, &fp->Base, &c->prog_data.base,
2946 MESA_SHADER_FRAGMENT),
2947 dispatch_width(dispatch_width)
2948 {
2949 this->c = c;
2950 this->fp = fp;
2951 this->mem_ctx = ralloc_context(NULL);
2952 this->failed = false;
2953 this->variable_ht = hash_table_ctor(0,
2954 hash_table_pointer_hash,
2955 hash_table_pointer_compare);
2956
2957 memset(this->outputs, 0, sizeof(this->outputs));
2958 memset(this->output_components, 0, sizeof(this->output_components));
2959 this->first_non_payload_grf = 0;
2960 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2961
2962 this->current_annotation = NULL;
2963 this->base_ir = NULL;
2964
2965 this->virtual_grf_sizes = NULL;
2966 this->virtual_grf_count = 0;
2967 this->virtual_grf_array_size = 0;
2968 this->virtual_grf_start = NULL;
2969 this->virtual_grf_end = NULL;
2970 this->live_intervals = NULL;
2971 this->regs_live_at_ip = NULL;
2972
2973 this->uniforms = 0;
2974 this->pull_constant_loc = NULL;
2975 this->params_remap = NULL;
2976 this->nr_params_remap = 0;
2977
2978 this->force_uncompressed_stack = 0;
2979
2980 this->spilled_any_registers = false;
2981
2982 this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
2983 }
2984
2985 fs_visitor::~fs_visitor()
2986 {
2987 ralloc_free(this->mem_ctx);
2988 hash_table_dtor(this->variable_ht);
2989 }