i965/fs: Make LOAD_PAYLOAD take a header size
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_visitor.cpp
25 *
26 * This file supports generating the FS LIR from the GLSL IR. The LIR
27 * makes it easier to do backend-specific optimizations than doing so
28 * in the GLSL IR or in the native code.
29 */
30 #include <sys/types.h>
31
32 #include "main/macros.h"
33 #include "main/shaderobj.h"
34 #include "program/prog_parameter.h"
35 #include "program/prog_print.h"
36 #include "program/prog_optimize.h"
37 #include "util/register_allocate.h"
38 #include "program/hash_table.h"
39 #include "brw_context.h"
40 #include "brw_eu.h"
41 #include "brw_wm.h"
42 #include "brw_cs.h"
43 #include "brw_vec4.h"
44 #include "brw_fs.h"
45 #include "main/uniforms.h"
46 #include "glsl/glsl_types.h"
47 #include "glsl/ir_optimization.h"
48 #include "program/sampler.h"
49
50
51 fs_reg *
52 fs_visitor::emit_vs_system_value(int location)
53 {
54 fs_reg *reg = new(this->mem_ctx)
55 fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
56 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
57
58 switch (location) {
59 case SYSTEM_VALUE_BASE_VERTEX:
60 reg->reg_offset = 0;
61 vs_prog_data->uses_vertexid = true;
62 break;
63 case SYSTEM_VALUE_VERTEX_ID:
64 case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
65 reg->reg_offset = 2;
66 vs_prog_data->uses_vertexid = true;
67 break;
68 case SYSTEM_VALUE_INSTANCE_ID:
69 reg->reg_offset = 3;
70 vs_prog_data->uses_instanceid = true;
71 break;
72 default:
73 unreachable("not reached");
74 }
75
76 return reg;
77 }
78
79 void
80 fs_visitor::visit(ir_variable *ir)
81 {
82 fs_reg *reg = NULL;
83
84 if (variable_storage(ir))
85 return;
86
87 if (ir->data.mode == ir_var_shader_in) {
88 assert(ir->data.location != -1);
89 if (stage == MESA_SHADER_VERTEX) {
90 reg = new(this->mem_ctx)
91 fs_reg(ATTR, ir->data.location,
92 brw_type_for_base_type(ir->type->get_scalar_type()));
93 } else if (ir->data.location == VARYING_SLOT_POS) {
94 reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
95 ir->data.origin_upper_left);
96 } else if (ir->data.location == VARYING_SLOT_FACE) {
97 reg = emit_frontfacing_interpolation();
98 } else {
99 reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
100 emit_general_interpolation(*reg, ir->name, ir->type,
101 (glsl_interp_qualifier) ir->data.interpolation,
102 ir->data.location, ir->data.centroid,
103 ir->data.sample);
104 }
105 assert(reg);
106 hash_table_insert(this->variable_ht, reg, ir);
107 return;
108 } else if (ir->data.mode == ir_var_shader_out) {
109 reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
110
111 if (stage == MESA_SHADER_VERTEX) {
112 int vector_elements =
113 ir->type->is_array() ? ir->type->fields.array->vector_elements
114 : ir->type->vector_elements;
115
116 for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
117 int output = ir->data.location + i;
118 this->outputs[output] = *reg;
119 this->outputs[output].reg_offset = i * 4;
120 this->output_components[output] = vector_elements;
121 }
122
123 } else if (ir->data.index > 0) {
124 assert(ir->data.location == FRAG_RESULT_DATA0);
125 assert(ir->data.index == 1);
126 this->dual_src_output = *reg;
127 this->do_dual_src = true;
128 } else if (ir->data.location == FRAG_RESULT_COLOR) {
129 /* Writing gl_FragColor outputs to all color regions. */
130 assert(stage == MESA_SHADER_FRAGMENT);
131 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
132 for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
133 this->outputs[i] = *reg;
134 this->output_components[i] = 4;
135 }
136 } else if (ir->data.location == FRAG_RESULT_DEPTH) {
137 this->frag_depth = *reg;
138 } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
139 this->sample_mask = *reg;
140 } else {
141 /* gl_FragData or a user-defined FS output */
142 assert(ir->data.location >= FRAG_RESULT_DATA0 &&
143 ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
144
145 int vector_elements =
146 ir->type->is_array() ? ir->type->fields.array->vector_elements
147 : ir->type->vector_elements;
148
149 /* General color output. */
150 for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
151 int output = ir->data.location - FRAG_RESULT_DATA0 + i;
152 this->outputs[output] = offset(*reg, vector_elements * i);
153 this->output_components[output] = vector_elements;
154 }
155 }
156 } else if (ir->data.mode == ir_var_uniform) {
157 int param_index = uniforms;
158
159 /* Thanks to the lower_ubo_reference pass, we will see only
160 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
161 * variables, so no need for them to be in variable_ht.
162 *
163 * Some uniforms, such as samplers and atomic counters, have no actual
164 * storage, so we should ignore them.
165 */
166 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
167 return;
168
169 if (dispatch_width == 16) {
170 if (!variable_storage(ir)) {
171 fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
172 }
173 return;
174 }
175
176 param_size[param_index] = type_size(ir->type);
177 if (!strncmp(ir->name, "gl_", 3)) {
178 setup_builtin_uniform_values(ir);
179 } else {
180 setup_uniform_values(ir);
181 }
182
183 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
184 reg->type = brw_type_for_base_type(ir->type);
185
186 } else if (ir->data.mode == ir_var_system_value) {
187 switch (ir->data.location) {
188 case SYSTEM_VALUE_BASE_VERTEX:
189 case SYSTEM_VALUE_VERTEX_ID:
190 case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
191 case SYSTEM_VALUE_INSTANCE_ID:
192 reg = emit_vs_system_value(ir->data.location);
193 break;
194 case SYSTEM_VALUE_SAMPLE_POS:
195 reg = emit_samplepos_setup();
196 break;
197 case SYSTEM_VALUE_SAMPLE_ID:
198 reg = emit_sampleid_setup();
199 break;
200 case SYSTEM_VALUE_SAMPLE_MASK_IN:
201 assert(devinfo->gen >= 7);
202 reg = new(mem_ctx)
203 fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
204 BRW_REGISTER_TYPE_D));
205 break;
206 }
207 }
208
209 if (!reg)
210 reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
211
212 hash_table_insert(this->variable_ht, reg, ir);
213 }
214
215 void
216 fs_visitor::visit(ir_dereference_variable *ir)
217 {
218 fs_reg *reg = variable_storage(ir->var);
219
220 if (!reg) {
221 fail("Failed to find variable storage for %s\n", ir->var->name);
222 this->result = fs_reg(reg_null_d);
223 return;
224 }
225 this->result = *reg;
226 }
227
228 void
229 fs_visitor::visit(ir_dereference_record *ir)
230 {
231 const glsl_type *struct_type = ir->record->type;
232
233 ir->record->accept(this);
234
235 unsigned int off = 0;
236 for (unsigned int i = 0; i < struct_type->length; i++) {
237 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
238 break;
239 off += type_size(struct_type->fields.structure[i].type);
240 }
241 this->result = offset(this->result, off);
242 this->result.type = brw_type_for_base_type(ir->type);
243 }
244
245 void
246 fs_visitor::visit(ir_dereference_array *ir)
247 {
248 ir_constant *constant_index;
249 fs_reg src;
250 int element_size = type_size(ir->type);
251
252 constant_index = ir->array_index->as_constant();
253
254 ir->array->accept(this);
255 src = this->result;
256 src.type = brw_type_for_base_type(ir->type);
257
258 if (constant_index) {
259 if (src.file == ATTR) {
260 /* Attribute arrays get loaded as one vec4 per element. In that case
261 * offset the source register.
262 */
263 src.reg += constant_index->value.i[0];
264 } else {
265 assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
266 src = offset(src, constant_index->value.i[0] * element_size);
267 }
268 } else {
269 /* Variable index array dereference. We attach the variable index
270 * component to the reg as a pointer to a register containing the
271 * offset. Currently only uniform arrays are supported in this patch,
272 * and that reladdr pointer is resolved by
273 * move_uniform_array_access_to_pull_constants(). All other array types
274 * are lowered by lower_variable_index_to_cond_assign().
275 */
276 ir->array_index->accept(this);
277
278 fs_reg index_reg;
279 index_reg = vgrf(glsl_type::int_type);
280 emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
281
282 if (src.reladdr) {
283 emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
284 }
285
286 src.reladdr = ralloc(mem_ctx, fs_reg);
287 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
288 }
289 this->result = src;
290 }
291
292 fs_inst *
293 fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
294 const fs_reg &a)
295 {
296 if (devinfo->gen < 6) {
297 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
298 fs_reg y_times_a = vgrf(glsl_type::float_type);
299 fs_reg one_minus_a = vgrf(glsl_type::float_type);
300 fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
301
302 emit(MUL(y_times_a, y, a));
303
304 fs_reg negative_a = a;
305 negative_a.negate = !a.negate;
306 emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
307 emit(MUL(x_times_one_minus_a, x, one_minus_a));
308
309 return emit(ADD(dst, x_times_one_minus_a, y_times_a));
310 } else {
311 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
312 * we need to reorder the operands.
313 */
314 return emit(LRP(dst, a, y, x));
315 }
316 }
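/* Illustrative expansion of the two paths above (a sketch, not the exact
 * instructions emitted): for lrp(x, y, a) = x*(1 - a) + y*a, Gen4/5 get
 *
 *    mul  y_times_a,            y,  a
 *    add  one_minus_a,         -a,  1.0
 *    mul  x_times_one_minus_a,  x,  one_minus_a
 *    add  dst, x_times_one_minus_a, y_times_a
 *
 * while Gen6+ use the hardware LRP, whose operand order gives
 * LRP dst, a, y, x  ==  a*y + (1 - a)*x.
 */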
317
318 void
319 fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
320 const fs_reg &src0, const fs_reg &src1)
321 {
322 assert(conditionalmod == BRW_CONDITIONAL_GE ||
323 conditionalmod == BRW_CONDITIONAL_L);
324
325 fs_inst *inst;
326
327 if (devinfo->gen >= 6) {
328 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
329 inst->conditional_mod = conditionalmod;
330 } else {
331 emit(CMP(reg_null_d, src0, src1, conditionalmod));
332
333 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
334 inst->predicate = BRW_PREDICATE_NORMAL;
335 }
336 }
337
338 void
339 fs_visitor::emit_uniformize(const fs_reg &dst, const fs_reg &src)
340 {
341 const fs_reg chan_index = vgrf(glsl_type::uint_type);
342
343 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0))
344 ->force_writemask_all = true;
345 emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
346 src, component(chan_index, 0))
347 ->force_writemask_all = true;
348 }
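/* Sketch of what the pair above achieves: FIND_LIVE_CHANNEL writes the index
 * of one enabled channel into chan_index, and BROADCAST then picks src's
 * value from that channel, so dst holds a single, dynamically uniform copy
 * of src -- e.g. suitable for the UBO surface index computed in
 * visit(ir_expression) below.
 */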
349
350 bool
351 fs_visitor::try_emit_saturate(ir_expression *ir)
352 {
353 if (ir->operation != ir_unop_saturate)
354 return false;
355
356 ir_rvalue *sat_val = ir->operands[0];
357
358 fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
359
360 sat_val->accept(this);
361 fs_reg src = this->result;
362
363 fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
364
365 /* If the last instruction from our accept() generated our
366 * src, just set the saturate flag instead of emitting a separate mov.
367 */
368 fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
369 if (modify && modify->regs_written == modify->dst.width / 8 &&
370 modify->can_do_saturate()) {
371 modify->saturate = true;
372 this->result = src;
373 return true;
374 }
375
376 return false;
377 }
378
379 bool
380 fs_visitor::try_emit_line(ir_expression *ir)
381 {
382 /* LINE's src0 must be of type float. */
383 if (ir->type != glsl_type::float_type)
384 return false;
385
386 ir_rvalue *nonmul = ir->operands[1];
387 ir_expression *mul = ir->operands[0]->as_expression();
388
389 if (!mul || mul->operation != ir_binop_mul) {
390 nonmul = ir->operands[0];
391 mul = ir->operands[1]->as_expression();
392
393 if (!mul || mul->operation != ir_binop_mul)
394 return false;
395 }
396
397 ir_constant *const_add = nonmul->as_constant();
398 if (!const_add)
399 return false;
400
401 int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
402 if (add_operand_vf == -1)
403 return false;
404
405 ir_rvalue *non_const_mul = mul->operands[1];
406 ir_constant *const_mul = mul->operands[0]->as_constant();
407 if (!const_mul) {
408 const_mul = mul->operands[1]->as_constant();
409
410 if (!const_mul)
411 return false;
412
413 non_const_mul = mul->operands[0];
414 }
415
416 int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
417 if (mul_operand_vf == -1)
418 return false;
419
420 non_const_mul->accept(this);
421 fs_reg src1 = this->result;
422
423 fs_reg src0 = vgrf(ir->type);
424 emit(BRW_OPCODE_MOV, src0,
425 fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
426
427 this->result = vgrf(ir->type);
428 emit(BRW_OPCODE_LINE, this->result, src0, src1);
429 return true;
430 }
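/* Worked example of the packing above (illustrative): for 2.0*x + 0.5,
 * mul_operand_vf encodes 2.0 and add_operand_vf encodes 0.5, both of which
 * fit the 8-bit restricted-float (VF) format.  The MOV loads
 * src0 = (2.0, 0, 0, 0.5) as one VF immediate, and LINE then computes
 * dst = src0.x * src1 + src0.w, i.e. 2.0*x + 0.5 in a single instruction.
 */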
431
432 bool
433 fs_visitor::try_emit_mad(ir_expression *ir)
434 {
435 /* 3-src instructions were introduced in gen6. */
436 if (devinfo->gen < 6)
437 return false;
438
439 /* MAD can only handle floating-point data. */
440 if (ir->type != glsl_type::float_type)
441 return false;
442
443 ir_rvalue *nonmul;
444 ir_expression *mul;
445 bool mul_negate, mul_abs;
446
447 for (int i = 0; i < 2; i++) {
448 mul_negate = false;
449 mul_abs = false;
450
451 mul = ir->operands[i]->as_expression();
452 nonmul = ir->operands[1 - i];
453
454 if (mul && mul->operation == ir_unop_abs) {
455 mul = mul->operands[0]->as_expression();
456 mul_abs = true;
457 } else if (mul && mul->operation == ir_unop_neg) {
458 mul = mul->operands[0]->as_expression();
459 mul_negate = true;
460 }
461
462 if (mul && mul->operation == ir_binop_mul)
463 break;
464 }
465
466 if (!mul || mul->operation != ir_binop_mul)
467 return false;
468
469 nonmul->accept(this);
470 fs_reg src0 = this->result;
471
472 mul->operands[0]->accept(this);
473 fs_reg src1 = this->result;
474 src1.negate ^= mul_negate;
475 src1.abs = mul_abs;
476 if (mul_abs)
477 src1.negate = false;
478
479 mul->operands[1]->accept(this);
480 fs_reg src2 = this->result;
481 src2.abs = mul_abs;
482 if (mul_abs)
483 src2.negate = false;
484
485 this->result = vgrf(ir->type);
486 emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
487
488 return true;
489 }
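/* Example of the matching above (a sketch): for "c - a*b" the IR is
 * add(c, neg(mul(a, b))), so the negate is folded into mul_negate and we
 * emit MAD dst, c, -a, b, computing (-a)*b + c without a separate negate
 * or multiply instruction.
 */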
490
491 bool
492 fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
493 {
494 /* On platforms that do not natively generate 0u and ~0u for Boolean
495 * results, b2f expressions that look like
496 *
497 * f = b2f(expr cmp 0)
498 *
499 * will generate better code by pretending the expression is
500 *
501 * f = ir_triop_csel(0.0, 1.0, expr cmp 0)
502 *
503 * This is because the last instruction of "expr" can generate the
504 * condition code for the "cmp 0". This avoids having to do the "-(b & 1)"
505 * trick to generate 0u or ~0u for the Boolean result. This means code like
506 *
507 * mov(16) g16<1>F 1F
508 * mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F
509 * (+f0) sel(16) m6<1>F g16<8,8,1>F 0F
510 *
511 * will be generated instead of
512 *
513 * mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F
514 * cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F
515 * and(16) g4<1>D g2<8,8,1>D 1D
516 * and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD
517 *
518 * When the comparison is != 0.0, using the knowledge that the false case
519 * already results in zero allows better code generation by possibly
520 * avoiding a load-immediate instruction.
521 */
522 ir_expression *cmp = ir->operands[0]->as_expression();
523 if (cmp == NULL)
524 return false;
525
526 if (cmp->operation == ir_binop_nequal) {
527 for (unsigned i = 0; i < 2; i++) {
528 ir_constant *c = cmp->operands[i]->as_constant();
529 if (c == NULL || !c->is_zero())
530 continue;
531
532 ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
533 if (expr != NULL) {
534 fs_reg op[2];
535
536 for (unsigned j = 0; j < 2; j++) {
537 cmp->operands[j]->accept(this);
538 op[j] = this->result;
539
540 resolve_ud_negate(&op[j]);
541 }
542
543 emit_bool_to_cond_code_of_reg(cmp, op);
544
545 /* In this case we know when the condition is true, op[i ^ 1]
546 * contains zero. Invert the predicate, use op[i ^ 1] as src0,
547 * and immediate 1.0f as src1.
548 */
549 this->result = vgrf(ir->type);
550 op[i ^ 1].type = BRW_REGISTER_TYPE_F;
551
552 fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
553 inst->predicate = BRW_PREDICATE_NORMAL;
554 inst->predicate_inverse = true;
555 return true;
556 }
557 }
558 }
559
560 emit_bool_to_cond_code(cmp);
561
562 fs_reg temp = vgrf(ir->type);
563 emit(MOV(temp, fs_reg(1.0f)));
564
565 this->result = vgrf(ir->type);
566 fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
567 inst->predicate = BRW_PREDICATE_NORMAL;
568
569 return true;
570 }
571
572 static int
573 pack_pixel_offset(float x)
574 {
575 /* Clamp upper end of the range to +7/16. See explanation in non-constant
576 * offset case below. */
577 int n = MIN2((int)(x * 16), 7);
578 return n & 0xf;
579 }
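/* Worked examples (illustrative): pack_pixel_offset(0.25f) -> 4 (+4/16);
 * pack_pixel_offset(0.5f) -> MIN2(8, 7) = 7, clamped to +7/16; and
 * pack_pixel_offset(-0.5f) -> -8 & 0xf = 0x8, the S0.4 encoding of -8/16.
 */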
580
581 void
582 fs_visitor::emit_interpolate_expression(ir_expression *ir)
583 {
584 /* in SIMD16 mode, the pixel interpolator returns coords interleaved
585 * 8 channels at a time, same as the barycentric coords presented in
586 * the FS payload. this requires a bit of extra work to support.
587 */
588 no16("interpolate_at_* not yet supported in SIMD16 mode.");
589
590 assert(stage == MESA_SHADER_FRAGMENT);
591 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
592
593 ir_dereference * deref = ir->operands[0]->as_dereference();
594 ir_swizzle * swiz = NULL;
595 if (!deref) {
596 /* the api does not allow a swizzle here, but the varying packing code
597 * may have pushed one into here.
598 */
599 swiz = ir->operands[0]->as_swizzle();
600 assert(swiz);
601 deref = swiz->val->as_dereference();
602 }
603 assert(deref);
604 ir_variable * var = deref->variable_referenced();
605 assert(var);
606
607 /* 1. collect interpolation factors */
608
609 fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
610
611 /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
612 * even when there is no payload. in the per-slot offset case, we'll replace this with
613 * the proper source data. */
614 fs_reg src = vgrf(glsl_type::float_type);
615 int mlen = 1; /* one reg unless overridden */
616 int reg_width = dispatch_width / 8;
617 fs_inst *inst;
618
619 switch (ir->operation) {
620 case ir_unop_interpolate_at_centroid:
621 inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
622 break;
623
624 case ir_binop_interpolate_at_sample: {
625 ir_constant *sample_num = ir->operands[1]->as_constant();
626 assert(sample_num || !"nonconstant sample number should have been lowered.");
627
628 unsigned msg_data = sample_num->value.i[0] << 4;
629 inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data));
630 break;
631 }
632
633 case ir_binop_interpolate_at_offset: {
634 ir_constant *const_offset = ir->operands[1]->as_constant();
635 if (const_offset) {
636 unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
637 (pack_pixel_offset(const_offset->value.f[1]) << 4);
638 inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
639 fs_reg(msg_data));
640 } else {
641 /* pack the operands: hw wants offsets as 4 bit signed ints */
642 ir->operands[1]->accept(this);
643 src = vgrf(glsl_type::ivec2_type);
644 fs_reg src2 = src;
645 for (int i = 0; i < 2; i++) {
646 fs_reg temp = vgrf(glsl_type::float_type);
647 emit(MUL(temp, this->result, fs_reg(16.0f)));
648 emit(MOV(src2, temp)); /* float to int */
649
650 /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
651 * that we support a maximum offset of +0.5, which isn't representable
652 * in an S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
653 * which is the opposite of what the shader author wanted.
654 *
655 * This is legal due to ARB_gpu_shader5's quantization rules:
656 *
657 * "Not all values of <offset> may be supported; x and y offsets may
658 * be rounded to fixed-point values with the number of fraction bits
659 * given by the implementation-dependent constant
660 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
661 */
662
663 fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
664 inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
665
666 src2 = offset(src2, 1);
667 this->result = offset(this->result, 1);
668 }
669
670 mlen = 2 * reg_width;
671 inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
672 fs_reg(0u));
673 }
674 break;
675 }
676
677 default:
678 unreachable("not reached");
679 }
680
681 inst->mlen = mlen;
682 inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
683 inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
684 INTERP_QUALIFIER_NOPERSPECTIVE;
685
686 /* 2. emit linterp */
687
688 fs_reg res = vgrf(ir->type);
689 this->result = res;
690
691 for (int i = 0; i < ir->type->vector_elements; i++) {
692 int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
693 emit(FS_OPCODE_LINTERP, res, dst_xy,
694 fs_reg(interp_reg(var->data.location, ch)));
695 res = offset(res, 1);
696 }
697 }
698
699 void
700 fs_visitor::visit(ir_expression *ir)
701 {
702 unsigned int operand;
703 fs_reg op[3], temp;
704 fs_inst *inst;
705 struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
706
707 assert(ir->get_num_operands() <= 3);
708
709 if (try_emit_saturate(ir))
710 return;
711
712 /* Deal with the real oddball stuff first */
713 switch (ir->operation) {
714 case ir_binop_add:
715 if (devinfo->gen <= 5 && try_emit_line(ir))
716 return;
717 if (try_emit_mad(ir))
718 return;
719 break;
720
721 case ir_triop_csel:
722 ir->operands[1]->accept(this);
723 op[1] = this->result;
724 ir->operands[2]->accept(this);
725 op[2] = this->result;
726
727 emit_bool_to_cond_code(ir->operands[0]);
728
729 this->result = vgrf(ir->type);
730 inst = emit(SEL(this->result, op[1], op[2]));
731 inst->predicate = BRW_PREDICATE_NORMAL;
732 return;
733
734 case ir_unop_b2f:
735 if (devinfo->gen <= 5 && try_emit_b2f_of_comparison(ir))
736 return;
737 break;
738
739 case ir_unop_interpolate_at_centroid:
740 case ir_binop_interpolate_at_offset:
741 case ir_binop_interpolate_at_sample:
742 emit_interpolate_expression(ir);
743 return;
744
745 default:
746 break;
747 }
748
749 for (operand = 0; operand < ir->get_num_operands(); operand++) {
750 ir->operands[operand]->accept(this);
751 if (this->result.file == BAD_FILE) {
752 fail("Failed to get tree for expression operand:\n");
753 ir->operands[operand]->fprint(stderr);
754 fprintf(stderr, "\n");
755 }
756 assert(this->result.file == GRF ||
757 this->result.file == UNIFORM || this->result.file == ATTR);
758 op[operand] = this->result;
759
760 /* Matrix expression operands should have been broken down to vector
761 * operations already.
762 */
763 assert(!ir->operands[operand]->type->is_matrix());
764 /* And then those vector operands should have been broken down to scalar.
765 */
766 assert(!ir->operands[operand]->type->is_vector());
767 }
768
769 /* Storage for our result. If our result goes into an assignment, it will
770 * just get copy-propagated out, so no worries.
771 */
772 this->result = vgrf(ir->type);
773
774 switch (ir->operation) {
775 case ir_unop_logic_not:
776 emit(NOT(this->result, op[0]));
777 break;
778 case ir_unop_neg:
779 op[0].negate = !op[0].negate;
780 emit(MOV(this->result, op[0]));
781 break;
782 case ir_unop_abs:
783 op[0].abs = true;
784 op[0].negate = false;
785 emit(MOV(this->result, op[0]));
786 break;
787 case ir_unop_sign:
788 if (ir->type->is_float()) {
789 /* AND(val, 0x80000000) gives the sign bit.
790 *
791 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
792 * zero.
793 */
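/* Worked example (illustrative): for val = -3.5f (bits 0xC0600000) the CMP
 * sets the flag since val != 0, the AND leaves 0x80000000, and the
 * predicated OR yields 0xBF800000 = -1.0f.  For val = 0.0f the flag stays
 * clear and the result remains 0.0f.
 */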
794 emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
795
796 op[0].type = BRW_REGISTER_TYPE_UD;
797 this->result.type = BRW_REGISTER_TYPE_UD;
798 emit(AND(this->result, op[0], fs_reg(0x80000000u)));
799
800 inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
801 inst->predicate = BRW_PREDICATE_NORMAL;
802
803 this->result.type = BRW_REGISTER_TYPE_F;
804 } else {
805 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
806 * -> non-negative val generates 0x00000000.
807 * Predicated OR sets 1 if val is positive.
808 */
809 emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
810
811 emit(ASR(this->result, op[0], fs_reg(31)));
812
813 inst = emit(OR(this->result, this->result, fs_reg(1)));
814 inst->predicate = BRW_PREDICATE_NORMAL;
815 }
816 break;
817 case ir_unop_rcp:
818 emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
819 break;
820
821 case ir_unop_exp2:
822 emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
823 break;
824 case ir_unop_log2:
825 emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
826 break;
827 case ir_unop_exp:
828 case ir_unop_log:
829 unreachable("not reached: should be handled by ir_explog_to_explog2");
830 case ir_unop_sin:
831 emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
832 break;
833 case ir_unop_cos:
834 emit_math(SHADER_OPCODE_COS, this->result, op[0]);
835 break;
836
837 case ir_unop_dFdx:
838 /* Select one of the two opcodes based on the glHint value. */
839 if (fs_key->high_quality_derivatives)
840 emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
841 else
842 emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
843 break;
844
845 case ir_unop_dFdx_coarse:
846 emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
847 break;
848
849 case ir_unop_dFdx_fine:
850 emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
851 break;
852
853 case ir_unop_dFdy:
854 /* Select one of the two opcodes based on the glHint value. */
855 if (fs_key->high_quality_derivatives)
856 emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
857 else
858 emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
859 break;
860
861 case ir_unop_dFdy_coarse:
862 emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
863 break;
864
865 case ir_unop_dFdy_fine:
866 emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
867 break;
868
869 case ir_binop_add:
870 emit(ADD(this->result, op[0], op[1]));
871 break;
872 case ir_binop_sub:
873 unreachable("not reached: should be handled by ir_sub_to_add_neg");
874
875 case ir_binop_mul:
876 if (devinfo->gen < 8 && ir->type->is_integer()) {
877 /* For integer multiplication, the MUL uses the low 16 bits
878 * of one of the operands (src0 on gen6, src1 on gen7). The
879 * MACH accumulates in the contribution of the upper 16 bits
880 * of that operand.
881 */
882 if (ir->operands[0]->is_uint16_constant()) {
883 if (devinfo->gen < 7)
884 emit(MUL(this->result, op[0], op[1]));
885 else
886 emit(MUL(this->result, op[1], op[0]));
887 } else if (ir->operands[1]->is_uint16_constant()) {
888 if (devinfo->gen < 7)
889 emit(MUL(this->result, op[1], op[0]));
890 else
891 emit(MUL(this->result, op[0], op[1]));
892 } else {
893 if (devinfo->gen >= 7)
894 no16("SIMD16 explicit accumulator operands unsupported\n");
895
896 struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
897 this->result.type);
898
899 emit(MUL(acc, op[0], op[1]));
900 emit(MACH(reg_null_d, op[0], op[1]));
901 emit(MOV(this->result, fs_reg(acc)));
902 }
903 } else {
904 emit(MUL(this->result, op[0], op[1]));
905 }
906 break;
907 case ir_binop_imul_high: {
908 if (devinfo->gen >= 7)
909 no16("SIMD16 explicit accumulator operands unsupported\n");
910
911 struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
912 this->result.type);
913
914 fs_inst *mul = emit(MUL(acc, op[0], op[1]));
915 emit(MACH(this->result, op[0], op[1]));
916
917 /* Until Gen8, integer multiplies read 32 bits from one source and
918 * 16 bits from the other, relying on the MACH instruction to
919 * generate the high bits of the result.
920 *
921 * On Gen8, the multiply instruction does a full 32x32-bit multiply,
922 * but in order to do a 64x64-bit multiply we have to simulate the
923 * previous behavior and then use a MACH instruction.
924 *
925 * FINISHME: Don't use source modifiers on src1.
926 */
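/* Sketch of the Gen8+ fixup below: retyping src1 to W/UW with a stride of 2
 * makes the MUL read only the low 16 bits of each 32-bit channel (e.g.
 * 0x00040003 is read as 0x0003), recreating the pre-Gen8 32x16 multiply
 * that the MACH above relies on for the high 32 bits of the result.
 */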
927 if (devinfo->gen >= 8) {
928 assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
929 mul->src[1].type == BRW_REGISTER_TYPE_UD);
930 if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
931 mul->src[1].type = BRW_REGISTER_TYPE_W;
932 mul->src[1].stride = 2;
933 } else {
934 mul->src[1].type = BRW_REGISTER_TYPE_UW;
935 mul->src[1].stride = 2;
936 }
937 }
938
939 break;
940 }
941 case ir_binop_div:
942 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
943 assert(ir->type->is_integer());
944 emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
945 break;
946 case ir_binop_carry: {
947 if (devinfo->gen >= 7)
948 no16("SIMD16 explicit accumulator operands unsupported\n");
949
950 struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
951 BRW_REGISTER_TYPE_UD);
952
953 emit(ADDC(reg_null_ud, op[0], op[1]));
954 emit(MOV(this->result, fs_reg(acc)));
955 break;
956 }
957 case ir_binop_borrow: {
958 if (devinfo->gen >= 7)
959 no16("SIMD16 explicit accumulator operands unsupported\n");
960
961 struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
962 BRW_REGISTER_TYPE_UD);
963
964 emit(SUBB(reg_null_ud, op[0], op[1]));
965 emit(MOV(this->result, fs_reg(acc)));
966 break;
967 }
968 case ir_binop_mod:
969 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
970 assert(ir->type->is_integer());
971 emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
972 break;
973
974 case ir_binop_less:
975 case ir_binop_greater:
976 case ir_binop_lequal:
977 case ir_binop_gequal:
978 case ir_binop_equal:
979 case ir_binop_all_equal:
980 case ir_binop_nequal:
981 case ir_binop_any_nequal:
982 if (devinfo->gen <= 5) {
983 resolve_bool_comparison(ir->operands[0], &op[0]);
984 resolve_bool_comparison(ir->operands[1], &op[1]);
985 }
986
987 emit(CMP(this->result, op[0], op[1],
988 brw_conditional_for_comparison(ir->operation)));
989 break;
990
991 case ir_binop_logic_xor:
992 emit(XOR(this->result, op[0], op[1]));
993 break;
994
995 case ir_binop_logic_or:
996 emit(OR(this->result, op[0], op[1]));
997 break;
998
999 case ir_binop_logic_and:
1000 emit(AND(this->result, op[0], op[1]));
1001 break;
1002
1003 case ir_binop_dot:
1004 case ir_unop_any:
1005 unreachable("not reached: should be handled by brw_fs_channel_expressions");
1006
1007 case ir_unop_noise:
1008 unreachable("not reached: should be handled by lower_noise");
1009
1010 case ir_quadop_vector:
1011 unreachable("not reached: should be handled by lower_quadop_vector");
1012
1013 case ir_binop_vector_extract:
1014 unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
1015
1016 case ir_triop_vector_insert:
1017 unreachable("not reached: should be handled by lower_vector_insert()");
1018
1019 case ir_binop_ldexp:
1020 unreachable("not reached: should be handled by ldexp_to_arith()");
1021
1022 case ir_unop_sqrt:
1023 emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
1024 break;
1025
1026 case ir_unop_rsq:
1027 emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
1028 break;
1029
1030 case ir_unop_bitcast_i2f:
1031 case ir_unop_bitcast_u2f:
1032 op[0].type = BRW_REGISTER_TYPE_F;
1033 this->result = op[0];
1034 break;
1035 case ir_unop_i2u:
1036 case ir_unop_bitcast_f2u:
1037 op[0].type = BRW_REGISTER_TYPE_UD;
1038 this->result = op[0];
1039 break;
1040 case ir_unop_u2i:
1041 case ir_unop_bitcast_f2i:
1042 op[0].type = BRW_REGISTER_TYPE_D;
1043 this->result = op[0];
1044 break;
1045 case ir_unop_i2f:
1046 case ir_unop_u2f:
1047 case ir_unop_f2i:
1048 case ir_unop_f2u:
1049 emit(MOV(this->result, op[0]));
1050 break;
1051
1052 case ir_unop_b2i:
1053 emit(AND(this->result, op[0], fs_reg(1)));
1054 break;
1055 case ir_unop_b2f:
1056 if (devinfo->gen <= 5) {
1057 resolve_bool_comparison(ir->operands[0], &op[0]);
1058 }
1059 op[0].type = BRW_REGISTER_TYPE_D;
1060 this->result.type = BRW_REGISTER_TYPE_D;
1061 emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
1062 this->result.type = BRW_REGISTER_TYPE_F;
1063 break;
1064
1065 case ir_unop_f2b:
1066 emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
1067 break;
1068 case ir_unop_i2b:
1069 emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1070 break;
1071
1072 case ir_unop_trunc:
1073 emit(RNDZ(this->result, op[0]));
1074 break;
1075 case ir_unop_ceil: {
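/* ceil(x) is computed here as -floor(-x): e.g. ceil(1.2) = -RNDD(-1.2) =
 * -(-2.0) = 2.0.  The negate on op[0] feeds RNDD and the negate on tmp
 * undoes it in the final MOV.
 */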
1076 fs_reg tmp = vgrf(ir->type);
1077 op[0].negate = !op[0].negate;
1078 emit(RNDD(tmp, op[0]));
1079 tmp.negate = true;
1080 emit(MOV(this->result, tmp));
1081 }
1082 break;
1083 case ir_unop_floor:
1084 emit(RNDD(this->result, op[0]));
1085 break;
1086 case ir_unop_fract:
1087 emit(FRC(this->result, op[0]));
1088 break;
1089 case ir_unop_round_even:
1090 emit(RNDE(this->result, op[0]));
1091 break;
1092
1093 case ir_binop_min:
1094 case ir_binop_max:
1095 resolve_ud_negate(&op[0]);
1096 resolve_ud_negate(&op[1]);
1097 emit_minmax(ir->operation == ir_binop_min ?
1098 BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
1099 this->result, op[0], op[1]);
1100 break;
1101 case ir_unop_pack_snorm_2x16:
1102 case ir_unop_pack_snorm_4x8:
1103 case ir_unop_pack_unorm_2x16:
1104 case ir_unop_pack_unorm_4x8:
1105 case ir_unop_unpack_snorm_2x16:
1106 case ir_unop_unpack_snorm_4x8:
1107 case ir_unop_unpack_unorm_2x16:
1108 case ir_unop_unpack_unorm_4x8:
1109 case ir_unop_unpack_half_2x16:
1110 case ir_unop_pack_half_2x16:
1111 unreachable("not reached: should be handled by lower_packing_builtins");
1112 case ir_unop_unpack_half_2x16_split_x:
1113 emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
1114 break;
1115 case ir_unop_unpack_half_2x16_split_y:
1116 emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
1117 break;
1118 case ir_binop_pow:
1119 emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
1120 break;
1121
1122 case ir_unop_bitfield_reverse:
1123 emit(BFREV(this->result, op[0]));
1124 break;
1125 case ir_unop_bit_count:
1126 emit(CBIT(this->result, op[0]));
1127 break;
1128 case ir_unop_find_msb:
1129 temp = vgrf(glsl_type::uint_type);
1130 emit(FBH(temp, op[0]));
1131
1132 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1133 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1134 * subtract the result from 31 to convert the MSB count into an LSB count.
1135 */
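/* Worked example (illustrative): findMSB(0x00000100) -> FBH returns 23
 * (counted from bit 31), and the predicated 31 - temp below yields 8.  For
 * an input of 0, FBH returns 0xFFFFFFFF, the CMP leaves the flag clear,
 * and the result stays at -1 as GLSL requires.
 */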
1136
1137 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1138 emit(MOV(this->result, temp));
1139 emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
1140
1141 temp.negate = true;
1142 inst = emit(ADD(this->result, temp, fs_reg(31)));
1143 inst->predicate = BRW_PREDICATE_NORMAL;
1144 break;
1145 case ir_unop_find_lsb:
1146 emit(FBL(this->result, op[0]));
1147 break;
1148 case ir_unop_saturate:
1149 inst = emit(MOV(this->result, op[0]));
1150 inst->saturate = true;
1151 break;
1152 case ir_triop_bitfield_extract:
1153 /* Note that the instruction's argument order is reversed from GLSL
1154 * and the IR.
1155 */
1156 emit(BFE(this->result, op[2], op[1], op[0]));
1157 break;
1158 case ir_binop_bfm:
1159 emit(BFI1(this->result, op[0], op[1]));
1160 break;
1161 case ir_triop_bfi:
1162 emit(BFI2(this->result, op[0], op[1], op[2]));
1163 break;
1164 case ir_quadop_bitfield_insert:
1165 unreachable("not reached: should be handled by "
1166 "lower_instructions::bitfield_insert_to_bfm_bfi");
1167
1168 case ir_unop_bit_not:
1169 emit(NOT(this->result, op[0]));
1170 break;
1171 case ir_binop_bit_and:
1172 emit(AND(this->result, op[0], op[1]));
1173 break;
1174 case ir_binop_bit_xor:
1175 emit(XOR(this->result, op[0], op[1]));
1176 break;
1177 case ir_binop_bit_or:
1178 emit(OR(this->result, op[0], op[1]));
1179 break;
1180
1181 case ir_binop_lshift:
1182 emit(SHL(this->result, op[0], op[1]));
1183 break;
1184
1185 case ir_binop_rshift:
1186 if (ir->type->base_type == GLSL_TYPE_INT)
1187 emit(ASR(this->result, op[0], op[1]));
1188 else
1189 emit(SHR(this->result, op[0], op[1]));
1190 break;
1191 case ir_binop_pack_half_2x16_split:
1192 emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
1193 break;
1194 case ir_binop_ubo_load: {
1195 /* This IR node takes a constant uniform block and a constant or
1196 * variable byte offset within the block and loads a vector from that.
1197 */
1198 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1199 ir_constant *const_offset = ir->operands[1]->as_constant();
1200 fs_reg surf_index;
1201
1202 if (const_uniform_block) {
1203 /* The block index is a constant, so just emit the binding table entry
1204 * as an immediate.
1205 */
1206 surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
1207 const_uniform_block->value.u[0]);
1208 } else {
1209 /* The block index is not a constant. Evaluate the index expression
1210 * per-channel and add the base UBO index; we have to select a value
1211 * from any live channel.
1212 */
1213 surf_index = vgrf(glsl_type::uint_type);
1214 emit(ADD(surf_index, op[0],
1215 fs_reg(stage_prog_data->binding_table.ubo_start)));
1216 emit_uniformize(surf_index, surf_index);
1217
1218 /* Assume this may touch any UBO. It would be nice to provide
1219 * a tighter bound, but the array information is already lowered away.
1220 */
1221 brw_mark_surface_used(prog_data,
1222 stage_prog_data->binding_table.ubo_start +
1223 shader_prog->NumUniformBlocks - 1);
1224 }
1225
1226 if (const_offset) {
1227 fs_reg packed_consts = vgrf(glsl_type::float_type);
1228 packed_consts.type = result.type;
1229
1230 fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
1231 emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
1232 packed_consts, surf_index, const_offset_reg));
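/* Illustrative example of the addressing above: for a constant byte offset
 * of 20, const_offset_reg is 20 & ~15 = 16, so the message fetches the
 * 16-byte-aligned block at byte 16 and the set_smear() below selects dword
 * (20 % 16) / 4 + i = 1 + i of that block.
 */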
1233
1234 for (int i = 0; i < ir->type->vector_elements; i++) {
1235 packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
1236
1237 /* The std140 packing rules don't allow vectors to cross 16-byte
1238 * boundaries, and a reg is 32 bytes.
1239 */
1240 assert(packed_consts.subreg_offset < 32);
1241
1242 /* UBO bools are any nonzero value. We consider bools to be
1243 * values with the low bit set to 1. Convert them using CMP.
1244 */
1245 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1246 emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
1247 } else {
1248 emit(MOV(result, packed_consts));
1249 }
1250
1251 result = offset(result, 1);
1252 }
1253 } else {
1254 /* Turn the byte offset into a dword offset. */
1255 fs_reg base_offset = vgrf(glsl_type::int_type);
1256 emit(SHR(base_offset, op[1], fs_reg(2)));
1257
1258 for (int i = 0; i < ir->type->vector_elements; i++) {
1259 emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
1260 base_offset, i));
1261
1262 if (ir->type->base_type == GLSL_TYPE_BOOL)
1263 emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
1264
1265 result = offset(result, 1);
1266 }
1267 }
1268
1269 result.reg_offset = 0;
1270 break;
1271 }
1272
1273 case ir_triop_fma:
1274 /* Note that the instruction's argument order is reversed from GLSL
1275 * and the IR.
1276 */
1277 emit(MAD(this->result, op[2], op[1], op[0]));
1278 break;
1279
1280 case ir_triop_lrp:
1281 emit_lrp(this->result, op[0], op[1], op[2]);
1282 break;
1283
1284 case ir_triop_csel:
1285 case ir_unop_interpolate_at_centroid:
1286 case ir_binop_interpolate_at_offset:
1287 case ir_binop_interpolate_at_sample:
1288 unreachable("already handled above");
1289 break;
1290
1291 case ir_unop_d2f:
1292 case ir_unop_f2d:
1293 case ir_unop_d2i:
1294 case ir_unop_i2d:
1295 case ir_unop_d2u:
1296 case ir_unop_u2d:
1297 case ir_unop_d2b:
1298 case ir_unop_pack_double_2x32:
1299 case ir_unop_unpack_double_2x32:
1300 case ir_unop_frexp_sig:
1301 case ir_unop_frexp_exp:
1302 unreachable("fp64 todo");
1303 break;
1304 }
1305 }
1306
1307 void
1308 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1309 const glsl_type *type, bool predicated)
1310 {
1311 switch (type->base_type) {
1312 case GLSL_TYPE_FLOAT:
1313 case GLSL_TYPE_UINT:
1314 case GLSL_TYPE_INT:
1315 case GLSL_TYPE_BOOL:
1316 for (unsigned int i = 0; i < type->components(); i++) {
1317 l.type = brw_type_for_base_type(type);
1318 r.type = brw_type_for_base_type(type);
1319
1320 if (predicated || !l.equals(r)) {
1321 fs_inst *inst = emit(MOV(l, r));
1322 inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
1323 }
1324
1325 l = offset(l, 1);
1326 r = offset(r, 1);
1327 }
1328 break;
1329 case GLSL_TYPE_ARRAY:
1330 for (unsigned int i = 0; i < type->length; i++) {
1331 emit_assignment_writes(l, r, type->fields.array, predicated);
1332 }
1333 break;
1334
1335 case GLSL_TYPE_STRUCT:
1336 for (unsigned int i = 0; i < type->length; i++) {
1337 emit_assignment_writes(l, r, type->fields.structure[i].type,
1338 predicated);
1339 }
1340 break;
1341
1342 case GLSL_TYPE_SAMPLER:
1343 case GLSL_TYPE_IMAGE:
1344 case GLSL_TYPE_ATOMIC_UINT:
1345 break;
1346
1347 case GLSL_TYPE_DOUBLE:
1348 case GLSL_TYPE_VOID:
1349 case GLSL_TYPE_ERROR:
1350 case GLSL_TYPE_INTERFACE:
1351 unreachable("not reached");
1352 }
1353 }
1354
1355 /* If the RHS processing resulted in an instruction generating a
1356 * temporary value, and it would be easy to rewrite the instruction to
1357 * generate its result right into the LHS instead, do so. This ends
1358 * up reliably removing instructions where it can be tricky to do so
1359 * later without real UD chain information.
1360 */
1361 bool
1362 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1363 fs_reg dst,
1364 fs_reg src,
1365 fs_inst *pre_rhs_inst,
1366 fs_inst *last_rhs_inst)
1367 {
1368 /* Only attempt if we're doing a direct assignment. */
1369 if (ir->condition ||
1370 !(ir->lhs->type->is_scalar() ||
1371 (ir->lhs->type->is_vector() &&
1372 ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
1373 return false;
1374
1375 /* Make sure the last instruction generated our source reg. */
1376 fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
1377 last_rhs_inst,
1378 src);
1379 if (!modify)
1380 return false;
1381
1382 /* If last_rhs_inst wrote a different number of components than our LHS,
1383 * we can't safely rewrite it.
1384 */
1385 if (alloc.sizes[dst.reg] != modify->regs_written)
1386 return false;
1387
1388 /* Success! Rewrite the instruction. */
1389 modify->dst = dst;
1390
1391 return true;
1392 }
1393
1394 void
1395 fs_visitor::visit(ir_assignment *ir)
1396 {
1397 fs_reg l, r;
1398 fs_inst *inst;
1399
1400 /* FINISHME: arrays on the lhs */
1401 ir->lhs->accept(this);
1402 l = this->result;
1403
1404 fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
1405
1406 ir->rhs->accept(this);
1407 r = this->result;
1408
1409 fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
1410
1411 assert(l.file != BAD_FILE);
1412 assert(r.file != BAD_FILE);
1413
1414 if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
1415 return;
1416
1417 if (ir->condition) {
1418 emit_bool_to_cond_code(ir->condition);
1419 }
1420
1421 if (ir->lhs->type->is_scalar() ||
1422 ir->lhs->type->is_vector()) {
1423 for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1424 if (ir->write_mask & (1 << i)) {
1425 inst = emit(MOV(l, r));
1426 if (ir->condition)
1427 inst->predicate = BRW_PREDICATE_NORMAL;
1428 r = offset(r, 1);
1429 }
1430 l = offset(l, 1);
1431 }
1432 } else {
1433 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1434 }
1435 }
1436
1437 fs_inst *
1438 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
1439 fs_reg coordinate, int coord_components,
1440 fs_reg shadow_c,
1441 fs_reg lod, fs_reg dPdy, int grad_components,
1442 uint32_t sampler)
1443 {
1444 int mlen;
1445 int base_mrf = 1;
1446 bool simd16 = false;
1447 fs_reg orig_dst;
1448
1449 /* g0 header. */
1450 mlen = 1;
1451
1452 if (shadow_c.file != BAD_FILE) {
1453 for (int i = 0; i < coord_components; i++) {
1454 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1455 coordinate = offset(coordinate, 1);
1456 }
1457
1458 /* gen4's SIMD8 sampler always has the slots for u,v,r present.
1459 * the unused slots must be zeroed.
1460 */
1461 for (int i = coord_components; i < 3; i++) {
1462 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1463 }
1464 mlen += 3;
1465
1466 if (op == ir_tex) {
1467 /* There's no plain shadow compare message, so we use shadow
1468 * compare with a bias of 0.0.
1469 */
1470 emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
1471 mlen++;
1472 } else if (op == ir_txb || op == ir_txl) {
1473 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1474 mlen++;
1475 } else {
1476 unreachable("Should not get here.");
1477 }
1478
1479 emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1480 mlen++;
1481 } else if (op == ir_tex) {
1482 for (int i = 0; i < coord_components; i++) {
1483 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1484 coordinate = offset(coordinate, 1);
1485 }
1486 /* zero the others. */
1487 for (int i = coord_components; i<3; i++) {
1488 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1489 }
1490 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1491 mlen += 3;
1492 } else if (op == ir_txd) {
1493 fs_reg &dPdx = lod;
1494
1495 for (int i = 0; i < coord_components; i++) {
1496 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1497 coordinate = offset(coordinate, 1);
1498 }
1499 /* the slots for u and v are always present, but r is optional */
1500 mlen += MAX2(coord_components, 2);
1501
1502 /* P = u, v, r
1503 * dPdx = dudx, dvdx, drdx
1504 * dPdy = dudy, dvdy, drdy
1505 *
1506 * 1-arg: Does not exist.
1507 *
1508 * 2-arg: dudx dvdx dudy dvdy
1509 * dPdx.x dPdx.y dPdy.x dPdy.y
1510 * m4 m5 m6 m7
1511 *
1512 * 3-arg: dudx dvdx drdx dudy dvdy drdy
1513 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
1514 * m5 m6 m7 m8 m9 m10
1515 */
1516 for (int i = 0; i < grad_components; i++) {
1517 emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
1518 dPdx = offset(dPdx, 1);
1519 }
1520 mlen += MAX2(grad_components, 2);
1521
1522 for (int i = 0; i < grad_components; i++) {
1523 emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
1524 dPdy = offset(dPdy, 1);
1525 }
1526 mlen += MAX2(grad_components, 2);
1527 } else if (op == ir_txs) {
1528 /* There's no SIMD8 resinfo message on Gen4. Use SIMD16 instead. */
1529 simd16 = true;
1530 emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1531 mlen += 2;
1532 } else {
1533 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1534 * instructions. We'll need to do SIMD16 here.
1535 */
1536 simd16 = true;
1537 assert(op == ir_txb || op == ir_txl || op == ir_txf);
1538
1539 for (int i = 0; i < coord_components; i++) {
1540 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
1541 coordinate));
1542 coordinate = offset(coordinate, 1);
1543 }
1544
1545 /* Initialize the rest of u/v/r with 0.0. Empirically, this seems to
1546 * be necessary for TXF (ld), but seems wise to do for all messages.
1547 */
1548 for (int i = coord_components; i < 3; i++) {
1549 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
1550 }
1551
1552 /* lod/bias appears after u/v/r. */
1553 mlen += 6;
1554
1555 emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
1556 mlen++;
1557
1558 /* The unused upper half. */
1559 mlen++;
1560 }
1561
1562 if (simd16) {
1563 /* Now, since we're doing simd16, the return is 2 interleaved
1564 * vec4s where the odd-indexed ones are junk. We'll need to move
1565 * this weirdness around to the expected layout.
1566 */
1567 orig_dst = dst;
1568 dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
1569 }
1570
1571 enum opcode opcode;
1572 switch (op) {
1573 case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1574 case ir_txb: opcode = FS_OPCODE_TXB; break;
1575 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1576 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1577 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1578 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1579 default:
1580 unreachable("not reached");
1581 }
1582
1583 fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1584 inst->base_mrf = base_mrf;
1585 inst->mlen = mlen;
1586 inst->header_size = 1;
1587 inst->regs_written = simd16 ? 8 : 4;
1588
1589 if (simd16) {
1590 for (int i = 0; i < 4; i++) {
1591 emit(MOV(orig_dst, dst));
1592 orig_dst = offset(orig_dst, 1);
1593 dst = offset(dst, 2);
1594 }
1595 }
1596
1597 return inst;
1598 }
1599
1600 fs_inst *
1601 fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
1602 fs_reg coordinate, int vector_elements,
1603 fs_reg shadow_c, fs_reg lod,
1604 uint32_t sampler)
1605 {
1606 fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
1607 bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf;
1608
1609 if (has_lod && shadow_c.file != BAD_FILE)
1610 no16("TXB and TXL with shadow comparison unsupported in SIMD16.");
1611
1612 if (op == ir_txd)
1613 no16("textureGrad unsupported in SIMD16.");
1614
1615 /* Copy the coordinates. */
1616 for (int i = 0; i < vector_elements; i++) {
1617 emit(MOV(retype(offset(message, i), coordinate.type), coordinate));
1618 coordinate = offset(coordinate, 1);
1619 }
1620
1621 fs_reg msg_end = offset(message, vector_elements);
1622
1623 /* Messages other than sample and ld require all three components */
1624 if (has_lod || shadow_c.file != BAD_FILE) {
1625 for (int i = vector_elements; i < 3; i++) {
1626 emit(MOV(offset(message, i), fs_reg(0.0f)));
1627 }
1628 }
1629
1630 if (has_lod) {
1631 fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
1632 BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
1633 emit(MOV(msg_lod, lod));
1634 msg_end = offset(msg_lod, 1);
1635 }
1636
1637 if (shadow_c.file != BAD_FILE) {
1638 fs_reg msg_ref = offset(message, 3 + has_lod);
1639 emit(MOV(msg_ref, shadow_c));
1640 msg_end = offset(msg_ref, 1);
1641 }
1642
1643 enum opcode opcode;
1644 switch (op) {
1645 case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1646 case ir_txb: opcode = FS_OPCODE_TXB; break;
1647 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1648 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1649 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1650 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1651 default: unreachable("not reached");
1652 }
1653
1654 fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1655 inst->base_mrf = message.reg - 1;
1656 inst->mlen = msg_end.reg - inst->base_mrf;
1657 inst->header_size = 1;
1658 inst->regs_written = 8;
1659
1660 return inst;
1661 }
1662
1663 /* gen5's sampler has slots for u, v, r, array index, then optional
1664 * parameters like shadow comparator or LOD bias. If optional
1665 * parameters aren't present, those base slots are optional and don't
1666 * need to be included in the message.
1667 *
1668 * We don't fill in the unnecessary slots regardless, which may look
1669 * surprising in the disassembly.
1670 */
1671 fs_inst *
1672 fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
1673 fs_reg coordinate, int vector_elements,
1674 fs_reg shadow_c,
1675 fs_reg lod, fs_reg lod2, int grad_components,
1676 fs_reg sample_index, uint32_t sampler,
1677 bool has_offset)
1678 {
1679 int reg_width = dispatch_width / 8;
1680 unsigned header_size = 0;
1681
1682 fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
1683 fs_reg msg_coords = message;
1684
1685 if (has_offset) {
1686 /* The offsets set up by the ir_texture visitor are in the
1687 * m1 header, so we can't go headerless.
1688 */
1689 header_size = 1;
1690 message.reg--;
1691 }
1692
1693 for (int i = 0; i < vector_elements; i++) {
1694 emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
1695 coordinate = offset(coordinate, 1);
1696 }
1697 fs_reg msg_end = offset(msg_coords, vector_elements);
1698 fs_reg msg_lod = offset(msg_coords, 4);
1699
1700 if (shadow_c.file != BAD_FILE) {
1701 fs_reg msg_shadow = msg_lod;
1702 emit(MOV(msg_shadow, shadow_c));
1703 msg_lod = offset(msg_shadow, 1);
1704 msg_end = msg_lod;
1705 }
1706
1707 enum opcode opcode;
1708 switch (op) {
1709 case ir_tex:
1710 opcode = SHADER_OPCODE_TEX;
1711 break;
1712 case ir_txb:
1713 emit(MOV(msg_lod, lod));
1714 msg_end = offset(msg_lod, 1);
1715
1716 opcode = FS_OPCODE_TXB;
1717 break;
1718 case ir_txl:
1719 emit(MOV(msg_lod, lod));
1720 msg_end = offset(msg_lod, 1);
1721
1722 opcode = SHADER_OPCODE_TXL;
1723 break;
1724 case ir_txd: {
1725 /**
1726 * P = u, v, r
1727 * dPdx = dudx, dvdx, drdx
1728 * dPdy = dudy, dvdy, drdy
1729 *
1730 * Load up these values:
1731 * - dudx dudy dvdx dvdy drdx drdy
1732 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
1733 */
1734 msg_end = msg_lod;
1735 for (int i = 0; i < grad_components; i++) {
1736 emit(MOV(msg_end, lod));
1737 lod = offset(lod, 1);
1738 msg_end = offset(msg_end, 1);
1739
1740 emit(MOV(msg_end, lod2));
1741 lod2 = offset(lod2, 1);
1742 msg_end = offset(msg_end, 1);
1743 }
1744
1745 opcode = SHADER_OPCODE_TXD;
1746 break;
1747 }
1748 case ir_txs:
1749 msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
1750 emit(MOV(msg_lod, lod));
1751 msg_end = offset(msg_lod, 1);
1752
1753 opcode = SHADER_OPCODE_TXS;
1754 break;
1755 case ir_query_levels:
1756 msg_lod = msg_end;
1757 emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1758 msg_end = offset(msg_lod, 1);
1759
1760 opcode = SHADER_OPCODE_TXS;
1761 break;
1762 case ir_txf:
1763 msg_lod = offset(msg_coords, 3);
1764 emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
1765 msg_end = offset(msg_lod, 1);
1766
1767 opcode = SHADER_OPCODE_TXF;
1768 break;
1769 case ir_txf_ms:
1770 msg_lod = offset(msg_coords, 3);
1771 /* lod */
1772 emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1773 /* sample index */
1774 emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
1775 msg_end = offset(msg_lod, 2);
1776
1777 opcode = SHADER_OPCODE_TXF_CMS;
1778 break;
1779 case ir_lod:
1780 opcode = SHADER_OPCODE_LOD;
1781 break;
1782 case ir_tg4:
1783 opcode = SHADER_OPCODE_TG4;
1784 break;
1785 default:
1786 unreachable("not reached");
1787 }
1788
1789 fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1790 inst->base_mrf = message.reg;
1791 inst->mlen = msg_end.reg - message.reg;
1792 inst->header_size = header_size;
1793 inst->regs_written = 4 * reg_width;
1794
1795 if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1796 fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1797 " disallowed by hardware\n");
1798 }
1799
1800 return inst;
1801 }
1802
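/* A sampler index that may be >= 16 cannot be encoded in the 4-bit sampler
 * field of the message descriptor, so such accesses must offset the Sampler
 * State Pointer in the message header (see emit_texture_gen7() below).
 * Non-immediate sampler registers are treated as potentially high since
 * their value is unknown at compile time; before Haswell more than 16
 * samplers are not supported, so the header is never needed for this.
 */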
1803 static bool
1804 is_high_sampler(const struct brw_device_info *devinfo, fs_reg sampler)
1805 {
1806 if (devinfo->gen < 8 && !devinfo->is_haswell)
1807 return false;
1808
1809 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
1810 }
1811
1812 fs_inst *
1813 fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
1814 fs_reg coordinate, int coord_components,
1815 fs_reg shadow_c,
1816 fs_reg lod, fs_reg lod2, int grad_components,
1817 fs_reg sample_index, fs_reg mcs, fs_reg sampler,
1818 fs_reg offset_value)
1819 {
1820 int reg_width = dispatch_width / 8;
1821 unsigned header_size = 0;
1822
1823 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
1824 for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
1825 sources[i] = vgrf(glsl_type::float_type);
1826 }
1827 int length = 0;
1828
1829 if (op == ir_tg4 || offset_value.file != BAD_FILE ||
1830 is_high_sampler(devinfo, sampler)) {
1831 /* For general texture offsets (no txf workaround), we need a header to
1832 * put them in. Note that for SIMD16 we're making space for two actual
1833 * hardware registers here, so the emit will have to fix up for this.
1834 *
1835 * * ir_tg4 needs to place its channel select in the header,
1836 * for interaction with ARB_texture_swizzle
1837 *
1838 * The sampler index is only 4-bits, so for larger sampler numbers we
1839 * need to offset the Sampler State Pointer in the header.
1840 */
1841 header_size = 1;
1842 sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1843 length++;
1844 }
1845
1846 if (shadow_c.file != BAD_FILE) {
1847 emit(MOV(sources[length], shadow_c));
1848 length++;
1849 }
1850
1851 bool has_nonconstant_offset =
1852 offset_value.file != BAD_FILE && offset_value.file != IMM;
1853 bool coordinate_done = false;
1854
1855 /* The sampler can only meaningfully compute LOD for fragment shader
1856 * messages. For all other stages, we change the opcode to ir_txl and
1857 * hardcode the LOD to 0.
1858 */
1859 if (stage != MESA_SHADER_FRAGMENT && op == ir_tex) {
1860 op = ir_txl;
1861 lod = fs_reg(0.0f);
1862 }
1863
1864 /* Set up the LOD info */
1865 switch (op) {
1866 case ir_tex:
1867 case ir_lod:
1868 break;
1869 case ir_txb:
1870 emit(MOV(sources[length], lod));
1871 length++;
1872 break;
1873 case ir_txl:
1874 emit(MOV(sources[length], lod));
1875 length++;
1876 break;
1877 case ir_txd: {
1878 no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1879
1880 /* Load dPdx and the coordinate together:
1881 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1882 */
1883 for (int i = 0; i < coord_components; i++) {
1884 emit(MOV(sources[length], coordinate));
1885 coordinate = offset(coordinate, 1);
1886 length++;
1887
1888 /* For cube map array, the coordinate is (u,v,r,ai) but there are
1889 * only derivatives for (u, v, r).
1890 */
1891 if (i < grad_components) {
1892 emit(MOV(sources[length], lod));
1893 lod = offset(lod, 1);
1894 length++;
1895
1896 emit(MOV(sources[length], lod2));
1897 lod2 = offset(lod2, 1);
1898 length++;
1899 }
1900 }
1901
1902 coordinate_done = true;
1903 break;
1904 }
1905 case ir_txs:
1906 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
1907 length++;
1908 break;
1909 case ir_query_levels:
1910 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1911 length++;
1912 break;
1913 case ir_txf:
1914 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
1915        * On Gen9 the order is u, v, lod, r.
1916 */
1917
1918 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1919 coordinate = offset(coordinate, 1);
1920 length++;
1921
1922 if (devinfo->gen >= 9) {
1923 if (coord_components >= 2) {
1924 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1925 coordinate = offset(coordinate, 1);
1926 }
1927 length++;
1928 }
1929
1930 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
1931 length++;
1932
1933 for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
1934 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1935 coordinate = offset(coordinate, 1);
1936 length++;
1937 }
1938
1939 coordinate_done = true;
1940 break;
1941 case ir_txf_ms:
1942 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
1943 length++;
1944
1945 /* data from the multisample control surface */
1946 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
1947 length++;
1948
1949 /* there is no offsetting for this message; just copy in the integer
1950 * texture coordinates
1951 */
1952 for (int i = 0; i < coord_components; i++) {
1953 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1954 coordinate = offset(coordinate, 1);
1955 length++;
1956 }
1957
1958 coordinate_done = true;
1959 break;
1960 case ir_tg4:
1961 if (has_nonconstant_offset) {
1962 if (shadow_c.file != BAD_FILE)
1963 no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
1964
1965 /* More crazy intermixing */
1966 for (int i = 0; i < 2; i++) { /* u, v */
1967 emit(MOV(sources[length], coordinate));
1968 coordinate = offset(coordinate, 1);
1969 length++;
1970 }
1971
1972 for (int i = 0; i < 2; i++) { /* offu, offv */
1973 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
1974 offset_value = offset(offset_value, 1);
1975 length++;
1976 }
1977
1978 if (coord_components == 3) { /* r if present */
1979 emit(MOV(sources[length], coordinate));
1980 coordinate = offset(coordinate, 1);
1981 length++;
1982 }
1983
1984 coordinate_done = true;
1985 }
1986 break;
1987 }
1988
1989 /* Set up the coordinate (except for cases where it was done above) */
1990 if (!coordinate_done) {
1991 for (int i = 0; i < coord_components; i++) {
1992 emit(MOV(sources[length], coordinate));
1993 coordinate = offset(coordinate, 1);
1994 length++;
1995 }
1996 }
1997
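   /* Each logical payload source occupies reg_width registers, but the
    * header (when present) is a single physical register regardless of
    * dispatch width, so in SIMD16 we subtract it back out of the doubled
    * length.
    */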
1998 int mlen;
1999 if (reg_width == 2)
2000 mlen = length * reg_width - header_size;
2001 else
2002 mlen = length * reg_width;
2003
2004 fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
2005 BRW_REGISTER_TYPE_F);
2006 emit(LOAD_PAYLOAD(src_payload, sources, length, header_size));
2007
2008 /* Generate the SEND */
2009 enum opcode opcode;
2010 switch (op) {
2011 case ir_tex: opcode = SHADER_OPCODE_TEX; break;
2012 case ir_txb: opcode = FS_OPCODE_TXB; break;
2013 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2014 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2015 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2016 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2017 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2018 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2019 case ir_lod: opcode = SHADER_OPCODE_LOD; break;
2020 case ir_tg4:
2021 if (has_nonconstant_offset)
2022 opcode = SHADER_OPCODE_TG4_OFFSET;
2023 else
2024 opcode = SHADER_OPCODE_TG4;
2025 break;
2026 default:
2027 unreachable("not reached");
2028 }
2029 fs_inst *inst = emit(opcode, dst, src_payload, sampler);
2030 inst->base_mrf = -1;
2031 inst->mlen = mlen;
2032 inst->header_size = header_size;
2033 inst->regs_written = 4 * reg_width;
2034
2035 if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
2036 fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
2037 " disallowed by hardware\n");
2038 }
2039
2040 return inst;
2041 }
2042
2043 fs_reg
2044 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
2045 bool is_rect, uint32_t sampler, int texunit)
2046 {
2047 fs_inst *inst = NULL;
2048 bool needs_gl_clamp = true;
2049 fs_reg scale_x, scale_y;
2050
2051    /* The 965 requires the EU to normalize GL rectangle texture coordinates;
2052     * on gen6+ we only need these scale parameters when GL_CLAMP is in use.
2053     * We use the program parameter state tracking to get the scaling factor.
2054     */
2055 if (is_rect &&
2056 (devinfo->gen < 6 ||
2057 (devinfo->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
2058 key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
2059 struct gl_program_parameter_list *params = prog->Parameters;
2060 int tokens[STATE_LENGTH] = {
2061 STATE_INTERNAL,
2062 STATE_TEXRECT_SCALE,
2063 texunit,
2064 0,
2065 0
2066 };
2067
2068 no16("rectangle scale uniform setup not supported on SIMD16\n");
2069 if (dispatch_width == 16) {
2070 return coordinate;
2071 }
2072
2073 GLuint index = _mesa_add_state_reference(params,
2074 (gl_state_index *)tokens);
2075 /* Try to find existing copies of the texrect scale uniforms. */
2076 for (unsigned i = 0; i < uniforms; i++) {
2077 if (stage_prog_data->param[i] ==
2078 &prog->Parameters->ParameterValues[index][0]) {
2079 scale_x = fs_reg(UNIFORM, i);
2080 scale_y = fs_reg(UNIFORM, i + 1);
2081 break;
2082 }
2083 }
2084
2085 /* If we didn't already set them up, do so now. */
2086 if (scale_x.file == BAD_FILE) {
2087 scale_x = fs_reg(UNIFORM, uniforms);
2088 scale_y = fs_reg(UNIFORM, uniforms + 1);
2089
2090 stage_prog_data->param[uniforms++] =
2091 &prog->Parameters->ParameterValues[index][0];
2092 stage_prog_data->param[uniforms++] =
2093 &prog->Parameters->ParameterValues[index][1];
2094 }
2095 }
2096
2097 /* The 965 requires the EU to do the normalization of GL rectangle
2098 * texture coordinates. We use the program parameter state
2099 * tracking to get the scaling factor.
2100 */
2101 if (devinfo->gen < 6 && is_rect) {
2102 fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
2103 fs_reg src = coordinate;
2104 coordinate = dst;
2105
2106 emit(MUL(dst, src, scale_x));
2107 dst = offset(dst, 1);
2108 src = offset(src, 1);
2109 emit(MUL(dst, src, scale_y));
2110 } else if (is_rect) {
2111 /* On gen6+, the sampler handles the rectangle coordinates
2112 * natively, without needing rescaling. But that means we have
2113 * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
2114 * not [0, 1] like the default case below.
2115 */
2116 needs_gl_clamp = false;
2117
2118 for (int i = 0; i < 2; i++) {
2119 if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
2120 fs_reg chan = coordinate;
2121 chan = offset(chan, i);
2122
2123 inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
2124 inst->conditional_mod = BRW_CONDITIONAL_GE;
2125
2126 /* Our parameter comes in as 1.0/width or 1.0/height,
2127 * because that's what people normally want for doing
2128 * texture rectangle handling. We need width or height
2129 * for clamping, but we don't care enough to make a new
2130 * parameter type, so just invert back.
2131 */
2132 fs_reg limit = vgrf(glsl_type::float_type);
2133 emit(MOV(limit, i == 0 ? scale_x : scale_y));
2134 emit(SHADER_OPCODE_RCP, limit, limit);
2135
2136 inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
2137 inst->conditional_mod = BRW_CONDITIONAL_L;
2138 }
2139 }
2140 }
2141
2142 if (coord_components > 0 && needs_gl_clamp) {
2143 for (int i = 0; i < MIN2(coord_components, 3); i++) {
2144 if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
2145 fs_reg chan = coordinate;
2146 chan = offset(chan, i);
2147
2148 fs_inst *inst = emit(MOV(chan, chan));
2149 inst->saturate = true;
2150 }
2151 }
2152 }
2153 return coordinate;
2154 }
2155
2156 /* Sample from the MCS surface attached to this multisample texture. */
2157 fs_reg
2158 fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
2159 {
2160 int reg_width = dispatch_width / 8;
2161 fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
2162 BRW_REGISTER_TYPE_F);
2163 fs_reg dest = vgrf(glsl_type::uvec4_type);
2164 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
2165
2166 /* parameters are: u, v, r; missing parameters are treated as zero */
2167 for (int i = 0; i < components; i++) {
2168 sources[i] = vgrf(glsl_type::float_type);
2169 emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
2170 coordinate = offset(coordinate, 1);
2171 }
2172
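   /* No header is needed here; with a header size of 0, LOAD_PAYLOAD simply
    * packs the coordinate sources back to back in the payload.
    */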
2173 emit(LOAD_PAYLOAD(payload, sources, components, 0));
2174
2175 fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
2176 inst->base_mrf = -1;
2177 inst->mlen = components * reg_width;
2178 inst->header_size = 0;
2179 inst->regs_written = 4 * reg_width; /* we only care about one reg of
2180 * response, but the sampler always
2181 * writes 4/8
2182 */
2183
2184 return dest;
2185 }
2186
2187 void
2188 fs_visitor::emit_texture(ir_texture_opcode op,
2189 const glsl_type *dest_type,
2190 fs_reg coordinate, int coord_components,
2191 fs_reg shadow_c,
2192 fs_reg lod, fs_reg lod2, int grad_components,
2193 fs_reg sample_index,
2194 fs_reg offset_value,
2195 fs_reg mcs,
2196 int gather_component,
2197 bool is_cube_array,
2198 bool is_rect,
2199 uint32_t sampler,
2200 fs_reg sampler_reg, int texunit)
2201 {
2202 fs_inst *inst = NULL;
2203
2204 if (op == ir_tg4) {
2205 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2206 * emitting anything other than setting up the constant result.
2207 */
2208 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2209 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2210
2211 fs_reg res = vgrf(glsl_type::vec4_type);
2212 this->result = res;
2213
2214          for (int i = 0; i < 4; i++) {
2215 emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
2216 res = offset(res, 1);
2217 }
2218 return;
2219 }
2220 }
2221
2222 if (coordinate.file != BAD_FILE) {
2223 /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
2224 * samplers. This should only be a problem with GL_CLAMP on Gen7.
2225 */
2226 coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
2227 sampler, texunit);
2228 }
2229
2230 /* Writemasking doesn't eliminate channels on SIMD8 texture
2231 * samples, so don't worry about them.
2232 */
2233 fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
2234
2235 if (devinfo->gen >= 7) {
2236 inst = emit_texture_gen7(op, dst, coordinate, coord_components,
2237 shadow_c, lod, lod2, grad_components,
2238 sample_index, mcs, sampler_reg,
2239 offset_value);
2240 } else if (devinfo->gen >= 5) {
2241 inst = emit_texture_gen5(op, dst, coordinate, coord_components,
2242 shadow_c, lod, lod2, grad_components,
2243 sample_index, sampler,
2244 offset_value.file != BAD_FILE);
2245 } else if (dispatch_width == 16) {
2246 inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components,
2247 shadow_c, lod, sampler);
2248 } else {
2249 inst = emit_texture_gen4(op, dst, coordinate, coord_components,
2250 shadow_c, lod, lod2, grad_components,
2251 sampler);
2252 }
2253
2254 if (shadow_c.file != BAD_FILE)
2255 inst->shadow_compare = true;
2256
2257 if (offset_value.file == IMM)
2258 inst->offset = offset_value.fixed_hw_reg.dw1.ud;
2259
2260 if (op == ir_tg4) {
2261 inst->offset |=
2262 gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
2263
2264 if (devinfo->gen == 6)
2265 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
2266 }
2267
2268 /* fixup #layers for cube map arrays */
2269 if (op == ir_txs && is_cube_array) {
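      /* The resinfo result reports the total number of layer-faces in the
       * depth component, so divide by 6 to get the number of cube layers.
       */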
2270 fs_reg depth = offset(dst, 2);
2271 fs_reg fixed_depth = vgrf(glsl_type::int_type);
2272 emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
2273
2274 fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
2275 int components = inst->regs_written / (dst.width / 8);
2276 for (int i = 0; i < components; i++) {
2277 if (i == 2) {
2278 fixed_payload[i] = fixed_depth;
2279 } else {
2280 fixed_payload[i] = offset(dst, i);
2281 }
2282 }
2283 emit(LOAD_PAYLOAD(dst, fixed_payload, components, 0));
2284 }
2285
2286 swizzle_result(op, dest_type->vector_elements, dst, sampler);
2287 }
2288
2289 void
2290 fs_visitor::visit(ir_texture *ir)
2291 {
2292 uint32_t sampler =
2293 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2294
2295 ir_rvalue *nonconst_sampler_index =
2296 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2297
2298 /* Handle non-constant sampler array indexing */
2299 fs_reg sampler_reg;
2300 if (nonconst_sampler_index) {
2301 /* The highest sampler which may be used by this operation is
2302 * the last element of the array. Mark it here, because the generator
2303 * doesn't have enough information to determine the bound.
2304 */
2305 uint32_t array_size = ir->sampler->as_dereference_array()
2306 ->array->type->array_size();
2307
2308 uint32_t max_used = sampler + array_size - 1;
2309 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2310 max_used += stage_prog_data->binding_table.gather_texture_start;
2311 } else {
2312 max_used += stage_prog_data->binding_table.texture_start;
2313 }
2314
2315 brw_mark_surface_used(prog_data, max_used);
2316
2317 /* Emit code to evaluate the actual indexing expression */
2318 nonconst_sampler_index->accept(this);
2319 fs_reg temp = vgrf(glsl_type::uint_type);
2320 emit(ADD(temp, this->result, fs_reg(sampler)));
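      /* The sampler index in the message descriptor must be dynamically
       * uniform, so collapse the per-channel index down to a single value.
       */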
2321 emit_uniformize(temp, temp);
2322
2323 sampler_reg = temp;
2324 } else {
2325 /* Single sampler, or constant array index; the indexing expression
2326 * is just an immediate.
2327 */
2328 sampler_reg = fs_reg(sampler);
2329 }
2330
2331 /* FINISHME: We're failing to recompile our programs when the sampler is
2332 * updated. This only matters for the texture rectangle scale parameters
2333 * (pre-gen6, or gen6+ with GL_CLAMP).
2334 */
2335 int texunit = prog->SamplerUnits[sampler];
2336
2337 /* Should be lowered by do_lower_texture_projection */
2338 assert(!ir->projector);
2339
2340 /* Should be lowered */
2341 assert(!ir->offset || !ir->offset->type->is_array());
2342
2343 /* Generate code to compute all the subexpression trees. This has to be
2344 * done before loading any values into MRFs for the sampler message since
2345 * generating these values may involve SEND messages that need the MRFs.
2346 */
2347 fs_reg coordinate;
2348 int coord_components = 0;
2349 if (ir->coordinate) {
2350 coord_components = ir->coordinate->type->vector_elements;
2351 ir->coordinate->accept(this);
2352 coordinate = this->result;
2353 }
2354
2355 fs_reg shadow_comparitor;
2356 if (ir->shadow_comparitor) {
2357 ir->shadow_comparitor->accept(this);
2358 shadow_comparitor = this->result;
2359 }
2360
2361 fs_reg offset_value;
2362 if (ir->offset) {
2363 ir_constant *const_offset = ir->offset->as_constant();
2364 if (const_offset) {
2365 /* Store the header bitfield in an IMM register. This allows us to
2366 * use offset_value.file to distinguish between no offset, a constant
2367 * offset, and a non-constant offset.
2368 */
2369 offset_value =
2370 fs_reg(brw_texture_offset(const_offset->value.i,
2371 const_offset->type->vector_elements));
2372 } else {
2373 ir->offset->accept(this);
2374 offset_value = this->result;
2375 }
2376 }
2377
2378 fs_reg lod, lod2, sample_index, mcs;
2379 int grad_components = 0;
2380 switch (ir->op) {
2381 case ir_tex:
2382 case ir_lod:
2383 case ir_tg4:
2384 case ir_query_levels:
2385 break;
2386 case ir_txb:
2387 ir->lod_info.bias->accept(this);
2388 lod = this->result;
2389 break;
2390 case ir_txd:
2391 ir->lod_info.grad.dPdx->accept(this);
2392 lod = this->result;
2393
2394 ir->lod_info.grad.dPdy->accept(this);
2395 lod2 = this->result;
2396
2397 grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
2398 break;
2399 case ir_txf:
2400 case ir_txl:
2401 case ir_txs:
2402 ir->lod_info.lod->accept(this);
2403 lod = this->result;
2404 break;
2405 case ir_txf_ms:
2406 ir->lod_info.sample_index->accept(this);
2407 sample_index = this->result;
2408
2409 if (devinfo->gen >= 7 &&
2410 key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
2411 mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
2412 sampler_reg);
2413 } else {
2414 mcs = fs_reg(0u);
2415 }
2416 break;
2417 default:
2418 unreachable("Unrecognized texture opcode");
2419    }
2420
2421 int gather_component = 0;
2422 if (ir->op == ir_tg4)
2423 gather_component = ir->lod_info.component->as_constant()->value.i[0];
2424
2425 bool is_rect =
2426 ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
2427
2428 bool is_cube_array =
2429 ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2430 ir->sampler->type->sampler_array;
2431
2432 emit_texture(ir->op, ir->type, coordinate, coord_components,
2433 shadow_comparitor, lod, lod2, grad_components,
2434 sample_index, offset_value, mcs,
2435 gather_component, is_cube_array, is_rect, sampler,
2436 sampler_reg, texunit);
2437 }
2438
2439 /**
2440 * Apply workarounds for Gen6 gather with UINT/SINT
2441 */
2442 void
2443 fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
2444 {
2445 if (!wa)
2446 return;
2447
2448 int width = (wa & WA_8BIT) ? 8 : 16;
2449
2450 for (int i = 0; i < 4; i++) {
2451 fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
2452 /* Convert from UNORM to UINT */
2453 emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
2454 emit(MOV(dst, dst_f));
2455
2456 if (wa & WA_SIGN) {
2457 /* Reinterpret the UINT value as a signed INT value by
2458 * shifting the sign bit into place, then shifting back
2459 * preserving sign.
2460 */
2461 emit(SHL(dst, dst, fs_reg(32 - width)));
2462 emit(ASR(dst, dst, fs_reg(32 - width)));
2463 }
2464
2465 dst = offset(dst, 1);
2466 }
2467 }
2468
2469 /**
2470 * Set up the gather channel based on the swizzle, for gather4.
2471 */
2472 uint32_t
2473 fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
2474 {
2475 int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
2476 switch (swiz) {
2477 case SWIZZLE_X: return 0;
2478 case SWIZZLE_Y:
2479 /* gather4 sampler is broken for green channel on RG32F --
2480 * we must ask for blue instead.
2481 */
2482 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2483 return 2;
2484 return 1;
2485 case SWIZZLE_Z: return 2;
2486 case SWIZZLE_W: return 3;
2487 default:
2488 unreachable("Not reached"); /* zero, one swizzles handled already */
2489 }
2490 }
2491
2492 /**
2493 * Swizzle the result of a texture result. This is necessary for
2494 * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
2495 */
2496 void
2497 fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
2498 fs_reg orig_val, uint32_t sampler)
2499 {
2500 if (op == ir_query_levels) {
2501 /* # levels is in .w */
2502 this->result = offset(orig_val, 3);
2503 return;
2504 }
2505
2506 this->result = orig_val;
2507
2508    /* txs,lod don't actually sample the texture, so swizzling makes no sense;
2509     * tg4 already applied the swizzle when selecting the gather channel.
2510     */
2511 if (op == ir_txs || op == ir_lod || op == ir_tg4)
2512 return;
2513
2514 if (dest_components == 1) {
2515 /* Ignore DEPTH_TEXTURE_MODE swizzling. */
2516 } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
2517 fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
2518 swizzled_result.type = orig_val.type;
2519
2520 for (int i = 0; i < 4; i++) {
2521 int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
2522 fs_reg l = swizzled_result;
2523 l = offset(l, i);
2524
2525 if (swiz == SWIZZLE_ZERO) {
2526 emit(MOV(l, fs_reg(0.0f)));
2527 } else if (swiz == SWIZZLE_ONE) {
2528 emit(MOV(l, fs_reg(1.0f)));
2529 } else {
2530 emit(MOV(l, offset(orig_val,
2531 GET_SWZ(key_tex->swizzles[sampler], i))));
2532 }
2533 }
2534 this->result = swizzled_result;
2535 }
2536 }
2537
2538 void
2539 fs_visitor::visit(ir_swizzle *ir)
2540 {
2541 ir->val->accept(this);
2542 fs_reg val = this->result;
2543
2544 if (ir->type->vector_elements == 1) {
2545 this->result = offset(this->result, ir->mask.x);
2546 return;
2547 }
2548
2549 fs_reg result = vgrf(ir->type);
2550 this->result = result;
2551
2552 for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
2553 fs_reg channel = val;
2554 int swiz = 0;
2555
2556 switch (i) {
2557 case 0:
2558 swiz = ir->mask.x;
2559 break;
2560 case 1:
2561 swiz = ir->mask.y;
2562 break;
2563 case 2:
2564 swiz = ir->mask.z;
2565 break;
2566 case 3:
2567 swiz = ir->mask.w;
2568 break;
2569 }
2570
2571 emit(MOV(result, offset(channel, swiz)));
2572 result = offset(result, 1);
2573 }
2574 }
2575
2576 void
2577 fs_visitor::visit(ir_discard *ir)
2578 {
2579 /* We track our discarded pixels in f0.1. By predicating on it, we can
2580 * update just the flag bits that aren't yet discarded. If there's no
2581 * condition, we emit a CMP of g0 != g0, so all currently executing
2582 * channels will get turned off.
2583 */
2584 fs_inst *cmp;
2585 if (ir->condition) {
2586 emit_bool_to_cond_code(ir->condition);
2587 cmp = (fs_inst *) this->instructions.get_tail();
2588 cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
2589 } else {
2590 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
2591 BRW_REGISTER_TYPE_UW));
2592 cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
2593 }
2594 cmp->predicate = BRW_PREDICATE_NORMAL;
2595 cmp->flag_subreg = 1;
2596
2597 if (devinfo->gen >= 6) {
2598 emit_discard_jump();
2599 }
2600 }
2601
2602 void
2603 fs_visitor::visit(ir_constant *ir)
2604 {
2605 /* Set this->result to reg at the bottom of the function because some code
2606 * paths will cause this visitor to be applied to other fields. This will
2607 * cause the value stored in this->result to be modified.
2608 *
2609 * Make reg constant so that it doesn't get accidentally modified along the
2610 * way. Yes, I actually had this problem. :(
2611 */
2612 const fs_reg reg = vgrf(ir->type);
2613 fs_reg dst_reg = reg;
2614
2615 if (ir->type->is_array()) {
2616 const unsigned size = type_size(ir->type->fields.array);
2617
2618 for (unsigned i = 0; i < ir->type->length; i++) {
2619 ir->array_elements[i]->accept(this);
2620 fs_reg src_reg = this->result;
2621
2622 dst_reg.type = src_reg.type;
2623 for (unsigned j = 0; j < size; j++) {
2624 emit(MOV(dst_reg, src_reg));
2625 src_reg = offset(src_reg, 1);
2626 dst_reg = offset(dst_reg, 1);
2627 }
2628 }
2629 } else if (ir->type->is_record()) {
2630 foreach_in_list(ir_constant, field, &ir->components) {
2631 const unsigned size = type_size(field->type);
2632
2633 field->accept(this);
2634 fs_reg src_reg = this->result;
2635
2636 dst_reg.type = src_reg.type;
2637 for (unsigned j = 0; j < size; j++) {
2638 emit(MOV(dst_reg, src_reg));
2639 src_reg = offset(src_reg, 1);
2640 dst_reg = offset(dst_reg, 1);
2641 }
2642 }
2643 } else {
2644 const unsigned size = type_size(ir->type);
2645
2646 for (unsigned i = 0; i < size; i++) {
2647 switch (ir->type->base_type) {
2648 case GLSL_TYPE_FLOAT:
2649 emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
2650 break;
2651 case GLSL_TYPE_UINT:
2652 emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
2653 break;
2654 case GLSL_TYPE_INT:
2655 emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
2656 break;
2657 case GLSL_TYPE_BOOL:
2658 emit(MOV(dst_reg, fs_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2659 break;
2660 default:
2661 unreachable("Non-float/uint/int/bool constant");
2662 }
2663 dst_reg = offset(dst_reg, 1);
2664 }
2665 }
2666
2667 this->result = reg;
2668 }
2669
2670 void
2671 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
2672 {
2673 ir_expression *expr = ir->as_expression();
2674
2675 if (!expr || expr->operation == ir_binop_ubo_load) {
2676 ir->accept(this);
2677
2678 fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
2679 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2680 return;
2681 }
2682
2683 fs_reg op[3];
2684
2685 assert(expr->get_num_operands() <= 3);
2686 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2687 assert(expr->operands[i]->type->is_scalar());
2688
2689 expr->operands[i]->accept(this);
2690 op[i] = this->result;
2691
2692 resolve_ud_negate(&op[i]);
2693 }
2694
2695 emit_bool_to_cond_code_of_reg(expr, op);
2696 }
2697
2698 void
2699 fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
2700 {
2701 fs_inst *inst;
2702
2703 switch (expr->operation) {
2704 case ir_unop_logic_not:
2705 inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
2706 inst->conditional_mod = BRW_CONDITIONAL_Z;
2707 break;
2708
2709 case ir_binop_logic_xor:
2710 if (devinfo->gen <= 5) {
2711 fs_reg temp = vgrf(expr->type);
2712 emit(XOR(temp, op[0], op[1]));
2713 inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2714 } else {
2715 inst = emit(XOR(reg_null_d, op[0], op[1]));
2716 }
2717 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2718 break;
2719
2720 case ir_binop_logic_or:
2721 if (devinfo->gen <= 5) {
2722 fs_reg temp = vgrf(expr->type);
2723 emit(OR(temp, op[0], op[1]));
2724 inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2725 } else {
2726 inst = emit(OR(reg_null_d, op[0], op[1]));
2727 }
2728 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2729 break;
2730
2731 case ir_binop_logic_and:
2732 if (devinfo->gen <= 5) {
2733 fs_reg temp = vgrf(expr->type);
2734 emit(AND(temp, op[0], op[1]));
2735 inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2736 } else {
2737 inst = emit(AND(reg_null_d, op[0], op[1]));
2738 }
2739 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2740 break;
2741
2742 case ir_unop_f2b:
2743 if (devinfo->gen >= 6) {
2744 emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
2745 } else {
2746 inst = emit(MOV(reg_null_f, op[0]));
2747 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2748 }
2749 break;
2750
2751 case ir_unop_i2b:
2752 if (devinfo->gen >= 6) {
2753 emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2754 } else {
2755 inst = emit(MOV(reg_null_d, op[0]));
2756 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2757 }
2758 break;
2759
2760 case ir_binop_greater:
2761 case ir_binop_gequal:
2762 case ir_binop_less:
2763 case ir_binop_lequal:
2764 case ir_binop_equal:
2765 case ir_binop_all_equal:
2766 case ir_binop_nequal:
2767 case ir_binop_any_nequal:
2768 if (devinfo->gen <= 5) {
2769 resolve_bool_comparison(expr->operands[0], &op[0]);
2770 resolve_bool_comparison(expr->operands[1], &op[1]);
2771 }
2772
2773 emit(CMP(reg_null_d, op[0], op[1],
2774 brw_conditional_for_comparison(expr->operation)));
2775 break;
2776
2777 case ir_triop_csel: {
2778 /* Expand the boolean condition into the flag register. */
2779 inst = emit(MOV(reg_null_d, op[0]));
2780 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2781
2782 /* Select which boolean to return. */
2783 fs_reg temp = vgrf(expr->operands[1]->type);
2784 inst = emit(SEL(temp, op[1], op[2]));
2785 inst->predicate = BRW_PREDICATE_NORMAL;
2786
2787 /* Expand the result to a condition code. */
2788 inst = emit(MOV(reg_null_d, temp));
2789 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2790 break;
2791 }
2792
2793 default:
2794 unreachable("not reached");
2795 }
2796 }
2797
2798 /**
2799 * Emit a gen6 IF statement with the comparison folded into the IF
2800 * instruction.
2801 */
2802 void
2803 fs_visitor::emit_if_gen6(ir_if *ir)
2804 {
2805 ir_expression *expr = ir->condition->as_expression();
2806
2807 if (expr && expr->operation != ir_binop_ubo_load) {
2808 fs_reg op[3];
2809 fs_inst *inst;
2810 fs_reg temp;
2811
2812 assert(expr->get_num_operands() <= 3);
2813 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2814 assert(expr->operands[i]->type->is_scalar());
2815
2816 expr->operands[i]->accept(this);
2817 op[i] = this->result;
2818 }
2819
2820 switch (expr->operation) {
2821 case ir_unop_logic_not:
2822 emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
2823 return;
2824
2825 case ir_binop_logic_xor:
2826 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
2827 return;
2828
2829 case ir_binop_logic_or:
2830 temp = vgrf(glsl_type::bool_type);
2831 emit(OR(temp, op[0], op[1]));
2832 emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2833 return;
2834
2835 case ir_binop_logic_and:
2836 temp = vgrf(glsl_type::bool_type);
2837 emit(AND(temp, op[0], op[1]));
2838 emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2839 return;
2840
2841 case ir_unop_f2b:
2842 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
2843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2844 return;
2845
2846 case ir_unop_i2b:
2847 emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2848 return;
2849
2850 case ir_binop_greater:
2851 case ir_binop_gequal:
2852 case ir_binop_less:
2853 case ir_binop_lequal:
2854 case ir_binop_equal:
2855 case ir_binop_all_equal:
2856 case ir_binop_nequal:
2857 case ir_binop_any_nequal:
2858 if (devinfo->gen <= 5) {
2859 resolve_bool_comparison(expr->operands[0], &op[0]);
2860 resolve_bool_comparison(expr->operands[1], &op[1]);
2861 }
2862
2863 emit(IF(op[0], op[1],
2864 brw_conditional_for_comparison(expr->operation)));
2865 return;
2866
2867 case ir_triop_csel: {
2868 /* Expand the boolean condition into the flag register. */
2869 fs_inst *inst = emit(MOV(reg_null_d, op[0]));
2870 inst->conditional_mod = BRW_CONDITIONAL_NZ;
2871
2872 /* Select which boolean to use as the result. */
2873 fs_reg temp = vgrf(expr->operands[1]->type);
2874 inst = emit(SEL(temp, op[1], op[2]));
2875 inst->predicate = BRW_PREDICATE_NORMAL;
2876
2877 emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2878 return;
2879 }
2880
2881 default:
2882 unreachable("not reached");
2883 }
2884 }
2885
2886 ir->condition->accept(this);
2887 emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
2888 }
2889
2890 bool
2891 fs_visitor::try_opt_frontfacing_ternary(ir_if *ir)
2892 {
2893 ir_dereference_variable *deref = ir->condition->as_dereference_variable();
2894 if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
2895 return false;
2896
2897 if (ir->then_instructions.length() != 1 ||
2898 ir->else_instructions.length() != 1)
2899 return false;
2900
2901 ir_assignment *then_assign =
2902 ((ir_instruction *)ir->then_instructions.head)->as_assignment();
2903 ir_assignment *else_assign =
2904 ((ir_instruction *)ir->else_instructions.head)->as_assignment();
2905
2906 if (!then_assign || then_assign->condition ||
2907 !else_assign || else_assign->condition ||
2908 then_assign->write_mask != else_assign->write_mask ||
2909 !then_assign->lhs->equals(else_assign->lhs))
2910 return false;
2911
2912 ir_constant *then_rhs = then_assign->rhs->as_constant();
2913 ir_constant *else_rhs = else_assign->rhs->as_constant();
2914
2915 if (!then_rhs || !else_rhs)
2916 return false;
2917
2918 if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
2919 return false;
2920
2921 if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
2922 (else_rhs->is_one() && then_rhs->is_negative_one())) {
2923 then_assign->lhs->accept(this);
2924 fs_reg dst = this->result;
2925 dst.type = BRW_REGISTER_TYPE_D;
2926 fs_reg tmp = vgrf(glsl_type::int_type);
2927
2928 if (devinfo->gen >= 6) {
2929 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
2930 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
2931
2932 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
2933 *
2934 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
2935 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
2936 *
2937 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
2938 */
2939
2940 if (then_rhs->is_negative_one()) {
2941 assert(else_rhs->is_one());
2942 g0.negate = true;
2943 }
2944
2945 tmp.type = BRW_REGISTER_TYPE_W;
2946 tmp.subreg_offset = 2;
2947 tmp.stride = 2;
2948
2949 fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
2950 or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
2951
2952 tmp.type = BRW_REGISTER_TYPE_D;
2953 tmp.subreg_offset = 0;
2954 tmp.stride = 1;
2955 } else {
2956 /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
2957 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
2958
2959 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
2960 *
2961 * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
2962 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
2963 *
2964 * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
2965 */
2966
2967 if (then_rhs->is_negative_one()) {
2968 assert(else_rhs->is_one());
2969 g1_6.negate = true;
2970 }
2971
2972 emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
2973 }
2974 emit(AND(dst, tmp, fs_reg(0xbf800000)));
2975 return true;
2976 }
2977
2978 return false;
2979 }
2980
2981 /**
2982 * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
2983 *
2984 * Many GLSL shaders contain the following pattern:
2985 *
2986 * x = condition ? foo : bar
2987 *
2988 * The compiler emits an ir_if tree for this, since each subexpression might be
2989 * a complex tree that could have side-effects or short-circuit logic.
2990 *
2991 * However, the common case is to simply select one of two constants or
2992 * variable values---which is exactly what SEL is for. In this case, the
2993 * assembly looks like:
2994 *
2995 * (+f0) IF
2996 * MOV dst src0
2997 * ELSE
2998 * MOV dst src1
2999 * ENDIF
3000 *
3001 * which can be easily translated into:
3002 *
3003 * (+f0) SEL dst src0 src1
3004 *
3005 * If src0 is an immediate value, we promote it to a temporary GRF.
3006 */
3007 bool
3008 fs_visitor::try_replace_with_sel()
3009 {
3010 fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
3011 assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
3012
3013 /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
3014 int opcodes[] = {
3015 BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
3016 };
3017
3018 fs_inst *match = (fs_inst *) endif_inst->prev;
3019 for (int i = 0; i < 4; i++) {
3020       if (match->is_head_sentinel() || match->opcode != opcodes[4 - i - 1])
3021 return false;
3022 match = (fs_inst *) match->prev;
3023 }
3024
3025 /* The opcodes match; it looks like the right sequence of instructions. */
3026 fs_inst *else_mov = (fs_inst *) endif_inst->prev;
3027 fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
3028 fs_inst *if_inst = (fs_inst *) then_mov->prev;
3029
3030 /* Check that the MOVs are the right form. */
3031 if (then_mov->dst.equals(else_mov->dst) &&
3032 !then_mov->is_partial_write() &&
3033 !else_mov->is_partial_write()) {
3034
3035 /* Remove the matched instructions; we'll emit a SEL to replace them. */
3036 while (!if_inst->next->is_tail_sentinel())
3037 if_inst->next->exec_node::remove();
3038 if_inst->exec_node::remove();
3039
3040 /* Only the last source register can be a constant, so if the MOV in
3041 * the "then" clause uses a constant, we need to put it in a temporary.
3042 */
3043 fs_reg src0(then_mov->src[0]);
3044 if (src0.file == IMM) {
3045 src0 = vgrf(glsl_type::float_type);
3046 src0.type = then_mov->src[0].type;
3047 emit(MOV(src0, then_mov->src[0]));
3048 }
3049
3050 fs_inst *sel;
3051 if (if_inst->conditional_mod) {
3052 /* Sandybridge-specific IF with embedded comparison */
3053 emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
3054 if_inst->conditional_mod));
3055 sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
3056 sel->predicate = BRW_PREDICATE_NORMAL;
3057 } else {
3058 /* Separate CMP and IF instructions */
3059 sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
3060 sel->predicate = if_inst->predicate;
3061 sel->predicate_inverse = if_inst->predicate_inverse;
3062 }
3063
3064 return true;
3065 }
3066
3067 return false;
3068 }
3069
3070 void
3071 fs_visitor::visit(ir_if *ir)
3072 {
3073 if (try_opt_frontfacing_ternary(ir))
3074 return;
3075
3076 /* Don't point the annotation at the if statement, because then it plus
3077 * the then and else blocks get printed.
3078 */
3079 this->base_ir = ir->condition;
3080
3081 if (devinfo->gen == 6) {
3082 emit_if_gen6(ir);
3083 } else {
3084 emit_bool_to_cond_code(ir->condition);
3085
3086 emit(IF(BRW_PREDICATE_NORMAL));
3087 }
3088
3089 foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
3090 this->base_ir = ir_;
3091 ir_->accept(this);
3092 }
3093
3094 if (!ir->else_instructions.is_empty()) {
3095 emit(BRW_OPCODE_ELSE);
3096
3097 foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
3098 this->base_ir = ir_;
3099 ir_->accept(this);
3100 }
3101 }
3102
3103 emit(BRW_OPCODE_ENDIF);
3104
3105 if (!try_replace_with_sel() && devinfo->gen < 6) {
3106 no16("Can't support (non-uniform) control flow on SIMD16\n");
3107 }
3108 }
3109
3110 void
3111 fs_visitor::visit(ir_loop *ir)
3112 {
3113 if (devinfo->gen < 6) {
3114 no16("Can't support (non-uniform) control flow on SIMD16\n");
3115 }
3116
3117 this->base_ir = NULL;
3118 emit(BRW_OPCODE_DO);
3119
3120 foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
3121 this->base_ir = ir_;
3122 ir_->accept(this);
3123 }
3124
3125 this->base_ir = NULL;
3126 emit(BRW_OPCODE_WHILE);
3127 }
3128
3129 void
3130 fs_visitor::visit(ir_loop_jump *ir)
3131 {
3132 switch (ir->mode) {
3133 case ir_loop_jump::jump_break:
3134 emit(BRW_OPCODE_BREAK);
3135 break;
3136 case ir_loop_jump::jump_continue:
3137 emit(BRW_OPCODE_CONTINUE);
3138 break;
3139 }
3140 }
3141
3142 void
3143 fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
3144 {
3145 ir_dereference *deref = static_cast<ir_dereference *>(
3146 ir->actual_parameters.get_head());
3147 ir_variable *location = deref->variable_referenced();
3148 unsigned surf_index = (stage_prog_data->binding_table.abo_start +
3149 location->data.binding);
3150
3151 /* Calculate the surface offset */
3152 fs_reg offset = vgrf(glsl_type::uint_type);
3153 ir_dereference_array *deref_array = deref->as_dereference_array();
3154
3155 if (deref_array) {
3156 deref_array->array_index->accept(this);
3157
3158 fs_reg tmp = vgrf(glsl_type::uint_type);
3159 emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
3160 emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
3161 } else {
3162 offset = fs_reg(location->data.atomic.offset);
3163 }
3164
3165 /* Emit the appropriate machine instruction */
3166 const char *callee = ir->callee->function_name();
3167 ir->return_deref->accept(this);
3168 fs_reg dst = this->result;
3169
3170 if (!strcmp("__intrinsic_atomic_read", callee)) {
3171 emit_untyped_surface_read(surf_index, dst, offset);
3172
3173 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
3174 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
3175 fs_reg(), fs_reg());
3176
3177 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
3178 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
3179 fs_reg(), fs_reg());
3180 }
3181 }
3182
3183 void
3184 fs_visitor::visit(ir_call *ir)
3185 {
3186 const char *callee = ir->callee->function_name();
3187
3188 if (!strcmp("__intrinsic_atomic_read", callee) ||
3189 !strcmp("__intrinsic_atomic_increment", callee) ||
3190 !strcmp("__intrinsic_atomic_predecrement", callee)) {
3191 visit_atomic_counter_intrinsic(ir);
3192 } else {
3193 unreachable("Unsupported intrinsic.");
3194 }
3195 }
3196
3197 void
3198 fs_visitor::visit(ir_return *)
3199 {
3200 unreachable("FINISHME");
3201 }
3202
3203 void
3204 fs_visitor::visit(ir_function *ir)
3205 {
3206 /* Ignore function bodies other than main() -- we shouldn't see calls to
3207 * them since they should all be inlined before we get to ir_to_mesa.
3208 */
3209 if (strcmp(ir->name, "main") == 0) {
3210 const ir_function_signature *sig;
3211 exec_list empty;
3212
3213 sig = ir->matching_signature(NULL, &empty, false);
3214
3215 assert(sig);
3216
3217 foreach_in_list(ir_instruction, ir_, &sig->body) {
3218 this->base_ir = ir_;
3219 ir_->accept(this);
3220 }
3221 }
3222 }
3223
3224 void
3225 fs_visitor::visit(ir_function_signature *)
3226 {
3227 unreachable("not reached");
3228 }
3229
3230 void
3231 fs_visitor::visit(ir_emit_vertex *)
3232 {
3233 unreachable("not reached");
3234 }
3235
3236 void
3237 fs_visitor::visit(ir_end_primitive *)
3238 {
3239 unreachable("not reached");
3240 }
3241
3242 void
3243 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3244 fs_reg dst, fs_reg offset, fs_reg src0,
3245 fs_reg src1)
3246 {
3247 int reg_width = dispatch_width / 8;
3248 int length = 0;
3249
3250 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
3251
3252 sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3253 /* Initialize the sample mask in the message header. */
3254 emit(MOV(sources[0], fs_reg(0u)))
3255 ->force_writemask_all = true;
3256
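   /* Dword 7 of the header holds the channel mask that the data port ANDs
    * with the execution mask.  For fragment shaders use the live pixel mask
    * (flag f0.1 when discard is in use, otherwise the mask delivered in
    * g1.7); other stages simply enable all channels.
    */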
3257 if (stage == MESA_SHADER_FRAGMENT) {
3258 if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
3259 emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
3260 ->force_writemask_all = true;
3261 } else {
3262 emit(MOV(component(sources[0], 7),
3263 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
3264 ->force_writemask_all = true;
3265 }
3266 } else {
3267 /* The execution mask is part of the side-band information sent together with
3268 * the message payload to the data port. It's implicitly ANDed with the sample
3269 * mask sent in the header to compute the actual set of channels that execute
3270 * the atomic operation.
3271 */
3272 assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
3273 emit(MOV(component(sources[0], 7),
3274 fs_reg(0xffffu)))->force_writemask_all = true;
3275 }
3276 length++;
3277
3278 /* Set the atomic operation offset. */
3279 sources[1] = vgrf(glsl_type::uint_type);
3280 emit(MOV(sources[1], offset));
3281 length++;
3282
3283 /* Set the atomic operation arguments. */
3284 if (src0.file != BAD_FILE) {
3285 sources[length] = vgrf(glsl_type::uint_type);
3286 emit(MOV(sources[length], src0));
3287 length++;
3288 }
3289
3290 if (src1.file != BAD_FILE) {
3291 sources[length] = vgrf(glsl_type::uint_type);
3292 emit(MOV(sources[length], src1));
3293 length++;
3294 }
3295
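   /* One register of header plus reg_width registers for each per-channel
    * payload source; the header always occupies a single register regardless
    * of dispatch width.
    */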
3296 int mlen = 1 + (length - 1) * reg_width;
3297 fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
3298 BRW_REGISTER_TYPE_UD);
3299 emit(LOAD_PAYLOAD(src_payload, sources, length, 1));
3300
3301 /* Emit the instruction. */
3302 fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
3303 fs_reg(surf_index), fs_reg(atomic_op));
3304 inst->mlen = mlen;
3305 }
3306
3307 void
3308 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
3309 fs_reg offset)
3310 {
3311 int reg_width = dispatch_width / 8;
3312
3313 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
3314
3315 sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3316 /* Initialize the sample mask in the message header. */
3317 emit(MOV(sources[0], fs_reg(0u)))
3318 ->force_writemask_all = true;
3319
3320 if (stage == MESA_SHADER_FRAGMENT) {
3321 if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
3322 emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
3323 ->force_writemask_all = true;
3324 } else {
3325 emit(MOV(component(sources[0], 7),
3326 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
3327 ->force_writemask_all = true;
3328 }
3329 } else {
3330 /* The execution mask is part of the side-band information sent together with
3331 * the message payload to the data port. It's implicitly ANDed with the sample
3332 * mask sent in the header to compute the actual set of channels that execute
3333 * the atomic operation.
3334 */
3335 assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
3336 emit(MOV(component(sources[0], 7),
3337 fs_reg(0xffffu)))->force_writemask_all = true;
3338 }
3339
3340 /* Set the surface read offset. */
3341 sources[1] = vgrf(glsl_type::uint_type);
3342 emit(MOV(sources[1], offset));
3343
3344 int mlen = 1 + reg_width;
3345 fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
3346 BRW_REGISTER_TYPE_UD);
3347 fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1));
3348
3349 /* Emit the instruction. */
3350 inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
3351 fs_reg(surf_index), fs_reg(1));
3352 inst->mlen = mlen;
3353 }
3354
3355 fs_inst *
3356 fs_visitor::emit(fs_inst *inst)
3357 {
3358 if (dispatch_width == 16 && inst->exec_size == 8)
3359 inst->force_uncompressed = true;
3360
3361 inst->annotation = this->current_annotation;
3362 inst->ir = this->base_ir;
3363
3364 this->instructions.push_tail(inst);
3365
3366 return inst;
3367 }
3368
3369 void
3370 fs_visitor::emit(exec_list list)
3371 {
3372 foreach_in_list_safe(fs_inst, inst, &list) {
3373 inst->exec_node::remove();
3374 emit(inst);
3375 }
3376 }
3377
3378 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
3379 void
3380 fs_visitor::emit_dummy_fs()
3381 {
3382 int reg_width = dispatch_width / 8;
3383
3384 /* Everyone's favorite color. */
3385 const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
3386 for (int i = 0; i < 4; i++) {
3387 emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
3388 dispatch_width), fs_reg(color[i])));
3389 }
3390
3391 fs_inst *write;
3392 write = emit(FS_OPCODE_FB_WRITE);
3393 write->eot = true;
3394 if (devinfo->gen >= 6) {
3395 write->base_mrf = 2;
3396 write->mlen = 4 * reg_width;
3397 } else {
3398 write->header_size = 2;
3399 write->base_mrf = 0;
3400 write->mlen = 2 + 4 * reg_width;
3401 }
3402
3403 /* Tell the SF we don't have any inputs. Gen4-5 require at least one
3404 * varying to avoid GPU hangs, so set that.
3405 */
3406 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3407 wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0;
3408 memset(wm_prog_data->urb_setup, -1,
3409 sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
3410
3411 /* We don't have any uniforms. */
3412 stage_prog_data->nr_params = 0;
3413 stage_prog_data->nr_pull_params = 0;
3414 stage_prog_data->curb_read_length = 0;
3415 stage_prog_data->dispatch_grf_start_reg = 2;
3416 wm_prog_data->dispatch_grf_start_reg_16 = 2;
3417 grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
3418
3419 calculate_cfg();
3420 }
3421
3422 /* The register location here is relative to the start of the URB
3423 * data. It will get adjusted to be a real location before
3424 * generate_code() time.
3425 */
3426 struct brw_reg
3427 fs_visitor::interp_reg(int location, int channel)
3428 {
3429 assert(stage == MESA_SHADER_FRAGMENT);
3430 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
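   /* The setup data for each varying slot spans two registers, packed two
    * channels per register with four dwords per channel, so odd channels
    * start at dword 4 of their register.
    */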
3431 int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
3432 int stride = (channel & 1) * 4;
3433
3434 assert(prog_data->urb_setup[location] != -1);
3435
3436 return brw_vec1_grf(regnr, stride);
3437 }
3438
3439 /** Emits the interpolation for the varying inputs. */
3440 void
3441 fs_visitor::emit_interpolation_setup_gen4()
3442 {
3443 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
3444
3445 this->current_annotation = "compute pixel centers";
3446 this->pixel_x = vgrf(glsl_type::uint_type);
3447 this->pixel_y = vgrf(glsl_type::uint_type);
3448 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
3449 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
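   /* g1 holds the screen-space X/Y of the upper-left pixel of each 2x2
    * subspan.  brw_imm_v packs eight 4-bit immediates, so adding 0x10101010
    * supplies the per-pixel X offsets (0,1,0,1,...) and 0x11001100 the Y
    * offsets (0,0,1,1,...), yielding the individual pixel positions.
    */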
3450 emit(ADD(this->pixel_x,
3451 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
3452 fs_reg(brw_imm_v(0x10101010))));
3453 emit(ADD(this->pixel_y,
3454 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
3455 fs_reg(brw_imm_v(0x11001100))));
3456
3457 this->current_annotation = "compute pixel deltas from v0";
3458
3459 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3460 vgrf(glsl_type::vec2_type);
3461 const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
3462 const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
3463 const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
3464
3465 if (devinfo->has_pln && dispatch_width == 16) {
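      /* PLN reads the X deltas for eight channels from one register and the
       * Y deltas from the next, so in SIMD16 interleave the halves (x, y per
       * half) rather than storing all 16 X deltas followed by all 16 Y deltas.
       */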
3466 emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
3467 emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
3468 emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
3469 ->force_sechalf = true;
3470 emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
3471 ->force_sechalf = true;
3472 } else {
3473 emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
3474 emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
3475 }
3476
3477 this->current_annotation = "compute pos.w and 1/pos.w";
3478 /* Compute wpos.w. It's always in our setup, since it's needed to
3479 * interpolate the other attributes.
3480 */
3481 this->wpos_w = vgrf(glsl_type::float_type);
3482 emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
3483 /* Compute the pixel 1/W value from wpos.w. */
3484 this->pixel_w = vgrf(glsl_type::float_type);
3485 emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
3486 this->current_annotation = NULL;
3487 }
3488
3489 /** Emits the interpolation for the varying inputs. */
3490 void
3491 fs_visitor::emit_interpolation_setup_gen6()
3492 {
3493 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
3494
3495 this->current_annotation = "compute pixel centers";
3496    if (devinfo->gen >= 8 || dispatch_width == 8) {
3497 /* The "Register Region Restrictions" page says for BDW (and newer,
3498 * presumably):
3499 *
3500 * "When destination spans two registers, the source may be one or
3501 * two registers. The destination elements must be evenly split
3502 * between the two registers."
3503 *
3504 * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
3505 * compute our pixel centers.
3506 */
3507 fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
3508 BRW_REGISTER_TYPE_UW, dispatch_width * 2);
3509 emit(ADD(int_pixel_xy,
3510 fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
3511 fs_reg(brw_imm_v(0x11001010))))
3512 ->force_writemask_all = true;
3513
3514 this->pixel_x = vgrf(glsl_type::float_type);
3515 this->pixel_y = vgrf(glsl_type::float_type);
3516 emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
3517 emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
3518 } else {
3519 /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
3520 *
3521 * "When destination spans two registers, the source MUST span two
3522 * registers."
3523 *
3524 * Since the GRF source of the ADD will only read a single register, we
3525 * must do two separate ADDs in SIMD16.
3526 */
3527 fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
3528 fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
3529 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
3530 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
3531 emit(ADD(int_pixel_x,
3532 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
3533 fs_reg(brw_imm_v(0x10101010))));
3534 emit(ADD(int_pixel_y,
3535 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
3536 fs_reg(brw_imm_v(0x11001100))));
3537
3538 /* As of gen6, we can no longer mix float and int sources. We have
3539 * to turn the integer pixel centers into floats for their actual
3540 * use.
3541 */
3542 this->pixel_x = vgrf(glsl_type::float_type);
3543 this->pixel_y = vgrf(glsl_type::float_type);
3544 emit(MOV(this->pixel_x, int_pixel_x));
3545 emit(MOV(this->pixel_y, int_pixel_y));
3546 }
3547
3548 this->current_annotation = "compute pos.w";
3549 this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
3550 this->wpos_w = vgrf(glsl_type::float_type);
3551 emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
3552
3553 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3554 uint8_t reg = payload.barycentric_coord_reg[i];
3555 this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
3556 }
3557
3558 this->current_annotation = NULL;
3559 }
3560
3561 int
3562 fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
3563 bool use_2nd_half)
3564 {
3565 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3566 fs_inst *inst;
3567
3568 if (color.file == BAD_FILE) {
3569 return 4 * (dispatch_width / 8);
3570 }
3571
3572 uint8_t colors_enabled;
3573 if (components == 0) {
3574 /* We want to write one component to the alpha channel */
3575 colors_enabled = 0x8;
3576 } else {
3577 /* Enable the first components-many channels */
3578 colors_enabled = (1 << components) - 1;
3579 }
3580
3581 if (dispatch_width == 8 || (devinfo->gen >= 6 && !do_dual_src)) {
3582 /* SIMD8 write looks like:
3583 * m + 0: r0
3584 * m + 1: r1
3585 * m + 2: g0
3586 * m + 3: g1
3587 *
3588 * gen6 SIMD16 DP write looks like:
3589 * m + 0: r0
3590 * m + 1: r1
3591 * m + 2: g0
3592 * m + 3: g1
3593 * m + 4: b0
3594 * m + 5: b1
3595 * m + 6: a0
3596 * m + 7: a1
3597 */
3598 int len = 0;
3599 for (unsigned i = 0; i < 4; ++i) {
3600 if (colors_enabled & (1 << i)) {
3601 dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
3602 color.type, color.width);
3603 inst = emit(MOV(dst[len], offset(color, i)));
3604 inst->saturate = key->clamp_fragment_color;
3605 } else if (color.width == 16) {
3606 /* We need two BAD_FILE slots for a 16-wide color */
3607 len++;
3608 }
3609 len++;
3610 }
3611 return len;
3612 } else if (devinfo->gen >= 6 && do_dual_src) {
3613 /* SIMD16 dual source blending for gen6+.
3614 *
3615 * From the SNB PRM, volume 4, part 1, page 193:
3616 *
3617 * "The dual source render target messages only have SIMD8 forms due to
3618 * maximum message length limitations. SIMD16 pixel shaders must send two
3619 * of these messages to cover all of the pixels. Each message contains
3620 * two colors (4 channels each) for each pixel in the message payload."
3621 *
3622 * So in SIMD16 dual source blending we will send 2 SIMD8 messages,
3623 * each one will call this function twice (one for each color involved),
3624 * so in each pass we only write 4 registers. Notice that the second
3625 * SIMD8 message needs to read color data from the 2nd half of the color
3626 * registers, so it needs to call this with use_2nd_half = true.
3627 */
3628 for (unsigned i = 0; i < 4; ++i) {
3629 if (colors_enabled & (1 << i)) {
3630 dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
3631 inst = emit(MOV(dst[i], half(offset(color, i),
3632 use_2nd_half ? 1 : 0)));
3633 inst->saturate = key->clamp_fragment_color;
3634 if (use_2nd_half)
3635 inst->force_sechalf = true;
3636 }
3637 }
3638 return 4;
3639 } else {
3640 /* pre-gen6 SIMD16 single source DP write looks like:
3641 * m + 0: r0
3642 * m + 1: g0
3643 * m + 2: b0
3644 * m + 3: a0
3645 * m + 4: r1
3646 * m + 5: g1
3647 * m + 6: b1
3648 * m + 7: a1
3649 */
3650 for (unsigned i = 0; i < 4; ++i) {
3651 if (colors_enabled & (1 << i)) {
3652 dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
3653 inst = emit(MOV(dst[i], half(offset(color, i), 0)));
3654 inst->saturate = key->clamp_fragment_color;
3655
3656 dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
3657 inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
3658 inst->saturate = key->clamp_fragment_color;
3659 inst->force_sechalf = true;
3660 }
3661 }
3662 return 8;
3663 }
3664 }
3665
3666 static enum brw_conditional_mod
3667 cond_for_alpha_func(GLenum func)
3668 {
3669 switch(func) {
3670 case GL_GREATER:
3671 return BRW_CONDITIONAL_G;
3672 case GL_GEQUAL:
3673 return BRW_CONDITIONAL_GE;
3674 case GL_LESS:
3675 return BRW_CONDITIONAL_L;
3676 case GL_LEQUAL:
3677 return BRW_CONDITIONAL_LE;
3678 case GL_EQUAL:
3679 return BRW_CONDITIONAL_EQ;
3680 case GL_NOTEQUAL:
3681 return BRW_CONDITIONAL_NEQ;
3682 default:
3683 unreachable("Not reached");
3684 }
3685 }
3686
3687 /**
3688 * Alpha test support for when we compile it into the shader instead
3689 * of using the normal fixed-function alpha test.
3690 */
3691 void
3692 fs_visitor::emit_alpha_test()
3693 {
3694 assert(stage == MESA_SHADER_FRAGMENT);
3695 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3696 this->current_annotation = "Alpha test";
3697
3698 fs_inst *cmp;
3699 if (key->alpha_test_func == GL_ALWAYS)
3700 return;
3701
3702 if (key->alpha_test_func == GL_NEVER) {
3703 /* f0.1 = 0 */
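      /* Comparing a register with itself using NEQ can never pass, so this
       * clears f0.1 and every pixel fails the alpha test.
       */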
3704 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3705 BRW_REGISTER_TYPE_UW));
3706 cmp = emit(CMP(reg_null_f, some_reg, some_reg,
3707 BRW_CONDITIONAL_NEQ));
3708 } else {
3709 /* RT0 alpha */
3710 fs_reg color = offset(outputs[0], 3);
3711
3712 /* f0.1 &= func(color, ref) */
3713 cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
3714 cond_for_alpha_func(key->alpha_test_func)));
3715 }
3716 cmp->predicate = BRW_PREDICATE_NORMAL;
3717 cmp->flag_subreg = 1;
3718 }
3719
3720 fs_inst *
3721 fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
3722 fs_reg src0_alpha, unsigned components,
3723 unsigned exec_size, bool use_2nd_half)
3724 {
3725 assert(stage == MESA_SHADER_FRAGMENT);
3726 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3727 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3728
3729 this->current_annotation = "FB write header";
3730 int header_size = 2, payload_header_size;
3731 int reg_size = exec_size / 8;
3732
3733 /* We can potentially have a message length of up to 15, so we have to set
3734 * base_mrf to either 0 or 1 in order to fit in m0..m15.
3735 */
3736 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
3737 int length = 0;
3738
3739 /* From the Sandy Bridge PRM, volume 4, page 198:
3740 *
3741 * "Dispatched Pixel Enables. One bit per pixel indicating
3742 * which pixels were originally enabled when the thread was
3743 * dispatched. This field is only required for the end-of-
3744 * thread message and on all dual-source messages."
3745 */
3746 if (devinfo->gen >= 6 &&
3747 (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
3748 color1.file == BAD_FILE &&
3749 key->nr_color_regions == 1) {
3750 header_size = 0;
3751 }
3752
3753 if (header_size != 0) {
3754 assert(header_size == 2);
3755 /* Allocate 2 registers for a header */
3756 length += 2;
3757 }
3758
3759 if (payload.aa_dest_stencil_reg) {
3760 sources[length] = fs_reg(GRF, alloc.allocate(1));
3761 emit(MOV(sources[length],
3762 fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
3763 length++;
3764 }
3765
3766 prog_data->uses_omask =
3767 prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
3768 if (prog_data->uses_omask) {
3769 this->current_annotation = "FB write oMask";
3770 assert(this->sample_mask.file != BAD_FILE);
3771       /* Hand over gl_SampleMask. Only the lower 16 bits are relevant. Since
3772        * it's unsigned single words, one vgrf is always 16-wide.
3773 */
3774 sources[length] = fs_reg(GRF, alloc.allocate(1),
3775 BRW_REGISTER_TYPE_UW, 16);
3776 emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
3777 length++;
3778 }
3779
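   /* Everything queued so far (the optional 2-register header, AA/dest
    * stencil and oMask) is per-message header data rather than per-channel
    * color, so record how many sources LOAD_PAYLOAD should treat as header.
    */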
3780 payload_header_size = length;
3781
3782 if (color0.file == BAD_FILE) {
3783       /* Even if there are no color buffers enabled, we still need to send
3784 * alpha out the pipeline to our null renderbuffer to support
3785 * alpha-testing, alpha-to-coverage, and so on.
3786 */
3787 length += setup_color_payload(sources + length, this->outputs[0], 0,
3788 false);
3789 } else if (color1.file == BAD_FILE) {
3790 if (src0_alpha.file != BAD_FILE) {
3791 sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
3792 src0_alpha.type, src0_alpha.width);
3793 fs_inst *inst = emit(MOV(sources[length], src0_alpha));
3794 inst->saturate = key->clamp_fragment_color;
3795 length++;
3796 }
3797
3798 length += setup_color_payload(sources + length, color0, components,
3799 false);
3800 } else {
3801 length += setup_color_payload(sources + length, color0, components,
3802 use_2nd_half);
3803 length += setup_color_payload(sources + length, color1, components,
3804 use_2nd_half);
3805 }
3806
3807 if (source_depth_to_render_target) {
3808 if (devinfo->gen == 6) {
3809 /* For outputting oDepth on gen6, SIMD8 writes have to be
3810 * used. This would require SIMD8 moves of each half to
3811 * message regs, kind of like pre-gen5 SIMD16 FB writes.
3812 * Just bail on doing so for now.
3813 */
3814 no16("Missing support for simd16 depth writes on gen6\n");
3815 }
3816
3817 sources[length] = vgrf(glsl_type::float_type);
3818 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3819 /* Hand over gl_FragDepth. */
3820 assert(this->frag_depth.file != BAD_FILE);
3821 emit(MOV(sources[length], this->frag_depth));
3822 } else {
3823 /* Pass through the payload depth. */
3824 emit(MOV(sources[length],
3825 fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
3826 }
3827 length++;
3828 }
3829
3830 if (payload.dest_depth_reg) {
3831 sources[length] = vgrf(glsl_type::float_type);
3832 emit(MOV(sources[length],
3833 fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
3834 length++;
3835 }
3836
3837 fs_inst *load;
3838 fs_inst *write;
3839 if (devinfo->gen >= 7) {
3840 /* Send from the GRF */
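      /* Emit LOAD_PAYLOAD with a placeholder destination first so that the
       * allocation can be sized from regs_written, then patch the real VGRF
       * back into the instruction's dst.
       */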
3841 fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
3842 load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size));
3843 payload.reg = alloc.allocate(load->regs_written);
3844 payload.width = dispatch_width;
3845 load->dst = payload;
3846 write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
3847 write->base_mrf = -1;
3848 } else {
3849 /* Send from the MRF */
3850 load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3851 sources, length, payload_header_size));
3852 write = emit(FS_OPCODE_FB_WRITE);
3853 write->exec_size = exec_size;
3854 write->base_mrf = 1;
3855 }
3856
3857 write->mlen = load->regs_written;
3858 write->header_size = header_size;
3859 if (prog_data->uses_kill) {
3860 write->predicate = BRW_PREDICATE_NORMAL;
3861 write->flag_subreg = 1;
3862 }
3863 return write;
3864 }
3865
3866 void
3867 fs_visitor::emit_fb_writes()
3868 {
3869 assert(stage == MESA_SHADER_FRAGMENT);
3870 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3871 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3872
3873 fs_inst *inst = NULL;
3874 if (do_dual_src) {
3875 this->current_annotation = ralloc_asprintf(this->mem_ctx,
3876 "FB dual-source write");
3877 inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
3878 reg_undef, 4, 8);
3879 inst->target = 0;
3880
3881       /* SIMD16 dual source blending requires sending two SIMD8 dual source
3882 * messages, where each message contains color data for 8 pixels. Color
3883 * data for the first group of pixels is stored in the "lower" half of
3884 * the color registers, so in SIMD16, the previous message did:
3885 * m + 0: r0
3886 * m + 1: g0
3887 * m + 2: b0
3888 * m + 3: a0
3889 *
3890 * Here goes the second message, which packs color data for the
3891 * remaining 8 pixels. Color data for these pixels is stored in the
3892 * "upper" half of the color registers, so we need to do:
3893 * m + 0: r1
3894 * m + 1: g1
3895 * m + 2: b1
3896 * m + 3: a1
3897 */
3898 if (dispatch_width == 16) {
3899 inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
3900 reg_undef, 4, 8, true);
3901 inst->target = 0;
3902 }
3903
3904 prog_data->dual_src_blend = true;
3905 } else {
3906 for (int target = 0; target < key->nr_color_regions; target++) {
3907 /* Skip over outputs that weren't written. */
3908 if (this->outputs[target].file == BAD_FILE)
3909 continue;
3910
3911 this->current_annotation = ralloc_asprintf(this->mem_ctx,
3912 "FB write target %d",
3913 target);
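         /* When the key asks for replicated alpha (e.g. alpha-to-coverage or
          * alpha test with multiple render targets), targets other than RT0
          * also carry RT0's alpha in their message as src0_alpha.
          */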
3914 fs_reg src0_alpha;
3915 if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
3916 src0_alpha = offset(outputs[0], 3);
3917
3918 inst = emit_single_fb_write(this->outputs[target], reg_undef,
3919 src0_alpha,
3920 this->output_components[target],
3921 dispatch_width);
3922 inst->target = target;
3923 }
3924 }
3925
3926 if (inst == NULL) {
3927       /* Even if there are no color buffers enabled, we still need to send
3928 * alpha out the pipeline to our null renderbuffer to support
3929 * alpha-testing, alpha-to-coverage, and so on.
3930 */
3931 inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0,
3932 dispatch_width);
3933 inst->target = 0;
3934 }
3935
3936 inst->eot = true;
3937 this->current_annotation = NULL;
3938 }
3939
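/**
 * Reserve four uniform slots per enabled user clip plane and point them at
 * the clip plane values chosen by brw_select_clip_planes().
 */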
3940 void
3941 fs_visitor::setup_uniform_clipplane_values()
3942 {
3943 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
3944 const struct brw_vue_prog_key *key =
3945 (const struct brw_vue_prog_key *) this->key;
3946
3947 for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
3948 this->userplane[i] = fs_reg(UNIFORM, uniforms);
3949 for (int j = 0; j < 4; ++j) {
3950 stage_prog_data->param[uniforms + j] =
3951 (gl_constant_value *) &clip_planes[i][j];
3952 }
3953 uniforms += 4;
3954 }
3955 }
3956
3957 void fs_visitor::compute_clip_distance()
3958 {
3959 struct brw_vue_prog_data *vue_prog_data =
3960 (struct brw_vue_prog_data *) prog_data;
3961 const struct brw_vue_prog_key *key =
3962 (const struct brw_vue_prog_key *) this->key;
3963
3964 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3965 *
3966 * "If a linked set of shaders forming the vertex stage contains no
3967 * static write to gl_ClipVertex or gl_ClipDistance, but the
3968 * application has requested clipping against user clip planes through
3969 * the API, then the coordinate written to gl_Position is used for
3970 * comparison against the user clip planes."
3971 *
3972 * This function is only called if the shader didn't write to
3973 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3974 * if the user wrote to it; otherwise we use gl_Position.
3975 */
3976
3977 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3978 if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
3979 clip_vertex = VARYING_SLOT_POS;
3980
3981 /* If the clip vertex isn't written, skip this. Typically this means
3982 * the GS will set up clipping. */
3983 if (outputs[clip_vertex].file == BAD_FILE)
3984 return;
3985
3986 setup_uniform_clipplane_values();
3987
3988 current_annotation = "user clip distances";
3989
3990 this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
3991 this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
3992
3993 for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
3994 fs_reg u = userplane[i];
3995 fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
3996 output.reg_offset = i & 3;
3997
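      /* Accumulate dot(clip_vertex, plane):
       *   dist  = pos.x * plane.x;
       *   dist += pos.y * plane.y;   (MAD)
       *   dist += pos.z * plane.z;   (MAD)
       *   dist += pos.w * plane.w;   (MAD)
       */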
3998 emit(MUL(output, outputs[clip_vertex], u));
3999 for (int j = 1; j < 4; j++) {
4000 u.reg = userplane[i].reg + j;
4001 emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
4002 }
4003 }
4004 }
4005
4006 void
4007 fs_visitor::emit_urb_writes()
4008 {
4009 int slot, urb_offset, length;
4010 struct brw_vs_prog_data *vs_prog_data =
4011 (struct brw_vs_prog_data *) prog_data;
4012 const struct brw_vs_prog_key *key =
4013 (const struct brw_vs_prog_key *) this->key;
4014 const GLbitfield64 psiz_mask =
4015 VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
4016 const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
4017 bool flush;
4018 fs_reg sources[8];
4019
4020 /* Lower legacy ff and ClipVertex clipping to clip distances */
4021 if (key->base.userclip_active && !prog->UsesClipDistanceOut)
4022 compute_clip_distance();
4023
4024 /* If we don't have any valid slots to write, just do a minimal urb write
4025 * send to terminate the shader. */
4026 if (vue_map->slots_valid == 0) {
4027
4028 fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
4029 fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
4030 BRW_REGISTER_TYPE_UD))));
4031 inst->force_writemask_all = true;
4032
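      /* Send a single-register URB write (just the URB handles copied from
       * g1 above) with EOT set so the thread ends immediately.
       */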
4033 inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
4034 inst->eot = true;
4035 inst->mlen = 1;
4036 inst->offset = 1;
4037 return;
4038 }
4039
4040 length = 0;
4041 urb_offset = 0;
4042 flush = false;
4043 for (slot = 0; slot < vue_map->num_slots; slot++) {
4044 fs_reg reg, src, zero;
4045
4046 int varying = vue_map->slot_to_varying[slot];
4047 switch (varying) {
4048 case VARYING_SLOT_PSIZ:
4049
4050          /* The point size varying slot is in the vue header and is always in the
4051 * vue map. But often none of the special varyings that live there
4052 * are written and in that case we can skip writing to the vue
4053 * header, provided the corresponding state properly clamps the
4054 * values further down the pipeline. */
4055 if ((vue_map->slots_valid & psiz_mask) == 0) {
4056 assert(length == 0);
4057 urb_offset++;
4058 break;
4059 }
4060
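         /* Build the four dwords of the VUE header for this slot: the first
          * dword is always written as zero here, followed by gl_Layer,
          * gl_ViewportIndex and gl_PointSize, with zero substituted for
          * anything the shader didn't write.
          */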
4061 zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
4062 emit(MOV(zero, fs_reg(0u)));
4063
4064 sources[length++] = zero;
4065 if (vue_map->slots_valid & VARYING_BIT_LAYER)
4066 sources[length++] = this->outputs[VARYING_SLOT_LAYER];
4067 else
4068 sources[length++] = zero;
4069
4070 if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
4071 sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
4072 else
4073 sources[length++] = zero;
4074
4075 if (vue_map->slots_valid & VARYING_BIT_PSIZ)
4076 sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
4077 else
4078 sources[length++] = zero;
4079 break;
4080
4081 case BRW_VARYING_SLOT_NDC:
4082 case VARYING_SLOT_EDGE:
4083 unreachable("unexpected scalar vs output");
4084 break;
4085
4086 case BRW_VARYING_SLOT_PAD:
4087 break;
4088
4089 default:
4090 /* gl_Position is always in the vue map, but isn't always written by
4091 * the shader. Other varyings (clip distances) get added to the vue
4092 * map but don't always get written. In those cases, the
4093          * corresponding this->outputs[] slot will be invalid and we can skip
4094 * the urb write for the varying. If we've already queued up a vue
4095 * slot for writing we flush a mlen 5 urb write, otherwise we just
4096 * advance the urb_offset.
4097 */
4098 if (this->outputs[varying].file == BAD_FILE) {
4099 if (length > 0)
4100 flush = true;
4101 else
4102 urb_offset++;
4103 break;
4104 }
4105
4106 if ((varying == VARYING_SLOT_COL0 ||
4107 varying == VARYING_SLOT_COL1 ||
4108 varying == VARYING_SLOT_BFC0 ||
4109 varying == VARYING_SLOT_BFC1) &&
4110 key->clamp_vertex_color) {
4111 /* We need to clamp these guys, so do a saturating MOV into a
4112 * temp register and use that for the payload.
4113 */
4114 for (int i = 0; i < 4; i++) {
4115 reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
4116 src = offset(this->outputs[varying], i);
4117 fs_inst *inst = emit(MOV(reg, src));
4118 inst->saturate = true;
4119 sources[length++] = reg;
4120 }
4121 } else {
4122 for (int i = 0; i < 4; i++)
4123 sources[length++] = offset(this->outputs[varying], i);
4124 }
4125 break;
4126 }
4127
4128 current_annotation = "URB write";
4129
4130       /* If we've queued up 8 registers of payload (2 VUE slots), this is the
4131        * last slot, or we need to flush (see the BAD_FILE varying case above),
4132        * emit a URB write send now to flush out the data.
4133 */
4134 int last = slot == vue_map->num_slots - 1;
4135 if (length == 8 || last)
4136 flush = true;
4137 if (flush) {
4138 fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
4139 fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
4140 BRW_REGISTER_TYPE_F);
4141
4142 /* We need WE_all on the MOV for the message header (the URB handles)
4143 * so do a MOV to a dummy register and set force_writemask_all on the
4144 * MOV. LOAD_PAYLOAD will preserve that.
4145 */
4146 fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
4147 BRW_REGISTER_TYPE_UD);
4148 fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
4149 BRW_REGISTER_TYPE_UD))));
4150 inst->force_writemask_all = true;
4151 payload_sources[0] = dummy;
4152
4153 memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
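         /* The final argument (1) tells LOAD_PAYLOAD to treat the first
          * source, the WE_all copy of the URB handles set up above, as the
          * message header.
          */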
4154 emit(LOAD_PAYLOAD(payload, payload_sources, length + 1, 1));
4155
4156 inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
4157 inst->eot = last;
4158 inst->mlen = length + 1;
4159 inst->offset = urb_offset;
4160 urb_offset = slot + 1;
4161 length = 0;
4162 flush = false;
4163 }
4164 }
4165 }
4166
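/**
 * Resolve a negate source modifier on an unsigned register by copying it
 * through a temporary; the caller then uses the temporary in place of the
 * negated source.
 */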
4167 void
4168 fs_visitor::resolve_ud_negate(fs_reg *reg)
4169 {
4170 if (reg->type != BRW_REGISTER_TYPE_UD ||
4171 !reg->negate)
4172 return;
4173
4174 fs_reg temp = vgrf(glsl_type::uint_type);
4175 emit(MOV(temp, *reg));
4176 *reg = temp;
4177 }
4178
4179 void
4180 fs_visitor::emit_cs_terminate()
4181 {
4182 assert(brw->gen >= 7);
4183
4184 /* We are getting the thread ID from the compute shader header */
4185 assert(stage == MESA_SHADER_COMPUTE);
4186
4187 /* We can't directly send from g0, since sends with EOT have to use
4188    * g112-127. So, copy it to a virtual register; the register allocator will
4189 * make sure it uses the appropriate register range.
4190 */
4191 struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
4192 fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
4193 fs_inst *inst = emit(MOV(payload, g0));
4194 inst->force_writemask_all = true;
4195
4196 /* Send a message to the thread spawner to terminate the thread. */
4197 inst = emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
4198 inst->eot = true;
4199 }
4200
4201 /**
4202 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
4203 *
4204 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
4205 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
4206 */
4207 void
4208 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
4209 {
4210 assert(devinfo->gen <= 5);
4211
4212 if (rvalue->type != glsl_type::bool_type)
4213 return;
4214
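   /* AND with 1 keeps only the defined LSB; negating that 0/1 value then
    * produces the 0 or ~0 the rest of the backend expects for booleans.
    */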
4215 fs_reg and_result = vgrf(glsl_type::bool_type);
4216 fs_reg neg_result = vgrf(glsl_type::bool_type);
4217 emit(AND(and_result, *reg, fs_reg(1)));
4218 emit(MOV(neg_result, negate(and_result)));
4219 *reg = neg_result;
4220 }
4221
4222 fs_visitor::fs_visitor(struct brw_context *brw,
4223 void *mem_ctx,
4224 const struct brw_wm_prog_key *key,
4225 struct brw_wm_prog_data *prog_data,
4226 struct gl_shader_program *shader_prog,
4227 struct gl_fragment_program *fp,
4228 unsigned dispatch_width)
4229 : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
4230 MESA_SHADER_FRAGMENT),
4231 reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
4232 reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
4233 reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
4234 key(key), prog_data(&prog_data->base),
4235 dispatch_width(dispatch_width), promoted_constants(0)
4236 {
4237 this->mem_ctx = mem_ctx;
4238 init();
4239 }
4240
4241 fs_visitor::fs_visitor(struct brw_context *brw,
4242 void *mem_ctx,
4243 const struct brw_vs_prog_key *key,
4244 struct brw_vs_prog_data *prog_data,
4245 struct gl_shader_program *shader_prog,
4246 struct gl_vertex_program *cp,
4247 unsigned dispatch_width)
4248 : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
4249 MESA_SHADER_VERTEX),
4250 reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
4251 reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
4252 reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
4253 key(key), prog_data(&prog_data->base.base),
4254 dispatch_width(dispatch_width), promoted_constants(0)
4255 {
4256 this->mem_ctx = mem_ctx;
4257 init();
4258 }
4259
4260 fs_visitor::fs_visitor(struct brw_context *brw,
4261 void *mem_ctx,
4262 const struct brw_cs_prog_key *key,
4263 struct brw_cs_prog_data *prog_data,
4264 struct gl_shader_program *shader_prog,
4265 struct gl_compute_program *cp,
4266 unsigned dispatch_width)
4267 : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base,
4268 MESA_SHADER_COMPUTE),
4269 reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
4270 reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
4271 reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
4272 key(key), prog_data(&prog_data->base),
4273 dispatch_width(dispatch_width)
4274 {
4275 this->mem_ctx = mem_ctx;
4276 init();
4277 }
4278
4279 void
4280 fs_visitor::init()
4281 {
4282 switch (stage) {
4283 case MESA_SHADER_FRAGMENT:
4284 key_tex = &((const brw_wm_prog_key *) key)->tex;
4285 break;
4286 case MESA_SHADER_VERTEX:
4287 case MESA_SHADER_GEOMETRY:
4288 key_tex = &((const brw_vue_prog_key *) key)->tex;
4289 break;
4290 case MESA_SHADER_COMPUTE:
4291 key_tex = &((const brw_cs_prog_key*) key)->tex;
4292 break;
4293 default:
4294 unreachable("unhandled shader stage");
4295 }
4296
4297 this->failed = false;
4298 this->simd16_unsupported = false;
4299 this->no16_msg = NULL;
4300 this->variable_ht = hash_table_ctor(0,
4301 hash_table_pointer_hash,
4302 hash_table_pointer_compare);
4303
4304 this->nir_locals = NULL;
4305 this->nir_globals = NULL;
4306
4307 memset(&this->payload, 0, sizeof(this->payload));
4308 memset(this->outputs, 0, sizeof(this->outputs));
4309 memset(this->output_components, 0, sizeof(this->output_components));
4310 this->source_depth_to_render_target = false;
4311 this->runtime_check_aads_emit = false;
4312 this->first_non_payload_grf = 0;
4313 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
4314
4315 this->current_annotation = NULL;
4316 this->base_ir = NULL;
4317
4318 this->virtual_grf_start = NULL;
4319 this->virtual_grf_end = NULL;
4320 this->live_intervals = NULL;
4321 this->regs_live_at_ip = NULL;
4322
4323 this->uniforms = 0;
4324 this->last_scratch = 0;
4325 this->pull_constant_loc = NULL;
4326 this->push_constant_loc = NULL;
4327
4328 this->spilled_any_registers = false;
4329 this->do_dual_src = false;
4330
4331 if (dispatch_width == 8)
4332 this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
4333 }
4334
4335 fs_visitor::~fs_visitor()
4336 {
4337 hash_table_dtor(this->variable_ht);
4338 }