nir: Try to make sense of the nir_shader_compiler_options code.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs_nir.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "glsl/ir.h"
#include "glsl/ir_optimization.h"
#include "glsl/nir/glsl_to_nir.h"
#include "brw_fs.h"

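/* Run NIR's optimization passes to a fixed point: iterate as long as any of
 * the opt_* passes reports progress.  The lower_* calls at the top of the
 * loop don't feed into `progress`; they just re-normalize the shader on each
 * trip.  Each pass is followed by nir_validate_shader(), which re-checks the
 * IR invariants (and should compile away to a no-op in release builds).
 */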
static void
nir_optimize(nir_shader *nir)
{
   bool progress;
   do {
      progress = false;
      nir_lower_vars_to_ssa(nir);
      nir_validate_shader(nir);
      nir_lower_alu_to_scalar(nir);
      nir_validate_shader(nir);
      progress |= nir_copy_prop(nir);
      nir_validate_shader(nir);
      nir_lower_phis_to_scalar(nir);
      nir_validate_shader(nir);
      progress |= nir_copy_prop(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_dce(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_cse(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_peephole_select(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_algebraic(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_constant_folding(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_remove_phis(nir);
      nir_validate_shader(nir);
   } while (progress);
}

static bool
count_nir_instrs_in_block(nir_block *block, void *state)
{
   int *count = (int *) state;
   nir_foreach_instr(block, instr) {
      *count = *count + 1;
   }
   return true;
}

static int
count_nir_instrs(nir_shader *nir)
{
   int count = 0;
   nir_foreach_overload(nir, overload) {
      if (!overload->impl)
         continue;
      nir_foreach_block(overload->impl, count_nir_instrs_in_block, &count);
   }
   return count;
}

void
fs_visitor::emit_nir_code()
{
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;

   /* first, lower the GLSL IR shader to NIR */
   lower_output_reads(shader->base.ir);
   nir_shader *nir = glsl_to_nir(shader->base.ir, true, options);
   nir_validate_shader(nir);

   nir_lower_global_vars_to_local(nir);
   nir_validate_shader(nir);

   nir_split_var_copies(nir);
   nir_validate_shader(nir);

   nir_optimize(nir);

   /* Lower a bunch of stuff: variable copies, I/O, local variables,
    * samplers, system values and atomics.
    */
   nir_lower_var_copies(nir);
   nir_validate_shader(nir);

   nir_lower_io(nir);
   nir_validate_shader(nir);

   nir_lower_locals_to_regs(nir);
   nir_validate_shader(nir);

   nir_remove_dead_variables(nir);
   nir_validate_shader(nir);

   nir_lower_samplers(nir, shader_prog, shader->base.Program);
   nir_validate_shader(nir);

   nir_lower_system_values(nir);
   nir_validate_shader(nir);

   nir_lower_atomics(nir);
   nir_validate_shader(nir);

   nir_optimize(nir);

   nir_lower_to_source_mods(nir);
   nir_validate_shader(nir);
   nir_copy_prop(nir);
   nir_validate_shader(nir);

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "NIR (SSA form) for %s shader:\n", stage_name);
      nir_print_shader(nir, stderr);
   }

   if (dispatch_width == 8) {
      static GLuint msg_id = 0;
      _mesa_gl_debug(&brw->ctx, &msg_id,
                     MESA_DEBUG_SOURCE_SHADER_COMPILER,
                     MESA_DEBUG_TYPE_OTHER,
                     MESA_DEBUG_SEVERITY_NOTIFICATION,
                     "%s NIR shader: %d inst\n",
                     stage_abbrev,
                     count_nir_instrs(nir));
   }

   nir_convert_from_ssa(nir);
   nir_validate_shader(nir);

   /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */

   if (nir->num_inputs > 0) {
      nir_inputs = vgrf(nir->num_inputs);
      nir_setup_inputs(nir);
   }

   if (nir->num_outputs > 0) {
      nir_outputs = vgrf(nir->num_outputs);
      nir_setup_outputs(nir);
   }

   if (nir->num_uniforms > 0) {
      nir_uniforms = fs_reg(UNIFORM, 0);
      nir_setup_uniforms(nir);
   }

   nir_emit_system_values(nir);

   nir_globals = ralloc_array(mem_ctx, fs_reg, nir->reg_alloc);
   foreach_list_typed(nir_register, reg, node, &nir->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      nir_globals[reg->index] = vgrf(size);
   }

   /* get the main function and emit it */
   nir_foreach_overload(nir, overload) {
      assert(strcmp(overload->function->name, "main") == 0);
      assert(overload->impl);
      nir_emit_impl(overload->impl);
   }

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "NIR (final form) for %s shader:\n", stage_name);
      nir_print_shader(nir, stderr);
   }

   ralloc_free(nir);
}

void
fs_visitor::nir_setup_inputs(nir_shader *shader)
{
   struct hash_entry *entry;
   hash_table_foreach(shader->inputs, entry) {
      nir_variable *var = (nir_variable *) entry->data;
      fs_reg varying = offset(nir_inputs, var->data.driver_location);

      fs_reg reg;
      if (var->data.location == VARYING_SLOT_POS) {
         reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
                                             var->data.origin_upper_left);
         emit_percomp(MOV(varying, reg), 0xF);
      } else {
         emit_general_interpolation(varying, var->name, var->type,
                                    (glsl_interp_qualifier) var->data.interpolation,
                                    var->data.location, var->data.centroid,
                                    var->data.sample);
      }
   }
}

void
fs_visitor::nir_setup_outputs(nir_shader *shader)
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   struct hash_entry *entry;
   hash_table_foreach(shader->outputs, entry) {
      nir_variable *var = (nir_variable *) entry->data;
      fs_reg reg = offset(nir_outputs, var->data.driver_location);

      if (var->data.index > 0) {
         assert(var->data.location == FRAG_RESULT_DATA0);
         assert(var->data.index == 1);
         this->dual_src_output = reg;
         this->do_dual_src = true;
      } else if (var->data.location == FRAG_RESULT_COLOR) {
         /* Writing gl_FragColor outputs to all color regions. */
         for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
            this->outputs[i] = reg;
            this->output_components[i] = 4;
         }
      } else if (var->data.location == FRAG_RESULT_DEPTH) {
         this->frag_depth = reg;
      } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
         this->sample_mask = reg;
      } else {
         /* gl_FragData or a user-defined FS output */
         assert(var->data.location >= FRAG_RESULT_DATA0 &&
                var->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);

         int vector_elements =
            var->type->is_array() ? var->type->fields.array->vector_elements
                                  : var->type->vector_elements;

         /* General color output. */
         for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
            int output = var->data.location - FRAG_RESULT_DATA0 + i;
            this->outputs[output] = offset(reg, vector_elements * i);
            this->output_components[output] = vector_elements;
         }
      }
   }
}

void
fs_visitor::nir_setup_uniforms(nir_shader *shader)
{
   uniforms = shader->num_uniforms;
   param_size[0] = shader->num_uniforms;

   if (dispatch_width != 8)
      return;

   struct hash_entry *entry;
   hash_table_foreach(shader->uniforms, entry) {
      nir_variable *var = (nir_variable *) entry->data;

      /* UBOs and atomics don't take up space in the uniform file */

      if (var->interface_type != NULL || var->type->contains_atomic())
         continue;

      if (strncmp(var->name, "gl_", 3) == 0)
         nir_setup_builtin_uniform(var);
      else
         nir_setup_uniform(var);
   }
}

void
fs_visitor::nir_setup_uniform(nir_variable *var)
{
   int namelen = strlen(var->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the
    * same order we'd walk the type, so walk the list of storage and find
    * anything with our name, or the prefix of a component that starts with
    * our name.
    */
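   /* For example (the names here are illustrative): a GLSL uniform declared
    * as
    *
    *    uniform struct { vec4 color; float scale[2]; } s;
    *
    * shows up as storage entries named "s.color" and "s.scale[0]", so a
    * nir_variable named "s" matches both via the '.' / '[' prefix test below.
    */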
   unsigned index = var->data.driver_location;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(var->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[index++] = &storage->storage[i];
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(var->data.driver_location + var->type->component_slots() == index);
}

void
fs_visitor::nir_setup_builtin_uniform(nir_variable *var)
{
   const nir_state_slot *const slots = var->state_slots;
   assert(var->state_slots != NULL);

   unsigned uniform_index = var->data.driver_location;
   for (unsigned int i = 0; i < var->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
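      /* For instance, a scalar builtin whose state slot carries swizzle XXXX
       * contributes a single param (the loop below stops as soon as a
       * swizzle repeats), while a vec4 slot with swizzle XYZW contributes
       * four.
       */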
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         stage_prog_data->param[uniform_index++] =
            &prog->Parameters->ParameterValues[index][swiz];
      }
   }
}

static bool
emit_system_values_block(nir_block *block, void *void_visitor)
{
   fs_visitor *v = (fs_visitor *)void_visitor;
   fs_reg *reg;

   nir_foreach_instr(block, instr) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_sample_pos:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplepos_setup();
         break;

      case nir_intrinsic_load_sample_id:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_sampleid_setup();
         break;

      case nir_intrinsic_load_sample_mask_in:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         assert(v->brw->gen >= 7);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
         if (reg->file == BAD_FILE)
            *reg = fs_reg(retype(brw_vec8_grf(v->payload.sample_mask_in_reg, 0),
                                 BRW_REGISTER_TYPE_D));
         break;

      default:
         break;
      }
   }

   return true;
}

void
fs_visitor::nir_emit_system_values(nir_shader *shader)
{
   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
   nir_foreach_overload(shader, overload) {
      assert(strcmp(overload->function->name, "main") == 0);
      assert(overload->impl);
      nir_foreach_block(overload->impl, emit_system_values_block, this);
   }
}

void
fs_visitor::nir_emit_impl(nir_function_impl *impl)
{
   nir_locals = reralloc(mem_ctx, nir_locals, fs_reg, impl->reg_alloc);
   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      nir_locals[reg->index] = vgrf(size);
   }

   nir_emit_cf_list(&impl->body);
}

void
fs_visitor::nir_emit_cf_list(exec_list *list)
{
   exec_list_validate(list);
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_if:
         nir_emit_if(nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         nir_emit_loop(nir_cf_node_as_loop(node));
         break;

      case nir_cf_node_block:
         nir_emit_block(nir_cf_node_as_block(node));
         break;

      default:
         unreachable("Invalid CFG node block");
      }
   }
}

void
fs_visitor::nir_emit_if(nir_if *if_stmt)
{
   /* first, put the condition into f0 */
   fs_inst *inst = emit(MOV(reg_null_d,
                            retype(get_nir_src(if_stmt->condition),
                                   BRW_REGISTER_TYPE_UD)));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   emit(IF(BRW_PREDICATE_NORMAL));

   nir_emit_cf_list(&if_stmt->then_list);

   /* note: if the else is empty, dead CF elimination will remove it */
   emit(BRW_OPCODE_ELSE);

   nir_emit_cf_list(&if_stmt->else_list);

   emit(BRW_OPCODE_ENDIF);

   if (!try_replace_with_sel() && brw->gen < 6) {
      no16("Can't support (non-uniform) control flow on SIMD16\n");
   }
}

void
fs_visitor::nir_emit_loop(nir_loop *loop)
{
   if (brw->gen < 6) {
      no16("Can't support (non-uniform) control flow on SIMD16\n");
   }

   emit(BRW_OPCODE_DO);

   nir_emit_cf_list(&loop->body);

   emit(BRW_OPCODE_WHILE);
}

void
fs_visitor::nir_emit_block(nir_block *block)
{
   nir_foreach_instr(block, instr) {
      nir_emit_instr(instr);
   }
}

void
fs_visitor::nir_emit_instr(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      nir_emit_alu(nir_instr_as_alu(instr));
      break;

   case nir_instr_type_intrinsic:
      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_tex:
      nir_emit_texture(nir_instr_as_tex(instr));
      break;

   case nir_instr_type_load_const:
      /* We can hit these, but we do nothing now and use them as
       * immediates later.
       */
      break;

   case nir_instr_type_jump:
      nir_emit_jump(nir_instr_as_jump(instr));
      break;

   default:
      unreachable("unknown instruction type");
   }
}

static brw_reg_type
brw_type_for_nir_type(nir_alu_type type)
{
   switch (type) {
   case nir_type_bool:
   case nir_type_unsigned:
      return BRW_REGISTER_TYPE_UD;
   case nir_type_int:
      return BRW_REGISTER_TYPE_D;
   case nir_type_float:
      return BRW_REGISTER_TYPE_F;
   default:
      unreachable("unknown type");
   }

   return BRW_REGISTER_TYPE_F;
}

bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                         const fs_reg &result)
{
   if (instr->src[0].src.is_ssa ||
       !instr->src[0].src.reg.reg ||
       !instr->src[0].src.reg.reg->parent_instr)
      return false;

   if (instr->src[0].src.reg.reg->parent_instr->type !=
       nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *src0 =
      nir_instr_as_intrinsic(instr->src[0].src.reg.reg->parent_instr);

   if (src0->intrinsic != nir_intrinsic_load_front_face)
      return false;

   nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
   if (!value1 || fabsf(value1->f[0]) != 1.0f)
      return false;

   nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
   if (!value2 || fabsf(value2->f[0]) != 1.0f)
      return false;

   fs_reg tmp = vgrf(glsl_type::int_type);

   if (brw->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1->f[0] == -1.0f) {
         g0.negate = true;
      }

      tmp.type = BRW_REGISTER_TYPE_W;
      tmp.subreg_offset = 2;
      tmp.stride = 2;

      fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
      or_inst->src[1].type = BRW_REGISTER_TYPE_UW;

      tmp.type = BRW_REGISTER_TYPE_D;
      tmp.subreg_offset = 0;
      tmp.stride = 1;
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1->f[0] == -1.0f) {
         g1_6.negate = true;
      }

      emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
   }
   emit(AND(retype(result, BRW_REGISTER_TYPE_D), tmp, fs_reg(0xbf800000)));

   return true;
}

void
fs_visitor::nir_emit_alu(nir_alu_instr *instr)
{
   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
   fs_inst *inst;

   fs_reg result = get_nir_dest(instr->dest.dest);
   result.type = brw_type_for_nir_type(nir_op_infos[instr->op].output_type);

   fs_reg op[4];
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      op[i] = get_nir_src(instr->src[i].src);
      op[i].type = brw_type_for_nir_type(nir_op_infos[instr->op].input_types[i]);
      op[i].abs = instr->src[i].abs;
      op[i].negate = instr->src[i].negate;
   }

   /* We get a bunch of mov's out of the from_ssa pass and they may still
    * be vectorized.  We'll handle them as a special-case.  We'll also
    * handle vecN here because it's basically the same thing.
    */
   switch (instr->op) {
   case nir_op_imov:
   case nir_op_fmov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4: {
      fs_reg temp = result;
      bool need_extra_copy = false;
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
         if (!instr->src[i].src.is_ssa &&
             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
            need_extra_copy = true;
            temp = retype(vgrf(4), result.type);
            break;
         }
      }

      for (unsigned i = 0; i < 4; i++) {
         if (!(instr->dest.write_mask & (1 << i)))
            continue;

         if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
            inst = emit(MOV(offset(temp, i),
                            offset(op[0], instr->src[0].swizzle[i])));
         } else {
            inst = emit(MOV(offset(temp, i),
                            offset(op[i], instr->src[i].swizzle[0])));
         }
         inst->saturate = instr->dest.saturate;
      }

      /* In this case the source and destination registers were the same,
       * so we need to insert an extra set of moves in order to deal with
       * any swizzling.
       */
      if (need_extra_copy) {
         for (unsigned i = 0; i < 4; i++) {
            if (!(instr->dest.write_mask & (1 << i)))
               continue;

            emit(MOV(offset(result, i), offset(temp, i)));
         }
      }
      return;
   }
   default:
      break;
   }

   /* At this point, we have dealt with any instruction that operates on
    * more than a single channel.  Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
    */
   unsigned channel = 0;
   if (nir_op_infos[instr->op].output_size == 0) {
      /* Since NIR is doing the scalarizing for us, we should only ever see
       * vectorized operations with a single channel.
       */
      assert(_mesa_bitcount(instr->dest.write_mask) == 1);
      channel = ffs(instr->dest.write_mask) - 1;

      result = offset(result, channel);
   }

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
      op[i] = offset(op[i], instr->src[i].swizzle[channel]);
   }

   switch (instr->op) {
   case nir_op_i2f:
   case nir_op_u2f:
      inst = emit(MOV(result, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_f2i:
   case nir_op_f2u:
      emit(MOV(result, op[0]));
      break;

   case nir_op_fsign: {
      /* AND(val, 0x80000000) gives the sign bit.
       *
       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
       * zero.
       */
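      /* A sketch of the sequence this emits (SIMD8 shown):
       *
       *    cmp.nz.f0(8)  null      val<8,8,1>F   0F
       *    and(8)        dst<1>UD  val<8,8,1>UD  0x80000000UD
       *    (+f0) or(8)   dst<1>UD  dst<8,8,1>UD  0x3f800000UD
       *
       * so dst ends up ±1.0 for nonzero inputs, and ±0.0 (just the sign
       * bit) for zero inputs.
       */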
      emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));

      fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
      op[0].type = BRW_REGISTER_TYPE_UD;
      result.type = BRW_REGISTER_TYPE_UD;
      emit(AND(result_int, op[0], fs_reg(0x80000000u)));

      inst = emit(OR(result_int, result_int, fs_reg(0x3f800000u)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      if (instr->dest.saturate) {
         inst = emit(MOV(result, result));
         inst->saturate = true;
      }
      break;
   }

   case nir_op_isign:
      /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
       *              -> non-negative val generates 0x00000000.
       * Predicated OR sets 1 if val is positive.
       */
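      /* E.g. val = 5:  ASR gives 0, CMP.G sets the flag, so the OR writes 1.
       * val = -5: ASR gives -1 and the flag is clear, so -1 survives.
       * val = 0:  ASR gives 0 and the OR is skipped, leaving 0.
       */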
      emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
      emit(ASR(result, op[0], fs_reg(31)));
      inst = emit(OR(result, result, fs_reg(1)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case nir_op_frcp:
      inst = emit_math(SHADER_OPCODE_RCP, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fexp2:
      inst = emit_math(SHADER_OPCODE_EXP2, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_flog2:
      inst = emit_math(SHADER_OPCODE_LOG2, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fexp:
   case nir_op_flog:
      unreachable("not reached: should be handled by ir_explog_to_explog2");

   case nir_op_fsin:
   case nir_op_fsin_reduced:
      inst = emit_math(SHADER_OPCODE_SIN, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fcos:
   case nir_op_fcos_reduced:
      inst = emit_math(SHADER_OPCODE_COS, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fddx:
      if (fs_key->high_quality_derivatives) {
         inst = emit(FS_OPCODE_DDX_FINE, result, op[0]);
      } else {
         inst = emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      }
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddx_fine:
      inst = emit(FS_OPCODE_DDX_FINE, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddx_coarse:
      inst = emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddy:
      if (fs_key->high_quality_derivatives) {
         inst = emit(FS_OPCODE_DDY_FINE, result, op[0],
                     fs_reg(fs_key->render_to_fbo));
      } else {
         inst = emit(FS_OPCODE_DDY_COARSE, result, op[0],
                     fs_reg(fs_key->render_to_fbo));
      }
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddy_fine:
      inst = emit(FS_OPCODE_DDY_FINE, result, op[0],
                  fs_reg(fs_key->render_to_fbo));
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddy_coarse:
      inst = emit(FS_OPCODE_DDY_COARSE, result, op[0],
                  fs_reg(fs_key->render_to_fbo));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fadd:
   case nir_op_iadd:
      inst = emit(ADD(result, op[0], op[1]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fmul:
      inst = emit(MUL(result, op[0], op[1]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_imul: {
      if (brw->gen >= 8) {
         emit(MUL(result, op[0], op[1]));
         break;
      } else {
         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);

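         /* If either operand is known to fit in 16 bits, a single MUL is
          * enough.  Judging by the operand swaps below, the narrow operand
          * apparently needs to be src0 on Gen < 7 and src1 on Gen7.
          */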
         if (value0 && value0->u[0] < (1 << 16)) {
            if (brw->gen < 7) {
               emit(MUL(result, op[0], op[1]));
            } else {
               emit(MUL(result, op[1], op[0]));
            }
            break;
         } else if (value1 && value1->u[0] < (1 << 16)) {
            if (brw->gen < 7) {
               emit(MUL(result, op[1], op[0]));
            } else {
               emit(MUL(result, op[0], op[1]));
            }
            break;
         }
      }

      if (brw->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);

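      /* Full 32x32 integer multiply via the accumulator: MUL writes the
       * partial product to the accumulator, MACH (whose null destination
       * would receive the high 32 bits) completes it, and the low 32 bits
       * are then copied out of the accumulator into the result.
       */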
      emit(MUL(acc, op[0], op[1]));
      emit(MACH(reg_null_d, op[0], op[1]));
      emit(MOV(result, fs_reg(acc)));
      break;
   }

   case nir_op_imul_high:
   case nir_op_umul_high: {
      if (brw->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);

      emit(MUL(acc, op[0], op[1]));
      emit(MACH(result, op[0], op[1]));
      break;
   }

   case nir_op_idiv:
   case nir_op_udiv:
      emit_math(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
      break;

   case nir_op_uadd_carry: {
      if (brw->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                  BRW_REGISTER_TYPE_UD);

      emit(ADDC(reg_null_ud, op[0], op[1]));
      emit(MOV(result, fs_reg(acc)));
      break;
   }

   case nir_op_usub_borrow: {
      if (brw->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                  BRW_REGISTER_TYPE_UD);

      emit(SUBB(reg_null_ud, op[0], op[1]));
      emit(MOV(result, fs_reg(acc)));
      break;
   }

   case nir_op_umod:
      emit_math(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
      break;

   case nir_op_flt:
   case nir_op_ilt:
   case nir_op_ult:
      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_L));
      break;

   case nir_op_fge:
   case nir_op_ige:
   case nir_op_uge:
      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_GE));
      break;

   case nir_op_feq:
   case nir_op_ieq:
      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_Z));
      break;

   case nir_op_fne:
   case nir_op_ine:
      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ));
      break;

   case nir_op_inot:
      if (brw->gen >= 8) {
         resolve_source_modifiers(&op[0]);
      }
      emit(NOT(result, op[0]));
      break;
   case nir_op_ixor:
      if (brw->gen >= 8) {
         resolve_source_modifiers(&op[0]);
         resolve_source_modifiers(&op[1]);
      }
      emit(XOR(result, op[0], op[1]));
      break;
   case nir_op_ior:
      if (brw->gen >= 8) {
         resolve_source_modifiers(&op[0]);
         resolve_source_modifiers(&op[1]);
      }
      emit(OR(result, op[0], op[1]));
      break;
   case nir_op_iand:
      if (brw->gen >= 8) {
         resolve_source_modifiers(&op[0]);
         resolve_source_modifiers(&op[1]);
      }
      emit(AND(result, op[0], op[1]));
      break;

   case nir_op_fdot2:
   case nir_op_fdot3:
   case nir_op_fdot4:
   case nir_op_bany2:
   case nir_op_bany3:
   case nir_op_bany4:
   case nir_op_ball2:
   case nir_op_ball3:
   case nir_op_ball4:
   case nir_op_ball_fequal2:
   case nir_op_ball_iequal2:
   case nir_op_ball_fequal3:
   case nir_op_ball_iequal3:
   case nir_op_ball_fequal4:
   case nir_op_ball_iequal4:
   case nir_op_bany_fnequal2:
   case nir_op_bany_inequal2:
   case nir_op_bany_fnequal3:
   case nir_op_bany_inequal3:
   case nir_op_bany_fnequal4:
   case nir_op_bany_inequal4:
      unreachable("Lowered by nir_lower_alu_reductions");

   case nir_op_fnoise1_1:
   case nir_op_fnoise1_2:
   case nir_op_fnoise1_3:
   case nir_op_fnoise1_4:
   case nir_op_fnoise2_1:
   case nir_op_fnoise2_2:
   case nir_op_fnoise2_3:
   case nir_op_fnoise2_4:
   case nir_op_fnoise3_1:
   case nir_op_fnoise3_2:
   case nir_op_fnoise3_3:
   case nir_op_fnoise3_4:
   case nir_op_fnoise4_1:
   case nir_op_fnoise4_2:
   case nir_op_fnoise4_3:
   case nir_op_fnoise4_4:
      unreachable("not reached: should be handled by lower_noise");

   case nir_op_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");

   case nir_op_fsqrt:
      inst = emit_math(SHADER_OPCODE_SQRT, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_frsq:
      inst = emit_math(SHADER_OPCODE_RSQ, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_b2i:
      emit(AND(result, op[0], fs_reg(1)));
      break;
   case nir_op_b2f:
      emit(AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], fs_reg(0x3f800000u)));
      break;

   case nir_op_f2b:
      emit(CMP(result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
      break;
   case nir_op_i2b:
      emit(CMP(result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
      break;

   case nir_op_ftrunc:
      inst = emit(RNDZ(result, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fceil: {
      op[0].negate = !op[0].negate;
      fs_reg temp = vgrf(glsl_type::float_type);
      emit(RNDD(temp, op[0]));
      temp.negate = true;
      inst = emit(MOV(result, temp));
      inst->saturate = instr->dest.saturate;
      break;
   }
   case nir_op_ffloor:
      inst = emit(RNDD(result, op[0]));
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_ffract:
      inst = emit(FRC(result, op[0]));
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fround_even:
      inst = emit(RNDE(result, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fmin:
   case nir_op_imin:
   case nir_op_umin:
      if (brw->gen >= 6) {
         inst = emit(BRW_OPCODE_SEL, result, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_L;
      } else {
         emit(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_L));
         inst = emit(SEL(result, op[0], op[1]));
      }
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fmax:
   case nir_op_imax:
   case nir_op_umax:
      if (brw->gen >= 6) {
         inst = emit(BRW_OPCODE_SEL, result, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_GE;
      } else {
         emit(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_GE));
         inst = emit(SEL(result, op[0], op[1]));
      }
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_pack_snorm_2x16:
   case nir_op_pack_snorm_4x8:
   case nir_op_pack_unorm_2x16:
   case nir_op_pack_unorm_4x8:
   case nir_op_unpack_snorm_2x16:
   case nir_op_unpack_snorm_4x8:
   case nir_op_unpack_unorm_2x16:
   case nir_op_unpack_unorm_4x8:
   case nir_op_unpack_half_2x16:
   case nir_op_pack_half_2x16:
      unreachable("not reached: should be handled by lower_packing_builtins");

   case nir_op_unpack_half_2x16_split_x:
      inst = emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_unpack_half_2x16_split_y:
      inst = emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fpow:
      inst = emit_math(SHADER_OPCODE_POW, result, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_bitfield_reverse:
      emit(BFREV(result, op[0]));
      break;

   case nir_op_bit_count:
      emit(CBIT(result, op[0]));
      break;

   case nir_op_ufind_msb:
   case nir_op_ifind_msb: {
      emit(FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]));

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side.  If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */
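      /* E.g. findMSB(1): FBH returns 31 (bit 0, seen from the MSB side), and
       * 31 - 31 = 0, which is the LSB-relative answer GLSL expects.
       */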

      emit(CMP(reg_null_d, result, fs_reg(-1), BRW_CONDITIONAL_NZ));
      fs_reg neg_result(result);
      neg_result.negate = true;
      inst = emit(ADD(result, neg_result, fs_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_find_lsb:
      emit(FBL(result, op[0]));
      break;

   case nir_op_ubitfield_extract:
   case nir_op_ibitfield_extract:
      emit(BFE(result, op[2], op[1], op[0]));
      break;
   case nir_op_bfm:
      emit(BFI1(result, op[0], op[1]));
      break;
   case nir_op_bfi:
      emit(BFI2(result, op[0], op[1], op[2]));
      break;

   case nir_op_bitfield_insert:
      unreachable("not reached: should be handled by "
                  "lower_instructions::bitfield_insert_to_bfm_bfi");

   case nir_op_ishl:
      emit(SHL(result, op[0], op[1]));
      break;
   case nir_op_ishr:
      emit(ASR(result, op[0], op[1]));
      break;
   case nir_op_ushr:
      emit(SHR(result, op[0], op[1]));
      break;

   case nir_op_pack_half_2x16_split:
      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
      break;

   case nir_op_ffma:
      inst = emit(MAD(result, op[2], op[1], op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_flrp:
      /* TODO emulate for gen < 6 */
      inst = emit(LRP(result, op[2], op[1], op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_bcsel:
      if (optimize_frontfacing_ternary(instr, result))
         return;

      emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
      inst = emit(SEL(result, op[1], op[2]));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   default:
      unreachable("unhandled instruction");
   }
}

fs_reg
fs_visitor::get_nir_src(nir_src src)
{
   if (src.is_ssa) {
      assert(src.ssa->parent_instr->type == nir_instr_type_load_const);
      nir_load_const_instr *load = nir_instr_as_load_const(src.ssa->parent_instr);
      fs_reg reg = vgrf(src.ssa->num_components);
      reg.type = BRW_REGISTER_TYPE_D;

      for (unsigned i = 0; i < src.ssa->num_components; ++i)
         emit(MOV(offset(reg, i), fs_reg(load->value.i[i])));

      return reg;
   } else {
      fs_reg reg;
      if (src.reg.reg->is_global)
         reg = nir_globals[src.reg.reg->index];
      else
         reg = nir_locals[src.reg.reg->index];

      /* to avoid floating-point denorm flushing problems, set the type by
       * default to D - instructions that need floating point semantics will set
       * this to F if they need to
       */
      reg = retype(offset(reg, src.reg.base_offset), BRW_REGISTER_TYPE_D);
      if (src.reg.indirect) {
         reg.reladdr = new(mem_ctx) fs_reg();
         *reg.reladdr = retype(get_nir_src(*src.reg.indirect),
                               BRW_REGISTER_TYPE_D);
      }

      return reg;
   }
}

fs_reg
fs_visitor::get_nir_dest(nir_dest dest)
{
   fs_reg reg;
   if (dest.reg.reg->is_global)
      reg = nir_globals[dest.reg.reg->index];
   else
      reg = nir_locals[dest.reg.reg->index];

   reg = offset(reg, dest.reg.base_offset);
   if (dest.reg.indirect) {
      reg.reladdr = new(mem_ctx) fs_reg();
      *reg.reladdr = retype(get_nir_src(*dest.reg.indirect),
                            BRW_REGISTER_TYPE_D);
   }

   return reg;
}

void
fs_visitor::emit_percomp(fs_inst *inst, unsigned wr_mask)
{
   for (unsigned i = 0; i < 4; i++) {
      if (!((wr_mask >> i) & 1))
         continue;

      fs_inst *new_inst = new(mem_ctx) fs_inst(*inst);
      new_inst->dst = offset(new_inst->dst, i);
      for (unsigned j = 0; j < new_inst->sources; j++)
         if (inst->src[j].file == GRF)
            new_inst->src[j] = offset(new_inst->src[j], i);

      emit(new_inst);
   }
}

void
fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   bool has_indirect = false;

   switch (instr->intrinsic) {
   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if: {
      /* We track our discarded pixels in f0.1.  By predicating on it, we can
       * update just the flag bits that aren't yet discarded.  If there's no
       * condition, we emit a CMP of g0 != g0, so all currently executing
       * channels will get turned off.
       */
      fs_inst *cmp;
      if (instr->intrinsic == nir_intrinsic_discard_if) {
         cmp = emit(CMP(reg_null_f, get_nir_src(instr->src[0]),
                        fs_reg(0), BRW_CONDITIONAL_Z));
      } else {
         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW));
         cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
      }
      cmp->predicate = BRW_PREDICATE_NORMAL;
      cmp->flag_subreg = 1;

      if (brw->gen >= 6) {
         /* For performance, after a discard, jump to the end of the shader.
          * Only jump if all relevant channels have been discarded.
          */
         fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
         discard_jump->flag_subreg = 1;

         discard_jump->predicate = (dispatch_width == 8)
                                   ? BRW_PREDICATE_ALIGN1_ANY8H
                                   : BRW_PREDICATE_ALIGN1_ANY16H;
         discard_jump->predicate_inverse = true;
      }

      break;
   }

   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_dec:
   case nir_intrinsic_atomic_counter_read: {
      unsigned surf_index = prog_data->binding_table.abo_start +
                            (unsigned) instr->const_index[0];
      fs_reg offset = fs_reg(get_nir_src(instr->src[0]));

      switch (instr->intrinsic) {
      case nir_intrinsic_atomic_counter_inc:
         emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset,
                             fs_reg(), fs_reg());
         break;
      case nir_intrinsic_atomic_counter_dec:
         emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset,
                             fs_reg(), fs_reg());
         break;
      case nir_intrinsic_atomic_counter_read:
         emit_untyped_surface_read(surf_index, dest, offset);
         break;
      default:
         unreachable("Unreachable");
      }
      break;
   }

   case nir_intrinsic_load_front_face:
      emit(MOV(retype(dest, BRW_REGISTER_TYPE_D),
               *emit_frontfacing_interpolation()));
      break;

   case nir_intrinsic_load_sample_mask_in: {
      fs_reg sample_mask_in = nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
      assert(sample_mask_in.file != BAD_FILE);
      dest.type = sample_mask_in.type;
      emit(MOV(dest, sample_mask_in));
      break;
   }

   case nir_intrinsic_load_sample_pos: {
      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
      assert(sample_pos.file != BAD_FILE);
      dest.type = sample_pos.type;
      emit(MOV(dest, sample_pos));
      emit(MOV(offset(dest, 1), offset(sample_pos, 1)));
      break;
   }

   case nir_intrinsic_load_sample_id: {
      fs_reg sample_id = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
      assert(sample_id.file != BAD_FILE);
      dest.type = sample_id.type;
      emit(MOV(dest, sample_id));
      break;
   }

   case nir_intrinsic_load_uniform_indirect:
      has_indirect = true;
      /* fallthrough */
   case nir_intrinsic_load_uniform: {
      unsigned index = 0;
      for (int i = 0; i < instr->const_index[1]; i++) {
         for (unsigned j = 0; j < instr->num_components; j++) {
            fs_reg src = offset(retype(nir_uniforms, dest.type),
                                instr->const_index[0] + index);
            if (has_indirect)
               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
            index++;

            emit(MOV(dest, src));
            dest = offset(dest, 1);
         }
      }
      break;
   }

   case nir_intrinsic_load_ubo_indirect:
      has_indirect = true;
      /* fallthrough */
   case nir_intrinsic_load_ubo: {
      nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
      fs_reg surf_index;

      if (const_index) {
         surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
                             const_index->u[0]);
      } else {
         /* The block index is not a constant.  Evaluate the index expression
          * per-channel and add the base UBO index; the generator will select
          * a value from any live channel.
          */
         surf_index = vgrf(glsl_type::uint_type);
         emit(ADD(surf_index, get_nir_src(instr->src[0]),
                  fs_reg(stage_prog_data->binding_table.ubo_start)))
            ->force_writemask_all = true;

         /* Assume this may touch any UBO.  It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
          */
         brw_mark_surface_used(prog_data,
                               stage_prog_data->binding_table.ubo_start +
                               shader_prog->NumUniformBlocks - 1);
      }

      if (has_indirect) {
         /* Turn the byte offset into a dword offset. */
         fs_reg base_offset = vgrf(glsl_type::int_type);
         emit(SHR(base_offset, retype(get_nir_src(instr->src[1]),
                                      BRW_REGISTER_TYPE_D),
                  fs_reg(2)));

         unsigned vec4_offset = instr->const_index[0] / 4;
         for (int i = 0; i < instr->num_components; i++)
            emit(VARYING_PULL_CONSTANT_LOAD(offset(dest, i), surf_index,
                                            base_offset, vec4_offset + i));
      } else {
         fs_reg packed_consts = vgrf(glsl_type::float_type);
         packed_consts.type = dest.type;

         fs_reg const_offset_reg((unsigned) instr->const_index[0] & ~15);
         emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
              surf_index, const_offset_reg);

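         /* Worked example of the offset math: a constant byte offset of 20
          * pulls the 16-byte-aligned block at byte 16 above, and the smear
          * below starts at component 20 % 16 / 4 = 1, i.e. the .y channel of
          * the fetched vec4.
          */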
         for (unsigned i = 0; i < instr->num_components; i++) {
            packed_consts.set_smear(instr->const_index[0] % 16 / 4 + i);

            /* The std140 packing rules don't allow vectors to cross 16-byte
             * boundaries, and a reg is 32 bytes.
             */
            assert(packed_consts.subreg_offset < 32);

            emit(MOV(dest, packed_consts));
            dest = offset(dest, 1);
         }
      }
      break;
   }

   case nir_intrinsic_load_input_indirect:
      has_indirect = true;
      /* fallthrough */
   case nir_intrinsic_load_input: {
      unsigned index = 0;
      for (int i = 0; i < instr->const_index[1]; i++) {
         for (unsigned j = 0; j < instr->num_components; j++) {
            fs_reg src = offset(retype(nir_inputs, dest.type),
                                instr->const_index[0] + index);
            if (has_indirect)
               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
            index++;

            emit(MOV(dest, src));
            dest = offset(dest, 1);
         }
      }
      break;
   }

   /* Handle ARB_gpu_shader5 interpolation intrinsics
    *
    * It's worth a quick word of explanation as to why we handle the full
    * variable-based interpolation intrinsic rather than a lowered version
    * of it, like we do for other inputs.  We have to do that because the
    * way we set up inputs doesn't allow us to use the already setup inputs
    * for interpolation.  At the beginning of the shader, we go through all
    * of the input variables and do the initial interpolation and put it in
    * the nir_inputs array based on its location as determined in
    * nir_lower_io.  If the input isn't used, dead code cleans up and
    * everything works fine.  However, when we get to the ARB_gpu_shader5
    * interpolation intrinsics, we need to reinterpolate the input
    * differently.  If we used an intrinsic that just had an index it would
    * only give us the offset into the nir_inputs array.  However, this is
    * useless because that value is post-interpolation and we need
    * pre-interpolation.  In order to get the actual location of the bits
    * we get from the vertex fetching hardware, we need the variable.
    */
   case nir_intrinsic_interp_var_at_centroid:
   case nir_intrinsic_interp_var_at_sample:
   case nir_intrinsic_interp_var_at_offset: {
      /* In SIMD16 mode, the pixel interpolator returns coords interleaved
       * 8 channels at a time, same as the barycentric coords presented in
       * the FS payload.  This requires a bit of extra work to support.
       */
      no16("interpolate_at_* not yet supported in SIMD16 mode.");

      fs_reg dst_x = vgrf(2);
      fs_reg dst_y = offset(dst_x, 1);

      /* For most messages, we need one reg of ignored data; the hardware
       * requires mlen==1 even when there is no payload.  In the per-slot
       * offset case, we'll replace this with the proper source data.
       */
      fs_reg src = vgrf(glsl_type::float_type);
      int mlen = 1; /* one reg unless overridden */
      fs_inst *inst;

      switch (instr->intrinsic) {
      case nir_intrinsic_interp_var_at_centroid:
         inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
         break;

      case nir_intrinsic_interp_var_at_sample: {
         /* XXX: We should probably handle non-constant sample IDs. */
         nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
         assert(const_sample);
         unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
         inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src,
                     fs_reg(msg_data));
         break;
      }

      case nir_intrinsic_interp_var_at_offset: {
         nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);

         if (const_offset) {
            unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
            unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;

            inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
                        fs_reg(off_x | (off_y << 4)));
         } else {
            src = vgrf(glsl_type::ivec2_type);
            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
                                       BRW_REGISTER_TYPE_F);
            for (int i = 0; i < 2; i++) {
               fs_reg temp = vgrf(glsl_type::float_type);
               emit(MUL(temp, offset(offset_src, i), fs_reg(16.0f)));
               fs_reg itemp = vgrf(glsl_type::int_type);
               emit(MOV(itemp, temp)); /* float to int */

               /* Clamp the upper end of the range to +7/16.
                * ARB_gpu_shader5 requires that we support a maximum offset
                * of +0.5, which isn't representable in a S0.4 value -- if
                * we didn't clamp it, we'd end up with -8/16, which is the
                * opposite of what the shader author wanted.
                *
                * This is legal due to ARB_gpu_shader5's quantization
                * rules:
                *
                * "Not all values of <offset> may be supported; x and y
                * offsets may be rounded to fixed-point values with the
                * number of fraction bits given by the
                * implementation-dependent constant
                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
                */
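               /* E.g. an offset of +0.5 scales to 8, which the SEL below
                * clamps to 7, giving +7/16 -- rather than letting 8 wrap to
                * -8/16 in the 4-bit two's-complement encoding.
                */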

               emit(BRW_OPCODE_SEL, offset(src, i), itemp, fs_reg(7))
                  ->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
            }

            mlen = 2;
            inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
                        fs_reg(0u));
         }
         break;
      }

      default:
         unreachable("Invalid intrinsic");
      }

      inst->mlen = mlen;
      inst->regs_written = 2; /* 2 floats per slot returned */
      inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
                               INTERP_QUALIFIER_NOPERSPECTIVE;

      for (unsigned j = 0; j < instr->num_components; j++) {
         fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
         src.type = dest.type;

         emit(FS_OPCODE_LINTERP, dest, dst_x, dst_y, src);
         dest = offset(dest, 1);
      }
      break;
   }

   case nir_intrinsic_store_output_indirect:
      has_indirect = true;
      /* fallthrough */
   case nir_intrinsic_store_output: {
      fs_reg src = get_nir_src(instr->src[0]);
      unsigned index = 0;
      for (int i = 0; i < instr->const_index[1]; i++) {
         for (unsigned j = 0; j < instr->num_components; j++) {
            fs_reg new_dest = offset(retype(nir_outputs, src.type),
                                     instr->const_index[0] + index);
            if (has_indirect)
               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
            index++;
            emit(MOV(new_dest, src));
            src = offset(src, 1);
         }
      }
      break;
   }

   default:
      unreachable("unknown intrinsic");
   }
}

void
fs_visitor::nir_emit_texture(nir_tex_instr *instr)
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   unsigned sampler = instr->sampler_index;
   fs_reg sampler_reg(sampler);

   /* FINISHME: We're failing to recompile our programs when the sampler is
    * updated.  This only matters for the texture rectangle scale parameters
    * (pre-gen6, or gen6+ with GL_CLAMP).
    */
   int texunit = prog->SamplerUnits[sampler];

   int gather_component = instr->component;

   bool is_rect = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;

   bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
                        instr->is_array;

   int lod_components = 0, offset_components = 0;

   fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, offset;

   for (unsigned i = 0; i < instr->num_srcs; i++) {
      fs_reg src = get_nir_src(instr->src[i].src);
      switch (instr->src[i].src_type) {
      case nir_tex_src_bias:
         lod = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_comparitor:
         shadow_comparitor = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_coord:
         switch (instr->op) {
         case nir_texop_txf:
         case nir_texop_txf_ms:
            coordinate = retype(src, BRW_REGISTER_TYPE_D);
            break;
         default:
            coordinate = retype(src, BRW_REGISTER_TYPE_F);
            break;
         }
         break;
      case nir_tex_src_ddx:
         lod = retype(src, BRW_REGISTER_TYPE_F);
         lod_components = nir_tex_instr_src_size(instr, i);
         break;
      case nir_tex_src_ddy:
         lod2 = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_lod:
         switch (instr->op) {
         case nir_texop_txs:
            lod = retype(src, BRW_REGISTER_TYPE_UD);
            break;
         case nir_texop_txf:
            lod = retype(src, BRW_REGISTER_TYPE_D);
            break;
         default:
            lod = retype(src, BRW_REGISTER_TYPE_F);
            break;
         }
         break;
      case nir_tex_src_ms_index:
         sample_index = retype(src, BRW_REGISTER_TYPE_UD);
         break;
      case nir_tex_src_offset:
         offset = retype(src, BRW_REGISTER_TYPE_D);
         if (instr->is_array)
            offset_components = instr->coord_components - 1;
         else
            offset_components = instr->coord_components;
         break;
      case nir_tex_src_projector:
         unreachable("should be lowered");

      case nir_tex_src_sampler_offset: {
         /* Figure out the highest possible sampler index and mark it as used */
         uint32_t max_used = sampler + instr->sampler_array_size - 1;
         if (instr->op == nir_texop_tg4 && brw->gen < 8) {
            max_used += stage_prog_data->binding_table.gather_texture_start;
         } else {
            max_used += stage_prog_data->binding_table.texture_start;
         }
         brw_mark_surface_used(prog_data, max_used);

         /* Emit code to evaluate the actual indexing expression */
         sampler_reg = vgrf(glsl_type::uint_type);
         emit(ADD(sampler_reg, src, fs_reg(sampler)))
            ->force_writemask_all = true;
         break;
      }

      default:
         unreachable("unknown texture source");
      }
   }

   if (instr->op == nir_texop_txf_ms) {
      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1 << sampler))
         mcs = emit_mcs_fetch(coordinate, instr->coord_components, sampler_reg);
      else
         mcs = fs_reg(0u);
   }

   for (unsigned i = 0; i < 3; i++) {
      if (instr->const_offset[i] != 0) {
         assert(offset_components == 0);
         offset = fs_reg(brw_texture_offset(ctx, instr->const_offset, 3));
         break;
      }
   }

   enum glsl_base_type dest_base_type;
   switch (instr->dest_type) {
   case nir_type_float:
      dest_base_type = GLSL_TYPE_FLOAT;
      break;
   case nir_type_int:
      dest_base_type = GLSL_TYPE_INT;
      break;
   case nir_type_unsigned:
      dest_base_type = GLSL_TYPE_UINT;
      break;
   default:
      unreachable("bad type");
   }

   const glsl_type *dest_type =
      glsl_type::get_instance(dest_base_type, nir_tex_instr_dest_size(instr),
                              1);

   ir_texture_opcode op;
   switch (instr->op) {
   case nir_texop_lod: op = ir_lod; break;
   case nir_texop_query_levels: op = ir_query_levels; break;
   case nir_texop_tex: op = ir_tex; break;
   case nir_texop_tg4: op = ir_tg4; break;
   case nir_texop_txb: op = ir_txb; break;
   case nir_texop_txd: op = ir_txd; break;
   case nir_texop_txf: op = ir_txf; break;
   case nir_texop_txf_ms: op = ir_txf_ms; break;
   case nir_texop_txl: op = ir_txl; break;
   case nir_texop_txs: op = ir_txs; break;
   default:
      unreachable("unknown texture opcode");
   }

   emit_texture(op, dest_type, coordinate, instr->coord_components,
                shadow_comparitor, lod, lod2, lod_components, sample_index,
                offset, offset_components, mcs, gather_component,
                is_cube_array, is_rect, sampler, sampler_reg, texunit);

   fs_reg dest = get_nir_dest(instr->dest);
   dest.type = this->result.type;
   unsigned num_components = nir_tex_instr_dest_size(instr);
   emit_percomp(MOV(dest, this->result), (1 << num_components) - 1);
}

void
fs_visitor::nir_emit_jump(nir_jump_instr *instr)
{
   switch (instr->type) {
   case nir_jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case nir_jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   case nir_jump_return:
   default:
      unreachable("unknown jump");
   }
}