i965/fs: Allow SIMD16 on pre-SNB when try_replace_with_sel is successful
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_nir.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "glsl/ir.h"
#include "glsl/ir_optimization.h"
#include "glsl/nir/glsl_to_nir.h"
#include "brw_fs.h"

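/* Run the shared NIR cleanup passes to a fixed point: each pass reports
 * whether it made progress, and one pass's output can expose work for
 * another (e.g. copy propagation creating dead code), so we loop until a
 * full round of passes changes nothing.
 */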
static void
nir_optimize(nir_shader *nir)
{
   bool progress;
   do {
      progress = false;
      nir_lower_vars_to_ssa(nir);
      nir_validate_shader(nir);
      progress |= nir_copy_prop(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_dce(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_cse(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_peephole_select(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_algebraic(nir);
      nir_validate_shader(nir);
      progress |= nir_opt_constant_folding(nir);
      nir_validate_shader(nir);
   } while (progress);
}

static bool
count_nir_instrs_in_block(nir_block *block, void *state)
{
   int *count = (int *) state;
   nir_foreach_instr(block, instr) {
      *count = *count + 1;
   }
   return true;
}

static int
count_nir_instrs(nir_shader *nir)
{
   int count = 0;
   nir_foreach_overload(nir, overload) {
      if (!overload->impl)
         continue;
      nir_foreach_block(overload->impl, count_nir_instrs_in_block, &count);
   }
   return count;
}

void
fs_visitor::emit_nir_code()
{
   /* first, lower the GLSL IR shader to NIR */
   lower_output_reads(shader->base.ir);
   nir_shader *nir = glsl_to_nir(shader->base.ir, NULL, true);
   nir_validate_shader(nir);

   nir_lower_global_vars_to_local(nir);
   nir_validate_shader(nir);

   nir_split_var_copies(nir);
   nir_validate_shader(nir);

   nir_optimize(nir);

   /* Lower a bunch of stuff */
   nir_lower_var_copies(nir);
   nir_validate_shader(nir);

   nir_lower_io(nir);
   nir_validate_shader(nir);

   nir_lower_locals_to_regs(nir);
   nir_validate_shader(nir);

   nir_remove_dead_variables(nir);
   nir_validate_shader(nir);

   nir_lower_samplers(nir, shader_prog, shader->base.Program);
   nir_validate_shader(nir);

   nir_lower_system_values(nir);
   nir_validate_shader(nir);

   nir_lower_atomics(nir);
   nir_validate_shader(nir);

   nir_optimize(nir);

   nir_lower_to_source_mods(nir);
   nir_validate_shader(nir);
   nir_copy_prop(nir);
   nir_validate_shader(nir);

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "NIR (SSA form) for fragment shader:\n");
      nir_print_shader(nir, stderr);
   }

   if (dispatch_width == 8) {
      static GLuint msg_id = 0;
      _mesa_gl_debug(&brw->ctx, &msg_id,
                     MESA_DEBUG_SOURCE_SHADER_COMPILER,
                     MESA_DEBUG_TYPE_OTHER,
                     MESA_DEBUG_SEVERITY_NOTIFICATION,
                     "FS NIR shader: %d inst\n",
                     count_nir_instrs(nir));
   }

   nir_convert_from_ssa(nir);
   nir_validate_shader(nir);
   nir_lower_vec_to_movs(nir);
   nir_validate_shader(nir);

   /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */

   if (nir->num_inputs > 0) {
      nir_inputs = vgrf(nir->num_inputs);
      nir_setup_inputs(nir);
   }

   if (nir->num_outputs > 0) {
      nir_outputs = vgrf(nir->num_outputs);
      nir_setup_outputs(nir);
   }

   if (nir->num_uniforms > 0) {
      nir_uniforms = fs_reg(UNIFORM, 0);
      nir_setup_uniforms(nir);
   }

   nir_emit_system_values(nir);

   nir_globals = ralloc_array(mem_ctx, fs_reg, nir->reg_alloc);
   foreach_list_typed(nir_register, reg, node, &nir->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      nir_globals[reg->index] = vgrf(size);
   }

   /* get the main function and emit it */
   nir_foreach_overload(nir, overload) {
      assert(strcmp(overload->function->name, "main") == 0);
      assert(overload->impl);
      nir_emit_impl(overload->impl);
   }

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "NIR (final form) for fragment shader:\n");
      nir_print_shader(nir, stderr);
   }

   ralloc_free(nir);
}

void
fs_visitor::nir_setup_inputs(nir_shader *shader)
{
   struct hash_entry *entry;
   hash_table_foreach(shader->inputs, entry) {
      nir_variable *var = (nir_variable *) entry->data;
      fs_reg varying = offset(nir_inputs, var->data.driver_location);

      fs_reg reg;
      if (!strcmp(var->name, "gl_FragCoord")) {
         reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
                                             var->data.origin_upper_left);
         emit_percomp(MOV(varying, reg), 0xF);
      } else if (!strcmp(var->name, "gl_FrontFacing")) {
         reg = *emit_frontfacing_interpolation();
         emit(MOV(retype(varying, BRW_REGISTER_TYPE_UD), reg));
      } else {
         emit_general_interpolation(varying, var->name, var->type,
                                    (glsl_interp_qualifier) var->data.interpolation,
                                    var->data.location, var->data.centroid,
                                    var->data.sample);
      }
   }
}

void
fs_visitor::nir_setup_outputs(nir_shader *shader)
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   struct hash_entry *entry;
   hash_table_foreach(shader->outputs, entry) {
      nir_variable *var = (nir_variable *) entry->data;
      fs_reg reg = offset(nir_outputs, var->data.driver_location);

      if (var->data.index > 0) {
         assert(var->data.location == FRAG_RESULT_DATA0);
         assert(var->data.index == 1);
         this->dual_src_output = reg;
         this->do_dual_src = true;
      } else if (var->data.location == FRAG_RESULT_COLOR) {
         /* Writing gl_FragColor outputs to all color regions. */
         for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
            this->outputs[i] = reg;
            this->output_components[i] = 4;
         }
      } else if (var->data.location == FRAG_RESULT_DEPTH) {
         this->frag_depth = reg;
      } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
         this->sample_mask = reg;
      } else {
         /* gl_FragData or a user-defined FS output */
         assert(var->data.location >= FRAG_RESULT_DATA0 &&
                var->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);

         int vector_elements =
            var->type->is_array() ? var->type->fields.array->vector_elements
                                  : var->type->vector_elements;

         /* General color output. */
         for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
            int output = var->data.location - FRAG_RESULT_DATA0 + i;
            this->outputs[output] = offset(reg, vector_elements * i);
            this->output_components[output] = vector_elements;
         }
      }
   }
}

void
fs_visitor::nir_setup_uniforms(nir_shader *shader)
{
   uniforms = shader->num_uniforms;
   param_size[0] = shader->num_uniforms;

   if (dispatch_width != 8)
      return;

   struct hash_entry *entry;
   hash_table_foreach(shader->uniforms, entry) {
      nir_variable *var = (nir_variable *) entry->data;

      /* UBOs and atomics don't take up space in the uniform file */

      if (var->interface_type != NULL || var->type->contains_atomic())
         continue;

      if (strncmp(var->name, "gl_", 3) == 0)
         nir_setup_builtin_uniform(var);
      else
         nir_setup_uniform(var);
   }
}

void
fs_visitor::nir_setup_uniform(nir_variable *var)
{
   int namelen = strlen(var->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name. We know it's been set up in the
    * same order we'd walk the type, so walk the list of storage and find
    * anything with our name, or the prefix of a component that starts with
    * our name.
    */
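   /* E.g. (illustrative names) a uniform "foo" matches storage entries
    * "foo", "foo[2]" and "foo.bar", but not "foobar": after the matching
    * prefix, the next character must be '\0', '.' or '['.
    */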
   unsigned index = var->data.driver_location;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(var->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[index++] = &storage->storage[i];
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(var->data.driver_location + var->type->component_slots() == index);
}

void
fs_visitor::nir_setup_builtin_uniform(nir_variable *var)
{
   const nir_state_slot *const slots = var->state_slots;
   assert(var->state_slots != NULL);

   unsigned uniform_index = var->data.driver_location;
   for (unsigned int i = 0; i < var->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
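      /* E.g. a scalar state value replicated as .xxxx adds one parameter
       * (the loop below stops as soon as a swizzle repeats), while a vec4's
       * .xyzw adds four.
       */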
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         stage_prog_data->param[uniform_index++] =
            &prog->Parameters->ParameterValues[index][swiz];
      }
   }
}

static bool
emit_system_values_block(nir_block *block, void *void_visitor)
{
   fs_visitor *v = (fs_visitor *)void_visitor;
   fs_reg *reg;

   nir_foreach_instr(block, instr) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_sample_pos:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplepos_setup();
         break;

      case nir_intrinsic_load_sample_id:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_sampleid_setup();
         break;

      case nir_intrinsic_load_sample_mask_in:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         assert(v->brw->gen >= 7);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
         if (reg->file == BAD_FILE)
            *reg = fs_reg(retype(brw_vec8_grf(v->payload.sample_mask_in_reg, 0),
                                 BRW_REGISTER_TYPE_D));
         break;

      default:
         break;
      }
   }

   return true;
}

void
fs_visitor::nir_emit_system_values(nir_shader *shader)
{
   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
   nir_foreach_overload(shader, overload) {
      assert(strcmp(overload->function->name, "main") == 0);
      assert(overload->impl);
      nir_foreach_block(overload->impl, emit_system_values_block, this);
   }
}

void
fs_visitor::nir_emit_impl(nir_function_impl *impl)
{
   nir_locals = reralloc(mem_ctx, nir_locals, fs_reg, impl->reg_alloc);
   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      nir_locals[reg->index] = vgrf(size);
   }

   nir_emit_cf_list(&impl->body);
}

void
fs_visitor::nir_emit_cf_list(exec_list *list)
{
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_if:
         nir_emit_if(nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         nir_emit_loop(nir_cf_node_as_loop(node));
         break;

      case nir_cf_node_block:
         nir_emit_block(nir_cf_node_as_block(node));
         break;

      default:
         unreachable("Invalid CFG node type");
      }
   }
}

void
fs_visitor::nir_emit_if(nir_if *if_stmt)
{
   /* first, put the condition into f0 */
   fs_inst *inst = emit(MOV(reg_null_d,
                            retype(get_nir_src(if_stmt->condition),
                                   BRW_REGISTER_TYPE_UD)));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   emit(IF(BRW_PREDICATE_NORMAL));

   nir_emit_cf_list(&if_stmt->then_list);

   /* note: if the else is empty, dead CF elimination will remove it */
   emit(BRW_OPCODE_ELSE);

   nir_emit_cf_list(&if_stmt->else_list);

   emit(BRW_OPCODE_ENDIF);

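   /* If the if/else was successfully flattened into SELs, no real control
    * flow remains, so SIMD16 can still be allowed on pre-gen6; only when
    * the replacement fails do we have to give up on SIMD16 there.
    */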
   if (!try_replace_with_sel() && brw->gen < 6) {
      no16("Can't support (non-uniform) control flow on SIMD16\n");
   }
}

void
fs_visitor::nir_emit_loop(nir_loop *loop)
{
   if (brw->gen < 6) {
      no16("Can't support (non-uniform) control flow on SIMD16\n");
   }

   emit(BRW_OPCODE_DO);

   nir_emit_cf_list(&loop->body);

   emit(BRW_OPCODE_WHILE);
}

void
fs_visitor::nir_emit_block(nir_block *block)
{
   nir_foreach_instr(block, instr) {
      nir_emit_instr(instr);
   }
}

void
fs_visitor::nir_emit_instr(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      nir_emit_alu(nir_instr_as_alu(instr));
      break;

   case nir_instr_type_intrinsic:
      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_tex:
      nir_emit_texture(nir_instr_as_tex(instr));
      break;

   case nir_instr_type_load_const:
      /* We can hit these, but we do nothing now and use them as
       * immediates later.
       */
      break;

   case nir_instr_type_jump:
      nir_emit_jump(nir_instr_as_jump(instr));
      break;

   default:
      unreachable("unknown instruction type");
   }
}

static brw_reg_type
brw_type_for_nir_type(nir_alu_type type)
{
   switch (type) {
   case nir_type_bool:
   case nir_type_unsigned:
      return BRW_REGISTER_TYPE_UD;
   case nir_type_int:
      return BRW_REGISTER_TYPE_D;
   case nir_type_float:
      return BRW_REGISTER_TYPE_F;
   default:
      unreachable("unknown type");
   }

   return BRW_REGISTER_TYPE_F;
}

void
fs_visitor::nir_emit_alu(nir_alu_instr *instr)
{
   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;

   fs_reg op[3];
   fs_reg result = get_nir_dest(instr->dest.dest);
   result.type = brw_type_for_nir_type(nir_op_infos[instr->op].output_type);

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
      op[i] = get_nir_alu_src(instr, i);

   switch (instr->op) {
   case nir_op_fmov:
   case nir_op_i2f:
   case nir_op_u2f: {
      fs_inst *inst = MOV(result, op[0]);
      inst->saturate = instr->dest.saturate;
      emit_percomp(inst, instr->dest.write_mask);
   }
   break;

   case nir_op_imov:
   case nir_op_f2i:
   case nir_op_f2u:
      emit_percomp(MOV(result, op[0]), instr->dest.write_mask);
      break;

   case nir_op_fsign: {
      /* AND(val, 0x80000000) gives the sign bit.
       *
       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
       * zero.
       */
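      /* Worked example (illustrative values): val = -2.5f is 0xC0200000, so
       * the CMP sets the flag, AND leaves 0x80000000, and the predicated OR
       * yields 0xBF800000 = -1.0f. For val = 0.0f the flag stays clear and
       * the AND's 0x00000000 (+0.0f) is kept.
       */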
      emit_percomp(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ),
                   instr->dest.write_mask);

      fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
      op[0].type = BRW_REGISTER_TYPE_UD;
      result.type = BRW_REGISTER_TYPE_UD;
      emit_percomp(AND(result_int, op[0], fs_reg(0x80000000u)),
                   instr->dest.write_mask);

      fs_inst *inst = OR(result_int, result_int, fs_reg(0x3f800000u));
      inst->predicate = BRW_PREDICATE_NORMAL;
      emit_percomp(inst, instr->dest.write_mask);
      if (instr->dest.saturate) {
         fs_inst *inst = MOV(result, result);
         inst->saturate = true;
         emit_percomp(inst, instr->dest.write_mask);
      }
      break;
   }

   case nir_op_isign: {
      /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
       *              -> non-negative val generates 0x00000000.
       * Predicated OR sets 1 if val is positive.
       */
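      /* Worked example (illustrative values): val = 7 gives ASR -> 0 with
       * the CMP taken, so the OR writes 1; val = -7 gives ASR -> 0xffffffff
       * (-1) with the CMP not taken, so -1 is kept; val = 0 keeps the
       * ASR's 0.
       */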
      emit_percomp(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G),
                   instr->dest.write_mask);

      emit_percomp(ASR(result, op[0], fs_reg(31)), instr->dest.write_mask);

      fs_inst *inst = OR(result, result, fs_reg(1));
      inst->predicate = BRW_PREDICATE_NORMAL;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }

   case nir_op_frcp:
      emit_math_percomp(SHADER_OPCODE_RCP, result, op[0],
                        instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_fexp2:
      emit_math_percomp(SHADER_OPCODE_EXP2, result, op[0],
                        instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_flog2:
      emit_math_percomp(SHADER_OPCODE_LOG2, result, op[0],
                        instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_fexp:
   case nir_op_flog:
      unreachable("not reached: should be handled by ir_explog_to_explog2");

   case nir_op_fsin:
   case nir_op_fsin_reduced:
      emit_math_percomp(SHADER_OPCODE_SIN, result, op[0],
                        instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_fcos:
   case nir_op_fcos_reduced:
      emit_math_percomp(SHADER_OPCODE_COS, result, op[0],
                        instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_fddx:
      if (fs_key->high_quality_derivatives)
         emit_percomp(FS_OPCODE_DDX_FINE, result, op[0],
                      instr->dest.write_mask, instr->dest.saturate);
      else
         emit_percomp(FS_OPCODE_DDX_COARSE, result, op[0],
                      instr->dest.write_mask, instr->dest.saturate);
      break;
   case nir_op_fddx_fine:
      emit_percomp(FS_OPCODE_DDX_FINE, result, op[0],
                   instr->dest.write_mask, instr->dest.saturate);
      break;
   case nir_op_fddx_coarse:
      emit_percomp(FS_OPCODE_DDX_COARSE, result, op[0],
                   instr->dest.write_mask, instr->dest.saturate);
      break;
   case nir_op_fddy:
      if (fs_key->high_quality_derivatives)
         emit_percomp(FS_OPCODE_DDY_FINE, result, op[0],
                      fs_reg(fs_key->render_to_fbo),
                      instr->dest.write_mask, instr->dest.saturate);
      else
         emit_percomp(FS_OPCODE_DDY_COARSE, result, op[0],
                      fs_reg(fs_key->render_to_fbo),
                      instr->dest.write_mask, instr->dest.saturate);
      break;
   case nir_op_fddy_fine:
      emit_percomp(FS_OPCODE_DDY_FINE, result, op[0],
                   fs_reg(fs_key->render_to_fbo),
                   instr->dest.write_mask, instr->dest.saturate);
      break;
   case nir_op_fddy_coarse:
      emit_percomp(FS_OPCODE_DDY_COARSE, result, op[0],
                   fs_reg(fs_key->render_to_fbo),
                   instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_fadd:
   case nir_op_iadd: {
      fs_inst *inst = ADD(result, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }

   case nir_op_fmul: {
      fs_inst *inst = MUL(result, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }

   case nir_op_imul: {
      /* TODO put in the 16-bit constant optimization once we have SSA */

      if (brw->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
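
      /* A rough sketch of the accumulator idiom below: MUL writes the
       * partial (low) product and loads the accumulator, MACH completes
       * the 32x32 multiply (its destination would receive the high 32
       * bits, discarded here into the null register), and the trailing
       * MOV reads the low 32 bits of the result back out of the
       * accumulator.
       */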
      emit_percomp(MUL(acc, op[0], op[1]), instr->dest.write_mask);
      emit_percomp(MACH(reg_null_d, op[0], op[1]), instr->dest.write_mask);
      emit_percomp(MOV(result, fs_reg(acc)), instr->dest.write_mask);
      break;
   }

   case nir_op_imul_high:
   case nir_op_umul_high: {
      if (brw->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);

      emit_percomp(MUL(acc, op[0], op[1]), instr->dest.write_mask);
      emit_percomp(MACH(result, op[0], op[1]), instr->dest.write_mask);
      break;
   }

   case nir_op_idiv:
   case nir_op_udiv:
      emit_math_percomp(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1],
                        instr->dest.write_mask);
      break;

   case nir_op_uadd_carry: {
      if (brw->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                  BRW_REGISTER_TYPE_UD);

      emit_percomp(ADDC(reg_null_ud, op[0], op[1]), instr->dest.write_mask);
      emit_percomp(MOV(result, fs_reg(acc)), instr->dest.write_mask);
      break;
   }

   case nir_op_usub_borrow: {
      if (brw->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                  BRW_REGISTER_TYPE_UD);

      emit_percomp(SUBB(reg_null_ud, op[0], op[1]), instr->dest.write_mask);
      emit_percomp(MOV(result, fs_reg(acc)), instr->dest.write_mask);
      break;
   }

   case nir_op_umod:
      emit_math_percomp(SHADER_OPCODE_INT_REMAINDER, result, op[0],
                        op[1], instr->dest.write_mask);
      break;

   case nir_op_flt:
   case nir_op_ilt:
   case nir_op_ult:
      emit_percomp(CMP(result, op[0], op[1], BRW_CONDITIONAL_L),
                   instr->dest.write_mask);
      break;

   case nir_op_fge:
   case nir_op_ige:
   case nir_op_uge:
      emit_percomp(CMP(result, op[0], op[1], BRW_CONDITIONAL_GE),
                   instr->dest.write_mask);
      break;

   case nir_op_feq:
   case nir_op_ieq:
      emit_percomp(CMP(result, op[0], op[1], BRW_CONDITIONAL_Z),
                   instr->dest.write_mask);
      break;

   case nir_op_fne:
   case nir_op_ine:
      emit_percomp(CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ),
                   instr->dest.write_mask);
      break;

   case nir_op_ball_fequal2:
   case nir_op_ball_iequal2:
   case nir_op_ball_fequal3:
   case nir_op_ball_iequal3:
   case nir_op_ball_fequal4:
   case nir_op_ball_iequal4: {
      unsigned num_components = nir_op_infos[instr->op].input_sizes[0];
      fs_reg temp = vgrf(num_components);
      emit_percomp(CMP(temp, op[0], op[1], BRW_CONDITIONAL_Z),
                   (1 << num_components) - 1);
      emit_reduction(BRW_OPCODE_AND, result, temp, num_components);
      break;
   }

   case nir_op_bany_fnequal2:
   case nir_op_bany_inequal2:
   case nir_op_bany_fnequal3:
   case nir_op_bany_inequal3:
   case nir_op_bany_fnequal4:
   case nir_op_bany_inequal4: {
      unsigned num_components = nir_op_infos[instr->op].input_sizes[0];
      fs_reg temp = vgrf(num_components);
      temp.type = BRW_REGISTER_TYPE_UD;
      emit_percomp(CMP(temp, op[0], op[1], BRW_CONDITIONAL_NZ),
                   (1 << num_components) - 1);
      emit_reduction(BRW_OPCODE_OR, result, temp, num_components);
      break;
   }

   case nir_op_inot:
      emit_percomp(NOT(result, op[0]), instr->dest.write_mask);
      break;
   case nir_op_ixor:
      emit_percomp(XOR(result, op[0], op[1]), instr->dest.write_mask);
      break;
   case nir_op_ior:
      emit_percomp(OR(result, op[0], op[1]), instr->dest.write_mask);
      break;
   case nir_op_iand:
      emit_percomp(AND(result, op[0], op[1]), instr->dest.write_mask);
      break;

   case nir_op_fdot2:
   case nir_op_fdot3:
   case nir_op_fdot4: {
      unsigned num_components = nir_op_infos[instr->op].input_sizes[0];
      fs_reg temp = vgrf(num_components);
      emit_percomp(MUL(temp, op[0], op[1]), (1 << num_components) - 1);
      emit_reduction(BRW_OPCODE_ADD, result, temp, num_components);
      if (instr->dest.saturate) {
         fs_inst *inst = emit(MOV(result, result));
         inst->saturate = true;
      }
      break;
   }

   case nir_op_bany2:
   case nir_op_bany3:
   case nir_op_bany4: {
      unsigned num_components = nir_op_infos[instr->op].input_sizes[0];
      emit_reduction(BRW_OPCODE_OR, result, op[0], num_components);
      break;
   }

   case nir_op_ball2:
   case nir_op_ball3:
   case nir_op_ball4: {
      unsigned num_components = nir_op_infos[instr->op].input_sizes[0];
      emit_reduction(BRW_OPCODE_AND, result, op[0], num_components);
      break;
   }

   case nir_op_fnoise1_1:
   case nir_op_fnoise1_2:
   case nir_op_fnoise1_3:
   case nir_op_fnoise1_4:
   case nir_op_fnoise2_1:
   case nir_op_fnoise2_2:
   case nir_op_fnoise2_3:
   case nir_op_fnoise2_4:
   case nir_op_fnoise3_1:
   case nir_op_fnoise3_2:
   case nir_op_fnoise3_3:
   case nir_op_fnoise3_4:
   case nir_op_fnoise4_1:
   case nir_op_fnoise4_2:
   case nir_op_fnoise4_3:
   case nir_op_fnoise4_4:
      unreachable("not reached: should be handled by lower_noise");

   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
      unreachable("not reached: should be handled by lower_quadop_vector");

   case nir_op_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");

   case nir_op_fsqrt:
      emit_math_percomp(SHADER_OPCODE_SQRT, result, op[0],
                        instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_frsq:
      emit_math_percomp(SHADER_OPCODE_RSQ, result, op[0],
                        instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_b2i:
      emit_percomp(AND(result, op[0], fs_reg(1)), instr->dest.write_mask);
      break;
   case nir_op_b2f: {
      emit_percomp(AND(retype(result, BRW_REGISTER_TYPE_UD), op[0],
                       fs_reg(0x3f800000u)),
                   instr->dest.write_mask);
      break;
   }

   case nir_op_f2b:
      emit_percomp(CMP(result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ),
                   instr->dest.write_mask);
      break;
   case nir_op_i2b:
      emit_percomp(CMP(result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ),
                   instr->dest.write_mask);
      break;

   case nir_op_ftrunc: {
      fs_inst *inst = RNDZ(result, op[0]);
      inst->saturate = instr->dest.saturate;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }
   case nir_op_fceil: {
      op[0].negate = !op[0].negate;
      fs_reg temp = vgrf(glsl_type::vec4_type);
      emit_percomp(RNDD(temp, op[0]), instr->dest.write_mask);
      temp.negate = true;
      fs_inst *inst = MOV(result, temp);
      inst->saturate = instr->dest.saturate;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }
   case nir_op_ffloor: {
      fs_inst *inst = RNDD(result, op[0]);
      inst->saturate = instr->dest.saturate;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }
   case nir_op_ffract: {
      fs_inst *inst = FRC(result, op[0]);
      inst->saturate = instr->dest.saturate;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }
   case nir_op_fround_even: {
      fs_inst *inst = RNDE(result, op[0]);
      inst->saturate = instr->dest.saturate;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }

   case nir_op_fmin:
   case nir_op_imin:
   case nir_op_umin:
      if (brw->gen >= 6) {
         emit_percomp(BRW_OPCODE_SEL, result, op[0], op[1],
                      instr->dest.write_mask, instr->dest.saturate,
                      BRW_PREDICATE_NONE, BRW_CONDITIONAL_L);
      } else {
         emit_percomp(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_L),
                      instr->dest.write_mask);

         emit_percomp(BRW_OPCODE_SEL, result, op[0], op[1],
                      instr->dest.write_mask, instr->dest.saturate,
                      BRW_PREDICATE_NORMAL);
      }
      break;

   case nir_op_fmax:
   case nir_op_imax:
   case nir_op_umax:
      if (brw->gen >= 6) {
         emit_percomp(BRW_OPCODE_SEL, result, op[0], op[1],
                      instr->dest.write_mask, instr->dest.saturate,
                      BRW_PREDICATE_NONE, BRW_CONDITIONAL_GE);
      } else {
         emit_percomp(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_GE),
                      instr->dest.write_mask);

         emit_percomp(BRW_OPCODE_SEL, result, op[0], op[1],
                      instr->dest.write_mask, instr->dest.saturate,
                      BRW_PREDICATE_NORMAL);
      }
      break;

   case nir_op_pack_snorm_2x16:
   case nir_op_pack_snorm_4x8:
   case nir_op_pack_unorm_2x16:
   case nir_op_pack_unorm_4x8:
   case nir_op_unpack_snorm_2x16:
   case nir_op_unpack_snorm_4x8:
   case nir_op_unpack_unorm_2x16:
   case nir_op_unpack_unorm_4x8:
   case nir_op_unpack_half_2x16:
   case nir_op_pack_half_2x16:
      unreachable("not reached: should be handled by lower_packing_builtins");

   case nir_op_unpack_half_2x16_split_x:
      emit_percomp(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0],
                   instr->dest.write_mask, instr->dest.saturate);
      break;
   case nir_op_unpack_half_2x16_split_y:
      emit_percomp(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0],
                   instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_fpow:
      emit_percomp(SHADER_OPCODE_POW, result, op[0], op[1],
                   instr->dest.write_mask, instr->dest.saturate);
      break;

   case nir_op_bitfield_reverse:
      emit_percomp(BFREV(result, op[0]), instr->dest.write_mask);
      break;

   case nir_op_bit_count:
      emit_percomp(CBIT(result, op[0]), instr->dest.write_mask);
      break;

   case nir_op_ufind_msb:
   case nir_op_ifind_msb: {
      emit_percomp(FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]),
                   instr->dest.write_mask);

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */
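      /* Worked example (illustrative value): for an input of 0x00000100,
       * FBH returns 23 (counting down from bit 31); findMSB() wants 8, and
       * 31 - 23 = 8. An all-zero input returns 0xFFFFFFFF, which the
       * predicated ADD below leaves untouched.
       */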

      emit_percomp(CMP(reg_null_d, result, fs_reg(-1), BRW_CONDITIONAL_NZ),
                   instr->dest.write_mask);
      fs_reg neg_result(result);
      neg_result.negate = true;
      fs_inst *inst = ADD(result, neg_result, fs_reg(31));
      inst->predicate = BRW_PREDICATE_NORMAL;
      emit_percomp(inst, instr->dest.write_mask);
      break;
   }

   case nir_op_find_lsb:
      emit_percomp(FBL(result, op[0]), instr->dest.write_mask);
      break;

   case nir_op_ubitfield_extract:
   case nir_op_ibitfield_extract:
      emit_percomp(BFE(result, op[2], op[1], op[0]), instr->dest.write_mask);
      break;
   case nir_op_bfm:
      emit_percomp(BFI1(result, op[0], op[1]), instr->dest.write_mask);
      break;
   case nir_op_bfi:
      emit_percomp(BFI2(result, op[0], op[1], op[2]), instr->dest.write_mask);
      break;

   case nir_op_bitfield_insert:
      unreachable("not reached: should be handled by "
                  "lower_instructions::bitfield_insert_to_bfm_bfi");

   case nir_op_ishl:
      emit_percomp(SHL(result, op[0], op[1]), instr->dest.write_mask);
      break;
   case nir_op_ishr:
      emit_percomp(ASR(result, op[0], op[1]), instr->dest.write_mask);
      break;
   case nir_op_ushr:
      emit_percomp(SHR(result, op[0], op[1]), instr->dest.write_mask);
      break;

   case nir_op_pack_half_2x16_split:
      emit_percomp(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1],
                   instr->dest.write_mask);
      break;

   case nir_op_ffma:
      emit_percomp(MAD(result, op[2], op[1], op[0]), instr->dest.write_mask);
      break;

   case nir_op_flrp:
      /* TODO emulate for gen < 6 */
      emit_percomp(LRP(result, op[2], op[1], op[0]), instr->dest.write_mask);
      break;

   case nir_op_bcsel:
      for (unsigned i = 0; i < 4; i++) {
         if (!((instr->dest.write_mask >> i) & 1))
            continue;

         emit(CMP(reg_null_d, offset(op[0], i), fs_reg(0), BRW_CONDITIONAL_NZ));
         emit(SEL(offset(result, i), offset(op[1], i), offset(op[2], i)))
            ->predicate = BRW_PREDICATE_NORMAL;
      }
      break;

   default:
      unreachable("unhandled instruction");
   }
}

fs_reg
fs_visitor::get_nir_src(nir_src src)
{
   if (src.is_ssa) {
      assert(src.ssa->parent_instr->type == nir_instr_type_load_const);
      nir_load_const_instr *load = nir_instr_as_load_const(src.ssa->parent_instr);
      fs_reg reg = vgrf(src.ssa->num_components);
      reg.type = BRW_REGISTER_TYPE_D;

      for (unsigned i = 0; i < src.ssa->num_components; ++i)
         emit(MOV(offset(reg, i), fs_reg(load->value.i[i])));

      return reg;
   } else {
      fs_reg reg;
      if (src.reg.reg->is_global)
         reg = nir_globals[src.reg.reg->index];
      else
         reg = nir_locals[src.reg.reg->index];

      /* to avoid floating-point denorm flushing problems, set the type by
       * default to D - instructions that need floating point semantics will set
       * this to F if they need to
       */
      reg = retype(offset(reg, src.reg.base_offset), BRW_REGISTER_TYPE_D);
      if (src.reg.indirect) {
         reg.reladdr = new(mem_ctx) fs_reg();
         *reg.reladdr = retype(get_nir_src(*src.reg.indirect),
                               BRW_REGISTER_TYPE_D);
      }

      return reg;
   }
}

fs_reg
fs_visitor::get_nir_alu_src(nir_alu_instr *instr, unsigned src)
{
   fs_reg reg = get_nir_src(instr->src[src].src);

   reg.type = brw_type_for_nir_type(nir_op_infos[instr->op].input_types[src]);
   reg.abs = instr->src[src].abs;
   reg.negate = instr->src[src].negate;

   bool needs_swizzle = false;
   unsigned num_components = 0;
   for (unsigned i = 0; i < 4; i++) {
      if (!nir_alu_instr_channel_used(instr, src, i))
         continue;

      if (instr->src[src].swizzle[i] != i)
         needs_swizzle = true;

      num_components = i + 1;
   }

   if (needs_swizzle) {
      /* resolve the swizzle through MOVs */
      fs_reg new_reg = vgrf(num_components);
      new_reg.type = reg.type;

      for (unsigned i = 0; i < 4; i++) {
         if (!nir_alu_instr_channel_used(instr, src, i))
            continue;

         emit(MOV(offset(new_reg, i),
                  offset(reg, instr->src[src].swizzle[i])));
      }

      return new_reg;
   }

   return reg;
}

fs_reg
fs_visitor::get_nir_dest(nir_dest dest)
{
   fs_reg reg;
   if (dest.reg.reg->is_global)
      reg = nir_globals[dest.reg.reg->index];
   else
      reg = nir_locals[dest.reg.reg->index];

   reg = offset(reg, dest.reg.base_offset);
   if (dest.reg.indirect) {
      reg.reladdr = new(mem_ctx) fs_reg();
      *reg.reladdr = retype(get_nir_src(*dest.reg.indirect),
                            BRW_REGISTER_TYPE_D);
   }

   return reg;
}

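/* Per-component emission helpers. NIR ALU destinations carry a write mask,
 * so each helper below re-emits the instruction once per enabled channel,
 * stepping the destination (and any GRF sources) by one component per
 * iteration. E.g. a write mask of 0x5 emits copies for components 0 and 2
 * only.
 */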
void
fs_visitor::emit_percomp(fs_inst *inst, unsigned wr_mask)
{
   for (unsigned i = 0; i < 4; i++) {
      if (!((wr_mask >> i) & 1))
         continue;

      fs_inst *new_inst = new(mem_ctx) fs_inst(*inst);
      new_inst->dst = offset(new_inst->dst, i);
      for (unsigned j = 0; j < new_inst->sources; j++)
         if (inst->src[j].file == GRF)
            new_inst->src[j] = offset(new_inst->src[j], i);

      emit(new_inst);
   }
}

void
fs_visitor::emit_percomp(enum opcode op, fs_reg dest, fs_reg src0,
                         unsigned wr_mask, bool saturate,
                         enum brw_predicate predicate,
                         enum brw_conditional_mod mod)
{
   for (unsigned i = 0; i < 4; i++) {
      if (!((wr_mask >> i) & 1))
         continue;

      fs_reg new_src0 = src0.file == GRF ? offset(src0, i) : src0;
      fs_inst *new_inst = new(mem_ctx) fs_inst(op, offset(dest, i), new_src0);

      new_inst->predicate = predicate;
      new_inst->conditional_mod = mod;
      new_inst->saturate = saturate;
      emit(new_inst);
   }
}

void
fs_visitor::emit_percomp(enum opcode op, fs_reg dest, fs_reg src0, fs_reg src1,
                         unsigned wr_mask, bool saturate,
                         enum brw_predicate predicate,
                         enum brw_conditional_mod mod)
{
   for (unsigned i = 0; i < 4; i++) {
      if (!((wr_mask >> i) & 1))
         continue;

      fs_reg new_src0 = src0.file == GRF ? offset(src0, i) : src0;
      fs_reg new_src1 = src1.file == GRF ? offset(src1, i) : src1;
      fs_inst *new_inst =
         new(mem_ctx) fs_inst(op, offset(dest, i), new_src0, new_src1);

      new_inst->predicate = predicate;
      new_inst->conditional_mod = mod;
      new_inst->saturate = saturate;
      emit(new_inst);
   }
}

void
fs_visitor::emit_math_percomp(enum opcode op, fs_reg dest, fs_reg src0,
                              unsigned wr_mask, bool saturate)
{
   for (unsigned i = 0; i < 4; i++) {
      if (!((wr_mask >> i) & 1))
         continue;

      fs_reg new_src0 = src0.file == GRF ? offset(src0, i) : src0;
      fs_inst *new_inst = emit_math(op, offset(dest, i), new_src0);
      new_inst->saturate = saturate;
   }
}

void
fs_visitor::emit_math_percomp(enum opcode op, fs_reg dest, fs_reg src0,
                              fs_reg src1, unsigned wr_mask,
                              bool saturate)
{
   for (unsigned i = 0; i < 4; i++) {
      if (!((wr_mask >> i) & 1))
         continue;

      fs_reg new_src0 = src0.file == GRF ? offset(src0, i) : src0;
      fs_reg new_src1 = src1.file == GRF ? offset(src1, i) : src1;
      fs_inst *new_inst = emit_math(op, offset(dest, i), new_src0, new_src1);
      new_inst->saturate = saturate;
   }
}

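/* Fold `op` pairwise across the first num_components components of src into
 * a single value: two components take one op; three combine the first pair
 * with the third; four combine the (0,1) and (2,3) partial results.
 */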
void
fs_visitor::emit_reduction(enum opcode op, fs_reg dest, fs_reg src,
                           unsigned num_components)
{
   fs_reg src0 = src;
   fs_reg src1 = offset(src, 1);

   if (num_components == 2) {
      emit(op, dest, src0, src1);
      return;
   }

   fs_reg temp1 = vgrf(1);
   temp1.type = src.type;
   emit(op, temp1, src0, src1);

   fs_reg src2 = offset(src, 2);

   if (num_components == 3) {
      emit(op, dest, temp1, src2);
      return;
   }

   assert(num_components == 4);

   fs_reg src3 = offset(src, 3);
   fs_reg temp2 = vgrf(1);
   temp2.type = src.type;

   emit(op, temp2, src2, src3);
   emit(op, dest, temp1, temp2);
}

void
fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   bool has_indirect = false;

   switch (instr->intrinsic) {
   case nir_intrinsic_discard: {
      /* We track our discarded pixels in f0.1. By predicating on it, we can
       * update just the flag bits that aren't yet discarded. By emitting a
       * CMP of g0 != g0, all our currently executing channels will get turned
       * off.
       */
      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                      BRW_REGISTER_TYPE_UW));
      fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
                              BRW_CONDITIONAL_NZ));
      cmp->predicate = BRW_PREDICATE_NORMAL;
      cmp->flag_subreg = 1;

      if (brw->gen >= 6) {
         /* For performance, after a discard, jump to the end of the shader.
          * Only jump if all relevant channels have been discarded.
          */
         fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
         discard_jump->flag_subreg = 1;

         discard_jump->predicate = (dispatch_width == 8)
                                   ? BRW_PREDICATE_ALIGN1_ANY8H
                                   : BRW_PREDICATE_ALIGN1_ANY16H;
         discard_jump->predicate_inverse = true;
      }

      break;
   }

   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_dec:
   case nir_intrinsic_atomic_counter_read: {
      unsigned surf_index = prog_data->binding_table.abo_start +
                            (unsigned) instr->const_index[0];
      fs_reg offset = fs_reg(get_nir_src(instr->src[0]));

      switch (instr->intrinsic) {
      case nir_intrinsic_atomic_counter_inc:
         emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset,
                             fs_reg(), fs_reg());
         break;
      case nir_intrinsic_atomic_counter_dec:
         emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset,
                             fs_reg(), fs_reg());
         break;
      case nir_intrinsic_atomic_counter_read:
         emit_untyped_surface_read(surf_index, dest, offset);
         break;
      default:
         unreachable("Unreachable");
      }
      break;
   }

   case nir_intrinsic_load_front_face:
      assert(!"TODO");

   case nir_intrinsic_load_sample_mask_in: {
      fs_reg sample_mask_in = nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
      assert(sample_mask_in.file != BAD_FILE);
      dest.type = sample_mask_in.type;
      emit(MOV(dest, sample_mask_in));
      break;
   }

   case nir_intrinsic_load_sample_pos: {
      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
      assert(sample_pos.file != BAD_FILE);
      dest.type = sample_pos.type;
      emit(MOV(dest, sample_pos));
      emit(MOV(offset(dest, 1), offset(sample_pos, 1)));
      break;
   }

   case nir_intrinsic_load_sample_id: {
      fs_reg sample_id = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
      assert(sample_id.file != BAD_FILE);
      dest.type = sample_id.type;
      emit(MOV(dest, sample_id));
      break;
   }

   case nir_intrinsic_load_uniform_indirect:
      has_indirect = true;
   case nir_intrinsic_load_uniform: {
      unsigned index = 0;
      for (int i = 0; i < instr->const_index[1]; i++) {
         for (unsigned j = 0; j < instr->num_components; j++) {
            fs_reg src = offset(retype(nir_uniforms, dest.type),
                                instr->const_index[0] + index);
            if (has_indirect)
               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
            index++;

            emit(MOV(dest, src));
            dest = offset(dest, 1);
         }
      }
      break;
   }

   case nir_intrinsic_load_ubo_indirect:
      has_indirect = true;
   case nir_intrinsic_load_ubo: {
      nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
      fs_reg surf_index;

      if (const_index) {
         surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
                             const_index->u[0]);
      } else {
         /* The block index is not a constant. Evaluate the index expression
          * per-channel and add the base UBO index; the generator will select
          * a value from any live channel.
          */
         surf_index = vgrf(glsl_type::uint_type);
         emit(ADD(surf_index, get_nir_src(instr->src[0]),
                  fs_reg(stage_prog_data->binding_table.ubo_start)))
            ->force_writemask_all = true;

         /* Assume this may touch any UBO. It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
          */
         brw_mark_surface_used(prog_data,
                               stage_prog_data->binding_table.ubo_start +
                               shader_prog->NumUniformBlocks - 1);
      }

      if (has_indirect) {
         /* Turn the byte offset into a dword offset. */
         fs_reg base_offset = vgrf(glsl_type::int_type);
         emit(SHR(base_offset, retype(get_nir_src(instr->src[1]),
                                      BRW_REGISTER_TYPE_D),
                  fs_reg(2)));

         unsigned vec4_offset = instr->const_index[0] / 4;
         for (int i = 0; i < instr->num_components; i++)
            emit(VARYING_PULL_CONSTANT_LOAD(offset(dest, i), surf_index,
                                            base_offset, vec4_offset + i));
      } else {
         fs_reg packed_consts = vgrf(glsl_type::float_type);
         packed_consts.type = dest.type;

         fs_reg const_offset_reg((unsigned) instr->const_index[0] & ~15);
         emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
              surf_index, const_offset_reg);

         for (unsigned i = 0; i < instr->num_components; i++) {
            packed_consts.set_smear(instr->const_index[0] % 16 / 4 + i);

            /* The std140 packing rules don't allow vectors to cross 16-byte
             * boundaries, and a reg is 32 bytes.
             */
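            /* Worked example with an illustrative offset: const_index[0] of
             * 20 bytes loads the 16-byte-aligned block at byte 16 (20 & ~15)
             * and smears component 20 % 16 / 4 + i, so i = 0 reads the dword
             * at byte offset 20.
             */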
            assert(packed_consts.subreg_offset < 32);

            emit(MOV(dest, packed_consts));
            dest = offset(dest, 1);
         }
      }
      break;
   }

   case nir_intrinsic_load_input_indirect:
      has_indirect = true;
   case nir_intrinsic_load_input: {
      unsigned index = 0;
      for (int i = 0; i < instr->const_index[1]; i++) {
         for (unsigned j = 0; j < instr->num_components; j++) {
            fs_reg src = offset(retype(nir_inputs, dest.type),
                                instr->const_index[0] + index);
            if (has_indirect)
               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
            index++;

            emit(MOV(dest, src));
            dest = offset(dest, 1);
         }
      }
      break;
   }

   /* Handle ARB_gpu_shader5 interpolation intrinsics
    *
    * It's worth a quick word of explanation as to why we handle the full
    * variable-based interpolation intrinsic rather than a lowered version
    * like we do for other inputs. We have to do that because the way we
    * set up inputs doesn't allow us to use the inputs we've already set up
    * for interpolation. At the beginning of the shader, we go through all
    * of the input variables and do the initial interpolation and put it in
    * the nir_inputs array based on its location as determined in
    * nir_lower_io. If the input isn't used, dead code cleans up and
    * everything works fine. However, when we get to the ARB_gpu_shader5
    * interpolation intrinsics, we need to reinterpolate the input
    * differently. If we used an intrinsic that just had an index it would
    * only give us the offset into the nir_inputs array. However, this is
    * useless because that value is post-interpolation and we need
    * pre-interpolation. In order to get the actual location of the bits
    * we get from the vertex fetching hardware, we need the variable.
    */
   case nir_intrinsic_interp_var_at_centroid:
   case nir_intrinsic_interp_var_at_sample:
   case nir_intrinsic_interp_var_at_offset: {
      /* in SIMD16 mode, the pixel interpolator returns coords interleaved
       * 8 channels at a time, same as the barycentric coords presented in
       * the FS payload. this requires a bit of extra work to support.
       */
      no16("interpolate_at_* not yet supported in SIMD16 mode.");

      fs_reg dst_x = vgrf(2);
      fs_reg dst_y = offset(dst_x, 1);

      /* For most messages, we need one reg of ignored data; the hardware
       * requires mlen==1 even when there is no payload. in the per-slot
       * offset case, we'll replace this with the proper source data.
       */
      fs_reg src = vgrf(glsl_type::float_type);
      int mlen = 1; /* one reg unless overridden */
      fs_inst *inst;

      switch (instr->intrinsic) {
      case nir_intrinsic_interp_var_at_centroid:
         inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
         break;

      case nir_intrinsic_interp_var_at_sample: {
         /* XXX: We should probably handle non-constant sample IDs */
         nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
         assert(const_sample);
         unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
         inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src,
                     fs_reg(msg_data));
         break;
      }

      case nir_intrinsic_interp_var_at_offset: {
         nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);

         if (const_offset) {
            unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
            unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;

            inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
                        fs_reg(off_x | (off_y << 4)));
         } else {
            src = vgrf(glsl_type::ivec2_type);
            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
                                       BRW_REGISTER_TYPE_F);
            for (int i = 0; i < 2; i++) {
               fs_reg temp = vgrf(glsl_type::float_type);
               emit(MUL(temp, offset(offset_src, i), fs_reg(16.0f)));
               fs_reg itemp = vgrf(glsl_type::int_type);
               emit(MOV(itemp, temp)); /* float to int */

               /* Clamp the upper end of the range to +7/16.
                * ARB_gpu_shader5 requires that we support a maximum offset
                * of +0.5, which isn't representable in a S0.4 value -- if
                * we didn't clamp it, we'd end up with -8/16, which is the
                * opposite of what the shader author wanted.
                *
                * This is legal due to ARB_gpu_shader5's quantization
                * rules:
                *
                * "Not all values of <offset> may be supported; x and y
                * offsets may be rounded to fixed-point values with the
                * number of fraction bits given by the
                * implementation-dependent constant
                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
                */
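               /* Worked example with an illustrative offset: 0.5 scales to
                * 8, which the SEL below clamps to 7, i.e. 7/16 = 0.4375,
                * the largest value a S0.4 immediate can represent.
                */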
               emit(BRW_OPCODE_SEL, offset(src, i), itemp, fs_reg(7))
                  ->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
            }

            mlen = 2;
            inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
                        fs_reg(0u));
         }
         break;
      }

      default:
         unreachable("Invalid intrinsic");
      }

      inst->mlen = mlen;
      inst->regs_written = 2; /* 2 floats per slot returned */
      inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
                               INTERP_QUALIFIER_NOPERSPECTIVE;

      for (unsigned j = 0; j < instr->num_components; j++) {
         fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
         src.type = dest.type;

         emit(FS_OPCODE_LINTERP, dest, dst_x, dst_y, src);
         dest = offset(dest, 1);
      }
      break;
   }

   case nir_intrinsic_store_output_indirect:
      has_indirect = true;
   case nir_intrinsic_store_output: {
      fs_reg src = get_nir_src(instr->src[0]);
      unsigned index = 0;
      for (int i = 0; i < instr->const_index[1]; i++) {
         for (unsigned j = 0; j < instr->num_components; j++) {
            fs_reg new_dest = offset(retype(nir_outputs, src.type),
                                     instr->const_index[0] + index);
            if (has_indirect)
               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
            index++;
            emit(MOV(new_dest, src));
            src = offset(src, 1);
         }
      }
      break;
   }

   default:
      unreachable("unknown intrinsic");
   }
}

void
fs_visitor::nir_emit_texture(nir_tex_instr *instr)
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   unsigned sampler = instr->sampler_index;
   fs_reg sampler_reg(sampler);

   /* FINISHME: We're failing to recompile our programs when the sampler is
    * updated. This only matters for the texture rectangle scale parameters
    * (pre-gen6, or gen6+ with GL_CLAMP).
    */
   int texunit = prog->SamplerUnits[sampler];

   int gather_component = instr->component;

   bool is_rect = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;

   bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
                        instr->is_array;

   int lod_components = 0, offset_components = 0;

   fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, offset;

   for (unsigned i = 0; i < instr->num_srcs; i++) {
      fs_reg src = get_nir_src(instr->src[i].src);
      switch (instr->src[i].src_type) {
      case nir_tex_src_bias:
         lod = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_comparitor:
         shadow_comparitor = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_coord:
         switch (instr->op) {
         case nir_texop_txf:
         case nir_texop_txf_ms:
            coordinate = retype(src, BRW_REGISTER_TYPE_D);
            break;
         default:
            coordinate = retype(src, BRW_REGISTER_TYPE_F);
            break;
         }
         break;
      case nir_tex_src_ddx:
         lod = retype(src, BRW_REGISTER_TYPE_F);
         lod_components = nir_tex_instr_src_size(instr, i);
         break;
      case nir_tex_src_ddy:
         lod2 = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_lod:
         switch (instr->op) {
         case nir_texop_txs:
            lod = retype(src, BRW_REGISTER_TYPE_UD);
            break;
         case nir_texop_txf:
            lod = retype(src, BRW_REGISTER_TYPE_D);
            break;
         default:
            lod = retype(src, BRW_REGISTER_TYPE_F);
            break;
         }
         break;
      case nir_tex_src_ms_index:
         sample_index = retype(src, BRW_REGISTER_TYPE_UD);
         break;
      case nir_tex_src_offset:
         offset = retype(src, BRW_REGISTER_TYPE_D);
         if (instr->is_array)
            offset_components = instr->coord_components - 1;
         else
            offset_components = instr->coord_components;
         break;
      case nir_tex_src_projector:
         unreachable("should be lowered");

      case nir_tex_src_sampler_offset: {
         /* Figure out the highest possible sampler index and mark it as used */
         uint32_t max_used = sampler + instr->sampler_array_size - 1;
         if (instr->op == nir_texop_tg4 && brw->gen < 8) {
            max_used += stage_prog_data->binding_table.gather_texture_start;
         } else {
            max_used += stage_prog_data->binding_table.texture_start;
         }
         brw_mark_surface_used(prog_data, max_used);

         /* Emit code to evaluate the actual indexing expression */
         sampler_reg = vgrf(glsl_type::uint_type);
         emit(ADD(sampler_reg, src, fs_reg(sampler)))
            ->force_writemask_all = true;
         break;
      }

      default:
         unreachable("unknown texture source");
      }
   }

   if (instr->op == nir_texop_txf_ms) {
      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
         mcs = emit_mcs_fetch(coordinate, instr->coord_components, sampler_reg);
      else
         mcs = fs_reg(0u);
   }

   for (unsigned i = 0; i < 3; i++) {
      if (instr->const_offset[i] != 0) {
         assert(offset_components == 0);
         offset = fs_reg(brw_texture_offset(ctx, instr->const_offset, 3));
         break;
      }
   }

   enum glsl_base_type dest_base_type;
   switch (instr->dest_type) {
   case nir_type_float:
      dest_base_type = GLSL_TYPE_FLOAT;
      break;
   case nir_type_int:
      dest_base_type = GLSL_TYPE_INT;
      break;
   case nir_type_unsigned:
      dest_base_type = GLSL_TYPE_UINT;
      break;
   default:
      unreachable("bad type");
   }

   const glsl_type *dest_type =
      glsl_type::get_instance(dest_base_type, nir_tex_instr_dest_size(instr),
                              1);

   ir_texture_opcode op;
   switch (instr->op) {
   case nir_texop_lod: op = ir_lod; break;
   case nir_texop_query_levels: op = ir_query_levels; break;
   case nir_texop_tex: op = ir_tex; break;
   case nir_texop_tg4: op = ir_tg4; break;
   case nir_texop_txb: op = ir_txb; break;
   case nir_texop_txd: op = ir_txd; break;
   case nir_texop_txf: op = ir_txf; break;
   case nir_texop_txf_ms: op = ir_txf_ms; break;
   case nir_texop_txl: op = ir_txl; break;
   case nir_texop_txs: op = ir_txs; break;
   default:
      unreachable("unknown texture opcode");
   }

   emit_texture(op, dest_type, coordinate, instr->coord_components,
                shadow_comparitor, lod, lod2, lod_components, sample_index,
                offset, offset_components, mcs, gather_component,
                is_cube_array, is_rect, sampler, sampler_reg, texunit);

   fs_reg dest = get_nir_dest(instr->dest);
   dest.type = this->result.type;
   unsigned num_components = nir_tex_instr_dest_size(instr);
   emit_percomp(MOV(dest, this->result), (1 << num_components) - 1);
}

void
fs_visitor::nir_emit_jump(nir_jump_instr *instr)
{
   switch (instr->type) {
   case nir_jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case nir_jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   case nir_jump_return:
   default:
      unreachable("unknown jump");
   }
}