i965: only try to print GLSL IR once when using INTEL_DEBUG to dump IR
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_nir.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/glsl/ir.h"
25 #include "brw_fs.h"
26 #include "brw_fs_surface_builder.h"
27 #include "brw_nir.h"
28 #include "brw_program.h"
29
30 using namespace brw;
31 using namespace brw::surface_access;
32
33 void
34 fs_visitor::emit_nir_code()
35 {
36 /* emit the arrays used for inputs and outputs - load/store intrinsics will
37 * be converted to reads/writes of these arrays
38 */
39 nir_setup_outputs();
40 nir_setup_uniforms();
41 nir_emit_system_values();
42
43 /* get the main function and emit it */
44 nir_foreach_function(function, nir) {
45 assert(strcmp(function->name, "main") == 0);
46 assert(function->impl);
47 nir_emit_impl(function->impl);
48 }
49 }
50
51 void
52 fs_visitor::nir_setup_outputs()
53 {
54 if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
55 return;
56
57 nir_foreach_variable(var, &nir->outputs) {
58 const unsigned vec4s = type_size_vec4(var->type);
59 fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
60 for (unsigned i = 0; i < vec4s; i++) {
61 if (outputs[var->data.driver_location + i].file == BAD_FILE)
62 outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
63 }
64 }
65 }
66
67 void
68 fs_visitor::nir_setup_uniforms()
69 {
70 if (dispatch_width != min_dispatch_width)
71 return;
72
73 uniforms = nir->num_uniforms / 4;
74 }
75
76 static bool
77 emit_system_values_block(nir_block *block, fs_visitor *v)
78 {
79 fs_reg *reg;
80
81 nir_foreach_instr(instr, block) {
82 if (instr->type != nir_instr_type_intrinsic)
83 continue;
84
85 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
86 switch (intrin->intrinsic) {
87 case nir_intrinsic_load_vertex_id:
88 unreachable("should be lowered by lower_vertex_id().");
89
90 case nir_intrinsic_load_vertex_id_zero_base:
91 assert(v->stage == MESA_SHADER_VERTEX);
92 reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
93 if (reg->file == BAD_FILE)
94 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
95 break;
96
97 case nir_intrinsic_load_base_vertex:
98 assert(v->stage == MESA_SHADER_VERTEX);
99 reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
100 if (reg->file == BAD_FILE)
101 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
102 break;
103
104 case nir_intrinsic_load_instance_id:
105 assert(v->stage == MESA_SHADER_VERTEX);
106 reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
107 if (reg->file == BAD_FILE)
108 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
109 break;
110
111 case nir_intrinsic_load_base_instance:
112 assert(v->stage == MESA_SHADER_VERTEX);
113 reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
114 if (reg->file == BAD_FILE)
115 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
116 break;
117
118 case nir_intrinsic_load_draw_id:
119 assert(v->stage == MESA_SHADER_VERTEX);
120 reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
121 if (reg->file == BAD_FILE)
122 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
123 break;
124
125 case nir_intrinsic_load_invocation_id:
126 if (v->stage == MESA_SHADER_TESS_CTRL)
127 break;
128 assert(v->stage == MESA_SHADER_GEOMETRY);
129 reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
130 if (reg->file == BAD_FILE) {
131 const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
132 fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
133 fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
134 abld.SHR(iid, g1, brw_imm_ud(27u));
135 *reg = iid;
136 }
137 break;
138
139 case nir_intrinsic_load_sample_pos:
140 assert(v->stage == MESA_SHADER_FRAGMENT);
141 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
142 if (reg->file == BAD_FILE)
143 *reg = *v->emit_samplepos_setup();
144 break;
145
146 case nir_intrinsic_load_sample_id:
147 assert(v->stage == MESA_SHADER_FRAGMENT);
148 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
149 if (reg->file == BAD_FILE)
150 *reg = *v->emit_sampleid_setup();
151 break;
152
153 case nir_intrinsic_load_sample_mask_in:
154 assert(v->stage == MESA_SHADER_FRAGMENT);
155 assert(v->devinfo->gen >= 7);
156 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
157 if (reg->file == BAD_FILE)
158 *reg = *v->emit_samplemaskin_setup();
159 break;
160
161 case nir_intrinsic_load_work_group_id:
162 assert(v->stage == MESA_SHADER_COMPUTE);
163 reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
164 if (reg->file == BAD_FILE)
165 *reg = *v->emit_cs_work_group_id_setup();
166 break;
167
168 case nir_intrinsic_load_helper_invocation:
169 assert(v->stage == MESA_SHADER_FRAGMENT);
170 reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
171 if (reg->file == BAD_FILE) {
172 const fs_builder abld =
173 v->bld.annotate("gl_HelperInvocation", NULL);
174
175 /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
176 * pixel mask is in g1.7 of the thread payload.
177 *
178 * We move the per-channel pixel enable bit to the low bit of each
179 * channel by shifting the byte containing the pixel mask by the
180 * vector immediate 0x76543210UV.
181 *
182 * The region of <1,8,0> reads only 1 byte (the pixel masks for
183 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
184 * masks for 2 and 3) in SIMD16.
185 */
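/* For example, if the SIMD8 pixel mask byte is 0b00001111 (only the
 * first four channels covered), channel 5 computes 0x0f >> 5 == 0 here,
 * so its low bit is 0; after the inversion and AND below this resolves
 * to ~0 (true), i.e. channel 5 is a helper invocation.
 */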
186 fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
187 abld.SHR(shifted,
188 stride(byte_offset(retype(brw_vec1_grf(1, 0),
189 BRW_REGISTER_TYPE_UB), 28),
190 1, 8, 0),
191 brw_imm_v(0x76543210));
192
193 /* A set bit in the pixel mask means the channel is enabled, but
194 * that is the opposite of gl_HelperInvocation so we need to invert
195 * the mask.
196 *
197 * The negate source-modifier bit of logical instructions on Gen8+
198 * performs 1's complement negation, so we can use that instead of
199 * a NOT instruction.
200 */
201 fs_reg inverted = negate(shifted);
202 if (v->devinfo->gen < 8) {
203 inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
204 abld.NOT(inverted, shifted);
205 }
206
207 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
208 * with 1 and negating.
209 */
210 fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
211 abld.AND(anded, inverted, brw_imm_uw(1));
212
213 fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
214 abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
215 *reg = dst;
216 }
217 break;
218
219 default:
220 break;
221 }
222 }
223
224 return true;
225 }
226
227 void
228 fs_visitor::nir_emit_system_values()
229 {
230 nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
231 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
232 nir_system_values[i] = fs_reg();
233 }
234
235 nir_foreach_function(function, nir) {
236 assert(strcmp(function->name, "main") == 0);
237 assert(function->impl);
238 nir_foreach_block(block, function->impl) {
239 emit_system_values_block(block, this);
240 }
241 }
242 }
243
244 void
245 fs_visitor::nir_emit_impl(nir_function_impl *impl)
246 {
247 nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
248 for (unsigned i = 0; i < impl->reg_alloc; i++) {
249 nir_locals[i] = fs_reg();
250 }
251
252 foreach_list_typed(nir_register, reg, node, &impl->registers) {
253 unsigned array_elems =
254 reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
255 unsigned size = array_elems * reg->num_components;
256 const brw_reg_type reg_type =
257 reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
258 nir_locals[reg->index] = bld.vgrf(reg_type, size);
259 }
260
261 nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
262 impl->ssa_alloc);
263
264 nir_emit_cf_list(&impl->body);
265 }
266
267 void
268 fs_visitor::nir_emit_cf_list(exec_list *list)
269 {
270 exec_list_validate(list);
271 foreach_list_typed(nir_cf_node, node, node, list) {
272 switch (node->type) {
273 case nir_cf_node_if:
274 nir_emit_if(nir_cf_node_as_if(node));
275 break;
276
277 case nir_cf_node_loop:
278 nir_emit_loop(nir_cf_node_as_loop(node));
279 break;
280
281 case nir_cf_node_block:
282 nir_emit_block(nir_cf_node_as_block(node));
283 break;
284
285 default:
286 unreachable("Invalid CFG node block");
287 }
288 }
289 }
290
291 void
292 fs_visitor::nir_emit_if(nir_if *if_stmt)
293 {
294 /* first, put the condition into f0 */
295 fs_inst *inst = bld.MOV(bld.null_reg_d(),
296 retype(get_nir_src(if_stmt->condition),
297 BRW_REGISTER_TYPE_D));
298 inst->conditional_mod = BRW_CONDITIONAL_NZ;
299
300 bld.IF(BRW_PREDICATE_NORMAL);
301
302 nir_emit_cf_list(&if_stmt->then_list);
303
304 /* note: if the else is empty, dead CF elimination will remove it */
305 bld.emit(BRW_OPCODE_ELSE);
306
307 nir_emit_cf_list(&if_stmt->else_list);
308
309 bld.emit(BRW_OPCODE_ENDIF);
310 }
311
312 void
313 fs_visitor::nir_emit_loop(nir_loop *loop)
314 {
315 bld.emit(BRW_OPCODE_DO);
316
317 nir_emit_cf_list(&loop->body);
318
319 bld.emit(BRW_OPCODE_WHILE);
320 }
321
322 void
323 fs_visitor::nir_emit_block(nir_block *block)
324 {
325 nir_foreach_instr(instr, block) {
326 nir_emit_instr(instr);
327 }
328 }
329
330 void
331 fs_visitor::nir_emit_instr(nir_instr *instr)
332 {
333 const fs_builder abld = bld.annotate(NULL, instr);
334
335 switch (instr->type) {
336 case nir_instr_type_alu:
337 nir_emit_alu(abld, nir_instr_as_alu(instr));
338 break;
339
340 case nir_instr_type_intrinsic:
341 switch (stage) {
342 case MESA_SHADER_VERTEX:
343 nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
344 break;
345 case MESA_SHADER_TESS_CTRL:
346 nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
347 break;
348 case MESA_SHADER_TESS_EVAL:
349 nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
350 break;
351 case MESA_SHADER_GEOMETRY:
352 nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
353 break;
354 case MESA_SHADER_FRAGMENT:
355 nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
356 break;
357 case MESA_SHADER_COMPUTE:
358 nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
359 break;
360 default:
361 unreachable("unsupported shader stage");
362 }
363 break;
364
365 case nir_instr_type_tex:
366 nir_emit_texture(abld, nir_instr_as_tex(instr));
367 break;
368
369 case nir_instr_type_load_const:
370 nir_emit_load_const(abld, nir_instr_as_load_const(instr));
371 break;
372
373 case nir_instr_type_ssa_undef:
374 /* We create a new VGRF for undefs on every use (by handling
375 * them in get_nir_src()), rather than for each definition.
376 * This helps register coalescing eliminate MOVs from undef.
377 */
378 break;
379
380 case nir_instr_type_jump:
381 nir_emit_jump(abld, nir_instr_as_jump(instr));
382 break;
383
384 default:
385 unreachable("unknown instruction type");
386 }
387 }
388
389 /**
390 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
391 * match instr.
392 */
393 bool
394 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
395 const fs_reg &result)
396 {
397 if (!instr->src[0].src.is_ssa ||
398 !instr->src[0].src.ssa->parent_instr)
399 return false;
400
401 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
402 return false;
403
404 nir_alu_instr *src0 =
405 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
406
407 if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
408 src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
409 return false;
410
411 nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
412 assert(element != NULL);
413
414 /* Element type to extract. */
415 const brw_reg_type type = brw_int_type(
416 src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
417 src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
418
419 fs_reg op0 = get_nir_src(src0->src[0].src);
420 op0.type = brw_type_for_nir_type(
421 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
422 nir_src_bit_size(src0->src[0].src)));
423 op0 = offset(op0, bld, src0->src[0].swizzle[0]);
424
425 set_saturate(instr->dest.saturate,
426 bld.MOV(result, subscript(op0, type, element->u32[0])));
427 return true;
428 }
429
430 bool
431 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
432 const fs_reg &result)
433 {
434 if (!instr->src[0].src.is_ssa ||
435 instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
436 return false;
437
438 nir_intrinsic_instr *src0 =
439 nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
440
441 if (src0->intrinsic != nir_intrinsic_load_front_face)
442 return false;
443
444 nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
445 if (!value1 || fabsf(value1->f32[0]) != 1.0f)
446 return false;
447
448 nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
449 if (!value2 || fabsf(value2->f32[0]) != 1.0f)
450 return false;
451
452 fs_reg tmp = vgrf(glsl_type::int_type);
453
454 if (devinfo->gen >= 6) {
455 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
456 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
457
458 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
459 *
460 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
461 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
462 *
463 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
464 *
465 * This negation looks like it's safe in practice, because bits 0:4 will
466 * surely be TRIANGLES
467 */
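/* Concretely: the OR below puts (g0.0 | 0x3f80) in the high word of tmp,
 * so bit 15 of g0.0 lands in bit 31 of the dword. ANDing with 0xbf800000
 * keeps only the sign bit and the exponent bits of 1.0, giving
 * 0x3f800000 (1.0f) for front-facing and 0xbf800000 (-1.0f) otherwise.
 */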
468
469 if (value1->f32[0] == -1.0f) {
470 g0.negate = true;
471 }
472
473 bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
474 g0, brw_imm_uw(0x3f80));
475 } else {
476 /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
477 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
478
479 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
480 *
481 * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
482 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
483 *
484 * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
485 *
486 * This negation looks like it's safe in practice, because bits 0:4 will
487 * surely be TRIANGLES
488 */
489
490 if (value1->f32[0] == -1.0f) {
491 g1_6.negate = true;
492 }
493
494 bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
495 }
496 bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
497
498 return true;
499 }
500
501 static void
502 emit_find_msb_using_lzd(const fs_builder &bld,
503 const fs_reg &result,
504 const fs_reg &src,
505 bool is_signed)
506 {
507 fs_inst *inst;
508 fs_reg temp = src;
509
510 if (is_signed) {
511 /* LZD of an absolute value source almost always does the right
512 * thing. There are two problem values:
513 *
514 * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns
515 * 0. However, findMSB(int(0x80000000)) == 30.
516 *
517 * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns
518 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
519 *
520 * For a value of zero or negative one, -1 will be returned.
521 *
522 * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but
523 * findMSB(-(1<<x)) should return x-1.
524 *
525 * For all negative number cases, including 0x80000000 and
526 * 0xffffffff, the correct value is obtained from LZD if instead of
527 * negating the (already negative) value the logical-not is used. A
528 * conditional logical-not can be achieved in two instructions.
529 */
530 temp = bld.vgrf(BRW_REGISTER_TYPE_D);
531
532 bld.ASR(temp, src, brw_imm_d(31));
533 bld.XOR(temp, temp, src);
534 }
535
536 bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
537 retype(temp, BRW_REGISTER_TYPE_UD));
538
539 /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
540 * from the LSB side. Subtract the result from 31 to convert the MSB
541 * count into an LSB count. If no bits are set, LZD will return 32.
542 * 31-32 = -1, which is exactly what findMSB() is supposed to return.
543 */
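/* For example, findMSB(0x100): LZD returns 23 leading zeros, and
 * 31 - 23 == 8, the index of the set bit. For an input of 0, LZD
 * returns 32 and 31 - 32 == -1, as required.
 */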
544 inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
545 inst->src[0].negate = true;
546 }
547
548 void
549 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
550 {
551 struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
552 fs_inst *inst;
553
554 fs_reg result = get_nir_dest(instr->dest.dest);
555 result.type = brw_type_for_nir_type(
556 (nir_alu_type)(nir_op_infos[instr->op].output_type |
557 nir_dest_bit_size(instr->dest.dest)));
558
559 fs_reg op[4];
560 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
561 op[i] = get_nir_src(instr->src[i].src);
562 op[i].type = brw_type_for_nir_type(
563 (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
564 nir_src_bit_size(instr->src[i].src)));
565 op[i].abs = instr->src[i].abs;
566 op[i].negate = instr->src[i].negate;
567 }
568
569 /* We get a bunch of mov's out of the from_ssa pass and they may still
570 * be vectorized. We'll handle them as a special-case. We'll also
571 * handle vecN here because it's basically the same thing.
572 */
573 switch (instr->op) {
574 case nir_op_imov:
575 case nir_op_fmov:
576 case nir_op_vec2:
577 case nir_op_vec3:
578 case nir_op_vec4: {
579 fs_reg temp = result;
580 bool need_extra_copy = false;
581 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
582 if (!instr->src[i].src.is_ssa &&
583 instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
584 need_extra_copy = true;
585 temp = bld.vgrf(result.type, 4);
586 break;
587 }
588 }
589
590 for (unsigned i = 0; i < 4; i++) {
591 if (!(instr->dest.write_mask & (1 << i)))
592 continue;
593
594 if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
595 inst = bld.MOV(offset(temp, bld, i),
596 offset(op[0], bld, instr->src[0].swizzle[i]));
597 } else {
598 inst = bld.MOV(offset(temp, bld, i),
599 offset(op[i], bld, instr->src[i].swizzle[0]));
600 }
601 inst->saturate = instr->dest.saturate;
602 }
603
604 /* In this case the source and destination registers were the same,
605 * so we need to insert an extra set of moves in order to deal with
606 * any swizzling.
607 */
608 if (need_extra_copy) {
609 for (unsigned i = 0; i < 4; i++) {
610 if (!(instr->dest.write_mask & (1 << i)))
611 continue;
612
613 bld.MOV(offset(result, bld, i), offset(temp, bld, i));
614 }
615 }
616 return;
617 }
618 default:
619 break;
620 }
621
622 /* At this point, we have dealt with any instruction that operates on
623 * more than a single channel. Therefore, we can just adjust the source
624 * and destination registers for that channel and emit the instruction.
625 */
626 unsigned channel = 0;
627 if (nir_op_infos[instr->op].output_size == 0) {
628 /* Since NIR is doing the scalarizing for us, we should only ever see
629 * vectorized operations with a single channel.
630 */
631 assert(_mesa_bitcount(instr->dest.write_mask) == 1);
632 channel = ffs(instr->dest.write_mask) - 1;
633
634 result = offset(result, bld, channel);
635 }
636
637 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
638 assert(nir_op_infos[instr->op].input_sizes[i] < 2);
639 op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
640 }
641
642 switch (instr->op) {
643 case nir_op_i2f:
644 case nir_op_u2f:
645 if (optimize_extract_to_float(instr, result))
646 return;
647 inst = bld.MOV(result, op[0]);
648 inst->saturate = instr->dest.saturate;
649 break;
650
651 case nir_op_f2d:
652 case nir_op_i2d:
653 case nir_op_u2d:
654 /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
655 *
656 * "When source or destination is 64b (...), regioning in Align1
657 * must follow these rules:
658 *
659 * 1. Source and destination horizontal stride must be aligned to
660 * the same qword.
661 * (...)"
662 *
663 * This means that 32-bit to 64-bit conversions need to have the 32-bit
664 * data elements aligned to 64-bit. This restriction does not apply to
665 * BDW and later.
666 */
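/* The temporary below is allocated with the (64-bit) destination type
 * and then viewed as 32-bit elements via subscript(), so the
 * intermediate MOV lands each 32-bit source element on a 64-bit-aligned
 * slot before the final conversion MOV.
 */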
667 if (devinfo->is_cherryview || devinfo->is_broxton) {
668 fs_reg tmp = bld.vgrf(result.type, 1);
669 tmp = subscript(tmp, op[0].type, 0);
670 inst = bld.MOV(tmp, op[0]);
671 inst = bld.MOV(result, tmp);
672 inst->saturate = instr->dest.saturate;
673 break;
674 }
675 /* fallthrough */
676 case nir_op_d2f:
677 case nir_op_d2i:
678 case nir_op_d2u:
679 inst = bld.MOV(result, op[0]);
680 inst->saturate = instr->dest.saturate;
681 break;
682
683 case nir_op_f2i:
684 case nir_op_f2u:
685 bld.MOV(result, op[0]);
686 break;
687
688 case nir_op_fsign: {
689 if (type_sz(op[0].type) < 8) {
690 /* AND(val, 0x80000000) gives the sign bit.
691 *
692 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
693 * zero.
694 */
695 bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
696
697 fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
698 op[0].type = BRW_REGISTER_TYPE_UD;
699 result.type = BRW_REGISTER_TYPE_UD;
700 bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
701
702 inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
703 inst->predicate = BRW_PREDICATE_NORMAL;
704 if (instr->dest.saturate) {
705 inst = bld.MOV(result, result);
706 inst->saturate = true;
707 }
708 } else {
709 /* For doubles we do the same but we need to consider:
710 *
711 * - 2-src instructions can't operate with 64-bit immediates
712 * - The sign is encoded in the high 32-bit of each DF
713 * - CMP with DF requires special handling in SIMD16
714 * - We need to produce a DF result.
715 */
716
717 /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
718 * a register and compare with that.
719 */
720 fs_reg tmp = vgrf(glsl_type::double_type);
721 bld.MOV(tmp, setup_imm_df(bld, 0.0));
722
723 /* A direct DF CMP using the flag register (null dst) won't work in
724 * SIMD16 because the CMP will be split in two by lower_simd_width,
725 * resulting in two CMP instructions with the same dst (NULL),
726 * leading to dead code elimination of the first one. In SIMD8,
727 * however, there is no need to split the CMP and we can save some
728 * work.
729 */
730 fs_reg dst_tmp = vgrf(glsl_type::double_type);
731 bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
732
733 /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
734 * so we store the result of the comparison in a vgrf instead and
735 * then we generate a UD comparison from that that won't have to
736 * be split by lower_simd_width. This is what NIR does to handle
737 * double comparisons in the general case.
738 */
739 if (bld.dispatch_width() == 16) {
740 fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
741 bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
742 bld.CMP(bld.null_reg_ud(),
743 dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
744 }
745
746 /* Get the high 32-bit of each double component where the sign is */
747 fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
748 bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
749
750 /* Get the sign bit */
751 bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
752
753 /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
754 inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
755 inst->predicate = BRW_PREDICATE_NORMAL;
756
757 /* Convert from 32-bit float to 64-bit double */
758 result.type = BRW_REGISTER_TYPE_DF;
759 inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
760
761 if (instr->dest.saturate) {
762 inst = bld.MOV(result, result);
763 inst->saturate = true;
764 }
765 }
766 break;
767 }
768
769 case nir_op_isign:
770 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
771 * -> non-negative val generates 0x00000000.
772 * Predicated OR sets 1 if val is positive.
773 */
774 assert(nir_dest_bit_size(instr->dest.dest) < 64);
775 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
776 bld.ASR(result, op[0], brw_imm_d(31));
777 inst = bld.OR(result, result, brw_imm_d(1));
778 inst->predicate = BRW_PREDICATE_NORMAL;
779 break;
780
781 case nir_op_frcp:
782 inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
783 inst->saturate = instr->dest.saturate;
784 break;
785
786 case nir_op_fexp2:
787 inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
788 inst->saturate = instr->dest.saturate;
789 break;
790
791 case nir_op_flog2:
792 inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
793 inst->saturate = instr->dest.saturate;
794 break;
795
796 case nir_op_fsin:
797 inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
798 inst->saturate = instr->dest.saturate;
799 break;
800
801 case nir_op_fcos:
802 inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
803 inst->saturate = instr->dest.saturate;
804 break;
805
806 case nir_op_fddx:
807 if (fs_key->high_quality_derivatives) {
808 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
809 } else {
810 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
811 }
812 inst->saturate = instr->dest.saturate;
813 break;
814 case nir_op_fddx_fine:
815 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
816 inst->saturate = instr->dest.saturate;
817 break;
818 case nir_op_fddx_coarse:
819 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
820 inst->saturate = instr->dest.saturate;
821 break;
822 case nir_op_fddy:
823 if (fs_key->high_quality_derivatives) {
824 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
825 } else {
826 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
827 }
828 inst->saturate = instr->dest.saturate;
829 break;
830 case nir_op_fddy_fine:
831 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
832 inst->saturate = instr->dest.saturate;
833 break;
834 case nir_op_fddy_coarse:
835 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
836 inst->saturate = instr->dest.saturate;
837 break;
838
839 case nir_op_iadd:
840 assert(nir_dest_bit_size(instr->dest.dest) < 64);
841 case nir_op_fadd:
842 inst = bld.ADD(result, op[0], op[1]);
843 inst->saturate = instr->dest.saturate;
844 break;
845
846 case nir_op_fmul:
847 inst = bld.MUL(result, op[0], op[1]);
848 inst->saturate = instr->dest.saturate;
849 break;
850
851 case nir_op_imul:
852 assert(nir_dest_bit_size(instr->dest.dest) < 64);
853 bld.MUL(result, op[0], op[1]);
854 break;
855
856 case nir_op_imul_high:
857 case nir_op_umul_high:
858 assert(nir_dest_bit_size(instr->dest.dest) < 64);
859 bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
860 break;
861
862 case nir_op_idiv:
863 case nir_op_udiv:
864 assert(nir_dest_bit_size(instr->dest.dest) < 64);
865 bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
866 break;
867
868 case nir_op_uadd_carry:
869 unreachable("Should have been lowered by carry_to_arith().");
870
871 case nir_op_usub_borrow:
872 unreachable("Should have been lowered by borrow_to_arith().");
873
874 case nir_op_umod:
875 case nir_op_irem:
876 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
877 * appears that our hardware just does the right thing for signed
878 * remainder.
879 */
880 assert(nir_dest_bit_size(instr->dest.dest) < 64);
881 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
882 break;
883
884 case nir_op_imod: {
885 /* Get a regular C-style remainder. If a % b != 0, set the predicate. */
886 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
887
888 /* Math instructions don't support conditional mod */
889 inst = bld.MOV(bld.null_reg_d(), result);
890 inst->conditional_mod = BRW_CONDITIONAL_NZ;
891
892 /* Now, we need to determine if signs of the sources are different.
893 * When we XOR the sources, the top bit is 0 if they are the same and 1
894 * if they are different. We can then use a conditional modifier to
895 * turn that into a predicate. This leads us to an XOR.l instruction.
896 *
897 * Technically, according to the PRM, you're not allowed to use .l on a
898 * XOR instruction. However, empirical experiments and Curro's reading
899 * of the simulator source both indicate that it's safe.
900 */
901 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
902 inst = bld.XOR(tmp, op[0], op[1]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904 inst->conditional_mod = BRW_CONDITIONAL_L;
905
906 /* If the result of the initial remainder operation is non-zero and the
907 * two sources have different signs, add in a copy of op[1] to get the
908 * final integer modulus value.
909 */
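/* Worked example: imod(-7, 3). The C-style remainder is -1, the signs
 * differ and the remainder is non-zero, so the predicated ADD fires and
 * yields -1 + 3 == 2, matching the GLSL definition of imod.
 */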
910 inst = bld.ADD(result, result, op[1]);
911 inst->predicate = BRW_PREDICATE_NORMAL;
912 break;
913 }
914
915 case nir_op_flt:
916 case nir_op_fge:
917 case nir_op_feq:
918 case nir_op_fne: {
919 fs_reg dest = result;
920 if (nir_src_bit_size(instr->src[0].src) > 32) {
921 dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
922 }
923 brw_conditional_mod cond;
924 switch (instr->op) {
925 case nir_op_flt:
926 cond = BRW_CONDITIONAL_L;
927 break;
928 case nir_op_fge:
929 cond = BRW_CONDITIONAL_GE;
930 break;
931 case nir_op_feq:
932 cond = BRW_CONDITIONAL_Z;
933 break;
934 case nir_op_fne:
935 cond = BRW_CONDITIONAL_NZ;
936 break;
937 default:
938 unreachable("bad opcode");
939 }
940 bld.CMP(dest, op[0], op[1], cond);
941 if (nir_src_bit_size(instr->src[0].src) > 32) {
942 bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
943 }
944 break;
945 }
946
947 case nir_op_ilt:
948 case nir_op_ult:
949 assert(nir_dest_bit_size(instr->dest.dest) < 64);
950 bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
951 break;
952
953 case nir_op_ige:
954 case nir_op_uge:
955 assert(nir_dest_bit_size(instr->dest.dest) < 64);
956 bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_GE);
957 break;
958
959 case nir_op_ieq:
960 assert(nir_dest_bit_size(instr->dest.dest) < 64);
961 bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_Z);
962 break;
963
964 case nir_op_ine:
965 assert(nir_dest_bit_size(instr->dest.dest) < 64);
966 bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ);
967 break;
968
969 case nir_op_inot:
970 assert(nir_dest_bit_size(instr->dest.dest) < 64);
971 if (devinfo->gen >= 8) {
972 op[0] = resolve_source_modifiers(op[0]);
973 }
974 bld.NOT(result, op[0]);
975 break;
976 case nir_op_ixor:
977 assert(nir_dest_bit_size(instr->dest.dest) < 64);
978 if (devinfo->gen >= 8) {
979 op[0] = resolve_source_modifiers(op[0]);
980 op[1] = resolve_source_modifiers(op[1]);
981 }
982 bld.XOR(result, op[0], op[1]);
983 break;
984 case nir_op_ior:
985 assert(nir_dest_bit_size(instr->dest.dest) < 64);
986 if (devinfo->gen >= 8) {
987 op[0] = resolve_source_modifiers(op[0]);
988 op[1] = resolve_source_modifiers(op[1]);
989 }
990 bld.OR(result, op[0], op[1]);
991 break;
992 case nir_op_iand:
993 assert(nir_dest_bit_size(instr->dest.dest) < 64);
994 if (devinfo->gen >= 8) {
995 op[0] = resolve_source_modifiers(op[0]);
996 op[1] = resolve_source_modifiers(op[1]);
997 }
998 bld.AND(result, op[0], op[1]);
999 break;
1000
1001 case nir_op_fdot2:
1002 case nir_op_fdot3:
1003 case nir_op_fdot4:
1004 case nir_op_ball_fequal2:
1005 case nir_op_ball_iequal2:
1006 case nir_op_ball_fequal3:
1007 case nir_op_ball_iequal3:
1008 case nir_op_ball_fequal4:
1009 case nir_op_ball_iequal4:
1010 case nir_op_bany_fnequal2:
1011 case nir_op_bany_inequal2:
1012 case nir_op_bany_fnequal3:
1013 case nir_op_bany_inequal3:
1014 case nir_op_bany_fnequal4:
1015 case nir_op_bany_inequal4:
1016 unreachable("Lowered by nir_lower_alu_reductions");
1017
1018 case nir_op_fnoise1_1:
1019 case nir_op_fnoise1_2:
1020 case nir_op_fnoise1_3:
1021 case nir_op_fnoise1_4:
1022 case nir_op_fnoise2_1:
1023 case nir_op_fnoise2_2:
1024 case nir_op_fnoise2_3:
1025 case nir_op_fnoise2_4:
1026 case nir_op_fnoise3_1:
1027 case nir_op_fnoise3_2:
1028 case nir_op_fnoise3_3:
1029 case nir_op_fnoise3_4:
1030 case nir_op_fnoise4_1:
1031 case nir_op_fnoise4_2:
1032 case nir_op_fnoise4_3:
1033 case nir_op_fnoise4_4:
1034 unreachable("not reached: should be handled by lower_noise");
1035
1036 case nir_op_ldexp:
1037 unreachable("not reached: should be handled by ldexp_to_arith()");
1038
1039 case nir_op_fsqrt:
1040 inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1041 inst->saturate = instr->dest.saturate;
1042 break;
1043
1044 case nir_op_frsq:
1045 inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1046 inst->saturate = instr->dest.saturate;
1047 break;
1048
1049 case nir_op_b2i:
1050 case nir_op_b2f:
1051 bld.MOV(result, negate(op[0]));
1052 break;
1053
1054 case nir_op_f2b:
1055 bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
1056 break;
1057 case nir_op_d2b: {
1058 /* two-argument instructions can't take 64-bit immediates */
1059 fs_reg zero = vgrf(glsl_type::double_type);
1060 bld.MOV(zero, setup_imm_df(bld, 0.0));
1061 /* A SIMD16 execution needs to be split in two instructions, so use
1062 * a vgrf instead of the flag register as dst so instruction splitting
1063 * works
1064 */
1065 fs_reg tmp = vgrf(glsl_type::double_type);
1066 bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1067 bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1068 break;
1069 }
1070 case nir_op_i2b:
1071 bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1072 break;
1073
1074 case nir_op_ftrunc:
1075 inst = bld.RNDZ(result, op[0]);
1076 inst->saturate = instr->dest.saturate;
1077 break;
1078
1079 case nir_op_fceil: {
1080 op[0].negate = !op[0].negate;
1081 fs_reg temp = vgrf(glsl_type::float_type);
1082 bld.RNDD(temp, op[0]);
1083 temp.negate = true;
1084 inst = bld.MOV(result, temp);
1085 inst->saturate = instr->dest.saturate;
1086 break;
1087 }
1088 case nir_op_ffloor:
1089 inst = bld.RNDD(result, op[0]);
1090 inst->saturate = instr->dest.saturate;
1091 break;
1092 case nir_op_ffract:
1093 inst = bld.FRC(result, op[0]);
1094 inst->saturate = instr->dest.saturate;
1095 break;
1096 case nir_op_fround_even:
1097 inst = bld.RNDE(result, op[0]);
1098 inst->saturate = instr->dest.saturate;
1099 break;
1100
1101 case nir_op_fquantize2f16: {
1102 fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1103 fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1104 fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1105
1106 /* The destination stride must be at least as big as the source stride. */
1107 tmp16.type = BRW_REGISTER_TYPE_W;
1108 tmp16.stride = 2;
1109
1110 /* Check for denormal */
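/* ldexpf(1.0, -14) is 2^-14, the smallest normalized half-float
 * magnitude; anything smaller in absolute value gets flushed to the
 * signed zero computed below.
 */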
1111 fs_reg abs_src0 = op[0];
1112 abs_src0.abs = true;
1113 bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1114 BRW_CONDITIONAL_L);
1115 /* Get the appropriately signed zero */
1116 bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1117 retype(op[0], BRW_REGISTER_TYPE_UD),
1118 brw_imm_ud(0x80000000));
1119 /* Do the actual F32 -> F16 -> F32 conversion */
1120 bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1121 bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1122 /* Select that or zero based on normal status */
1123 inst = bld.SEL(result, zero, tmp32);
1124 inst->predicate = BRW_PREDICATE_NORMAL;
1125 inst->saturate = instr->dest.saturate;
1126 break;
1127 }
1128
1129 case nir_op_imin:
1130 case nir_op_umin:
1131 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1132 case nir_op_fmin:
1133 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1134 inst->saturate = instr->dest.saturate;
1135 break;
1136
1137 case nir_op_imax:
1138 case nir_op_umax:
1139 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1140 case nir_op_fmax:
1141 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1142 inst->saturate = instr->dest.saturate;
1143 break;
1144
1145 case nir_op_pack_snorm_2x16:
1146 case nir_op_pack_snorm_4x8:
1147 case nir_op_pack_unorm_2x16:
1148 case nir_op_pack_unorm_4x8:
1149 case nir_op_unpack_snorm_2x16:
1150 case nir_op_unpack_snorm_4x8:
1151 case nir_op_unpack_unorm_2x16:
1152 case nir_op_unpack_unorm_4x8:
1153 case nir_op_unpack_half_2x16:
1154 case nir_op_pack_half_2x16:
1155 unreachable("not reached: should be handled by lower_packing_builtins");
1156
1157 case nir_op_unpack_half_2x16_split_x:
1158 inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
1159 inst->saturate = instr->dest.saturate;
1160 break;
1161 case nir_op_unpack_half_2x16_split_y:
1162 inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
1163 inst->saturate = instr->dest.saturate;
1164 break;
1165
1166 case nir_op_pack_double_2x32_split:
1167 /* Optimize the common case where we are re-packing a double with
1168 * the result of a previous double unpack. In this case we can take the
1169 * 32-bit value to use in the re-pack from the original double and bypass
1170 * the unpack operation.
1171 */
1172 for (int i = 0; i < 2; i++) {
1173 if (!instr->src[i].src.is_ssa)
1174 continue;
1175
1176 const nir_instr *parent_instr = instr->src[i].src.ssa->parent_instr;
1177 if (parent_instr->type != nir_instr_type_alu)
1178 continue;
1179
1180 const nir_alu_instr *alu_parent = nir_instr_as_alu(parent_instr);
1181 if (alu_parent->op != nir_op_unpack_double_2x32_split_x &&
1182 alu_parent->op != nir_op_unpack_double_2x32_split_y)
1183 continue;
1184
1185 if (!alu_parent->src[0].src.is_ssa)
1186 continue;
1187
1188 op[i] = get_nir_src(alu_parent->src[0].src);
1189 op[i] = offset(retype(op[i], BRW_REGISTER_TYPE_DF), bld,
1190 alu_parent->src[0].swizzle[channel]);
1191 if (alu_parent->op == nir_op_unpack_double_2x32_split_y)
1192 op[i] = subscript(op[i], BRW_REGISTER_TYPE_UD, 1);
1193 else
1194 op[i] = subscript(op[i], BRW_REGISTER_TYPE_UD, 0);
1195 }
1196 bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1197 break;
1198
1199 case nir_op_unpack_double_2x32_split_x:
1200 case nir_op_unpack_double_2x32_split_y: {
1201 /* Optimize the common case where we are unpacking from a double we have
1202 * previously packed. In this case we can just bypass the pack operation
1203 * and source directly from its arguments.
1204 */
1205 unsigned index = (instr->op == nir_op_unpack_double_2x32_split_x) ? 0 : 1;
1206 if (instr->src[0].src.is_ssa) {
1207 nir_instr *parent_instr = instr->src[0].src.ssa->parent_instr;
1208 if (parent_instr->type == nir_instr_type_alu) {
1209 nir_alu_instr *alu_parent = nir_instr_as_alu(parent_instr);
1210 if (alu_parent->op == nir_op_pack_double_2x32_split &&
1211 alu_parent->src[index].src.is_ssa) {
1212 op[0] = retype(get_nir_src(alu_parent->src[index].src),
1213 BRW_REGISTER_TYPE_UD);
1214 op[0] =
1215 offset(op[0], bld, alu_parent->src[index].swizzle[channel]);
1216 bld.MOV(result, op[0]);
1217 break;
1218 }
1219 }
1220 }
1221
1222 if (instr->op == nir_op_unpack_double_2x32_split_x)
1223 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1224 else
1225 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1226 break;
1227 }
1228
1229 case nir_op_fpow:
1230 inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1231 inst->saturate = instr->dest.saturate;
1232 break;
1233
1234 case nir_op_bitfield_reverse:
1235 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1236 bld.BFREV(result, op[0]);
1237 break;
1238
1239 case nir_op_bit_count:
1240 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1241 bld.CBIT(result, op[0]);
1242 break;
1243
1244 case nir_op_ufind_msb: {
1245 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1246 emit_find_msb_using_lzd(bld, result, op[0], false);
1247 break;
1248 }
1249
1250 case nir_op_ifind_msb: {
1251 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1252
1253 if (devinfo->gen < 7) {
1254 emit_find_msb_using_lzd(bld, result, op[0], true);
1255 } else {
1256 bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1257
1258 /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1259 * count from the LSB side. If FBH didn't return an error
1260 * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1261 * count into an LSB count.
1262 */
1263 bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1264
1265 inst = bld.ADD(result, result, brw_imm_d(31));
1266 inst->predicate = BRW_PREDICATE_NORMAL;
1267 inst->src[0].negate = true;
1268 }
1269 break;
1270 }
1271
1272 case nir_op_find_lsb:
1273 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1274
1275 if (devinfo->gen < 7) {
1276 fs_reg temp = vgrf(glsl_type::int_type);
1277
1278 /* (x & -x) generates a value that consists of only the LSB of x.
1279 * For all powers of 2, findMSB(y) == findLSB(y).
1280 */
1281 fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1282 fs_reg negated_src = src;
1283
1284 /* One must be negated, and the other must be non-negated. It
1285 * doesn't matter which is which.
1286 */
1287 negated_src.negate = true;
1288 src.negate = false;
1289
1290 bld.AND(temp, src, negated_src);
1291 emit_find_msb_using_lzd(bld, result, temp, false);
1292 } else {
1293 bld.FBL(result, op[0]);
1294 }
1295 break;
1296
1297 case nir_op_ubitfield_extract:
1298 case nir_op_ibitfield_extract:
1299 unreachable("should have been lowered");
1300 case nir_op_ubfe:
1301 case nir_op_ibfe:
1302 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1303 bld.BFE(result, op[2], op[1], op[0]);
1304 break;
1305 case nir_op_bfm:
1306 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1307 bld.BFI1(result, op[0], op[1]);
1308 break;
1309 case nir_op_bfi:
1310 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1311 bld.BFI2(result, op[0], op[1], op[2]);
1312 break;
1313
1314 case nir_op_bitfield_insert:
1315 unreachable("not reached: should have been lowered");
1316
1317 case nir_op_ishl:
1318 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1319 bld.SHL(result, op[0], op[1]);
1320 break;
1321 case nir_op_ishr:
1322 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1323 bld.ASR(result, op[0], op[1]);
1324 break;
1325 case nir_op_ushr:
1326 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1327 bld.SHR(result, op[0], op[1]);
1328 break;
1329
1330 case nir_op_pack_half_2x16_split:
1331 bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1332 break;
1333
1334 case nir_op_ffma:
1335 inst = bld.MAD(result, op[2], op[1], op[0]);
1336 inst->saturate = instr->dest.saturate;
1337 break;
1338
1339 case nir_op_flrp:
1340 inst = bld.LRP(result, op[0], op[1], op[2]);
1341 inst->saturate = instr->dest.saturate;
1342 break;
1343
1344 case nir_op_bcsel:
1345 if (optimize_frontfacing_ternary(instr, result))
1346 return;
1347
1348 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1349 inst = bld.SEL(result, op[1], op[2]);
1350 inst->predicate = BRW_PREDICATE_NORMAL;
1351 break;
1352
1353 case nir_op_extract_u8:
1354 case nir_op_extract_i8: {
1355 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1356 nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
1357 assert(byte != NULL);
1358 bld.MOV(result, subscript(op[0], type, byte->u32[0]));
1359 break;
1360 }
1361
1362 case nir_op_extract_u16:
1363 case nir_op_extract_i16: {
1364 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1365 nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
1366 assert(word != NULL);
1367 bld.MOV(result, subscript(op[0], type, word->u32[0]));
1368 break;
1369 }
1370
1371 default:
1372 unreachable("unhandled instruction");
1373 }
1374
1375 /* If we need to do a boolean resolve, replace the result with -(x & 1)
1376 * to sign extend the low bit to 0/~0
1377 */
1378 if (devinfo->gen <= 5 &&
1379 (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1380 fs_reg masked = vgrf(glsl_type::int_type);
1381 bld.AND(masked, result, brw_imm_d(1));
1382 masked.negate = true;
1383 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1384 }
1385 }
1386
1387 void
1388 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1389 nir_load_const_instr *instr)
1390 {
1391 const brw_reg_type reg_type =
1392 instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
1393 fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1394
1395 switch (instr->def.bit_size) {
1396 case 32:
1397 for (unsigned i = 0; i < instr->def.num_components; i++)
1398 bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
1399 break;
1400
1401 case 64:
1402 for (unsigned i = 0; i < instr->def.num_components; i++)
1403 bld.MOV(offset(reg, bld, i),
1404 setup_imm_df(bld, instr->value.f64[i]));
1405 break;
1406
1407 default:
1408 unreachable("Invalid bit size");
1409 }
1410
1411 nir_ssa_values[instr->def.index] = reg;
1412 }
1413
1414 fs_reg
1415 fs_visitor::get_nir_src(const nir_src &src)
1416 {
1417 fs_reg reg;
1418 if (src.is_ssa) {
1419 if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1420 const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
1421 BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
1422 reg = bld.vgrf(reg_type, src.ssa->num_components);
1423 } else {
1424 reg = nir_ssa_values[src.ssa->index];
1425 }
1426 } else {
1427 /* We don't handle indirects on locals */
1428 assert(src.reg.indirect == NULL);
1429 reg = offset(nir_locals[src.reg.reg->index], bld,
1430 src.reg.base_offset * src.reg.reg->num_components);
1431 }
1432
1433 /* to avoid floating-point denorm flushing problems, set the type by
1434 * default to D - instructions that need floating point semantics will set
1435 * this to F if they need to
1436 */
1437 return retype(reg, BRW_REGISTER_TYPE_D);
1438 }
1439
1440 /**
1441 * Return an IMM for constants; otherwise call get_nir_src() as normal.
1442 */
1443 fs_reg
1444 fs_visitor::get_nir_src_imm(const nir_src &src)
1445 {
1446 nir_const_value *val = nir_src_as_const_value(src);
1447 return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
1448 }
1449
1450 fs_reg
1451 fs_visitor::get_nir_dest(const nir_dest &dest)
1452 {
1453 if (dest.is_ssa) {
1454 const brw_reg_type reg_type =
1455 dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
1456 nir_ssa_values[dest.ssa.index] =
1457 bld.vgrf(reg_type, dest.ssa.num_components);
1458 return nir_ssa_values[dest.ssa.index];
1459 } else {
1460 /* We don't handle indirects on locals */
1461 assert(dest.reg.indirect == NULL);
1462 return offset(nir_locals[dest.reg.reg->index], bld,
1463 dest.reg.base_offset * dest.reg.reg->num_components);
1464 }
1465 }
1466
1467 fs_reg
1468 fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
1469 {
1470 fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
1471 BRW_REGISTER_TYPE_UD);
1472 fs_reg indirect;
1473 unsigned indirect_max = 0;
1474
1475 for (const nir_deref *tail = &deref->deref; tail->child;
1476 tail = tail->child) {
1477 const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
1478 assert(tail->child->deref_type == nir_deref_type_array);
1479 const unsigned size = glsl_get_length(tail->type);
1480 const unsigned element_size = type_size_scalar(deref_array->deref.type);
1481 const unsigned base = MIN2(deref_array->base_offset, size - 1);
1482 image = offset(image, bld, base * element_size);
1483
1484 if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
1485 fs_reg tmp = vgrf(glsl_type::uint_type);
1486
1487 /* Accessing an invalid surface index with the dataport can result
1488 * in a hang. According to the spec "if the index used to
1489 * select an individual element is negative or greater than or
1490 * equal to the size of the array, the results of the operation
1491 * are undefined but may not lead to termination" -- which is one
1492 * of the possible outcomes of the hang. Clamp the index to
1493 * prevent access outside of the array bounds.
1494 */
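/* e.g. for an image array of length 4 addressed from base 0, the
 * indirect index is clamped to at most size - base - 1 == 3 by the
 * MIN below, so the computed offset never escapes the array's uniform
 * storage.
 */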
1495 bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
1496 BRW_REGISTER_TYPE_UD),
1497 brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
1498
1499 indirect_max += element_size * (tail->type->length - 1);
1500
1501 bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
1502 if (indirect.file == BAD_FILE) {
1503 indirect = tmp;
1504 } else {
1505 bld.ADD(indirect, indirect, tmp);
1506 }
1507 }
1508 }
1509
1510 if (indirect.file == BAD_FILE) {
1511 return image;
1512 } else {
1513 /* Emit a pile of MOVs to load the uniform into a temporary. The
1514 * dead-code elimination pass will get rid of what we don't use.
1515 */
1516 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
1517 for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
1518 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
1519 offset(tmp, bld, j), offset(image, bld, j),
1520 indirect, brw_imm_ud((indirect_max + 1) * 4));
1521 }
1522 return tmp;
1523 }
1524 }
1525
1526 void
1527 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1528 unsigned wr_mask)
1529 {
1530 for (unsigned i = 0; i < 4; i++) {
1531 if (!((wr_mask >> i) & 1))
1532 continue;
1533
1534 fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1535 new_inst->dst = offset(new_inst->dst, bld, i);
1536 for (unsigned j = 0; j < new_inst->sources; j++)
1537 if (new_inst->src[j].file == VGRF)
1538 new_inst->src[j] = offset(new_inst->src[j], bld, i);
1539
1540 bld.emit(new_inst);
1541 }
1542 }
1543
1544 /**
1545 * Get the matching channel register datatype for an image intrinsic of the
1546 * specified GLSL image type.
1547 */
1548 static brw_reg_type
1549 get_image_base_type(const glsl_type *type)
1550 {
1551 switch ((glsl_base_type)type->sampled_type) {
1552 case GLSL_TYPE_UINT:
1553 return BRW_REGISTER_TYPE_UD;
1554 case GLSL_TYPE_INT:
1555 return BRW_REGISTER_TYPE_D;
1556 case GLSL_TYPE_FLOAT:
1557 return BRW_REGISTER_TYPE_F;
1558 default:
1559 unreachable("Not reached.");
1560 }
1561 }
1562
1563 /**
1564 * Get the appropriate atomic op for an image atomic intrinsic.
1565 */
1566 static unsigned
1567 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
1568 {
1569 switch (op) {
1570 case nir_intrinsic_image_atomic_add:
1571 return BRW_AOP_ADD;
1572 case nir_intrinsic_image_atomic_min:
1573 return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1574 BRW_AOP_IMIN : BRW_AOP_UMIN);
1575 case nir_intrinsic_image_atomic_max:
1576 return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1577 BRW_AOP_IMAX : BRW_AOP_UMAX);
1578 case nir_intrinsic_image_atomic_and:
1579 return BRW_AOP_AND;
1580 case nir_intrinsic_image_atomic_or:
1581 return BRW_AOP_OR;
1582 case nir_intrinsic_image_atomic_xor:
1583 return BRW_AOP_XOR;
1584 case nir_intrinsic_image_atomic_exchange:
1585 return BRW_AOP_MOV;
1586 case nir_intrinsic_image_atomic_comp_swap:
1587 return BRW_AOP_CMPWR;
1588 default:
1589 unreachable("Not reachable.");
1590 }
1591 }
1592
1593 static fs_inst *
1594 emit_pixel_interpolater_send(const fs_builder &bld,
1595 enum opcode opcode,
1596 const fs_reg &dst,
1597 const fs_reg &src,
1598 const fs_reg &desc,
1599 glsl_interp_mode interpolation)
1600 {
1601 struct brw_wm_prog_data *wm_prog_data =
1602 brw_wm_prog_data(bld.shader->stage_prog_data);
1603 fs_inst *inst;
1604 fs_reg payload;
1605 int mlen;
1606
1607 if (src.file == BAD_FILE) {
1608 /* Dummy payload */
1609 payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
1610 mlen = 1;
1611 } else {
1612 payload = src;
1613 mlen = 2 * bld.dispatch_width() / 8;
1614 }
1615
1616 inst = bld.emit(opcode, dst, payload, desc);
1617 inst->mlen = mlen;
1618 /* 2 floats per slot returned */
1619 inst->size_written = 2 * dst.component_size(inst->exec_size);
1620 inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1621
1622 wm_prog_data->pulls_bary = true;
1623
1624 return inst;
1625 }
1626
1627 /**
1628 * Computes 1 << x, given a D/UD register containing some value x.
1629 */
1630 static fs_reg
1631 intexp2(const fs_builder &bld, const fs_reg &x)
1632 {
1633 assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1634
1635 fs_reg result = bld.vgrf(x.type, 1);
1636 fs_reg one = bld.vgrf(x.type, 1);
1637
1638 bld.MOV(one, retype(brw_imm_d(1), one.type));
1639 bld.SHL(result, one, x);
1640 return result;
1641 }
1642
1643 void
1644 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1645 {
1646 assert(stage == MESA_SHADER_GEOMETRY);
1647
1648 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1649
1650 if (gs_compile->control_data_header_size_bits == 0)
1651 return;
1652
1653 /* We can only do EndPrimitive() functionality when the control data
1654 * consists of cut bits. Fortunately, the only time it isn't is when the
1655 * output type is points, in which case EndPrimitive() is a no-op.
1656 */
1657 if (gs_prog_data->control_data_format !=
1658 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1659 return;
1660 }
1661
1662 /* Cut bits use one bit per vertex. */
1663 assert(gs_compile->control_data_bits_per_vertex == 1);
1664
1665 fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1666 vertex_count.type = BRW_REGISTER_TYPE_UD;
1667
1668 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1669 * vertex n, 0 otherwise. So all we need to do here is mark bit
1670 * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1671 * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1672 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1673 *
1674 * Note that if EndPrimitive() is called before emitting any vertices, this
1675 * will cause us to set bit 31 of the control_data_bits register to 1.
1676 * That's fine because:
1677 *
1678 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1679 * output, so the hardware will ignore cut bit 31.
1680 *
1681 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1682 * last vertex, so setting cut bit 31 has no effect (since the primitive
1683 * is automatically ended when the GS terminates).
1684 *
1685 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1686 * control_data_bits register to 0 when the first vertex is emitted.
1687 */
1688
1689 const fs_builder abld = bld.annotate("end primitive");
1690
1691 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1692 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1693 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1694 fs_reg mask = intexp2(abld, prev_count);
1695 /* Note: we're relying on the fact that the GEN SHL instruction only pays
1696 * attention to the lower 5 bits of its second source argument, so on this
1697 * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1698 * ((vertex_count - 1) % 32).
1699 */
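/* For instance, with vertex_count == 33, prev_count is 32 and the SHL
 * in intexp2() only sees 32 & 31 == 0, so the mask is 1 << 0 -- exactly
 * ((vertex_count - 1) % 32) as intended.
 */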
1700 abld.OR(this->control_data_bits, this->control_data_bits, mask);
1701 }
1702
1703 void
1704 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1705 {
1706 assert(stage == MESA_SHADER_GEOMETRY);
1707 assert(gs_compile->control_data_bits_per_vertex != 0);
1708
1709 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1710
1711 const fs_builder abld = bld.annotate("emit control data bits");
1712 const fs_builder fwa_bld = bld.exec_all();
1713
1714 /* We use a single UD register to accumulate control data bits (32 bits
1715 * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
1716 * at a time.
1717 *
1718 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1719 * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
1720 * use the Channel Mask phase to enable/disable which DWord within that
1721 * group to write. (Remember, different SIMD8 channels may have emitted
1722 * different numbers of vertices, so we may need per-slot offsets.)
1723 *
1724 * Channel masking presents an annoying problem: we may have to replicate
1725 * the data up to 4 times:
1726 *
1727 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1728 *
1729 * To avoid penalizing shaders that emit a small number of vertices, we
1730 * can avoid these sometimes: if the size of the control data header is
1731 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
1732 * in the same 128-bit group, so we can skip per-slot offsets.
1733 *
1734 * Similarly, if the control data header is <= 32 bits, there is only one
1735 * DWord, so we can skip channel masks.
1736 */
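/* For example, a GS using cut bits (1 bit per vertex) with
 * max_vertices == 16 has a 16-bit control data header: a single DWord
 * in a single OWord, so the plain URB_WRITE_SIMD8 opcode below suffices
 * and neither channel masks nor per-slot offsets are set up.
 */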
1737 enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1738
1739 fs_reg channel_mask, per_slot_offset;
1740
1741 if (gs_compile->control_data_header_size_bits > 32) {
1742 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1743 channel_mask = vgrf(glsl_type::uint_type);
1744 }
1745
1746 if (gs_compile->control_data_header_size_bits > 128) {
1747 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1748 per_slot_offset = vgrf(glsl_type::uint_type);
1749 }
1750
1751 /* Figure out which DWord we're trying to write to using the formula:
1752 *
1753 * dword_index = (vertex_count - 1) * bits_per_vertex / 32
1754 *
1755 * Since bits_per_vertex is a power of two, and is known at compile
1756 * time, this can be optimized to:
1757 *
1758 * dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
1759 */
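/* For example, with bits_per_vertex = 2 and vertex_count = 25, this is
 * 24 * 2 / 32 = DWord 1. The code below writes the shift amount as
 * 6 - util_last_bit(bits_per_vertex), which equals
 * 5 - log2(bits_per_vertex), because util_last_bit(x) is log2(x) + 1 for
 * powers of two.
 */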
1760 if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1761 fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1762 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1763 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1764 unsigned log2_bits_per_vertex =
1765 util_last_bit(gs_compile->control_data_bits_per_vertex);
1766 abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1767
1768 if (per_slot_offset.file != BAD_FILE) {
1769 /* Set the per-slot offset to dword_index / 4, so that we'll write to
1770 * the appropriate OWord within the control data header.
1771 */
1772 abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1773 }
1774
1775 /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1776 * write to the appropriate DWORD within the OWORD.
1777 */
1778 fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1779 fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
1780 channel_mask = intexp2(fwa_bld, channel);
1781 /* Then the channel masks need to be in bits 23:16. */
1782 fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
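/* For example, dword_index = 6 selects per-slot offset 6 >> 2 = 1 (the
 * second OWord) and channel 6 & 3 = 2, so channel_mask ends up as
 * (1 << 2) << 16 = 0x40000.
 */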
1783 }
1784
1785 /* Store the control data bits in the message payload and send it. */
1786 int mlen = 2;
1787 if (channel_mask.file != BAD_FILE)
1788 mlen += 4; /* channel masks, plus 3 extra copies of the data */
1789 if (per_slot_offset.file != BAD_FILE)
1790 mlen++;
1791
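/* For example, in the fully general case (channel masks and per-slot
 * offsets) this gives mlen = 2 + 4 + 1 = 7: handles, per-slot offsets,
 * channel masks, and four copies of the data, matching the message
 * layout described above.
 */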
1792 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1793 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1794 int i = 0;
1795 sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1796 if (per_slot_offset.file != BAD_FILE)
1797 sources[i++] = per_slot_offset;
1798 if (channel_mask.file != BAD_FILE)
1799 sources[i++] = channel_mask;
1800 while (i < mlen) {
1801 sources[i++] = this->control_data_bits;
1802 }
1803
1804 abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
1805 fs_inst *inst = abld.emit(opcode, reg_undef, payload);
1806 inst->mlen = mlen;
1807 /* We need to increment Global Offset by 256 bits to make room for
1808 * Broadwell's extra "Vertex Count" payload at the beginning of the
1809 * URB entry. Since this is an OWord message, Global Offset is counted
1810 * in 128-bit units, so we must set it to 2.
1811 */
1812 if (gs_prog_data->static_vertex_count == -1)
1813 inst->offset = 2;
1814 }
1815
1816 void
1817 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
1818 unsigned stream_id)
1819 {
1820 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
1821
1822 /* Note: we are calling this *before* increasing vertex_count, so the
1823 * vertex_count passed in equals vertex_count - 1 in the formula above.
1824 */
1825
1826 /* Stream mode uses 2 bits per vertex */
1827 assert(gs_compile->control_data_bits_per_vertex == 2);
1828
1829 /* Must be a valid stream */
1830 assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
1831
1832 /* Control data bits are initialized to 0 so we don't have to set any
1833 * bits when sending vertices to stream 0.
1834 */
1835 if (stream_id == 0)
1836 return;
1837
1838 const fs_builder abld = bld.annotate("set stream control data bits", NULL);
1839
1840 /* reg::sid = stream_id */
1841 fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1842 abld.MOV(sid, brw_imm_ud(stream_id));
1843
1844 /* reg:shift_count = 2 * (vertex_count - 1) */
1845 fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1846 abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
1847
1848 /* Note: we're relying on the fact that the GEN SHL instruction only pays
1849 * attention to the lower 5 bits of its second source argument, so on this
1850 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
1851 * stream_id << ((2 * (vertex_count - 1)) % 32).
1852 */
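/* For example, if the register passed in holds 2 (i.e. this is the third
 * vertex) and stream_id is 3, shift_count is 4 and the mask below is
 * 3 << 4 = 0x30, setting bits 5:4 of control_data_bits.
 */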
1853 fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1854 abld.SHL(mask, sid, shift_count);
1855 abld.OR(this->control_data_bits, this->control_data_bits, mask);
1856 }
1857
1858 void
1859 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
1860 unsigned stream_id)
1861 {
1862 assert(stage == MESA_SHADER_GEOMETRY);
1863
1864 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1865
1866 fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1867 vertex_count.type = BRW_REGISTER_TYPE_UD;
1868
1869 /* Haswell and later hardware ignores the "Render Stream Select" bits
1870 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
1871 * and instead sends all primitives down the pipeline for rasterization.
1872 * If the SOL stage is enabled, "Render Stream Select" is honored and
1873 * primitives bound to non-zero streams are discarded after stream output.
1874 *
1875 * Since the only purpose of primitives sent to non-zero streams is to
1876 * be recorded by transform feedback, we can simply discard all geometry
1877 * bound to these streams when transform feedback is disabled.
1878 */
1879 if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
1880 return;
1881
1882 /* If we're outputting 32 control data bits or less, then we can wait
1883 * until the shader is over to output them all. Otherwise we need to
1884 * output them as we go. Now is the time to do it, since we're about to
1885 * output the vertex_count'th vertex, so it's guaranteed that the
1886 * control data bits associated with the (vertex_count - 1)th vertex are
1887 * correct.
1888 */
1889 if (gs_compile->control_data_header_size_bits > 32) {
1890 const fs_builder abld =
1891 bld.annotate("emit vertex: emit control data bits");
1892
1893 /* Only emit control data bits if we've finished accumulating a batch
1894 * of 32 bits. This is the case when:
1895 *
1896 * (vertex_count * bits_per_vertex) % 32 == 0
1897 *
1898 * (in other words, when the last 5 bits of vertex_count *
1899 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
1900 * integer n (which is always the case, since bits_per_vertex is
1901 * always 1 or 2), this is equivalent to requiring that the last 5-n
1902 * bits of vertex_count are 0:
1903 *
1904 * vertex_count & (2^(5-n) - 1) == 0
1905 *
1906 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
1907 * equivalent to:
1908 *
1909 * vertex_count & (32 / bits_per_vertex - 1) == 0
1910 *
1911 * TODO: If vertex_count is an immediate, we could do some of this math
1912 * at compile time...
1913 */
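/* For example, with 2 control data bits per vertex this flushes whenever
 * vertex_count & 15 == 0, i.e. once every 16 vertices emitted.
 */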
1914 fs_inst *inst =
1915 abld.AND(bld.null_reg_d(), vertex_count,
1916 brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
1917 inst->conditional_mod = BRW_CONDITIONAL_Z;
1918
1919 abld.IF(BRW_PREDICATE_NORMAL);
1920 /* If vertex_count is 0, then no control data bits have been
1921 * accumulated yet, so we can skip emitting them.
1922 */
1923 abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
1924 BRW_CONDITIONAL_NEQ);
1925 abld.IF(BRW_PREDICATE_NORMAL);
1926 emit_gs_control_data_bits(vertex_count);
1927 abld.emit(BRW_OPCODE_ENDIF);
1928
1929 /* Reset control_data_bits to 0 so we can start accumulating a new
1930 * batch.
1931 *
1932 * Note: in the case where vertex_count == 0, this neutralizes the
1933 * effect of any call to EndPrimitive() that the shader may have
1934 * made before outputting its first vertex.
1935 */
1936 inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
1937 inst->force_writemask_all = true;
1938 abld.emit(BRW_OPCODE_ENDIF);
1939 }
1940
1941 emit_urb_writes(vertex_count);
1942
1943 /* In stream mode we have to set control data bits for all vertices
1944 * unless we have disabled control data bits completely (which we do
1945 * for GL_POINTS outputs that don't use streams).
1946 */
1947 if (gs_compile->control_data_header_size_bits > 0 &&
1948 gs_prog_data->control_data_format ==
1949 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
1950 set_gs_stream_control_data_bits(vertex_count, stream_id);
1951 }
1952 }
1953
1954 void
1955 fs_visitor::emit_gs_input_load(const fs_reg &dst,
1956 const nir_src &vertex_src,
1957 unsigned base_offset,
1958 const nir_src &offset_src,
1959 unsigned num_components,
1960 unsigned first_component)
1961 {
1962 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1963
1964 nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
1965 nir_const_value *offset_const = nir_src_as_const_value(offset_src);
1966 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
1967
1968 /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
1969 * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w]. Only
1970 * gl_PointSize is available as a GS input, however, so it must be that.
1971 */
1972 const bool is_point_size = (base_offset == 0);
1973
1974 /* TODO: figure out push input layout for invocations == 1 */
1975 if (gs_prog_data->invocations == 1 &&
1976 offset_const != NULL && vertex_const != NULL &&
1977 4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
1978 int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
1979 vertex_const->u32[0] * push_reg_count;
1980 /* This input was pushed into registers. */
1981 if (is_point_size) {
1982 /* gl_PointSize comes in .w */
1983 bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
1984 } else {
1985 for (unsigned i = 0; i < num_components; i++) {
1986 bld.MOV(offset(dst, bld, i),
1987 fs_reg(ATTR, imm_offset + i, dst.type));
1988 }
1989 }
1990 return;
1991 }
1992
1993 /* Resort to the pull model. Ensure the VUE handles are provided. */
1994 gs_prog_data->base.include_vue_handles = true;
1995
1996 unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
1997 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1998
1999 if (gs_prog_data->invocations == 1) {
2000 if (vertex_const) {
2001 /* The vertex index is constant; just select the proper URB handle. */
2002 icp_handle =
2003 retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
2004 BRW_REGISTER_TYPE_UD);
2005 } else {
2006 /* The vertex index is non-constant. We need to use indirect
2007 * addressing to fetch the proper URB handle.
2008 *
2009 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2010 * indicating that channel <n> should read the handle from
2011 * DWord <n>. We convert that to bytes by multiplying by 4.
2012 *
2013 * Next, we convert the vertex index to bytes by multiplying
2014 * by 32 (shifting by 5), and add the two together. This is
2015 * the final indirect byte offset.
2016 */
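/* For example (illustrative values): SIMD channel 3 reading vertex 2 ends
 * up with byte offset 2 * 32 + 3 * 4 = 76, i.e. DWord 3 of the third
 * register of URB handles.
 */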
2017 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
2018 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2019 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2020 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2021
2022 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2023 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2024 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2025 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2026 /* Convert vertex_index to bytes (multiply by 32) */
2027 bld.SHL(vertex_offset_bytes,
2028 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2029 brw_imm_ud(5u));
2030 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2031
2032 /* Use first_icp_handle as the base offset. There is one register
2033 * of URB handles per vertex, so inform the register allocator that
2034 * we might read up to nir->info->gs.vertices_in registers.
2035 */
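/* For example, a GS with triangles_adjacency input has vertices_in == 6,
 * so up to 6 * REG_SIZE bytes may be read through this indirect MOV.
 */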
2036 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2037 fs_reg(brw_vec8_grf(first_icp_handle, 0)),
2038 fs_reg(icp_offset_bytes),
2039 brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
2040 }
2041 } else {
2042 assert(gs_prog_data->invocations > 1);
2043
2044 if (vertex_const) {
2045 assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5);
2046 bld.MOV(icp_handle,
2047 retype(brw_vec1_grf(first_icp_handle +
2048 vertex_const->i32[0] / 8,
2049 vertex_const->i32[0] % 8),
2050 BRW_REGISTER_TYPE_UD));
2051 } else {
2052 /* The vertex index is non-constant. We need to use indirect
2053 * addressing to fetch the proper URB handle.
2054 *
2055 */
2056 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2057
2058 /* Convert vertex_index to bytes (multiply by 4) */
2059 bld.SHL(icp_offset_bytes,
2060 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2061 brw_imm_ud(2u));
2062
2063 /* Use first_icp_handle as the base offset. There is one DWord
2064 * of URB handles per vertex, so inform the register allocator that
2065 * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
2066 */
2067 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2068 fs_reg(brw_vec8_grf(first_icp_handle, 0)),
2069 fs_reg(icp_offset_bytes),
2070 brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
2071 REG_SIZE));
2072 }
2073 }
2074
2075 fs_inst *inst;
2076
2077 fs_reg tmp_dst = dst;
2078 fs_reg indirect_offset = get_nir_src(offset_src);
2079 unsigned num_iterations = 1;
2080 unsigned orig_num_components = num_components;
2081
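/* As in the TCS path below, a URB read returns at most two 64-bit
 * components, so 64-bit loads of more than two components are split into
 * two read messages, each loading up to two components into a temporary.
 */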
2082 if (type_sz(dst.type) == 8) {
2083 if (num_components > 2) {
2084 num_iterations = 2;
2085 num_components = 2;
2086 }
2087 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2088 tmp_dst = tmp;
2089 first_component = first_component / 2;
2090 }
2091
2092 for (unsigned iter = 0; iter < num_iterations; iter++) {
2093 if (offset_const) {
2094 /* Constant indexing - use global offset. */
2095 if (first_component != 0) {
2096 unsigned read_components = num_components + first_component;
2097 fs_reg tmp = bld.vgrf(dst.type, read_components);
2098 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2099 inst->size_written = read_components *
2100 tmp.component_size(inst->exec_size);
2101 for (unsigned i = 0; i < num_components; i++) {
2102 bld.MOV(offset(tmp_dst, bld, i),
2103 offset(tmp, bld, i + first_component));
2104 }
2105 } else {
2106 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2107 icp_handle);
2108 inst->size_written = num_components *
2109 tmp_dst.component_size(inst->exec_size);
2110 }
2111 inst->offset = base_offset + offset_const->u32[0];
2112 inst->mlen = 1;
2113 } else {
2114 /* Indirect indexing - use per-slot offsets as well. */
2115 const fs_reg srcs[] = { icp_handle, indirect_offset };
2116 unsigned read_components = num_components + first_component;
2117 fs_reg tmp = bld.vgrf(dst.type, read_components);
2118 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2119 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2120 if (first_component != 0) {
2121 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2122 payload);
2123 inst->size_written = read_components *
2124 tmp.component_size(inst->exec_size);
2125 for (unsigned i = 0; i < num_components; i++) {
2126 bld.MOV(offset(tmp_dst, bld, i),
2127 offset(tmp, bld, i + first_component));
2128 }
2129 } else {
2130 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2131 payload);
2132 inst->size_written = num_components *
2133 tmp_dst.component_size(inst->exec_size);
2134 }
2135 inst->offset = base_offset;
2136 inst->mlen = 2;
2137 }
2138
2139 if (type_sz(dst.type) == 8) {
2140 shuffle_32bit_load_result_to_64bit_data(
2141 bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
2142
2143 for (unsigned c = 0; c < num_components; c++)
2144 bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
2145 }
2146
2147 if (num_iterations > 1) {
2148 num_components = orig_num_components - 2;
2149 if (offset_const) {
2150 base_offset++;
2151 } else {
2152 fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2153 bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2154 indirect_offset = new_indirect;
2155 }
2156 }
2157 }
2158
2159 if (is_point_size) {
2160 /* Read the whole VUE header (because of alignment) and read .w. */
2161 fs_reg tmp = bld.vgrf(dst.type, 4);
2162 inst->dst = tmp;
2163 inst->size_written = 4 * REG_SIZE;
2164 bld.MOV(dst, offset(tmp, bld, 3));
2165 }
2166 }
2167
2168 fs_reg
2169 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2170 {
2171 nir_src *offset_src = nir_get_io_offset_src(instr);
2172 nir_const_value *const_value = nir_src_as_const_value(*offset_src);
2173
2174 if (const_value) {
2175 /* The only constant offset we should find is 0. brw_nir.c's
2176 * add_const_offset_to_base() will fold other constant offsets
2177 * into instr->const_index[0].
2178 */
2179 assert(const_value->u32[0] == 0);
2180 return fs_reg();
2181 }
2182
2183 return get_nir_src(*offset_src);
2184 }
2185
2186 static void
2187 do_untyped_vector_read(const fs_builder &bld,
2188 const fs_reg dest,
2189 const fs_reg surf_index,
2190 const fs_reg offset_reg,
2191 unsigned num_components)
2192 {
2193 if (type_sz(dest.type) == 4) {
2194 fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
2195 1 /* dims */,
2196 num_components,
2197 BRW_PREDICATE_NONE);
2198 read_result.type = dest.type;
2199 for (unsigned i = 0; i < num_components; i++)
2200 bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
2201 } else if (type_sz(dest.type) == 8) {
2202 /* Reading a dvec, so we need to:
2203 *
2204 * 1. Multiply num_components by 2, to account for the fact that we
2205 * need to read 64-bit components.
2206 * 2. Shuffle the result of the load to form valid 64-bit elements
2207 * 3. Emit a second load (for components z/w) if needed.
2208 */
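/* For example, a dvec3 read issues two untyped reads: the first returns
 * 4 DWords that are shuffled into .xy, the second returns 2 DWords for
 * .z, with read_offset bumped by 16 bytes in between.
 */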
2209 fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
2210 bld.MOV(read_offset, offset_reg);
2211
2212 int iters = num_components <= 2 ? 1 : 2;
2213
2214 /* Load the dvec: the first iteration loads components x/y; the second
2215 * iteration, if needed, loads components z/w.
2216 */
2217 for (int it = 0; it < iters; it++) {
2218 /* Compute number of components to read in this iteration */
2219 int iter_components = MIN2(2, num_components);
2220 num_components -= iter_components;
2221
2222 /* Read. Since this message reads 32-bit components, we need to
2223 * read twice as many components.
2224 */
2225 fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset,
2226 1 /* dims */,
2227 iter_components * 2,
2228 BRW_PREDICATE_NONE);
2229
2230 /* Shuffle the 32-bit load result into valid 64-bit data */
2231 const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
2232 shuffle_32bit_load_result_to_64bit_data(
2233 bld, packed_result, read_result, iter_components);
2234
2235 /* Move each component to its destination */
2236 read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
2237 for (int c = 0; c < iter_components; c++) {
2238 bld.MOV(offset(dest, bld, it * 2 + c),
2239 offset(packed_result, bld, c));
2240 }
2241
2242 bld.ADD(read_offset, read_offset, brw_imm_ud(16));
2243 }
2244 } else {
2245 unreachable("Unsupported type");
2246 }
2247 }
2248
2249 void
2250 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2251 nir_intrinsic_instr *instr)
2252 {
2253 assert(stage == MESA_SHADER_VERTEX);
2254
2255 fs_reg dest;
2256 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2257 dest = get_nir_dest(instr->dest);
2258
2259 switch (instr->intrinsic) {
2260 case nir_intrinsic_load_vertex_id:
2261 unreachable("should be lowered by lower_vertex_id()");
2262
2263 case nir_intrinsic_load_vertex_id_zero_base:
2264 case nir_intrinsic_load_base_vertex:
2265 case nir_intrinsic_load_instance_id:
2266 case nir_intrinsic_load_base_instance:
2267 case nir_intrinsic_load_draw_id: {
2268 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
2269 fs_reg val = nir_system_values[sv];
2270 assert(val.file != BAD_FILE);
2271 dest.type = val.type;
2272 bld.MOV(dest, val);
2273 break;
2274 }
2275
2276 case nir_intrinsic_load_input: {
2277 fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
2278 unsigned first_component = nir_intrinsic_component(instr);
2279 unsigned num_components = instr->num_components;
2280 enum brw_reg_type type = dest.type;
2281
2282 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
2283 assert(const_offset && "Indirect input loads not allowed");
2284 src = offset(src, bld, const_offset->u32[0]);
2285
2286 for (unsigned j = 0; j < num_components; j++) {
2287 bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
2288 }
2289
2290 if (type == BRW_REGISTER_TYPE_DF) {
2291 /* Once the double vector is read, restore its original register
2292 * type so we can continue with normal execution.
2293 */
2294 src = retype(src, type);
2295 dest = retype(dest, type);
2296 }
2297
2298 if (type_sz(src.type) == 8) {
2299 shuffle_32bit_load_result_to_64bit_data(bld,
2300 dest,
2301 retype(dest, BRW_REGISTER_TYPE_F),
2302 instr->num_components);
2303 }
2304 break;
2305 }
2306
2307 default:
2308 nir_emit_intrinsic(bld, instr);
2309 break;
2310 }
2311 }
2312
2313 void
2314 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2315 nir_intrinsic_instr *instr)
2316 {
2317 assert(stage == MESA_SHADER_TESS_CTRL);
2318 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2319 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2320
2321 fs_reg dst;
2322 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2323 dst = get_nir_dest(instr->dest);
2324
2325 switch (instr->intrinsic) {
2326 case nir_intrinsic_load_primitive_id:
2327 bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2328 break;
2329 case nir_intrinsic_load_invocation_id:
2330 bld.MOV(retype(dst, invocation_id.type), invocation_id);
2331 break;
2332 case nir_intrinsic_load_patch_vertices_in:
2333 bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2334 brw_imm_d(tcs_key->input_vertices));
2335 break;
2336
2337 case nir_intrinsic_barrier: {
2338 if (tcs_prog_data->instances == 1)
2339 break;
2340
2341 fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2342 fs_reg m0_2 = component(m0, 2);
2343
2344 const fs_builder chanbld = bld.exec_all().group(1, 0);
2345
2346 /* Zero the message header */
2347 bld.exec_all().MOV(m0, brw_imm_ud(0u));
2348
2349 /* Copy "Barrier ID" from r0.2, bits 16:13 */
2350 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2351 brw_imm_ud(INTEL_MASK(16, 13)));
2352
2353 /* Shift it up to bits 27:24. */
2354 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2355
2356 /* Set the Barrier Count and the enable bit */
2357 chanbld.OR(m0_2, m0_2,
2358 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2359
2360 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2361 break;
2362 }
2363
2364 case nir_intrinsic_load_input:
2365 unreachable("nir_lower_io should never give us these.");
2366 break;
2367
2368 case nir_intrinsic_load_per_vertex_input: {
2369 fs_reg indirect_offset = get_indirect_offset(instr);
2370 unsigned imm_offset = instr->const_index[0];
2371
2372 const nir_src &vertex_src = instr->src[0];
2373 nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
2374
2375 fs_inst *inst;
2376
2377 fs_reg icp_handle;
2378
2379 if (vertex_const) {
2380 /* Emit a MOV to resolve <0,1,0> regioning. */
2381 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2382 bld.MOV(icp_handle,
2383 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
2384 vertex_const->i32[0] & 7),
2385 BRW_REGISTER_TYPE_UD));
2386 } else if (tcs_prog_data->instances == 1 &&
2387 vertex_src.is_ssa &&
2388 vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2389 nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
2390 /* For the common case of only 1 instance, an array index of
2391 * gl_InvocationID means reading g1. Skip all the indirect work.
2392 */
2393 icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2394 } else {
2395 /* The vertex index is non-constant. We need to use indirect
2396 * addressing to fetch the proper URB handle.
2397 */
2398 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2399
2400 /* Each ICP handle is a single DWord (4 bytes) */
2401 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2402 bld.SHL(vertex_offset_bytes,
2403 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2404 brw_imm_ud(2u));
2405
2406 /* Start at g1. We might read up to 4 registers. */
2407 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2408 fs_reg(brw_vec8_grf(1, 0)), vertex_offset_bytes,
2409 brw_imm_ud(4 * REG_SIZE));
2410 }
2411
2412 /* We can only read two double components with each URB read, so
2413 * we send two read messages in that case, each one loading up to
2414 * two double components.
2415 */
2416 unsigned num_iterations = 1;
2417 unsigned num_components = instr->num_components;
2418 unsigned first_component = nir_intrinsic_component(instr);
2419 fs_reg orig_dst = dst;
2420 if (type_sz(dst.type) == 8) {
2421 first_component = first_component / 2;
2422 if (instr->num_components > 2) {
2423 num_iterations = 2;
2424 num_components = 2;
2425 }
2426
2427 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2428 dst = tmp;
2429 }
2430
2431 for (unsigned iter = 0; iter < num_iterations; iter++) {
2432 if (indirect_offset.file == BAD_FILE) {
2433 /* Constant indexing - use global offset. */
2434 if (first_component != 0) {
2435 unsigned read_components = num_components + first_component;
2436 fs_reg tmp = bld.vgrf(dst.type, read_components);
2437 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2438 for (unsigned i = 0; i < num_components; i++) {
2439 bld.MOV(offset(dst, bld, i),
2440 offset(tmp, bld, i + first_component));
2441 }
2442 } else {
2443 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2444 }
2445 inst->offset = imm_offset;
2446 inst->mlen = 1;
2447 } else {
2448 /* Indirect indexing - use per-slot offsets as well. */
2449 const fs_reg srcs[] = { icp_handle, indirect_offset };
2450 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2451 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2452 if (first_component != 0) {
2453 unsigned read_components = num_components + first_component;
2454 fs_reg tmp = bld.vgrf(dst.type, read_components);
2455 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2456 payload);
2457 for (unsigned i = 0; i < num_components; i++) {
2458 bld.MOV(offset(dst, bld, i),
2459 offset(tmp, bld, i + first_component));
2460 }
2461 } else {
2462 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2463 payload);
2464 }
2465 inst->offset = imm_offset;
2466 inst->mlen = 2;
2467 }
2468 inst->size_written = (num_components + first_component) *
2469 inst->dst.component_size(inst->exec_size);
2470
2471 /* If we are reading 64-bit data using 32-bit read messages we need
2472 * to build proper 64-bit data elements by shuffling the low and high
2473 * 32-bit components around like we do for other things like UBOs
2474 * or SSBOs.
2475 */
2476 if (type_sz(dst.type) == 8) {
2477 shuffle_32bit_load_result_to_64bit_data(
2478 bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
2479
2480 for (unsigned c = 0; c < num_components; c++) {
2481 bld.MOV(offset(orig_dst, bld, iter * 2 + c),
2482 offset(dst, bld, c));
2483 }
2484 }
2485
2486 /* Copy the temporary to the destination to deal with writemasking.
2487 *
2488 * Also attempt to deal with gl_PointSize being in the .w component.
2489 */
2490 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2491 assert(type_sz(dst.type) < 8);
2492 inst->dst = bld.vgrf(dst.type, 4);
2493 inst->size_written = 4 * REG_SIZE;
2494 bld.MOV(dst, offset(inst->dst, bld, 3));
2495 }
2496
2497 /* If we are loading double data and we need a second read message,
2498 * adjust the write offset.
2499 */
2500 if (num_iterations > 1) {
2501 num_components = instr->num_components - 2;
2502 imm_offset++;
2503 }
2504 }
2505 break;
2506 }
2507
2508 case nir_intrinsic_load_output:
2509 case nir_intrinsic_load_per_vertex_output: {
2510 fs_reg indirect_offset = get_indirect_offset(instr);
2511 unsigned imm_offset = instr->const_index[0];
2512 unsigned first_component = nir_intrinsic_component(instr);
2513
2514 fs_inst *inst;
2515 if (indirect_offset.file == BAD_FILE) {
2516 /* Replicate the patch handle to all enabled channels */
2517 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2518 bld.MOV(patch_handle,
2519 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2520
2521 if (imm_offset == 0) {
2522 /* This is a read of gl_TessLevelInner[], which lives in the
2523 * Patch URB header. The layout depends on the domain.
2524 */
2525 dst.type = BRW_REGISTER_TYPE_F;
2526 switch (tcs_key->tes_primitive_mode) {
2527 case GL_QUADS: {
2528 /* DWords 3-2 (reversed) */
2529 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
2530
2531 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
2532 inst->offset = 0;
2533 inst->mlen = 1;
2534 inst->size_written = 4 * REG_SIZE;
2535
2536 /* dst.xy = tmp.wz */
2537 bld.MOV(dst, offset(tmp, bld, 3));
2538 bld.MOV(offset(dst, bld, 1), offset(tmp, bld, 2));
2539 break;
2540 }
2541 case GL_TRIANGLES:
2542 /* DWord 4; hardcode offset = 1 and size_written = REG_SIZE */
2543 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
2544 inst->offset = 1;
2545 inst->mlen = 1;
2546 inst->size_written = REG_SIZE;
2547 break;
2548 case GL_ISOLINES:
2549 /* All channels are undefined. */
2550 break;
2551 default:
2552 unreachable("Bogus tessellation domain");
2553 }
2554 } else if (imm_offset == 1) {
2555 /* This is a read of gl_TessLevelOuter[], which lives in the
2556 * Patch URB header. The layout depends on the domain.
2557 */
2558 dst.type = BRW_REGISTER_TYPE_F;
2559
2560 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
2561 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
2562 inst->offset = 1;
2563 inst->mlen = 1;
2564 inst->size_written = 4 * REG_SIZE;
2565
2566 /* Reswizzle: WZYX */
2567 fs_reg srcs[4] = {
2568 offset(tmp, bld, 3),
2569 offset(tmp, bld, 2),
2570 offset(tmp, bld, 1),
2571 offset(tmp, bld, 0),
2572 };
2573
2574 unsigned num_components;
2575 switch (tcs_key->tes_primitive_mode) {
2576 case GL_QUADS:
2577 num_components = 4;
2578 break;
2579 case GL_TRIANGLES:
2580 num_components = 3;
2581 break;
2582 case GL_ISOLINES:
2583 /* Isolines are not reversed; swizzle .zw -> .xy */
2584 srcs[0] = offset(tmp, bld, 2);
2585 srcs[1] = offset(tmp, bld, 3);
2586 num_components = 2;
2587 break;
2588 default:
2589 unreachable("Bogus tessellation domain");
2590 }
2591 bld.LOAD_PAYLOAD(dst, srcs, num_components, 0);
2592 } else {
2593 if (first_component != 0) {
2594 unsigned read_components =
2595 instr->num_components + first_component;
2596 fs_reg tmp = bld.vgrf(dst.type, read_components);
2597 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2598 patch_handle);
2599 inst->size_written = read_components * REG_SIZE;
2600 for (unsigned i = 0; i < instr->num_components; i++) {
2601 bld.MOV(offset(dst, bld, i),
2602 offset(tmp, bld, i + first_component));
2603 }
2604 } else {
2605 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2606 patch_handle);
2607 inst->size_written = instr->num_components * REG_SIZE;
2608 }
2609 inst->offset = imm_offset;
2610 inst->mlen = 1;
2611 }
2612 } else {
2613 /* Indirect indexing - use per-slot offsets as well. */
2614 const fs_reg srcs[] = {
2615 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2616 indirect_offset
2617 };
2618 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2619 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2620 if (first_component != 0) {
2621 unsigned read_components =
2622 instr->num_components + first_component;
2623 fs_reg tmp = bld.vgrf(dst.type, read_components);
2624 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2625 payload);
2626 inst->size_written = read_components * REG_SIZE;
2627 for (unsigned i = 0; i < instr->num_components; i++) {
2628 bld.MOV(offset(dst, bld, i),
2629 offset(tmp, bld, i + first_component));
2630 }
2631 } else {
2632 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2633 payload);
2634 inst->size_written = instr->num_components * REG_SIZE;
2635 }
2636 inst->offset = imm_offset;
2637 inst->mlen = 2;
2638 }
2639 break;
2640 }
2641
2642 case nir_intrinsic_store_output:
2643 case nir_intrinsic_store_per_vertex_output: {
2644 fs_reg value = get_nir_src(instr->src[0]);
2645 bool is_64bit = (instr->src[0].is_ssa ?
2646 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2647 fs_reg indirect_offset = get_indirect_offset(instr);
2648 unsigned imm_offset = instr->const_index[0];
2649 unsigned swiz = BRW_SWIZZLE_XYZW;
2650 unsigned mask = instr->const_index[1];
2651 unsigned header_regs = 0;
2652 fs_reg srcs[7];
2653 srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2654
2655 if (indirect_offset.file != BAD_FILE) {
2656 srcs[header_regs++] = indirect_offset;
2657 } else if (!is_passthrough_shader) {
2658 if (imm_offset == 0) {
2659 value.type = BRW_REGISTER_TYPE_F;
2660
2661 mask &= (1 << tesslevel_inner_components(tcs_key->tes_primitive_mode)) - 1;
2662
2663 /* This is a write to gl_TessLevelInner[], which lives in the
2664 * Patch URB header. The layout depends on the domain.
2665 */
2666 switch (tcs_key->tes_primitive_mode) {
2667 case GL_QUADS:
2668 /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
2669 * We use an XXYX swizzle to put .xy into the .wz
2670 * channels (reversed), and use a .zw writemask.
2671 */
2672 mask = writemask_for_backwards_vector(mask);
2673 swiz = BRW_SWIZZLE4(0, 0, 1, 0);
2674 break;
2675 case GL_TRIANGLES:
2676 /* gl_TessLevelInner[].x lives at DWord 4, so we set the
2677 * writemask to X and bump the URB offset by 1.
2678 */
2679 imm_offset = 1;
2680 break;
2681 case GL_ISOLINES:
2682 /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
2683 return;
2684 default:
2685 unreachable("Bogus tessellation domain");
2686 }
2687 } else if (imm_offset == 1) {
2688 /* This is a write to gl_TessLevelOuter[] which lives in the
2689 * Patch URB Header at DWords 4-7. However, it's reversed, so
2690 * instead of .xyzw we have .wzyx.
2691 */
2692 value.type = BRW_REGISTER_TYPE_F;
2693
2694 mask &= (1 << tesslevel_outer_components(tcs_key->tes_primitive_mode)) - 1;
2695
2696 if (tcs_key->tes_primitive_mode == GL_ISOLINES) {
2697 /* Isolines .xy should be stored in .zw, in order. */
2698 swiz = BRW_SWIZZLE4(0, 0, 0, 1);
2699 mask <<= 2;
2700 } else {
2701 /* Other domains are reversed; store .wzyx instead of .xyzw */
2702 swiz = BRW_SWIZZLE_WZYX;
2703 mask = writemask_for_backwards_vector(mask);
2704 }
2705 }
2706 }
2707
2708 if (mask == 0)
2709 break;
2710
2711 unsigned num_components = util_last_bit(mask);
2712 enum opcode opcode;
2713
2714 /* We can only pack two 64-bit components in a single message, so send
2715 * 2 messages if we have more components
2716 */
2717 unsigned num_iterations = 1;
2718 unsigned iter_components = num_components;
2719 unsigned first_component = nir_intrinsic_component(instr);
2720 if (is_64bit) {
2721 first_component = first_component / 2;
2722 if (instr->num_components > 2) {
2723 num_iterations = 2;
2724 iter_components = 2;
2725 }
2726 }
2727
2728 /* 64-bit data needs to be shuffled before we can write it to the URB.
2729 * We will use this temporary to shuffle the components in each
2730 * iteration.
2731 */
2732 fs_reg tmp =
2733 fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
2734
2735 mask = mask << first_component;
2736
2737 for (unsigned iter = 0; iter < num_iterations; iter++) {
2738 if (!is_64bit && mask != WRITEMASK_XYZW) {
2739 srcs[header_regs++] = brw_imm_ud(mask << 16);
2740 opcode = indirect_offset.file != BAD_FILE ?
2741 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2742 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2743 } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2744 /* Expand the 64-bit mask to 32-bit channels. We only handle
2745 * two channels in each iteration, so we only care about X/Y.
2746 */
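/* For example (ignoring first_component), a 64-bit writemask of .xz
 * expands to a 32-bit .xy mask for the first message; after the mask is
 * shifted right by two at the end of the iteration, the remaining .x bit
 * again expands to .xy for the second message.
 */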
2747 unsigned mask32 = 0;
2748 if (mask & WRITEMASK_X)
2749 mask32 |= WRITEMASK_XY;
2750 if (mask & WRITEMASK_Y)
2751 mask32 |= WRITEMASK_ZW;
2752
2753 /* If the mask does not include any of the channels X or Y there
2754 * is nothing to do in this iteration. Move on to the next couple
2755 * of 64-bit channels.
2756 */
2757 if (!mask32) {
2758 mask >>= 2;
2759 imm_offset++;
2760 continue;
2761 }
2762
2763 srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2764 opcode = indirect_offset.file != BAD_FILE ?
2765 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2766 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2767 } else {
2768 opcode = indirect_offset.file != BAD_FILE ?
2769 SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2770 SHADER_OPCODE_URB_WRITE_SIMD8;
2771 }
2772
2773 for (unsigned i = 0; i < iter_components; i++) {
2774 if (!(mask & (1 << (i + first_component))))
2775 continue;
2776
2777 if (!is_64bit) {
2778 srcs[header_regs + i + first_component] =
2779 offset(value, bld, BRW_GET_SWZ(swiz, i));
2780 } else {
2781 /* We need to shuffle the 64-bit data to match the layout
2782 * expected by our 32-bit URB write messages. We use a temporary
2783 * for that.
2784 */
2785 unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
2786 shuffle_64bit_data_for_32bit_write(bld,
2787 retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
2788 retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
2789 1);
2790
2791 /* Now copy the data to the destination */
2792 fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
2793 unsigned idx = 2 * i;
2794 bld.MOV(dest, offset(tmp, bld, idx));
2795 bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
2796 srcs[header_regs + idx + first_component * 2] = dest;
2797 srcs[header_regs + idx + 1 + first_component * 2] =
2798 offset(dest, bld, 1);
2799 }
2800 }
2801
2802 unsigned mlen =
2803 header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2804 (is_64bit ? 2 * first_component : first_component);
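/* For example, a masked 32-bit write of .xyz with a constant offset
 * (assuming first_component == 0) uses header_regs == 2 (patch handle +
 * channel mask immediate), so mlen == 2 + 3 == 5 and the LOAD_PAYLOAD
 * below packs three data components after the two header registers.
 */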
2805 fs_reg payload =
2806 bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2807 bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2808
2809 fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2810 inst->offset = imm_offset;
2811 inst->mlen = mlen;
2812
2813 /* If this is a 64-bit attribute, select the next two 64-bit channels
2814 * to be handled in the next iteration.
2815 */
2816 if (is_64bit) {
2817 mask >>= 2;
2818 imm_offset++;
2819 }
2820 }
2821 break;
2822 }
2823
2824 default:
2825 nir_emit_intrinsic(bld, instr);
2826 break;
2827 }
2828 }
2829
2830 void
2831 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2832 nir_intrinsic_instr *instr)
2833 {
2834 assert(stage == MESA_SHADER_TESS_EVAL);
2835 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2836
2837 fs_reg dest;
2838 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2839 dest = get_nir_dest(instr->dest);
2840
2841 switch (instr->intrinsic) {
2842 case nir_intrinsic_load_primitive_id:
2843 bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2844 break;
2845 case nir_intrinsic_load_tess_coord:
2846 /* gl_TessCoord is part of the payload in g1-3 */
2847 for (unsigned i = 0; i < 3; i++) {
2848 bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2849 }
2850 break;
2851
2852 case nir_intrinsic_load_tess_level_outer:
2853 /* When the TES reads gl_TessLevelOuter, we ensure that the patch header
2854 * appears as a push-model input. So, we can simply use the ATTR file
2855 * rather than issuing URB read messages. The data is stored in the
2856 * high DWords in reverse order - DWord 7 contains .x, DWord 6 contains
2857 * .y, and so on.
2858 */
2859 switch (tes_prog_data->domain) {
2860 case BRW_TESS_DOMAIN_QUAD:
2861 for (unsigned i = 0; i < 4; i++)
2862 bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
2863 break;
2864 case BRW_TESS_DOMAIN_TRI:
2865 for (unsigned i = 0; i < 3; i++)
2866 bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
2867 break;
2868 case BRW_TESS_DOMAIN_ISOLINE:
2869 for (unsigned i = 0; i < 2; i++)
2870 bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 6 + i));
2871 break;
2872 }
2873 break;
2874
2875 case nir_intrinsic_load_tess_level_inner:
2876 /* When the TES reads gl_TessLevelInner, we ensure that the patch header
2877 * appears as a push-model input. So, we can simply use the ATTR file
2878 * rather than issuing URB read messages.
2879 */
2880 switch (tes_prog_data->domain) {
2881 case BRW_TESS_DOMAIN_QUAD:
2882 bld.MOV(dest, component(fs_reg(ATTR, 0), 3));
2883 bld.MOV(offset(dest, bld, 1), component(fs_reg(ATTR, 0), 2));
2884 break;
2885 case BRW_TESS_DOMAIN_TRI:
2886 bld.MOV(dest, component(fs_reg(ATTR, 0), 4));
2887 break;
2888 case BRW_TESS_DOMAIN_ISOLINE:
2889 /* ignore - value is undefined */
2890 break;
2891 }
2892 break;
2893
2894 case nir_intrinsic_load_input:
2895 case nir_intrinsic_load_per_vertex_input: {
2896 fs_reg indirect_offset = get_indirect_offset(instr);
2897 unsigned imm_offset = instr->const_index[0];
2898 unsigned first_component = nir_intrinsic_component(instr);
2899
2900 if (type_sz(dest.type) == 8) {
2901 first_component = first_component / 2;
2902 }
2903
2904 fs_inst *inst;
2905 if (indirect_offset.file == BAD_FILE) {
2906 /* Arbitrarily only push up to 32 vec4 slots worth of data,
2907 * which is 16 registers (since each holds 2 vec4 slots).
2908 */
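/* For example (assuming first_component == 0), a 32-bit vec4 input at
 * slot 5 lives in the second half of ATTR register 5 / 2 == 2, so the
 * loop below reads components 4..7 of that register
 * (16 / 4 * (5 % 2) == 4).
 */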
2909 const unsigned max_push_slots = 32;
2910 if (imm_offset < max_push_slots) {
2911 fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
2912 for (int i = 0; i < instr->num_components; i++) {
2913 unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
2914 i + first_component;
2915 bld.MOV(offset(dest, bld, i), component(src, comp));
2916 }
2917 tes_prog_data->base.urb_read_length =
2918 MAX2(tes_prog_data->base.urb_read_length,
2919 DIV_ROUND_UP(imm_offset + 1, 2));
2920 } else {
2921 /* Replicate the patch handle to all enabled channels */
2922 const fs_reg srcs[] = {
2923 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
2924 };
2925 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2926 bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
2927
2928 if (first_component != 0) {
2929 unsigned read_components =
2930 instr->num_components + first_component;
2931 fs_reg tmp = bld.vgrf(dest.type, read_components);
2932 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2933 patch_handle);
2934 inst->size_written = read_components * REG_SIZE;
2935 for (unsigned i = 0; i < instr->num_components; i++) {
2936 bld.MOV(offset(dest, bld, i),
2937 offset(tmp, bld, i + first_component));
2938 }
2939 } else {
2940 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
2941 patch_handle);
2942 inst->size_written = instr->num_components * REG_SIZE;
2943 }
2944 inst->mlen = 1;
2945 inst->offset = imm_offset;
2946 }
2947 } else {
2948 /* Indirect indexing - use per-slot offsets as well. */
2949
2950 /* We can only read two double components with each URB read, so
2951 * we send two read messages in that case, each one loading up to
2952 * two double components.
2953 */
2954 unsigned num_iterations = 1;
2955 unsigned num_components = instr->num_components;
2956 fs_reg orig_dest = dest;
2957 if (type_sz(dest.type) == 8) {
2958 if (instr->num_components > 2) {
2959 num_iterations = 2;
2960 num_components = 2;
2961 }
2962 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
2963 dest = tmp;
2964 }
2965
2966 for (unsigned iter = 0; iter < num_iterations; iter++) {
2967 const fs_reg srcs[] = {
2968 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2969 indirect_offset
2970 };
2971 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2972 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2973
2974 if (first_component != 0) {
2975 unsigned read_components =
2976 num_components + first_component;
2977 fs_reg tmp = bld.vgrf(dest.type, read_components);
2978 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2979 payload);
2980 for (unsigned i = 0; i < num_components; i++) {
2981 bld.MOV(offset(dest, bld, i),
2982 offset(tmp, bld, i + first_component));
2983 }
2984 } else {
2985 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
2986 payload);
2987 }
2988 inst->mlen = 2;
2989 inst->offset = imm_offset;
2990 inst->size_written = (num_components + first_component) *
2991 inst->dst.component_size(inst->exec_size);
2992
2993 /* If we are reading 64-bit data using 32-bit read messages we need
2994 * to build proper 64-bit data elements by shuffling the low and high
2995 * 32-bit components around like we do for other things like UBOs
2996 * or SSBOs.
2997 */
2998 if (type_sz(dest.type) == 8) {
2999 shuffle_32bit_load_result_to_64bit_data(
3000 bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
3001
3002 for (unsigned c = 0; c < num_components; c++) {
3003 bld.MOV(offset(orig_dest, bld, iter * 2 + c),
3004 offset(dest, bld, c));
3005 }
3006 }
3007
3008 /* If we are loading double data and we need a second read message,
3009 * adjust the offset.
3010 */
3011 if (num_iterations > 1) {
3012 num_components = instr->num_components - 2;
3013 imm_offset++;
3014 }
3015 }
3016 }
3017 break;
3018 }
3019 default:
3020 nir_emit_intrinsic(bld, instr);
3021 break;
3022 }
3023 }
3024
3025 void
3026 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
3027 nir_intrinsic_instr *instr)
3028 {
3029 assert(stage == MESA_SHADER_GEOMETRY);
3030 fs_reg indirect_offset;
3031
3032 fs_reg dest;
3033 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3034 dest = get_nir_dest(instr->dest);
3035
3036 switch (instr->intrinsic) {
3037 case nir_intrinsic_load_primitive_id:
3038 assert(stage == MESA_SHADER_GEOMETRY);
3039 assert(brw_gs_prog_data(prog_data)->include_primitive_id);
3040 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
3041 retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
3042 break;
3043
3044 case nir_intrinsic_load_input:
3045 unreachable("load_input intrinsics are invalid for the GS stage");
3046
3047 case nir_intrinsic_load_per_vertex_input:
3048 emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
3049 instr->src[1], instr->num_components,
3050 nir_intrinsic_component(instr));
3051 break;
3052
3053 case nir_intrinsic_emit_vertex_with_counter:
3054 emit_gs_vertex(instr->src[0], instr->const_index[0]);
3055 break;
3056
3057 case nir_intrinsic_end_primitive_with_counter:
3058 emit_gs_end_primitive(instr->src[0]);
3059 break;
3060
3061 case nir_intrinsic_set_vertex_count:
3062 bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
3063 break;
3064
3065 case nir_intrinsic_load_invocation_id: {
3066 fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
3067 assert(val.file != BAD_FILE);
3068 dest.type = val.type;
3069 bld.MOV(dest, val);
3070 break;
3071 }
3072
3073 default:
3074 nir_emit_intrinsic(bld, instr);
3075 break;
3076 }
3077 }
3078
3079 /**
3080 * Fetch the current render target layer index.
3081 */
3082 static fs_reg
3083 fetch_render_target_array_index(const fs_builder &bld)
3084 {
3085 if (bld.shader->devinfo->gen >= 6) {
3086 /* The render target array index is provided in the thread payload as
3087 * bits 26:16 of r0.0.
3088 */
3089 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3090 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3091 brw_imm_uw(0x7ff));
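/* (The 0x7ff mask keeps 11 bits of the word that was read, i.e. the
 * bits 26:16 described above.)
 */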
3092 return idx;
3093 } else {
3094 /* Pre-SNB we only ever render into the first layer of the framebuffer
3095 * since layered rendering is not implemented.
3096 */
3097 return brw_imm_ud(0);
3098 }
3099 }
3100
3101 /**
3102 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3103 * framebuffer at the current fragment coordinates and sample index.
3104 */
3105 fs_inst *
3106 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3107 unsigned target)
3108 {
3109 const struct gen_device_info *devinfo = bld.shader->devinfo;
3110
3111 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3112 const brw_wm_prog_key *wm_key =
3113 reinterpret_cast<const brw_wm_prog_key *>(key);
3114 assert(!wm_key->coherent_fb_fetch);
3115 const struct brw_wm_prog_data *wm_prog_data =
3116 brw_wm_prog_data(stage_prog_data);
3117
3118 /* Calculate the surface index relative to the start of the texture binding
3119 * table block, since that's what the texturing messages expect.
3120 */
3121 const unsigned surface = target +
3122 wm_prog_data->binding_table.render_target_read_start -
3123 wm_prog_data->base.binding_table.texture_start;
3124
3125 brw_mark_surface_used(
3126 bld.shader->stage_prog_data,
3127 wm_prog_data->binding_table.render_target_read_start + target);
3128
3129 /* Calculate the fragment coordinates. */
3130 const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3131 bld.MOV(offset(coords, bld, 0), pixel_x);
3132 bld.MOV(offset(coords, bld, 1), pixel_y);
3133 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3134
3135 /* Calculate the sample index and MCS payload when multisampling. Luckily
3136 * the MCS fetch message behaves deterministically for UMS surfaces, so it
3137 * shouldn't be necessary to recompile based on whether the framebuffer is
3138 * CMS or UMS.
3139 */
3140 if (wm_key->multisample_fbo &&
3141 nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3142 nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
3143
3144 const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3145 const fs_reg mcs = wm_key->multisample_fbo ?
3146 emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
3147
3148 /* Use either a normal or a CMS texel fetch message depending on whether
3149 * the framebuffer is single or multisample. On SKL+ use the wide CMS
3150 * message just in case the framebuffer uses 16x multisampling; it should
3151 * be equivalent to the normal CMS fetch for lower multisampling modes.
3152 */
3153 const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
3154 devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
3155 SHADER_OPCODE_TXF_CMS_LOGICAL;
3156
3157 /* Emit the instruction. */
3158 const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
3159 sample, mcs,
3160 brw_imm_ud(surface), brw_imm_ud(0),
3161 fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
3162 STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
3163
3164 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3165 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3166
3167 return inst;
3168 }
3169
3170 /**
3171 * Actual coherent framebuffer read implemented using the native render target
3172 * read message. Requires SKL+.
3173 */
3174 static fs_inst *
3175 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3176 {
3177 assert(bld.shader->devinfo->gen >= 9);
3178 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3179 inst->target = target;
3180 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3181
3182 return inst;
3183 }
3184
3185 static fs_reg
3186 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3187 {
3188 if (n && regs[0].file != BAD_FILE) {
3189 return regs[0];
3190
3191 } else {
3192 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3193
3194 for (unsigned i = 0; i < n; i++)
3195 regs[i] = tmp;
3196
3197 return tmp;
3198 }
3199 }
3200
3201 static fs_reg
3202 alloc_frag_output(fs_visitor *v, unsigned location)
3203 {
3204 assert(v->stage == MESA_SHADER_FRAGMENT);
3205 const brw_wm_prog_key *const key =
3206 reinterpret_cast<const brw_wm_prog_key *>(v->key);
3207 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3208 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3209
3210 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3211 return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3212
3213 else if (l == FRAG_RESULT_COLOR)
3214 return alloc_temporary(v->bld, 4, v->outputs,
3215 MAX2(key->nr_color_regions, 1));
3216
3217 else if (l == FRAG_RESULT_DEPTH)
3218 return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3219
3220 else if (l == FRAG_RESULT_STENCIL)
3221 return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3222
3223 else if (l == FRAG_RESULT_SAMPLE_MASK)
3224 return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3225
3226 else if (l >= FRAG_RESULT_DATA0 &&
3227 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3228 return alloc_temporary(v->bld, 4,
3229 &v->outputs[l - FRAG_RESULT_DATA0], 1);
3230
3231 else
3232 unreachable("Invalid location");
3233 }
3234
3235 void
3236 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3237 nir_intrinsic_instr *instr)
3238 {
3239 assert(stage == MESA_SHADER_FRAGMENT);
3240
3241 fs_reg dest;
3242 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3243 dest = get_nir_dest(instr->dest);
3244
3245 switch (instr->intrinsic) {
3246 case nir_intrinsic_load_front_face:
3247 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3248 *emit_frontfacing_interpolation());
3249 break;
3250
3251 case nir_intrinsic_load_sample_pos: {
3252 fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3253 assert(sample_pos.file != BAD_FILE);
3254 dest.type = sample_pos.type;
3255 bld.MOV(dest, sample_pos);
3256 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3257 break;
3258 }
3259
3260 case nir_intrinsic_load_helper_invocation:
3261 case nir_intrinsic_load_sample_mask_in:
3262 case nir_intrinsic_load_sample_id: {
3263 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3264 fs_reg val = nir_system_values[sv];
3265 assert(val.file != BAD_FILE);
3266 dest.type = val.type;
3267 bld.MOV(dest, val);
3268 break;
3269 }
3270
3271 case nir_intrinsic_store_output: {
3272 const fs_reg src = get_nir_src(instr->src[0]);
3273 const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3274 assert(const_offset && "Indirect output stores not allowed");
3275 const unsigned location = nir_intrinsic_base(instr) +
3276 SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
3277 const fs_reg new_dest = retype(alloc_frag_output(this, location),
3278 src.type);
3279
3280 for (unsigned j = 0; j < instr->num_components; j++)
3281 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3282 offset(src, bld, j));
3283
3284 break;
3285 }
3286
3287 case nir_intrinsic_load_output: {
3288 const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3289 BRW_NIR_FRAG_OUTPUT_LOCATION);
3290 assert(l >= FRAG_RESULT_DATA0);
3291 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3292 assert(const_offset && "Indirect output loads not allowed");
3293 const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
3294 const fs_reg tmp = bld.vgrf(dest.type, 4);
3295
3296 if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3297 emit_coherent_fb_read(bld, tmp, target);
3298 else
3299 emit_non_coherent_fb_read(bld, tmp, target);
3300
3301 for (unsigned j = 0; j < instr->num_components; j++) {
3302 bld.MOV(offset(dest, bld, j),
3303 offset(tmp, bld, nir_intrinsic_component(instr) + j));
3304 }
3305
3306 break;
3307 }
3308
3309 case nir_intrinsic_discard:
3310 case nir_intrinsic_discard_if: {
3311 /* We track our discarded pixels in f0.1. By predicating on it, we can
3312 * update just the flag bits that aren't yet discarded. If there's no
3313 * condition, we emit a CMP of g0 != g0, so all currently executing
3314 * channels will get turned off.
3315 */
3316 fs_inst *cmp;
3317 if (instr->intrinsic == nir_intrinsic_discard_if) {
3318 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3319 brw_imm_d(0), BRW_CONDITIONAL_Z);
3320 } else {
3321 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3322 BRW_REGISTER_TYPE_UW));
3323 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3324 }
3325 cmp->predicate = BRW_PREDICATE_NORMAL;
3326 cmp->flag_subreg = 1;
3327
3328 if (devinfo->gen >= 6) {
3329 emit_discard_jump();
3330 }
3331 break;
3332 }
3333
3334 case nir_intrinsic_load_input: {
3335 /* load_input is only used for flat inputs */
3336 unsigned base = nir_intrinsic_base(instr);
3337 unsigned component = nir_intrinsic_component(instr);
3338 unsigned num_components = instr->num_components;
3339 enum brw_reg_type type = dest.type;
3340
3341 /* Special case fields in the VUE header */
3342 if (base == VARYING_SLOT_LAYER)
3343 component = 1;
3344 else if (base == VARYING_SLOT_VIEWPORT)
3345 component = 2;
3346
3347 if (nir_dest_bit_size(instr->dest) == 64) {
3348 /* const_index is in units of 32-bit components, which do not line up
3349 * with DF. We need to read the double vector as if it were a float
3350 * vector of twice the number of components to fetch the right data.
3351 */
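/* For example, a flat dvec2 input ends up with num_components == 4 here:
 * the four 32-bit halves are interpolated as floats below and then
 * re-packed into two 64-bit components by the shuffle at the end.
 */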
3352 type = BRW_REGISTER_TYPE_F;
3353 num_components *= 2;
3354 }
3355
3356 for (unsigned int i = 0; i < num_components; i++) {
3357 struct brw_reg interp = interp_reg(base, component + i);
3358 interp = suboffset(interp, 3);
3359 bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
3360 retype(fs_reg(interp), type));
3361 }
3362
3363 if (nir_dest_bit_size(instr->dest) == 64) {
3364 shuffle_32bit_load_result_to_64bit_data(bld,
3365 dest,
3366 retype(dest, type),
3367 instr->num_components);
3368 }
3369 break;
3370 }
3371
3372 case nir_intrinsic_load_barycentric_pixel:
3373 case nir_intrinsic_load_barycentric_centroid:
3374 case nir_intrinsic_load_barycentric_sample:
3375 /* Do nothing - load_interpolated_input handling will handle it later. */
3376 break;
3377
3378 case nir_intrinsic_load_barycentric_at_sample: {
3379 const glsl_interp_mode interpolation =
3380 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3381
3382 nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
3383
3384 if (const_sample) {
3385 unsigned msg_data = const_sample->i32[0] << 4;
3386
3387 emit_pixel_interpolater_send(bld,
3388 FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3389 dest,
3390 fs_reg(), /* src */
3391 brw_imm_ud(msg_data),
3392 interpolation);
3393 } else {
3394 const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3395 BRW_REGISTER_TYPE_UD);
3396
3397 if (nir_src_is_dynamically_uniform(instr->src[0])) {
3398 const fs_reg sample_id = bld.emit_uniformize(sample_src);
3399 const fs_reg msg_data = vgrf(glsl_type::uint_type);
3400 bld.exec_all().group(1, 0)
3401 .SHL(msg_data, sample_id, brw_imm_ud(4u));
3402 emit_pixel_interpolater_send(bld,
3403 FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3404 dest,
3405 fs_reg(), /* src */
3406 msg_data,
3407 interpolation);
3408 } else {
3409 /* Make a loop that sends a message to the pixel interpolater
3410 * for the sample number in each live channel. If there are
3411 * multiple channels with the same sample number then these
3412 * will be handled simultaneously with a single iteration of
3413 * the loop.
3414 */
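/* For example, if the live channels request sample numbers {0, 0, 1, 1},
 * the first iteration uniformizes one of those values, sends the
 * interpolator message for the matching channels, and the inverted
 * WHILE predicate keeps only the other channels live, so a second
 * iteration finishes the job.
 */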
3415 bld.emit(BRW_OPCODE_DO);
3416
3417 /* Get the next live sample number into sample_id */
3418 const fs_reg sample_id = bld.emit_uniformize(sample_src);
3419
3420 /* Set the flag register so that we can perform the send
3421 * message on all channels that have the same sample number
3422 */
3423 bld.CMP(bld.null_reg_ud(),
3424 sample_src, sample_id,
3425 BRW_CONDITIONAL_EQ);
3426 const fs_reg msg_data = vgrf(glsl_type::uint_type);
3427 bld.exec_all().group(1, 0)
3428 .SHL(msg_data, sample_id, brw_imm_ud(4u));
3429 fs_inst *inst =
3430 emit_pixel_interpolater_send(bld,
3431 FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3432 dest,
3433 fs_reg(), /* src */
3434 msg_data,
3435 interpolation);
3436 set_predicate(BRW_PREDICATE_NORMAL, inst);
3437
3438 /* Continue the loop if there are any live channels left */
3439 set_predicate_inv(BRW_PREDICATE_NORMAL,
3440 true, /* inverse */
3441 bld.emit(BRW_OPCODE_WHILE));
3442 }
3443 }
3444 break;
3445 }
3446
3447 case nir_intrinsic_load_barycentric_at_offset: {
3448 const glsl_interp_mode interpolation =
3449 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3450
3451 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3452
3453 if (const_offset) {
3454 unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
3455 unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
3456
3457 emit_pixel_interpolater_send(bld,
3458 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3459 dest,
3460 fs_reg(), /* src */
3461 brw_imm_ud(off_x | (off_y << 4)),
3462 interpolation);
3463 } else {
3464 fs_reg src = vgrf(glsl_type::ivec2_type);
3465 fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3466 BRW_REGISTER_TYPE_F);
3467 for (int i = 0; i < 2; i++) {
3468 fs_reg temp = vgrf(glsl_type::float_type);
3469 bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3470 fs_reg itemp = vgrf(glsl_type::int_type);
3471 /* float to int */
3472 bld.MOV(itemp, temp);
3473
3474 /* Clamp the upper end of the range to +7/16.
3475 * ARB_gpu_shader5 requires that we support a maximum offset
3476 * of +0.5, which isn't representable in a S0.4 value -- if
3477 * we didn't clamp it, we'd end up with -8/16, which is the
3478 * opposite of what the shader author wanted.
3479 *
3480 * This is legal due to ARB_gpu_shader5's quantization
3481 * rules:
3482 *
3483 * "Not all values of <offset> may be supported; x and y
3484 * offsets may be rounded to fixed-point values with the
3485 * number of fraction bits given by the
3486 * implementation-dependent constant
3487 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3488 */
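/* Worked example: an offset of +0.5 scales to 8, which the SEL below
 * clamps to 7 (+7/16). Without the clamp, 8 would wrap around in the
 * signed S0.4 field and be interpreted as -8/16.
 */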
3489 set_condmod(BRW_CONDITIONAL_L,
3490 bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3491 }
3492
3493 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3494 emit_pixel_interpolater_send(bld,
3495 opcode,
3496 dest,
3497 src,
3498 brw_imm_ud(0u),
3499 interpolation);
3500 }
3501 break;
3502 }
3503
3504 case nir_intrinsic_load_interpolated_input: {
3505 if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3506 emit_fragcoord_interpolation(dest);
3507 break;
3508 }
3509
3510 assert(instr->src[0].ssa &&
3511 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3512 nir_intrinsic_instr *bary_intrinsic =
3513 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3514 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3515 enum glsl_interp_mode interp_mode =
3516 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3517 fs_reg dst_xy;
3518
3519 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3520 bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3521 /* Use the result of the PI message */
3522 dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3523 } else {
3524 /* Use the delta_xy values computed from the payload */
3525 enum brw_barycentric_mode bary =
3526 brw_barycentric_mode(interp_mode, bary_intrin);
3527
3528 dst_xy = this->delta_xy[bary];
3529 }
3530
3531 for (unsigned int i = 0; i < instr->num_components; i++) {
3532 fs_reg interp =
3533 fs_reg(interp_reg(nir_intrinsic_base(instr),
3534 nir_intrinsic_component(instr) + i));
3535 interp.type = BRW_REGISTER_TYPE_F;
3536 dest.type = BRW_REGISTER_TYPE_F;
3537
3538 if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3539 fs_reg tmp = vgrf(glsl_type::float_type);
3540 bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3541 bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3542 } else {
3543 bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3544 }
3545 }
3546 break;
3547 }
3548
3549 default:
3550 nir_emit_intrinsic(bld, instr);
3551 break;
3552 }
3553 }
3554
3555 void
3556 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3557 nir_intrinsic_instr *instr)
3558 {
3559 assert(stage == MESA_SHADER_COMPUTE);
3560 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3561
3562 fs_reg dest;
3563 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3564 dest = get_nir_dest(instr->dest);
3565
3566 switch (instr->intrinsic) {
3567 case nir_intrinsic_barrier:
3568 emit_barrier();
3569 cs_prog_data->uses_barrier = true;
3570 break;
3571
3572 case nir_intrinsic_load_local_invocation_id:
3573 case nir_intrinsic_load_work_group_id: {
3574 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3575 fs_reg val = nir_system_values[sv];
3576 assert(val.file != BAD_FILE);
3577 dest.type = val.type;
3578 for (unsigned i = 0; i < 3; i++)
3579 bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3580 break;
3581 }
3582
3583 case nir_intrinsic_load_num_work_groups: {
3584 const unsigned surface =
3585 cs_prog_data->binding_table.work_groups_start;
3586
3587 cs_prog_data->uses_num_work_groups = true;
3588
3589 fs_reg surf_index = brw_imm_ud(surface);
3590 brw_mark_surface_used(prog_data, surface);
3591
3592 /* Read the 3 GLuint components of gl_NumWorkGroups */
3593 for (unsigned i = 0; i < 3; i++) {
3594 fs_reg read_result =
3595 emit_untyped_read(bld, surf_index,
3596 brw_imm_ud(i << 2),
3597 1 /* dims */, 1 /* size */,
3598 BRW_PREDICATE_NONE);
3599 read_result.type = dest.type;
3600 bld.MOV(dest, read_result);
3601 dest = offset(dest, bld, 1);
3602 }
3603 break;
3604 }
3605
3606 case nir_intrinsic_shared_atomic_add:
3607 nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr);
3608 break;
3609 case nir_intrinsic_shared_atomic_imin:
3610 nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3611 break;
3612 case nir_intrinsic_shared_atomic_umin:
3613 nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3614 break;
3615 case nir_intrinsic_shared_atomic_imax:
3616 nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3617 break;
3618 case nir_intrinsic_shared_atomic_umax:
3619 nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3620 break;
3621 case nir_intrinsic_shared_atomic_and:
3622 nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3623 break;
3624 case nir_intrinsic_shared_atomic_or:
3625 nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3626 break;
3627 case nir_intrinsic_shared_atomic_xor:
3628 nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3629 break;
3630 case nir_intrinsic_shared_atomic_exchange:
3631 nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3632 break;
3633 case nir_intrinsic_shared_atomic_comp_swap:
3634 nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3635 break;
3636
3637 case nir_intrinsic_load_shared: {
3638 assert(devinfo->gen >= 7);
3639
3640 fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3641
3642 /* Get the offset to read from */
3643 fs_reg offset_reg;
3644 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3645 if (const_offset) {
3646 offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
3647 } else {
3648 offset_reg = vgrf(glsl_type::uint_type);
3649 bld.ADD(offset_reg,
3650 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
3651 brw_imm_ud(instr->const_index[0]));
3652 }
3653
3654 /* Read the vector */
3655 do_untyped_vector_read(bld, dest, surf_index, offset_reg,
3656 instr->num_components);
3657 break;
3658 }
3659
3660 case nir_intrinsic_store_shared: {
3661 assert(devinfo->gen >= 7);
3662
3663 /* Block index */
3664 fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3665
3666 /* Value */
3667 fs_reg val_reg = get_nir_src(instr->src[0]);
3668
3669 /* Writemask */
3670 unsigned writemask = instr->const_index[1];
3671
3672 /* get_nir_src() retypes to integer. Be wary of 64-bit types though
3673 * since the untyped writes below operate in units of 32 bits, which
3674 * means that we need to write twice as many components each time.
3675 * Also, we have to shuffle 64-bit data to be in the appropriate layout
3676 * expected by our 32-bit write messages.
3677 */
3678 unsigned type_size = 4;
3679 unsigned bit_size = instr->src[0].is_ssa ?
3680 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
3681 if (bit_size == 64) {
3682 type_size = 8;
3683 fs_reg tmp =
3684 fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
3685 shuffle_64bit_data_for_32bit_write(
3686 bld,
3687 retype(tmp, BRW_REGISTER_TYPE_F),
3688 retype(val_reg, BRW_REGISTER_TYPE_DF),
3689 instr->num_components);
3690 val_reg = tmp;
3691 }
3692
3693 unsigned type_slots = type_size / 4;
3694
3695 /* Combine groups of consecutive enabled channels in one write
3696 * message. We use ffs to find the first enabled channel and then ffs on
3697 * the bit-inverse, down-shifted writemask to determine the length of
3698 * the block of enabled bits.
3699 */
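/* For example, a writemask of 0b1011 is split into two messages:
 * first_component = 0 with length 2 (components x and y), then
 * first_component = 3 with length 1 (component w).
 */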
3700 while (writemask) {
3701 unsigned first_component = ffs(writemask) - 1;
3702 unsigned length = ffs(~(writemask >> first_component)) - 1;
3703
3704 /* We can't write more than 2 64-bit components at once. Limit the
3705 * length of the write to what we can do and let the next iteration
3706 * handle the rest
3707 */
3708 if (type_size > 4)
3709 length = MIN2(2, length);
3710
3711 fs_reg offset_reg;
3712 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3713 if (const_offset) {
3714 offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
3715 type_size * first_component);
3716 } else {
3717 offset_reg = vgrf(glsl_type::uint_type);
3718 bld.ADD(offset_reg,
3719 retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
3720 brw_imm_ud(instr->const_index[0] + type_size * first_component));
3721 }
3722
3723 emit_untyped_write(bld, surf_index, offset_reg,
3724 offset(val_reg, bld, first_component * type_slots),
3725 1 /* dims */, length * type_slots,
3726 BRW_PREDICATE_NONE);
3727
3728 /* Clear the bits in the writemask that we just wrote, then try
3729 * again to see if more channels are left.
3730 */
3731 writemask &= (15 << (first_component + length));
3732 }
3733
3734 break;
3735 }
3736
3737 default:
3738 nir_emit_intrinsic(bld, instr);
3739 break;
3740 }
3741 }
3742
3743 void
3744 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3745 {
3746 fs_reg dest;
3747 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3748 dest = get_nir_dest(instr->dest);
3749
3750 switch (instr->intrinsic) {
3751 case nir_intrinsic_atomic_counter_inc:
3752 case nir_intrinsic_atomic_counter_dec:
3753 case nir_intrinsic_atomic_counter_read:
3754 case nir_intrinsic_atomic_counter_add:
3755 case nir_intrinsic_atomic_counter_min:
3756 case nir_intrinsic_atomic_counter_max:
3757 case nir_intrinsic_atomic_counter_and:
3758 case nir_intrinsic_atomic_counter_or:
3759 case nir_intrinsic_atomic_counter_xor:
3760 case nir_intrinsic_atomic_counter_exchange:
3761 case nir_intrinsic_atomic_counter_comp_swap: {
3762 if (stage == MESA_SHADER_FRAGMENT &&
3763 instr->intrinsic != nir_intrinsic_atomic_counter_read)
3764 brw_wm_prog_data(prog_data)->has_side_effects = true;
3765
3766 /* Get some metadata from the image intrinsic. */
3767 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3768
3769 /* Get the arguments of the atomic intrinsic. */
3770 const fs_reg offset = get_nir_src(instr->src[0]);
3771 const unsigned surface = (stage_prog_data->binding_table.abo_start +
3772 instr->const_index[0]);
3773 const fs_reg src0 = (info->num_srcs >= 2
3774 ? get_nir_src(instr->src[1]) : fs_reg());
3775 const fs_reg src1 = (info->num_srcs >= 3
3776 ? get_nir_src(instr->src[2]) : fs_reg());
3777 fs_reg tmp;
3778
3779 assert(info->num_srcs <= 3);
3780
3781 /* Emit a surface read or atomic op. */
3782 if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
3783 tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
3784 } else {
3785 tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
3786 src1, 1, 1,
3787 get_atomic_counter_op(instr->intrinsic));
3788 }
3789
3790 /* Assign the result. */
3791 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
3792
3793 /* Mark the surface as used. */
3794 brw_mark_surface_used(stage_prog_data, surface);
3795 break;
3796 }
3797
3798 case nir_intrinsic_image_load:
3799 case nir_intrinsic_image_store:
3800 case nir_intrinsic_image_atomic_add:
3801 case nir_intrinsic_image_atomic_min:
3802 case nir_intrinsic_image_atomic_max:
3803 case nir_intrinsic_image_atomic_and:
3804 case nir_intrinsic_image_atomic_or:
3805 case nir_intrinsic_image_atomic_xor:
3806 case nir_intrinsic_image_atomic_exchange:
3807 case nir_intrinsic_image_atomic_comp_swap: {
3808 using namespace image_access;
3809
3810 if (stage == MESA_SHADER_FRAGMENT &&
3811 instr->intrinsic != nir_intrinsic_image_load)
3812 brw_wm_prog_data(prog_data)->has_side_effects = true;
3813
3814 /* Get the referenced image variable and type. */
3815 const nir_variable *var = instr->variables[0]->var;
3816 const glsl_type *type = var->type->without_array();
3817 const brw_reg_type base_type = get_image_base_type(type);
3818
3819 /* Get some metadata from the image intrinsic. */
3820 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3821 const unsigned arr_dims = type->sampler_array ? 1 : 0;
3822 const unsigned surf_dims = type->coordinate_components() - arr_dims;
3823 const unsigned format = var->data.image.format;
3824
3825 /* Get the arguments of the image intrinsic. */
3826 const fs_reg image = get_nir_image_deref(instr->variables[0]);
3827 const fs_reg addr = retype(get_nir_src(instr->src[0]),
3828 BRW_REGISTER_TYPE_UD);
3829 const fs_reg src0 = (info->num_srcs >= 3 ?
3830 retype(get_nir_src(instr->src[2]), base_type) :
3831 fs_reg());
3832 const fs_reg src1 = (info->num_srcs >= 4 ?
3833 retype(get_nir_src(instr->src[3]), base_type) :
3834 fs_reg());
3835 fs_reg tmp;
3836
3837 /* Emit an image load, store or atomic op. */
3838 if (instr->intrinsic == nir_intrinsic_image_load)
3839 tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
3840
3841 else if (instr->intrinsic == nir_intrinsic_image_store)
3842 emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
3843 var->data.image.write_only ? GL_NONE : format);
3844
3845 else
3846 tmp = emit_image_atomic(bld, image, addr, src0, src1,
3847 surf_dims, arr_dims, info->dest_components,
3848 get_image_atomic_op(instr->intrinsic, type));
3849
3850 /* Assign the result. */
3851 for (unsigned c = 0; c < info->dest_components; ++c)
3852 bld.MOV(offset(retype(dest, base_type), bld, c),
3853 offset(tmp, bld, c));
3854 break;
3855 }
3856
3857 case nir_intrinsic_memory_barrier_atomic_counter:
3858 case nir_intrinsic_memory_barrier_buffer:
3859 case nir_intrinsic_memory_barrier_image:
3860 case nir_intrinsic_memory_barrier: {
3861 const fs_builder ubld = bld.group(8, 0);
3862 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3863 ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3864 ->size_written = 2 * REG_SIZE;
3865 break;
3866 }
3867
3868 case nir_intrinsic_group_memory_barrier:
3869 case nir_intrinsic_memory_barrier_shared:
3870 /* We treat these workgroup-level barriers as no-ops. This should be
3871 * safe at present and as long as:
3872 *
3873 * - Memory access instructions are not subsequently reordered by the
3874 * compiler back-end.
3875 *
3876 * - All threads from a given compute shader workgroup fit within a
3877 * single subslice and therefore talk to the same HDC shared unit
3878 * which supposedly guarantees ordering and coherency between threads
3879 * from the same workgroup. This may change in the future when we
3880 * start splitting workgroups across multiple subslices.
3881 *
3882 * - The context is not in fault-and-stream mode, which could cause
3883 * memory transactions (including to SLM) prior to the barrier to be
3884 * replayed after the barrier if a pagefault occurs. This shouldn't
3885 * be a problem up to and including SKL because fault-and-stream is
3886 * not usable due to hardware issues, but that's likely to change in
3887 * the future.
3888 */
3889 break;
3890
3891 case nir_intrinsic_shader_clock: {
3892 /* We cannot do anything if there is an event, so ignore it for now */
3893 const fs_reg shader_clock = get_timestamp(bld);
3894 const fs_reg srcs[] = { component(shader_clock, 0),
3895 component(shader_clock, 1) };
3896 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3897 break;
3898 }
3899
3900 case nir_intrinsic_image_size: {
3901 /* Get the referenced image variable and type. */
3902 const nir_variable *var = instr->variables[0]->var;
3903 const glsl_type *type = var->type->without_array();
3904
3905 /* Get the size of the image. */
3906 const fs_reg image = get_nir_image_deref(instr->variables[0]);
3907 const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
3908
3909 /* For 1DArray image types, the array index is stored in the Z component.
3910 * Fix this by swizzling the Z component to the Y component.
3911 */
3912 const bool is_1d_array_image =
3913 type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
3914 type->sampler_array;
3915
3916 /* For CubeArray images, we should count the number of cubes instead
3917 * of the number of faces. Fix it by dividing the Z component by 6.
3918 */
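/* For example, a cube map array whose stored Z size is 12 faces
 * reports 2 cubes after the INT_QUOTIENT by 6 below.
 */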
3919 const bool is_cube_array_image =
3920 type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
3921 type->sampler_array;
3922
3923 /* Copy all the components. */
3924 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3925 for (unsigned c = 0; c < info->dest_components; ++c) {
3926 if ((int)c >= type->coordinate_components()) {
3927 bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3928 brw_imm_d(1));
3929 } else if (c == 1 && is_1d_array_image) {
3930 bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3931 offset(size, bld, 2));
3932 } else if (c == 2 && is_cube_array_image) {
3933 bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3934 offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3935 offset(size, bld, c), brw_imm_d(6));
3936 } else {
3937 bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3938 offset(size, bld, c));
3939 }
3940 }
3941
3942 break;
3943 }
3944
3945 case nir_intrinsic_image_samples:
3946 /* The driver does not support multi-sampled images. */
3947 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
3948 break;
3949
3950 case nir_intrinsic_load_uniform: {
3951 /* Offsets are in bytes but they should always be multiples of 4 */
3952 assert(instr->const_index[0] % 4 == 0);
3953
3954 fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
3955
3956 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3957 if (const_offset) {
3958 /* Offsets are in bytes but they should always be multiples of 4 */
3959 assert(const_offset->u32[0] % 4 == 0);
3960 src.offset = const_offset->u32[0];
3961
3962 for (unsigned j = 0; j < instr->num_components; j++) {
3963 bld.MOV(offset(dest, bld, j), offset(src, bld, j));
3964 }
3965 } else {
3966 fs_reg indirect = retype(get_nir_src(instr->src[0]),
3967 BRW_REGISTER_TYPE_UD);
3968
3969 /* We need to pass a size to the MOV_INDIRECT but we don't want it to
3970 * go past the end of the uniform. In order to keep the n'th
3971 * component from running past, we subtract off the size of all but
3972 * one component of the vector.
3973 */
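/* Rough example, assuming const_index[1] holds the byte range of the
 * indirect access: for 32-bit vec4s spanning 128 bytes, read_size is
 * 128 - 3 * 4 = 116, which is just enough to cover the last component
 * of the last vec4 without reading past the end of the uniform.
 */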
3974 assert(instr->const_index[1] >=
3975 instr->num_components * (int) type_sz(dest.type));
3976 unsigned read_size = instr->const_index[1] -
3977 (instr->num_components - 1) * type_sz(dest.type);
3978
3979 fs_reg indirect_chv_high_32bit;
3980 bool is_chv_bxt_64bit =
3981 (devinfo->is_cherryview || devinfo->is_broxton) &&
3982 type_sz(dest.type) == 8;
3983 if (is_chv_bxt_64bit) {
3984 indirect_chv_high_32bit = vgrf(glsl_type::uint_type);
3985 /* Calculate indirect address to read high 32 bits */
3986 bld.ADD(indirect_chv_high_32bit, indirect, brw_imm_ud(4));
3987 }
3988
3989 for (unsigned j = 0; j < instr->num_components; j++) {
3990 if (!is_chv_bxt_64bit) {
3991 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3992 offset(dest, bld, j), offset(src, bld, j),
3993 indirect, brw_imm_ud(read_size));
3994 } else {
3995 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3996 subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, 0),
3997 offset(src, bld, j),
3998 indirect, brw_imm_ud(read_size));
3999
4000 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4001 subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, 1),
4002 offset(src, bld, j),
4003 indirect_chv_high_32bit, brw_imm_ud(read_size));
4004 }
4005 }
4006 }
4007 break;
4008 }
4009
4010 case nir_intrinsic_load_ubo: {
4011 nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
4012 fs_reg surf_index;
4013
4014 if (const_index) {
4015 const unsigned index = stage_prog_data->binding_table.ubo_start +
4016 const_index->u32[0];
4017 surf_index = brw_imm_ud(index);
4018 brw_mark_surface_used(prog_data, index);
4019 } else {
4020 /* The block index is not a constant. Evaluate the index expression
4021 * per-channel and add the base UBO index; we have to select a value
4022 * from any live channel.
4023 */
4024 surf_index = vgrf(glsl_type::uint_type);
4025 bld.ADD(surf_index, get_nir_src(instr->src[0]),
4026 brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4027 surf_index = bld.emit_uniformize(surf_index);
4028
4029 /* Assume this may touch any UBO. It would be nice to provide
4030 * a tighter bound, but the array information is already lowered away.
4031 */
4032 brw_mark_surface_used(prog_data,
4033 stage_prog_data->binding_table.ubo_start +
4034 nir->info->num_ubos - 1);
4035 }
4036
4037 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4038 if (const_offset == NULL) {
4039 fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4040 BRW_REGISTER_TYPE_UD);
4041
4042 for (int i = 0; i < instr->num_components; i++)
4043 VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4044 base_offset, i * type_sz(dest.type));
4045 } else {
4046 /* Even if we are loading doubles, a pull constant load will load
4047 * a 32-bit vec4, so we should only reserve vgrf space for that. If we
4048 * need to load a full dvec4 we will have to emit 2 loads. This is
4049 * similar to demote_pull_constants(), except that in that case we
4050 * see individual accesses to each component of the vector and then
4051 * we let CSE deal with duplicate loads. Here we see a vector access
4052 * and we have to split it if necessary.
4053 */
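/* For example, a dvec4 at a 16-byte aligned offset is handled in two
 * passes of this loop: each FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD brings
 * in a 16-byte chunk holding two doubles, so count is 2 both times.
 */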
4054 const unsigned type_size = type_sz(dest.type);
4055 const fs_reg packed_consts = bld.vgrf(BRW_REGISTER_TYPE_F);
4056 for (unsigned c = 0; c < instr->num_components;) {
4057 const unsigned base = const_offset->u32[0] + c * type_size;
4058
4059 /* Number of usable components in the next 16B-aligned load */
4060 const unsigned count = MIN2(instr->num_components - c,
4061 (16 - base % 16) / type_size);
4062
4063 bld.exec_all()
4064 .emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4065 packed_consts, surf_index, brw_imm_ud(base & ~15));
4066
4067 const fs_reg consts =
4068 retype(byte_offset(packed_consts, base & 15), dest.type);
4069
4070 for (unsigned d = 0; d < count; d++)
4071 bld.MOV(offset(dest, bld, c + d), component(consts, d));
4072
4073 c += count;
4074 }
4075 }
4076 break;
4077 }
4078
4079 case nir_intrinsic_load_ssbo: {
4080 assert(devinfo->gen >= 7);
4081
4082 nir_const_value *const_uniform_block =
4083 nir_src_as_const_value(instr->src[0]);
4084
4085 fs_reg surf_index;
4086 if (const_uniform_block) {
4087 unsigned index = stage_prog_data->binding_table.ssbo_start +
4088 const_uniform_block->u32[0];
4089 surf_index = brw_imm_ud(index);
4090 brw_mark_surface_used(prog_data, index);
4091 } else {
4092 surf_index = vgrf(glsl_type::uint_type);
4093 bld.ADD(surf_index, get_nir_src(instr->src[0]),
4094 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4095
4096 /* Assume this may touch any SSBO. It would be nice to provide
4097 * a tighter bound, but the array information is already lowered away.
4098 */
4099 brw_mark_surface_used(prog_data,
4100 stage_prog_data->binding_table.ssbo_start +
4101 nir->info->num_ssbos - 1);
4102 }
4103
4104 fs_reg offset_reg;
4105 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4106 if (const_offset) {
4107 offset_reg = brw_imm_ud(const_offset->u32[0]);
4108 } else {
4109 offset_reg = get_nir_src(instr->src[1]);
4110 }
4111
4112 /* Read the vector */
4113 do_untyped_vector_read(bld, dest, surf_index, offset_reg,
4114 instr->num_components);
4115
4116 break;
4117 }
4118
4119 case nir_intrinsic_store_ssbo: {
4120 assert(devinfo->gen >= 7);
4121
4122 if (stage == MESA_SHADER_FRAGMENT)
4123 brw_wm_prog_data(prog_data)->has_side_effects = true;
4124
4125 /* Block index */
4126 fs_reg surf_index;
4127 nir_const_value *const_uniform_block =
4128 nir_src_as_const_value(instr->src[1]);
4129 if (const_uniform_block) {
4130 unsigned index = stage_prog_data->binding_table.ssbo_start +
4131 const_uniform_block->u32[0];
4132 surf_index = brw_imm_ud(index);
4133 brw_mark_surface_used(prog_data, index);
4134 } else {
4135 surf_index = vgrf(glsl_type::uint_type);
4136 bld.ADD(surf_index, get_nir_src(instr->src[1]),
4137 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4138
4139 brw_mark_surface_used(prog_data,
4140 stage_prog_data->binding_table.ssbo_start +
4141 nir->info->num_ssbos - 1);
4142 }
4143
4144 /* Value */
4145 fs_reg val_reg = get_nir_src(instr->src[0]);
4146
4147 /* Writemask */
4148 unsigned writemask = instr->const_index[0];
4149
4150 /* get_nir_src() retypes to integer. Be wary of 64-bit types though
4151 * since the untyped writes below operate in units of 32 bits, which
4152 * means that we need to write twice as many components each time.
4153 * Also, we have to shuffle 64-bit data to be in the appropriate layout
4154 * expected by our 32-bit write messages.
4155 */
4156 unsigned type_size = 4;
4157 unsigned bit_size = instr->src[0].is_ssa ?
4158 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
4159 if (bit_size == 64) {
4160 type_size = 8;
4161 fs_reg tmp =
4162 fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
4163 shuffle_64bit_data_for_32bit_write(bld,
4164 retype(tmp, BRW_REGISTER_TYPE_F),
4165 retype(val_reg, BRW_REGISTER_TYPE_DF),
4166 instr->num_components);
4167 val_reg = tmp;
4168 }
4169
4170 unsigned type_slots = type_size / 4;
4171
4172 /* Combine groups of consecutive enabled channels in one write
4173 * message. We use ffs to find the first enabled channel and then ffs on
4174 * the bit-inverse, down-shifted writemask to determine the length of
4175 * the block of enabled bits.
4176 */
4177 while (writemask) {
4178 unsigned first_component = ffs(writemask) - 1;
4179 unsigned length = ffs(~(writemask >> first_component)) - 1;
4180
4181 /* We can't write more than 2 64-bit components at once. Limit the
4182 * length of the write to what we can do and let the next iteration
4183 * handle the rest
4184 */
4185 if (type_size > 4)
4186 length = MIN2(2, length);
4187
4188 fs_reg offset_reg;
4189 nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
4190 if (const_offset) {
4191 offset_reg = brw_imm_ud(const_offset->u32[0] +
4192 type_size * first_component);
4193 } else {
4194 offset_reg = vgrf(glsl_type::uint_type);
4195 bld.ADD(offset_reg,
4196 retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
4197 brw_imm_ud(type_size * first_component));
4198 }
4199
4200
4201 emit_untyped_write(bld, surf_index, offset_reg,
4202 offset(val_reg, bld, first_component * type_slots),
4203 1 /* dims */, length * type_slots,
4204 BRW_PREDICATE_NONE);
4205
4206 /* Clear the bits in the writemask that we just wrote, then try
4207 * again to see if more channels are left.
4208 */
4209 writemask &= (15 << (first_component + length));
4210 }
4211 break;
4212 }
4213
4214 case nir_intrinsic_store_output: {
4215 fs_reg src = get_nir_src(instr->src[0]);
4216
4217 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4218 assert(const_offset && "Indirect output stores not allowed");
4219 fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4220 4 * const_offset->u32[0]), src.type);
4221
4222 unsigned num_components = instr->num_components;
4223 unsigned first_component = nir_intrinsic_component(instr);
4224 unsigned bit_size = instr->src[0].is_ssa ?
4225 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
4226 if (bit_size == 64) {
4227 fs_reg tmp =
4228 fs_reg(VGRF, alloc.allocate(2 * num_components),
4229 BRW_REGISTER_TYPE_F);
4230 shuffle_64bit_data_for_32bit_write(
4231 bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
4232 src = retype(tmp, src.type);
4233 num_components *= 2;
4234 }
4235
4236 for (unsigned j = 0; j < num_components; j++) {
4237 bld.MOV(offset(new_dest, bld, j + first_component),
4238 offset(src, bld, j));
4239 }
4240 break;
4241 }
4242
4243 case nir_intrinsic_ssbo_atomic_add:
4244 nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
4245 break;
4246 case nir_intrinsic_ssbo_atomic_imin:
4247 nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4248 break;
4249 case nir_intrinsic_ssbo_atomic_umin:
4250 nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4251 break;
4252 case nir_intrinsic_ssbo_atomic_imax:
4253 nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4254 break;
4255 case nir_intrinsic_ssbo_atomic_umax:
4256 nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4257 break;
4258 case nir_intrinsic_ssbo_atomic_and:
4259 nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4260 break;
4261 case nir_intrinsic_ssbo_atomic_or:
4262 nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4263 break;
4264 case nir_intrinsic_ssbo_atomic_xor:
4265 nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4266 break;
4267 case nir_intrinsic_ssbo_atomic_exchange:
4268 nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4269 break;
4270 case nir_intrinsic_ssbo_atomic_comp_swap:
4271 nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4272 break;
4273
4274 case nir_intrinsic_get_buffer_size: {
4275 nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
4276 unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
4277
4278 /* A resinfo's sampler message is used to get the buffer size. The
4279 * SIMD8's writeback message consists of four registers and SIMD16's
4280 * writeback message consists of 8 destination registers (two per each
4281 * component). Because we are only interested in the first channel of
4282 * the first returned component, where resinfo returns the buffer size
4283 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4284 * the dispatch width.
4285 */
4286 const fs_builder ubld = bld.exec_all().group(8, 0);
4287 fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4288 fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4289
4290 /* Set LOD = 0 */
4291 ubld.MOV(src_payload, brw_imm_d(0));
4292
4293 const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4294 fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
4295 src_payload, brw_imm_ud(index));
4296 inst->header_size = 0;
4297 inst->mlen = 1;
4298 inst->size_written = 4 * REG_SIZE;
4299
4300 bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
4301 brw_mark_surface_used(prog_data, index);
4302 break;
4303 }
4304
4305 case nir_intrinsic_load_channel_num: {
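/* A brief sketch of what this block computes: the packed vector
 * immediate 0x76543210 fills the first eight UW channels of tmp with
 * 0..7, the exec_all ADDs extend that to 8..15 and 16..31 for wider
 * dispatch widths, and the final MOV converts the result to UD, so
 * every channel ends up holding its own channel index.
 */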
4306 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
4307 dest = retype(dest, BRW_REGISTER_TYPE_UD);
4308 const fs_builder allbld8 = bld.group(8, 0).exec_all();
4309 allbld8.MOV(tmp, brw_imm_v(0x76543210));
4310 if (dispatch_width > 8)
4311 allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
4312 if (dispatch_width > 16) {
4313 const fs_builder allbld16 = bld.group(16, 0).exec_all();
4314 allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
4315 }
4316 bld.MOV(dest, tmp);
4317 break;
4318 }
4319
4320 default:
4321 unreachable("unknown intrinsic");
4322 }
4323 }
4324
4325 void
4326 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
4327 int op, nir_intrinsic_instr *instr)
4328 {
4329 if (stage == MESA_SHADER_FRAGMENT)
4330 brw_wm_prog_data(prog_data)->has_side_effects = true;
4331
4332 fs_reg dest;
4333 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4334 dest = get_nir_dest(instr->dest);
4335
4336 fs_reg surface;
4337 nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
4338 if (const_surface) {
4339 unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
4340 const_surface->u32[0];
4341 surface = brw_imm_ud(surf_index);
4342 brw_mark_surface_used(prog_data, surf_index);
4343 } else {
4344 surface = vgrf(glsl_type::uint_type);
4345 bld.ADD(surface, get_nir_src(instr->src[0]),
4346 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4347
4348 /* Assume this may touch any SSBO. This is the same thing we do for
4349 * other UBO/SSBO accesses with a non-constant surface index.
4350 */
4351 brw_mark_surface_used(prog_data,
4352 stage_prog_data->binding_table.ssbo_start +
4353 nir->info->num_ssbos - 1);
4354 }
4355
4356 fs_reg offset = get_nir_src(instr->src[1]);
4357 fs_reg data1 = get_nir_src(instr->src[2]);
4358 fs_reg data2;
4359 if (op == BRW_AOP_CMPWR)
4360 data2 = get_nir_src(instr->src[3]);
4361
4362 /* Emit the actual atomic operation */
4363
4364 fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4365 data1, data2,
4366 1 /* dims */, 1 /* rsize */,
4367 op,
4368 BRW_PREDICATE_NONE);
4369 dest.type = atomic_result.type;
4370 bld.MOV(dest, atomic_result);
4371 }
4372
4373 void
4374 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
4375 int op, nir_intrinsic_instr *instr)
4376 {
4377 fs_reg dest;
4378 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4379 dest = get_nir_dest(instr->dest);
4380
4381 fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
4382 fs_reg offset;
4383 fs_reg data1 = get_nir_src(instr->src[1]);
4384 fs_reg data2;
4385 if (op == BRW_AOP_CMPWR)
4386 data2 = get_nir_src(instr->src[2]);
4387
4388 /* Get the offset */
4389 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
4390 if (const_offset) {
4391 offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
4392 } else {
4393 offset = vgrf(glsl_type::uint_type);
4394 bld.ADD(offset,
4395 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4396 brw_imm_ud(instr->const_index[0]));
4397 }
4398
4399 /* Emit the actual atomic operation */
4400
4401 fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4402 data1, data2,
4403 1 /* dims */, 1 /* rsize */,
4404 op,
4405 BRW_PREDICATE_NONE);
4406 dest.type = atomic_result.type;
4407 bld.MOV(dest, atomic_result);
4408 }
4409
4410 void
4411 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
4412 {
4413 unsigned texture = instr->texture_index;
4414 unsigned sampler = instr->sampler_index;
4415
4416 fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4417
4418 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
4419 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
4420
4421 int lod_components = 0;
4422
4423 /* The hardware requires a LOD for buffer textures */
4424 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
4425 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
4426
4427 for (unsigned i = 0; i < instr->num_srcs; i++) {
4428 fs_reg src = get_nir_src(instr->src[i].src);
4429 switch (instr->src[i].src_type) {
4430 case nir_tex_src_bias:
4431 srcs[TEX_LOGICAL_SRC_LOD] =
4432 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4433 break;
4434 case nir_tex_src_comparitor:
4435 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
4436 break;
4437 case nir_tex_src_coord:
4438 switch (instr->op) {
4439 case nir_texop_txf:
4440 case nir_texop_txf_ms:
4441 case nir_texop_txf_ms_mcs:
4442 case nir_texop_samples_identical:
4443 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
4444 break;
4445 default:
4446 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
4447 break;
4448 }
4449 break;
4450 case nir_tex_src_ddx:
4451 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
4452 lod_components = nir_tex_instr_src_size(instr, i);
4453 break;
4454 case nir_tex_src_ddy:
4455 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
4456 break;
4457 case nir_tex_src_lod:
4458 switch (instr->op) {
4459 case nir_texop_txs:
4460 srcs[TEX_LOGICAL_SRC_LOD] =
4461 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
4462 break;
4463 case nir_texop_txf:
4464 srcs[TEX_LOGICAL_SRC_LOD] =
4465 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
4466 break;
4467 default:
4468 srcs[TEX_LOGICAL_SRC_LOD] =
4469 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4470 break;
4471 }
4472 break;
4473 case nir_tex_src_ms_index:
4474 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
4475 break;
4476
4477 case nir_tex_src_offset: {
4478 nir_const_value *const_offset =
4479 nir_src_as_const_value(instr->src[i].src);
4480 if (const_offset) {
4481 unsigned header_bits = brw_texture_offset(const_offset->i32, 3);
4482 if (header_bits != 0)
4483 srcs[TEX_LOGICAL_SRC_OFFSET_VALUE] = brw_imm_ud(header_bits);
4484 } else {
4485 srcs[TEX_LOGICAL_SRC_OFFSET_VALUE] =
4486 retype(src, BRW_REGISTER_TYPE_D);
4487 }
4488 break;
4489 }
4490
4491 case nir_tex_src_projector:
4492 unreachable("should be lowered");
4493
4494 case nir_tex_src_texture_offset: {
4495 /* Figure out the highest possible texture index and mark it as used */
4496 uint32_t max_used = texture + instr->texture_array_size - 1;
4497 if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
4498 max_used += stage_prog_data->binding_table.gather_texture_start;
4499 } else {
4500 max_used += stage_prog_data->binding_table.texture_start;
4501 }
4502 brw_mark_surface_used(prog_data, max_used);
4503
4504 /* Emit code to evaluate the actual indexing expression */
4505 fs_reg tmp = vgrf(glsl_type::uint_type);
4506 bld.ADD(tmp, src, brw_imm_ud(texture));
4507 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
4508 break;
4509 }
4510
4511 case nir_tex_src_sampler_offset: {
4512 /* Emit code to evaluate the actual indexing expression */
4513 fs_reg tmp = vgrf(glsl_type::uint_type);
4514 bld.ADD(tmp, src, brw_imm_ud(sampler));
4515 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
4516 break;
4517 }
4518
4519 case nir_tex_src_ms_mcs:
4520 assert(instr->op == nir_texop_txf_ms);
4521 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
4522 break;
4523
4524 case nir_tex_src_plane: {
4525 nir_const_value *const_plane =
4526 nir_src_as_const_value(instr->src[i].src);
4527 const uint32_t plane = const_plane->u32[0];
4528 const uint32_t texture_index =
4529 instr->texture_index +
4530 stage_prog_data->binding_table.plane_start[plane] -
4531 stage_prog_data->binding_table.texture_start;
4532
4533 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
4534 break;
4535 }
4536
4537 default:
4538 unreachable("unknown texture source");
4539 }
4540 }
4541
4542 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
4543 (instr->op == nir_texop_txf_ms ||
4544 instr->op == nir_texop_samples_identical)) {
4545 if (devinfo->gen >= 7 &&
4546 key_tex->compressed_multisample_layout_mask & (1 << texture)) {
4547 srcs[TEX_LOGICAL_SRC_MCS] =
4548 emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
4549 instr->coord_components,
4550 srcs[TEX_LOGICAL_SRC_SURFACE]);
4551 } else {
4552 srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
4553 }
4554 }
4555
4556 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
4557 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
4558
4559 if (instr->op == nir_texop_query_levels ||
4560 (instr->op == nir_texop_tex && stage != MESA_SHADER_FRAGMENT)) {
4561 /* textureQueryLevels() and texture() are implemented in terms of TXS
4562 * and TXL respectively, so we need to pass a valid LOD argument.
4563 */
4564 assert(srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE);
4565 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
4566 }
4567
4568 enum opcode opcode;
4569 switch (instr->op) {
4570 case nir_texop_tex:
4571 opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
4572 SHADER_OPCODE_TXL_LOGICAL);
4573 break;
4574 case nir_texop_txb:
4575 opcode = FS_OPCODE_TXB_LOGICAL;
4576 break;
4577 case nir_texop_txl:
4578 opcode = SHADER_OPCODE_TXL_LOGICAL;
4579 break;
4580 case nir_texop_txd:
4581 opcode = SHADER_OPCODE_TXD_LOGICAL;
4582 break;
4583 case nir_texop_txf:
4584 opcode = SHADER_OPCODE_TXF_LOGICAL;
4585 break;
4586 case nir_texop_txf_ms:
4587 if ((key_tex->msaa_16 & (1 << sampler)))
4588 opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
4589 else
4590 opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
4591 break;
4592 case nir_texop_txf_ms_mcs:
4593 opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
4594 break;
4595 case nir_texop_query_levels:
4596 case nir_texop_txs:
4597 opcode = SHADER_OPCODE_TXS_LOGICAL;
4598 break;
4599 case nir_texop_lod:
4600 opcode = SHADER_OPCODE_LOD_LOGICAL;
4601 break;
4602 case nir_texop_tg4:
4603 if (srcs[TEX_LOGICAL_SRC_OFFSET_VALUE].file != BAD_FILE &&
4604 srcs[TEX_LOGICAL_SRC_OFFSET_VALUE].file != IMM)
4605 opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
4606 else
4607 opcode = SHADER_OPCODE_TG4_LOGICAL;
4608 break;
4609 case nir_texop_texture_samples:
4610 opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
4611 break;
4612 case nir_texop_samples_identical: {
4613 fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
4614
4615 /* If mcs is an immediate value, it means there is no MCS. In that case
4616 * just return false.
4617 */
4618 if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
4619 bld.MOV(dst, brw_imm_ud(0u));
4620 } else if ((key_tex->msaa_16 & (1 << sampler))) {
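/* With 16x MSAA the fetched MCS value spans two 32-bit halves, so OR
 * them together before comparing the combined value against zero.
 */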
4621 fs_reg tmp = vgrf(glsl_type::uint_type);
4622 bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
4623 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
4624 bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
4625 } else {
4626 bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
4627 BRW_CONDITIONAL_EQ);
4628 }
4629 return;
4630 }
4631 default:
4632 unreachable("unknown texture opcode");
4633 }
4634
4635 fs_reg dst = bld.vgrf(brw_type_for_nir_type(instr->dest_type), 4);
4636 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
4637
4638 const unsigned dest_size = nir_tex_instr_dest_size(instr);
4639 if (devinfo->gen >= 9 &&
4640 instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
4641 unsigned write_mask = instr->dest.is_ssa ?
4642 nir_ssa_def_components_read(&instr->dest.ssa):
4643 (1 << dest_size) - 1;
4644 assert(write_mask != 0); /* dead code should have been eliminated */
4645 inst->size_written = util_last_bit(write_mask) *
4646 inst->dst.component_size(inst->exec_size);
4647 } else {
4648 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
4649 }
4650
4651 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
4652 inst->shadow_compare = true;
4653
4654 if (srcs[TEX_LOGICAL_SRC_OFFSET_VALUE].file == IMM)
4655 inst->offset = srcs[TEX_LOGICAL_SRC_OFFSET_VALUE].ud;
4656
4657 if (instr->op == nir_texop_tg4) {
4658 if (instr->component == 1 &&
4659 key_tex->gather_channel_quirk_mask & (1 << texture)) {
4660 /* gather4 sampler is broken for green channel on RG32F --
4661 * we must ask for blue instead.
4662 */
4663 inst->offset |= 2 << 16;
4664 } else {
4665 inst->offset |= instr->component << 16;
4666 }
4667
4668 if (devinfo->gen == 6)
4669 emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
4670 }
4671
4672 fs_reg nir_dest[4];
4673 for (unsigned i = 0; i < dest_size; i++)
4674 nir_dest[i] = offset(dst, bld, i);
4675
4676 if (instr->op == nir_texop_query_levels) {
4677 /* # levels is in .w */
4678 nir_dest[0] = offset(dst, bld, 3);
4679 } else if (instr->op == nir_texop_txs &&
4680 dest_size >= 3 && devinfo->gen < 7) {
4681 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
4682 fs_reg depth = offset(dst, bld, 2);
4683 nir_dest[2] = vgrf(glsl_type::int_type);
4684 bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
4685 }
4686
4687 bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
4688 }
4689
4690 void
4691 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
4692 {
4693 switch (instr->type) {
4694 case nir_jump_break:
4695 bld.emit(BRW_OPCODE_BREAK);
4696 break;
4697 case nir_jump_continue:
4698 bld.emit(BRW_OPCODE_CONTINUE);
4699 break;
4700 case nir_jump_return:
4701 default:
4702 unreachable("unknown jump");
4703 }
4704 }
4705
4706 /**
4707 * This helper takes the result of a load operation that reads 32-bit elements
4708 * in this format:
4709 *
4710 * x x x x x x x x
4711 * y y y y y y y y
4712 * z z z z z z z z
4713 * w w w w w w w w
4714 *
4715 * and shuffles the data to get this:
4716 *
4717 * x y x y x y x y
4718 * x y x y x y x y
4719 * z w z w z w z w
4720 * z w z w z w z w
4721 *
4722 * Which is exactly what we want if the load is reading 64-bit components
4723 * like doubles, where x represents the low 32 bits of the x double component
4724 * and y represents the high 32 bits of the x double component (likewise with
4725 * z and w for double component y). The parameter @components represents
4726 * the number of 64-bit components present in @src. This would typically be
4727 * 2 at most, since we can only fit 2 double elements in the result of a
4728 * vec4 load.
4729 *
4730 * Notice that @dst and @src can be the same register.
4731 */
4732 void
4733 shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
4734 const fs_reg &dst,
4735 const fs_reg &src,
4736 uint32_t components)
4737 {
4738 assert(type_sz(src.type) == 4);
4739 assert(type_sz(dst.type) == 8);
4740
4741 /* A temporary that we will use to shuffle the 32-bit data of each
4742 * component in the vector into valid 64-bit data. We can't write directly
4743 * to dst because dst can be (and would usually be) the same as src
4744 * and in that case the first MOV in the loop below would overwrite the
4745 * data read in the second MOV.
4746 */
4747 fs_reg tmp = bld.vgrf(dst.type);
4748
4749 for (unsigned i = 0; i < components; i++) {
4750 const fs_reg component_i = offset(src, bld, 2 * i);
4751
4752 bld.MOV(subscript(tmp, src.type, 0), component_i);
4753 bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
4754
4755 bld.MOV(offset(dst, bld, i), tmp);
4756 }
4757 }
4758
4759 /**
4760 * This helper does the inverse operation of
4761 * shuffle_32bit_load_result_to_64bit_data().
4762 *
4763 * We need to do this when we are going to use untyped write messages that
4764 * operate with 32-bit components in order to arrange our 64-bit data to be
4765 * in the expected layout.
4766 *
4767 * Notice that callers of this function, unlike in the case of the inverse
4768 * operation, would typically need to call this with dst and src being
4769 * different registers, since they would otherwise corrupt the original
4770 * 64-bit data they are about to write. Because of this the function checks
4771 * that the src and dst regions involved in the operation do not overlap.
4772 */
4773 void
4774 shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
4775 const fs_reg &dst,
4776 const fs_reg &src,
4777 uint32_t components)
4778 {
4779 assert(type_sz(src.type) == 8);
4780 assert(type_sz(dst.type) == 4);
4781
4782 assert(!regions_overlap(
4783 dst, 2 * components * dst.component_size(bld.dispatch_width()),
4784 src, components * src.component_size(bld.dispatch_width())));
4785
4786 for (unsigned i = 0; i < components; i++) {
4787 const fs_reg component_i = offset(src, bld, i);
4788 bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
4789 bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
4790 }
4791 }
4792
4793 fs_reg
4794 setup_imm_df(const fs_builder &bld, double v)
4795 {
4796 const struct gen_device_info *devinfo = bld.shader->devinfo;
4797 assert(devinfo->gen >= 7);
4798
4799 if (devinfo->gen >= 8)
4800 return brw_imm_df(v);
4801
4802 /* gen7.5 does not support DF immediates directly, but the DIM
4803 * instruction allows setting a 64-bit immediate value.
4804 */
4805 if (devinfo->is_haswell) {
4806 const fs_builder ubld = bld.exec_all();
4807 fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
4808 ubld.DIM(dst, brw_imm_df(v));
4809 return component(dst, 0);
4810 }
4811
4812 /* gen7 does not support DF immediates, so we generate a 64-bit constant by
4813 * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
4814 * the high 32 bits to suboffset 4 and then applying a stride of 0.
4815 *
4816 * Alternatively, we could also produce a normal VGRF (without stride 0)
4817 * by writing to all the channels in the VGRF; however, that would hit the
4818 * gen7 bug where we have to split writes that span more than 1 register
4819 * into instructions with a width of 4 (otherwise the write to the second
4820 * register written runs into an execmask hardware bug) which isn't very
4821 * nice.
4822 */
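/* Worked example: for v = 1.0 the IEEE-754 bits are 0x3FF0000000000000,
 * so on a little-endian host di.i1 (the low dword) is 0x00000000 and
 * di.i2 (the high dword) is 0x3FF00000; the two MOVs below write those
 * halves into consecutive dwords and the stride-0 DF retype reads them
 * back as 1.0.
 */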
4823 union {
4824 double d;
4825 struct {
4826 uint32_t i1;
4827 uint32_t i2;
4828 };
4829 } di;
4830
4831 di.d = v;
4832
4833 const fs_builder ubld = bld.exec_all().group(1, 0);
4834 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4835 ubld.MOV(tmp, brw_imm_ud(di.i1));
4836 ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
4837
4838 return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
4839 }