i965: split EU defines to brw_eu_defines.h
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_nir.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/glsl/ir.h"
25 #include "brw_fs.h"
26 #include "brw_fs_surface_builder.h"
27 #include "brw_nir.h"
28
29 using namespace brw;
30 using namespace brw::surface_access;
31
32 void
33 fs_visitor::emit_nir_code()
34 {
35 /* emit the arrays used for inputs and outputs - load/store intrinsics will
36 * be converted to reads/writes of these arrays
37 */
38 nir_setup_outputs();
39 nir_setup_uniforms();
40 nir_emit_system_values();
41
42 /* get the main function and emit it */
43 nir_foreach_function(function, nir) {
44 assert(strcmp(function->name, "main") == 0);
45 assert(function->impl);
46 nir_emit_impl(function->impl);
47 }
48 }
49
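/* Allocate storage for the NIR-level outputs: one float-typed VGRF covering
 * all vec4 slots of each output variable, recorded in outputs[] at the
 * variable's driver_location so that store_output intrinsics can be lowered
 * to plain register writes.
 */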
50 void
51 fs_visitor::nir_setup_outputs()
52 {
53 if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
54 return;
55
56 nir_foreach_variable(var, &nir->outputs) {
57 const unsigned vec4s =
58 var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
59 : type_size_vec4(var->type);
60 fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
61 for (unsigned i = 0; i < vec4s; i++) {
62 if (outputs[var->data.driver_location + i].file == BAD_FILE)
63 outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
64 }
65 }
66 }
67
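/* Record how many 32-bit uniform slots the shader uses.  Uniform setup only
 * needs to happen once per shader, so it is skipped for every dispatch width
 * other than the minimum one; the wider SIMD variants reuse the same layout.
 */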
68 void
69 fs_visitor::nir_setup_uniforms()
70 {
71 if (dispatch_width != min_dispatch_width)
72 return;
73
74 uniforms = nir->num_uniforms / 4;
75 }
76
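/* Scan a NIR block for system-value intrinsics and lazily emit the setup code
 * for each value the first time it is seen, caching the resulting register in
 * v->nir_system_values[] so that later loads can simply reuse it.
 */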
77 static bool
78 emit_system_values_block(nir_block *block, fs_visitor *v)
79 {
80 fs_reg *reg;
81
82 nir_foreach_instr(instr, block) {
83 if (instr->type != nir_instr_type_intrinsic)
84 continue;
85
86 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
87 switch (intrin->intrinsic) {
88 case nir_intrinsic_load_vertex_id:
89 unreachable("should be lowered by lower_vertex_id().");
90
91 case nir_intrinsic_load_vertex_id_zero_base:
92 assert(v->stage == MESA_SHADER_VERTEX);
93 reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
94 if (reg->file == BAD_FILE)
95 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
96 break;
97
98 case nir_intrinsic_load_base_vertex:
99 assert(v->stage == MESA_SHADER_VERTEX);
100 reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
101 if (reg->file == BAD_FILE)
102 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
103 break;
104
105 case nir_intrinsic_load_instance_id:
106 assert(v->stage == MESA_SHADER_VERTEX);
107 reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
108 if (reg->file == BAD_FILE)
109 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
110 break;
111
112 case nir_intrinsic_load_base_instance:
113 assert(v->stage == MESA_SHADER_VERTEX);
114 reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
115 if (reg->file == BAD_FILE)
116 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
117 break;
118
119 case nir_intrinsic_load_draw_id:
120 assert(v->stage == MESA_SHADER_VERTEX);
121 reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
122 if (reg->file == BAD_FILE)
123 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
124 break;
125
126 case nir_intrinsic_load_invocation_id:
127 if (v->stage == MESA_SHADER_TESS_CTRL)
128 break;
129 assert(v->stage == MESA_SHADER_GEOMETRY);
130 reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
131 if (reg->file == BAD_FILE) {
132 const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
133 fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
134 fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
135 abld.SHR(iid, g1, brw_imm_ud(27u));
136 *reg = iid;
137 }
138 break;
139
140 case nir_intrinsic_load_sample_pos:
141 assert(v->stage == MESA_SHADER_FRAGMENT);
142 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
143 if (reg->file == BAD_FILE)
144 *reg = *v->emit_samplepos_setup();
145 break;
146
147 case nir_intrinsic_load_sample_id:
148 assert(v->stage == MESA_SHADER_FRAGMENT);
149 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
150 if (reg->file == BAD_FILE)
151 *reg = *v->emit_sampleid_setup();
152 break;
153
154 case nir_intrinsic_load_sample_mask_in:
155 assert(v->stage == MESA_SHADER_FRAGMENT);
156 assert(v->devinfo->gen >= 7);
157 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
158 if (reg->file == BAD_FILE)
159 *reg = *v->emit_samplemaskin_setup();
160 break;
161
162 case nir_intrinsic_load_work_group_id:
163 assert(v->stage == MESA_SHADER_COMPUTE);
164 reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
165 if (reg->file == BAD_FILE)
166 *reg = *v->emit_cs_work_group_id_setup();
167 break;
168
169 case nir_intrinsic_load_helper_invocation:
170 assert(v->stage == MESA_SHADER_FRAGMENT);
171 reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
172 if (reg->file == BAD_FILE) {
173 const fs_builder abld =
174 v->bld.annotate("gl_HelperInvocation", NULL);
175
176 /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
177 * pixel mask is in g1.7 of the thread payload.
178 *
179 * We move the per-channel pixel enable bit to the low bit of each
180 * channel by shifting the byte containing the pixel mask by the
181 * vector immediate 0x76543210UV.
182 *
183 * The region of <1,8,0> reads only 1 byte (the pixel masks for
184 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
185 * masks for 2 and 3) in SIMD16.
186 */
187 fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
188 abld.SHR(shifted,
189 stride(byte_offset(retype(brw_vec1_grf(1, 0),
190 BRW_REGISTER_TYPE_UB), 28),
191 1, 8, 0),
192 brw_imm_v(0x76543210));
193
194 /* A set bit in the pixel mask means the channel is enabled, but
195 * that is the opposite of gl_HelperInvocation so we need to invert
196 * the mask.
197 *
198 * The negate source-modifier bit of logical instructions on Gen8+
199 * performs 1's complement negation, so we can use that instead of
200 * a NOT instruction.
201 */
202 fs_reg inverted = negate(shifted);
203 if (v->devinfo->gen < 8) {
204 inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
205 abld.NOT(inverted, shifted);
206 }
207
208 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
209 * with 1 and negating.
210 */
211 fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
212 abld.AND(anded, inverted, brw_imm_uw(1));
213
214 fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
215 abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
216 *reg = dst;
217 }
218 break;
219
220 default:
221 break;
222 }
223 }
224
225 return true;
226 }
227
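/* Pre-pass that allocates nir_system_values[] and emits setup code for every
 * system value referenced by the shader, so the corresponding load intrinsics
 * can later be turned into simple register reads.
 */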
228 void
229 fs_visitor::nir_emit_system_values()
230 {
231 nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
232 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
233 nir_system_values[i] = fs_reg();
234 }
235
236 nir_foreach_function(function, nir) {
237 assert(strcmp(function->name, "main") == 0);
238 assert(function->impl);
239 nir_foreach_block(block, function->impl) {
240 emit_system_values_block(block, this);
241 }
242 }
243 }
244
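/* Emit code for a NIR function implementation: allocate a VGRF for every NIR
 * register (array registers get one contiguous allocation), reserve the SSA
 * value table, and then walk the control-flow list of the body.
 */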
245 void
246 fs_visitor::nir_emit_impl(nir_function_impl *impl)
247 {
248 nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
249 for (unsigned i = 0; i < impl->reg_alloc; i++) {
250 nir_locals[i] = fs_reg();
251 }
252
253 foreach_list_typed(nir_register, reg, node, &impl->registers) {
254 unsigned array_elems =
255 reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
256 unsigned size = array_elems * reg->num_components;
257 const brw_reg_type reg_type =
258 reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
259 nir_locals[reg->index] = bld.vgrf(reg_type, size);
260 }
261
262 nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
263 impl->ssa_alloc);
264
265 nir_emit_cf_list(&impl->body);
266 }
267
268 void
269 fs_visitor::nir_emit_cf_list(exec_list *list)
270 {
271 exec_list_validate(list);
272 foreach_list_typed(nir_cf_node, node, node, list) {
273 switch (node->type) {
274 case nir_cf_node_if:
275 nir_emit_if(nir_cf_node_as_if(node));
276 break;
277
278 case nir_cf_node_loop:
279 nir_emit_loop(nir_cf_node_as_loop(node));
280 break;
281
282 case nir_cf_node_block:
283 nir_emit_block(nir_cf_node_as_block(node));
284 break;
285
286 default:
287 unreachable("Invalid CFG node block");
288 }
289 }
290 }
291
292 void
293 fs_visitor::nir_emit_if(nir_if *if_stmt)
294 {
295 /* first, put the condition into f0 */
296 fs_inst *inst = bld.MOV(bld.null_reg_d(),
297 retype(get_nir_src(if_stmt->condition),
298 BRW_REGISTER_TYPE_D));
299 inst->conditional_mod = BRW_CONDITIONAL_NZ;
300
301 bld.IF(BRW_PREDICATE_NORMAL);
302
303 nir_emit_cf_list(&if_stmt->then_list);
304
305 /* note: if the else is empty, dead CF elimination will remove it */
306 bld.emit(BRW_OPCODE_ELSE);
307
308 nir_emit_cf_list(&if_stmt->else_list);
309
310 bld.emit(BRW_OPCODE_ENDIF);
311 }
312
313 void
314 fs_visitor::nir_emit_loop(nir_loop *loop)
315 {
316 bld.emit(BRW_OPCODE_DO);
317
318 nir_emit_cf_list(&loop->body);
319
320 bld.emit(BRW_OPCODE_WHILE);
321 }
322
323 void
324 fs_visitor::nir_emit_block(nir_block *block)
325 {
326 nir_foreach_instr(instr, block) {
327 nir_emit_instr(instr);
328 }
329 }
330
331 void
332 fs_visitor::nir_emit_instr(nir_instr *instr)
333 {
334 const fs_builder abld = bld.annotate(NULL, instr);
335
336 switch (instr->type) {
337 case nir_instr_type_alu:
338 nir_emit_alu(abld, nir_instr_as_alu(instr));
339 break;
340
341 case nir_instr_type_intrinsic:
342 switch (stage) {
343 case MESA_SHADER_VERTEX:
344 nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
345 break;
346 case MESA_SHADER_TESS_CTRL:
347 nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
348 break;
349 case MESA_SHADER_TESS_EVAL:
350 nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
351 break;
352 case MESA_SHADER_GEOMETRY:
353 nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
354 break;
355 case MESA_SHADER_FRAGMENT:
356 nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
357 break;
358 case MESA_SHADER_COMPUTE:
359 nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
360 break;
361 default:
362 unreachable("unsupported shader stage");
363 }
364 break;
365
366 case nir_instr_type_tex:
367 nir_emit_texture(abld, nir_instr_as_tex(instr));
368 break;
369
370 case nir_instr_type_load_const:
371 nir_emit_load_const(abld, nir_instr_as_load_const(instr));
372 break;
373
374 case nir_instr_type_ssa_undef:
375 /* We create a new VGRF for undefs on every use (by handling
376 * them in get_nir_src()), rather than for each definition.
377 * This helps register coalescing eliminate MOVs from undef.
378 */
379 break;
380
381 case nir_instr_type_jump:
382 nir_emit_jump(abld, nir_instr_as_jump(instr));
383 break;
384
385 default:
386 unreachable("unknown instruction type");
387 }
388 }
389
390 /**
391 * Recognizes that the source of a conversion is a nir_op_extract_* and folds
392 * the byte/word extraction into the conversion itself as a single typed MOV.
393 */
394 bool
395 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
396 const fs_reg &result)
397 {
398 if (!instr->src[0].src.is_ssa ||
399 !instr->src[0].src.ssa->parent_instr)
400 return false;
401
402 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
403 return false;
404
405 nir_alu_instr *src0 =
406 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
407
408 if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
409 src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
410 return false;
411
412 nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
413 assert(element != NULL);
414
415 /* Element type to extract. */
416 const brw_reg_type type = brw_int_type(
417 src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
418 src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
419
420 fs_reg op0 = get_nir_src(src0->src[0].src);
421 op0.type = brw_type_for_nir_type(devinfo,
422 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
423 nir_src_bit_size(src0->src[0].src)));
424 op0 = offset(op0, bld, src0->src[0].swizzle[0]);
425
426 set_saturate(instr->dest.saturate,
427 bld.MOV(result, subscript(op0, type, element->u32[0])));
428 return true;
429 }
430
431 bool
432 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
433 const fs_reg &result)
434 {
435 if (!instr->src[0].src.is_ssa ||
436 instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
437 return false;
438
439 nir_intrinsic_instr *src0 =
440 nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
441
442 if (src0->intrinsic != nir_intrinsic_load_front_face)
443 return false;
444
445 nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
446 if (!value1 || fabsf(value1->f32[0]) != 1.0f)
447 return false;
448
449 nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
450 if (!value2 || fabsf(value2->f32[0]) != 1.0f)
451 return false;
452
453 fs_reg tmp = vgrf(glsl_type::int_type);
454
455 if (devinfo->gen >= 6) {
456 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
457 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
458
459 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
460 *
461 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
462 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
463 *
464 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
465 *
466 * This negation looks like it's safe in practice, because bits 0:4 will
467 * surely be TRIANGLES
468 */
469
470 if (value1->f32[0] == -1.0f) {
471 g0.negate = true;
472 }
473
474 bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
475 g0, brw_imm_uw(0x3f80));
476 } else {
477 /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
478 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
479
480 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
481 *
482 * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
483 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
484 *
485 * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
486 *
487 * This negation looks like it's safe in practice, because bits 0:4 will
488 * surely be TRIANGLES
489 */
490
491 if (value1->f32[0] == -1.0f) {
492 g1_6.negate = true;
493 }
494
495 bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
496 }
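/* 0x3f800000 is the bit pattern of 1.0f and 0xbf800000 that of -1.0f.  The OR
 * above merges the exponent bits of 1.0f with the (possibly negated) facing
 * bit in the sign position, and the AND below masks away everything else,
 * leaving exactly +1.0f or -1.0f in the result.
 */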
497 bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
498
499 return true;
500 }
501
502 static void
503 emit_find_msb_using_lzd(const fs_builder &bld,
504 const fs_reg &result,
505 const fs_reg &src,
506 bool is_signed)
507 {
508 fs_inst *inst;
509 fs_reg temp = src;
510
511 if (is_signed) {
512 /* LZD of an absolute value source almost always does the right
513 * thing. There are two problem values:
514 *
515 * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns
516 * 0. However, findMSB(int(0x80000000)) == 30.
517 *
518 * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns
519 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
520 *
521 * For a value of zero or negative one, -1 will be returned.
522 *
523 * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but
524 * findMSB(-(1<<x)) should return x-1.
525 *
526 * For all negative number cases, including 0x80000000 and
527 * 0xffffffff, the correct value is obtained from LZD if the logical-not
528 * of the (already negative) value is used instead of negation. A
529 * conditional logical-not can be achieved in two instructions.
530 */
531 temp = bld.vgrf(BRW_REGISTER_TYPE_D);
532
533 bld.ASR(temp, src, brw_imm_d(31));
534 bld.XOR(temp, temp, src);
535 }
536
537 bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
538 retype(temp, BRW_REGISTER_TYPE_UD));
539
540 /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
541 * from the LSB side. Subtract the result from 31 to convert the MSB
542 * count into an LSB count. If no bits are set, LZD will return 32.
543 * 31-32 = -1, which is exactly what findMSB() is supposed to return.
544 */
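/* For illustration (arbitrary values): src == 0x00000010 gives LZD == 27 and
 * 31 - 27 == 4, the index of its only set bit; src == 0 gives LZD == 32 and
 * 31 - 32 == -1, matching the GLSL findMSB() convention.
 */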
545 inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
546 inst->src[0].negate = true;
547 }
548
549 void
550 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
551 {
552 struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
553 fs_inst *inst;
554
555 fs_reg result = get_nir_dest(instr->dest.dest);
556 result.type = brw_type_for_nir_type(devinfo,
557 (nir_alu_type)(nir_op_infos[instr->op].output_type |
558 nir_dest_bit_size(instr->dest.dest)));
559
560 fs_reg op[4];
561 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
562 op[i] = get_nir_src(instr->src[i].src);
563 op[i].type = brw_type_for_nir_type(devinfo,
564 (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
565 nir_src_bit_size(instr->src[i].src)));
566 op[i].abs = instr->src[i].abs;
567 op[i].negate = instr->src[i].negate;
568 }
569
570 /* We get a bunch of MOVs out of the from_ssa pass and they may still
571 * be vectorized. We'll handle them as a special case. We'll also
572 * handle vecN here because it's basically the same thing.
573 */
574 switch (instr->op) {
575 case nir_op_imov:
576 case nir_op_fmov:
577 case nir_op_vec2:
578 case nir_op_vec3:
579 case nir_op_vec4: {
580 fs_reg temp = result;
581 bool need_extra_copy = false;
582 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
583 if (!instr->src[i].src.is_ssa &&
584 instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
585 need_extra_copy = true;
586 temp = bld.vgrf(result.type, 4);
587 break;
588 }
589 }
590
591 for (unsigned i = 0; i < 4; i++) {
592 if (!(instr->dest.write_mask & (1 << i)))
593 continue;
594
595 if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
596 inst = bld.MOV(offset(temp, bld, i),
597 offset(op[0], bld, instr->src[0].swizzle[i]));
598 } else {
599 inst = bld.MOV(offset(temp, bld, i),
600 offset(op[i], bld, instr->src[i].swizzle[0]));
601 }
602 inst->saturate = instr->dest.saturate;
603 }
604
605 /* In this case the source and destination registers were the same,
606 * so we need to insert an extra set of moves in order to deal with
607 * any swizzling.
608 */
609 if (need_extra_copy) {
610 for (unsigned i = 0; i < 4; i++) {
611 if (!(instr->dest.write_mask & (1 << i)))
612 continue;
613
614 bld.MOV(offset(result, bld, i), offset(temp, bld, i));
615 }
616 }
617 return;
618 }
619 default:
620 break;
621 }
622
623 /* At this point, we have dealt with any instruction that operates on
624 * more than a single channel. Therefore, we can just adjust the source
625 * and destination registers for that channel and emit the instruction.
626 */
627 unsigned channel = 0;
628 if (nir_op_infos[instr->op].output_size == 0) {
629 /* Since NIR is doing the scalarizing for us, we should only ever see
630 * vectorized operations with a single channel.
631 */
632 assert(_mesa_bitcount(instr->dest.write_mask) == 1);
633 channel = ffs(instr->dest.write_mask) - 1;
634
635 result = offset(result, bld, channel);
636 }
637
638 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
639 assert(nir_op_infos[instr->op].input_sizes[i] < 2);
640 op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
641 }
642
643 switch (instr->op) {
644 case nir_op_i2f:
645 case nir_op_u2f:
646 case nir_op_i642d:
647 case nir_op_u642d:
648 if (optimize_extract_to_float(instr, result))
649 return;
650 inst = bld.MOV(result, op[0]);
651 inst->saturate = instr->dest.saturate;
652 break;
653
654 case nir_op_f2d:
655 case nir_op_i2d:
656 case nir_op_u2d:
657 /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
658 *
659 * "When source or destination is 64b (...), regioning in Align1
660 * must follow these rules:
661 *
662 * 1. Source and destination horizontal stride must be aligned to
663 * the same qword.
664 * (...)"
665 *
666 * This means that 32-bit to 64-bit conversions need to have the 32-bit
667 * data elements aligned to 64-bit. This restriction does not apply to
668 * BDW and later.
669 */
670 if (nir_dest_bit_size(instr->dest.dest) == 64 &&
671 nir_src_bit_size(instr->src[0].src) == 32 &&
672 (devinfo->is_cherryview || devinfo->is_broxton)) {
673 fs_reg tmp = bld.vgrf(result.type, 1);
674 tmp = subscript(tmp, op[0].type, 0);
675 inst = bld.MOV(tmp, op[0]);
676 inst = bld.MOV(result, tmp);
677 inst->saturate = instr->dest.saturate;
678 break;
679 }
680 /* fallthrough */
681 case nir_op_f2i64:
682 case nir_op_f2u64:
683 case nir_op_i2i64:
684 case nir_op_i2u64:
685 case nir_op_u2i64:
686 case nir_op_u2u64:
687 case nir_op_b2i64:
688 case nir_op_d2f:
689 case nir_op_d2i:
690 case nir_op_d2u:
691 case nir_op_i642f:
692 case nir_op_u642f:
693 case nir_op_u2i32:
694 case nir_op_i2i32:
695 case nir_op_u2u32:
696 case nir_op_i2u32:
697 if (instr->op == nir_op_b2i64) {
698 bld.MOV(result, negate(op[0]));
699 } else {
700 inst = bld.MOV(result, op[0]);
701 inst->saturate = instr->dest.saturate;
702 }
703 break;
704
705 case nir_op_f2i:
706 case nir_op_f2u:
707 bld.MOV(result, op[0]);
708 break;
709
710 case nir_op_fsign: {
711 if (op[0].abs) {
712 /* Straightforward since the source can be assumed to be
713 * non-negative.
714 */
715 set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
716 set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(result, brw_imm_f(1.0f)));
717
718 } else if (type_sz(op[0].type) < 8) {
719 /* AND(val, 0x80000000) gives the sign bit.
720 *
721 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
722 * zero.
723 */
724 bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
725
726 fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
727 op[0].type = BRW_REGISTER_TYPE_UD;
728 result.type = BRW_REGISTER_TYPE_UD;
729 bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
730
731 inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
732 inst->predicate = BRW_PREDICATE_NORMAL;
733 if (instr->dest.saturate) {
734 inst = bld.MOV(result, result);
735 inst->saturate = true;
736 }
737 } else {
738 /* For doubles we do the same but we need to consider:
739 *
740 * - 2-src instructions can't operate with 64-bit immediates
741 * - The sign is encoded in the high 32-bit of each DF
742 * - CMP with DF requires special handling in SIMD16
743 * - We need to produce a DF result.
744 */
745
746 /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
747 * a register and compare with that.
748 */
749 fs_reg tmp = vgrf(glsl_type::double_type);
750 bld.MOV(tmp, setup_imm_df(bld, 0.0));
751
752 /* A direct DF CMP using the flag register (null dst) won't work in
753 * SIMD16 because the CMP will be split in two by lower_simd_width,
754 * resulting in two CMP instructions with the same dst (NULL),
755 * leading to dead code elimination of the first one. In SIMD8,
756 * however, there is no need to split the CMP and we can save some
757 * work.
758 */
759 fs_reg dst_tmp = vgrf(glsl_type::double_type);
760 bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
761
762 /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
763 * so we store the result of the comparison in a vgrf instead and
764 * then generate a UD comparison from it that won't have to
765 * be split by lower_simd_width. This is what NIR does to handle
766 * double comparisons in the general case.
767 */
768 if (bld.dispatch_width() == 16) {
769 fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
770 bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
771 bld.CMP(bld.null_reg_ud(),
772 dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
773 }
774
775 /* Get the high 32-bit of each double component where the sign is */
776 fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
777 bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
778
779 /* Get the sign bit */
780 bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
781
782 /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
783 inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
784 inst->predicate = BRW_PREDICATE_NORMAL;
785
786 /* Convert from 32-bit float to 64-bit double */
787 result.type = BRW_REGISTER_TYPE_DF;
788 inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
789
790 if (instr->dest.saturate) {
791 inst = bld.MOV(result, result);
792 inst->saturate = true;
793 }
794 }
795 break;
796 }
797
798 case nir_op_isign:
799 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
800 * -> non-negative val generates 0x00000000.
801 * Predicated OR sets 1 if val is positive.
802 */
803 assert(nir_dest_bit_size(instr->dest.dest) < 64);
804 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
805 bld.ASR(result, op[0], brw_imm_d(31));
806 inst = bld.OR(result, result, brw_imm_d(1));
807 inst->predicate = BRW_PREDICATE_NORMAL;
808 break;
809
810 case nir_op_frcp:
811 inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
812 inst->saturate = instr->dest.saturate;
813 break;
814
815 case nir_op_fexp2:
816 inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
817 inst->saturate = instr->dest.saturate;
818 break;
819
820 case nir_op_flog2:
821 inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
822 inst->saturate = instr->dest.saturate;
823 break;
824
825 case nir_op_fsin:
826 inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
827 inst->saturate = instr->dest.saturate;
828 break;
829
830 case nir_op_fcos:
831 inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
832 inst->saturate = instr->dest.saturate;
833 break;
834
835 case nir_op_fddx:
836 if (fs_key->high_quality_derivatives) {
837 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
838 } else {
839 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
840 }
841 inst->saturate = instr->dest.saturate;
842 break;
843 case nir_op_fddx_fine:
844 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
845 inst->saturate = instr->dest.saturate;
846 break;
847 case nir_op_fddx_coarse:
848 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
849 inst->saturate = instr->dest.saturate;
850 break;
851 case nir_op_fddy:
852 if (fs_key->high_quality_derivatives) {
853 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
854 } else {
855 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
856 }
857 inst->saturate = instr->dest.saturate;
858 break;
859 case nir_op_fddy_fine:
860 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
861 inst->saturate = instr->dest.saturate;
862 break;
863 case nir_op_fddy_coarse:
864 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
865 inst->saturate = instr->dest.saturate;
866 break;
867
868 case nir_op_iadd:
869 case nir_op_fadd:
870 inst = bld.ADD(result, op[0], op[1]);
871 inst->saturate = instr->dest.saturate;
872 break;
873
874 case nir_op_fmul:
875 inst = bld.MUL(result, op[0], op[1]);
876 inst->saturate = instr->dest.saturate;
877 break;
878
879 case nir_op_imul:
880 assert(nir_dest_bit_size(instr->dest.dest) < 64);
881 bld.MUL(result, op[0], op[1]);
882 break;
883
884 case nir_op_imul_high:
885 case nir_op_umul_high:
886 assert(nir_dest_bit_size(instr->dest.dest) < 64);
887 bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
888 break;
889
890 case nir_op_idiv:
891 case nir_op_udiv:
892 assert(nir_dest_bit_size(instr->dest.dest) < 64);
893 bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
894 break;
895
896 case nir_op_uadd_carry:
897 unreachable("Should have been lowered by carry_to_arith().");
898
899 case nir_op_usub_borrow:
900 unreachable("Should have been lowered by borrow_to_arith().");
901
902 case nir_op_umod:
903 case nir_op_irem:
904 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
905 * appears that our hardware just does the right thing for signed
906 * remainder.
907 */
908 assert(nir_dest_bit_size(instr->dest.dest) < 64);
909 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
910 break;
911
912 case nir_op_imod: {
913 /* Get a regular C-style remainder. If a % b != 0, set the predicate. */
914 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
915
916 /* Math instructions don't support conditional mod */
917 inst = bld.MOV(bld.null_reg_d(), result);
918 inst->conditional_mod = BRW_CONDITIONAL_NZ;
919
920 /* Now, we need to determine if signs of the sources are different.
921 * When we XOR the sources, the top bit is 0 if they are the same and 1
922 * if they are different. We can then use a conditional modifier to
923 * turn that into a predicate. This leads us to an XOR.l instruction.
924 *
925 * Technically, according to the PRM, you're not allowed to use .l on an
926 * XOR instruction. However, empirical experiments and Curro's reading
927 * of the simulator source both indicate that it's safe.
928 */
929 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
930 inst = bld.XOR(tmp, op[0], op[1]);
931 inst->predicate = BRW_PREDICATE_NORMAL;
932 inst->conditional_mod = BRW_CONDITIONAL_L;
933
934 /* If the result of the initial remainder operation is non-zero and the
935 * two sources have different signs, add in a copy of op[1] to get the
936 * final integer modulus value.
937 */
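/* Arbitrary worked example: imod(7, -3).  The remainder op above yields 1,
 * the signs differ, so the predicated ADD produces 1 + (-3) == -2, matching
 * nir_op_imod's convention that the result takes the sign of op[1].
 */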
938 inst = bld.ADD(result, result, op[1]);
939 inst->predicate = BRW_PREDICATE_NORMAL;
940 break;
941 }
942
943 case nir_op_flt:
944 case nir_op_fge:
945 case nir_op_feq:
946 case nir_op_fne: {
947 fs_reg dest = result;
948 if (nir_src_bit_size(instr->src[0].src) > 32) {
949 dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
950 }
951 brw_conditional_mod cond;
952 switch (instr->op) {
953 case nir_op_flt:
954 cond = BRW_CONDITIONAL_L;
955 break;
956 case nir_op_fge:
957 cond = BRW_CONDITIONAL_GE;
958 break;
959 case nir_op_feq:
960 cond = BRW_CONDITIONAL_Z;
961 break;
962 case nir_op_fne:
963 cond = BRW_CONDITIONAL_NZ;
964 break;
965 default:
966 unreachable("bad opcode");
967 }
968 bld.CMP(dest, op[0], op[1], cond);
969 if (nir_src_bit_size(instr->src[0].src) > 32) {
970 bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
971 }
972 break;
973 }
974
975 case nir_op_ilt:
976 case nir_op_ult:
977 case nir_op_ige:
978 case nir_op_uge:
979 case nir_op_ieq:
980 case nir_op_ine: {
981 fs_reg dest = result;
982 if (nir_src_bit_size(instr->src[0].src) > 32) {
983 dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
984 }
985
986 brw_conditional_mod cond;
987 switch (instr->op) {
988 case nir_op_ilt:
989 case nir_op_ult:
990 cond = BRW_CONDITIONAL_L;
991 break;
992 case nir_op_ige:
993 case nir_op_uge:
994 cond = BRW_CONDITIONAL_GE;
995 break;
996 case nir_op_ieq:
997 cond = BRW_CONDITIONAL_Z;
998 break;
999 case nir_op_ine:
1000 cond = BRW_CONDITIONAL_NZ;
1001 break;
1002 default:
1003 unreachable("bad opcode");
1004 }
1005 bld.CMP(dest, op[0], op[1], cond);
1006 if (nir_src_bit_size(instr->src[0].src) > 32) {
1007 bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1008 }
1009 break;
1010 }
1011
1012 case nir_op_inot:
1013 if (devinfo->gen >= 8) {
1014 op[0] = resolve_source_modifiers(op[0]);
1015 }
1016 bld.NOT(result, op[0]);
1017 break;
1018 case nir_op_ixor:
1019 if (devinfo->gen >= 8) {
1020 op[0] = resolve_source_modifiers(op[0]);
1021 op[1] = resolve_source_modifiers(op[1]);
1022 }
1023 bld.XOR(result, op[0], op[1]);
1024 break;
1025 case nir_op_ior:
1026 if (devinfo->gen >= 8) {
1027 op[0] = resolve_source_modifiers(op[0]);
1028 op[1] = resolve_source_modifiers(op[1]);
1029 }
1030 bld.OR(result, op[0], op[1]);
1031 break;
1032 case nir_op_iand:
1033 if (devinfo->gen >= 8) {
1034 op[0] = resolve_source_modifiers(op[0]);
1035 op[1] = resolve_source_modifiers(op[1]);
1036 }
1037 bld.AND(result, op[0], op[1]);
1038 break;
1039
1040 case nir_op_fdot2:
1041 case nir_op_fdot3:
1042 case nir_op_fdot4:
1043 case nir_op_ball_fequal2:
1044 case nir_op_ball_iequal2:
1045 case nir_op_ball_fequal3:
1046 case nir_op_ball_iequal3:
1047 case nir_op_ball_fequal4:
1048 case nir_op_ball_iequal4:
1049 case nir_op_bany_fnequal2:
1050 case nir_op_bany_inequal2:
1051 case nir_op_bany_fnequal3:
1052 case nir_op_bany_inequal3:
1053 case nir_op_bany_fnequal4:
1054 case nir_op_bany_inequal4:
1055 unreachable("Lowered by nir_lower_alu_reductions");
1056
1057 case nir_op_fnoise1_1:
1058 case nir_op_fnoise1_2:
1059 case nir_op_fnoise1_3:
1060 case nir_op_fnoise1_4:
1061 case nir_op_fnoise2_1:
1062 case nir_op_fnoise2_2:
1063 case nir_op_fnoise2_3:
1064 case nir_op_fnoise2_4:
1065 case nir_op_fnoise3_1:
1066 case nir_op_fnoise3_2:
1067 case nir_op_fnoise3_3:
1068 case nir_op_fnoise3_4:
1069 case nir_op_fnoise4_1:
1070 case nir_op_fnoise4_2:
1071 case nir_op_fnoise4_3:
1072 case nir_op_fnoise4_4:
1073 unreachable("not reached: should be handled by lower_noise");
1074
1075 case nir_op_ldexp:
1076 unreachable("not reached: should be handled by ldexp_to_arith()");
1077
1078 case nir_op_fsqrt:
1079 inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1080 inst->saturate = instr->dest.saturate;
1081 break;
1082
1083 case nir_op_frsq:
1084 inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1085 inst->saturate = instr->dest.saturate;
1086 break;
1087
1088 case nir_op_b2i:
1089 case nir_op_b2f:
1090 bld.MOV(result, negate(op[0]));
1091 break;
1092
1093 case nir_op_f2b:
1094 bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
1095 break;
1096
1097 case nir_op_i642b:
1098 case nir_op_d2b: {
1099 /* two-argument instructions can't take 64-bit immediates */
1100 fs_reg zero;
1101 fs_reg tmp;
1102
1103 if (instr->op == nir_op_d2b) {
1104 zero = vgrf(glsl_type::double_type);
1105 tmp = vgrf(glsl_type::double_type);
1106 } else {
1107 zero = vgrf(glsl_type::int64_t_type);
1108 tmp = vgrf(glsl_type::int64_t_type);
1109 }
1110
1111 bld.MOV(zero, setup_imm_df(bld, 0.0));
1112 /* A SIMD16 execution needs to be split into two instructions, so use
1113 * a vgrf instead of the flag register as the dst so that instruction
1114 * splitting works.
1115 */
1116 bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1117 bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1118 break;
1119 }
1120 case nir_op_i2b:
1121 bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1122 break;
1123
1124 case nir_op_ftrunc:
1125 inst = bld.RNDZ(result, op[0]);
1126 inst->saturate = instr->dest.saturate;
1127 break;
1128
1129 case nir_op_fceil: {
1130 op[0].negate = !op[0].negate;
1131 fs_reg temp = vgrf(glsl_type::float_type);
1132 bld.RNDD(temp, op[0]);
1133 temp.negate = true;
1134 inst = bld.MOV(result, temp);
1135 inst->saturate = instr->dest.saturate;
1136 break;
1137 }
1138 case nir_op_ffloor:
1139 inst = bld.RNDD(result, op[0]);
1140 inst->saturate = instr->dest.saturate;
1141 break;
1142 case nir_op_ffract:
1143 inst = bld.FRC(result, op[0]);
1144 inst->saturate = instr->dest.saturate;
1145 break;
1146 case nir_op_fround_even:
1147 inst = bld.RNDE(result, op[0]);
1148 inst->saturate = instr->dest.saturate;
1149 break;
1150
1151 case nir_op_fquantize2f16: {
1152 fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1153 fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1154 fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1155
1156 /* The destination stride must be at least as big as the source stride. */
1157 tmp16.type = BRW_REGISTER_TYPE_W;
1158 tmp16.stride = 2;
1159
1160 /* Check for denormal */
1161 fs_reg abs_src0 = op[0];
1162 abs_src0.abs = true;
1163 bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1164 BRW_CONDITIONAL_L);
1165 /* Get the appropriately signed zero */
1166 bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1167 retype(op[0], BRW_REGISTER_TYPE_UD),
1168 brw_imm_ud(0x80000000));
1169 /* Do the actual F32 -> F16 -> F32 conversion */
1170 bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1171 bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1172 /* Select that or zero based on normal status */
1173 inst = bld.SEL(result, zero, tmp32);
1174 inst->predicate = BRW_PREDICATE_NORMAL;
1175 inst->saturate = instr->dest.saturate;
1176 break;
1177 }
1178
1179 case nir_op_imin:
1180 case nir_op_umin:
1181 case nir_op_fmin:
1182 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1183 inst->saturate = instr->dest.saturate;
1184 break;
1185
1186 case nir_op_imax:
1187 case nir_op_umax:
1188 case nir_op_fmax:
1189 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1190 inst->saturate = instr->dest.saturate;
1191 break;
1192
1193 case nir_op_pack_snorm_2x16:
1194 case nir_op_pack_snorm_4x8:
1195 case nir_op_pack_unorm_2x16:
1196 case nir_op_pack_unorm_4x8:
1197 case nir_op_unpack_snorm_2x16:
1198 case nir_op_unpack_snorm_4x8:
1199 case nir_op_unpack_unorm_2x16:
1200 case nir_op_unpack_unorm_4x8:
1201 case nir_op_unpack_half_2x16:
1202 case nir_op_pack_half_2x16:
1203 unreachable("not reached: should be handled by lower_packing_builtins");
1204
1205 case nir_op_unpack_half_2x16_split_x:
1206 inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
1207 inst->saturate = instr->dest.saturate;
1208 break;
1209 case nir_op_unpack_half_2x16_split_y:
1210 inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
1211 inst->saturate = instr->dest.saturate;
1212 break;
1213
1214 case nir_op_pack_64_2x32_split:
1215 bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1216 break;
1217
1218 case nir_op_unpack_64_2x32_split_x:
1219 case nir_op_unpack_64_2x32_split_y: {
1220 if (instr->op == nir_op_unpack_64_2x32_split_x)
1221 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1222 else
1223 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1224 break;
1225 }
1226
1227 case nir_op_fpow:
1228 inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1229 inst->saturate = instr->dest.saturate;
1230 break;
1231
1232 case nir_op_bitfield_reverse:
1233 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1234 bld.BFREV(result, op[0]);
1235 break;
1236
1237 case nir_op_bit_count:
1238 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1239 bld.CBIT(result, op[0]);
1240 break;
1241
1242 case nir_op_ufind_msb: {
1243 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1244 emit_find_msb_using_lzd(bld, result, op[0], false);
1245 break;
1246 }
1247
1248 case nir_op_ifind_msb: {
1249 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1250
1251 if (devinfo->gen < 7) {
1252 emit_find_msb_using_lzd(bld, result, op[0], true);
1253 } else {
1254 bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1255
1256 /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1257 * count from the LSB side. If FBH didn't return an error
1258 * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1259 * count into an LSB count.
1260 */
1261 bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1262
1263 inst = bld.ADD(result, result, brw_imm_d(31));
1264 inst->predicate = BRW_PREDICATE_NORMAL;
1265 inst->src[0].negate = true;
1266 }
1267 break;
1268 }
1269
1270 case nir_op_find_lsb:
1271 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1272
1273 if (devinfo->gen < 7) {
1274 fs_reg temp = vgrf(glsl_type::int_type);
1275
1276 /* (x & -x) generates a value that consists of only the LSB of x.
1277 * For all powers of 2, findMSB(y) == findLSB(y).
1278 */
1279 fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1280 fs_reg negated_src = src;
1281
1282 /* One must be negated, and the other must be non-negated. It
1283 * doesn't matter which is which.
1284 */
1285 negated_src.negate = true;
1286 src.negate = false;
1287
1288 bld.AND(temp, src, negated_src);
1289 emit_find_msb_using_lzd(bld, result, temp, false);
1290 } else {
1291 bld.FBL(result, op[0]);
1292 }
1293 break;
1294
1295 case nir_op_ubitfield_extract:
1296 case nir_op_ibitfield_extract:
1297 unreachable("should have been lowered");
1298 case nir_op_ubfe:
1299 case nir_op_ibfe:
1300 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1301 bld.BFE(result, op[2], op[1], op[0]);
1302 break;
1303 case nir_op_bfm:
1304 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1305 bld.BFI1(result, op[0], op[1]);
1306 break;
1307 case nir_op_bfi:
1308 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1309 bld.BFI2(result, op[0], op[1], op[2]);
1310 break;
1311
1312 case nir_op_bitfield_insert:
1313 unreachable("not reached: should have been lowered");
1314
1315 case nir_op_ishl:
1316 bld.SHL(result, op[0], op[1]);
1317 break;
1318 case nir_op_ishr:
1319 bld.ASR(result, op[0], op[1]);
1320 break;
1321 case nir_op_ushr:
1322 bld.SHR(result, op[0], op[1]);
1323 break;
1324
1325 case nir_op_pack_half_2x16_split:
1326 bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1327 break;
1328
1329 case nir_op_ffma:
1330 inst = bld.MAD(result, op[2], op[1], op[0]);
1331 inst->saturate = instr->dest.saturate;
1332 break;
1333
1334 case nir_op_flrp:
1335 inst = bld.LRP(result, op[0], op[1], op[2]);
1336 inst->saturate = instr->dest.saturate;
1337 break;
1338
1339 case nir_op_bcsel:
1340 if (optimize_frontfacing_ternary(instr, result))
1341 return;
1342
1343 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1344 inst = bld.SEL(result, op[1], op[2]);
1345 inst->predicate = BRW_PREDICATE_NORMAL;
1346 break;
1347
1348 case nir_op_extract_u8:
1349 case nir_op_extract_i8: {
1350 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1351 nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
1352 assert(byte != NULL);
1353 bld.MOV(result, subscript(op[0], type, byte->u32[0]));
1354 break;
1355 }
1356
1357 case nir_op_extract_u16:
1358 case nir_op_extract_i16: {
1359 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1360 nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
1361 assert(word != NULL);
1362 bld.MOV(result, subscript(op[0], type, word->u32[0]));
1363 break;
1364 }
1365
1366 default:
1367 unreachable("unhandled instruction");
1368 }
1369
1370 /* If we need to do a boolean resolve, replace the result with -(x & 1)
1371 * to sign-extend the low bit to 0/~0.
1372 */
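/* For example, a CMP result with low bit 1 becomes -(1 & 1) == ~0, while 0
 * stays 0, which is the 0/~0 boolean encoding these older platforms expect.
 */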
1373 if (devinfo->gen <= 5 &&
1374 (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1375 fs_reg masked = vgrf(glsl_type::int_type);
1376 bld.AND(masked, result, brw_imm_d(1));
1377 masked.negate = true;
1378 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1379 }
1380 }
1381
1382 void
1383 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1384 nir_load_const_instr *instr)
1385 {
1386 const brw_reg_type reg_type =
1387 instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
1388 fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1389
1390 switch (instr->def.bit_size) {
1391 case 32:
1392 for (unsigned i = 0; i < instr->def.num_components; i++)
1393 bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
1394 break;
1395
1396 case 64:
1397 for (unsigned i = 0; i < instr->def.num_components; i++)
1398 bld.MOV(offset(reg, bld, i),
1399 setup_imm_df(bld, instr->value.f64[i]));
1400 break;
1401
1402 default:
1403 unreachable("Invalid bit size");
1404 }
1405
1406 nir_ssa_values[instr->def.index] = reg;
1407 }
1408
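/* Fetch the register backing a NIR source.  SSA undefs get a fresh VGRF on
 * every use, other SSA values come from nir_ssa_values[], and NIR registers
 * are looked up in nir_locals[] with their base offset applied.
 */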
1409 fs_reg
1410 fs_visitor::get_nir_src(const nir_src &src)
1411 {
1412 fs_reg reg;
1413 if (src.is_ssa) {
1414 if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1415 const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
1416 BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
1417 reg = bld.vgrf(reg_type, src.ssa->num_components);
1418 } else {
1419 reg = nir_ssa_values[src.ssa->index];
1420 }
1421 } else {
1422 /* We don't handle indirects on locals */
1423 assert(src.reg.indirect == NULL);
1424 reg = offset(nir_locals[src.reg.reg->index], bld,
1425 src.reg.base_offset * src.reg.reg->num_components);
1426 }
1427
1428 /* To avoid floating-point denorm flushing problems, set the type to D by
1429 * default - instructions that need floating-point semantics will retype
1430 * this to F as needed.
1431 */
1432 return retype(reg, BRW_REGISTER_TYPE_D);
1433 }
1434
1435 /**
1436 * Return an IMM for constants; otherwise call get_nir_src() as normal.
1437 */
1438 fs_reg
1439 fs_visitor::get_nir_src_imm(const nir_src &src)
1440 {
1441 nir_const_value *val = nir_src_as_const_value(src);
1442 return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
1443 }
1444
1445 fs_reg
1446 fs_visitor::get_nir_dest(const nir_dest &dest)
1447 {
1448 if (dest.is_ssa) {
1449 const brw_reg_type reg_type =
1450 dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
1451 nir_ssa_values[dest.ssa.index] =
1452 bld.vgrf(reg_type, dest.ssa.num_components);
1453 return nir_ssa_values[dest.ssa.index];
1454 } else {
1455 /* We don't handle indirects on locals */
1456 assert(dest.reg.indirect == NULL);
1457 return offset(nir_locals[dest.reg.reg->index], bld,
1458 dest.reg.base_offset * dest.reg.reg->num_components);
1459 }
1460 }
1461
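/* Resolve a NIR image-variable dereference chain to the UNIFORM register
 * holding its image parameters.  Constant array indices are folded into the
 * register offset; indirect indices are clamped to the array bounds and
 * applied via MOV_INDIRECT into a temporary.
 */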
1462 fs_reg
1463 fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
1464 {
1465 fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
1466 BRW_REGISTER_TYPE_UD);
1467 fs_reg indirect;
1468 unsigned indirect_max = 0;
1469
1470 for (const nir_deref *tail = &deref->deref; tail->child;
1471 tail = tail->child) {
1472 const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
1473 assert(tail->child->deref_type == nir_deref_type_array);
1474 const unsigned size = glsl_get_length(tail->type);
1475 const unsigned element_size = type_size_scalar(deref_array->deref.type);
1476 const unsigned base = MIN2(deref_array->base_offset, size - 1);
1477 image = offset(image, bld, base * element_size);
1478
1479 if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
1480 fs_reg tmp = vgrf(glsl_type::uint_type);
1481
1482 /* Accessing an invalid surface index with the dataport can result
1483 * in a hang. According to the spec "if the index used to
1484 * select an individual element is negative or greater than or
1485 * equal to the size of the array, the results of the operation
1486 * are undefined but may not lead to termination" -- which is one
1487 * of the possible outcomes of the hang. Clamp the index to
1488 * prevent access outside of the array bounds.
1489 */
1490 bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
1491 BRW_REGISTER_TYPE_UD),
1492 brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
1493
1494 indirect_max += element_size * (tail->type->length - 1);
1495
1496 bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
1497 if (indirect.file == BAD_FILE) {
1498 indirect = tmp;
1499 } else {
1500 bld.ADD(indirect, indirect, tmp);
1501 }
1502 }
1503 }
1504
1505 if (indirect.file == BAD_FILE) {
1506 return image;
1507 } else {
1508 /* Emit a pile of MOVs to load the uniform into a temporary. The
1509 * dead-code elimination pass will get rid of what we don't use.
1510 */
1511 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
1512 for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
1513 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
1514 offset(tmp, bld, j), offset(image, bld, j),
1515 indirect, brw_imm_ud((indirect_max + 1) * 4));
1516 }
1517 return tmp;
1518 }
1519 }
1520
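/* Re-emit `inst` once for every component enabled in wr_mask, offsetting the
 * destination and any VGRF sources to the corresponding component.
 */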
1521 void
1522 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1523 unsigned wr_mask)
1524 {
1525 for (unsigned i = 0; i < 4; i++) {
1526 if (!((wr_mask >> i) & 1))
1527 continue;
1528
1529 fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1530 new_inst->dst = offset(new_inst->dst, bld, i);
1531 for (unsigned j = 0; j < new_inst->sources; j++)
1532 if (new_inst->src[j].file == VGRF)
1533 new_inst->src[j] = offset(new_inst->src[j], bld, i);
1534
1535 bld.emit(new_inst);
1536 }
1537 }
1538
1539 /**
1540 * Get the matching channel register datatype for an image intrinsic of the
1541 * specified GLSL image type.
1542 */
1543 static brw_reg_type
1544 get_image_base_type(const glsl_type *type)
1545 {
1546 switch ((glsl_base_type)type->sampled_type) {
1547 case GLSL_TYPE_UINT:
1548 return BRW_REGISTER_TYPE_UD;
1549 case GLSL_TYPE_INT:
1550 return BRW_REGISTER_TYPE_D;
1551 case GLSL_TYPE_FLOAT:
1552 return BRW_REGISTER_TYPE_F;
1553 default:
1554 unreachable("Not reached.");
1555 }
1556 }
1557
1558 /**
1559 * Get the appropriate atomic op for an image atomic intrinsic.
1560 */
1561 static unsigned
1562 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
1563 {
1564 switch (op) {
1565 case nir_intrinsic_image_atomic_add:
1566 return BRW_AOP_ADD;
1567 case nir_intrinsic_image_atomic_min:
1568 return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1569 BRW_AOP_IMIN : BRW_AOP_UMIN);
1570 case nir_intrinsic_image_atomic_max:
1571 return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1572 BRW_AOP_IMAX : BRW_AOP_UMAX);
1573 case nir_intrinsic_image_atomic_and:
1574 return BRW_AOP_AND;
1575 case nir_intrinsic_image_atomic_or:
1576 return BRW_AOP_OR;
1577 case nir_intrinsic_image_atomic_xor:
1578 return BRW_AOP_XOR;
1579 case nir_intrinsic_image_atomic_exchange:
1580 return BRW_AOP_MOV;
1581 case nir_intrinsic_image_atomic_comp_swap:
1582 return BRW_AOP_CMPWR;
1583 default:
1584 unreachable("Not reachable.");
1585 }
1586 }
1587
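/* Emit one of the pixel interpolator (PI) messages.  With no payload source a
 * dummy single-register payload is used; otherwise the message length scales
 * with the dispatch width.  The caller provides the message descriptor and
 * the interpolation mode, which determines the noperspective flag.
 */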
1588 static fs_inst *
1589 emit_pixel_interpolater_send(const fs_builder &bld,
1590 enum opcode opcode,
1591 const fs_reg &dst,
1592 const fs_reg &src,
1593 const fs_reg &desc,
1594 glsl_interp_mode interpolation)
1595 {
1596 struct brw_wm_prog_data *wm_prog_data =
1597 brw_wm_prog_data(bld.shader->stage_prog_data);
1598 fs_inst *inst;
1599 fs_reg payload;
1600 int mlen;
1601
1602 if (src.file == BAD_FILE) {
1603 /* Dummy payload */
1604 payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
1605 mlen = 1;
1606 } else {
1607 payload = src;
1608 mlen = 2 * bld.dispatch_width() / 8;
1609 }
1610
1611 inst = bld.emit(opcode, dst, payload, desc);
1612 inst->mlen = mlen;
1613 /* 2 floats per slot returned */
1614 inst->size_written = 2 * dst.component_size(inst->exec_size);
1615 inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1616
1617 wm_prog_data->pulls_bary = true;
1618
1619 return inst;
1620 }
1621
1622 /**
1623 * Computes 1 << x, given a D/UD register containing some value x.
1624 */
1625 static fs_reg
1626 intexp2(const fs_builder &bld, const fs_reg &x)
1627 {
1628 assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1629
1630 fs_reg result = bld.vgrf(x.type, 1);
1631 fs_reg one = bld.vgrf(x.type, 1);
1632
1633 bld.MOV(one, retype(brw_imm_d(1), one.type));
1634 bld.SHL(result, one, x);
1635 return result;
1636 }
1637
1638 void
1639 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1640 {
1641 assert(stage == MESA_SHADER_GEOMETRY);
1642
1643 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1644
1645 if (gs_compile->control_data_header_size_bits == 0)
1646 return;
1647
1648 /* We can only do EndPrimitive() functionality when the control data
1649 * consists of cut bits. Fortunately, the only time it isn't is when the
1650 * output type is points, in which case EndPrimitive() is a no-op.
1651 */
1652 if (gs_prog_data->control_data_format !=
1653 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1654 return;
1655 }
1656
1657 /* Cut bits use one bit per vertex. */
1658 assert(gs_compile->control_data_bits_per_vertex == 1);
1659
1660 fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1661 vertex_count.type = BRW_REGISTER_TYPE_UD;
1662
1663 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1664 * vertex n, 0 otherwise. So all we need to do here is mark bit
1665 * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1666 * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1667 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1668 *
1669 * Note that if EndPrimitive() is called before emitting any vertices, this
1670 * will cause us to set bit 31 of the control_data_bits register to 1.
1671 * That's fine because:
1672 *
1673 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1674 * output, so the hardware will ignore cut bit 31.
1675 *
1676 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1677 * last vertex, so setting cut bit 31 has no effect (since the primitive
1678 * is automatically ended when the GS terminates).
1679 *
1680 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1681 * control_data_bits register to 0 when the first vertex is emitted.
1682 */
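/* As an arbitrary example, if EndPrimitive() is called right after emitting
 * vertex 35, vertex_count == 35 and we set cut bit (35 - 1) % 32 == 2.
 */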
1683
1684 const fs_builder abld = bld.annotate("end primitive");
1685
1686 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1687 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1688 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1689 fs_reg mask = intexp2(abld, prev_count);
1690 /* Note: we're relying on the fact that the GEN SHL instruction only pays
1691 * attention to the lower 5 bits of its second source argument, so on this
1692 * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1693 * ((vertex_count - 1) % 32).
1694 */
1695 abld.OR(this->control_data_bits, this->control_data_bits, mask);
1696 }
1697
1698 void
1699 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1700 {
1701 assert(stage == MESA_SHADER_GEOMETRY);
1702 assert(gs_compile->control_data_bits_per_vertex != 0);
1703
1704 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1705
1706 const fs_builder abld = bld.annotate("emit control data bits");
1707 const fs_builder fwa_bld = bld.exec_all();
1708
1709 /* We use a single UD register to accumulate control data bits (32 bits
1710 * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
1711 * at a time.
1712 *
1713 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1714 * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
1715 * use the Channel Mask phase to enable/disable which DWord within that
1716 * group to write. (Remember, different SIMD8 channels may have emitted
1717 * different numbers of vertices, so we may need per-slot offsets.)
1718 *
1719 * Channel masking presents an annoying problem: we may have to replicate
1720 * the data up to 4 times:
1721 *
1722 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1723 *
1724 * To avoid penalizing shaders that emit a small number of vertices, we
1725 * can sometimes skip these: if the size of the control data header is
1726 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will
1727 * land in the same 128-bit group, so we can skip per-slot offsets.
1728 *
1729 * Similarly, if the control data header is <= 32 bits, there is only one
1730 * DWord, so we can skip channel masks.
1731 */
1732 enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1733
1734 fs_reg channel_mask, per_slot_offset;
1735
1736 if (gs_compile->control_data_header_size_bits > 32) {
1737 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1738 channel_mask = vgrf(glsl_type::uint_type);
1739 }
1740
1741 if (gs_compile->control_data_header_size_bits > 128) {
1742 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1743 per_slot_offset = vgrf(glsl_type::uint_type);
1744 }
1745
1746 /* Figure out which DWord we're trying to write to using the formula:
1747 *
1748 * dword_index = (vertex_count - 1) * bits_per_vertex / 32
1749 *
1750 * Since bits_per_vertex is a power of two, and is known at compile
1751 * time, this can be optimized to:
1752 *
1753 * dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
1754 */
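/* Arbitrary worked example: with 2 control data bits per vertex and
 * vertex_count == 17, dword_index == 16 * 2 / 32 == 1, so the per-slot
 * offset (dword_index / 4) is 0 and the channel mask selects DWord 1 of
 * that OWord.
 */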
1755 if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1756 fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1757 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1758 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1759 unsigned log2_bits_per_vertex =
1760 util_last_bit(gs_compile->control_data_bits_per_vertex);
1761 abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1762
1763 if (per_slot_offset.file != BAD_FILE) {
1764 /* Set the per-slot offset to dword_index / 4, so that we'll write to
1765 * the appropriate OWord within the control data header.
1766 */
1767 abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1768 }
1769
1770 /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1771 * write to the appropriate DWORD within the OWORD.
1772 */
1773 fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1774 fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
1775 channel_mask = intexp2(fwa_bld, channel);
1776 /* Then the channel masks need to be in bits 23:16. */
1777 fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
1778 }
1779
1780 /* Store the control data bits in the message payload and send it. */
1781 int mlen = 2;
1782 if (channel_mask.file != BAD_FILE)
1783 mlen += 4; /* channel masks, plus 3 extra copies of the data */
1784 if (per_slot_offset.file != BAD_FILE)
1785 mlen++;
1786
1787 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1788 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1789 int i = 0;
1790 sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1791 if (per_slot_offset.file != BAD_FILE)
1792 sources[i++] = per_slot_offset;
1793 if (channel_mask.file != BAD_FILE)
1794 sources[i++] = channel_mask;
1795 while (i < mlen) {
1796 sources[i++] = this->control_data_bits;
1797 }
1798
1799 abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
1800 fs_inst *inst = abld.emit(opcode, reg_undef, payload);
1801 inst->mlen = mlen;
1802    /* We need to increment Global Offset by 256 bits to make room for
1803 * Broadwell's extra "Vertex Count" payload at the beginning of the
1804 * URB entry. Since this is an OWord message, Global Offset is counted
1805 * in 128-bit units, so we must set it to 2.
1806 */
1807 if (gs_prog_data->static_vertex_count == -1)
1808 inst->offset = 2;
1809 }
1810
1811 void
1812 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
1813 unsigned stream_id)
1814 {
1815 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
1816
1817 /* Note: we are calling this *before* increasing vertex_count, so
1818     * the vertex_count passed in corresponds to vertex_count - 1 in the formula above.
1819 */
1820
1821 /* Stream mode uses 2 bits per vertex */
1822 assert(gs_compile->control_data_bits_per_vertex == 2);
1823
1824 /* Must be a valid stream */
1825 assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
1826
1827 /* Control data bits are initialized to 0 so we don't have to set any
1828 * bits when sending vertices to stream 0.
1829 */
1830 if (stream_id == 0)
1831 return;
1832
1833 const fs_builder abld = bld.annotate("set stream control data bits", NULL);
1834
1835 /* reg::sid = stream_id */
1836 fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1837 abld.MOV(sid, brw_imm_ud(stream_id));
1838
1839 /* reg:shift_count = 2 * (vertex_count - 1) */
1840 fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1841 abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
1842
1843 /* Note: we're relying on the fact that the GEN SHL instruction only pays
1844 * attention to the lower 5 bits of its second source argument, so on this
1845 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
1846 * stream_id << ((2 * (vertex_count - 1)) % 32).
1847 */
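   /* For example, emitting the fourth vertex (vertex_count == 3 here) to
    * stream 2 gives shift_count == 6 and mask == 2 << 6, setting the two-bit
    * field at bits 7:6 of control_data_bits to 0b10.
    */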
1848 fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1849 abld.SHL(mask, sid, shift_count);
1850 abld.OR(this->control_data_bits, this->control_data_bits, mask);
1851 }
1852
1853 void
1854 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
1855 unsigned stream_id)
1856 {
1857 assert(stage == MESA_SHADER_GEOMETRY);
1858
1859 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1860
1861 fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1862 vertex_count.type = BRW_REGISTER_TYPE_UD;
1863
1864 /* Haswell and later hardware ignores the "Render Stream Select" bits
1865 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
1866 * and instead sends all primitives down the pipeline for rasterization.
1867 * If the SOL stage is enabled, "Render Stream Select" is honored and
1868 * primitives bound to non-zero streams are discarded after stream output.
1869 *
1870     * Since the only purpose of primitives sent to non-zero streams is to
1871 * be recorded by transform feedback, we can simply discard all geometry
1872 * bound to these streams when transform feedback is disabled.
1873 */
1874 if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
1875 return;
1876
1877 /* If we're outputting 32 control data bits or less, then we can wait
1878 * until the shader is over to output them all. Otherwise we need to
1879 * output them as we go. Now is the time to do it, since we're about to
1880 * output the vertex_count'th vertex, so it's guaranteed that the
1881 * control data bits associated with the (vertex_count - 1)th vertex are
1882 * correct.
1883 */
1884 if (gs_compile->control_data_header_size_bits > 32) {
1885 const fs_builder abld =
1886 bld.annotate("emit vertex: emit control data bits");
1887
1888 /* Only emit control data bits if we've finished accumulating a batch
1889 * of 32 bits. This is the case when:
1890 *
1891 * (vertex_count * bits_per_vertex) % 32 == 0
1892 *
1893 * (in other words, when the last 5 bits of vertex_count *
1894 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
1895 * integer n (which is always the case, since bits_per_vertex is
1896 * always 1 or 2), this is equivalent to requiring that the last 5-n
1897 * bits of vertex_count are 0:
1898 *
1899 * vertex_count & (2^(5-n) - 1) == 0
1900 *
1901 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
1902 * equivalent to:
1903 *
1904 * vertex_count & (32 / bits_per_vertex - 1) == 0
1905 *
1906 * TODO: If vertex_count is an immediate, we could do some of this math
1907 * at compile time...
1908 */
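      /* For example, with 2 control data bits per vertex the immediate below
       * is 15, so the accumulated bits are flushed whenever vertex_count is a
       * nonzero multiple of 16, i.e. exactly when a full DWord has been filled.
       */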
1909 fs_inst *inst =
1910 abld.AND(bld.null_reg_d(), vertex_count,
1911 brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
1912 inst->conditional_mod = BRW_CONDITIONAL_Z;
1913
1914 abld.IF(BRW_PREDICATE_NORMAL);
1915 /* If vertex_count is 0, then no control data bits have been
1916 * accumulated yet, so we can skip emitting them.
1917 */
1918 abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
1919 BRW_CONDITIONAL_NEQ);
1920 abld.IF(BRW_PREDICATE_NORMAL);
1921 emit_gs_control_data_bits(vertex_count);
1922 abld.emit(BRW_OPCODE_ENDIF);
1923
1924 /* Reset control_data_bits to 0 so we can start accumulating a new
1925 * batch.
1926 *
1927 * Note: in the case where vertex_count == 0, this neutralizes the
1928 * effect of any call to EndPrimitive() that the shader may have
1929 * made before outputting its first vertex.
1930 */
1931 inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
1932 inst->force_writemask_all = true;
1933 abld.emit(BRW_OPCODE_ENDIF);
1934 }
1935
1936 emit_urb_writes(vertex_count);
1937
1938 /* In stream mode we have to set control data bits for all vertices
1939 * unless we have disabled control data bits completely (which we do
1940     * for GL_POINTS outputs that don't use streams).
1941 */
1942 if (gs_compile->control_data_header_size_bits > 0 &&
1943 gs_prog_data->control_data_format ==
1944 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
1945 set_gs_stream_control_data_bits(vertex_count, stream_id);
1946 }
1947 }
1948
1949 void
1950 fs_visitor::emit_gs_input_load(const fs_reg &dst,
1951 const nir_src &vertex_src,
1952 unsigned base_offset,
1953 const nir_src &offset_src,
1954 unsigned num_components,
1955 unsigned first_component)
1956 {
1957 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1958
1959 nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
1960 nir_const_value *offset_const = nir_src_as_const_value(offset_src);
1961 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
1962
1963 /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
1964 * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w]. Only
1965 * gl_PointSize is available as a GS input, however, so it must be that.
1966 */
1967 const bool is_point_size = (base_offset == 0);
1968
1969 /* TODO: figure out push input layout for invocations == 1 */
1970 if (gs_prog_data->invocations == 1 &&
1971 offset_const != NULL && vertex_const != NULL &&
1972 4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
1973 int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
1974 vertex_const->u32[0] * push_reg_count;
1975 /* This input was pushed into registers. */
1976 if (is_point_size) {
1977 /* gl_PointSize comes in .w */
1978 bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
1979 } else {
1980 for (unsigned i = 0; i < num_components; i++) {
1981 bld.MOV(offset(dst, bld, i),
1982 fs_reg(ATTR, imm_offset + i + first_component, dst.type));
1983 }
1984 }
1985 return;
1986 }
1987
1988 /* Resort to the pull model. Ensure the VUE handles are provided. */
1989 gs_prog_data->base.include_vue_handles = true;
1990
1991 unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
1992 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1993
1994 if (gs_prog_data->invocations == 1) {
1995 if (vertex_const) {
1996 /* The vertex index is constant; just select the proper URB handle. */
1997 icp_handle =
1998 retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
1999 BRW_REGISTER_TYPE_UD);
2000 } else {
2001 /* The vertex index is non-constant. We need to use indirect
2002 * addressing to fetch the proper URB handle.
2003 *
2004 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2005 * indicating that channel <n> should read the handle from
2006 * DWord <n>. We convert that to bytes by multiplying by 4.
2007 *
2008 * Next, we convert the vertex index to bytes by multiplying
2009 * by 32 (shifting by 5), and add the two together. This is
2010 * the final indirect byte offset.
2011 */
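         /* For example, a channel reading vertex 2 adds a vertex offset of
          * 2 * 32 = 64 bytes to its own channel offset, landing in the
          * matching DWord of GRF (first_icp_handle + 2).
          */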
2012 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
2013 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2014 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2015 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2016
2017 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2018 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2019 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2020 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2021 /* Convert vertex_index to bytes (multiply by 32) */
2022 bld.SHL(vertex_offset_bytes,
2023 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2024 brw_imm_ud(5u));
2025 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2026
2027 /* Use first_icp_handle as the base offset. There is one register
2028 * of URB handles per vertex, so inform the register allocator that
2029 * we might read up to nir->info->gs.vertices_in registers.
2030 */
2031 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2032 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2033 fs_reg(icp_offset_bytes),
2034 brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
2035 }
2036 } else {
2037 assert(gs_prog_data->invocations > 1);
2038
2039 if (vertex_const) {
2040 assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5);
2041 bld.MOV(icp_handle,
2042 retype(brw_vec1_grf(first_icp_handle +
2043 vertex_const->i32[0] / 8,
2044 vertex_const->i32[0] % 8),
2045 BRW_REGISTER_TYPE_UD));
2046 } else {
2047 /* The vertex index is non-constant. We need to use indirect
2048 * addressing to fetch the proper URB handle.
2049 *
2050 */
2051 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2052
2053 /* Convert vertex_index to bytes (multiply by 4) */
2054 bld.SHL(icp_offset_bytes,
2055 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2056 brw_imm_ud(2u));
2057
2058 /* Use first_icp_handle as the base offset. There is one DWord
2059 * of URB handles per vertex, so inform the register allocator that
2060 * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
2061 */
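         /* For example, with 6 input vertices the handles occupy DWords 0..5
          * of GRF first_icp_handle, so vertex 5 reads from byte offset 20 and
          * only one register needs to be flagged as potentially read.
          */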
2062 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2063 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2064 fs_reg(icp_offset_bytes),
2065 brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
2066 REG_SIZE));
2067 }
2068 }
2069
2070 fs_inst *inst;
2071
2072 fs_reg tmp_dst = dst;
2073 fs_reg indirect_offset = get_nir_src(offset_src);
2074 unsigned num_iterations = 1;
2075 unsigned orig_num_components = num_components;
2076
2077 if (type_sz(dst.type) == 8) {
2078 if (num_components > 2) {
2079 num_iterations = 2;
2080 num_components = 2;
2081 }
2082 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2083 tmp_dst = tmp;
2084 first_component = first_component / 2;
2085 }
2086
2087 for (unsigned iter = 0; iter < num_iterations; iter++) {
2088 if (offset_const) {
2089 /* Constant indexing - use global offset. */
2090 if (first_component != 0) {
2091 unsigned read_components = num_components + first_component;
2092 fs_reg tmp = bld.vgrf(dst.type, read_components);
2093 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2094 inst->size_written = read_components *
2095 tmp.component_size(inst->exec_size);
2096 for (unsigned i = 0; i < num_components; i++) {
2097 bld.MOV(offset(tmp_dst, bld, i),
2098 offset(tmp, bld, i + first_component));
2099 }
2100 } else {
2101 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2102 icp_handle);
2103 inst->size_written = num_components *
2104 tmp_dst.component_size(inst->exec_size);
2105 }
2106 inst->offset = base_offset + offset_const->u32[0];
2107 inst->mlen = 1;
2108 } else {
2109 /* Indirect indexing - use per-slot offsets as well. */
2110 const fs_reg srcs[] = { icp_handle, indirect_offset };
2111 unsigned read_components = num_components + first_component;
2112 fs_reg tmp = bld.vgrf(dst.type, read_components);
2113 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2114 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2115 if (first_component != 0) {
2116 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2117 payload);
2118 inst->size_written = read_components *
2119 tmp.component_size(inst->exec_size);
2120 for (unsigned i = 0; i < num_components; i++) {
2121 bld.MOV(offset(tmp_dst, bld, i),
2122 offset(tmp, bld, i + first_component));
2123 }
2124 } else {
2125 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2126 payload);
2127 inst->size_written = num_components *
2128 tmp_dst.component_size(inst->exec_size);
2129 }
2130 inst->offset = base_offset;
2131 inst->mlen = 2;
2132 }
2133
2134 if (type_sz(dst.type) == 8) {
2135 shuffle_32bit_load_result_to_64bit_data(
2136 bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
2137
2138 for (unsigned c = 0; c < num_components; c++)
2139 bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
2140 }
2141
2142 if (num_iterations > 1) {
2143 num_components = orig_num_components - 2;
2144       if (offset_const) {
2145 base_offset++;
2146 } else {
2147 fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2148 bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2149 indirect_offset = new_indirect;
2150 }
2151 }
2152 }
2153
2154 if (is_point_size) {
2155 /* Read the whole VUE header (because of alignment) and read .w. */
2156 fs_reg tmp = bld.vgrf(dst.type, 4);
2157 inst->dst = tmp;
2158 inst->size_written = 4 * REG_SIZE;
2159 bld.MOV(dst, offset(tmp, bld, 3));
2160 }
2161 }
2162
2163 fs_reg
2164 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2165 {
2166 nir_src *offset_src = nir_get_io_offset_src(instr);
2167 nir_const_value *const_value = nir_src_as_const_value(*offset_src);
2168
2169 if (const_value) {
2170 /* The only constant offset we should find is 0. brw_nir.c's
2171 * add_const_offset_to_base() will fold other constant offsets
2172 * into instr->const_index[0].
2173 */
2174 assert(const_value->u32[0] == 0);
2175 return fs_reg();
2176 }
2177
2178 return get_nir_src(*offset_src);
2179 }
2180
2181 static void
2182 do_untyped_vector_read(const fs_builder &bld,
2183 const fs_reg dest,
2184 const fs_reg surf_index,
2185 const fs_reg offset_reg,
2186 unsigned num_components)
2187 {
2188 if (type_sz(dest.type) == 4) {
2189 fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
2190 1 /* dims */,
2191 num_components,
2192 BRW_PREDICATE_NONE);
2193 read_result.type = dest.type;
2194 for (unsigned i = 0; i < num_components; i++)
2195 bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
2196 } else if (type_sz(dest.type) == 8) {
2197 /* Reading a dvec, so we need to:
2198 *
2199 * 1. Multiply num_components by 2, to account for the fact that we
2200 * need to read 64-bit components.
2201 * 2. Shuffle the result of the load to form valid 64-bit elements
2202 * 3. Emit a second load (for components z/w) if needed.
2203 */
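      /* For example, a dvec3 read takes two iterations: the first loads four
       * DWords and shuffles them into the x/y doubles, then the offset is
       * advanced by 16 bytes and the second iteration loads two DWords for z.
       */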
2204 fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
2205 bld.MOV(read_offset, offset_reg);
2206
2207 int iters = num_components <= 2 ? 1 : 2;
2208
2209 /* Load the dvec, the first iteration loads components x/y, the second
2210 * iteration, if needed, loads components z/w
2211 */
2212 for (int it = 0; it < iters; it++) {
2213 /* Compute number of components to read in this iteration */
2214 int iter_components = MIN2(2, num_components);
2215 num_components -= iter_components;
2216
2217 /* Read. Since this message reads 32-bit components, we need to
2218 * read twice as many components.
2219 */
2220 fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset,
2221 1 /* dims */,
2222 iter_components * 2,
2223 BRW_PREDICATE_NONE);
2224
2225 /* Shuffle the 32-bit load result into valid 64-bit data */
2226 const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
2227 shuffle_32bit_load_result_to_64bit_data(
2228 bld, packed_result, read_result, iter_components);
2229
2230 /* Move each component to its destination */
2231 read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
2232 for (int c = 0; c < iter_components; c++) {
2233 bld.MOV(offset(dest, bld, it * 2 + c),
2234 offset(packed_result, bld, c));
2235 }
2236
2237 bld.ADD(read_offset, read_offset, brw_imm_ud(16));
2238 }
2239 } else {
2240 unreachable("Unsupported type");
2241 }
2242 }
2243
2244 void
2245 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2246 nir_intrinsic_instr *instr)
2247 {
2248 assert(stage == MESA_SHADER_VERTEX);
2249
2250 fs_reg dest;
2251 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2252 dest = get_nir_dest(instr->dest);
2253
2254 switch (instr->intrinsic) {
2255 case nir_intrinsic_load_vertex_id:
2256 unreachable("should be lowered by lower_vertex_id()");
2257
2258 case nir_intrinsic_load_vertex_id_zero_base:
2259 case nir_intrinsic_load_base_vertex:
2260 case nir_intrinsic_load_instance_id:
2261 case nir_intrinsic_load_base_instance:
2262 case nir_intrinsic_load_draw_id: {
2263 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
2264 fs_reg val = nir_system_values[sv];
2265 assert(val.file != BAD_FILE);
2266 dest.type = val.type;
2267 bld.MOV(dest, val);
2268 break;
2269 }
2270
2271 case nir_intrinsic_load_input: {
2272 fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
2273 unsigned first_component = nir_intrinsic_component(instr);
2274 unsigned num_components = instr->num_components;
2275 enum brw_reg_type type = dest.type;
2276
2277 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
2278 assert(const_offset && "Indirect input loads not allowed");
2279 src = offset(src, bld, const_offset->u32[0]);
2280
2281 for (unsigned j = 0; j < num_components; j++) {
2282 bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
2283 }
2284
2285 if (type == BRW_REGISTER_TYPE_DF) {
2286        /* Once the double vector is read, restore its original register
2287         * type so we can continue with normal execution.
2288 */
2289 src = retype(src, type);
2290 dest = retype(dest, type);
2291 }
2292
2293 if (type_sz(src.type) == 8) {
2294 shuffle_32bit_load_result_to_64bit_data(bld,
2295 dest,
2296 retype(dest, BRW_REGISTER_TYPE_F),
2297 instr->num_components);
2298 }
2299 break;
2300 }
2301
2302 default:
2303 nir_emit_intrinsic(bld, instr);
2304 break;
2305 }
2306 }
2307
2308 void
2309 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2310 nir_intrinsic_instr *instr)
2311 {
2312 assert(stage == MESA_SHADER_TESS_CTRL);
2313 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2314 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2315
2316 fs_reg dst;
2317 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2318 dst = get_nir_dest(instr->dest);
2319
2320 switch (instr->intrinsic) {
2321 case nir_intrinsic_load_primitive_id:
2322 bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2323 break;
2324 case nir_intrinsic_load_invocation_id:
2325 bld.MOV(retype(dst, invocation_id.type), invocation_id);
2326 break;
2327 case nir_intrinsic_load_patch_vertices_in:
2328 bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2329 brw_imm_d(tcs_key->input_vertices));
2330 break;
2331
2332 case nir_intrinsic_barrier: {
2333 if (tcs_prog_data->instances == 1)
2334 break;
2335
2336 fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2337 fs_reg m0_2 = component(m0, 2);
2338
2339 const fs_builder chanbld = bld.exec_all().group(1, 0);
2340
2341 /* Zero the message header */
2342 bld.exec_all().MOV(m0, brw_imm_ud(0u));
2343
2344 /* Copy "Barrier ID" from r0.2, bits 16:13 */
2345 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2346 brw_imm_ud(INTEL_MASK(16, 13)));
2347
2348 /* Shift it up to bits 27:24. */
2349 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2350
2351 /* Set the Barrier Count and the enable bit */
2352 chanbld.OR(m0_2, m0_2,
2353 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
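      /* The resulting m0.2 now holds the barrier ID in bits 27:24 (the r0.2
       * field shifted up by 11), the instance count starting at bit 9 and the
       * enable bit 15; e.g. 8 instances contribute (8 << 9) | (1 << 15),
       * which is 0x9000.
       */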
2354
2355 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2356 break;
2357 }
2358
2359 case nir_intrinsic_load_input:
2360 unreachable("nir_lower_io should never give us these.");
2361 break;
2362
2363 case nir_intrinsic_load_per_vertex_input: {
2364 fs_reg indirect_offset = get_indirect_offset(instr);
2365 unsigned imm_offset = instr->const_index[0];
2366
2367 const nir_src &vertex_src = instr->src[0];
2368 nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
2369
2370 fs_inst *inst;
2371
2372 fs_reg icp_handle;
2373
2374 if (vertex_const) {
2375 /* Emit a MOV to resolve <0,1,0> regioning. */
2376 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2377 bld.MOV(icp_handle,
2378 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
2379 vertex_const->i32[0] & 7),
2380 BRW_REGISTER_TYPE_UD));
2381 } else if (tcs_prog_data->instances == 1 &&
2382 vertex_src.is_ssa &&
2383 vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2384 nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
2385 /* For the common case of only 1 instance, an array index of
2386 * gl_InvocationID means reading g1. Skip all the indirect work.
2387 */
2388 icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2389 } else {
2390 /* The vertex index is non-constant. We need to use indirect
2391 * addressing to fetch the proper URB handle.
2392 */
2393 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2394
2395 /* Each ICP handle is a single DWord (4 bytes) */
2396 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2397 bld.SHL(vertex_offset_bytes,
2398 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2399 brw_imm_ud(2u));
2400
2401 /* Start at g1. We might read up to 4 registers. */
2402 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2403 retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2404 brw_imm_ud(4 * REG_SIZE));
2405 }
2406
2407 /* We can only read two double components with each URB read, so
2408 * we send two read messages in that case, each one loading up to
2409 * two double components.
2410 */
2411 unsigned num_iterations = 1;
2412 unsigned num_components = instr->num_components;
2413 unsigned first_component = nir_intrinsic_component(instr);
2414 fs_reg orig_dst = dst;
2415 if (type_sz(dst.type) == 8) {
2416 first_component = first_component / 2;
2417 if (instr->num_components > 2) {
2418 num_iterations = 2;
2419 num_components = 2;
2420 }
2421
2422 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2423 dst = tmp;
2424 }
2425
2426 for (unsigned iter = 0; iter < num_iterations; iter++) {
2427 if (indirect_offset.file == BAD_FILE) {
2428 /* Constant indexing - use global offset. */
2429 if (first_component != 0) {
2430 unsigned read_components = num_components + first_component;
2431 fs_reg tmp = bld.vgrf(dst.type, read_components);
2432 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2433 for (unsigned i = 0; i < num_components; i++) {
2434 bld.MOV(offset(dst, bld, i),
2435 offset(tmp, bld, i + first_component));
2436 }
2437 } else {
2438 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2439 }
2440 inst->offset = imm_offset;
2441 inst->mlen = 1;
2442 } else {
2443 /* Indirect indexing - use per-slot offsets as well. */
2444 const fs_reg srcs[] = { icp_handle, indirect_offset };
2445 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2446 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2447 if (first_component != 0) {
2448 unsigned read_components = num_components + first_component;
2449 fs_reg tmp = bld.vgrf(dst.type, read_components);
2450 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2451 payload);
2452 for (unsigned i = 0; i < num_components; i++) {
2453 bld.MOV(offset(dst, bld, i),
2454 offset(tmp, bld, i + first_component));
2455 }
2456 } else {
2457 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2458 payload);
2459 }
2460 inst->offset = imm_offset;
2461 inst->mlen = 2;
2462 }
2463 inst->size_written = (num_components + first_component) *
2464 inst->dst.component_size(inst->exec_size);
2465
2466       /* If we are reading 64-bit data using 32-bit read messages we need to
2467 * build proper 64-bit data elements by shuffling the low and high
2468 * 32-bit components around like we do for other things like UBOs
2469 * or SSBOs.
2470 */
2471 if (type_sz(dst.type) == 8) {
2472 shuffle_32bit_load_result_to_64bit_data(
2473 bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
2474
2475 for (unsigned c = 0; c < num_components; c++) {
2476 bld.MOV(offset(orig_dst, bld, iter * 2 + c),
2477 offset(dst, bld, c));
2478 }
2479 }
2480
2481 /* Copy the temporary to the destination to deal with writemasking.
2482 *
2483 * Also attempt to deal with gl_PointSize being in the .w component.
2484 */
2485 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2486 assert(type_sz(dst.type) < 8);
2487 inst->dst = bld.vgrf(dst.type, 4);
2488 inst->size_written = 4 * REG_SIZE;
2489 bld.MOV(dst, offset(inst->dst, bld, 3));
2490 }
2491
2492       /* If we are loading double data and we need a second read message,
2493        * adjust the write offset.
2494 */
2495 if (num_iterations > 1) {
2496 num_components = instr->num_components - 2;
2497 imm_offset++;
2498 }
2499 }
2500 break;
2501 }
2502
2503 case nir_intrinsic_load_output:
2504 case nir_intrinsic_load_per_vertex_output: {
2505 fs_reg indirect_offset = get_indirect_offset(instr);
2506 unsigned imm_offset = instr->const_index[0];
2507 unsigned first_component = nir_intrinsic_component(instr);
2508
2509 fs_inst *inst;
2510 if (indirect_offset.file == BAD_FILE) {
2511 /* Replicate the patch handle to all enabled channels */
2512 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2513 bld.MOV(patch_handle,
2514 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2515
2516 {
2517 if (first_component != 0) {
2518 unsigned read_components =
2519 instr->num_components + first_component;
2520 fs_reg tmp = bld.vgrf(dst.type, read_components);
2521 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2522 patch_handle);
2523 inst->size_written = read_components * REG_SIZE;
2524 for (unsigned i = 0; i < instr->num_components; i++) {
2525 bld.MOV(offset(dst, bld, i),
2526 offset(tmp, bld, i + first_component));
2527 }
2528 } else {
2529 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2530 patch_handle);
2531 inst->size_written = instr->num_components * REG_SIZE;
2532 }
2533 inst->offset = imm_offset;
2534 inst->mlen = 1;
2535 }
2536 } else {
2537 /* Indirect indexing - use per-slot offsets as well. */
2538 const fs_reg srcs[] = {
2539 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2540 indirect_offset
2541 };
2542 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2543 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2544 if (first_component != 0) {
2545 unsigned read_components =
2546 instr->num_components + first_component;
2547 fs_reg tmp = bld.vgrf(dst.type, read_components);
2548 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2549 payload);
2550 inst->size_written = read_components * REG_SIZE;
2551 for (unsigned i = 0; i < instr->num_components; i++) {
2552 bld.MOV(offset(dst, bld, i),
2553 offset(tmp, bld, i + first_component));
2554 }
2555 } else {
2556 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2557 payload);
2558 inst->size_written = instr->num_components * REG_SIZE;
2559 }
2560 inst->offset = imm_offset;
2561 inst->mlen = 2;
2562 }
2563 break;
2564 }
2565
2566 case nir_intrinsic_store_output:
2567 case nir_intrinsic_store_per_vertex_output: {
2568 fs_reg value = get_nir_src(instr->src[0]);
2569 bool is_64bit = (instr->src[0].is_ssa ?
2570 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2571 fs_reg indirect_offset = get_indirect_offset(instr);
2572 unsigned imm_offset = instr->const_index[0];
2573 unsigned swiz = BRW_SWIZZLE_XYZW;
2574 unsigned mask = instr->const_index[1];
2575 unsigned header_regs = 0;
2576 fs_reg srcs[7];
2577 srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2578
2579 if (indirect_offset.file != BAD_FILE) {
2580 srcs[header_regs++] = indirect_offset;
2581 }
2582
2583 if (mask == 0)
2584 break;
2585
2586 unsigned num_components = util_last_bit(mask);
2587 enum opcode opcode;
2588
2589 /* We can only pack two 64-bit components in a single message, so send
2590 * 2 messages if we have more components
2591 */
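      /* For example, a fully-masked dvec4 store takes two iterations: the
       * first writes the X/Y doubles as four 32-bit channels at imm_offset,
       * then the mask is shifted down and Z/W are written at imm_offset + 1.
       */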
2592 unsigned num_iterations = 1;
2593 unsigned iter_components = num_components;
2594 unsigned first_component = nir_intrinsic_component(instr);
2595 if (is_64bit) {
2596 first_component = first_component / 2;
2597 if (instr->num_components > 2) {
2598 num_iterations = 2;
2599 iter_components = 2;
2600 }
2601 }
2602
2603       /* 64-bit data needs to be shuffled before we can write it to the URB.
2604 * We will use this temporary to shuffle the components in each
2605 * iteration.
2606 */
2607 fs_reg tmp =
2608 fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
2609
2610 mask = mask << first_component;
2611
2612 for (unsigned iter = 0; iter < num_iterations; iter++) {
2613 if (!is_64bit && mask != WRITEMASK_XYZW) {
2614 srcs[header_regs++] = brw_imm_ud(mask << 16);
2615 opcode = indirect_offset.file != BAD_FILE ?
2616 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2617 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2618 } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2619 /* Expand the 64-bit mask to 32-bit channels. We only handle
2620 * two channels in each iteration, so we only care about X/Y.
2621 */
2622 unsigned mask32 = 0;
2623 if (mask & WRITEMASK_X)
2624 mask32 |= WRITEMASK_XY;
2625 if (mask & WRITEMASK_Y)
2626 mask32 |= WRITEMASK_ZW;
2627
2628 /* If the mask does not include any of the channels X or Y there
2629 * is nothing to do in this iteration. Move on to the next couple
2630 * of 64-bit channels.
2631 */
2632 if (!mask32) {
2633 mask >>= 2;
2634 imm_offset++;
2635 continue;
2636 }
2637
2638 srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2639 opcode = indirect_offset.file != BAD_FILE ?
2640 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2641 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2642 } else {
2643 opcode = indirect_offset.file != BAD_FILE ?
2644 SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2645 SHADER_OPCODE_URB_WRITE_SIMD8;
2646 }
2647
2648 for (unsigned i = 0; i < iter_components; i++) {
2649 if (!(mask & (1 << (i + first_component))))
2650 continue;
2651
2652 if (!is_64bit) {
2653 srcs[header_regs + i + first_component] =
2654 offset(value, bld, BRW_GET_SWZ(swiz, i));
2655 } else {
2656 /* We need to shuffle the 64-bit data to match the layout
2657 * expected by our 32-bit URB write messages. We use a temporary
2658 * for that.
2659 */
2660 unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
2661 shuffle_64bit_data_for_32bit_write(bld,
2662 retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
2663 retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
2664 1);
2665
2666 /* Now copy the data to the destination */
2667 fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
2668 unsigned idx = 2 * i;
2669 bld.MOV(dest, offset(tmp, bld, idx));
2670 bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
2671 srcs[header_regs + idx + first_component * 2] = dest;
2672 srcs[header_regs + idx + 1 + first_component * 2] =
2673 offset(dest, bld, 1);
2674 }
2675 }
2676
2677 unsigned mlen =
2678 header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2679 (is_64bit ? 2 * first_component : first_component);
2680 fs_reg payload =
2681 bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2682 bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2683
2684 fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2685 inst->offset = imm_offset;
2686 inst->mlen = mlen;
2687
2688 /* If this is a 64-bit attribute, select the next two 64-bit channels
2689 * to be handled in the next iteration.
2690 */
2691 if (is_64bit) {
2692 mask >>= 2;
2693 imm_offset++;
2694 }
2695 }
2696 break;
2697 }
2698
2699 default:
2700 nir_emit_intrinsic(bld, instr);
2701 break;
2702 }
2703 }
2704
2705 void
2706 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2707 nir_intrinsic_instr *instr)
2708 {
2709 assert(stage == MESA_SHADER_TESS_EVAL);
2710 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2711
2712 fs_reg dest;
2713 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2714 dest = get_nir_dest(instr->dest);
2715
2716 switch (instr->intrinsic) {
2717 case nir_intrinsic_load_primitive_id:
2718 bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2719 break;
2720 case nir_intrinsic_load_tess_coord:
2721 /* gl_TessCoord is part of the payload in g1-3 */
2722 for (unsigned i = 0; i < 3; i++) {
2723 bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2724 }
2725 break;
2726
2727 case nir_intrinsic_load_input:
2728 case nir_intrinsic_load_per_vertex_input: {
2729 fs_reg indirect_offset = get_indirect_offset(instr);
2730 unsigned imm_offset = instr->const_index[0];
2731 unsigned first_component = nir_intrinsic_component(instr);
2732
2733 if (type_sz(dest.type) == 8) {
2734 first_component = first_component / 2;
2735 }
2736
2737 fs_inst *inst;
2738 if (indirect_offset.file == BAD_FILE) {
2739 /* Arbitrarily only push up to 32 vec4 slots worth of data,
2740 * which is 16 registers (since each holds 2 vec4 slots).
2741 */
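         /* Each ATTR register holds two vec4 slots: e.g. vec4 slot 3 lives in
          * the second half of ATTR register 1 (3 / 2), starting at 32-bit
          * component 4 (or 64-bit component 2), which is what the component()
          * math below selects.
          */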
2742 const unsigned max_push_slots = 32;
2743 if (imm_offset < max_push_slots) {
2744 fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
2745 for (int i = 0; i < instr->num_components; i++) {
2746 unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
2747 i + first_component;
2748 bld.MOV(offset(dest, bld, i), component(src, comp));
2749 }
2750 tes_prog_data->base.urb_read_length =
2751 MAX2(tes_prog_data->base.urb_read_length,
2752 DIV_ROUND_UP(imm_offset + 1, 2));
2753 } else {
2754 /* Replicate the patch handle to all enabled channels */
2755 const fs_reg srcs[] = {
2756 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
2757 };
2758 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2759 bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
2760
2761 if (first_component != 0) {
2762 unsigned read_components =
2763 instr->num_components + first_component;
2764 fs_reg tmp = bld.vgrf(dest.type, read_components);
2765 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2766 patch_handle);
2767 inst->size_written = read_components * REG_SIZE;
2768 for (unsigned i = 0; i < instr->num_components; i++) {
2769 bld.MOV(offset(dest, bld, i),
2770 offset(tmp, bld, i + first_component));
2771 }
2772 } else {
2773 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
2774 patch_handle);
2775 inst->size_written = instr->num_components * REG_SIZE;
2776 }
2777 inst->mlen = 1;
2778 inst->offset = imm_offset;
2779 }
2780 } else {
2781 /* Indirect indexing - use per-slot offsets as well. */
2782
2783 /* We can only read two double components with each URB read, so
2784 * we send two read messages in that case, each one loading up to
2785 * two double components.
2786 */
2787 unsigned num_iterations = 1;
2788 unsigned num_components = instr->num_components;
2789 fs_reg orig_dest = dest;
2790 if (type_sz(dest.type) == 8) {
2791 if (instr->num_components > 2) {
2792 num_iterations = 2;
2793 num_components = 2;
2794 }
2795 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
2796 dest = tmp;
2797 }
2798
2799 for (unsigned iter = 0; iter < num_iterations; iter++) {
2800 const fs_reg srcs[] = {
2801 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2802 indirect_offset
2803 };
2804 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2805 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2806
2807 if (first_component != 0) {
2808 unsigned read_components =
2809 num_components + first_component;
2810 fs_reg tmp = bld.vgrf(dest.type, read_components);
2811 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2812 payload);
2813 for (unsigned i = 0; i < num_components; i++) {
2814 bld.MOV(offset(dest, bld, i),
2815 offset(tmp, bld, i + first_component));
2816 }
2817 } else {
2818 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
2819 payload);
2820 }
2821 inst->mlen = 2;
2822 inst->offset = imm_offset;
2823 inst->size_written = (num_components + first_component) *
2824 inst->dst.component_size(inst->exec_size);
2825
2826           /* If we are reading 64-bit data using 32-bit read messages we need to
2827 * build proper 64-bit data elements by shuffling the low and high
2828 * 32-bit components around like we do for other things like UBOs
2829 * or SSBOs.
2830 */
2831 if (type_sz(dest.type) == 8) {
2832 shuffle_32bit_load_result_to_64bit_data(
2833 bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
2834
2835 for (unsigned c = 0; c < num_components; c++) {
2836 bld.MOV(offset(orig_dest, bld, iter * 2 + c),
2837 offset(dest, bld, c));
2838 }
2839 }
2840
2841           /* If we are loading double data and we need a second read message,
2842            * adjust the offset.
2843 */
2844 if (num_iterations > 1) {
2845 num_components = instr->num_components - 2;
2846 imm_offset++;
2847 }
2848 }
2849 }
2850 break;
2851 }
2852 default:
2853 nir_emit_intrinsic(bld, instr);
2854 break;
2855 }
2856 }
2857
2858 void
2859 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
2860 nir_intrinsic_instr *instr)
2861 {
2862 assert(stage == MESA_SHADER_GEOMETRY);
2863 fs_reg indirect_offset;
2864
2865 fs_reg dest;
2866 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2867 dest = get_nir_dest(instr->dest);
2868
2869 switch (instr->intrinsic) {
2870 case nir_intrinsic_load_primitive_id:
2871 assert(stage == MESA_SHADER_GEOMETRY);
2872 assert(brw_gs_prog_data(prog_data)->include_primitive_id);
2873 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
2874 retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
2875 break;
2876
2877 case nir_intrinsic_load_input:
2878 unreachable("load_input intrinsics are invalid for the GS stage");
2879
2880 case nir_intrinsic_load_per_vertex_input:
2881 emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
2882 instr->src[1], instr->num_components,
2883 nir_intrinsic_component(instr));
2884 break;
2885
2886 case nir_intrinsic_emit_vertex_with_counter:
2887 emit_gs_vertex(instr->src[0], instr->const_index[0]);
2888 break;
2889
2890 case nir_intrinsic_end_primitive_with_counter:
2891 emit_gs_end_primitive(instr->src[0]);
2892 break;
2893
2894 case nir_intrinsic_set_vertex_count:
2895 bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
2896 break;
2897
2898 case nir_intrinsic_load_invocation_id: {
2899 fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
2900 assert(val.file != BAD_FILE);
2901 dest.type = val.type;
2902 bld.MOV(dest, val);
2903 break;
2904 }
2905
2906 default:
2907 nir_emit_intrinsic(bld, instr);
2908 break;
2909 }
2910 }
2911
2912 /**
2913 * Fetch the current render target layer index.
2914 */
2915 static fs_reg
2916 fetch_render_target_array_index(const fs_builder &bld)
2917 {
2918 if (bld.shader->devinfo->gen >= 6) {
2919 /* The render target array index is provided in the thread payload as
2920 * bits 26:16 of r0.0.
2921 */
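      /* brw_uw1_reg(..., 0, 1) selects the high 16-bit word of r0.0, so
       * masking it with 0x7ff leaves exactly the 11-bit index in bits 26:16.
       */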
2922 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
2923 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
2924 brw_imm_uw(0x7ff));
2925 return idx;
2926 } else {
2927 /* Pre-SNB we only ever render into the first layer of the framebuffer
2928 * since layered rendering is not implemented.
2929 */
2930 return brw_imm_ud(0);
2931 }
2932 }
2933
2934 /**
2935 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
2936 * framebuffer at the current fragment coordinates and sample index.
2937 */
2938 fs_inst *
2939 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
2940 unsigned target)
2941 {
2942 const struct gen_device_info *devinfo = bld.shader->devinfo;
2943
2944 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
2945 const brw_wm_prog_key *wm_key =
2946 reinterpret_cast<const brw_wm_prog_key *>(key);
2947 assert(!wm_key->coherent_fb_fetch);
2948 const struct brw_wm_prog_data *wm_prog_data =
2949 brw_wm_prog_data(stage_prog_data);
2950
2951 /* Calculate the surface index relative to the start of the texture binding
2952 * table block, since that's what the texturing messages expect.
2953 */
2954 const unsigned surface = target +
2955 wm_prog_data->binding_table.render_target_read_start -
2956 wm_prog_data->base.binding_table.texture_start;
2957
2958 brw_mark_surface_used(
2959 bld.shader->stage_prog_data,
2960 wm_prog_data->binding_table.render_target_read_start + target);
2961
2962 /* Calculate the fragment coordinates. */
2963 const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
2964 bld.MOV(offset(coords, bld, 0), pixel_x);
2965 bld.MOV(offset(coords, bld, 1), pixel_y);
2966 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
2967
2968 /* Calculate the sample index and MCS payload when multisampling. Luckily
2969 * the MCS fetch message behaves deterministically for UMS surfaces, so it
2970 * shouldn't be necessary to recompile based on whether the framebuffer is
2971 * CMS or UMS.
2972 */
2973 if (wm_key->multisample_fbo &&
2974 nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
2975 nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
2976
2977 const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
2978 const fs_reg mcs = wm_key->multisample_fbo ?
2979 emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
2980
2981 /* Use either a normal or a CMS texel fetch message depending on whether
2982 * the framebuffer is single or multisample. On SKL+ use the wide CMS
2983     * message just in case the framebuffer uses 16x multisampling; it should
2984 * be equivalent to the normal CMS fetch for lower multisampling modes.
2985 */
2986 const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
2987 devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
2988 SHADER_OPCODE_TXF_CMS_LOGICAL;
2989
2990 /* Emit the instruction. */
2991 const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
2992 sample, mcs,
2993 brw_imm_ud(surface), brw_imm_ud(0),
2994 fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
2995 STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
2996
2997 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
2998 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
2999
3000 return inst;
3001 }
3002
3003 /**
3004 * Actual coherent framebuffer read implemented using the native render target
3005 * read message. Requires SKL+.
3006 */
3007 static fs_inst *
3008 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3009 {
3010 assert(bld.shader->devinfo->gen >= 9);
3011 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3012 inst->target = target;
3013 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3014
3015 return inst;
3016 }
3017
3018 static fs_reg
3019 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3020 {
3021 if (n && regs[0].file != BAD_FILE) {
3022 return regs[0];
3023
3024 } else {
3025 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3026
3027 for (unsigned i = 0; i < n; i++)
3028 regs[i] = tmp;
3029
3030 return tmp;
3031 }
3032 }
3033
3034 static fs_reg
3035 alloc_frag_output(fs_visitor *v, unsigned location)
3036 {
3037 assert(v->stage == MESA_SHADER_FRAGMENT);
3038 const brw_wm_prog_key *const key =
3039 reinterpret_cast<const brw_wm_prog_key *>(v->key);
3040 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3041 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3042
3043 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3044 return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3045
3046 else if (l == FRAG_RESULT_COLOR)
3047 return alloc_temporary(v->bld, 4, v->outputs,
3048 MAX2(key->nr_color_regions, 1));
3049
3050 else if (l == FRAG_RESULT_DEPTH)
3051 return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3052
3053 else if (l == FRAG_RESULT_STENCIL)
3054 return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3055
3056 else if (l == FRAG_RESULT_SAMPLE_MASK)
3057 return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3058
3059 else if (l >= FRAG_RESULT_DATA0 &&
3060 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3061 return alloc_temporary(v->bld, 4,
3062 &v->outputs[l - FRAG_RESULT_DATA0], 1);
3063
3064 else
3065 unreachable("Invalid location");
3066 }
3067
3068 void
3069 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3070 nir_intrinsic_instr *instr)
3071 {
3072 assert(stage == MESA_SHADER_FRAGMENT);
3073
3074 fs_reg dest;
3075 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3076 dest = get_nir_dest(instr->dest);
3077
3078 switch (instr->intrinsic) {
3079 case nir_intrinsic_load_front_face:
3080 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3081 *emit_frontfacing_interpolation());
3082 break;
3083
3084 case nir_intrinsic_load_sample_pos: {
3085 fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3086 assert(sample_pos.file != BAD_FILE);
3087 dest.type = sample_pos.type;
3088 bld.MOV(dest, sample_pos);
3089 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3090 break;
3091 }
3092
3093 case nir_intrinsic_load_layer_id:
3094 dest.type = BRW_REGISTER_TYPE_UD;
3095 bld.MOV(dest, fetch_render_target_array_index(bld));
3096 break;
3097
3098 case nir_intrinsic_load_helper_invocation:
3099 case nir_intrinsic_load_sample_mask_in:
3100 case nir_intrinsic_load_sample_id: {
3101 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3102 fs_reg val = nir_system_values[sv];
3103 assert(val.file != BAD_FILE);
3104 dest.type = val.type;
3105 bld.MOV(dest, val);
3106 break;
3107 }
3108
3109 case nir_intrinsic_store_output: {
3110 const fs_reg src = get_nir_src(instr->src[0]);
3111 const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3112 assert(const_offset && "Indirect output stores not allowed");
3113 const unsigned location = nir_intrinsic_base(instr) +
3114 SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
3115 const fs_reg new_dest = retype(alloc_frag_output(this, location),
3116 src.type);
3117
3118 for (unsigned j = 0; j < instr->num_components; j++)
3119 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3120 offset(src, bld, j));
3121
3122 break;
3123 }
3124
3125 case nir_intrinsic_load_output: {
3126 const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3127 BRW_NIR_FRAG_OUTPUT_LOCATION);
3128 assert(l >= FRAG_RESULT_DATA0);
3129 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3130 assert(const_offset && "Indirect output loads not allowed");
3131 const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
3132 const fs_reg tmp = bld.vgrf(dest.type, 4);
3133
3134 if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3135 emit_coherent_fb_read(bld, tmp, target);
3136 else
3137 emit_non_coherent_fb_read(bld, tmp, target);
3138
3139 for (unsigned j = 0; j < instr->num_components; j++) {
3140 bld.MOV(offset(dest, bld, j),
3141 offset(tmp, bld, nir_intrinsic_component(instr) + j));
3142 }
3143
3144 break;
3145 }
3146
3147 case nir_intrinsic_discard:
3148 case nir_intrinsic_discard_if: {
3149 /* We track our discarded pixels in f0.1. By predicating on it, we can
3150 * update just the flag bits that aren't yet discarded. If there's no
3151 * condition, we emit a CMP of g0 != g0, so all currently executing
3152 * channels will get turned off.
3153 */
3154 fs_inst *cmp;
3155 if (instr->intrinsic == nir_intrinsic_discard_if) {
3156 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3157 brw_imm_d(0), BRW_CONDITIONAL_Z);
3158 } else {
3159 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3160 BRW_REGISTER_TYPE_UW));
3161 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3162 }
3163 cmp->predicate = BRW_PREDICATE_NORMAL;
3164 cmp->flag_subreg = 1;
3165
3166 if (devinfo->gen >= 6) {
3167 emit_discard_jump();
3168 }
3169 break;
3170 }
3171
3172 case nir_intrinsic_load_input: {
3173 /* load_input is only used for flat inputs */
3174 unsigned base = nir_intrinsic_base(instr);
3175 unsigned component = nir_intrinsic_component(instr);
3176 unsigned num_components = instr->num_components;
3177 enum brw_reg_type type = dest.type;
3178
3179 /* Special case fields in the VUE header */
3180 if (base == VARYING_SLOT_LAYER)
3181 component = 1;
3182 else if (base == VARYING_SLOT_VIEWPORT)
3183 component = 2;
3184
3185 if (nir_dest_bit_size(instr->dest) == 64) {
3186          /* const_index is in 32-bit type size units, which cannot be aligned
3187           * with DF. We need to read the double vector as if it were a float
3188 * vector of twice the number of components to fetch the right data.
3189 */
3190 type = BRW_REGISTER_TYPE_F;
3191 num_components *= 2;
3192 }
3193
3194 for (unsigned int i = 0; i < num_components; i++) {
3195 struct brw_reg interp = interp_reg(base, component + i);
3196 interp = suboffset(interp, 3);
3197 bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
3198 retype(fs_reg(interp), type));
3199 }
3200
3201 if (nir_dest_bit_size(instr->dest) == 64) {
3202 shuffle_32bit_load_result_to_64bit_data(bld,
3203 dest,
3204 retype(dest, type),
3205 instr->num_components);
3206 }
3207 break;
3208 }
3209
3210 case nir_intrinsic_load_barycentric_pixel:
3211 case nir_intrinsic_load_barycentric_centroid:
3212 case nir_intrinsic_load_barycentric_sample:
3213 /* Do nothing - load_interpolated_input handling will handle it later. */
3214 break;
3215
3216 case nir_intrinsic_load_barycentric_at_sample: {
3217 const glsl_interp_mode interpolation =
3218 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3219
3220 nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
3221
3222 if (const_sample) {
3223 unsigned msg_data = const_sample->i32[0] << 4;
3224
3225 emit_pixel_interpolater_send(bld,
3226 FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3227 dest,
3228 fs_reg(), /* src */
3229 brw_imm_ud(msg_data),
3230 interpolation);
3231 } else {
3232 const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3233 BRW_REGISTER_TYPE_UD);
3234
3235 if (nir_src_is_dynamically_uniform(instr->src[0])) {
3236 const fs_reg sample_id = bld.emit_uniformize(sample_src);
3237 const fs_reg msg_data = vgrf(glsl_type::uint_type);
3238 bld.exec_all().group(1, 0)
3239 .SHL(msg_data, sample_id, brw_imm_ud(4u));
3240 emit_pixel_interpolater_send(bld,
3241 FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3242 dest,
3243 fs_reg(), /* src */
3244 msg_data,
3245 interpolation);
3246 } else {
3247 /* Make a loop that sends a message to the pixel interpolater
3248 * for the sample number in each live channel. If there are
3249 * multiple channels with the same sample number then these
3250              * will be handled simultaneously with a single iteration of
3251 * the loop.
3252 */
3253 bld.emit(BRW_OPCODE_DO);
3254
3255 /* Get the next live sample number into sample_id_reg */
3256 const fs_reg sample_id = bld.emit_uniformize(sample_src);
3257
3258 /* Set the flag register so that we can perform the send
3259 * message on all channels that have the same sample number
3260 */
3261 bld.CMP(bld.null_reg_ud(),
3262 sample_src, sample_id,
3263 BRW_CONDITIONAL_EQ);
3264 const fs_reg msg_data = vgrf(glsl_type::uint_type);
3265 bld.exec_all().group(1, 0)
3266 .SHL(msg_data, sample_id, brw_imm_ud(4u));
3267 fs_inst *inst =
3268 emit_pixel_interpolater_send(bld,
3269 FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3270 dest,
3271 fs_reg(), /* src */
3272 msg_data,
3273 interpolation);
3274 set_predicate(BRW_PREDICATE_NORMAL, inst);
3275
3276 /* Continue the loop if there are any live channels left */
3277 set_predicate_inv(BRW_PREDICATE_NORMAL,
3278 true, /* inverse */
3279 bld.emit(BRW_OPCODE_WHILE));
3280 }
3281 }
3282 break;
3283 }
3284
3285 case nir_intrinsic_load_barycentric_at_offset: {
3286 const glsl_interp_mode interpolation =
3287 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3288
3289 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3290
3291 if (const_offset) {
3292 unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
3293 unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
3294
3295 emit_pixel_interpolater_send(bld,
3296 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3297 dest,
3298 fs_reg(), /* src */
3299 brw_imm_ud(off_x | (off_y << 4)),
3300 interpolation);
3301 } else {
3302 fs_reg src = vgrf(glsl_type::ivec2_type);
3303 fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3304 BRW_REGISTER_TYPE_F);
3305 for (int i = 0; i < 2; i++) {
3306 fs_reg temp = vgrf(glsl_type::float_type);
3307 bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3308 fs_reg itemp = vgrf(glsl_type::int_type);
3309 /* float to int */
3310 bld.MOV(itemp, temp);
3311
3312 /* Clamp the upper end of the range to +7/16.
3313 * ARB_gpu_shader5 requires that we support a maximum offset
3314 * of +0.5, which isn't representable in a S0.4 value -- if
3315 * we didn't clamp it, we'd end up with -8/16, which is the
3316 * opposite of what the shader author wanted.
3317 *
3318 * This is legal due to ARB_gpu_shader5's quantization
3319 * rules:
3320 *
3321 * "Not all values of <offset> may be supported; x and y
3322 * offsets may be rounded to fixed-point values with the
3323 * number of fraction bits given by the
3324 * implementation-dependent constant
3325 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3326 */
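            /* For example, an offset of +0.5 scales to 8, which the SEL below
             * clamps to +7 (i.e. +7/16); negative offsets such as -0.5 (-8)
             * are already representable and pass through unchanged.
             */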
3327 set_condmod(BRW_CONDITIONAL_L,
3328 bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3329 }
3330
3331 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3332 emit_pixel_interpolater_send(bld,
3333 opcode,
3334 dest,
3335 src,
3336 brw_imm_ud(0u),
3337 interpolation);
3338 }
3339 break;
3340 }
3341
3342 case nir_intrinsic_load_interpolated_input: {
3343 if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3344 emit_fragcoord_interpolation(dest);
3345 break;
3346 }
3347
3348 assert(instr->src[0].ssa &&
3349 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3350 nir_intrinsic_instr *bary_intrinsic =
3351 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3352 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3353 enum glsl_interp_mode interp_mode =
3354 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3355 fs_reg dst_xy;
3356
3357 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3358 bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3359 /* Use the result of the PI message */
3360 dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3361 } else {
3362 /* Use the delta_xy values computed from the payload */
3363 enum brw_barycentric_mode bary =
3364 brw_barycentric_mode(interp_mode, bary_intrin);
3365
3366 dst_xy = this->delta_xy[bary];
3367 }
3368
3369 for (unsigned int i = 0; i < instr->num_components; i++) {
3370 fs_reg interp =
3371 fs_reg(interp_reg(nir_intrinsic_base(instr),
3372 nir_intrinsic_component(instr) + i));
3373 interp.type = BRW_REGISTER_TYPE_F;
3374 dest.type = BRW_REGISTER_TYPE_F;
3375
3376 if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3377 fs_reg tmp = vgrf(glsl_type::float_type);
3378 bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3379 bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3380 } else {
3381 bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3382 }
3383 }
3384 break;
3385 }
3386
3387 default:
3388 nir_emit_intrinsic(bld, instr);
3389 break;
3390 }
3391 }
3392
3393 void
3394 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3395 nir_intrinsic_instr *instr)
3396 {
3397 assert(stage == MESA_SHADER_COMPUTE);
3398 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3399
3400 fs_reg dest;
3401 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3402 dest = get_nir_dest(instr->dest);
3403
3404 switch (instr->intrinsic) {
3405 case nir_intrinsic_barrier:
3406 emit_barrier();
3407 cs_prog_data->uses_barrier = true;
3408 break;
3409
3410 case nir_intrinsic_load_local_invocation_id:
3411 case nir_intrinsic_load_work_group_id: {
3412 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3413 fs_reg val = nir_system_values[sv];
3414 assert(val.file != BAD_FILE);
3415 dest.type = val.type;
3416 for (unsigned i = 0; i < 3; i++)
3417 bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3418 break;
3419 }
3420
3421 case nir_intrinsic_load_num_work_groups: {
3422 const unsigned surface =
3423 cs_prog_data->binding_table.work_groups_start;
3424
3425 cs_prog_data->uses_num_work_groups = true;
3426
3427 fs_reg surf_index = brw_imm_ud(surface);
3428 brw_mark_surface_used(prog_data, surface);
3429
3430 /* Read the 3 GLuint components of gl_NumWorkGroups */
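      /* Each component is a dword, so i << 2 yields the byte offsets 0, 4
       * and 8 within the gl_NumWorkGroups buffer surface.
       */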
3431 for (unsigned i = 0; i < 3; i++) {
3432 fs_reg read_result =
3433 emit_untyped_read(bld, surf_index,
3434 brw_imm_ud(i << 2),
3435 1 /* dims */, 1 /* size */,
3436 BRW_PREDICATE_NONE);
3437 read_result.type = dest.type;
3438 bld.MOV(dest, read_result);
3439 dest = offset(dest, bld, 1);
3440 }
3441 break;
3442 }
3443
3444 case nir_intrinsic_shared_atomic_add:
3445 nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr);
3446 break;
3447 case nir_intrinsic_shared_atomic_imin:
3448 nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3449 break;
3450 case nir_intrinsic_shared_atomic_umin:
3451 nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3452 break;
3453 case nir_intrinsic_shared_atomic_imax:
3454 nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3455 break;
3456 case nir_intrinsic_shared_atomic_umax:
3457 nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3458 break;
3459 case nir_intrinsic_shared_atomic_and:
3460 nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3461 break;
3462 case nir_intrinsic_shared_atomic_or:
3463 nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3464 break;
3465 case nir_intrinsic_shared_atomic_xor:
3466 nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3467 break;
3468 case nir_intrinsic_shared_atomic_exchange:
3469 nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3470 break;
3471 case nir_intrinsic_shared_atomic_comp_swap:
3472 nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3473 break;
3474
3475 case nir_intrinsic_load_shared: {
3476 assert(devinfo->gen >= 7);
3477
3478 fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3479
3480 /* Get the offset to read from */
3481 fs_reg offset_reg;
3482 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3483 if (const_offset) {
3484 offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
3485 } else {
3486 offset_reg = vgrf(glsl_type::uint_type);
3487 bld.ADD(offset_reg,
3488 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
3489 brw_imm_ud(instr->const_index[0]));
3490 }
3491
3492 /* Read the vector */
3493 do_untyped_vector_read(bld, dest, surf_index, offset_reg,
3494 instr->num_components);
3495 break;
3496 }
3497
3498 case nir_intrinsic_store_shared: {
3499 assert(devinfo->gen >= 7);
3500
3501 /* Block index */
3502 fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3503
3504 /* Value */
3505 fs_reg val_reg = get_nir_src(instr->src[0]);
3506
3507 /* Writemask */
3508 unsigned writemask = instr->const_index[1];
3509
3510       /* get_nir_src() retypes to integer. Be wary of 64-bit types, though,
3511        * since the untyped writes below operate in 32-bit units, which
3512        * means that we need to write twice as many components each time.
3513        * Also, we have to shuffle 64-bit data into the layout expected by
3514        * our 32-bit write messages.
3515 */
3516 unsigned type_size = 4;
3517 unsigned bit_size = instr->src[0].is_ssa ?
3518 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
3519 if (bit_size == 64) {
3520 type_size = 8;
3521 fs_reg tmp =
3522 fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
3523 shuffle_64bit_data_for_32bit_write(
3524 bld,
3525 retype(tmp, BRW_REGISTER_TYPE_F),
3526 retype(val_reg, BRW_REGISTER_TYPE_DF),
3527 instr->num_components);
3528 val_reg = tmp;
3529 }
3530
3531 unsigned type_slots = type_size / 4;
3532
3533 /* Combine groups of consecutive enabled channels in one write
3534 * message. We use ffs to find the first enabled channel and then ffs on
3535 * the bit-inverse, down-shifted writemask to determine the length of
3536 * the block of enabled bits.
3537 */
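      /* For example, a writemask of 0b1011 is handled in two iterations:
       * first_component = 0 with length = 2 (components x and y), then
       * first_component = 3 with length = 1 (component w).
       */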
3538 while (writemask) {
3539 unsigned first_component = ffs(writemask) - 1;
3540 unsigned length = ffs(~(writemask >> first_component)) - 1;
3541
3542 /* We can't write more than 2 64-bit components at once. Limit the
3543 * length of the write to what we can do and let the next iteration
3544 * handle the rest
3545 */
3546 if (type_size > 4)
3547 length = MIN2(2, length);
3548
3549 fs_reg offset_reg;
3550 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3551 if (const_offset) {
3552 offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
3553 type_size * first_component);
3554 } else {
3555 offset_reg = vgrf(glsl_type::uint_type);
3556 bld.ADD(offset_reg,
3557 retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
3558 brw_imm_ud(instr->const_index[0] + type_size * first_component));
3559 }
3560
3561 emit_untyped_write(bld, surf_index, offset_reg,
3562 offset(val_reg, bld, first_component * type_slots),
3563 1 /* dims */, length * type_slots,
3564 BRW_PREDICATE_NONE);
3565
3566 /* Clear the bits in the writemask that we just wrote, then try
3567 * again to see if more channels are left.
3568 */
3569 writemask &= (15 << (first_component + length));
3570 }
3571
3572 break;
3573 }
3574
3575 default:
3576 nir_emit_intrinsic(bld, instr);
3577 break;
3578 }
3579 }
3580
3581 void
3582 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3583 {
3584 fs_reg dest;
3585 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3586 dest = get_nir_dest(instr->dest);
3587
3588 switch (instr->intrinsic) {
3589 case nir_intrinsic_atomic_counter_inc:
3590 case nir_intrinsic_atomic_counter_dec:
3591 case nir_intrinsic_atomic_counter_read:
3592 case nir_intrinsic_atomic_counter_add:
3593 case nir_intrinsic_atomic_counter_min:
3594 case nir_intrinsic_atomic_counter_max:
3595 case nir_intrinsic_atomic_counter_and:
3596 case nir_intrinsic_atomic_counter_or:
3597 case nir_intrinsic_atomic_counter_xor:
3598 case nir_intrinsic_atomic_counter_exchange:
3599 case nir_intrinsic_atomic_counter_comp_swap: {
3600 if (stage == MESA_SHADER_FRAGMENT &&
3601 instr->intrinsic != nir_intrinsic_atomic_counter_read)
3602 brw_wm_prog_data(prog_data)->has_side_effects = true;
3603
3604     /* Get some metadata from the atomic intrinsic. */
3605 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3606
3607 /* Get the arguments of the atomic intrinsic. */
3608 const fs_reg offset = get_nir_src(instr->src[0]);
3609 const unsigned surface = (stage_prog_data->binding_table.abo_start +
3610 instr->const_index[0]);
3611 const fs_reg src0 = (info->num_srcs >= 2
3612 ? get_nir_src(instr->src[1]) : fs_reg());
3613 const fs_reg src1 = (info->num_srcs >= 3
3614 ? get_nir_src(instr->src[2]) : fs_reg());
3615 fs_reg tmp;
3616
3617 assert(info->num_srcs <= 3);
3618
3619 /* Emit a surface read or atomic op. */
3620 if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
3621 tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
3622 } else {
3623 tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
3624 src1, 1, 1,
3625 get_atomic_counter_op(instr->intrinsic));
3626 }
3627
3628 /* Assign the result. */
3629 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
3630
3631 /* Mark the surface as used. */
3632 brw_mark_surface_used(stage_prog_data, surface);
3633 break;
3634 }
3635
3636 case nir_intrinsic_image_load:
3637 case nir_intrinsic_image_store:
3638 case nir_intrinsic_image_atomic_add:
3639 case nir_intrinsic_image_atomic_min:
3640 case nir_intrinsic_image_atomic_max:
3641 case nir_intrinsic_image_atomic_and:
3642 case nir_intrinsic_image_atomic_or:
3643 case nir_intrinsic_image_atomic_xor:
3644 case nir_intrinsic_image_atomic_exchange:
3645 case nir_intrinsic_image_atomic_comp_swap: {
3646 using namespace image_access;
3647
3648 if (stage == MESA_SHADER_FRAGMENT &&
3649 instr->intrinsic != nir_intrinsic_image_load)
3650 brw_wm_prog_data(prog_data)->has_side_effects = true;
3651
3652 /* Get the referenced image variable and type. */
3653 const nir_variable *var = instr->variables[0]->var;
3654 const glsl_type *type = var->type->without_array();
3655 const brw_reg_type base_type = get_image_base_type(type);
3656
3657 /* Get some metadata from the image intrinsic. */
3658 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3659 const unsigned arr_dims = type->sampler_array ? 1 : 0;
3660 const unsigned surf_dims = type->coordinate_components() - arr_dims;
3661 const unsigned format = var->data.image.format;
3662
3663 /* Get the arguments of the image intrinsic. */
3664 const fs_reg image = get_nir_image_deref(instr->variables[0]);
3665 const fs_reg addr = retype(get_nir_src(instr->src[0]),
3666 BRW_REGISTER_TYPE_UD);
3667 const fs_reg src0 = (info->num_srcs >= 3 ?
3668 retype(get_nir_src(instr->src[2]), base_type) :
3669 fs_reg());
3670 const fs_reg src1 = (info->num_srcs >= 4 ?
3671 retype(get_nir_src(instr->src[3]), base_type) :
3672 fs_reg());
3673 fs_reg tmp;
3674
3675 /* Emit an image load, store or atomic op. */
3676 if (instr->intrinsic == nir_intrinsic_image_load)
3677 tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
3678
3679 else if (instr->intrinsic == nir_intrinsic_image_store)
3680 emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
3681 var->data.image.write_only ? GL_NONE : format);
3682
3683 else
3684 tmp = emit_image_atomic(bld, image, addr, src0, src1,
3685 surf_dims, arr_dims, info->dest_components,
3686 get_image_atomic_op(instr->intrinsic, type));
3687
3688 /* Assign the result. */
3689 for (unsigned c = 0; c < info->dest_components; ++c)
3690 bld.MOV(offset(retype(dest, base_type), bld, c),
3691 offset(tmp, bld, c));
3692 break;
3693 }
3694
3695 case nir_intrinsic_memory_barrier_atomic_counter:
3696 case nir_intrinsic_memory_barrier_buffer:
3697 case nir_intrinsic_memory_barrier_image:
3698 case nir_intrinsic_memory_barrier: {
3699 const fs_builder ubld = bld.group(8, 0);
3700 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3701 ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3702 ->size_written = 2 * REG_SIZE;
3703 break;
3704 }
3705
3706 case nir_intrinsic_group_memory_barrier:
3707 case nir_intrinsic_memory_barrier_shared:
3708 /* We treat these workgroup-level barriers as no-ops. This should be
3709 * safe at present and as long as:
3710 *
3711 * - Memory access instructions are not subsequently reordered by the
3712 * compiler back-end.
3713 *
3714 * - All threads from a given compute shader workgroup fit within a
3715 * single subslice and therefore talk to the same HDC shared unit
3716     *   which supposedly guarantees ordering and coherency between threads
3717 * from the same workgroup. This may change in the future when we
3718 * start splitting workgroups across multiple subslices.
3719 *
3720 * - The context is not in fault-and-stream mode, which could cause
3721 * memory transactions (including to SLM) prior to the barrier to be
3722 * replayed after the barrier if a pagefault occurs. This shouldn't
3723 * be a problem up to and including SKL because fault-and-stream is
3724 * not usable due to hardware issues, but that's likely to change in
3725 * the future.
3726 */
3727 break;
3728
3729 case nir_intrinsic_shader_clock: {
3730 /* We cannot do anything if there is an event, so ignore it for now */
3731 const fs_reg shader_clock = get_timestamp(bld);
3732 const fs_reg srcs[] = { component(shader_clock, 0),
3733 component(shader_clock, 1) };
3734 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3735 break;
3736 }
3737
3738 case nir_intrinsic_image_size: {
3739 /* Get the referenced image variable and type. */
3740 const nir_variable *var = instr->variables[0]->var;
3741 const glsl_type *type = var->type->without_array();
3742
3743 /* Get the size of the image. */
3744 const fs_reg image = get_nir_image_deref(instr->variables[0]);
3745 const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
3746
3747 /* For 1DArray image types, the array index is stored in the Z component.
3748 * Fix this by swizzling the Z component to the Y component.
3749 */
3750 const bool is_1d_array_image =
3751 type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
3752 type->sampler_array;
3753
3754 /* For CubeArray images, we should count the number of cubes instead
3755      * of the number of faces. Fix it by dividing the Z component by 6.
3756 */
3757 const bool is_cube_array_image =
3758 type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
3759 type->sampler_array;
3760
3761 /* Copy all the components. */
3762 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3763 for (unsigned c = 0; c < info->dest_components; ++c) {
3764 if ((int)c >= type->coordinate_components()) {
3765 bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3766 brw_imm_d(1));
3767 } else if (c == 1 && is_1d_array_image) {
3768 bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3769 offset(size, bld, 2));
3770 } else if (c == 2 && is_cube_array_image) {
3771 bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3772 offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3773 offset(size, bld, c), brw_imm_d(6));
3774 } else {
3775 bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3776 offset(size, bld, c));
3777 }
3778 }
3779
3780 break;
3781 }
3782
3783 case nir_intrinsic_image_samples:
3784 /* The driver does not support multi-sampled images. */
3785 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
3786 break;
3787
3788 case nir_intrinsic_load_uniform: {
3789 /* Offsets are in bytes but they should always be multiples of 4 */
3790 assert(instr->const_index[0] % 4 == 0);
3791
3792 fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
3793
3794 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3795 if (const_offset) {
3796 /* Offsets are in bytes but they should always be multiples of 4 */
3797 assert(const_offset->u32[0] % 4 == 0);
3798 src.offset = const_offset->u32[0];
3799
3800 for (unsigned j = 0; j < instr->num_components; j++) {
3801 bld.MOV(offset(dest, bld, j), offset(src, bld, j));
3802 }
3803 } else {
3804 fs_reg indirect = retype(get_nir_src(instr->src[0]),
3805 BRW_REGISTER_TYPE_UD);
3806
3807 /* We need to pass a size to the MOV_INDIRECT but we don't want it to
3808 * go past the end of the uniform. In order to keep the n'th
3809 * component from running past, we subtract off the size of all but
3810 * one component of the vector.
3811 */
3812 assert(instr->const_index[1] >=
3813 instr->num_components * (int) type_sz(dest.type));
3814 unsigned read_size = instr->const_index[1] -
3815 (instr->num_components - 1) * type_sz(dest.type);
3816
3817 bool supports_64bit_indirects =
3818 !devinfo->is_cherryview && !devinfo->is_broxton;
3819
3820 if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
3821 for (unsigned j = 0; j < instr->num_components; j++) {
3822 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3823 offset(dest, bld, j), offset(src, bld, j),
3824 indirect, brw_imm_ud(read_size));
3825 }
3826 } else {
3827 const unsigned num_mov_indirects =
3828 type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
3829           /* Each MOV_INDIRECT now reads 32 bits instead of 64, so we read
3830            * a little less per instruction. Adjust read_size accordingly.
3831            */
3832 const unsigned read_size_32bit = read_size -
3833 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
3834 for (unsigned j = 0; j < instr->num_components; j++) {
3835 for (unsigned i = 0; i < num_mov_indirects; i++) {
3836 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3837 subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
3838 subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
3839 indirect, brw_imm_ud(read_size_32bit));
3840 }
3841 }
3842 }
3843 }
3844 break;
3845 }
3846
3847 case nir_intrinsic_load_ubo: {
3848 nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
3849 fs_reg surf_index;
3850
3851 if (const_index) {
3852 const unsigned index = stage_prog_data->binding_table.ubo_start +
3853 const_index->u32[0];
3854 surf_index = brw_imm_ud(index);
3855 brw_mark_surface_used(prog_data, index);
3856 } else {
3857 /* The block index is not a constant. Evaluate the index expression
3858 * per-channel and add the base UBO index; we have to select a value
3859 * from any live channel.
3860 */
3861 surf_index = vgrf(glsl_type::uint_type);
3862 bld.ADD(surf_index, get_nir_src(instr->src[0]),
3863 brw_imm_ud(stage_prog_data->binding_table.ubo_start));
3864 surf_index = bld.emit_uniformize(surf_index);
3865
3866 /* Assume this may touch any UBO. It would be nice to provide
3867 * a tighter bound, but the array information is already lowered away.
3868 */
3869 brw_mark_surface_used(prog_data,
3870 stage_prog_data->binding_table.ubo_start +
3871 nir->info->num_ubos - 1);
3872 }
3873
3874 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3875 if (const_offset == NULL) {
3876 fs_reg base_offset = retype(get_nir_src(instr->src[1]),
3877 BRW_REGISTER_TYPE_UD);
3878
3879 for (int i = 0; i < instr->num_components; i++)
3880 VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
3881 base_offset, i * type_sz(dest.type));
3882 } else {
3883 /* Even if we are loading doubles, a pull constant load will load
3884        * a 32-bit vec4, so we should only reserve VGRF space for that. If we
3885 * need to load a full dvec4 we will have to emit 2 loads. This is
3886 * similar to demote_pull_constants(), except that in that case we
3887 * see individual accesses to each component of the vector and then
3888 * we let CSE deal with duplicate loads. Here we see a vector access
3889 * and we have to split it if necessary.
3890 */
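       /* For example, a dvec4 load at a constant byte offset of 48 is split
        * in two: components 0-1 come from the first 64-byte block and
        * components 2-3 from the following one.
        */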
3891 const unsigned type_size = type_sz(dest.type);
3892 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
3893 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
3894 const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3895
3896 for (unsigned c = 0; c < instr->num_components;) {
3897 const unsigned base = const_offset->u32[0] + c * type_size;
3898 /* Number of usable components in the next block-aligned load. */
3899 const unsigned count = MIN2(instr->num_components - c,
3900 (block_sz - base % block_sz) / type_size);
3901
3902 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
3903 packed_consts, surf_index,
3904 brw_imm_ud(base & ~(block_sz - 1)));
3905
3906 const fs_reg consts =
3907 retype(byte_offset(packed_consts, base & (block_sz - 1)),
3908 dest.type);
3909
3910 for (unsigned d = 0; d < count; d++)
3911 bld.MOV(offset(dest, bld, c + d), component(consts, d));
3912
3913 c += count;
3914 }
3915 }
3916 break;
3917 }
3918
3919 case nir_intrinsic_load_ssbo: {
3920 assert(devinfo->gen >= 7);
3921
3922 nir_const_value *const_uniform_block =
3923 nir_src_as_const_value(instr->src[0]);
3924
3925 fs_reg surf_index;
3926 if (const_uniform_block) {
3927 unsigned index = stage_prog_data->binding_table.ssbo_start +
3928 const_uniform_block->u32[0];
3929 surf_index = brw_imm_ud(index);
3930 brw_mark_surface_used(prog_data, index);
3931 } else {
3932 surf_index = vgrf(glsl_type::uint_type);
3933 bld.ADD(surf_index, get_nir_src(instr->src[0]),
3934 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3935
3936       /* Assume this may touch any SSBO. It would be nice to provide
3937 * a tighter bound, but the array information is already lowered away.
3938 */
3939 brw_mark_surface_used(prog_data,
3940 stage_prog_data->binding_table.ssbo_start +
3941 nir->info->num_ssbos - 1);
3942 }
3943
3944 fs_reg offset_reg;
3945 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3946 if (const_offset) {
3947 offset_reg = brw_imm_ud(const_offset->u32[0]);
3948 } else {
3949 offset_reg = get_nir_src(instr->src[1]);
3950 }
3951
3952 /* Read the vector */
3953 do_untyped_vector_read(bld, dest, surf_index, offset_reg,
3954 instr->num_components);
3955
3956 break;
3957 }
3958
3959 case nir_intrinsic_store_ssbo: {
3960 assert(devinfo->gen >= 7);
3961
3962 if (stage == MESA_SHADER_FRAGMENT)
3963 brw_wm_prog_data(prog_data)->has_side_effects = true;
3964
3965 /* Block index */
3966 fs_reg surf_index;
3967 nir_const_value *const_uniform_block =
3968 nir_src_as_const_value(instr->src[1]);
3969 if (const_uniform_block) {
3970 unsigned index = stage_prog_data->binding_table.ssbo_start +
3971 const_uniform_block->u32[0];
3972 surf_index = brw_imm_ud(index);
3973 brw_mark_surface_used(prog_data, index);
3974 } else {
3975 surf_index = vgrf(glsl_type::uint_type);
3976 bld.ADD(surf_index, get_nir_src(instr->src[1]),
3977 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3978
3979 brw_mark_surface_used(prog_data,
3980 stage_prog_data->binding_table.ssbo_start +
3981 nir->info->num_ssbos - 1);
3982 }
3983
3984 /* Value */
3985 fs_reg val_reg = get_nir_src(instr->src[0]);
3986
3987 /* Writemask */
3988 unsigned writemask = instr->const_index[0];
3989
3990     /* get_nir_src() retypes to integer. Be wary of 64-bit types, though,
3991      * since the untyped writes below operate in 32-bit units, which
3992      * means that we need to write twice as many components each time.
3993      * Also, we have to shuffle 64-bit data into the layout expected by
3994      * our 32-bit write messages.
3995 */
3996 unsigned type_size = 4;
3997 unsigned bit_size = instr->src[0].is_ssa ?
3998 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
3999 if (bit_size == 64) {
4000 type_size = 8;
4001 fs_reg tmp =
4002 fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
4003 shuffle_64bit_data_for_32bit_write(bld,
4004 retype(tmp, BRW_REGISTER_TYPE_F),
4005 retype(val_reg, BRW_REGISTER_TYPE_DF),
4006 instr->num_components);
4007 val_reg = tmp;
4008 }
4009
4010 unsigned type_slots = type_size / 4;
4011
4012 /* Combine groups of consecutive enabled channels in one write
4013 * message. We use ffs to find the first enabled channel and then ffs on
4014 * the bit-inverse, down-shifted writemask to determine the length of
4015 * the block of enabled bits.
4016 */
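    /* In the 64-bit case the writemask counts 64-bit components, so e.g. a
     * fully enabled dvec3 (writemask 0b111) is emitted as one 4-dword write
     * followed by one 2-dword write because of the 2-component limit below.
     */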
4017 while (writemask) {
4018 unsigned first_component = ffs(writemask) - 1;
4019 unsigned length = ffs(~(writemask >> first_component)) - 1;
4020
4021 /* We can't write more than 2 64-bit components at once. Limit the
4022 * length of the write to what we can do and let the next iteration
4023 * handle the rest
4024 */
4025 if (type_size > 4)
4026 length = MIN2(2, length);
4027
4028 fs_reg offset_reg;
4029 nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
4030 if (const_offset) {
4031 offset_reg = brw_imm_ud(const_offset->u32[0] +
4032 type_size * first_component);
4033 } else {
4034 offset_reg = vgrf(glsl_type::uint_type);
4035 bld.ADD(offset_reg,
4036 retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
4037 brw_imm_ud(type_size * first_component));
4038 }
4039
4040
4041 emit_untyped_write(bld, surf_index, offset_reg,
4042 offset(val_reg, bld, first_component * type_slots),
4043 1 /* dims */, length * type_slots,
4044 BRW_PREDICATE_NONE);
4045
4046 /* Clear the bits in the writemask that we just wrote, then try
4047 * again to see if more channels are left.
4048 */
4049 writemask &= (15 << (first_component + length));
4050 }
4051 break;
4052 }
4053
4054 case nir_intrinsic_store_output: {
4055 fs_reg src = get_nir_src(instr->src[0]);
4056
4057 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4058 assert(const_offset && "Indirect output stores not allowed");
4059 fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4060 4 * const_offset->u32[0]), src.type);
4061
4062 unsigned num_components = instr->num_components;
4063 unsigned first_component = nir_intrinsic_component(instr);
4064 unsigned bit_size = instr->src[0].is_ssa ?
4065 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
4066 if (bit_size == 64) {
4067 fs_reg tmp =
4068 fs_reg(VGRF, alloc.allocate(2 * num_components),
4069 BRW_REGISTER_TYPE_F);
4070 shuffle_64bit_data_for_32bit_write(
4071 bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
4072 src = retype(tmp, src.type);
4073 num_components *= 2;
4074 }
4075
4076 for (unsigned j = 0; j < num_components; j++) {
4077 bld.MOV(offset(new_dest, bld, j + first_component),
4078 offset(src, bld, j));
4079 }
4080 break;
4081 }
4082
4083 case nir_intrinsic_ssbo_atomic_add:
4084 nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
4085 break;
4086 case nir_intrinsic_ssbo_atomic_imin:
4087 nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4088 break;
4089 case nir_intrinsic_ssbo_atomic_umin:
4090 nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4091 break;
4092 case nir_intrinsic_ssbo_atomic_imax:
4093 nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4094 break;
4095 case nir_intrinsic_ssbo_atomic_umax:
4096 nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4097 break;
4098 case nir_intrinsic_ssbo_atomic_and:
4099 nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4100 break;
4101 case nir_intrinsic_ssbo_atomic_or:
4102 nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4103 break;
4104 case nir_intrinsic_ssbo_atomic_xor:
4105 nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4106 break;
4107 case nir_intrinsic_ssbo_atomic_exchange:
4108 nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4109 break;
4110 case nir_intrinsic_ssbo_atomic_comp_swap:
4111 nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4112 break;
4113
4114 case nir_intrinsic_get_buffer_size: {
4115 nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
4116 unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
4117
4118     /* A resinfo sampler message is used to get the buffer size. The SIMD8
4119      * writeback message consists of four registers, while the SIMD16
4120      * writeback message consists of eight destination registers (two per
4121      * component). Because we are only interested in the first channel of
4122      * the first returned component, where resinfo returns the buffer size
4123      * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4124      * the dispatch width.
4125 */
4126 const fs_builder ubld = bld.exec_all().group(8, 0);
4127 fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4128 fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4129
4130 /* Set LOD = 0 */
4131 ubld.MOV(src_payload, brw_imm_d(0));
4132
4133 const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4134 fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
4135 src_payload, brw_imm_ud(index));
4136 inst->header_size = 0;
4137 inst->mlen = 1;
4138 inst->size_written = 4 * REG_SIZE;
4139
4140 bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
4141 brw_mark_surface_used(prog_data, index);
4142 break;
4143 }
4144
4145 case nir_intrinsic_load_channel_num: {
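    /* Build the per-channel invocation index: the V-type immediate
     * 0x76543210 yields the packed words 0..7, copies offset by 8 and 16
     * fill in the upper channels for the wider dispatch widths, and the
     * final MOV converts the UW values to UD in the destination.
     */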
4146 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
4147 dest = retype(dest, BRW_REGISTER_TYPE_UD);
4148 const fs_builder allbld8 = bld.group(8, 0).exec_all();
4149 allbld8.MOV(tmp, brw_imm_v(0x76543210));
4150 if (dispatch_width > 8)
4151 allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
4152 if (dispatch_width > 16) {
4153 const fs_builder allbld16 = bld.group(16, 0).exec_all();
4154 allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
4155 }
4156 bld.MOV(dest, tmp);
4157 break;
4158 }
4159
4160 default:
4161 unreachable("unknown intrinsic");
4162 }
4163 }
4164
4165 void
4166 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
4167 int op, nir_intrinsic_instr *instr)
4168 {
4169 if (stage == MESA_SHADER_FRAGMENT)
4170 brw_wm_prog_data(prog_data)->has_side_effects = true;
4171
4172 fs_reg dest;
4173 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4174 dest = get_nir_dest(instr->dest);
4175
4176 fs_reg surface;
4177 nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
4178 if (const_surface) {
4179 unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
4180 const_surface->u32[0];
4181 surface = brw_imm_ud(surf_index);
4182 brw_mark_surface_used(prog_data, surf_index);
4183 } else {
4184 surface = vgrf(glsl_type::uint_type);
4185 bld.ADD(surface, get_nir_src(instr->src[0]),
4186 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4187
4188     /* Assume this may touch any SSBO. This is the same as we do for other
4189      * UBO/SSBO accesses with a non-constant surface index.
4190 */
4191 brw_mark_surface_used(prog_data,
4192 stage_prog_data->binding_table.ssbo_start +
4193 nir->info->num_ssbos - 1);
4194 }
4195
4196 fs_reg offset = get_nir_src(instr->src[1]);
4197 fs_reg data1 = get_nir_src(instr->src[2]);
4198 fs_reg data2;
4199 if (op == BRW_AOP_CMPWR)
4200 data2 = get_nir_src(instr->src[3]);
4201
4202 /* Emit the actual atomic operation */
4203
4204 fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4205 data1, data2,
4206 1 /* dims */, 1 /* rsize */,
4207 op,
4208 BRW_PREDICATE_NONE);
4209 dest.type = atomic_result.type;
4210 bld.MOV(dest, atomic_result);
4211 }
4212
4213 void
4214 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
4215 int op, nir_intrinsic_instr *instr)
4216 {
4217 fs_reg dest;
4218 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4219 dest = get_nir_dest(instr->dest);
4220
4221 fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
4222 fs_reg offset;
4223 fs_reg data1 = get_nir_src(instr->src[1]);
4224 fs_reg data2;
4225 if (op == BRW_AOP_CMPWR)
4226 data2 = get_nir_src(instr->src[2]);
4227
4228 /* Get the offset */
4229 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
4230 if (const_offset) {
4231 offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
4232 } else {
4233 offset = vgrf(glsl_type::uint_type);
4234 bld.ADD(offset,
4235 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4236 brw_imm_ud(instr->const_index[0]));
4237 }
4238
4239   /* Emit the actual atomic operation */
4240
4241 fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4242 data1, data2,
4243 1 /* dims */, 1 /* rsize */,
4244 op,
4245 BRW_PREDICATE_NONE);
4246 dest.type = atomic_result.type;
4247 bld.MOV(dest, atomic_result);
4248 }
4249
4250 void
4251 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
4252 {
4253 unsigned texture = instr->texture_index;
4254 unsigned sampler = instr->sampler_index;
4255
4256 fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4257
4258 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
4259 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
4260
4261 int lod_components = 0;
4262
4263 /* The hardware requires a LOD for buffer textures */
4264 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
4265 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
4266
4267 uint32_t header_bits = 0;
4268 for (unsigned i = 0; i < instr->num_srcs; i++) {
4269 fs_reg src = get_nir_src(instr->src[i].src);
4270 switch (instr->src[i].src_type) {
4271 case nir_tex_src_bias:
4272 srcs[TEX_LOGICAL_SRC_LOD] =
4273 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4274 break;
4275 case nir_tex_src_comparator:
4276 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
4277 break;
4278 case nir_tex_src_coord:
4279 switch (instr->op) {
4280 case nir_texop_txf:
4281 case nir_texop_txf_ms:
4282 case nir_texop_txf_ms_mcs:
4283 case nir_texop_samples_identical:
4284 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
4285 break;
4286 default:
4287 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
4288 break;
4289 }
4290 break;
4291 case nir_tex_src_ddx:
4292 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
4293 lod_components = nir_tex_instr_src_size(instr, i);
4294 break;
4295 case nir_tex_src_ddy:
4296 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
4297 break;
4298 case nir_tex_src_lod:
4299 switch (instr->op) {
4300 case nir_texop_txs:
4301 srcs[TEX_LOGICAL_SRC_LOD] =
4302 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
4303 break;
4304 case nir_texop_txf:
4305 srcs[TEX_LOGICAL_SRC_LOD] =
4306 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
4307 break;
4308 default:
4309 srcs[TEX_LOGICAL_SRC_LOD] =
4310 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4311 break;
4312 }
4313 break;
4314 case nir_tex_src_ms_index:
4315 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
4316 break;
4317
4318 case nir_tex_src_offset: {
4319 nir_const_value *const_offset =
4320 nir_src_as_const_value(instr->src[i].src);
4321 unsigned offset_bits = 0;
4322 if (const_offset &&
4323 brw_texture_offset(const_offset->i32,
4324 nir_tex_instr_src_size(instr, i),
4325 &offset_bits)) {
4326 header_bits |= offset_bits;
4327 } else {
4328 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
4329 retype(src, BRW_REGISTER_TYPE_D);
4330 }
4331 break;
4332 }
4333
4334 case nir_tex_src_projector:
4335 unreachable("should be lowered");
4336
4337 case nir_tex_src_texture_offset: {
4338 /* Figure out the highest possible texture index and mark it as used */
4339 uint32_t max_used = texture + instr->texture_array_size - 1;
4340 if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
4341 max_used += stage_prog_data->binding_table.gather_texture_start;
4342 } else {
4343 max_used += stage_prog_data->binding_table.texture_start;
4344 }
4345 brw_mark_surface_used(prog_data, max_used);
4346
4347 /* Emit code to evaluate the actual indexing expression */
4348 fs_reg tmp = vgrf(glsl_type::uint_type);
4349 bld.ADD(tmp, src, brw_imm_ud(texture));
4350 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
4351 break;
4352 }
4353
4354 case nir_tex_src_sampler_offset: {
4355 /* Emit code to evaluate the actual indexing expression */
4356 fs_reg tmp = vgrf(glsl_type::uint_type);
4357 bld.ADD(tmp, src, brw_imm_ud(sampler));
4358 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
4359 break;
4360 }
4361
4362 case nir_tex_src_ms_mcs:
4363 assert(instr->op == nir_texop_txf_ms);
4364 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
4365 break;
4366
4367 case nir_tex_src_plane: {
4368 nir_const_value *const_plane =
4369 nir_src_as_const_value(instr->src[i].src);
4370 const uint32_t plane = const_plane->u32[0];
4371 const uint32_t texture_index =
4372 instr->texture_index +
4373 stage_prog_data->binding_table.plane_start[plane] -
4374 stage_prog_data->binding_table.texture_start;
4375
4376 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
4377 break;
4378 }
4379
4380 default:
4381 unreachable("unknown texture source");
4382 }
4383 }
4384
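   /* txf_ms and samples_identical need the MCS data for the surface: if the
    * texture uses the compressed multisample layout, fetch it first via
    * emit_mcs_fetch(); otherwise an MCS value of zero is used.
    */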
4385 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
4386 (instr->op == nir_texop_txf_ms ||
4387 instr->op == nir_texop_samples_identical)) {
4388 if (devinfo->gen >= 7 &&
4389 key_tex->compressed_multisample_layout_mask & (1 << texture)) {
4390 srcs[TEX_LOGICAL_SRC_MCS] =
4391 emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
4392 instr->coord_components,
4393 srcs[TEX_LOGICAL_SRC_SURFACE]);
4394 } else {
4395 srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
4396 }
4397 }
4398
4399 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
4400 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
4401
4402 if (instr->op == nir_texop_query_levels ||
4403 (instr->op == nir_texop_tex && stage != MESA_SHADER_FRAGMENT)) {
4404 /* textureQueryLevels() and texture() are implemented in terms of TXS
4405 * and TXL respectively, so we need to pass a valid LOD argument.
4406 */
4407 assert(srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE);
4408 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
4409 }
4410
4411 enum opcode opcode;
4412 switch (instr->op) {
4413 case nir_texop_tex:
4414 opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
4415 SHADER_OPCODE_TXL_LOGICAL);
4416 break;
4417 case nir_texop_txb:
4418 opcode = FS_OPCODE_TXB_LOGICAL;
4419 break;
4420 case nir_texop_txl:
4421 opcode = SHADER_OPCODE_TXL_LOGICAL;
4422 break;
4423 case nir_texop_txd:
4424 opcode = SHADER_OPCODE_TXD_LOGICAL;
4425 break;
4426 case nir_texop_txf:
4427 opcode = SHADER_OPCODE_TXF_LOGICAL;
4428 break;
4429 case nir_texop_txf_ms:
4430 if ((key_tex->msaa_16 & (1 << sampler)))
4431 opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
4432 else
4433 opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
4434 break;
4435 case nir_texop_txf_ms_mcs:
4436 opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
4437 break;
4438 case nir_texop_query_levels:
4439 case nir_texop_txs:
4440 opcode = SHADER_OPCODE_TXS_LOGICAL;
4441 break;
4442 case nir_texop_lod:
4443 opcode = SHADER_OPCODE_LOD_LOGICAL;
4444 break;
4445 case nir_texop_tg4:
4446 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
4447 opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
4448 else
4449 opcode = SHADER_OPCODE_TG4_LOGICAL;
4450 break;
4451 case nir_texop_texture_samples:
4452 opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
4453 break;
4454 case nir_texop_samples_identical: {
4455 fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
4456
4457 /* If mcs is an immediate value, it means there is no MCS. In that case
4458 * just return false.
4459 */
4460 if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
4461 bld.MOV(dst, brw_imm_ud(0u));
4462 } else if ((key_tex->msaa_16 & (1 << sampler))) {
4463 fs_reg tmp = vgrf(glsl_type::uint_type);
4464 bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
4465 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
4466 bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
4467 } else {
4468 bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
4469 BRW_CONDITIONAL_EQ);
4470 }
4471 return;
4472 }
4473 default:
4474 unreachable("unknown texture opcode");
4475 }
4476
4477 if (instr->op == nir_texop_tg4) {
4478 if (instr->component == 1 &&
4479 key_tex->gather_channel_quirk_mask & (1 << texture)) {
4480 /* gather4 sampler is broken for green channel on RG32F --
4481 * we must ask for blue instead.
4482 */
4483 header_bits |= 2 << 16;
4484 } else {
4485 header_bits |= instr->component << 16;
4486 }
4487 }
4488
4489 fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
4490 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
4491 inst->offset = header_bits;
4492
4493 const unsigned dest_size = nir_tex_instr_dest_size(instr);
4494 if (devinfo->gen >= 9 &&
4495 instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
4496 unsigned write_mask = instr->dest.is_ssa ?
4497 nir_ssa_def_components_read(&instr->dest.ssa):
4498 (1 << dest_size) - 1;
4499 assert(write_mask != 0); /* dead code should have been eliminated */
4500 inst->size_written = util_last_bit(write_mask) *
4501 inst->dst.component_size(inst->exec_size);
4502 } else {
4503 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
4504 }
4505
4506 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
4507 inst->shadow_compare = true;
4508
4509 if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
4510 emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
4511
4512 fs_reg nir_dest[4];
4513 for (unsigned i = 0; i < dest_size; i++)
4514 nir_dest[i] = offset(dst, bld, i);
4515
4516 if (instr->op == nir_texop_query_levels) {
4517 /* # levels is in .w */
4518 nir_dest[0] = offset(dst, bld, 3);
4519 } else if (instr->op == nir_texop_txs &&
4520 dest_size >= 3 && devinfo->gen < 7) {
4521 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
4522 fs_reg depth = offset(dst, bld, 2);
4523 nir_dest[2] = vgrf(glsl_type::int_type);
4524 bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
4525 }
4526
4527 bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
4528 }
4529
4530 void
4531 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
4532 {
4533 switch (instr->type) {
4534 case nir_jump_break:
4535 bld.emit(BRW_OPCODE_BREAK);
4536 break;
4537 case nir_jump_continue:
4538 bld.emit(BRW_OPCODE_CONTINUE);
4539 break;
4540 case nir_jump_return:
4541 default:
4542 unreachable("unknown jump");
4543 }
4544 }
4545
4546 /**
4547 * This helper takes the result of a load operation that reads 32-bit elements
4548 * in this format:
4549 *
4550 * x x x x x x x x
4551 * y y y y y y y y
4552 * z z z z z z z z
4553 * w w w w w w w w
4554 *
4555 * and shuffles the data to get this:
4556 *
4557 * x y x y x y x y
4558 * x y x y x y x y
4559 * z w z w z w z w
4560 * z w z w z w z w
4561 *
4562 * Which is exactly what we want if the load is reading 64-bit components
4563 * like doubles, where x represents the low 32-bit of the x double component
4564 * and y represents the high 32-bit of the x double component (likewise with
4565 * z and w for double component y). The parameter @components represents
4566 * the number of 64-bit components present in @src. This would typically be
4567 * 2 at most, since we can only fit 2 double elements in the result of a
4568 * vec4 load.
4569 *
4570 * Notice that @dst and @src can be the same register.
4571 */
4572 void
4573 shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
4574 const fs_reg &dst,
4575 const fs_reg &src,
4576 uint32_t components)
4577 {
4578 assert(type_sz(src.type) == 4);
4579 assert(type_sz(dst.type) == 8);
4580
4581 /* A temporary that we will use to shuffle the 32-bit data of each
4582 * component in the vector into valid 64-bit data. We can't write directly
4583 * to dst because dst can be (and would usually be) the same as src
4584 * and in that case the first MOV in the loop below would overwrite the
4585 * data read in the second MOV.
4586 */
4587 fs_reg tmp = bld.vgrf(dst.type);
4588
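   /* For each 64-bit component, interleave its low dwords (read from offset
    * 2*i of src) into the even 32-bit slots of tmp and its high dwords (the
    * following offset) into the odd slots, then copy the assembled 64-bit
    * values into component i of dst.
    */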
4589 for (unsigned i = 0; i < components; i++) {
4590 const fs_reg component_i = offset(src, bld, 2 * i);
4591
4592 bld.MOV(subscript(tmp, src.type, 0), component_i);
4593 bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
4594
4595 bld.MOV(offset(dst, bld, i), tmp);
4596 }
4597 }
4598
4599 /**
4600 * This helper does the inverse operation of
4601 * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
4602 *
4603  * We need to do this when we are going to use untyped write messages that
4604  * operate with 32-bit components, in order to arrange our 64-bit data in
4605  * the expected layout.
4606 *
4607 * Notice that callers of this function, unlike in the case of the inverse
4608 * operation, would typically need to call this with dst and src being
4609 * different registers, since they would otherwise corrupt the original
4610 * 64-bit data they are about to write. Because of this the function checks
4611 * that the src and dst regions involved in the operation do not overlap.
4612 */
4613 void
4614 shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
4615 const fs_reg &dst,
4616 const fs_reg &src,
4617 uint32_t components)
4618 {
4619 assert(type_sz(src.type) == 8);
4620 assert(type_sz(dst.type) == 4);
4621
4622 assert(!regions_overlap(
4623 dst, 2 * components * dst.component_size(bld.dispatch_width()),
4624 src, components * src.component_size(bld.dispatch_width())));
4625
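   /* Split each 64-bit component of src into its low and high dwords and
    * write them to two consecutive 32-bit components of dst.
    */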
4626 for (unsigned i = 0; i < components; i++) {
4627 const fs_reg component_i = offset(src, bld, i);
4628 bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
4629 bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
4630 }
4631 }
4632
4633 fs_reg
4634 setup_imm_df(const fs_builder &bld, double v)
4635 {
4636 const struct gen_device_info *devinfo = bld.shader->devinfo;
4637 assert(devinfo->gen >= 7);
4638
4639 if (devinfo->gen >= 8)
4640 return brw_imm_df(v);
4641
4642   /* gen7.5 does not support DF immediates directly, but the DIM
4643    * instruction allows us to set a 64-bit immediate value.
4644 */
4645 if (devinfo->is_haswell) {
4646 const fs_builder ubld = bld.exec_all().group(1, 0);
4647 fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
4648 ubld.DIM(dst, brw_imm_df(v));
4649 return component(dst, 0);
4650 }
4651
4652 /* gen7 does not support DF immediates, so we generate a 64-bit constant by
4653 * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
4654 * the high 32-bit to suboffset 4 and then applying a stride of 0.
4655 *
4656    * Alternatively, we could also produce a normal VGRF (without stride 0)
4657    * by writing to all of its channels; however, that would hit the gen7
4658    * bug that forces us to split writes spanning more than one register
4659    * into instructions with a width of 4 (otherwise the write to the
4660    * second register runs into an execmask hardware bug), which isn't
4661    * very nice.
4662 */
4663 union {
4664 double d;
4665 struct {
4666 uint32_t i1;
4667 uint32_t i2;
4668 };
4669 } di;
4670
4671 di.d = v;
4672
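   /* Illustrative example (little-endian host): for v = 1.0, the union gives
    * i1 = 0x00000000 (low dword) and i2 = 0x3ff00000 (high dword) of the
    * IEEE-754 encoding 0x3ff0000000000000.
    */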
4673 const fs_builder ubld = bld.exec_all().group(1, 0);
4674 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4675 ubld.MOV(tmp, brw_imm_ud(di.i1));
4676 ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
4677
4678 return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
4679 }