/* Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "brw_program.h"
#include "common/gen_debug.h"

using namespace brw;

static void
generate_math1_gen4(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src)
{
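   /* On Gen4-5, math ops are sends to the shared math unit, with the
    * operand passed through the MRF at inst->base_mrf.
    */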
   gen4_math(p,
             dst,
             brw_math_function(inst->opcode),
             inst->base_mrf,
             src,
             BRW_MATH_PRECISION_FULL);
}

static void
check_gen6_math_src_arg(struct brw_reg src)
{
   /* Source swizzles are ignored. */
   assert(!src.abs);
   assert(!src.negate);
   assert(src.swizzle == BRW_SWIZZLE_XYZW);
}

static void
generate_math_gen6(struct brw_codegen *p,
                   vec4_instruction *inst,
                   struct brw_reg dst,
                   struct brw_reg src0,
                   struct brw_reg src1)
{
   /* Can't do writemask because math can't be align16. */
   assert(dst.writemask == WRITEMASK_XYZW);
   /* Source swizzles are ignored. */
   check_gen6_math_src_arg(src0);
   if (src1.file == BRW_GENERAL_REGISTER_FILE)
      check_gen6_math_src_arg(src1);

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
   brw_set_default_access_mode(p, BRW_ALIGN_16);
}

static void
generate_math2_gen4(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src0,
                    struct brw_reg src1)
{
   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    * "Message Payload":
    *
    * "Operand0[7]. For the INT DIV functions, this operand is the
    *  denominator."
    *  ...
    * "Operand1[7]. For the INT DIV functions, this operand is the
    *  numerator."
    */
   bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
   struct brw_reg &op0 = is_int_div ? src1 : src0;
   struct brw_reg &op1 = is_int_div ? src0 : src1;

   brw_push_insn_state(p);
   brw_set_default_saturate(p, false);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
   brw_pop_insn_state(p);

   gen4_math(p,
             dst,
             brw_math_function(inst->opcode),
             inst->base_mrf,
             op0,
             BRW_MATH_PRECISION_FULL);
}

static void
generate_tex(struct brw_codegen *p,
             struct brw_vue_prog_data *prog_data,
             gl_shader_stage stage,
             vec4_instruction *inst,
             struct brw_reg dst,
             struct brw_reg src,
             struct brw_reg surface_index,
             struct brw_reg sampler_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int msg_type = -1;

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("should not get here: invalid vec4 texture opcode");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually. */
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
         break;
      case SHADER_OPCODE_TXF:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
         break;
      case SHADER_OPCODE_TXS:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         break;
      default:
         unreachable("should not get here: invalid vec4 texture opcode");
      }
   }

   assert(msg_type != -1);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present. If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield. Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
      } else {
         struct brw_reg header =
            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
         uint32_t dw2 = 0;

         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         if (inst->offset)
            /* Set the texel offset bits in DWord 2. */
            dw2 = inst->offset;

         if (devinfo->gen >= 9)
            /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
             * based on bit 22 in the header.
             */
            dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;

         /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
          * so header0.2 is 0 when g0 is copied. The HS and GS stages do
          * not, so we must set it to 0 to avoid setting undesirable bits
          * in the message header.
          */
         if (dw2 ||
             stage == MESA_SHADER_TESS_CTRL ||
             stage == MESA_SHADER_GEOMETRY) {
            brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
         }

         brw_adjust_sampler_state_pointer(p, header, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->base.binding_table.gather_texture_start
         : prog_data->base.binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 dst,
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 1, /* response length */
                 inst->mlen,
                 inst->header_size != 0,
                 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                 return_format);

      brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
   } else {
      /* Non-constant sampler index. */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      if (inst->base_mrf != -1)
         gen6_resolve_implied_move(p, &src, inst->base_mrf);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              1 /* rlen */,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              return_format);

      /* The visitor knows more than we do about the surface limit required,
       * so it has already done the marking.
       */
   }
}

static void
generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
{
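   /* Write the VS thread's outputs to the URB, interleaving the data for
    * the two SIMD4x2 vertices; g0 provides the message header.
    */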
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 brw_vec8_grf(0, 0), /* src */
                 inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);

   /* Use the temporary passed in src0 as the writeback register. */
   brw_urb_WRITE(p,
                 inst->src[0].as_brw_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 BRW_URB_WRITE_ALLOCATE_COMPLETE,
                 inst->mlen,
                 1, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   /* Now put allocated urb handle in dst.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0),
           get_element_ud(inst->src[0].as_brw_reg(), 0));
   brw_pop_insn_state(p);
}

static void
generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 BRW_URB_WRITE_EOT | inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 0, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_gs_set_write_offset(struct brw_codegen *p,
                             struct brw_reg dst,
                             struct brw_reg src0,
                             struct brw_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW   { Align1 WE_all }
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   assert(p->devinfo->gen >= 7 &&
          src1.file == BRW_IMMEDIATE_VALUE &&
          src1.type == BRW_REGISTER_TYPE_UD &&
          src1.ud <= USHRT_MAX);
   if (src0.file == BRW_IMMEDIATE_VALUE) {
      brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
              brw_imm_ud(src0.ud * src1.ud));
   } else {
      brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
              retype(src1, BRW_REGISTER_TYPE_UW));
   }
   brw_pop_insn_state(p);
}

static void
generate_gs_set_vertex_count(struct brw_codegen *p,
                             struct brw_reg dst,
                             struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   if (p->devinfo->gen >= 8) {
      /* Move the vertex count into the second MRF for the EOT write. */
      brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
              src);
   } else {
      /* If we think of the src and dst registers as composed of 8 DWORDs each,
       * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
       * them to WORDs, and then pack them into DWORD 2 of dst.
       *
       * It's easier to get the EU to do this if we think of the src and dst
       * registers as composed of 16 WORDs each; then, we want to pick up the
       * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
       * of dst.
       *
       * We can do that by the following EU instruction:
       *
       *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_MOV(p,
              suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
              stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
   }
   brw_pop_insn_state(p);
}

static void
generate_gs_svb_write(struct brw_codegen *p,
                      struct brw_vue_prog_data *prog_data,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg src0,
                      struct brw_reg src1)
{
   int binding = inst->sol_binding;
   bool final_write = inst->sol_final_write;

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_4);
   /* Copy Vertex data into M0.x */
   brw_MOV(p, stride(dst, 4, 4, 1),
           stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
   brw_pop_insn_state(p);

   brw_push_insn_state(p);
   /* Send SVB Write */
   brw_svb_write(p,
                 final_write ? src1 : brw_null_reg(), /* dest == src1 */
                 1, /* msg_reg_nr */
                 dst, /* src0 == previous dst */
                 BRW_GEN6_SOL_BINDING_START + binding, /* binding_table_index */
                 final_write); /* send_commit_msg */

   /* Finally, wait for the write commit to occur so that we can proceed to
    * other things safely.
    *
    * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
    *
    *     The write commit does not modify the destination register, but
    *     merely clears the dependency associated with the destination
    *     register. Thus, a simple “mov” instruction using the register as a
    *     source is sufficient to wait for the write commit to occur.
    */
   if (final_write) {
      brw_MOV(p, src1, src1);
   }
   brw_pop_insn_state(p);
}

static void
generate_gs_svb_set_destination_index(struct brw_codegen *p,
                                      vec4_instruction *inst,
                                      struct brw_reg dst,
                                      struct brw_reg src)
{
   int vertex = inst->sol_vertex;
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
   brw_pop_insn_state(p);
}

static void
generate_gs_set_dword_2(struct brw_codegen *p,
                        struct brw_reg dst,
                        struct brw_reg src)
{
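   /* Copy the low dword of src into dword 2 of the message header in dst. */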
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
   brw_pop_insn_state(p);
}

static void
generate_gs_prepare_channel_masks(struct brw_codegen *p,
                                  struct brw_reg dst)
{
   /* We want to left shift just DWORD 4 (the x component belonging to the
    * second geometry shader invocation) by 4 bits. So generate the
    * instruction:
    *
    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
    */
   dst = suboffset(vec1(dst), 4);
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_SHL(p, dst, dst, brw_imm_ud(4));
   brw_pop_insn_state(p);
}

static void
generate_gs_set_channel_masks(struct brw_codegen *p,
                              struct brw_reg dst,
                              struct brw_reg src)
{
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.5):
    *
    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    *        Vertex 0 DATA[7]. This bit is ANDed with the corresponding
    *        channel enable to determine the final channel enable. For the
    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
    *        in the writeback message. For the URB_WRITE_OWORD &
    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
    *        indicates that Vertex 1 DATA [3] will be written to the surface.
    *
    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *        1: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel included
    *
    *     14 Vertex 1 DATA [2] Channel Mask
    *     13 Vertex 1 DATA [1] Channel Mask
    *     12 Vertex 1 DATA [0] Channel Mask
    *     11 Vertex 0 DATA [3] Channel Mask
    *     10 Vertex 0 DATA [2] Channel Mask
    *      9 Vertex 0 DATA [1] Channel Mask
    *      8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively). Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:4 of
    * DWORD 4, we just need to OR them together and store the result in bits
    * 15:8 of DWORD 5.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them in
    * byte 21.
    *
    * We can do that by the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4 of
    * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the
    * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    * contain valid channel mask values (which are in the range 0x0-0xf).
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UB);
   src = retype(src, BRW_REGISTER_TYPE_UB);
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   brw_pop_insn_state(p);
}

static void
generate_gs_get_instance_id(struct brw_codegen *p,
                            struct brw_reg dst)
{
   /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
    * and store into dst.0 & dst.4. So generate the instruction:
    *
    *     shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   dst = retype(dst, BRW_REGISTER_TYPE_UD);
   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_SHR(p, dst, stride(r0, 1, 4, 0),
           brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
   brw_pop_insn_state(p);
}

static void
generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
                                   struct brw_reg dst,
                                   struct brw_reg src0,
                                   struct brw_reg src1,
                                   struct brw_reg src2)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   /* Save src0 data in 16:31 bits of dst.0 */
   brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
           brw_imm_ud(0xffffu));
   brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
   /* Save src1 data in 0:15 bits of dst.0 */
   brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
           brw_imm_ud(0xffffu));
   brw_OR(p, suboffset(vec1(dst), 0),
          suboffset(vec1(dst), 0),
          suboffset(vec1(src2), 0));
   brw_pop_insn_state(p);
}

static void
generate_gs_ff_sync(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src0,
                    struct brw_reg src1)
{
   /* This opcode uses an implied MRF register for:
    *  - the header of the ff_sync message. And as such it is expected to be
    *    initialized to r0 before calling here.
    *  - the destination where we will write the allocated URB handle.
    */
   struct brw_reg header =
      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

   /* Overwrite dword 0 of the header (SO vertices to write) and
    * dword 1 (number of primitives written).
    */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
   brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
   brw_pop_insn_state(p);

   /* Allocate URB handle in dst */
   brw_ff_sync(p,
               dst,
               0,
               header,
               1, /* allocate */
               1, /* response length */
               0 /* eot */);

   /* Now put allocated urb handle in header.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));

   /* src1 is not an immediate when we use transform feedback */
   if (src1.file != BRW_IMMEDIATE_VALUE) {
      brw_set_default_exec_size(p, BRW_EXECUTE_4);
      brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
   }

   brw_pop_insn_state(p);
}

static void
generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
{
   /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
   struct brw_reg src = brw_vec8_grf(0, 0);
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
   brw_pop_insn_state(p);
}

static void
generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;

   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
    *
    * Since we operate in SIMD4x2 mode, we need to run half as many threads
    * as necessary. So we assign (2i + 1, 2i) as the thread counts. We
    * shift right by one less to accomplish the multiplication by two.
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UD);
   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
   const int shift = ivb ? 16 : 17;

   brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
   brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
           brw_imm_ud(shift - 1));
   brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));

   brw_pop_insn_state(p);
}

static void
generate_tcs_urb_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg urb_header)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, brw_null_reg());
   brw_set_src0(p, send, urb_header);

   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              inst->mlen /* mlen */, 0 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
   if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
      brw_inst_set_eot(devinfo, send, 1);
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
      brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
   }

   /* what happens to swizzles? */
}


static void
generate_tcs_input_urb_offsets(struct brw_codegen *p,
                               struct brw_reg dst,
                               struct brw_reg vertex,
                               struct brw_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation.
    * Inputs are a vertex index, and a byte offset from the beginning of
    * the vertex. */

   /* If `vertex` is not an immediate, we clobber a0.0 */

   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);

   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, dst, brw_imm_ud(0));

   /* m0.5 bits 8-15 are channel enables */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));

   /* m0.0-0.1: URB handles */
   if (vertex.file == BRW_IMMEDIATE_VALUE) {
      uint32_t vertex_index = vertex.ud;
      struct brw_reg index_reg = brw_vec1_grf(
            1 + (vertex_index >> 3), vertex_index & 7);

      brw_MOV(p, vec2(get_element_ud(dst, 0)),
              retype(index_reg, BRW_REGISTER_TYPE_UD));
   } else {
      /* Use indirect addressing. ICP Handles are DWords (single channels
       * of a register) and start at g1.0.
       *
       * In order to start our region at g1.0, we add 8 to the vertex index,
       * effectively skipping over the 8 channels in g0.0. This gives us a
       * DWord offset to the ICP Handle.
       *
       * Indirect addressing works in terms of bytes, so we then multiply
       * the DWord offset by 4 (by shifting left by 2).
       */
      struct brw_reg addr = brw_address_reg(0);

      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
      brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x8));
      brw_SHL(p, addr, addr, brw_imm_uw(2));
      brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));

      /* top half: m0.1 = g[1.0 + vertex.4]UD */
      brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x8));
      brw_SHL(p, addr, addr, brw_imm_uw(2));
      brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
   }

   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}


static void
generate_tcs_output_urb_offsets(struct brw_codegen *p,
                                struct brw_reg dst,
                                struct brw_reg write_mask,
                                struct brw_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);

   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
   assert(write_mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, dst, brw_imm_ud(0));

   unsigned mask = write_mask.ud;

   /* m0.5 bits 15:12 and 11:8 are channel enables */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));

   /* HS patch URB handle is delivered in r0.0 */
   struct brw_reg urb_handle = brw_vec1_grf(0, 0);

   /* m0.0-0.1: URB handles */
   brw_MOV(p, vec2(get_element_ud(dst, 0)),
           retype(urb_handle, BRW_REGISTER_TYPE_UD));

   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}

static void
generate_tes_create_input_read_header(struct brw_codegen *p,
                                      struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Initialize the register to 0 */
   brw_MOV(p, dst, brw_imm_ud(0));

   /* Enable all the channels in m0.5 bits 15:8 */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));

   /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1. For safety,
    * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
    */
   brw_AND(p, vec2(get_element_ud(dst, 0)),
           retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(0x1fff));
   brw_pop_insn_state(p);
}

static void
generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
                                     struct brw_reg dst,
                                     struct brw_reg header,
                                     struct brw_reg offset)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, dst, header);
   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
   brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}

static void
generate_vec4_urb_read(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg header)
{
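   /* Emit an interleaved OWord URB read: one register of data comes back,
    * with each slot's OWord selected by inst->offset plus the per-slot
    * offsets in the header.
    */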
   const struct gen_device_info *devinfo = p->devinfo;

   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);

   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              1 /* mlen */, 1 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);

   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
}

static void
generate_tcs_release_input(struct brw_codegen *p,
                           struct brw_reg header,
                           struct brw_reg vertex,
                           struct brw_reg is_unpaired)
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(vertex.file == BRW_IMMEDIATE_VALUE);
   assert(vertex.type == BRW_REGISTER_TYPE_UD);

   /* m0.0-0.1: URB handles */
   struct brw_reg urb_handles =
      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
             BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, header, brw_imm_ud(0));
   brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
   brw_pop_insn_state(p);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, brw_null_reg());
   brw_set_src0(p, send, header);
   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              1 /* mlen */, 0 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
   brw_inst_set_urb_complete(devinfo, send, 1);
   brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
                                    BRW_URB_SWIZZLE_NONE :
                                    BRW_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
{
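   /* Build a header that targets the patch URB entry (handle from r0.0)
    * with only the X channel enabled, write a single zero dword as the
    * payload, and send it as an EOT URB write to terminate the thread.
    */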
   struct brw_reg header = brw_message_reg(inst->base_mrf);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, header, brw_imm_ud(0));
   brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
   brw_MOV(p, get_element_ud(header, 0),
           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u));
   brw_pop_insn_state(p);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 header,
                 BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
                 BRW_URB_WRITE_USE_CHANNEL_MASKS,
                 inst->mlen,
                 0, /* response len */
                 0, /* urb destination offset */
                 0);
}

static void
generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
{
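   /* The DS thread payload delivers PrimitiveID in g1.7. */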
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
   brw_pop_insn_state(p);
}

static void
generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
{
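   /* The HS thread payload delivers PrimitiveID in r0.1. */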
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   brw_pop_insn_state(p);
}

static void
generate_tcs_create_barrier_header(struct brw_codegen *p,
                                   struct brw_vue_prog_data *prog_data,
                                   struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
   struct brw_reg m0_2 = get_element_ud(dst, 2);
   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Zero the message header */
   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));

   /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */
   brw_AND(p, m0_2,
           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));

   /* Shift it up to bits 27:24. */
   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11));

   /* Set the Barrier Count and the enable bit */
   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));

   brw_pop_insn_state(p);
}

static void
generate_oword_dual_block_offsets(struct brw_codegen *p,
                                  struct brw_reg m1,
                                  struct brw_reg index)
{
   int second_vertex_offset;

   if (p->devinfo->gen >= 6)
      second_vertex_offset = 1;
   else
      second_vertex_offset = 16;

   m1 = retype(m1, BRW_REGISTER_TYPE_D);

   /* Set up M1 (message payload). Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   struct brw_reg index_0 = suboffset(vec1(index), 0);
   struct brw_reg index_4 = suboffset(vec1(index), 4);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   brw_MOV(p, m1_0, index_0);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      index_4.ud += second_vertex_offset;
      brw_MOV(p, m1_4, index_4);
   } else {
      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
   }

   brw_pop_insn_state(p);
}

static void
generate_unpack_flags(struct brw_codegen *p,
                      struct brw_reg dst)
{
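   /* Split f0.0 into per-invocation nibbles: bits 3:0 go to dst.0 and
    * bits 7:4 (shifted down) go to dst.4.
    */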
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   struct brw_reg flags = brw_flag_reg(0, 0);
   struct brw_reg dst_0 = suboffset(vec1(dst), 0);
   struct brw_reg dst_4 = suboffset(vec1(dst), 4);

   brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
   brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
   brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));

   brw_pop_insn_state(p);
}

static void
generate_scratch_read(struct brw_codegen *p,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
                                     index);

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   const unsigned target_cache =
      devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
      devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
      BRW_DATAPORT_READ_TARGET_RENDER_CACHE;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
   brw_set_dp_read_message(p, send,
                           brw_scratch_surface_idx(p),
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type, target_cache,
                           2, /* mlen */
                           true, /* header_present */
                           1 /* rlen */);
}

static void
generate_scratch_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg src,
                       struct brw_reg index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   struct brw_reg header = brw_vec8_grf(0, 0);
   bool write_commit;

   /* If the instruction is predicated, we'll predicate the send, not
    * the header setup.
    */
   brw_set_default_predicate_control(p, false);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
                                     index);

   brw_MOV(p,
           retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
           retype(src, BRW_REGISTER_TYPE_D));

   uint32_t msg_type;

   if (devinfo->gen >= 7)
      msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
   else if (devinfo->gen == 6)
      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else
      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   brw_set_default_predicate_control(p, inst->predicate);

   /* Pre-gen6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread. Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    * synchronization.
    */
   if (devinfo->gen >= 6) {
      write_commit = false;
   } else {
      /* The visitor set up our destination register to be g0. This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit. For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed. This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */
      write_commit = true;
   }

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_write_message(p, send,
                            brw_scratch_surface_idx(p),
                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                            msg_type,
                            target_cache,
                            3, /* mlen */
                            true, /* header present */
                            false, /* not a render target write */
                            write_commit, /* rlen */
                            false, /* eot */
                            write_commit);
}

static void
generate_pull_constant_load(struct brw_codegen *p,
                            struct brw_vue_prog_data *prog_data,
                            vec4_instruction *inst,
                            struct brw_reg dst,
                            struct brw_reg index,
                            struct brw_reg offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   if (devinfo->gen >= 6) {
      if (offset.file == BRW_IMMEDIATE_VALUE) {
         brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 brw_imm_d(offset.ud >> 4));
      } else {
         brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 offset, brw_imm_d(4));
      }
   } else {
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                        BRW_REGISTER_TYPE_D),
              offset);
   }

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_read_message(p, send,
                           surf_index,
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
                           target_cache,
                           2, /* mlen */
                           true, /* header_present */
                           1 /* rlen */);
}

static void
generate_get_buffer_size(struct brw_codegen *p,
                         struct brw_vue_prog_data *prog_data,
                         vec4_instruction *inst,
                         struct brw_reg dst,
                         struct brw_reg src,
                         struct brw_reg surf_index)
{
   assert(p->devinfo->gen >= 7);
   assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
          surf_index.file == BRW_IMMEDIATE_VALUE);

   brw_SAMPLE(p,
              dst,
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              1, /* response length */
              inst->mlen,
              inst->header_size > 0,
              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(&prog_data->base, surf_index.ud);
}

static void
generate_pull_constant_load_gen7(struct brw_codegen *p,
                                 struct brw_vue_prog_data *prog_data,
                                 vec4_instruction *inst,
                                 struct brw_reg dst,
                                 struct brw_reg surf_index,
                                 struct brw_reg offset)
{
   assert(surf_index.type == BRW_REGISTER_TYPE_UD);

   if (surf_index.file == BRW_IMMEDIATE_VALUE) {

      brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, dst);
      brw_set_src0(p, insn, offset);
      brw_set_sampler_message(p, insn,
                              surf_index.ud,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              inst->header_size != 0,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_mark_surface_used(&prog_data->base, surf_index.ud);

   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, offset, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1 /* rlen */,
                              inst->mlen,
                              inst->header_size != 0,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);
   }
}

static void
generate_set_simd4x2_header_gen9(struct brw_codegen *p,
                                 vec4_instruction *inst,
                                 struct brw_reg dst)
{
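   /* Build a SIMD4x2 message header on SKL+: copy g0, then set the SIMD4x2
    * extension bit in dword 2.
    */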
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(dst, 2),
           brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));

   brw_pop_insn_state(p);
}

static void
generate_mov_indirect(struct brw_codegen *p,
                      vec4_instruction *inst,
                      struct brw_reg dst, struct brw_reg reg,
                      struct brw_reg indirect, struct brw_reg length)
{
   assert(indirect.type == BRW_REGISTER_TYPE_UD);
   assert(p->devinfo->gen >= 6);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);

   /* This instruction acts in align1 mode */
   assert(dst.writemask == WRITEMASK_XYZW);

   if (indirect.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
      unsigned shift = (imm_byte_offset / 4) % 4;
      reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);

      brw_MOV(p, dst, reg);
   } else {
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      struct brw_reg addr = vec8(brw_address_reg(0));

      /* We need to move the indirect value into the address register. In
       * order to make things make some sense, we want to respect at least the
       * X component of the swizzle. In order to do that, we need to convert
       * the subnr (probably 0) to an align1 subnr and add in the swizzle.
       */
      assert(brw_is_single_value_swizzle(indirect.swizzle));
      indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));

      /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
       * the indirect and splat it out to all four channels of the given half
       * of a0.
       */
      indirect.subnr *= 2;
      indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
      brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));

      /* Now we need to incorporate the swizzle from the source register */
      if (reg.swizzle != BRW_SWIZZLE_XXXX) {
         uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
                            BRW_GET_SWZ(reg.swizzle, 1) << 6 |
                            BRW_GET_SWZ(reg.swizzle, 2) << 10 |
                            BRW_GET_SWZ(reg.swizzle, 3) << 14;
         uv_swiz |= uv_swiz << 16;

         brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
      }

      brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));

      brw_pop_insn_state(p);
   }
}

static void
generate_code(struct brw_codegen *p,
              const struct brw_compiler *compiler,
              void *log_data,
              const nir_shader *nir,
              struct brw_vue_prog_data *prog_data,
              const struct cfg_t *cfg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage);
   bool debug_flag = INTEL_DEBUG &
      intel_debug_flag_for_shader_stage(nir->stage);
   struct annotation_info annotation;
   memset(&annotation, 0, sizeof(annotation));
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
      struct brw_reg src[3], dst;

      if (unlikely(debug_flag))
         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = inst->src[i].as_brw_reg();
      }
      dst = inst->dst.as_brw_reg();

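      /* Apply the instruction's default execution state (predication, flag
       * register, saturate, masking, accumulator write, and execution size)
       * before emitting code for it.
       */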
1520 brw_set_default_predicate_control(p, inst->predicate);
1521 brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1522 brw_set_default_flag_reg(p, 0, inst->flag_subreg);
1523 brw_set_default_saturate(p, inst->saturate);
1524 brw_set_default_mask_control(p, inst->force_writemask_all);
1525 brw_set_default_acc_write_control(p, inst->writes_accumulator);
1526 brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
1527
1528 assert(inst->group % inst->exec_size == 0);
1529 assert(inst->group % 8 == 0 ||
1530 inst->dst.type == BRW_REGISTER_TYPE_DF ||
1531 inst->src[0].type == BRW_REGISTER_TYPE_DF ||
1532 inst->src[1].type == BRW_REGISTER_TYPE_DF ||
1533 inst->src[2].type == BRW_REGISTER_TYPE_DF);
1534 if (!inst->force_writemask_all)
1535 brw_set_default_group(p, inst->group);
1536
1537 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
1538 assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1539
1540 unsigned pre_emit_nr_insn = p->nr_insn;
1541
1542 switch (inst->opcode) {
1543 case VEC4_OPCODE_UNPACK_UNIFORM:
1544 case BRW_OPCODE_MOV:
1545 brw_MOV(p, dst, src[0]);
1546 break;
1547 case BRW_OPCODE_ADD:
1548 brw_ADD(p, dst, src[0], src[1]);
1549 break;
1550 case BRW_OPCODE_MUL:
1551 brw_MUL(p, dst, src[0], src[1]);
1552 break;
1553 case BRW_OPCODE_MACH:
1554 brw_MACH(p, dst, src[0], src[1]);
1555 break;
1556
1557 case BRW_OPCODE_MAD:
1558 assert(devinfo->gen >= 6);
1559 brw_MAD(p, dst, src[0], src[1], src[2]);
1560 break;
1561
1562 case BRW_OPCODE_FRC:
1563 brw_FRC(p, dst, src[0]);
1564 break;
1565 case BRW_OPCODE_RNDD:
1566 brw_RNDD(p, dst, src[0]);
1567 break;
1568 case BRW_OPCODE_RNDE:
1569 brw_RNDE(p, dst, src[0]);
1570 break;
1571 case BRW_OPCODE_RNDZ:
1572 brw_RNDZ(p, dst, src[0]);
1573 break;
1574
1575 case BRW_OPCODE_AND:
1576 brw_AND(p, dst, src[0], src[1]);
1577 break;
1578 case BRW_OPCODE_OR:
1579 brw_OR(p, dst, src[0], src[1]);
1580 break;
1581 case BRW_OPCODE_XOR:
1582 brw_XOR(p, dst, src[0], src[1]);
1583 break;
1584 case BRW_OPCODE_NOT:
1585 brw_NOT(p, dst, src[0]);
1586 break;
1587 case BRW_OPCODE_ASR:
1588 brw_ASR(p, dst, src[0], src[1]);
1589 break;
1590 case BRW_OPCODE_SHR:
1591 brw_SHR(p, dst, src[0], src[1]);
1592 break;
1593 case BRW_OPCODE_SHL:
1594 brw_SHL(p, dst, src[0], src[1]);
1595 break;
1596
1597 case BRW_OPCODE_CMP:
1598 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1599 break;
1600 case BRW_OPCODE_SEL:
1601 brw_SEL(p, dst, src[0], src[1]);
1602 break;
1603
1604 case BRW_OPCODE_DPH:
1605 brw_DPH(p, dst, src[0], src[1]);
1606 break;
1607
1608 case BRW_OPCODE_DP4:
1609 brw_DP4(p, dst, src[0], src[1]);
1610 break;
1611
1612 case BRW_OPCODE_DP3:
1613 brw_DP3(p, dst, src[0], src[1]);
1614 break;
1615
1616 case BRW_OPCODE_DP2:
1617 brw_DP2(p, dst, src[0], src[1]);
1618 break;
1619
1620 case BRW_OPCODE_F32TO16:
1621 assert(devinfo->gen >= 7);
1622 brw_F32TO16(p, dst, src[0]);
1623 break;
1624
1625 case BRW_OPCODE_F16TO32:
1626 assert(devinfo->gen >= 7);
1627 brw_F16TO32(p, dst, src[0]);
1628 break;
1629
1630 case BRW_OPCODE_LRP:
1631 assert(devinfo->gen >= 6);
1632 brw_LRP(p, dst, src[0], src[1], src[2]);
1633 break;
1634
1635 case BRW_OPCODE_BFREV:
1636 assert(devinfo->gen >= 7);
1637 /* BFREV only supports UD type for src and dst. */
1638 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1639 retype(src[0], BRW_REGISTER_TYPE_UD));
1640 break;
1641 case BRW_OPCODE_FBH:
1642 assert(devinfo->gen >= 7);
1643 /* FBH only supports UD type for dst. */
1644 brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1645 break;
1646 case BRW_OPCODE_FBL:
1647 assert(devinfo->gen >= 7);
1648 /* FBL only supports UD type for dst. */
1649 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1650 break;
1651 case BRW_OPCODE_LZD:
1652 brw_LZD(p, dst, src[0]);
1653 break;
1654 case BRW_OPCODE_CBIT:
1655 assert(devinfo->gen >= 7);
1656 /* CBIT only supports UD type for dst. */
1657 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1658 break;
1659 case BRW_OPCODE_ADDC:
1660 assert(devinfo->gen >= 7);
1661 brw_ADDC(p, dst, src[0], src[1]);
1662 break;
1663 case BRW_OPCODE_SUBB:
1664 assert(devinfo->gen >= 7);
1665 brw_SUBB(p, dst, src[0], src[1]);
1666 break;
1667 case BRW_OPCODE_MAC:
1668 brw_MAC(p, dst, src[0], src[1]);
1669 break;
1670
1671 case BRW_OPCODE_BFE:
1672 assert(devinfo->gen >= 7);
1673 brw_BFE(p, dst, src[0], src[1], src[2]);
1674 break;
1675
1676 case BRW_OPCODE_BFI1:
1677 assert(devinfo->gen >= 7);
1678 brw_BFI1(p, dst, src[0], src[1]);
1679 break;
1680 case BRW_OPCODE_BFI2:
1681 assert(devinfo->gen >= 7);
1682 brw_BFI2(p, dst, src[0], src[1], src[2]);
1683 break;
1684
1685 case BRW_OPCODE_IF:
1686 if (!inst->src[0].is_null()) {
1687 /* The instruction has an embedded compare (only allowed on gen6) */
1688 assert(devinfo->gen == 6);
1689 gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1690 } else {
1691 brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
1692 brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
1693 }
1694 break;
1695
1696 case BRW_OPCODE_ELSE:
1697 brw_ELSE(p);
1698 break;
1699 case BRW_OPCODE_ENDIF:
1700 brw_ENDIF(p);
1701 break;
1702
1703 case BRW_OPCODE_DO:
1704 brw_DO(p, BRW_EXECUTE_8);
1705 break;
1706
1707 case BRW_OPCODE_BREAK:
1708 brw_BREAK(p);
1709 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1710 break;
1711 case BRW_OPCODE_CONTINUE:
1712 brw_CONT(p);
1713 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1714 break;
1715
1716 case BRW_OPCODE_WHILE:
1717 brw_WHILE(p);
1718 loop_count++;
1719 break;
1720
      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 7) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
                      brw_null_reg());
         } else if (devinfo->gen == 6) {
            generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
         } else {
            generate_math1_gen4(p, inst, dst, src[0]);
         }
         break;

      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 7) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else if (devinfo->gen == 6) {
            generate_math_gen6(p, inst, dst, src[0], src[1]);
         } else {
            generate_math2_gen4(p, inst, dst, src[0], src[1]);
         }
         break;

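      /* All sampler messages funnel through generate_tex(); the surface and
       * sampler indices ride in src[1] and src[2].
       */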
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_CMS_W:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
      case SHADER_OPCODE_SAMPLEINFO:
         generate_tex(p, prog_data, nir->stage,
                      inst, dst, src[0], src[1], src[2]);
         break;

      case VS_OPCODE_URB_WRITE:
         generate_vs_urb_write(p, inst);
         break;

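      /* Scratch reads and writes move spilled registers in from and out to
       * memory; the fill/spill counters only feed the statistics reported
       * at the end of generation.
       */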
      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(p, inst, dst, src[0]);
         fill_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(p, inst, dst, src[0], src[1]);
         spill_count++;
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
         generate_set_simd4x2_header_gen9(p, inst, dst);
         break;

      case VS_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
         break;

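      /* Geometry shader opcodes.  The GS_OPCODE_SVB_* pair implements gen6
       * transform feedback via streamed vertex buffer writes; the rest
       * handle URB write setup, the FF sync handshake and channel-mask
       * bookkeeping.
       */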
      case GS_OPCODE_URB_WRITE:
         generate_gs_urb_write(p, inst);
         break;

      case GS_OPCODE_URB_WRITE_ALLOCATE:
         generate_gs_urb_write_allocate(p, inst);
         break;

      case GS_OPCODE_SVB_WRITE:
         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_SVB_SET_DST_INDEX:
         generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
         break;

      case GS_OPCODE_THREAD_END:
         generate_gs_thread_end(p, inst);
         break;

      case GS_OPCODE_SET_WRITE_OFFSET:
         generate_gs_set_write_offset(p, dst, src[0], src[1]);
         break;

      case GS_OPCODE_SET_VERTEX_COUNT:
         generate_gs_set_vertex_count(p, dst, src[0]);
         break;

      case GS_OPCODE_FF_SYNC:
         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
         generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
         break;

      case GS_OPCODE_SET_PRIMITIVE_ID:
         generate_gs_set_primitive_id(p, dst);
         break;

      case GS_OPCODE_SET_DWORD_2:
         generate_gs_set_dword_2(p, dst, src[0]);
         break;

      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
         generate_gs_prepare_channel_masks(p, dst);
         break;

      case GS_OPCODE_SET_CHANNEL_MASKS:
         generate_gs_set_channel_masks(p, dst, src[0]);
         break;

      case GS_OPCODE_GET_INSTANCE_ID:
         generate_gs_get_instance_id(p, dst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         brw_shader_time_add(p, src[0],
                             prog_data->base.binding_table.shader_time_start);
         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.shader_time_start);
         break;

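      /* Data port atomics and untyped/typed surface messages.  src[0] is
       * the message payload and src[1] the surface index; src[2] must be an
       * immediate, holding the atomic operation for the atomics and the
       * channel count for the surface reads/writes.
       */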
      case SHADER_OPCODE_UNTYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                            !inst->dst.is_null());
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                  src[2].ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
                                   src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                          !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
                                src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen,
                                 src[2].ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;

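      /* With packed dispatch every channel is known to be enabled, so an
       * all-ones mask suffices; otherwise the live channels have to be read
       * from the dispatch mask register.
       */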
      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, nir->stage,
                                          &prog_data->base) ? brw_imm_ud(~0u) :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
         generate_unpack_flags(p, dst);
         break;

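      /* With a B/UB source, the <4,1,0> region below advances four bytes
       * per element, so the MOV reads bytes 0, 4, 8, ... -- i.e. the low
       * byte of each dword channel.
       */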
      case VEC4_OPCODE_MOV_BYTES: {
         /* Moves the low byte from each channel, using an Align1 access mode
          * and a <4,1,0> source region.
          */
         assert(src[0].type == BRW_REGISTER_TYPE_UB ||
                src[0].type == BRW_REGISTER_TYPE_B);

         brw_set_default_access_mode(p, BRW_ALIGN_1);
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_1;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
         brw_MOV(p, dst, src[0]);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

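      /* Double-to-single conversion happens in two steps: the first MOV
       * converts, leaving each 32-bit result in the low dword of a 64-bit
       * slot (destination stride 2), and the second MOV packs the strided
       * results into consecutive dwords.
       */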
      case VEC4_OPCODE_FROM_DOUBLE: {
         assert(type_sz(src[0].type) == 8);
         assert(type_sz(dst.type) == 4);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
         dst.width = BRW_WIDTH_4;
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_4;
         brw_MOV(p, dst, src[0]);

         struct brw_reg dst_as_src = dst;
         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
         dst.width = BRW_WIDTH_8;
         brw_MOV(p, dst, dst_as_src);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

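      /* The inverse of the above: first spread the 32-bit source values out
       * to every other dword of a temporary aliasing dst, then convert the
       * strided values up to 64 bits.
       */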
      case VEC4_OPCODE_TO_DOUBLE: {
         assert(type_sz(src[0].type) == 4);
         assert(type_sz(dst.type) == 8);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         struct brw_reg tmp = retype(dst, src[0].type);
         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
         tmp.width = BRW_WIDTH_4;
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
         src[0].width = BRW_WIDTH_4;
         brw_MOV(p, tmp, src[0]);

         tmp.vstride = BRW_VERTICAL_STRIDE_8;
         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
         tmp.width = BRW_WIDTH_4;
         brw_MOV(p, dst, tmp);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_PICK_LOW_32BIT:
      case VEC4_OPCODE_PICK_HIGH_32BIT: {
         /* Stores the low/high 32-bit of each 64-bit element in src[0] into
          * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
          */
         assert(type_sz(src[0].type) == 8);
         assert(type_sz(dst.type) == 4);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst = retype(dst, BRW_REGISTER_TYPE_UD);
         dst.hstride = BRW_HORIZONTAL_STRIDE_1;

         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
         if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
            src[0] = suboffset(src[0], 1);
         src[0].vstride = BRW_VERTICAL_STRIDE_8;
         src[0].width = BRW_WIDTH_4;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_2;
         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_SET_LOW_32BIT:
      case VEC4_OPCODE_SET_HIGH_32BIT: {
         /* Reads consecutive 32-bit elements from src[0] and writes
          * them to the low/high 32-bit of each 64-bit element in dst.
          */
         assert(type_sz(src[0].type) == 4);
         assert(type_sz(dst.type) == 8);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst = retype(dst, BRW_REGISTER_TYPE_UD);
         if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
            dst = suboffset(dst, 1);
         dst.hstride = BRW_HORIZONTAL_STRIDE_2;

         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_4;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_PACK_BYTES: {
         /* Is effectively:
          *
          *    mov(8) dst<16,4,1>:UB src<4,1,0>:UB
          *
          * but the only regioning available on a destination is the
          * horizontal stride, so instead we have to use two instructions:
          *
          *    mov(4) dst<1>:UB     src<4,1,0>:UB
          *    mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
          *
          * which pack the four bytes from the low and the high four DWs,
          * respectively.  The NoDDClr/NoDDChk hints on the pair tell the
          * hardware the two writes touch disjoint bytes of dst, so the
          * second MOV need not wait on the first.
          */
         assert(_mesa_is_pow_two(dst.writemask) &&
                dst.writemask != 0);
         unsigned offset = __builtin_ctz(dst.writemask);

         dst.type = BRW_REGISTER_TYPE_UB;

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         src[0].type = BRW_REGISTER_TYPE_UB;
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_1;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
         dst.subnr = offset * 4;
         struct brw_inst *insn = brw_MOV(p, dst, src[0]);
         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
         brw_inst_set_no_dd_clear(p->devinfo, insn, true);
         brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);

         src[0].subnr = 16;
         dst.subnr = 16 + offset * 4;
         insn = brw_MOV(p, dst, src[0]);
         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
         brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, insn, true);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case TCS_OPCODE_URB_WRITE:
         generate_tcs_urb_write(p, inst, src[0]);
         break;

      case VEC4_OPCODE_URB_READ:
         generate_vec4_urb_read(p, inst, dst, src[0]);
         break;

      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_GET_INSTANCE_ID:
         generate_tcs_get_instance_id(p, dst);
         break;

      case TCS_OPCODE_GET_PRIMITIVE_ID:
         generate_tcs_get_primitive_id(p, dst);
         break;

      case TCS_OPCODE_CREATE_BARRIER_HEADER:
         generate_tcs_create_barrier_header(p, prog_data, dst);
         break;

      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
         generate_tes_create_input_read_header(p, dst);
         break;

      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
         generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
         break;

      case TES_OPCODE_GET_PRIMITIVE_ID:
         generate_tes_get_primitive_id(p, dst);
         break;

      case TCS_OPCODE_SRC0_010_IS_ZERO:
         /* If src_reg had stride like fs_reg, we wouldn't need this. */
         brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
         break;

      case TCS_OPCODE_RELEASE_INPUT:
         generate_tcs_release_input(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_THREAD_END:
         generate_tcs_thread_end(p, inst);
         break;

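      /* Send the barrier message, then wait on the notification register so
       * execution only resumes once every thread in the group has arrived.
       */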
      case SHADER_OPCODE_BARRIER:
         brw_barrier(p, src[0]);
         brw_WAIT(p);
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_DIM:
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      default:
         unreachable("Unsupported opcode");
      }

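      /* Most IR instructions map to exactly one native instruction, so
       * conditional modifiers and dependency-control hints can simply be
       * patched onto the instruction that was just emitted.
       */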
      if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
         /* The dependency hints were already applied when the two MOVs
          * were emitted above.
          */
         assert(!inst->conditional_mod);
      } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         brw_inst *last = &p->store[pre_emit_nr_insn];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

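   /* Jump targets (JIP/UIP) for the flow-control instructions can only be
    * filled in now that the final instruction offsets are known; the
    * assembly is then validated before compaction, while offsets still
    * match the annotations.
    */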
   brw_set_uip_jip(p, 0);
   annotation_finalize(&annotation, p->next_insn_offset);

#ifndef NDEBUG
   bool validated = brw_validate_instructions(p, 0, &annotation);
#else
   if (unlikely(debug_flag))
      brw_validate_instructions(p, 0, &annotation);
#endif

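   /* Compaction rewrites eligible 16-byte native instructions into 8-byte
    * compacted encodings; the next_insn_offset delta around the call gives
    * the size saving reported below.
    */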
   int before_size = p->next_insn_offset;
   brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann);
   int after_size = p->next_insn_offset;

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s %s shader %s:\n",
              nir->info->label ? nir->info->label : "unnamed",
              _mesa_shader_stage_to_string(nir->stage), nir->info->name);

      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
              "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, annotation.ann_count, annotation.ann,
                    p->devinfo);
      ralloc_free(annotation.mem_ctx);
   }
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, compacted %d to %d bytes.",
                              stage_abbrev, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, before_size, after_size);
}

extern "C" const unsigned *
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                           void *log_data,
                           void *mem_ctx,
                           const nir_shader *nir,
                           struct brw_vue_prog_data *prog_data,
                           const struct cfg_t *cfg,
                           unsigned *out_assembly_size)
{
   struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(compiler->devinfo, p, mem_ctx);
   brw_set_default_access_mode(p, BRW_ALIGN_16);

   generate_code(p, compiler, log_data, nir, prog_data, cfg);

   return brw_get_program(p, out_assembly_size);
}
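
/* A minimal sketch of a call site, with illustrative variable names (every
 * name here other than the function itself is an assumption about what the
 * caller has in hand):
 *
 *    unsigned assembly_size;
 *    const unsigned *assembly =
 *       brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
 *                                  nir, prog_data, cfg, &assembly_size);
 *
 * The returned buffer lives in mem_ctx, and assembly_size is in bytes.
 */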