i965: Drop mark_surface_used mechanism.
[mesa.git] / src/intel/compiler/brw_vec4_generator.cpp
1 /* Copyright © 2011 Intel Corporation
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice (including the next
11 * paragraph) shall be included in all copies or substantial portions of the
12 * Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20 * IN THE SOFTWARE.
21 */
22
23 #include "brw_vec4.h"
24 #include "brw_cfg.h"
25 #include "brw_eu.h"
26 #include "common/gen_debug.h"
27
28 using namespace brw;
29
30 static void
31 generate_math1_gen4(struct brw_codegen *p,
32 vec4_instruction *inst,
33 struct brw_reg dst,
34 struct brw_reg src)
35 {
36 gen4_math(p,
37 dst,
38 brw_math_function(inst->opcode),
39 inst->base_mrf,
40 src,
41 BRW_MATH_PRECISION_FULL);
42 }
43
44 static void
45 check_gen6_math_src_arg(struct brw_reg src)
46 {
47 /* Source swizzles are ignored. */
48 assert(!src.abs);
49 assert(!src.negate);
50 assert(src.swizzle == BRW_SWIZZLE_XYZW);
51 }
52
53 static void
54 generate_math_gen6(struct brw_codegen *p,
55 vec4_instruction *inst,
56 struct brw_reg dst,
57 struct brw_reg src0,
58 struct brw_reg src1)
59 {
60 /* Can't do writemask because math can't be align16. */
61 assert(dst.writemask == WRITEMASK_XYZW);
62 /* Source swizzles are ignored. */
63 check_gen6_math_src_arg(src0);
64 if (src1.file == BRW_GENERAL_REGISTER_FILE)
65 check_gen6_math_src_arg(src1);
66
67 brw_set_default_access_mode(p, BRW_ALIGN_1);
68 gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
69 brw_set_default_access_mode(p, BRW_ALIGN_16);
70 }
71
72 static void
73 generate_math2_gen4(struct brw_codegen *p,
74 vec4_instruction *inst,
75 struct brw_reg dst,
76 struct brw_reg src0,
77 struct brw_reg src1)
78 {
79 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
80 * "Message Payload":
81 *
82 * "Operand0[7]. For the INT DIV functions, this operand is the
83 * denominator."
84 * ...
85 * "Operand1[7]. For the INT DIV functions, this operand is the
86 * numerator."
87 */
88 bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
89 struct brw_reg &op0 = is_int_div ? src1 : src0;
90 struct brw_reg &op1 = is_int_div ? src0 : src1;
91
92 brw_push_insn_state(p);
93 brw_set_default_saturate(p, false);
94 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
95 brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
96 brw_pop_insn_state(p);
97
98 gen4_math(p,
99 dst,
100 brw_math_function(inst->opcode),
101 inst->base_mrf,
102 op0,
103 BRW_MATH_PRECISION_FULL);
104 }
105
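/* Generate a SIMD4x2 sampler message for the vec4 texturing opcodes.  When
 * both the surface and sampler indices are immediates we can emit a plain
 * SEND; otherwise the combined index is computed into a0.0 and an indirect
 * send is used.
 */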
106 static void
107 generate_tex(struct brw_codegen *p,
108 struct brw_vue_prog_data *prog_data,
109 gl_shader_stage stage,
110 vec4_instruction *inst,
111 struct brw_reg dst,
112 struct brw_reg src,
113 struct brw_reg surface_index,
114 struct brw_reg sampler_index)
115 {
116 const struct gen_device_info *devinfo = p->devinfo;
117 int msg_type = -1;
118
119 if (devinfo->gen >= 5) {
120 switch (inst->opcode) {
121 case SHADER_OPCODE_TEX:
122 case SHADER_OPCODE_TXL:
123 if (inst->shadow_compare) {
124 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
125 } else {
126 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
127 }
128 break;
129 case SHADER_OPCODE_TXD:
130 if (inst->shadow_compare) {
131 /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */
132 assert(devinfo->gen >= 8 || devinfo->is_haswell);
133 msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
134 } else {
135 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
136 }
137 break;
138 case SHADER_OPCODE_TXF:
139 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
140 break;
141 case SHADER_OPCODE_TXF_CMS_W:
142 assert(devinfo->gen >= 9);
143 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
144 break;
145 case SHADER_OPCODE_TXF_CMS:
146 if (devinfo->gen >= 7)
147 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
148 else
149 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
150 break;
151 case SHADER_OPCODE_TXF_MCS:
152 assert(devinfo->gen >= 7);
153 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
154 break;
155 case SHADER_OPCODE_TXS:
156 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
157 break;
158 case SHADER_OPCODE_TG4:
159 if (inst->shadow_compare) {
160 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
161 } else {
162 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
163 }
164 break;
165 case SHADER_OPCODE_TG4_OFFSET:
166 if (inst->shadow_compare) {
167 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
168 } else {
169 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
170 }
171 break;
172 case SHADER_OPCODE_SAMPLEINFO:
173 msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
174 break;
175 default:
176 unreachable("should not get here: invalid vec4 texture opcode");
177 }
178 } else {
179 switch (inst->opcode) {
180 case SHADER_OPCODE_TEX:
181 case SHADER_OPCODE_TXL:
182 if (inst->shadow_compare) {
183 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
184 assert(inst->mlen == 3);
185 } else {
186 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
187 assert(inst->mlen == 2);
188 }
189 break;
190 case SHADER_OPCODE_TXD:
191 /* There is no sample_d_c message; comparisons are done manually. */
192 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
193 assert(inst->mlen == 4);
194 break;
195 case SHADER_OPCODE_TXF:
196 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
197 assert(inst->mlen == 2);
198 break;
199 case SHADER_OPCODE_TXS:
200 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
201 assert(inst->mlen == 2);
202 break;
203 default:
204 unreachable("should not get here: invalid vec4 texture opcode");
205 }
206 }
207
208 assert(msg_type != -1);
209
210 assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
211
212 /* Load the message header if present. If there's a texture offset, we need
213 * to set it up explicitly and load the offset bitfield. Otherwise, we can
214 * use an implied move from g0 to the first message register.
215 */
216 if (inst->header_size != 0) {
217 if (devinfo->gen < 6 && !inst->offset) {
218 /* Set up an implied move from g0 to the MRF. */
219 src = brw_vec8_grf(0, 0);
220 } else {
221 struct brw_reg header =
222 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
223 uint32_t dw2 = 0;
224
225 /* Explicitly set up the message header by copying g0 to the MRF. */
226 brw_push_insn_state(p);
227 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
228 brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
229
230 brw_set_default_access_mode(p, BRW_ALIGN_1);
231
232 if (inst->offset)
233 /* Set the texel offset bits in DWord 2. */
234 dw2 = inst->offset;
235
236 if (devinfo->gen >= 9)
237 /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
238 * based on bit 22 in the header.
239 */
240 dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;
241
242 /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
243 * so header0.2 is 0 when g0 is copied. The HS and GS stages do
244 * not, so we must set it to 0 to avoid setting undesirable bits
245 * in the message header.
246 */
247 if (dw2 ||
248 stage == MESA_SHADER_TESS_CTRL ||
249 stage == MESA_SHADER_GEOMETRY) {
250 brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
251 }
252
253 brw_adjust_sampler_state_pointer(p, header, sampler_index);
254 brw_pop_insn_state(p);
255 }
256 }
257
258 uint32_t return_format;
259
260 switch (dst.type) {
261 case BRW_REGISTER_TYPE_D:
262 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
263 break;
264 case BRW_REGISTER_TYPE_UD:
265 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
266 break;
267 default:
268 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
269 break;
270 }
271
272 uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
273 inst->opcode == SHADER_OPCODE_TG4_OFFSET)
274 ? prog_data->base.binding_table.gather_texture_start
275 : prog_data->base.binding_table.texture_start;
276
277 if (surface_index.file == BRW_IMMEDIATE_VALUE &&
278 sampler_index.file == BRW_IMMEDIATE_VALUE) {
279 uint32_t surface = surface_index.ud;
280 uint32_t sampler = sampler_index.ud;
281
282 brw_SAMPLE(p,
283 dst,
284 inst->base_mrf,
285 src,
286 surface + base_binding_table_index,
287 sampler % 16,
288 msg_type,
289 1, /* response length */
290 inst->mlen,
291 inst->header_size != 0,
292 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
293 return_format);
294 } else {
295 /* Non-constant sampler index. */
296
297 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
298 struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
299 struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
300
301 brw_push_insn_state(p);
302 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
303 brw_set_default_access_mode(p, BRW_ALIGN_1);
304
305 if (brw_regs_equal(&surface_reg, &sampler_reg)) {
306 brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
307 } else {
308 if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
309 brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
310 } else {
311 brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
312 brw_OR(p, addr, addr, surface_reg);
313 }
314 }
315 if (base_binding_table_index)
316 brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
317 brw_AND(p, addr, addr, brw_imm_ud(0xfff));
318
319 brw_pop_insn_state(p);
320
321 if (inst->base_mrf != -1)
322 gen6_resolve_implied_move(p, &src, inst->base_mrf);
323
324 /* dst = send(offset, a0.0 | <descriptor>) */
325 brw_send_indirect_message(
326 p, BRW_SFID_SAMPLER, dst, src, addr,
327 brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
328 brw_sampler_desc(devinfo,
329 0 /* surface */,
330 0 /* sampler */,
331 msg_type,
332 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
333 return_format));
334
335 /* visitor knows more than we do about the surface limit required,
336 * so has already done marking.
337 */
338 }
339 }
340
341 static void
342 generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
343 {
344 brw_urb_WRITE(p,
345 brw_null_reg(), /* dest */
346 inst->base_mrf, /* starting mrf reg nr */
347 brw_vec8_grf(0, 0), /* src */
348 inst->urb_write_flags,
349 inst->mlen,
350 0, /* response len */
351 inst->offset, /* urb destination offset */
352 BRW_URB_SWIZZLE_INTERLEAVE);
353 }
354
355 static void
356 generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
357 {
358 struct brw_reg src = brw_message_reg(inst->base_mrf);
359 brw_urb_WRITE(p,
360 brw_null_reg(), /* dest */
361 inst->base_mrf, /* starting mrf reg nr */
362 src,
363 inst->urb_write_flags,
364 inst->mlen,
365 0, /* response len */
366 inst->offset, /* urb destination offset */
367 BRW_URB_SWIZZLE_INTERLEAVE);
368 }
369
370 static void
371 generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
372 {
373 struct brw_reg src = brw_message_reg(inst->base_mrf);
374
375 /* Use the temporary passed in src0 as the writeback register */
376 brw_urb_WRITE(p,
377 inst->src[0].as_brw_reg(), /* dest */
378 inst->base_mrf, /* starting mrf reg nr */
379 src,
380 BRW_URB_WRITE_ALLOCATE_COMPLETE,
381 inst->mlen,
382 1, /* response len */
383 inst->offset, /* urb destination offset */
384 BRW_URB_SWIZZLE_INTERLEAVE);
385
386 /* Now put allocated urb handle in dst.0 */
387 brw_push_insn_state(p);
388 brw_set_default_access_mode(p, BRW_ALIGN_1);
389 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
390 brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0),
391 get_element_ud(inst->src[0].as_brw_reg(), 0));
392 brw_pop_insn_state(p);
393 }
394
395 static void
396 generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
397 {
398 struct brw_reg src = brw_message_reg(inst->base_mrf);
399 brw_urb_WRITE(p,
400 brw_null_reg(), /* dest */
401 inst->base_mrf, /* starting mrf reg nr */
402 src,
403 BRW_URB_WRITE_EOT | inst->urb_write_flags,
404 inst->mlen,
405 0, /* response len */
406 0, /* urb destination offset */
407 BRW_URB_SWIZZLE_INTERLEAVE);
408 }
409
410 static void
411 generate_gs_set_write_offset(struct brw_codegen *p,
412 struct brw_reg dst,
413 struct brw_reg src0,
414 struct brw_reg src1)
415 {
416 /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
417 * Header: M0.3):
418 *
419 * Slot 0 Offset. This field, after adding to the Global Offset field
420 * in the message descriptor, specifies the offset (in 256-bit units)
421 * from the start of the URB entry, as referenced by URB Handle 0, at
422 * which the data will be accessed.
423 *
424 * Similar text describes DWORD M0.4, which is slot 1 offset.
425 *
426 * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
427 * of the register for geometry shader invocations 0 and 1) by the
428 * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
429 *
430 * We can do this with the following EU instruction:
431 *
432 * mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all }
433 */
434 brw_push_insn_state(p);
435 brw_set_default_access_mode(p, BRW_ALIGN_1);
436 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
437 assert(p->devinfo->gen >= 7 &&
438 src1.file == BRW_IMMEDIATE_VALUE &&
439 src1.type == BRW_REGISTER_TYPE_UD &&
440 src1.ud <= USHRT_MAX);
441 if (src0.file == BRW_IMMEDIATE_VALUE) {
442 brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
443 brw_imm_ud(src0.ud * src1.ud));
444 } else {
445 brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
446 retype(src1, BRW_REGISTER_TYPE_UW));
447 }
448 brw_pop_insn_state(p);
449 }
450
451 static void
452 generate_gs_set_vertex_count(struct brw_codegen *p,
453 struct brw_reg dst,
454 struct brw_reg src)
455 {
456 brw_push_insn_state(p);
457 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
458
459 if (p->devinfo->gen >= 8) {
460 /* Move the vertex count into the second MRF for the EOT write. */
461 brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
462 src);
463 } else {
464 /* If we think of the src and dst registers as composed of 8 DWORDs each,
465 * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
466 * them to WORDs, and then pack them into DWORD 2 of dst.
467 *
468 * It's easier to get the EU to do this if we think of the src and dst
469 * registers as composed of 16 WORDS each; then, we want to pick up the
470 * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
471 * of dst.
472 *
473 * We can do that by the following EU instruction:
474 *
475 * mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask }
476 */
477 brw_set_default_access_mode(p, BRW_ALIGN_1);
478 brw_MOV(p,
479 suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
480 stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
481 }
482 brw_pop_insn_state(p);
483 }
484
485 static void
486 generate_gs_svb_write(struct brw_codegen *p,
487 struct brw_vue_prog_data *prog_data,
488 vec4_instruction *inst,
489 struct brw_reg dst,
490 struct brw_reg src0,
491 struct brw_reg src1)
492 {
493 int binding = inst->sol_binding;
494 bool final_write = inst->sol_final_write;
495
496 brw_push_insn_state(p);
497 brw_set_default_exec_size(p, BRW_EXECUTE_4);
498 /* Copy Vertex data into M0.x */
499 brw_MOV(p, stride(dst, 4, 4, 1),
500 stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
501 brw_pop_insn_state(p);
502
503 brw_push_insn_state(p);
504 /* Send SVB Write */
505 brw_svb_write(p,
506 final_write ? src1 : brw_null_reg(), /* dest == src1 */
507 1, /* msg_reg_nr */
508 dst, /* src0 == previous dst */
509 BRW_GEN6_SOL_BINDING_START + binding, /* binding_table_index */
510 final_write); /* send_commit_msg */
511
512 /* Finally, wait for the write commit to occur so that we can proceed to
513 * other things safely.
514 *
515 * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
516 *
517 * The write commit does not modify the destination register, but
518 * merely clears the dependency associated with the destination
519 * register. Thus, a simple “mov” instruction using the register as a
520 * source is sufficient to wait for the write commit to occur.
521 */
522 if (final_write) {
523 brw_MOV(p, src1, src1);
524 }
525 brw_pop_insn_state(p);
526 }
527
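/* Copy the destination index for SOL vertex inst->sol_vertex from src into
 * dword 5 of dst (the SVB write message payload), using an Align1 NoMask MOV
 * so only that channel is written.
 */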
528 static void
529 generate_gs_svb_set_destination_index(struct brw_codegen *p,
530 vec4_instruction *inst,
531 struct brw_reg dst,
532 struct brw_reg src)
533 {
534 int vertex = inst->sol_vertex;
535 brw_push_insn_state(p);
536 brw_set_default_access_mode(p, BRW_ALIGN_1);
537 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
538 brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
539 brw_pop_insn_state(p);
540 }
541
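/* Copy the first dword of src into dword 2 of dst with an Align1 NoMask MOV,
 * leaving the other channels of dst untouched.
 */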
542 static void
543 generate_gs_set_dword_2(struct brw_codegen *p,
544 struct brw_reg dst,
545 struct brw_reg src)
546 {
547 brw_push_insn_state(p);
548 brw_set_default_access_mode(p, BRW_ALIGN_1);
549 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
550 brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
551 brw_pop_insn_state(p);
552 }
553
554 static void
555 generate_gs_prepare_channel_masks(struct brw_codegen *p,
556 struct brw_reg dst)
557 {
558 /* We want to left shift just DWORD 4 (the x component belonging to the
559 * second geometry shader invocation) by 4 bits. So generate the
560 * instruction:
561 *
562 * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
563 */
564 dst = suboffset(vec1(dst), 4);
565 brw_push_insn_state(p);
566 brw_set_default_access_mode(p, BRW_ALIGN_1);
567 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
568 brw_SHL(p, dst, dst, brw_imm_ud(4));
569 brw_pop_insn_state(p);
570 }
571
572 static void
573 generate_gs_set_channel_masks(struct brw_codegen *p,
574 struct brw_reg dst,
575 struct brw_reg src)
576 {
577 /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
578 * Header: M0.5):
579 *
580 * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
581 *
582 * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
583 * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
584 * Vertex 0 DATA[7]. This bit is ANDed with the corresponding
585 * channel enable to determine the final channel enable. For the
586 * URB_READ_OWORD & URB_READ_HWORD messages, when final channel
587 * enable is 1 it indicates that Vertex 1 DATA [3] will be included
588 * in the writeback message. For the URB_WRITE_OWORD &
589 * URB_WRITE_HWORD messages, when final channel enable is 1 it
590 * indicates that Vertex 1 DATA [3] will be written to the surface.
591 *
592 * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
593 * 1: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel included
594 *
595 * 14 Vertex 1 DATA [2] Channel Mask
596 * 13 Vertex 1 DATA [1] Channel Mask
597 * 12 Vertex 1 DATA [0] Channel Mask
598 * 11 Vertex 0 DATA [3] Channel Mask
599 * 10 Vertex 0 DATA [2] Channel Mask
600 * 9 Vertex 0 DATA [1] Channel Mask
601 * 8 Vertex 0 DATA [0] Channel Mask
602 *
603 * (This is from a section of the PRM that is agnostic to the particular
604 * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
605 * geometry shader invocations 0 and 1, respectively). Since we have the
606 * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
607 * and the enable flags for geometry shader invocation 1 in bits 7:4 of
608 * DWORD 4, we just need to OR them together and store the result in bits
609 * 15:8 of DWORD 5.
610 *
611 * It's easier to get the EU to do this if we think of the src and dst
612 * registers as composed of 32 bytes each; then, we want to pick up the
613 * contents of bytes 0 and 16 from src, OR them together, and store them in
614 * byte 21.
615 *
616 * We can do that by the following EU instruction:
617 *
618 * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
619 *
620 * Note: this relies on the source register having zeros in (a) bits 7:4 of
621 * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the
622 * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
623 * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
624 * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
625 * contain valid channel mask values (which are in the range 0x0-0xf).
626 */
627 dst = retype(dst, BRW_REGISTER_TYPE_UB);
628 src = retype(src, BRW_REGISTER_TYPE_UB);
629 brw_push_insn_state(p);
630 brw_set_default_access_mode(p, BRW_ALIGN_1);
631 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
632 brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
633 brw_pop_insn_state(p);
634 }
635
636 static void
637 generate_gs_get_instance_id(struct brw_codegen *p,
638 struct brw_reg dst)
639 {
640 /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
641 * and store into dst.0 & dst.4. So generate the instruction:
642 *
643 * shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
644 */
645 brw_push_insn_state(p);
646 brw_set_default_access_mode(p, BRW_ALIGN_1);
647 dst = retype(dst, BRW_REGISTER_TYPE_UD);
648 struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
649 brw_SHR(p, dst, stride(r0, 1, 4, 0),
650 brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
651 brw_pop_insn_state(p);
652 }
653
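/* Pack the primitive counts for the FF_SYNC message header: src0 ends up in
 * the high 16 bits of dst.0 and src1 in the low 16 bits.  Note that src2 is
 * used as a temporary and is clobbered.
 */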
654 static void
655 generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
656 struct brw_reg dst,
657 struct brw_reg src0,
658 struct brw_reg src1,
659 struct brw_reg src2)
660 {
661 brw_push_insn_state(p);
662 brw_set_default_access_mode(p, BRW_ALIGN_1);
663 /* Save src0 data in 16:31 bits of dst.0 */
664 brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
665 brw_imm_ud(0xffffu));
666 brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
667 /* Save src1 data in 0:15 bits of dst.0 */
668 brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
669 brw_imm_ud(0xffffu));
670 brw_OR(p, suboffset(vec1(dst), 0),
671 suboffset(vec1(dst), 0),
672 suboffset(vec1(src2), 0));
673 brw_pop_insn_state(p);
674 }
675
676 static void
677 generate_gs_ff_sync(struct brw_codegen *p,
678 vec4_instruction *inst,
679 struct brw_reg dst,
680 struct brw_reg src0,
681 struct brw_reg src1)
682 {
683 /* This opcode uses an implied MRF register for:
684 * - the header of the ff_sync message. And as such it is expected to be
685 * initialized to r0 before calling here.
686 * - the destination where we will write the allocated URB handle.
687 */
688 struct brw_reg header =
689 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
690
691 /* Overwrite dword 0 of the header (SO vertices to write) and
692 * dword 1 (number of primitives written).
693 */
694 brw_push_insn_state(p);
695 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
696 brw_set_default_access_mode(p, BRW_ALIGN_1);
697 brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
698 brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
699 brw_pop_insn_state(p);
700
701 /* Allocate URB handle in dst */
702 brw_ff_sync(p,
703 dst,
704 0,
705 header,
706 1, /* allocate */
707 1, /* response length */
708 0 /* eot */);
709
710 /* Now put allocated urb handle in header.0 */
711 brw_push_insn_state(p);
712 brw_set_default_access_mode(p, BRW_ALIGN_1);
713 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
714 brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));
715
716 /* src1 is not an immediate when we use transform feedback */
717 if (src1.file != BRW_IMMEDIATE_VALUE) {
718 brw_set_default_exec_size(p, BRW_EXECUTE_4);
719 brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
720 }
721
722 brw_pop_insn_state(p);
723 }
724
725 static void
726 generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
727 {
728 /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
729 struct brw_reg src = brw_vec8_grf(0, 0);
730 brw_push_insn_state(p);
731 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
732 brw_set_default_access_mode(p, BRW_ALIGN_1);
733 brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
734 brw_pop_insn_state(p);
735 }
736
737 static void
738 generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
739 {
740 const struct gen_device_info *devinfo = p->devinfo;
741 const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
742
743 /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
744 *
745 * Since we operate in SIMD4x2 mode, we need run half as many threads
746 * Since we operate in SIMD4x2 mode, we need to run half as many threads
747 * shift right by one less to accomplish the multiplication by two.
748 */
749 dst = retype(dst, BRW_REGISTER_TYPE_UD);
750 struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
751
752 brw_push_insn_state(p);
753 brw_set_default_access_mode(p, BRW_ALIGN_1);
754
755 const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
756 const int shift = ivb ? 16 : 17;
757
758 brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
759 brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
760 brw_imm_ud(shift - 1));
761 brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));
762
763 brw_pop_insn_state(p);
764 }
765
766 static void
767 generate_tcs_urb_write(struct brw_codegen *p,
768 vec4_instruction *inst,
769 struct brw_reg urb_header)
770 {
771 const struct gen_device_info *devinfo = p->devinfo;
772
773 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
774 brw_set_dest(p, send, brw_null_reg());
775 brw_set_src0(p, send, urb_header);
776 brw_set_desc(p, send, brw_message_desc(devinfo, inst->mlen, 0, true));
777
778 brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
779 brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
780 brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
781 if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
782 brw_inst_set_eot(devinfo, send, 1);
783 } else {
784 brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
785 brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
786 }
787
788 /* what happens to swizzles? */
789 }
790
791
792 static void
793 generate_tcs_input_urb_offsets(struct brw_codegen *p,
794 struct brw_reg dst,
795 struct brw_reg vertex,
796 struct brw_reg offset)
797 {
798 /* Generates an URB read/write message header for HS/DS operation.
799 * Inputs are a vertex index, and a byte offset from the beginning of
800 * the vertex. */
801
802 /* If `vertex` is not an immediate, we clobber a0.0 */
803
804 assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
805 assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);
806
807 assert(dst.file == BRW_GENERAL_REGISTER_FILE);
808
809 brw_push_insn_state(p);
810 brw_set_default_access_mode(p, BRW_ALIGN_1);
811 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
812 brw_MOV(p, dst, brw_imm_ud(0));
813
814 /* m0.5 bits 8-15 are channel enables */
815 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
816
817 /* m0.0-0.1: URB handles */
818 if (vertex.file == BRW_IMMEDIATE_VALUE) {
819 uint32_t vertex_index = vertex.ud;
820 struct brw_reg index_reg = brw_vec1_grf(
821 1 + (vertex_index >> 3), vertex_index & 7);
822
823 brw_MOV(p, vec2(get_element_ud(dst, 0)),
824 retype(index_reg, BRW_REGISTER_TYPE_UD));
825 } else {
826 /* Use indirect addressing. ICP Handles are DWords (single channels
827 * of a register) and start at g1.0.
828 *
829 * In order to start our region at g1.0, we add 8 to the vertex index,
830 * effectively skipping over the 8 channels in g0.0. This gives us a
831 * DWord offset to the ICP Handle.
832 *
833 * Indirect addressing works in terms of bytes, so we then multiply
834 * the DWord offset by 4 (by shifting left by 2).
835 */
836 struct brw_reg addr = brw_address_reg(0);
837
838 /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
839 brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW),
840 brw_imm_uw(0x8));
841 brw_SHL(p, addr, addr, brw_imm_uw(2));
842 brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));
843
844 /* top half: m0.1 = g[1.0 + vertex.4]UD */
845 brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW),
846 brw_imm_uw(0x8));
847 brw_SHL(p, addr, addr, brw_imm_uw(2));
848 brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
849 }
850
851 /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
852 if (offset.file != ARF)
853 brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
854
855 brw_pop_insn_state(p);
856 }
857
858
859 static void
860 generate_tcs_output_urb_offsets(struct brw_codegen *p,
861 struct brw_reg dst,
862 struct brw_reg write_mask,
863 struct brw_reg offset)
864 {
865 /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
866 assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);
867
868 assert(write_mask.file == BRW_IMMEDIATE_VALUE);
869 assert(write_mask.type == BRW_REGISTER_TYPE_UD);
870
871 brw_push_insn_state(p);
872
873 brw_set_default_access_mode(p, BRW_ALIGN_1);
874 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
875 brw_MOV(p, dst, brw_imm_ud(0));
876
877 unsigned mask = write_mask.ud;
878
879 /* m0.5 bits 15:12 and 11:8 are channel enables */
880 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));
881
882 /* HS patch URB handle is delivered in r0.0 */
883 struct brw_reg urb_handle = brw_vec1_grf(0, 0);
884
885 /* m0.0-0.1: URB handles */
886 brw_MOV(p, vec2(get_element_ud(dst, 0)),
887 retype(urb_handle, BRW_REGISTER_TYPE_UD));
888
889 /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
890 if (offset.file != ARF)
891 brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
892
893 brw_pop_insn_state(p);
894 }
895
896 static void
897 generate_tes_create_input_read_header(struct brw_codegen *p,
898 struct brw_reg dst)
899 {
900 brw_push_insn_state(p);
901 brw_set_default_access_mode(p, BRW_ALIGN_1);
902 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
903
904 /* Initialize the register to 0 */
905 brw_MOV(p, dst, brw_imm_ud(0));
906
907 /* Enable all the channels in m0.5 bits 15:8 */
908 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
909
910 /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1. For safety,
911 * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
912 */
913 brw_AND(p, vec2(get_element_ud(dst, 0)),
914 retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
915 brw_imm_ud(0x1fff));
916 brw_pop_insn_state(p);
917 }
918
919 static void
920 generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
921 struct brw_reg dst,
922 struct brw_reg header,
923 struct brw_reg offset)
924 {
925 brw_push_insn_state(p);
926 brw_set_default_access_mode(p, BRW_ALIGN_1);
927 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
928
929 brw_MOV(p, dst, header);
930
931 /* Uniforms will have a stride <0;4,1>, and we need to convert to <0;1,0>.
932 * Other values get <4;1,0>.
933 */
934 struct brw_reg restrided_offset;
935 if (offset.vstride == BRW_VERTICAL_STRIDE_0 &&
936 offset.width == BRW_WIDTH_4 &&
937 offset.hstride == BRW_HORIZONTAL_STRIDE_1) {
938 restrided_offset = stride(offset, 0, 1, 0);
939 } else {
940 restrided_offset = stride(offset, 4, 1, 0);
941 }
942
943 /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
944 brw_MOV(p, vec2(get_element_ud(dst, 3)), restrided_offset);
945
946 brw_pop_insn_state(p);
947 }
948
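/* Emit a single URB read: an OWord read through the URB shared function with
 * interleaved swizzling and the per-slot offset enabled, returning one
 * register into dst at the global offset given by inst->offset.
 */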
949 static void
950 generate_vec4_urb_read(struct brw_codegen *p,
951 vec4_instruction *inst,
952 struct brw_reg dst,
953 struct brw_reg header)
954 {
955 const struct gen_device_info *devinfo = p->devinfo;
956
957 assert(header.file == BRW_GENERAL_REGISTER_FILE);
958 assert(header.type == BRW_REGISTER_TYPE_UD);
959
960 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
961 brw_set_dest(p, send, dst);
962 brw_set_src0(p, send, header);
963
964 brw_set_desc(p, send, brw_message_desc(devinfo, 1, 1, true));
965
966 brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
967 brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
968 brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
969 brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
970
971 brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
972 }
973
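/* Release the ICP handles for an input vertex once the HS thread is done
 * with it.  A header is built from the handles delivered starting at g1 and
 * a URB OWord read is sent with the "complete" bit set; is_unpaired selects
 * NOSWIZZLE instead of INTERLEAVE for a final unpaired vertex.
 */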
974 static void
975 generate_tcs_release_input(struct brw_codegen *p,
976 struct brw_reg header,
977 struct brw_reg vertex,
978 struct brw_reg is_unpaired)
979 {
980 const struct gen_device_info *devinfo = p->devinfo;
981
982 assert(vertex.file == BRW_IMMEDIATE_VALUE);
983 assert(vertex.type == BRW_REGISTER_TYPE_UD);
984
985 /* m0.0-0.1: URB handles */
986 struct brw_reg urb_handles =
987 retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
988 BRW_REGISTER_TYPE_UD);
989
990 brw_push_insn_state(p);
991 brw_set_default_access_mode(p, BRW_ALIGN_1);
992 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
993 brw_MOV(p, header, brw_imm_ud(0));
994 brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
995 brw_pop_insn_state(p);
996
997 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
998 brw_set_dest(p, send, brw_null_reg());
999 brw_set_src0(p, send, header);
1000 brw_set_desc(p, send, brw_message_desc(devinfo, 1, 0, true));
1001
1002 brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
1003 brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
1004 brw_inst_set_urb_complete(devinfo, send, 1);
1005 brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
1006 BRW_URB_SWIZZLE_NONE :
1007 BRW_URB_SWIZZLE_INTERLEAVE);
1008 }
1009
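/* Final URB write for an HS thread: build a header addressing the patch URB
 * entry (handle in r0.0) with only the X channel mask enabled, write a dword
 * of zero, and set EOT to terminate the thread.
 */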
1010 static void
1011 generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
1012 {
1013 struct brw_reg header = brw_message_reg(inst->base_mrf);
1014
1015 brw_push_insn_state(p);
1016 brw_set_default_access_mode(p, BRW_ALIGN_1);
1017 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1018 brw_MOV(p, header, brw_imm_ud(0));
1019 brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
1020 brw_MOV(p, get_element_ud(header, 0),
1021 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
1022 brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u));
1023 brw_pop_insn_state(p);
1024
1025 brw_urb_WRITE(p,
1026 brw_null_reg(), /* dest */
1027 inst->base_mrf, /* starting mrf reg nr */
1028 header,
1029 BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
1030 BRW_URB_WRITE_USE_CHANNEL_MASKS,
1031 inst->mlen,
1032 0, /* response len */
1033 0, /* urb destination offset */
1034 0);
1035 }
1036
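/* The DS thread payload delivers PrimitiveID in g1.7; copy it into dst. */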
1037 static void
1038 generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
1039 {
1040 brw_push_insn_state(p);
1041 brw_set_default_access_mode(p, BRW_ALIGN_1);
1042 brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
1043 brw_pop_insn_state(p);
1044 }
1045
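/* The HS thread payload delivers PrimitiveID in r0.1; copy it into dst. */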
1046 static void
1047 generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
1048 {
1049 brw_push_insn_state(p);
1050 brw_set_default_access_mode(p, BRW_ALIGN_1);
1051 brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
1052 brw_pop_insn_state(p);
1053 }
1054
1055 static void
1056 generate_tcs_create_barrier_header(struct brw_codegen *p,
1057 struct brw_vue_prog_data *prog_data,
1058 struct brw_reg dst)
1059 {
1060 const struct gen_device_info *devinfo = p->devinfo;
1061 const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
1062 struct brw_reg m0_2 = get_element_ud(dst, 2);
1063 unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;
1064
1065 brw_push_insn_state(p);
1066 brw_set_default_access_mode(p, BRW_ALIGN_1);
1067 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1068
1069 /* Zero the message header */
1070 brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
1071
1072 /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */
1073 brw_AND(p, m0_2,
1074 retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
1075 brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));
1076
1077 /* Shift it up to bits 27:24. */
1078 brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11));
1079
1080 /* Set the Barrier Count and the enable bit */
1081 brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));
1082
1083 brw_pop_insn_state(p);
1084 }
1085
1086 static void
1087 generate_oword_dual_block_offsets(struct brw_codegen *p,
1088 struct brw_reg m1,
1089 struct brw_reg index)
1090 {
1091 int second_vertex_offset;
1092
1093 if (p->devinfo->gen >= 6)
1094 second_vertex_offset = 1;
1095 else
1096 second_vertex_offset = 16;
1097
1098 m1 = retype(m1, BRW_REGISTER_TYPE_D);
1099
1100 /* Set up M1 (message payload). Only the block offsets in M1.0 and
1101 * M1.4 are used, and the rest are ignored.
1102 */
1103 struct brw_reg m1_0 = suboffset(vec1(m1), 0);
1104 struct brw_reg m1_4 = suboffset(vec1(m1), 4);
1105 struct brw_reg index_0 = suboffset(vec1(index), 0);
1106 struct brw_reg index_4 = suboffset(vec1(index), 4);
1107
1108 brw_push_insn_state(p);
1109 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1110 brw_set_default_access_mode(p, BRW_ALIGN_1);
1111
1112 brw_MOV(p, m1_0, index_0);
1113
1114 if (index.file == BRW_IMMEDIATE_VALUE) {
1115 index_4.ud += second_vertex_offset;
1116 brw_MOV(p, m1_4, index_4);
1117 } else {
1118 brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
1119 }
1120
1121 brw_pop_insn_state(p);
1122 }
1123
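/* Unpack the flag register into the two SIMD4x2 halves of dst: bits 3:0 of
 * f0.0 go to dst.0 and bits 7:4, shifted down by 4, go to dst.4, so each
 * invocation gets its own 4-bit mask.
 */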
1124 static void
1125 generate_unpack_flags(struct brw_codegen *p,
1126 struct brw_reg dst)
1127 {
1128 brw_push_insn_state(p);
1129 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1130 brw_set_default_access_mode(p, BRW_ALIGN_1);
1131
1132 struct brw_reg flags = brw_flag_reg(0, 0);
1133 struct brw_reg dst_0 = suboffset(vec1(dst), 0);
1134 struct brw_reg dst_4 = suboffset(vec1(dst), 4);
1135
1136 brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
1137 brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
1138 brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));
1139
1140 brw_pop_insn_state(p);
1141 }
1142
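/* Read one register back from the scratch buffer using an OWord dual block
 * read, with per-vertex block offsets derived from index.
 */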
1143 static void
1144 generate_scratch_read(struct brw_codegen *p,
1145 vec4_instruction *inst,
1146 struct brw_reg dst,
1147 struct brw_reg index)
1148 {
1149 const struct gen_device_info *devinfo = p->devinfo;
1150 struct brw_reg header = brw_vec8_grf(0, 0);
1151
1152 gen6_resolve_implied_move(p, &header, inst->base_mrf);
1153
1154 generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
1155 index);
1156
1157 uint32_t msg_type;
1158
1159 if (devinfo->gen >= 6)
1160 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1161 else if (devinfo->gen == 5 || devinfo->is_g4x)
1162 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1163 else
1164 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1165
1166 const unsigned target_cache =
1167 devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
1168 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
1169 BRW_SFID_DATAPORT_READ;
1170
1171 /* Each of the 8 channel enables is considered for whether each
1172 * dword is written.
1173 */
1174 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1175 brw_inst_set_sfid(devinfo, send, target_cache);
1176 brw_set_dest(p, send, dst);
1177 brw_set_src0(p, send, header);
1178 if (devinfo->gen < 6)
1179 brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
1180 brw_set_desc(p, send,
1181 brw_message_desc(devinfo, 2, 1, true) |
1182 brw_dp_read_desc(devinfo,
1183 brw_scratch_surface_idx(p),
1184 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1185 msg_type, BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
1186 }
1187
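/* Spill one register to the scratch buffer with an OWord dual block write.
 * Pre-Gen6 a write commit reply is requested so that later scratch reads can
 * be ordered against this write (see the comment below).
 */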
1188 static void
1189 generate_scratch_write(struct brw_codegen *p,
1190 vec4_instruction *inst,
1191 struct brw_reg dst,
1192 struct brw_reg src,
1193 struct brw_reg index)
1194 {
1195 const struct gen_device_info *devinfo = p->devinfo;
1196 const unsigned target_cache =
1197 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
1198 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
1199 BRW_SFID_DATAPORT_WRITE);
1200 struct brw_reg header = brw_vec8_grf(0, 0);
1201 bool write_commit;
1202
1203 /* If the instruction is predicated, we'll predicate the send, not
1204 * the header setup.
1205 */
1206 brw_set_default_predicate_control(p, false);
1207
1208 gen6_resolve_implied_move(p, &header, inst->base_mrf);
1209
1210 generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
1211 index);
1212
1213 brw_MOV(p,
1214 retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
1215 retype(src, BRW_REGISTER_TYPE_D));
1216
1217 uint32_t msg_type;
1218
1219 if (devinfo->gen >= 7)
1220 msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
1221 else if (devinfo->gen == 6)
1222 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
1223 else
1224 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
1225
1226 brw_set_default_predicate_control(p, inst->predicate);
1227
1228 /* Pre-gen6, we have to specify write commits to ensure ordering
1229 * between reads and writes within a thread. Afterwards, that's
1230 * guaranteed and write commits only matter for inter-thread
1231 * synchronization.
1232 */
1233 if (devinfo->gen >= 6) {
1234 write_commit = false;
1235 } else {
1236 /* The visitor set up our destination register to be g0. This
1237 * means that when the next read comes along, we will end up
1238 * reading from g0 and causing a block on the write commit. For
1239 * write-after-read, we are relying on the value of the previous
1240 * read being used (and thus blocking on completion) before our
1241 * write is executed. This means we have to be careful in
1242 * instruction scheduling to not violate this assumption.
1243 */
1244 write_commit = true;
1245 }
1246
1247 /* Each of the 8 channel enables is considered for whether each
1248 * dword is written.
1249 */
1250 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1251 brw_inst_set_sfid(p->devinfo, send, target_cache);
1252 brw_set_dest(p, send, dst);
1253 brw_set_src0(p, send, header);
1254 if (devinfo->gen < 6)
1255 brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
1256 brw_set_desc(p, send,
1257 brw_message_desc(devinfo, 3, write_commit, true) |
1258 brw_dp_write_desc(devinfo,
1259 brw_scratch_surface_idx(p),
1260 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1261 msg_type,
1262 false, /* not a render target write */
1263 write_commit));
1264 }
1265
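/* Load a pull constant with an OWord dual block read from surface index.ud.
 * On Gen6+ the byte offset is converted to OWords (>> 4) before being placed
 * in the message payload; older gens take the offset as-is.
 */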
1266 static void
1267 generate_pull_constant_load(struct brw_codegen *p,
1268 struct brw_vue_prog_data *prog_data,
1269 vec4_instruction *inst,
1270 struct brw_reg dst,
1271 struct brw_reg index,
1272 struct brw_reg offset)
1273 {
1274 const struct gen_device_info *devinfo = p->devinfo;
1275 const unsigned target_cache =
1276 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
1277 BRW_SFID_DATAPORT_READ);
1278 assert(index.file == BRW_IMMEDIATE_VALUE &&
1279 index.type == BRW_REGISTER_TYPE_UD);
1280 uint32_t surf_index = index.ud;
1281
1282 struct brw_reg header = brw_vec8_grf(0, 0);
1283
1284 gen6_resolve_implied_move(p, &header, inst->base_mrf);
1285
1286 if (devinfo->gen >= 6) {
1287 if (offset.file == BRW_IMMEDIATE_VALUE) {
1288 brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
1289 BRW_REGISTER_TYPE_D),
1290 brw_imm_d(offset.ud >> 4));
1291 } else {
1292 brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1),
1293 BRW_REGISTER_TYPE_D),
1294 offset, brw_imm_d(4));
1295 }
1296 } else {
1297 brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
1298 BRW_REGISTER_TYPE_D),
1299 offset);
1300 }
1301
1302 uint32_t msg_type;
1303
1304 if (devinfo->gen >= 6)
1305 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1306 else if (devinfo->gen == 5 || devinfo->is_g4x)
1307 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1308 else
1309 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1310
1311 /* Each of the 8 channel enables is considered for whether each
1312 * dword is written.
1313 */
1314 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1315 brw_inst_set_sfid(devinfo, send, target_cache);
1316 brw_set_dest(p, send, dst);
1317 brw_set_src0(p, send, header);
1318 if (devinfo->gen < 6)
1319 brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
1320 brw_set_desc(p, send,
1321 brw_message_desc(devinfo, 2, 1, true) |
1322 brw_dp_read_desc(devinfo, surf_index,
1323 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1324 msg_type,
1325 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
1326 }
1327
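/* Query a buffer's size by sending a resinfo sampler message to the given
 * surface; the result is returned to dst as signed 32-bit values.
 */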
1328 static void
1329 generate_get_buffer_size(struct brw_codegen *p,
1330 struct brw_vue_prog_data *prog_data,
1331 vec4_instruction *inst,
1332 struct brw_reg dst,
1333 struct brw_reg src,
1334 struct brw_reg surf_index)
1335 {
1336 assert(p->devinfo->gen >= 7);
1337 assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
1338 surf_index.file == BRW_IMMEDIATE_VALUE);
1339
1340 brw_SAMPLE(p,
1341 dst,
1342 inst->base_mrf,
1343 src,
1344 surf_index.ud,
1345 0,
1346 GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
1347 1, /* response length */
1348 inst->mlen,
1349 inst->header_size > 0,
1350 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
1351 BRW_SAMPLER_RETURN_FORMAT_SINT32);
1352 }
1353
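/* Gen7+ pull constant loads are LD sampler messages.  With an immediate
 * surface index we emit a direct SEND; otherwise the surface index is masked
 * into a0.0 and an indirect send is used.
 */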
1354 static void
1355 generate_pull_constant_load_gen7(struct brw_codegen *p,
1356 struct brw_vue_prog_data *prog_data,
1357 vec4_instruction *inst,
1358 struct brw_reg dst,
1359 struct brw_reg surf_index,
1360 struct brw_reg offset)
1361 {
1362 const struct gen_device_info *devinfo = p->devinfo;
1363 assert(surf_index.type == BRW_REGISTER_TYPE_UD);
1364
1365 if (surf_index.file == BRW_IMMEDIATE_VALUE) {
1366
1367 brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
1368 brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
1369 brw_set_dest(p, insn, dst);
1370 brw_set_src0(p, insn, offset);
1371 brw_set_desc(p, insn,
1372 brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
1373 brw_sampler_desc(devinfo, surf_index.ud,
1374 0, /* LD message ignores sampler unit */
1375 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1376 BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0));
1377 } else {
1378
1379 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1380
1381 brw_push_insn_state(p);
1382 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1383 brw_set_default_access_mode(p, BRW_ALIGN_1);
1384
1385 /* a0.0 = surf_index & 0xff */
1386 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1387 brw_inst_set_exec_size(devinfo, insn_and, BRW_EXECUTE_1);
1388 brw_set_dest(p, insn_and, addr);
1389 brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
1390 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1391
1392 brw_pop_insn_state(p);
1393
1394 /* dst = send(offset, a0.0 | <descriptor>) */
1395 brw_send_indirect_message(
1396 p, BRW_SFID_SAMPLER, dst, offset, addr,
1397 brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
1398 brw_sampler_desc(devinfo,
1399 0 /* surface */,
1400 0 /* sampler */,
1401 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1402 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
1403 0));
1404 }
1405 }
1406
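/* Build a Skylake sampler message header in dst: copy g0 and set the SIMD4x2
 * extension bit in dword 2, which SKL+ uses to distinguish SIMD4x2 from
 * SIMD8D for the shared descriptor encoding.
 */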
1407 static void
1408 generate_set_simd4x2_header_gen9(struct brw_codegen *p,
1409 vec4_instruction *,
1410 struct brw_reg dst)
1411 {
1412 brw_push_insn_state(p);
1413 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1414
1415 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1416 brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1417
1418 brw_set_default_access_mode(p, BRW_ALIGN_1);
1419 brw_MOV(p, get_element_ud(dst, 2),
1420 brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
1421
1422 brw_pop_insn_state(p);
1423 }
1424
1425 static void
1426 generate_mov_indirect(struct brw_codegen *p,
1427 vec4_instruction *,
1428 struct brw_reg dst, struct brw_reg reg,
1429 struct brw_reg indirect)
1430 {
1431 assert(indirect.type == BRW_REGISTER_TYPE_UD);
1432 assert(p->devinfo->gen >= 6);
1433
1434 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
1435
1436 /* This instruction acts in align1 mode */
1437 assert(dst.writemask == WRITEMASK_XYZW);
1438
1439 if (indirect.file == BRW_IMMEDIATE_VALUE) {
1440 imm_byte_offset += indirect.ud;
1441
1442 reg.nr = imm_byte_offset / REG_SIZE;
1443 reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
1444 unsigned shift = (imm_byte_offset / 4) % 4;
1445 reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
1446
1447 brw_MOV(p, dst, reg);
1448 } else {
1449 brw_push_insn_state(p);
1450 brw_set_default_access_mode(p, BRW_ALIGN_1);
1451 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1452
1453 struct brw_reg addr = vec8(brw_address_reg(0));
1454
1455 /* We need to move the indirect value into the address register. In
1456 * order to make things make some sense, we want to respect at least the
1457 * X component of the swizzle. In order to do that, we need to convert
1458 * the subnr (probably 0) to an align1 subnr and add in the swizzle.
1459 */
1460 assert(brw_is_single_value_swizzle(indirect.swizzle));
1461 indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));
1462
1463 /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
1464 * the indirect and splat it out to all four channels of the given half
1465 * of a0.
1466 */
1467 indirect.subnr *= 2;
1468 indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
1469 brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));
1470
1471 /* Now we need to incorporate the swizzle from the source register */
1472 if (reg.swizzle != BRW_SWIZZLE_XXXX) {
1473 uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
1474 BRW_GET_SWZ(reg.swizzle, 1) << 6 |
1475 BRW_GET_SWZ(reg.swizzle, 2) << 10 |
1476 BRW_GET_SWZ(reg.swizzle, 3) << 14;
1477 uv_swiz |= uv_swiz << 16;
1478
1479 brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
1480 }
1481
1482 brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));
1483
1484 brw_pop_insn_state(p);
1485 }
1486 }
1487
1488 static void
1489 generate_code(struct brw_codegen *p,
1490 const struct brw_compiler *compiler,
1491 void *log_data,
1492 const nir_shader *nir,
1493 struct brw_vue_prog_data *prog_data,
1494 const struct cfg_t *cfg)
1495 {
1496 const struct gen_device_info *devinfo = p->devinfo;
1497 const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
1498 bool debug_flag = INTEL_DEBUG &
1499 intel_debug_flag_for_shader_stage(nir->info.stage);
1500 struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
1501 int spill_count = 0, fill_count = 0;
1502 int loop_count = 0;
1503
1504 foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
1505 struct brw_reg src[3], dst;
1506
1507 if (unlikely(debug_flag))
1508 disasm_annotate(disasm_info, inst, p->next_insn_offset);
1509
1510 for (unsigned int i = 0; i < 3; i++) {
1511 src[i] = inst->src[i].as_brw_reg();
1512 }
1513 dst = inst->dst.as_brw_reg();
1514
1515 brw_set_default_predicate_control(p, inst->predicate);
1516 brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1517 brw_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2);
1518 brw_set_default_saturate(p, inst->saturate);
1519 brw_set_default_mask_control(p, inst->force_writemask_all);
1520 brw_set_default_acc_write_control(p, inst->writes_accumulator);
1521
1522 assert(inst->group % inst->exec_size == 0);
1523 assert(inst->group % 4 == 0);
1524
1525 /* There are some instructions where the destination is 64-bit
1526 * but we retype it to a smaller type. In that case, we cannot
1527 * double the exec_size.
1528 */
1529 const bool is_df = (get_exec_type_size(inst) == 8 ||
1530 inst->dst.type == BRW_REGISTER_TYPE_DF) &&
1531 inst->opcode != VEC4_OPCODE_PICK_LOW_32BIT &&
1532 inst->opcode != VEC4_OPCODE_PICK_HIGH_32BIT &&
1533 inst->opcode != VEC4_OPCODE_SET_LOW_32BIT &&
1534 inst->opcode != VEC4_OPCODE_SET_HIGH_32BIT;
1535
1536 unsigned exec_size = inst->exec_size;
1537 if (devinfo->gen == 7 && !devinfo->is_haswell && is_df)
1538 exec_size *= 2;
1539
1540 brw_set_default_exec_size(p, cvt(exec_size) - 1);
1541
1542 if (!inst->force_writemask_all)
1543 brw_set_default_group(p, inst->group);
1544
1545 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
1546 assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1547
1548 unsigned pre_emit_nr_insn = p->nr_insn;
1549
1550 switch (inst->opcode) {
1551 case VEC4_OPCODE_UNPACK_UNIFORM:
1552 case BRW_OPCODE_MOV:
1553 brw_MOV(p, dst, src[0]);
1554 break;
1555 case BRW_OPCODE_ADD:
1556 brw_ADD(p, dst, src[0], src[1]);
1557 break;
1558 case BRW_OPCODE_MUL:
1559 brw_MUL(p, dst, src[0], src[1]);
1560 break;
1561 case BRW_OPCODE_MACH:
1562 brw_MACH(p, dst, src[0], src[1]);
1563 break;
1564
1565 case BRW_OPCODE_MAD:
1566 assert(devinfo->gen >= 6);
1567 brw_MAD(p, dst, src[0], src[1], src[2]);
1568 break;
1569
1570 case BRW_OPCODE_FRC:
1571 brw_FRC(p, dst, src[0]);
1572 break;
1573 case BRW_OPCODE_RNDD:
1574 brw_RNDD(p, dst, src[0]);
1575 break;
1576 case BRW_OPCODE_RNDE:
1577 brw_RNDE(p, dst, src[0]);
1578 break;
1579 case BRW_OPCODE_RNDZ:
1580 brw_RNDZ(p, dst, src[0]);
1581 break;
1582
1583 case BRW_OPCODE_AND:
1584 brw_AND(p, dst, src[0], src[1]);
1585 break;
1586 case BRW_OPCODE_OR:
1587 brw_OR(p, dst, src[0], src[1]);
1588 break;
1589 case BRW_OPCODE_XOR:
1590 brw_XOR(p, dst, src[0], src[1]);
1591 break;
1592 case BRW_OPCODE_NOT:
1593 brw_NOT(p, dst, src[0]);
1594 break;
1595 case BRW_OPCODE_ASR:
1596 brw_ASR(p, dst, src[0], src[1]);
1597 break;
1598 case BRW_OPCODE_SHR:
1599 brw_SHR(p, dst, src[0], src[1]);
1600 break;
1601 case BRW_OPCODE_SHL:
1602 brw_SHL(p, dst, src[0], src[1]);
1603 break;
1604
1605 case BRW_OPCODE_CMP:
1606 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1607 break;
1608 case BRW_OPCODE_SEL:
1609 brw_SEL(p, dst, src[0], src[1]);
1610 break;
1611
1612 case BRW_OPCODE_DPH:
1613 brw_DPH(p, dst, src[0], src[1]);
1614 break;
1615
1616 case BRW_OPCODE_DP4:
1617 brw_DP4(p, dst, src[0], src[1]);
1618 break;
1619
1620 case BRW_OPCODE_DP3:
1621 brw_DP3(p, dst, src[0], src[1]);
1622 break;
1623
1624 case BRW_OPCODE_DP2:
1625 brw_DP2(p, dst, src[0], src[1]);
1626 break;
1627
1628 case BRW_OPCODE_F32TO16:
1629 assert(devinfo->gen >= 7);
1630 brw_F32TO16(p, dst, src[0]);
1631 break;
1632
1633 case BRW_OPCODE_F16TO32:
1634 assert(devinfo->gen >= 7);
1635 brw_F16TO32(p, dst, src[0]);
1636 break;
1637
1638 case BRW_OPCODE_LRP:
1639 assert(devinfo->gen >= 6);
1640 brw_LRP(p, dst, src[0], src[1], src[2]);
1641 break;
1642
1643 case BRW_OPCODE_BFREV:
1644 assert(devinfo->gen >= 7);
1645 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1646 retype(src[0], BRW_REGISTER_TYPE_UD));
1647 break;
1648 case BRW_OPCODE_FBH:
1649 assert(devinfo->gen >= 7);
1650 brw_FBH(p, retype(dst, src[0].type), src[0]);
1651 break;
1652 case BRW_OPCODE_FBL:
1653 assert(devinfo->gen >= 7);
1654 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
1655 retype(src[0], BRW_REGISTER_TYPE_UD));
1656 break;
1657 case BRW_OPCODE_LZD:
1658 brw_LZD(p, dst, src[0]);
1659 break;
1660 case BRW_OPCODE_CBIT:
1661 assert(devinfo->gen >= 7);
1662 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
1663 retype(src[0], BRW_REGISTER_TYPE_UD));
1664 break;
1665 case BRW_OPCODE_ADDC:
1666 assert(devinfo->gen >= 7);
1667 brw_ADDC(p, dst, src[0], src[1]);
1668 break;
1669 case BRW_OPCODE_SUBB:
1670 assert(devinfo->gen >= 7);
1671 brw_SUBB(p, dst, src[0], src[1]);
1672 break;
1673 case BRW_OPCODE_MAC:
1674 brw_MAC(p, dst, src[0], src[1]);
1675 break;
1676
1677 case BRW_OPCODE_BFE:
1678 assert(devinfo->gen >= 7);
1679 brw_BFE(p, dst, src[0], src[1], src[2]);
1680 break;
1681
1682 case BRW_OPCODE_BFI1:
1683 assert(devinfo->gen >= 7);
1684 brw_BFI1(p, dst, src[0], src[1]);
1685 break;
1686 case BRW_OPCODE_BFI2:
1687 assert(devinfo->gen >= 7);
1688 brw_BFI2(p, dst, src[0], src[1], src[2]);
1689 break;
1690
1691 case BRW_OPCODE_IF:
1692 if (!inst->src[0].is_null()) {
1693 /* The instruction has an embedded compare (only allowed on gen6) */
1694 assert(devinfo->gen == 6);
1695 gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1696 } else {
1697 brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
1698 brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
1699 }
1700 break;
1701
1702 case BRW_OPCODE_ELSE:
1703 brw_ELSE(p);
1704 break;
1705 case BRW_OPCODE_ENDIF:
1706 brw_ENDIF(p);
1707 break;
1708
1709 case BRW_OPCODE_DO:
1710 brw_DO(p, BRW_EXECUTE_8);
1711 break;
1712
1713 case BRW_OPCODE_BREAK:
1714 brw_BREAK(p);
1715 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1716 break;
1717 case BRW_OPCODE_CONTINUE:
1718 brw_CONT(p);
1719 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1720 break;
1721
1722 case BRW_OPCODE_WHILE:
1723 brw_WHILE(p);
1724 loop_count++;
1725 break;
1726
1727 case SHADER_OPCODE_RCP:
1728 case SHADER_OPCODE_RSQ:
1729 case SHADER_OPCODE_SQRT:
1730 case SHADER_OPCODE_EXP2:
1731 case SHADER_OPCODE_LOG2:
1732 case SHADER_OPCODE_SIN:
1733 case SHADER_OPCODE_COS:
1734 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
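/* Rough summary of the dispatch below: Gen7+ can issue the math
 * instruction directly in Align16 mode, Gen6 requires Align1 (handled in
 * generate_math_gen6), and Gen4/5 go through the MRF-based math message.
 */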
1735 if (devinfo->gen >= 7) {
1736 gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
1737 brw_null_reg());
1738 } else if (devinfo->gen == 6) {
1739 generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
1740 } else {
1741 generate_math1_gen4(p, inst, dst, src[0]);
1742 }
1743 break;
1744
1745 case SHADER_OPCODE_POW:
1746 case SHADER_OPCODE_INT_QUOTIENT:
1747 case SHADER_OPCODE_INT_REMAINDER:
1748 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1749 if (devinfo->gen >= 7) {
1750 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1751 } else if (devinfo->gen == 6) {
1752 generate_math_gen6(p, inst, dst, src[0], src[1]);
1753 } else {
1754 generate_math2_gen4(p, inst, dst, src[0], src[1]);
1755 }
1756 break;
1757
1758 case SHADER_OPCODE_TEX:
1759 case SHADER_OPCODE_TXD:
1760 case SHADER_OPCODE_TXF:
1761 case SHADER_OPCODE_TXF_CMS:
1762 case SHADER_OPCODE_TXF_CMS_W:
1763 case SHADER_OPCODE_TXF_MCS:
1764 case SHADER_OPCODE_TXL:
1765 case SHADER_OPCODE_TXS:
1766 case SHADER_OPCODE_TG4:
1767 case SHADER_OPCODE_TG4_OFFSET:
1768 case SHADER_OPCODE_SAMPLEINFO:
1769 generate_tex(p, prog_data, nir->info.stage,
1770 inst, dst, src[0], src[1], src[2]);
1771 break;
1772
1773 case SHADER_OPCODE_GET_BUFFER_SIZE:
1774 generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
1775 break;
1776
1777 case VS_OPCODE_URB_WRITE:
1778 generate_vs_urb_write(p, inst);
1779 break;
1780
1781 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1782 generate_scratch_read(p, inst, dst, src[0]);
1783 fill_count++;
1784 break;
1785
1786 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1787 generate_scratch_write(p, inst, dst, src[0], src[1]);
1788 spill_count++;
1789 break;
1790
1791 case VS_OPCODE_PULL_CONSTANT_LOAD:
1792 generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
1793 break;
1794
1795 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
1796 generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
1797 break;
1798
1799 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
1800 generate_set_simd4x2_header_gen9(p, inst, dst);
1801 break;
1802
1803 case GS_OPCODE_URB_WRITE:
1804 generate_gs_urb_write(p, inst);
1805 break;
1806
1807 case GS_OPCODE_URB_WRITE_ALLOCATE:
1808 generate_gs_urb_write_allocate(p, inst);
1809 break;
1810
1811 case GS_OPCODE_SVB_WRITE:
1812 generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
1813 break;
1814
1815 case GS_OPCODE_SVB_SET_DST_INDEX:
1816 generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
1817 break;
1818
1819 case GS_OPCODE_THREAD_END:
1820 generate_gs_thread_end(p, inst);
1821 break;
1822
1823 case GS_OPCODE_SET_WRITE_OFFSET:
1824 generate_gs_set_write_offset(p, dst, src[0], src[1]);
1825 break;
1826
1827 case GS_OPCODE_SET_VERTEX_COUNT:
1828 generate_gs_set_vertex_count(p, dst, src[0]);
1829 break;
1830
1831 case GS_OPCODE_FF_SYNC:
1832 generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
1833 break;
1834
1835 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
1836 generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
1837 break;
1838
1839 case GS_OPCODE_SET_PRIMITIVE_ID:
1840 generate_gs_set_primitive_id(p, dst);
1841 break;
1842
1843 case GS_OPCODE_SET_DWORD_2:
1844 generate_gs_set_dword_2(p, dst, src[0]);
1845 break;
1846
1847 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
1848 generate_gs_prepare_channel_masks(p, dst);
1849 break;
1850
1851 case GS_OPCODE_SET_CHANNEL_MASKS:
1852 generate_gs_set_channel_masks(p, dst, src[0]);
1853 break;
1854
1855 case GS_OPCODE_GET_INSTANCE_ID:
1856 generate_gs_get_instance_id(p, dst);
1857 break;
1858
1859 case SHADER_OPCODE_SHADER_TIME_ADD:
1860 brw_shader_time_add(p, src[0],
1861 prog_data->base.binding_table.shader_time_start);
1862 break;
1863
1864 case SHADER_OPCODE_UNTYPED_ATOMIC:
1865 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1866 brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
1867 !inst->dst.is_null(), inst->header_size);
1868 break;
1869
1870 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1871 assert(!inst->header_size);
1872 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1873 brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
1874 src[2].ud);
1875 break;
1876
1877 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1878 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1879 brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
1880 src[2].ud, inst->header_size);
1881 break;
1882
1883 case SHADER_OPCODE_TYPED_ATOMIC:
1884 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1885 brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
1886 !inst->dst.is_null(), inst->header_size);
1887 break;
1888
1889 case SHADER_OPCODE_TYPED_SURFACE_READ:
1890 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1891 brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
1892 src[2].ud, inst->header_size);
1893 break;
1894
1895 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1896 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1897 brw_typed_surface_write(p, src[0], src[1], inst->mlen,
1898 src[2].ud, inst->header_size);
1899 break;
1900
1901 case SHADER_OPCODE_MEMORY_FENCE:
1902 brw_memory_fence(p, dst, BRW_OPCODE_SEND);
1903 break;
1904
1905 case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
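/* When dispatch is known to be packed, the live channels start at
 * channel 0, so an all-ones immediate can stand in for the dispatch
 * mask register.
 */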
1906 const struct brw_reg mask =
1907 brw_stage_has_packed_dispatch(devinfo, nir->info.stage,
1908 &prog_data->base) ? brw_imm_ud(~0u) :
1909 brw_dmask_reg();
1910 brw_find_live_channel(p, dst, mask);
1911 break;
1912 }
1913
1914 case SHADER_OPCODE_BROADCAST:
1915 assert(inst->force_writemask_all);
1916 brw_broadcast(p, dst, src[0], src[1]);
1917 break;
1918
1919 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
1920 generate_unpack_flags(p, dst);
1921 break;
1922
1923 case VEC4_OPCODE_MOV_BYTES: {
1924 /* Moves the low byte from each channel, using an Align1 access mode
1925 * and a <4,1,0> source region.
1926 */
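/* With a <4,1,0>:UB region each execution channel reads a single byte
 * located one dword (4 bytes) apart, i.e. the low byte of each 32-bit
 * channel of the source.
 */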
1927 assert(src[0].type == BRW_REGISTER_TYPE_UB ||
1928 src[0].type == BRW_REGISTER_TYPE_B);
1929
1930 brw_set_default_access_mode(p, BRW_ALIGN_1);
1931 src[0].vstride = BRW_VERTICAL_STRIDE_4;
1932 src[0].width = BRW_WIDTH_1;
1933 src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
1934 brw_MOV(p, dst, src[0]);
1935 brw_set_default_access_mode(p, BRW_ALIGN_16);
1936 break;
1937 }
1938
1939 case VEC4_OPCODE_DOUBLE_TO_F32:
1940 case VEC4_OPCODE_DOUBLE_TO_D32:
1941 case VEC4_OPCODE_DOUBLE_TO_U32: {
1942 assert(type_sz(src[0].type) == 8);
1943 assert(type_sz(dst.type) == 8);
1944
1945 brw_reg_type dst_type;
1946
1947 switch (inst->opcode) {
1948 case VEC4_OPCODE_DOUBLE_TO_F32:
1949 dst_type = BRW_REGISTER_TYPE_F;
1950 break;
1951 case VEC4_OPCODE_DOUBLE_TO_D32:
1952 dst_type = BRW_REGISTER_TYPE_D;
1953 break;
1954 case VEC4_OPCODE_DOUBLE_TO_U32:
1955 dst_type = BRW_REGISTER_TYPE_UD;
1956 break;
1957 default:
1958 unreachable("Unsupported conversion");
1959 }
1960 dst = retype(dst, dst_type);
1961
1962 brw_set_default_access_mode(p, BRW_ALIGN_1);
1963
1964 /* When converting from DF->F, we set the destination's stride to 2 as an
1965 * alignment requirement. But on IVB/BYT, each DF conversion implicitly
1966 * writes two floats, the first one being the converted value, so there we
1967 * set a stride of 1 instead of 2.
1968 */
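/* Sketch of the resulting destination regioning (for an F-typed
 * destination):
 *
 *    HSW and later:  dst<2>:F - converted values land in the low dword
 *                               of each 64-bit slot
 *    IVB/BYT:        dst<1>:F - the conversion already writes two floats
 *                               per DF, so stride 1 is enough
 */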
1969 struct brw_reg spread_dst;
1970 if (devinfo->gen == 7 && !devinfo->is_haswell)
1971 spread_dst = stride(dst, 8, 4, 1);
1972 else
1973 spread_dst = stride(dst, 8, 4, 2);
1974
1975 brw_MOV(p, spread_dst, src[0]);
1976
1977 brw_set_default_access_mode(p, BRW_ALIGN_16);
1978 break;
1979 }
1980
1981 case VEC4_OPCODE_TO_DOUBLE: {
1982 assert(type_sz(src[0].type) == 4);
1983 assert(type_sz(dst.type) == 8);
1984
1985 brw_set_default_access_mode(p, BRW_ALIGN_1);
1986
1987 brw_MOV(p, dst, src[0]);
1988
1989 brw_set_default_access_mode(p, BRW_ALIGN_16);
1990 break;
1991 }
1992
1993 case VEC4_OPCODE_PICK_LOW_32BIT:
1994 case VEC4_OPCODE_PICK_HIGH_32BIT: {
1995 /* Stores the low/high 32 bits of each 64-bit element in src[0] into
1996 * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
1997 */
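/* Roughly, the high variant becomes
 *
 *    mov dst<1>:UD src.1<8,4,2>:UD
 *
 * in Align1 mode; the low variant is the same without the .1 source
 * offset.
 */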
1998 assert(type_sz(src[0].type) == 8);
1999 assert(type_sz(dst.type) == 4);
2000
2001 brw_set_default_access_mode(p, BRW_ALIGN_1);
2002
2003 dst = retype(dst, BRW_REGISTER_TYPE_UD);
2004 dst.hstride = BRW_HORIZONTAL_STRIDE_1;
2005
2006 src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
2007 if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
2008 src[0] = suboffset(src[0], 1);
2009 src[0] = spread(src[0], 2);
2010 brw_MOV(p, dst, src[0]);
2011
2012 brw_set_default_access_mode(p, BRW_ALIGN_16);
2013 break;
2014 }
2015
2016 case VEC4_OPCODE_SET_LOW_32BIT:
2017 case VEC4_OPCODE_SET_HIGH_32BIT: {
2018 /* Reads consecutive 32-bit elements from src[0] and writes
2019 * them to the low/high 32 bits of each 64-bit element in dst.
2020 */
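/* The destination gets a stride of 2 (plus a one-dword offset for the
 * HIGH variant), so consecutive source dwords land in alternating dword
 * halves of the 64-bit destination channels.
 */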
2021 assert(type_sz(src[0].type) == 4);
2022 assert(type_sz(dst.type) == 8);
2023
2024 brw_set_default_access_mode(p, BRW_ALIGN_1);
2025
2026 dst = retype(dst, BRW_REGISTER_TYPE_UD);
2027 if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
2028 dst = suboffset(dst, 1);
2029 dst.hstride = BRW_HORIZONTAL_STRIDE_2;
2030
2031 src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
2032 brw_MOV(p, dst, src[0]);
2033
2034 brw_set_default_access_mode(p, BRW_ALIGN_16);
2035 break;
2036 }
2037
2038 case VEC4_OPCODE_PACK_BYTES: {
2039 /* Is effectively:
2040 *
2041 * mov(8) dst<16,4,1>:UB src<4,1,0>:UB
2042 *
2043 * but a destination's only regioning parameter is its horizontal stride,
2044 * so instead we have to use two instructions:
2045 *
2046 * mov(4) dst<1>:UB src<4,1,0>:UB
2047 * mov(4) dst.16<1>:UB src.16<4,1,0>:UB
2048 *
2049 * which together pack the four bytes from the low and the high four DWords.
2050 */
2051 assert(_mesa_is_pow_two(dst.writemask) &&
2052 dst.writemask != 0);
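/* The single enabled writemask channel selects which dword of each
 * destination half receives the four packed bytes.
 */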
2053 unsigned offset = __builtin_ctz(dst.writemask);
2054
2055 dst.type = BRW_REGISTER_TYPE_UB;
2056
2057 brw_set_default_access_mode(p, BRW_ALIGN_1);
2058
2059 src[0].type = BRW_REGISTER_TYPE_UB;
2060 src[0].vstride = BRW_VERTICAL_STRIDE_4;
2061 src[0].width = BRW_WIDTH_1;
2062 src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
2063 dst.subnr = offset * 4;
2064 struct brw_inst *insn = brw_MOV(p, dst, src[0]);
2065 brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
2066 brw_inst_set_no_dd_clear(p->devinfo, insn, true);
2067 brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);
2068
2069 src[0].subnr = 16;
2070 dst.subnr = 16 + offset * 4;
2071 insn = brw_MOV(p, dst, src[0]);
2072 brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
2073 brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
2074 brw_inst_set_no_dd_check(p->devinfo, insn, true);
2075
2076 brw_set_default_access_mode(p, BRW_ALIGN_16);
2077 break;
2078 }
2079
2080 case TCS_OPCODE_URB_WRITE:
2081 generate_tcs_urb_write(p, inst, src[0]);
2082 break;
2083
2084 case VEC4_OPCODE_URB_READ:
2085 generate_vec4_urb_read(p, inst, dst, src[0]);
2086 break;
2087
2088 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
2089 generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
2090 break;
2091
2092 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
2093 generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
2094 break;
2095
2096 case TCS_OPCODE_GET_INSTANCE_ID:
2097 generate_tcs_get_instance_id(p, dst);
2098 break;
2099
2100 case TCS_OPCODE_GET_PRIMITIVE_ID:
2101 generate_tcs_get_primitive_id(p, dst);
2102 break;
2103
2104 case TCS_OPCODE_CREATE_BARRIER_HEADER:
2105 generate_tcs_create_barrier_header(p, prog_data, dst);
2106 break;
2107
2108 case TES_OPCODE_CREATE_INPUT_READ_HEADER:
2109 generate_tes_create_input_read_header(p, dst);
2110 break;
2111
2112 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
2113 generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
2114 break;
2115
2116 case TES_OPCODE_GET_PRIMITIVE_ID:
2117 generate_tes_get_primitive_id(p, dst);
2118 break;
2119
2120 case TCS_OPCODE_SRC0_010_IS_ZERO:
2121 /* If src_reg had stride like fs_reg, we wouldn't need this. */
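/* The <0,1,0> region replicates src0.x; the MOV to the null register
 * exists only so the instruction's conditional mod (applied in the fixup
 * after this switch) can set the flag register from that value.
 */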
2122 brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
2123 break;
2124
2125 case TCS_OPCODE_RELEASE_INPUT:
2126 generate_tcs_release_input(p, dst, src[0], src[1]);
2127 break;
2128
2129 case TCS_OPCODE_THREAD_END:
2130 generate_tcs_thread_end(p, inst);
2131 break;
2132
2133 case SHADER_OPCODE_BARRIER:
2134 brw_barrier(p, src[0]);
2135 brw_WAIT(p);
2136 break;
2137
2138 case SHADER_OPCODE_MOV_INDIRECT:
2139 generate_mov_indirect(p, inst, dst, src[0], src[1]);
2140 break;
2141
2142 case BRW_OPCODE_DIM:
2143 assert(devinfo->is_haswell);
2144 assert(src[0].type == BRW_REGISTER_TYPE_DF);
2145 assert(dst.type == BRW_REGISTER_TYPE_DF);
2146 brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
2147 break;
2148
2149 default:
2150 unreachable("Unsupported opcode");
2151 }
2152
2153 if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
2154 /* The dependency hints were already applied above when emitting the two MOVs. */
2155
2156 assert(!inst->conditional_mod);
2157 } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2158 assert(p->nr_insn == pre_emit_nr_insn + 1 ||
2159 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2160 "emitting more than 1 instruction");
2161
2162 brw_inst *last = &p->store[pre_emit_nr_insn];
2163
2164 if (inst->conditional_mod)
2165 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2166 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2167 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2168 }
2169 }
2170
2171 brw_set_uip_jip(p, 0);
2172
2173 /* end of program sentinel */
2174 disasm_new_inst_group(disasm_info, p->next_insn_offset);
2175
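/* In debug builds the validation result is captured and asserted after
 * the (optional) disassembly dump; in NDEBUG builds validation only runs
 * when disassembly was requested, purely to annotate the dump.
 */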
2176 #ifndef NDEBUG
2177 bool validated =
2178 #else
2179 if (unlikely(debug_flag))
2180 #endif
2181 brw_validate_instructions(devinfo, p->store,
2182 0, p->next_insn_offset,
2183 disasm_info);
2184
2185 int before_size = p->next_insn_offset;
2186 brw_compact_instructions(p, 0, disasm_info);
2187 int after_size = p->next_insn_offset;
2188
2189 if (unlikely(debug_flag)) {
2190 fprintf(stderr, "Native code for %s %s shader %s:\n",
2191 nir->info.label ? nir->info.label : "unnamed",
2192 _mesa_shader_stage_to_string(nir->info.stage), nir->info.name);
2193
2194 fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
2195 "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
2196 stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
2197 spill_count, fill_count, before_size, after_size,
2198 100.0f * (before_size - after_size) / before_size);
2199
2200 dump_assembly(p->store, disasm_info);
2201 }
2202 ralloc_free(disasm_info);
2203 assert(validated);
2204
2205 compiler->shader_debug_log(log_data,
2206 "%s vec4 shader: %d inst, %d loops, %u cycles, "
2207 "%d:%d spills:fills, compacted %d to %d bytes.",
2208 stage_abbrev, before_size / 16,
2209 loop_count, cfg->cycle_count, spill_count,
2210 fill_count, before_size, after_size);
2211
2212 }
2213
2214 extern "C" const unsigned *
2215 brw_vec4_generate_assembly(const struct brw_compiler *compiler,
2216 void *log_data,
2217 void *mem_ctx,
2218 const nir_shader *nir,
2219 struct brw_vue_prog_data *prog_data,
2220 const struct cfg_t *cfg)
2221 {
2222 struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
2223 brw_init_codegen(compiler->devinfo, p, mem_ctx);
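/* vec4 code is emitted in Align16 mode by default; individual opcodes in
 * generate_code() temporarily switch to Align1 where needed.
 */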
2224 brw_set_default_access_mode(p, BRW_ALIGN_16);
2225
2226 generate_code(p, compiler, log_data, nir, prog_data, cfg);
2227
2228 return brw_get_program(p, &prog_data->base.program_size);
2229 }