i965: support gl_InvocationID for gen7
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_generator.cpp
/* Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"

extern "C" {
#include "brw_eu.h"
#include "main/macros.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
};

namespace brw {

struct brw_reg
vec4_instruction::get_dst(void)
{
   struct brw_reg brw_reg;

   switch (dst.file) {
   case GRF:
      brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
      brw_reg = retype(brw_reg, dst.type);
      brw_reg.dw1.bits.writemask = dst.writemask;
      break;

   case MRF:
      brw_reg = brw_message_reg(dst.reg + dst.reg_offset);
      brw_reg = retype(brw_reg, dst.type);
      brw_reg.dw1.bits.writemask = dst.writemask;
      break;

   case HW_REG:
      assert(dst.type == dst.fixed_hw_reg.type);
      brw_reg = dst.fixed_hw_reg;
      break;

   case BAD_FILE:
      brw_reg = brw_null_reg();
      break;

   default:
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   }
   return brw_reg;
}

struct brw_reg
vec4_instruction::get_src(const struct brw_vec4_prog_data *prog_data, int i)
{
   struct brw_reg brw_reg;

   switch (src[i].file) {
   case GRF:
      brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0);
      brw_reg = retype(brw_reg, src[i].type);
      brw_reg.dw1.bits.swizzle = src[i].swizzle;
      if (src[i].abs)
         brw_reg = brw_abs(brw_reg);
      if (src[i].negate)
         brw_reg = negate(brw_reg);
      break;

   case IMM:
      switch (src[i].type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(src[i].imm.f);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(src[i].imm.i);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(src[i].imm.u);
         break;
      default:
         assert(!"not reached");
         brw_reg = brw_null_reg();
         break;
      }
      break;

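   /* Uniforms are packed two vec4s to a GRF (eight dwords), so the
    * quotient below selects the GRF and the remainder selects the low
    * or high vec4 within it.  For example, uniform slot 5 lands in the
    * high half (suboffset 4) of GRF dispatch_grf_start_reg + 2.
    */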
   case UNIFORM:
      brw_reg = stride(brw_vec4_grf(prog_data->dispatch_grf_start_reg +
                                    (src[i].reg + src[i].reg_offset) / 2,
                                    ((src[i].reg + src[i].reg_offset) % 2) * 4),
                       0, 4, 1);
      brw_reg = retype(brw_reg, src[i].type);
      brw_reg.dw1.bits.swizzle = src[i].swizzle;
      if (src[i].abs)
         brw_reg = brw_abs(brw_reg);
      if (src[i].negate)
         brw_reg = negate(brw_reg);

      /* This should have been moved to pull constants. */
      assert(!src[i].reladdr);
      break;

   case HW_REG:
      assert(src[i].type == src[i].fixed_hw_reg.type);
      brw_reg = src[i].fixed_hw_reg;
      break;

   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   default:
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   }

   return brw_reg;
}

vec4_generator::vec4_generator(struct brw_context *brw,
                               struct gl_shader_program *shader_prog,
                               struct gl_program *prog,
                               struct brw_vec4_prog_data *prog_data,
                               void *mem_ctx,
                               bool debug_flag)
   : brw(brw), shader_prog(shader_prog), prog(prog), prog_data(prog_data),
     mem_ctx(mem_ctx), debug_flag(debug_flag)
{
   p = rzalloc(mem_ctx, struct brw_compile);
   brw_init_compile(brw, p, mem_ctx);
}

vec4_generator::~vec4_generator()
{
}

void
vec4_generator::generate_math1_gen4(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src)
{
   brw_math(p,
            dst,
            brw_math_function(inst->opcode),
            inst->base_mrf,
            src,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}

static void
check_gen6_math_src_arg(struct brw_reg src)
{
   /* Source swizzles are ignored. */
   assert(!src.abs);
   assert(!src.negate);
   assert(src.dw1.bits.swizzle == BRW_SWIZZLE_XYZW);
}

void
vec4_generator::generate_math1_gen6(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src)
{
   /* Can't do writemask because math can't be align16. */
   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
   check_gen6_math_src_arg(src);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
            dst,
            brw_math_function(inst->opcode),
            inst->base_mrf,
            src,
            BRW_MATH_DATA_SCALAR,
            BRW_MATH_PRECISION_FULL);
   brw_set_access_mode(p, BRW_ALIGN_16);
}

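/* On Gen7, MATH executes like an ordinary ALU instruction in Align16 mode,
 * so unlike Gen6 no Align1 workaround (and hence no writemask or swizzle
 * restriction) is needed.
 */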
void
vec4_generator::generate_math2_gen7(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src0,
                                    struct brw_reg src1)
{
   brw_math2(p,
             dst,
             brw_math_function(inst->opcode),
             src0, src1);
}

void
vec4_generator::generate_math2_gen6(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src0,
                                    struct brw_reg src1)
{
   /* Can't do writemask because math can't be align16. */
   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
   /* Source swizzles are ignored. */
   check_gen6_math_src_arg(src0);
   check_gen6_math_src_arg(src1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
             dst,
             brw_math_function(inst->opcode),
             src0, src1);
   brw_set_access_mode(p, BRW_ALIGN_16);
}

void
vec4_generator::generate_math2_gen4(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src0,
                                    struct brw_reg src1)
{
   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    * "Message Payload":
    *
    *     "Operand0[7].  For the INT DIV functions, this operand is the
    *      denominator."
    *     ...
    *     "Operand1[7].  For the INT DIV functions, this operand is the
    *      numerator."
    */
   bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
   struct brw_reg &op0 = is_int_div ? src1 : src0;
   struct brw_reg &op1 = is_int_div ? src0 : src1;

   brw_push_insn_state(p);
   brw_set_saturate(p, false);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
   brw_pop_insn_state(p);

   brw_math(p,
            dst,
            brw_math_function(inst->opcode),
            inst->base_mrf,
            op0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}

void
vec4_generator::generate_tex(vec4_instruction *inst,
                             struct brw_reg dst,
                             struct brw_reg src)
{
   int msg_type = -1;

   if (brw->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(brw->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (brw->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      default:
         assert(!"should not get here: invalid vec4 texture opcode");
         break;
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually. */
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
         break;
      case SHADER_OPCODE_TXF:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
         break;
      case SHADER_OPCODE_TXS:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         break;
      default:
         assert(!"should not get here: invalid vec4 texture opcode");
         break;
      }
   }

   assert(msg_type != -1);

   /* Load the message header if present.  If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield.  Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->header_present) {
      if (brw->gen < 6 && !inst->texture_offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
      } else {
         struct brw_reg header =
            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_push_insn_state(p);
         brw_set_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

         brw_set_access_mode(p, BRW_ALIGN_1);

         if (inst->texture_offset) {
            /* Set the texel offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header, 2),
                    brw_imm_ud(inst->texture_offset));
         }

         if (inst->sampler >= 16) {
            /* The "Sampler Index" field can only store values between 0 and 15.
             * However, we can add an offset to the "Sampler State Pointer"
             * field, effectively selecting a different set of 16 samplers.
             *
             * The "Sampler State Pointer" needs to be aligned to a 32-byte
             * offset, and each sampler state is only 16-bytes, so we can't
             * exclusively use the offset - we have to use both.
             */
            assert(brw->is_haswell); /* field only exists on Haswell */
            brw_ADD(p,
                    get_element_ud(header, 3),
                    get_element_ud(brw_vec8_grf(0, 0), 3),
                    brw_imm_ud(16 * (inst->sampler / 16) *
                               sizeof(gen7_sampler_state)));
         }
         brw_pop_insn_state(p);
      }
   }

   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   uint32_t surface_index = ((inst->opcode == SHADER_OPCODE_TG4 ||
                              inst->opcode == SHADER_OPCODE_TG4_OFFSET)
                             ? prog_data->base.binding_table.gather_texture_start
                             : prog_data->base.binding_table.texture_start) + inst->sampler;

   brw_SAMPLE(p,
              dst,
              inst->base_mrf,
              src,
              surface_index,
              inst->sampler % 16,
              msg_type,
              1, /* response length */
              inst->mlen,
              inst->header_present,
              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
              return_format);

   brw_mark_surface_used(&prog_data->base, surface_index);
}

void
vec4_generator::generate_vs_urb_write(vec4_instruction *inst)
{
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 brw_vec8_grf(0, 0), /* src */
                 inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

void
vec4_generator::generate_gs_urb_write(vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

void
vec4_generator::generate_gs_thread_end(vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 BRW_URB_WRITE_EOT,
                 1, /* message len */
                 0, /* response len */
                 0, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

void
vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
                                             struct brw_reg src0,
                                             struct brw_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset.  This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1   { Align1 WE_all }
    */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
           src1);
   brw_set_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst,
                                             struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   /* If we think of the src and dst registers as composed of 8 DWORDs each,
    * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
    * them to WORDs, and then pack them into DWORD 2 of dst.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 16 WORDs each; then, we want to pick up the
    * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 of
    * dst.
    *
    * We can do that by the following EU instruction:
    *
    *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
    */
   brw_MOV(p, suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
           stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
   brw_set_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst,
                                              struct brw_reg src)
{
   assert(src.file == BRW_IMMEDIATE_VALUE);

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, suboffset(vec1(dst), 2), src);
   brw_set_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst)
{
   /* We want to left shift just DWORD 4 (the x component belonging to the
    * second geometry shader invocation) by 4 bits.  So generate the
    * instruction:
    *
    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD   { align1 WE_all }
    */
   dst = suboffset(vec1(dst), 4);
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_SHL(p, dst, dst, brw_imm_ud(4));
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst,
                                              struct brw_reg src)
{
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.5):
    *
    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
    *        channel enable to determine the final channel enable.  For the
    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
    *        in the writeback message.  For the URB_WRITE_OWORD &
    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
    *        indicates that Vertex 1 DATA [3] will be written to the surface.
    *
    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *        1: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel included
    *
    *     14 Vertex 1 DATA [2] Channel Mask
    *     13 Vertex 1 DATA [1] Channel Mask
    *     12 Vertex 1 DATA [0] Channel Mask
    *     11 Vertex 0 DATA [3] Channel Mask
    *     10 Vertex 0 DATA [2] Channel Mask
    *      9 Vertex 0 DATA [1] Channel Mask
    *      8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively).  Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:0 of
    * DWORD 4, we just need to OR them together and store the result in bits
    * 15:8 of DWORD 5.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them in
    * byte 21.
    *
    * We can do that by the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB   { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4 of
    * DWORD 0 and (b) bits 3:0 of DWORD 4.  We can rely on (b) because the
    * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    * contain valid channel mask values (which are in the range 0x0-0xf).
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UB);
   src = retype(src, BRW_REGISTER_TYPE_UB);
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   brw_pop_insn_state(p);
}

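/* This backs GS_OPCODE_GET_INSTANCE_ID, which is what gl_InvocationID reads
 * from in Gen7 geometry shaders: each SIMD4x2 thread handles two invocations,
 * whose IDs the fixed function delivers in the bits of R0.0 and R0.1 above
 * GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT.
 */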
void
vec4_generator::generate_gs_get_instance_id(struct brw_reg dst)
{
   /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
    * and store into dst.0 & dst.4.  So generate the instruction:
    *
    *     shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT   { align1 WE_normal 1Q }
    */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   dst = retype(dst, BRW_REGISTER_TYPE_UD);
   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_SHR(p, dst, stride(r0, 1, 4, 0),
           brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
   brw_pop_insn_state(p);
}

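/* Write the per-vertex block offsets into M1.0 and M1.4 of an OWord dual
 * block message.  The offsets are expressed in OWords on Gen6+ but in bytes
 * on earlier generations, hence a second-vertex offset of 1 vs. 16.
 */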
void
vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
                                                  struct brw_reg index)
{
   int second_vertex_offset;

   if (brw->gen >= 6)
      second_vertex_offset = 1;
   else
      second_vertex_offset = 16;

   m1 = retype(m1, BRW_REGISTER_TYPE_D);

   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   struct brw_reg index_0 = suboffset(vec1(index), 0);
   struct brw_reg index_4 = suboffset(vec1(index), 4);

   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_access_mode(p, BRW_ALIGN_1);

   brw_MOV(p, m1_0, index_0);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      index_4.dw1.ud += second_vertex_offset;
      brw_MOV(p, m1_4, index_4);
   } else {
      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
   }

   brw_pop_insn_state(p);
}

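/* Expand the 8 channel-enable bits in f0.0 into the two SIMD4x2 halves of
 * dst: the low nibble (invocation 0) goes to dst.0, and the high nibble
 * (invocation 1), shifted back down, goes to dst.4.
 */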
void
vec4_generator::generate_unpack_flags(vec4_instruction *inst,
                                      struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_access_mode(p, BRW_ALIGN_1);

   struct brw_reg flags = brw_flag_reg(0, 0);
   struct brw_reg dst_0 = suboffset(vec1(dst), 0);
   struct brw_reg dst_4 = suboffset(vec1(dst), 4);

   brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
   brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
   brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));

   brw_pop_insn_state(p);
}

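/* Read one OWord (a vec4) per vertex back from scratch space.  Scratch is
 * accessed through the stateless binding table entry (index 255) with an
 * OWord dual block message, so each of the two interleaved vertices supplies
 * its own offset.
 */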
void
vec4_generator::generate_scratch_read(vec4_instruction *inst,
                                      struct brw_reg dst,
                                      struct brw_reg index)
{
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
                                     index);

   uint32_t msg_type;

   if (brw->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (brw->gen == 5 || brw->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (brw->gen < 6)
      send->header.destreg__conditionalmod = inst->base_mrf;
   brw_set_dp_read_message(p, send,
                           255, /* binding table index: stateless access */
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
                           BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                           2, /* mlen */
                           true, /* header_present */
                           1 /* rlen */);
}

void
vec4_generator::generate_scratch_write(vec4_instruction *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg index)
{
   struct brw_reg header = brw_vec8_grf(0, 0);
   bool write_commit;

   /* If the instruction is predicated, we'll predicate the send, not
    * the header setup.
    */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
                                     index);

   brw_MOV(p,
           retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
           retype(src, BRW_REGISTER_TYPE_D));

   uint32_t msg_type;

   if (brw->gen >= 7)
      msg_type = GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else if (brw->gen == 6)
      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else
      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   brw_set_predicate_control(p, inst->predicate);

   /* Pre-gen6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread.  Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    * synchronization.
    */
   if (brw->gen >= 6) {
      write_commit = false;
   } else {
      /* The visitor set up our destination register to be g0.  This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit.  For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed.  This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */
      write_commit = true;
   }

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (brw->gen < 6)
      send->header.destreg__conditionalmod = inst->base_mrf;
   brw_set_dp_write_message(p, send,
                            255, /* binding table index: stateless access */
                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                            msg_type,
                            3, /* mlen */
                            true, /* header present */
                            false, /* not a render target write */
                            write_commit, /* rlen */
                            false, /* eot */
                            write_commit);
}

void
vec4_generator::generate_pull_constant_load(vec4_instruction *inst,
                                            struct brw_reg dst,
                                            struct brw_reg index,
                                            struct brw_reg offset)
{
   assert(brw->gen <= 7);
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D),
           offset);

   uint32_t msg_type;

   if (brw->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (brw->gen == 5 || brw->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (brw->gen < 6)
      send->header.destreg__conditionalmod = inst->base_mrf;
   brw_set_dp_read_message(p, send,
                           surf_index,
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           2, /* mlen */
                           true, /* header_present */
                           1 /* rlen */);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

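/* On Gen7, pull constants are fetched with the sampler's LD message instead
 * of the data port: it needs no header, ignores the sampler index, and
 * returns a vec4 per invocation in SIMD4x2 mode.
 */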
void
vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
                                                 struct brw_reg dst,
                                                 struct brw_reg surf_index,
                                                 struct brw_reg offset)
{
   assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);

   brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, offset);
   brw_set_sampler_message(p, insn,
                           surf_index.dw1.ud,
                           0, /* LD message ignores sampler unit */
                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                           1, /* rlen */
                           1, /* mlen */
                           false, /* no header */
                           BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                           0);

   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
}

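/* The atomic operation and surface index must be immediates because they are
 * encoded directly into the message descriptor rather than taken from the
 * payload.
 */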
void
vec4_generator::generate_untyped_atomic(vec4_instruction *inst,
                                        struct brw_reg dst,
                                        struct brw_reg atomic_op,
                                        struct brw_reg surf_index)
{
   assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
          atomic_op.type == BRW_REGISTER_TYPE_UD &&
          surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);

   brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf),
                      atomic_op.dw1.ud, surf_index.dw1.ud,
                      inst->mlen, 1);

   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
}

void
vec4_generator::generate_untyped_surface_read(vec4_instruction *inst,
                                              struct brw_reg dst,
                                              struct brw_reg surf_index)
{
   assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);

   brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf),
                            surf_index.dw1.ud,
                            inst->mlen, 1);

   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
}

/**
 * Generate assembly for a Vec4 IR instruction.
 *
 * \param instruction The Vec4 IR instruction to generate code for.
 * \param dst         The destination register.
 * \param src         An array of up to three source registers.
 */
void
vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
                                          struct brw_reg dst,
                                          struct brw_reg *src)
{
   vec4_instruction *inst = instruction;

   if (dst.width == BRW_WIDTH_4) {
      /* This happens in attribute fixups for "dual instanced" geometry
       * shaders, since they use attributes that are vec4's.  Since the exec
       * width is only 4, it's essential that the caller set
       * force_writemask_all in order to make sure the instruction is executed
       * regardless of which channels are enabled.
       */
      assert(inst->force_writemask_all);

      /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
       * the following register region restrictions (from Graphics BSpec:
       * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
       * > Register Region Restrictions):
       *
       *     "1. ExecSize must be greater than or equal to Width.
       *
       *      2. If ExecSize = Width and HorzStride != 0, VertStride must be
       *         set to Width * HorzStride."
       */
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BRW_GENERAL_REGISTER_FILE)
            src[i] = stride(src[i], 4, 4, 1);
      }
   }

   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
      brw_MOV(p, dst, src[0]);
      break;
   case BRW_OPCODE_ADD:
      brw_ADD(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_MUL:
      brw_MUL(p, dst, src[0], src[1]);
      break;
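   /* MACH (and ADDC/SUBB below) produce their second result through the
    * accumulator, so the implicit accumulator write (AccWrEn) is enabled
    * just around those instructions.
    */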
   case BRW_OPCODE_MACH:
      brw_set_acc_write_control(p, 1);
      brw_MACH(p, dst, src[0], src[1]);
      brw_set_acc_write_control(p, 0);
      break;

   case BRW_OPCODE_MAD:
      assert(brw->gen >= 6);
      brw_MAD(p, dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_FRC:
      brw_FRC(p, dst, src[0]);
      break;
   case BRW_OPCODE_RNDD:
      brw_RNDD(p, dst, src[0]);
      break;
   case BRW_OPCODE_RNDE:
      brw_RNDE(p, dst, src[0]);
      break;
   case BRW_OPCODE_RNDZ:
      brw_RNDZ(p, dst, src[0]);
      break;

   case BRW_OPCODE_AND:
      brw_AND(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_OR:
      brw_OR(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_XOR:
      brw_XOR(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_NOT:
      brw_NOT(p, dst, src[0]);
      break;
   case BRW_OPCODE_ASR:
      brw_ASR(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_SHR:
      brw_SHR(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_SHL:
      brw_SHL(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_CMP:
      brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
      break;
   case BRW_OPCODE_SEL:
      brw_SEL(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DPH:
      brw_DPH(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP4:
      brw_DP4(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP3:
      brw_DP3(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP2:
      brw_DP2(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_F32TO16:
      assert(brw->gen >= 7);
      brw_F32TO16(p, dst, src[0]);
      break;

   case BRW_OPCODE_F16TO32:
      assert(brw->gen >= 7);
      brw_F16TO32(p, dst, src[0]);
      break;

   case BRW_OPCODE_LRP:
      assert(brw->gen >= 6);
      brw_LRP(p, dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_BFREV:
      assert(brw->gen >= 7);
      /* BFREV only supports UD type for src and dst. */
      brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                retype(src[0], BRW_REGISTER_TYPE_UD));
      break;
   case BRW_OPCODE_FBH:
      assert(brw->gen >= 7);
      /* FBH only supports UD type for dst. */
      brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_FBL:
      assert(brw->gen >= 7);
      /* FBL only supports UD type for dst. */
      brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_CBIT:
      assert(brw->gen >= 7);
      /* CBIT only supports UD type for dst. */
      brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_ADDC:
      assert(brw->gen >= 7);
      brw_set_acc_write_control(p, 1);
      brw_ADDC(p, dst, src[0], src[1]);
      brw_set_acc_write_control(p, 0);
      break;
   case BRW_OPCODE_SUBB:
      assert(brw->gen >= 7);
      brw_set_acc_write_control(p, 1);
      brw_SUBB(p, dst, src[0], src[1]);
      brw_set_acc_write_control(p, 0);
      break;

   case BRW_OPCODE_BFE:
      assert(brw->gen >= 7);
      brw_BFE(p, dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_BFI1:
      assert(brw->gen >= 7);
      brw_BFI1(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_BFI2:
      assert(brw->gen >= 7);
      brw_BFI2(p, dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_IF:
      if (inst->src[0].file != BAD_FILE) {
         /* The instruction has an embedded compare (only allowed on gen6) */
         assert(brw->gen == 6);
         gen6_IF(p, inst->conditional_mod, src[0], src[1]);
      } else {
         struct brw_instruction *brw_inst = brw_IF(p, BRW_EXECUTE_8);
         brw_inst->header.predicate_control = inst->predicate;
      }
      break;

   case BRW_OPCODE_ELSE:
      brw_ELSE(p);
      break;
   case BRW_OPCODE_ENDIF:
      brw_ENDIF(p);
      break;

   case BRW_OPCODE_DO:
      brw_DO(p, BRW_EXECUTE_8);
      break;

   case BRW_OPCODE_BREAK:
      brw_BREAK(p);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      break;
   case BRW_OPCODE_CONTINUE:
      /* FINISHME: We still need to write the loop instruction support. */
      if (brw->gen >= 6)
         gen6_CONT(p);
      else
         brw_CONT(p);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      break;

   case BRW_OPCODE_WHILE:
      brw_WHILE(p);
      break;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      if (brw->gen == 6) {
         generate_math1_gen6(inst, dst, src[0]);
      } else {
         /* Also works for Gen7. */
         generate_math1_gen4(inst, dst, src[0]);
      }
      break;

   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7) {
         generate_math2_gen7(inst, dst, src[0], src[1]);
      } else if (brw->gen == 6) {
         generate_math2_gen6(inst, dst, src[0], src[1]);
      } else {
         generate_math2_gen4(inst, dst, src[0], src[1]);
      }
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      generate_tex(inst, dst, src[0]);
      break;

   case VS_OPCODE_URB_WRITE:
      generate_vs_urb_write(inst);
      break;

   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      generate_scratch_read(inst, dst, src[0]);
      break;

   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      generate_scratch_write(inst, dst, src[0], src[1]);
      break;

   case VS_OPCODE_PULL_CONSTANT_LOAD:
      generate_pull_constant_load(inst, dst, src[0], src[1]);
      break;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      generate_pull_constant_load_gen7(inst, dst, src[0], src[1]);
      break;

   case GS_OPCODE_URB_WRITE:
      generate_gs_urb_write(inst);
      break;

   case GS_OPCODE_THREAD_END:
      generate_gs_thread_end(inst);
      break;

   case GS_OPCODE_SET_WRITE_OFFSET:
      generate_gs_set_write_offset(dst, src[0], src[1]);
      break;

   case GS_OPCODE_SET_VERTEX_COUNT:
      generate_gs_set_vertex_count(dst, src[0]);
      break;

   case GS_OPCODE_SET_DWORD_2_IMMED:
      generate_gs_set_dword_2_immed(dst, src[0]);
      break;

   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
      generate_gs_prepare_channel_masks(dst);
      break;

   case GS_OPCODE_SET_CHANNEL_MASKS:
      generate_gs_set_channel_masks(dst, src[0]);
      break;

   case GS_OPCODE_GET_INSTANCE_ID:
      generate_gs_get_instance_id(dst);
      break;

   case SHADER_OPCODE_SHADER_TIME_ADD:
      brw_shader_time_add(p, src[0],
                          prog_data->base.binding_table.shader_time_start);
      brw_mark_surface_used(&prog_data->base,
                            prog_data->base.binding_table.shader_time_start);
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC:
      generate_untyped_atomic(inst, dst, src[0], src[1]);
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      generate_untyped_surface_read(inst, dst, src[0]);
      break;

   case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
      generate_unpack_flags(inst, dst);
      break;

   default:
      if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
         _mesa_problem(&brw->ctx, "Unsupported opcode in `%s' in vec4\n",
                       opcode_descs[inst->opcode].name);
      } else {
         _mesa_problem(&brw->ctx, "Unsupported opcode %d in vec4", inst->opcode);
      }
      abort();
   }
}

void
vec4_generator::generate_code(exec_list *instructions)
{
   int last_native_insn_offset = 0;
   const char *last_annotation_string = NULL;
   const void *last_annotation_ir = NULL;

   if (unlikely(debug_flag)) {
      if (shader_prog) {
         printf("Native code for vertex shader %d:\n", shader_prog->Name);
      } else {
         printf("Native code for vertex program %d:\n", prog->Id);
      }
   }

   foreach_list(node, instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;
      struct brw_reg src[3], dst;

      if (unlikely(debug_flag)) {
         if (last_annotation_ir != inst->ir) {
            last_annotation_ir = inst->ir;
            if (last_annotation_ir) {
               printf("   ");
               if (shader_prog) {
                  ((ir_instruction *) last_annotation_ir)->print();
               } else {
                  const prog_instruction *vpi;
                  vpi = (const prog_instruction *) inst->ir;
                  printf("%d: ", (int)(vpi - prog->Instructions));
                  _mesa_fprint_instruction_opt(stdout, vpi, 0,
                                               PROG_PRINT_DEBUG, NULL);
               }
               printf("\n");
            }
         }
         if (last_annotation_string != inst->annotation) {
            last_annotation_string = inst->annotation;
            if (last_annotation_string)
               printf("   %s\n", last_annotation_string);
         }
      }

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = inst->get_src(this->prog_data, i);
      }
      dst = inst->get_dst();

      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicate);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_saturate(p, inst->saturate);
      brw_set_mask_control(p, inst->force_writemask_all);

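      /* Remember where this IR instruction's code starts so the
       * dependency-control hints below can be applied to the single EU
       * instruction it emits.
       */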
      unsigned pre_emit_nr_insn = p->nr_insn;

      generate_vec4_instruction(inst, dst, src);

      if (inst->no_dd_clear || inst->no_dd_check) {
         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
                !"no_dd_check or no_dd_clear set for IR emitting more "
                 "than 1 instruction");

         struct brw_instruction *last = &p->store[pre_emit_nr_insn];

         if (inst->no_dd_clear)
            last->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED;
         if (inst->no_dd_check)
            last->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED;
      }

      if (unlikely(debug_flag)) {
         brw_dump_compile(p, stdout,
                          last_native_insn_offset, p->next_insn_offset);
      }

      last_native_insn_offset = p->next_insn_offset;
   }

   if (unlikely(debug_flag)) {
      printf("\n");
   }

   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=vs above is very nice for debugging VS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug.  So this is here in
    * case you're doing that.
    */
   if (0 && unlikely(debug_flag)) {
      brw_dump_compile(p, stdout, 0, p->next_insn_offset);
   }
}

const unsigned *
vec4_generator::generate_assembly(exec_list *instructions,
                                  unsigned *assembly_size)
{
   brw_set_access_mode(p, BRW_ALIGN_16);
   generate_code(instructions);
   return brw_get_program(p, assembly_size);
}

} /* namespace brw */